diff options
Diffstat (limited to 'net/mptcp')
-rw-r--r-- | net/mptcp/Makefile | 4 | ||||
-rw-r--r-- | net/mptcp/bpf.c | 21 | ||||
-rw-r--r-- | net/mptcp/ctrl.c | 21 | ||||
-rw-r--r-- | net/mptcp/mib.c | 5 | ||||
-rw-r--r-- | net/mptcp/mib.h | 7 | ||||
-rw-r--r-- | net/mptcp/mptcp_diag.c | 105 | ||||
-rw-r--r-- | net/mptcp/options.c | 69 | ||||
-rw-r--r-- | net/mptcp/pm.c | 108 | ||||
-rw-r--r-- | net/mptcp/pm_netlink.c | 266 | ||||
-rw-r--r-- | net/mptcp/pm_userspace.c | 429 | ||||
-rw-r--r-- | net/mptcp/protocol.c | 123 | ||||
-rw-r--r-- | net/mptcp/protocol.h | 101 | ||||
-rw-r--r-- | net/mptcp/sockopt.c | 21 | ||||
-rw-r--r-- | net/mptcp/subflow.c | 72 |
14 files changed, 1141 insertions, 211 deletions
diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile index e54daceac58b..6e7df47c9584 100644 --- a/net/mptcp/Makefile +++ b/net/mptcp/Makefile @@ -2,7 +2,7 @@ obj-$(CONFIG_MPTCP) += mptcp.o mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o pm.o diag.o \ - mib.o pm_netlink.o sockopt.o + mib.o pm_netlink.o sockopt.o pm_userspace.o obj-$(CONFIG_SYN_COOKIES) += syncookies.o obj-$(CONFIG_INET_MPTCP_DIAG) += mptcp_diag.o @@ -10,3 +10,5 @@ obj-$(CONFIG_INET_MPTCP_DIAG) += mptcp_diag.o mptcp_crypto_test-objs := crypto_test.o mptcp_token_test-objs := token_test.o obj-$(CONFIG_MPTCP_KUNIT_TEST) += mptcp_crypto_test.o mptcp_token_test.o + +obj-$(CONFIG_BPF_SYSCALL) += bpf.o diff --git a/net/mptcp/bpf.c b/net/mptcp/bpf.c new file mode 100644 index 000000000000..5a0a84ad94af --- /dev/null +++ b/net/mptcp/bpf.c @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Multipath TCP + * + * Copyright (c) 2020, Tessares SA. + * Copyright (c) 2022, SUSE. + * + * Author: Nicolas Rybowski <nicolas.rybowski@tessares.net> + */ + +#define pr_fmt(fmt) "MPTCP: " fmt + +#include <linux/bpf.h> +#include "protocol.h" + +struct mptcp_sock *bpf_mptcp_sock_from_subflow(struct sock *sk) +{ + if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP && sk_is_mptcp(sk)) + return mptcp_sk(mptcp_subflow_ctx(sk)->conn); + + return NULL; +} diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c index 8b235468c88f..ae20b7d92e28 100644 --- a/net/mptcp/ctrl.c +++ b/net/mptcp/ctrl.c @@ -16,6 +16,11 @@ #define MPTCP_SYSCTL_PATH "net/mptcp" static int mptcp_pernet_id; + +#ifdef CONFIG_SYSCTL +static int mptcp_pm_type_max = __MPTCP_PM_TYPE_MAX; +#endif + struct mptcp_pernet { #ifdef CONFIG_SYSCTL struct ctl_table_header *ctl_table_hdr; @@ -26,6 +31,7 @@ struct mptcp_pernet { u8 mptcp_enabled; u8 checksum_enabled; u8 allow_join_initial_addr_port; + u8 pm_type; }; static struct mptcp_pernet *mptcp_get_pernet(const struct net *net) @@ -58,6 +64,11 @@ unsigned int mptcp_stale_loss_cnt(const struct net *net) return mptcp_get_pernet(net)->stale_loss_cnt; } +int mptcp_get_pm_type(const struct net *net) +{ + return mptcp_get_pernet(net)->pm_type; +} + static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet) { pernet->mptcp_enabled = 1; @@ -65,6 +76,7 @@ static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet) pernet->checksum_enabled = 0; pernet->allow_join_initial_addr_port = 1; pernet->stale_loss_cnt = 4; + pernet->pm_type = MPTCP_PM_TYPE_KERNEL; } #ifdef CONFIG_SYSCTL @@ -108,6 +120,14 @@ static struct ctl_table mptcp_sysctl_table[] = { .mode = 0644, .proc_handler = proc_douintvec_minmax, }, + { + .procname = "pm_type", + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &mptcp_pm_type_max + }, {} }; @@ -128,6 +148,7 @@ static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet) table[2].data = &pernet->checksum_enabled; table[3].data = &pernet->allow_join_initial_addr_port; table[4].data = &pernet->stale_loss_cnt; + table[5].data = &pernet->pm_type; hdr = register_net_sysctl(net, MPTCP_SYSCTL_PATH, table); if (!hdr) diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c index e55d3dfbee0c..0dac2863c6e1 100644 --- a/net/mptcp/mib.c +++ b/net/mptcp/mib.c @@ -24,6 +24,7 @@ static const struct snmp_mib mptcp_snmp_list[] = { SNMP_MIB_ITEM("MPJoinAckRx", MPTCP_MIB_JOINACKRX), SNMP_MIB_ITEM("MPJoinAckHMacFailure", MPTCP_MIB_JOINACKMAC), SNMP_MIB_ITEM("DSSNotMatching", MPTCP_MIB_DSSNOMATCH), + SNMP_MIB_ITEM("InfiniteMapTx", MPTCP_MIB_INFINITEMAPTX), SNMP_MIB_ITEM("InfiniteMapRx", MPTCP_MIB_INFINITEMAPRX), SNMP_MIB_ITEM("DSSNoMatchTCP", MPTCP_MIB_DSSTCPMISMATCH), SNMP_MIB_ITEM("DataCsumErr", MPTCP_MIB_DATACSUMERR), @@ -55,6 +56,10 @@ static const struct snmp_mib mptcp_snmp_list[] = { SNMP_MIB_ITEM("RcvPruned", MPTCP_MIB_RCVPRUNED), SNMP_MIB_ITEM("SubflowStale", MPTCP_MIB_SUBFLOWSTALE), SNMP_MIB_ITEM("SubflowRecover", MPTCP_MIB_SUBFLOWRECOVER), + SNMP_MIB_ITEM("SndWndShared", MPTCP_MIB_SNDWNDSHARED), + SNMP_MIB_ITEM("RcvWndShared", MPTCP_MIB_RCVWNDSHARED), + SNMP_MIB_ITEM("RcvWndConflictUpdate", MPTCP_MIB_RCVWNDCONFLICTUPDATE), + SNMP_MIB_ITEM("RcvWndConflict", MPTCP_MIB_RCVWNDCONFLICT), SNMP_MIB_SENTINEL }; diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h index 00576179a619..2be3596374f4 100644 --- a/net/mptcp/mib.h +++ b/net/mptcp/mib.h @@ -17,6 +17,7 @@ enum linux_mptcp_mib_field { MPTCP_MIB_JOINACKRX, /* Received an ACK + MP_JOIN */ MPTCP_MIB_JOINACKMAC, /* HMAC was wrong on ACK + MP_JOIN */ MPTCP_MIB_DSSNOMATCH, /* Received a new mapping that did not match the previous one */ + MPTCP_MIB_INFINITEMAPTX, /* Sent an infinite mapping */ MPTCP_MIB_INFINITEMAPRX, /* Received an infinite mapping */ MPTCP_MIB_DSSTCPMISMATCH, /* DSS-mapping did not map with TCP's sequence numbers */ MPTCP_MIB_DATACSUMERR, /* The data checksum fail */ @@ -48,6 +49,12 @@ enum linux_mptcp_mib_field { MPTCP_MIB_RCVPRUNED, /* Incoming packet dropped due to memory limit */ MPTCP_MIB_SUBFLOWSTALE, /* Subflows entered 'stale' status */ MPTCP_MIB_SUBFLOWRECOVER, /* Subflows returned to active status after being stale */ + MPTCP_MIB_SNDWNDSHARED, /* Subflow snd wnd is overridden by msk's one */ + MPTCP_MIB_RCVWNDSHARED, /* Subflow rcv wnd is overridden by msk's one */ + MPTCP_MIB_RCVWNDCONFLICTUPDATE, /* subflow rcv wnd is overridden by msk's one due to + * conflict with another subflow while updating msk rcv wnd + */ + MPTCP_MIB_RCVWNDCONFLICT, /* Conflict with while updating msk rcv wnd */ __MPTCP_MIB_MAX }; diff --git a/net/mptcp/mptcp_diag.c b/net/mptcp/mptcp_diag.c index f44125dd6697..7f9a71780437 100644 --- a/net/mptcp/mptcp_diag.c +++ b/net/mptcp/mptcp_diag.c @@ -66,20 +66,103 @@ out_nosk: return err; } +struct mptcp_diag_ctx { + long s_slot; + long s_num; + unsigned int l_slot; + unsigned int l_num; +}; + +static void mptcp_diag_dump_listeners(struct sk_buff *skb, struct netlink_callback *cb, + const struct inet_diag_req_v2 *r, + bool net_admin) +{ + struct inet_diag_dump_data *cb_data = cb->data; + struct mptcp_diag_ctx *diag_ctx = (void *)cb->ctx; + struct nlattr *bc = cb_data->inet_diag_nla_bc; + struct net *net = sock_net(skb->sk); + int i; + + for (i = diag_ctx->l_slot; i <= tcp_hashinfo.lhash2_mask; i++) { + struct inet_listen_hashbucket *ilb; + struct hlist_nulls_node *node; + struct sock *sk; + int num = 0; + + ilb = &tcp_hashinfo.lhash2[i]; + + rcu_read_lock(); + spin_lock(&ilb->lock); + sk_nulls_for_each(sk, node, &ilb->nulls_head) { + const struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(sk); + struct inet_sock *inet = inet_sk(sk); + int ret; + + if (num < diag_ctx->l_num) + goto next_listen; + + if (!ctx || strcmp(inet_csk(sk)->icsk_ulp_ops->name, "mptcp")) + goto next_listen; + + sk = ctx->conn; + if (!sk || !net_eq(sock_net(sk), net)) + goto next_listen; + + if (r->sdiag_family != AF_UNSPEC && + sk->sk_family != r->sdiag_family) + goto next_listen; + + if (r->id.idiag_sport != inet->inet_sport && + r->id.idiag_sport) + goto next_listen; + + if (!refcount_inc_not_zero(&sk->sk_refcnt)) + goto next_listen; + + ret = sk_diag_dump(sk, skb, cb, r, bc, net_admin); + + sock_put(sk); + + if (ret < 0) { + spin_unlock(&ilb->lock); + rcu_read_unlock(); + diag_ctx->l_slot = i; + diag_ctx->l_num = num; + return; + } + diag_ctx->l_num = num + 1; + num = 0; +next_listen: + ++num; + } + spin_unlock(&ilb->lock); + rcu_read_unlock(); + + cond_resched(); + diag_ctx->l_num = 0; + } + + diag_ctx->l_num = 0; + diag_ctx->l_slot = i; +} + static void mptcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); + struct mptcp_diag_ctx *diag_ctx = (void *)cb->ctx; struct net *net = sock_net(skb->sk); struct inet_diag_dump_data *cb_data; struct mptcp_sock *msk; struct nlattr *bc; + BUILD_BUG_ON(sizeof(cb->ctx) < sizeof(*diag_ctx)); + cb_data = cb->data; bc = cb_data->inet_diag_nla_bc; - while ((msk = mptcp_token_iter_next(net, &cb->args[0], &cb->args[1])) != - NULL) { + while ((msk = mptcp_token_iter_next(net, &diag_ctx->s_slot, + &diag_ctx->s_num)) != NULL) { struct inet_sock *inet = (struct inet_sock *)msk; struct sock *sk = (struct sock *)msk; int ret = 0; @@ -101,11 +184,14 @@ next: sock_put(sk); if (ret < 0) { /* will retry on the same position */ - cb->args[1]--; + diag_ctx->s_num--; break; } cond_resched(); } + + if ((r->idiag_states & TCPF_LISTEN) && r->id.idiag_dport == 0) + mptcp_diag_dump_listeners(skb, cb, r, net_admin); } static void mptcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, @@ -116,6 +202,19 @@ static void mptcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, r->idiag_rqueue = sk_rmem_alloc_get(sk); r->idiag_wqueue = sk_wmem_alloc_get(sk); + + if (inet_sk_state_load(sk) == TCP_LISTEN) { + struct sock *lsk = READ_ONCE(msk->first); + + if (lsk) { + /* override with settings from tcp listener, + * so Send-Q will show accept queue. + */ + r->idiag_rqueue = READ_ONCE(lsk->sk_ack_backlog); + r->idiag_wqueue = READ_ONCE(lsk->sk_max_ack_backlog); + } + } + if (!info) return; diff --git a/net/mptcp/options.c b/net/mptcp/options.c index b548cec86c9d..be3b918a6d15 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -825,7 +825,7 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, opts->suboptions = 0; - if (unlikely(__mptcp_check_fallback(msk))) + if (unlikely(__mptcp_check_fallback(msk) && !mptcp_check_infinite_map(skb))) return false; if (unlikely(skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST)) { @@ -931,7 +931,7 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk, if (TCP_SKB_CB(skb)->seq == subflow->ssn_offset + 1 && TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq && subflow->mp_join && (mp_opt->suboptions & OPTIONS_MPTCP_MPJ) && - READ_ONCE(msk->pm.server_side)) + !subflow->request_join) tcp_send_ack(ssk); goto fully_established; } @@ -1133,7 +1133,7 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) if ((mp_opt.suboptions & OPTION_MPTCP_ADD_ADDR) && add_addr_hmac_valid(msk, &mp_opt)) { if (!mp_opt.echo) { - mptcp_pm_add_addr_received(msk, &mp_opt.addr); + mptcp_pm_add_addr_received(sk, &mp_opt.addr); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ADDADDR); } else { mptcp_pm_add_addr_echoed(msk, &mp_opt.addr); @@ -1224,20 +1224,62 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) return true; } -static void mptcp_set_rwin(const struct tcp_sock *tp) +static void mptcp_set_rwin(struct tcp_sock *tp, struct tcphdr *th) { const struct sock *ssk = (const struct sock *)tp; - const struct mptcp_subflow_context *subflow; + struct mptcp_subflow_context *subflow; + u64 ack_seq, rcv_wnd_old, rcv_wnd_new; struct mptcp_sock *msk; - u64 ack_seq; + u32 new_win; + u64 win; subflow = mptcp_subflow_ctx(ssk); msk = mptcp_sk(subflow->conn); - ack_seq = READ_ONCE(msk->ack_seq) + tp->rcv_wnd; + ack_seq = READ_ONCE(msk->ack_seq); + rcv_wnd_new = ack_seq + tp->rcv_wnd; + + rcv_wnd_old = atomic64_read(&msk->rcv_wnd_sent); + if (after64(rcv_wnd_new, rcv_wnd_old)) { + u64 rcv_wnd; + + for (;;) { + rcv_wnd = atomic64_cmpxchg(&msk->rcv_wnd_sent, rcv_wnd_old, rcv_wnd_new); + + if (rcv_wnd == rcv_wnd_old) + break; + if (before64(rcv_wnd_new, rcv_wnd)) { + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDCONFLICTUPDATE); + goto raise_win; + } + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDCONFLICT); + rcv_wnd_old = rcv_wnd; + } + return; + } + + if (rcv_wnd_new != rcv_wnd_old) { +raise_win: + win = rcv_wnd_old - ack_seq; + tp->rcv_wnd = min_t(u64, win, U32_MAX); + new_win = tp->rcv_wnd; - if (after64(ack_seq, READ_ONCE(msk->rcv_wnd_sent))) - WRITE_ONCE(msk->rcv_wnd_sent, ack_seq); + /* Make sure we do not exceed the maximum possible + * scaled window. + */ + if (unlikely(th->syn)) + new_win = min(new_win, 65535U) << tp->rx_opt.rcv_wscale; + if (!tp->rx_opt.rcv_wscale && + sock_net(ssk)->ipv4.sysctl_tcp_workaround_signed_windows) + new_win = min(new_win, MAX_TCP_WINDOW); + else + new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale)); + + /* RFC1323 scaling applied */ + new_win >>= tp->rx_opt.rcv_wscale; + th->window = htons(new_win); + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDSHARED); + } } __sum16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __wsum sum) @@ -1275,7 +1317,7 @@ static void put_len_csum(u16 len, __sum16 csum, void *data) put_unaligned(csum, sumptr); } -void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, +void mptcp_write_options(struct tcphdr *th, __be32 *ptr, struct tcp_sock *tp, struct mptcp_out_options *opts) { const struct sock *ssk = (const struct sock *)tp; @@ -1350,8 +1392,11 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, put_unaligned_be32(mpext->subflow_seq, ptr); ptr += 1; if (opts->csum_reqd) { + /* data_len == 0 is reserved for the infinite mapping, + * the checksum will also be set to 0. + */ put_len_csum(mpext->data_len, - mptcp_make_csum(mpext), + (mpext->data_len ? mptcp_make_csum(mpext) : 0), ptr); } else { put_unaligned_be32(mpext->data_len << 16 | @@ -1562,7 +1607,7 @@ mp_capable_done: } if (tp) - mptcp_set_rwin(tp); + mptcp_set_rwin(tp, th); } __be32 mptcp_get_reset_option(const struct sk_buff *skb) diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index aa51b100e033..59a85220edc9 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -87,6 +87,9 @@ bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk) unsigned int subflows_max; int ret = 0; + if (mptcp_pm_is_userspace(msk)) + return mptcp_userspace_pm_active(msk); + subflows_max = mptcp_pm_get_subflows_max(msk); pr_debug("msk=%p subflows=%d max=%d allow=%d", msk, pm->subflows, @@ -178,7 +181,8 @@ void mptcp_pm_subflow_check_next(struct mptcp_sock *msk, const struct sock *ssk, struct mptcp_pm_data *pm = &msk->pm; bool update_subflows; - update_subflows = subflow->request_join || subflow->mp_join; + update_subflows = (subflow->request_join || subflow->mp_join) && + mptcp_pm_is_kernel(msk); if (!READ_ONCE(pm->work_pending) && !update_subflows) return; @@ -195,19 +199,28 @@ void mptcp_pm_subflow_check_next(struct mptcp_sock *msk, const struct sock *ssk, spin_unlock_bh(&pm->lock); } -void mptcp_pm_add_addr_received(struct mptcp_sock *msk, +void mptcp_pm_add_addr_received(const struct sock *ssk, const struct mptcp_addr_info *addr) { + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + struct mptcp_sock *msk = mptcp_sk(subflow->conn); struct mptcp_pm_data *pm = &msk->pm; pr_debug("msk=%p remote_id=%d accept=%d", msk, addr->id, READ_ONCE(pm->accept_addr)); - mptcp_event_addr_announced(msk, addr); + mptcp_event_addr_announced(ssk, addr); spin_lock_bh(&pm->lock); - if (!READ_ONCE(pm->accept_addr)) { + if (mptcp_pm_is_userspace(msk)) { + if (mptcp_userspace_pm_active(msk)) { + mptcp_pm_announce_addr(msk, addr, true); + mptcp_pm_add_addr_send_ack(msk); + } else { + __MPTCP_INC_STATS(sock_net((struct sock *)msk), MPTCP_MIB_ADDADDRDROP); + } + } else if (!READ_ONCE(pm->accept_addr)) { mptcp_pm_announce_addr(msk, addr, true); mptcp_pm_add_addr_send_ack(msk); } else if (mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_RECEIVED)) { @@ -261,19 +274,49 @@ void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, spin_unlock_bh(&pm->lock); } -void mptcp_pm_mp_prio_received(struct sock *sk, u8 bkup) +void mptcp_pm_mp_prio_received(struct sock *ssk, u8 bkup) { - struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + struct sock *sk = subflow->conn; + struct mptcp_sock *msk; pr_debug("subflow->backup=%d, bkup=%d\n", subflow->backup, bkup); - subflow->backup = bkup; + msk = mptcp_sk(sk); + if (subflow->backup != bkup) { + subflow->backup = bkup; + mptcp_data_lock(sk); + if (!sock_owned_by_user(sk)) + msk->last_snd = NULL; + else + __set_bit(MPTCP_RESET_SCHEDULER, &msk->cb_flags); + mptcp_data_unlock(sk); + } - mptcp_event(MPTCP_EVENT_SUB_PRIORITY, mptcp_sk(subflow->conn), sk, GFP_ATOMIC); + mptcp_event(MPTCP_EVENT_SUB_PRIORITY, msk, ssk, GFP_ATOMIC); } void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq) { + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct mptcp_sock *msk = mptcp_sk(subflow->conn); + struct sock *s = (struct sock *)msk; + pr_debug("fail_seq=%llu", fail_seq); + + if (!READ_ONCE(msk->allow_infinite_fallback)) + return; + + if (!READ_ONCE(subflow->mp_fail_response_expect)) { + pr_debug("send MP_FAIL response and infinite map"); + + subflow->send_mp_fail = 1; + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFAILTX); + subflow->send_infinite_map = 1; + } else if (!sock_flag(sk, SOCK_DEAD)) { + pr_debug("MP_FAIL response received"); + + sk_stop_timer(s, &s->sk_timer); + } } /* path manager helpers */ @@ -381,27 +424,48 @@ void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk) void mptcp_pm_data_reset(struct mptcp_sock *msk) { - msk->pm.add_addr_signaled = 0; - msk->pm.add_addr_accepted = 0; - msk->pm.local_addr_used = 0; - msk->pm.subflows = 0; - msk->pm.rm_list_tx.nr = 0; - msk->pm.rm_list_rx.nr = 0; - WRITE_ONCE(msk->pm.work_pending, false); - WRITE_ONCE(msk->pm.addr_signal, 0); - WRITE_ONCE(msk->pm.accept_addr, false); - WRITE_ONCE(msk->pm.accept_subflow, false); - WRITE_ONCE(msk->pm.remote_deny_join_id0, false); - msk->pm.status = 0; - bitmap_fill(msk->pm.id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); + u8 pm_type = mptcp_get_pm_type(sock_net((struct sock *)msk)); + struct mptcp_pm_data *pm = &msk->pm; - mptcp_pm_nl_data_init(msk); + pm->add_addr_signaled = 0; + pm->add_addr_accepted = 0; + pm->local_addr_used = 0; + pm->subflows = 0; + pm->rm_list_tx.nr = 0; + pm->rm_list_rx.nr = 0; + WRITE_ONCE(pm->pm_type, pm_type); + + if (pm_type == MPTCP_PM_TYPE_KERNEL) { + bool subflows_allowed = !!mptcp_pm_get_subflows_max(msk); + + /* pm->work_pending must be only be set to 'true' when + * pm->pm_type is set to MPTCP_PM_TYPE_KERNEL + */ + WRITE_ONCE(pm->work_pending, + (!!mptcp_pm_get_local_addr_max(msk) && + subflows_allowed) || + !!mptcp_pm_get_add_addr_signal_max(msk)); + WRITE_ONCE(pm->accept_addr, + !!mptcp_pm_get_add_addr_accept_max(msk) && + subflows_allowed); + WRITE_ONCE(pm->accept_subflow, subflows_allowed); + } else { + WRITE_ONCE(pm->work_pending, 0); + WRITE_ONCE(pm->accept_addr, 0); + WRITE_ONCE(pm->accept_subflow, 0); + } + + WRITE_ONCE(pm->addr_signal, 0); + WRITE_ONCE(pm->remote_deny_join_id0, false); + pm->status = 0; + bitmap_fill(msk->pm.id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); } void mptcp_pm_data_init(struct mptcp_sock *msk) { spin_lock_init(&msk->pm.lock); INIT_LIST_HEAD(&msk->pm.anno_list); + INIT_LIST_HEAD(&msk->pm.userspace_pm_local_addr_list); mptcp_pm_data_reset(msk); } diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index b5e8de6f7507..e099f2a12504 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -22,14 +22,6 @@ static struct genl_family mptcp_genl_family; static int pm_nl_pernet_id; -struct mptcp_pm_addr_entry { - struct list_head list; - struct mptcp_addr_info addr; - u8 flags; - int ifindex; - struct socket *lsk; -}; - struct mptcp_pm_add_entry { struct list_head list; struct mptcp_addr_info addr; @@ -55,8 +47,19 @@ struct pm_nl_pernet { #define MPTCP_PM_ADDR_MAX 8 #define ADD_ADDR_RETRANS_MAX 3 -static bool addresses_equal(const struct mptcp_addr_info *a, - const struct mptcp_addr_info *b, bool use_port) +static struct pm_nl_pernet *pm_nl_get_pernet(const struct net *net) +{ + return net_generic(net, pm_nl_pernet_id); +} + +static struct pm_nl_pernet * +pm_nl_get_pernet_from_msk(const struct mptcp_sock *msk) +{ + return pm_nl_get_pernet(sock_net((struct sock *)msk)); +} + +bool mptcp_addresses_equal(const struct mptcp_addr_info *a, + const struct mptcp_addr_info *b, bool use_port) { bool addr_equals = false; @@ -120,7 +123,7 @@ static bool lookup_subflow_by_saddr(const struct list_head *list, skc = (struct sock_common *)mptcp_subflow_tcp_sock(subflow); local_address(skc, &cur); - if (addresses_equal(&cur, saddr, saddr->port)) + if (mptcp_addresses_equal(&cur, saddr, saddr->port)) return true; } @@ -138,7 +141,7 @@ static bool lookup_subflow_by_daddr(const struct list_head *list, skc = (struct sock_common *)mptcp_subflow_tcp_sock(subflow); remote_address(skc, &cur); - if (addresses_equal(&cur, daddr, daddr->port)) + if (mptcp_addresses_equal(&cur, daddr, daddr->port)) return true; } @@ -206,43 +209,39 @@ select_signal_address(struct pm_nl_pernet *pernet, const struct mptcp_sock *msk) unsigned int mptcp_pm_get_add_addr_signal_max(const struct mptcp_sock *msk) { - const struct pm_nl_pernet *pernet; + const struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - pernet = net_generic(sock_net((const struct sock *)msk), pm_nl_pernet_id); return READ_ONCE(pernet->add_addr_signal_max); } EXPORT_SYMBOL_GPL(mptcp_pm_get_add_addr_signal_max); unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk) { - struct pm_nl_pernet *pernet; + struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id); return READ_ONCE(pernet->add_addr_accept_max); } EXPORT_SYMBOL_GPL(mptcp_pm_get_add_addr_accept_max); unsigned int mptcp_pm_get_subflows_max(const struct mptcp_sock *msk) { - struct pm_nl_pernet *pernet; + struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id); return READ_ONCE(pernet->subflows_max); } EXPORT_SYMBOL_GPL(mptcp_pm_get_subflows_max); unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk) { - struct pm_nl_pernet *pernet; + struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id); return READ_ONCE(pernet->local_addr_max); } EXPORT_SYMBOL_GPL(mptcp_pm_get_local_addr_max); bool mptcp_pm_nl_check_work_pending(struct mptcp_sock *msk) { - struct pm_nl_pernet *pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id); + struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); if (msk->pm.subflows == mptcp_pm_get_subflows_max(msk) || (find_next_and_bit(pernet->id_bitmap, msk->pm.id_avail_bitmap, @@ -262,7 +261,7 @@ mptcp_lookup_anno_list_by_saddr(const struct mptcp_sock *msk, lockdep_assert_held(&msk->pm.lock); list_for_each_entry(entry, &msk->pm.anno_list, list) { - if (addresses_equal(&entry->addr, addr, true)) + if (mptcp_addresses_equal(&entry->addr, addr, true)) return entry; } @@ -279,7 +278,7 @@ bool mptcp_pm_sport_in_anno_list(struct mptcp_sock *msk, const struct sock *sk) spin_lock_bh(&msk->pm.lock); list_for_each_entry(entry, &msk->pm.anno_list, list) { - if (addresses_equal(&entry->addr, &saddr, true)) { + if (mptcp_addresses_equal(&entry->addr, &saddr, true)) { ret = true; goto out; } @@ -353,8 +352,8 @@ mptcp_pm_del_add_timer(struct mptcp_sock *msk, return entry; } -static bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, - const struct mptcp_pm_addr_entry *entry) +bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, + const struct mptcp_pm_addr_entry *entry) { struct mptcp_pm_add_entry *add_entry = NULL; struct sock *sk = (struct sock *)msk; @@ -362,8 +361,16 @@ static bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, lockdep_assert_held(&msk->pm.lock); - if (mptcp_lookup_anno_list_by_saddr(msk, &entry->addr)) - return false; + add_entry = mptcp_lookup_anno_list_by_saddr(msk, &entry->addr); + + if (add_entry) { + if (mptcp_pm_is_kernel(msk)) + return false; + + sk_reset_timer(sk, &add_entry->add_timer, + jiffies + mptcp_get_add_addr_timeout(net)); + return true; + } add_entry = kmalloc(sizeof(*add_entry), GFP_ATOMIC); if (!add_entry) @@ -406,7 +413,7 @@ static bool lookup_address_in_vec(const struct mptcp_addr_info *addrs, unsigned int i; for (i = 0; i < nr; i++) { - if (addresses_equal(&addrs[i], addr, addr->port)) + if (mptcp_addresses_equal(&addrs[i], addr, addr->port)) return true; } @@ -442,7 +449,7 @@ static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk, bool fullm mptcp_for_each_subflow(msk, subflow) { ssk = mptcp_subflow_tcp_sock(subflow); remote_address((struct sock_common *)ssk, &addrs[i]); - if (deny_id0 && addresses_equal(&addrs[i], &remote, false)) + if (deny_id0 && mptcp_addresses_equal(&addrs[i], &remote, false)) continue; if (!lookup_address_in_vec(addrs, i, &addrs[i]) && @@ -475,7 +482,7 @@ __lookup_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *info, struct mptcp_pm_addr_entry *entry; list_for_each_entry(entry, &pernet->local_addr_list, list) { - if ((!lookup_by_id && addresses_equal(&entry->addr, info, true)) || + if ((!lookup_by_id && mptcp_addresses_equal(&entry->addr, info, true)) || (lookup_by_id && entry->addr.id == info->id)) return entry; } @@ -490,7 +497,7 @@ lookup_id_by_addr(const struct pm_nl_pernet *pernet, const struct mptcp_addr_inf rcu_read_lock(); list_for_each_entry(entry, &pernet->local_addr_list, list) { - if (addresses_equal(&entry->addr, addr, entry->addr.port)) { + if (mptcp_addresses_equal(&entry->addr, addr, entry->addr.port)) { ret = entry->addr.id; break; } @@ -508,7 +515,7 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) struct pm_nl_pernet *pernet; unsigned int subflows_max; - pernet = net_generic(sock_net(sk), pm_nl_pernet_id); + pernet = pm_nl_get_pernet(sock_net(sk)); add_addr_signal_max = mptcp_pm_get_add_addr_signal_max(msk); local_addr_max = mptcp_pm_get_local_addr_max(msk); @@ -604,7 +611,7 @@ static unsigned int fill_local_addresses_vec(struct mptcp_sock *msk, unsigned int subflows_max; int i = 0; - pernet = net_generic(sock_net(sk), pm_nl_pernet_id); + pernet = pm_nl_get_pernet_from_msk(msk); subflows_max = mptcp_pm_get_subflows_max(msk); rcu_read_lock(); @@ -724,9 +731,11 @@ static int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk, struct mptcp_addr_info local; local_address((struct sock_common *)ssk, &local); - if (!addresses_equal(&local, addr, addr->port)) + if (!mptcp_addresses_equal(&local, addr, addr->port)) continue; + if (subflow->backup != bkup) + msk->last_snd = NULL; subflow->backup = bkup; subflow->send_mp_prio = 1; subflow->request_bkup = bkup; @@ -796,6 +805,9 @@ static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk, if (!removed) continue; + if (!mptcp_pm_is_kernel(msk)) + continue; + if (rm_type == MPTCP_MIB_RMADDR) { msk->pm.add_addr_accepted--; WRITE_ONCE(msk->pm.accept_addr, true); @@ -889,9 +901,9 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet, * singled addresses */ list_for_each_entry(cur, &pernet->local_addr_list, list) { - if (addresses_equal(&cur->addr, &entry->addr, - address_use_port(entry) && - address_use_port(cur))) { + if (mptcp_addresses_equal(&cur->addr, &entry->addr, + address_use_port(entry) && + address_use_port(cur))) { /* allow replacing the exiting endpoint only if such * endpoint is an implicit one and the user-space * did not provide an endpoint id @@ -1018,14 +1030,17 @@ int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc) */ local_address((struct sock_common *)msk, &msk_local); local_address((struct sock_common *)skc, &skc_local); - if (addresses_equal(&msk_local, &skc_local, false)) + if (mptcp_addresses_equal(&msk_local, &skc_local, false)) return 0; - pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id); + if (mptcp_pm_is_userspace(msk)) + return mptcp_userspace_pm_get_local_id(msk, &skc_local); + + pernet = pm_nl_get_pernet_from_msk(msk); rcu_read_lock(); list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { - if (addresses_equal(&entry->addr, &skc_local, entry->addr.port)) { + if (mptcp_addresses_equal(&entry->addr, &skc_local, entry->addr.port)) { ret = entry->addr.id; break; } @@ -1052,18 +1067,6 @@ int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc) return ret; } -void mptcp_pm_nl_data_init(struct mptcp_sock *msk) -{ - struct mptcp_pm_data *pm = &msk->pm; - bool subflows; - - subflows = !!mptcp_pm_get_subflows_max(msk); - WRITE_ONCE(pm->work_pending, (!!mptcp_pm_get_local_addr_max(msk) && subflows) || - !!mptcp_pm_get_add_addr_signal_max(msk)); - WRITE_ONCE(pm->accept_addr, !!mptcp_pm_get_add_addr_accept_max(msk) && subflows); - WRITE_ONCE(pm->accept_subflow, subflows); -} - #define MPTCP_PM_CMD_GRP_OFFSET 0 #define MPTCP_PM_EV_GRP_OFFSET 1 @@ -1091,6 +1094,10 @@ static const struct nla_policy mptcp_pm_policy[MPTCP_PM_ATTR_MAX + 1] = { NLA_POLICY_NESTED(mptcp_pm_addr_policy), [MPTCP_PM_ATTR_RCV_ADD_ADDRS] = { .type = NLA_U32, }, [MPTCP_PM_ATTR_SUBFLOWS] = { .type = NLA_U32, }, + [MPTCP_PM_ATTR_TOKEN] = { .type = NLA_U32, }, + [MPTCP_PM_ATTR_LOC_ID] = { .type = NLA_U8, }, + [MPTCP_PM_ATTR_ADDR_REMOTE] = + NLA_POLICY_NESTED(mptcp_pm_addr_policy), }; void mptcp_pm_nl_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk) @@ -1139,11 +1146,12 @@ static int mptcp_pm_family_to_addr(int family) return MPTCP_PM_ADDR_ATTR_ADDR4; } -static int mptcp_pm_parse_addr(struct nlattr *attr, struct genl_info *info, - bool require_family, - struct mptcp_pm_addr_entry *entry) +static int mptcp_pm_parse_pm_addr_attr(struct nlattr *tb[], + const struct nlattr *attr, + struct genl_info *info, + struct mptcp_addr_info *addr, + bool require_family) { - struct nlattr *tb[MPTCP_PM_ADDR_ATTR_MAX + 1]; int err, addr_addr; if (!attr) { @@ -1157,27 +1165,29 @@ static int mptcp_pm_parse_addr(struct nlattr *attr, struct genl_info *info, if (err) return err; - memset(entry, 0, sizeof(*entry)); + if (tb[MPTCP_PM_ADDR_ATTR_ID]) + addr->id = nla_get_u8(tb[MPTCP_PM_ADDR_ATTR_ID]); + if (!tb[MPTCP_PM_ADDR_ATTR_FAMILY]) { if (!require_family) - goto skip_family; + return err; NL_SET_ERR_MSG_ATTR(info->extack, attr, "missing family"); return -EINVAL; } - entry->addr.family = nla_get_u16(tb[MPTCP_PM_ADDR_ATTR_FAMILY]); - if (entry->addr.family != AF_INET + addr->family = nla_get_u16(tb[MPTCP_PM_ADDR_ATTR_FAMILY]); + if (addr->family != AF_INET #if IS_ENABLED(CONFIG_MPTCP_IPV6) - && entry->addr.family != AF_INET6 + && addr->family != AF_INET6 #endif ) { NL_SET_ERR_MSG_ATTR(info->extack, attr, "unknown address family"); return -EINVAL; } - addr_addr = mptcp_pm_family_to_addr(entry->addr.family); + addr_addr = mptcp_pm_family_to_addr(addr->family); if (!tb[addr_addr]) { NL_SET_ERR_MSG_ATTR(info->extack, attr, "missing address data"); @@ -1185,22 +1195,47 @@ static int mptcp_pm_parse_addr(struct nlattr *attr, struct genl_info *info, } #if IS_ENABLED(CONFIG_MPTCP_IPV6) - if (entry->addr.family == AF_INET6) - entry->addr.addr6 = nla_get_in6_addr(tb[addr_addr]); + if (addr->family == AF_INET6) + addr->addr6 = nla_get_in6_addr(tb[addr_addr]); else #endif - entry->addr.addr.s_addr = nla_get_in_addr(tb[addr_addr]); + addr->addr.s_addr = nla_get_in_addr(tb[addr_addr]); + + if (tb[MPTCP_PM_ADDR_ATTR_PORT]) + addr->port = htons(nla_get_u16(tb[MPTCP_PM_ADDR_ATTR_PORT])); + + return err; +} + +int mptcp_pm_parse_addr(struct nlattr *attr, struct genl_info *info, + struct mptcp_addr_info *addr) +{ + struct nlattr *tb[MPTCP_PM_ADDR_ATTR_MAX + 1]; + + memset(addr, 0, sizeof(*addr)); + + return mptcp_pm_parse_pm_addr_attr(tb, attr, info, addr, true); +} + +int mptcp_pm_parse_entry(struct nlattr *attr, struct genl_info *info, + bool require_family, + struct mptcp_pm_addr_entry *entry) +{ + struct nlattr *tb[MPTCP_PM_ADDR_ATTR_MAX + 1]; + int err; + + memset(entry, 0, sizeof(*entry)); + + err = mptcp_pm_parse_pm_addr_attr(tb, attr, info, &entry->addr, require_family); + if (err) + return err; -skip_family: if (tb[MPTCP_PM_ADDR_ATTR_IF_IDX]) { u32 val = nla_get_s32(tb[MPTCP_PM_ADDR_ATTR_IF_IDX]); entry->ifindex = val; } - if (tb[MPTCP_PM_ADDR_ATTR_ID]) - entry->addr.id = nla_get_u8(tb[MPTCP_PM_ADDR_ATTR_ID]); - if (tb[MPTCP_PM_ADDR_ATTR_FLAGS]) entry->flags = nla_get_u32(tb[MPTCP_PM_ADDR_ATTR_FLAGS]); @@ -1212,7 +1247,7 @@ skip_family: static struct pm_nl_pernet *genl_info_pm_nl(struct genl_info *info) { - return net_generic(genl_info_net(info), pm_nl_pernet_id); + return pm_nl_get_pernet(genl_info_net(info)); } static int mptcp_nl_add_subflow_or_signal_addr(struct net *net) @@ -1223,7 +1258,8 @@ static int mptcp_nl_add_subflow_or_signal_addr(struct net *net) while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { struct sock *sk = (struct sock *)msk; - if (!READ_ONCE(msk->fully_established)) + if (!READ_ONCE(msk->fully_established) || + mptcp_pm_is_userspace(msk)) goto next; lock_sock(sk); @@ -1247,7 +1283,7 @@ static int mptcp_nl_cmd_add_addr(struct sk_buff *skb, struct genl_info *info) struct mptcp_pm_addr_entry addr, *entry; int ret; - ret = mptcp_pm_parse_addr(attr, info, true, &addr); + ret = mptcp_pm_parse_entry(attr, info, true, &addr); if (ret < 0) return ret; @@ -1296,17 +1332,25 @@ static int mptcp_nl_cmd_add_addr(struct sk_buff *skb, struct genl_info *info) return 0; } -int mptcp_pm_get_flags_and_ifindex_by_id(struct net *net, unsigned int id, +int mptcp_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, unsigned int id, u8 *flags, int *ifindex) { struct mptcp_pm_addr_entry *entry; + struct sock *sk = (struct sock *)msk; + struct net *net = sock_net(sk); *flags = 0; *ifindex = 0; if (id) { + if (mptcp_pm_is_userspace(msk)) + return mptcp_userspace_pm_get_flags_and_ifindex_by_id(msk, + id, + flags, + ifindex); + rcu_read_lock(); - entry = __lookup_addr_by_id(net_generic(net, pm_nl_pernet_id), id); + entry = __lookup_addr_by_id(pm_nl_get_pernet(net), id); if (entry) { *flags = entry->flags; *ifindex = entry->ifindex; @@ -1366,6 +1410,9 @@ static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net, struct sock *sk = (struct sock *)msk; bool remove_subflow; + if (mptcp_pm_is_userspace(msk)) + goto next; + if (list_empty(&msk->conn_list)) { mptcp_pm_remove_anno_addr(msk, addr, false); goto next; @@ -1400,11 +1447,11 @@ static int mptcp_nl_remove_id_zero_address(struct net *net, struct sock *sk = (struct sock *)msk; struct mptcp_addr_info msk_local; - if (list_empty(&msk->conn_list)) + if (list_empty(&msk->conn_list) || mptcp_pm_is_userspace(msk)) goto next; local_address((struct sock_common *)msk, &msk_local); - if (!addresses_equal(&msk_local, addr, addr->port)) + if (!mptcp_addresses_equal(&msk_local, addr, addr->port)) goto next; lock_sock(sk); @@ -1430,7 +1477,7 @@ static int mptcp_nl_cmd_del_addr(struct sk_buff *skb, struct genl_info *info) unsigned int addr_max; int ret; - ret = mptcp_pm_parse_addr(attr, info, false, &addr); + ret = mptcp_pm_parse_entry(attr, info, false, &addr); if (ret < 0) return ret; @@ -1470,8 +1517,8 @@ static int mptcp_nl_cmd_del_addr(struct sk_buff *skb, struct genl_info *info) return ret; } -static void mptcp_pm_remove_addrs_and_subflows(struct mptcp_sock *msk, - struct list_head *rm_list) +void mptcp_pm_remove_addrs_and_subflows(struct mptcp_sock *msk, + struct list_head *rm_list) { struct mptcp_rm_list alist = { .nr = 0 }, slist = { .nr = 0 }; struct mptcp_pm_addr_entry *entry; @@ -1507,9 +1554,11 @@ static void mptcp_nl_remove_addrs_list(struct net *net, while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { struct sock *sk = (struct sock *)msk; - lock_sock(sk); - mptcp_pm_remove_addrs_and_subflows(msk, rm_list); - release_sock(sk); + if (!mptcp_pm_is_userspace(msk)) { + lock_sock(sk); + mptcp_pm_remove_addrs_and_subflows(msk, rm_list); + release_sock(sk); + } sock_put(sk); cond_resched(); @@ -1602,7 +1651,7 @@ static int mptcp_nl_cmd_get_addr(struct sk_buff *skb, struct genl_info *info) void *reply; int ret; - ret = mptcp_pm_parse_addr(attr, info, false, &addr); + ret = mptcp_pm_parse_entry(attr, info, false, &addr); if (ret < 0) return ret; @@ -1653,7 +1702,7 @@ static int mptcp_nl_cmd_dump_addrs(struct sk_buff *msg, void *hdr; int i; - pernet = net_generic(net, pm_nl_pernet_id); + pernet = pm_nl_get_pernet(net); spin_lock_bh(&pernet->lock); for (i = id; i < MPTCP_PM_MAX_ADDR_ID + 1; i++) { @@ -1782,7 +1831,7 @@ static int mptcp_nl_set_flags(struct net *net, while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { struct sock *sk = (struct sock *)msk; - if (list_empty(&msk->conn_list)) + if (list_empty(&msk->conn_list) || mptcp_pm_is_userspace(msk)) goto next; lock_sock(sk); @@ -1813,7 +1862,7 @@ static int mptcp_nl_cmd_set_flags(struct sk_buff *skb, struct genl_info *info) u8 bkup = 0, lookup_by_id = 0; int ret; - ret = mptcp_pm_parse_addr(attr, info, false, &addr); + ret = mptcp_pm_parse_entry(attr, info, false, &addr); if (ret < 0) return ret; @@ -1852,6 +1901,13 @@ static void mptcp_nl_mcast_send(struct net *net, struct sk_buff *nlskb, gfp_t gf nlskb, 0, MPTCP_PM_EV_GRP_OFFSET, gfp); } +bool mptcp_userspace_pm_active(const struct mptcp_sock *msk) +{ + return genl_has_listeners(&mptcp_genl_family, + sock_net((const struct sock *)msk), + MPTCP_PM_EV_GRP_OFFSET); +} + static int mptcp_event_add_subflow(struct sk_buff *skb, const struct sock *ssk) { const struct inet_sock *issk = inet_sk(ssk); @@ -1972,6 +2028,9 @@ static int mptcp_event_created(struct sk_buff *skb, if (err) return err; + if (nla_put_u8(skb, MPTCP_ATTR_SERVER_SIDE, READ_ONCE(msk->pm.server_side))) + return -EMSGSIZE; + return mptcp_event_add_subflow(skb, ssk); } @@ -2006,10 +2065,12 @@ nla_put_failure: kfree_skb(skb); } -void mptcp_event_addr_announced(const struct mptcp_sock *msk, +void mptcp_event_addr_announced(const struct sock *ssk, const struct mptcp_addr_info *info) { - struct net *net = sock_net((const struct sock *)msk); + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + struct mptcp_sock *msk = mptcp_sk(subflow->conn); + struct net *net = sock_net(ssk); struct nlmsghdr *nlh; struct sk_buff *skb; @@ -2031,7 +2092,10 @@ void mptcp_event_addr_announced(const struct mptcp_sock *msk, if (nla_put_u8(skb, MPTCP_ATTR_REM_ID, info->id)) goto nla_put_failure; - if (nla_put_be16(skb, MPTCP_ATTR_DPORT, info->port)) + if (nla_put_be16(skb, MPTCP_ATTR_DPORT, + info->port == 0 ? + inet_sk(ssk)->inet_dport : + info->port)) goto nla_put_failure; switch (info->family) { @@ -2148,6 +2212,26 @@ static const struct genl_small_ops mptcp_pm_ops[] = { .doit = mptcp_nl_cmd_set_flags, .flags = GENL_ADMIN_PERM, }, + { + .cmd = MPTCP_PM_CMD_ANNOUNCE, + .doit = mptcp_nl_cmd_announce, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = MPTCP_PM_CMD_REMOVE, + .doit = mptcp_nl_cmd_remove, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = MPTCP_PM_CMD_SUBFLOW_CREATE, + .doit = mptcp_nl_cmd_sf_create, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = MPTCP_PM_CMD_SUBFLOW_DESTROY, + .doit = mptcp_nl_cmd_sf_destroy, + .flags = GENL_ADMIN_PERM, + }, }; static struct genl_family mptcp_genl_family __ro_after_init = { @@ -2165,7 +2249,7 @@ static struct genl_family mptcp_genl_family __ro_after_init = { static int __net_init pm_nl_init_net(struct net *net) { - struct pm_nl_pernet *pernet = net_generic(net, pm_nl_pernet_id); + struct pm_nl_pernet *pernet = pm_nl_get_pernet(net); INIT_LIST_HEAD_RCU(&pernet->local_addr_list); @@ -2187,7 +2271,7 @@ static void __net_exit pm_nl_exit_net(struct list_head *net_list) struct net *net; list_for_each_entry(net, net_list, exit_list) { - struct pm_nl_pernet *pernet = net_generic(net, pm_nl_pernet_id); + struct pm_nl_pernet *pernet = pm_nl_get_pernet(net); /* net is removed from namespace list, can't race with * other modifiers, also netns core already waited for a diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c new file mode 100644 index 000000000000..f56378e4f597 --- /dev/null +++ b/net/mptcp/pm_userspace.c @@ -0,0 +1,429 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Multipath TCP + * + * Copyright (c) 2022, Intel Corporation. + */ + +#include "protocol.h" + +void mptcp_free_local_addr_list(struct mptcp_sock *msk) +{ + struct mptcp_pm_addr_entry *entry, *tmp; + struct sock *sk = (struct sock *)msk; + LIST_HEAD(free_list); + + if (!mptcp_pm_is_userspace(msk)) + return; + + spin_lock_bh(&msk->pm.lock); + list_splice_init(&msk->pm.userspace_pm_local_addr_list, &free_list); + spin_unlock_bh(&msk->pm.lock); + + list_for_each_entry_safe(entry, tmp, &free_list, list) { + sock_kfree_s(sk, entry, sizeof(*entry)); + } +} + +int mptcp_userspace_pm_append_new_local_addr(struct mptcp_sock *msk, + struct mptcp_pm_addr_entry *entry) +{ + DECLARE_BITMAP(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); + struct mptcp_pm_addr_entry *match = NULL; + struct sock *sk = (struct sock *)msk; + struct mptcp_pm_addr_entry *e; + bool addr_match = false; + bool id_match = false; + int ret = -EINVAL; + + bitmap_zero(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); + + spin_lock_bh(&msk->pm.lock); + list_for_each_entry(e, &msk->pm.userspace_pm_local_addr_list, list) { + addr_match = mptcp_addresses_equal(&e->addr, &entry->addr, true); + if (addr_match && entry->addr.id == 0) + entry->addr.id = e->addr.id; + id_match = (e->addr.id == entry->addr.id); + if (addr_match && id_match) { + match = e; + break; + } else if (addr_match || id_match) { + break; + } + __set_bit(e->addr.id, id_bitmap); + } + + if (!match && !addr_match && !id_match) { + /* Memory for the entry is allocated from the + * sock option buffer. + */ + e = sock_kmalloc(sk, sizeof(*e), GFP_ATOMIC); + if (!e) { + spin_unlock_bh(&msk->pm.lock); + return -ENOMEM; + } + + *e = *entry; + if (!e->addr.id) + e->addr.id = find_next_zero_bit(id_bitmap, + MPTCP_PM_MAX_ADDR_ID + 1, + 1); + list_add_tail_rcu(&e->list, &msk->pm.userspace_pm_local_addr_list); + ret = e->addr.id; + } else if (match) { + ret = entry->addr.id; + } + + spin_unlock_bh(&msk->pm.lock); + return ret; +} + +int mptcp_userspace_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, + unsigned int id, + u8 *flags, int *ifindex) +{ + struct mptcp_pm_addr_entry *entry, *match = NULL; + + *flags = 0; + *ifindex = 0; + + spin_lock_bh(&msk->pm.lock); + list_for_each_entry(entry, &msk->pm.userspace_pm_local_addr_list, list) { + if (id == entry->addr.id) { + match = entry; + break; + } + } + spin_unlock_bh(&msk->pm.lock); + if (match) { + *flags = match->flags; + *ifindex = match->ifindex; + } + + return 0; +} + +int mptcp_userspace_pm_get_local_id(struct mptcp_sock *msk, + struct mptcp_addr_info *skc) +{ + struct mptcp_pm_addr_entry new_entry; + __be16 msk_sport = ((struct inet_sock *) + inet_sk((struct sock *)msk))->inet_sport; + + memset(&new_entry, 0, sizeof(struct mptcp_pm_addr_entry)); + new_entry.addr = *skc; + new_entry.addr.id = 0; + new_entry.flags = MPTCP_PM_ADDR_FLAG_IMPLICIT; + + if (new_entry.addr.port == msk_sport) + new_entry.addr.port = 0; + + return mptcp_userspace_pm_append_new_local_addr(msk, &new_entry); +} + +int mptcp_nl_cmd_announce(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; + struct nlattr *addr = info->attrs[MPTCP_PM_ATTR_ADDR]; + struct mptcp_pm_addr_entry addr_val; + struct mptcp_sock *msk; + int err = -EINVAL; + u32 token_val; + + if (!addr || !token) { + GENL_SET_ERR_MSG(info, "missing required inputs"); + return err; + } + + token_val = nla_get_u32(token); + + msk = mptcp_token_get_sock(sock_net(skb->sk), token_val); + if (!msk) { + NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token"); + return err; + } + + if (!mptcp_pm_is_userspace(msk)) { + GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected"); + goto announce_err; + } + + err = mptcp_pm_parse_entry(addr, info, true, &addr_val); + if (err < 0) { + GENL_SET_ERR_MSG(info, "error parsing local address"); + goto announce_err; + } + + if (addr_val.addr.id == 0 || !(addr_val.flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) { + GENL_SET_ERR_MSG(info, "invalid addr id or flags"); + goto announce_err; + } + + err = mptcp_userspace_pm_append_new_local_addr(msk, &addr_val); + if (err < 0) { + GENL_SET_ERR_MSG(info, "did not match address and id"); + goto announce_err; + } + + lock_sock((struct sock *)msk); + spin_lock_bh(&msk->pm.lock); + + if (mptcp_pm_alloc_anno_list(msk, &addr_val)) { + mptcp_pm_announce_addr(msk, &addr_val.addr, false); + mptcp_pm_nl_addr_send_ack(msk); + } + + spin_unlock_bh(&msk->pm.lock); + release_sock((struct sock *)msk); + + err = 0; + announce_err: + sock_put((struct sock *)msk); + return err; +} + +int mptcp_nl_cmd_remove(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; + struct nlattr *id = info->attrs[MPTCP_PM_ATTR_LOC_ID]; + struct mptcp_pm_addr_entry *match = NULL; + struct mptcp_pm_addr_entry *entry; + struct mptcp_sock *msk; + LIST_HEAD(free_list); + int err = -EINVAL; + u32 token_val; + u8 id_val; + + if (!id || !token) { + GENL_SET_ERR_MSG(info, "missing required inputs"); + return err; + } + + id_val = nla_get_u8(id); + token_val = nla_get_u32(token); + + msk = mptcp_token_get_sock(sock_net(skb->sk), token_val); + if (!msk) { + NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token"); + return err; + } + + if (!mptcp_pm_is_userspace(msk)) { + GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected"); + goto remove_err; + } + + lock_sock((struct sock *)msk); + + list_for_each_entry(entry, &msk->pm.userspace_pm_local_addr_list, list) { + if (entry->addr.id == id_val) { + match = entry; + break; + } + } + + if (!match) { + GENL_SET_ERR_MSG(info, "address with specified id not found"); + release_sock((struct sock *)msk); + goto remove_err; + } + + list_move(&match->list, &free_list); + + mptcp_pm_remove_addrs_and_subflows(msk, &free_list); + + release_sock((struct sock *)msk); + + list_for_each_entry_safe(match, entry, &free_list, list) { + sock_kfree_s((struct sock *)msk, match, sizeof(*match)); + } + + err = 0; + remove_err: + sock_put((struct sock *)msk); + return err; +} + +int mptcp_nl_cmd_sf_create(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *raddr = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE]; + struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; + struct nlattr *laddr = info->attrs[MPTCP_PM_ATTR_ADDR]; + struct mptcp_addr_info addr_r; + struct mptcp_addr_info addr_l; + struct mptcp_sock *msk; + int err = -EINVAL; + struct sock *sk; + u32 token_val; + + if (!laddr || !raddr || !token) { + GENL_SET_ERR_MSG(info, "missing required inputs"); + return err; + } + + token_val = nla_get_u32(token); + + msk = mptcp_token_get_sock(genl_info_net(info), token_val); + if (!msk) { + NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token"); + return err; + } + + if (!mptcp_pm_is_userspace(msk)) { + GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected"); + goto create_err; + } + + err = mptcp_pm_parse_addr(laddr, info, &addr_l); + if (err < 0) { + NL_SET_ERR_MSG_ATTR(info->extack, laddr, "error parsing local addr"); + goto create_err; + } + + if (addr_l.id == 0) { + NL_SET_ERR_MSG_ATTR(info->extack, laddr, "missing local addr id"); + goto create_err; + } + + err = mptcp_pm_parse_addr(raddr, info, &addr_r); + if (err < 0) { + NL_SET_ERR_MSG_ATTR(info->extack, raddr, "error parsing remote addr"); + goto create_err; + } + + sk = &msk->sk.icsk_inet.sk; + lock_sock(sk); + + err = __mptcp_subflow_connect(sk, &addr_l, &addr_r); + + release_sock(sk); + + create_err: + sock_put((struct sock *)msk); + return err; +} + +static struct sock *mptcp_nl_find_ssk(struct mptcp_sock *msk, + const struct mptcp_addr_info *local, + const struct mptcp_addr_info *remote) +{ + struct sock *sk = &msk->sk.icsk_inet.sk; + struct mptcp_subflow_context *subflow; + struct sock *found = NULL; + + if (local->family != remote->family) + return NULL; + + lock_sock(sk); + + mptcp_for_each_subflow(msk, subflow) { + const struct inet_sock *issk; + struct sock *ssk; + + ssk = mptcp_subflow_tcp_sock(subflow); + + if (local->family != ssk->sk_family) + continue; + + issk = inet_sk(ssk); + + switch (ssk->sk_family) { + case AF_INET: + if (issk->inet_saddr != local->addr.s_addr || + issk->inet_daddr != remote->addr.s_addr) + continue; + break; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + case AF_INET6: { + const struct ipv6_pinfo *pinfo = inet6_sk(ssk); + + if (!ipv6_addr_equal(&local->addr6, &pinfo->saddr) || + !ipv6_addr_equal(&remote->addr6, &ssk->sk_v6_daddr)) + continue; + break; + } +#endif + default: + continue; + } + + if (issk->inet_sport == local->port && + issk->inet_dport == remote->port) { + found = ssk; + goto found; + } + } + +found: + release_sock(sk); + + return found; +} + +int mptcp_nl_cmd_sf_destroy(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *raddr = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE]; + struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; + struct nlattr *laddr = info->attrs[MPTCP_PM_ATTR_ADDR]; + struct mptcp_addr_info addr_l; + struct mptcp_addr_info addr_r; + struct mptcp_sock *msk; + struct sock *sk, *ssk; + int err = -EINVAL; + u32 token_val; + + if (!laddr || !raddr || !token) { + GENL_SET_ERR_MSG(info, "missing required inputs"); + return err; + } + + token_val = nla_get_u32(token); + + msk = mptcp_token_get_sock(genl_info_net(info), token_val); + if (!msk) { + NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token"); + return err; + } + + if (!mptcp_pm_is_userspace(msk)) { + GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected"); + goto destroy_err; + } + + err = mptcp_pm_parse_addr(laddr, info, &addr_l); + if (err < 0) { + NL_SET_ERR_MSG_ATTR(info->extack, laddr, "error parsing local addr"); + goto destroy_err; + } + + err = mptcp_pm_parse_addr(raddr, info, &addr_r); + if (err < 0) { + NL_SET_ERR_MSG_ATTR(info->extack, raddr, "error parsing remote addr"); + goto destroy_err; + } + + if (addr_l.family != addr_r.family) { + GENL_SET_ERR_MSG(info, "address families do not match"); + goto destroy_err; + } + + if (!addr_l.port || !addr_r.port) { + GENL_SET_ERR_MSG(info, "missing local or remote port"); + goto destroy_err; + } + + sk = &msk->sk.icsk_inet.sk; + ssk = mptcp_nl_find_ssk(msk, &addr_l, &addr_r); + if (ssk) { + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + + mptcp_subflow_shutdown(sk, ssk, RCV_SHUTDOWN | SEND_SHUTDOWN); + mptcp_close_ssk(sk, ssk, subflow); + err = 0; + } else { + err = -ESRCH; + } + + destroy_err: + sock_put((struct sock *)msk); + return err; +} diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 0cbea3b6d0a4..17e13396024a 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -216,7 +216,7 @@ static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb) seq = MPTCP_SKB_CB(skb)->map_seq; end_seq = MPTCP_SKB_CB(skb)->end_seq; - max_seq = READ_ONCE(msk->rcv_wnd_sent); + max_seq = atomic64_read(&msk->rcv_wnd_sent); pr_debug("msk=%p seq=%llx limit=%llx empty=%d", msk, seq, max_seq, RB_EMPTY_ROOT(&msk->out_of_order_queue)); @@ -225,7 +225,7 @@ static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb) mptcp_drop(sk, skb); pr_debug("oow by %lld, rcv_wnd_sent %llu\n", (unsigned long long)end_seq - (unsigned long)max_seq, - (unsigned long long)msk->rcv_wnd_sent); + (unsigned long long)atomic64_read(&msk->rcv_wnd_sent)); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_NODSSWINDOW); return; } @@ -1141,18 +1141,21 @@ struct mptcp_sendmsg_info { bool data_lock_held; }; -static int mptcp_check_allowed_size(struct mptcp_sock *msk, u64 data_seq, - int avail_size) +static int mptcp_check_allowed_size(const struct mptcp_sock *msk, struct sock *ssk, + u64 data_seq, int avail_size) { u64 window_end = mptcp_wnd_end(msk); + u64 mptcp_snd_wnd; if (__mptcp_check_fallback(msk)) return avail_size; - if (!before64(data_seq + avail_size, window_end)) { - u64 allowed_size = window_end - data_seq; + mptcp_snd_wnd = window_end - data_seq; + avail_size = min_t(unsigned int, mptcp_snd_wnd, avail_size); - return min_t(unsigned int, allowed_size, avail_size); + if (unlikely(tcp_sk(ssk)->snd_wnd < mptcp_snd_wnd)) { + tcp_sk(ssk)->snd_wnd = min_t(u64, U32_MAX, mptcp_snd_wnd); + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_SNDWNDSHARED); } return avail_size; @@ -1229,6 +1232,22 @@ static void mptcp_update_data_checksum(struct sk_buff *skb, int added) mpext->csum = csum_fold(csum_block_add(csum, skb_checksum(skb, offset, added, 0), offset)); } +static void mptcp_update_infinite_map(struct mptcp_sock *msk, + struct sock *ssk, + struct mptcp_ext *mpext) +{ + if (!mpext) + return; + + mpext->infinite_map = 1; + mpext->data_len = 0; + + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPTX); + mptcp_subflow_ctx(ssk)->send_infinite_map = 0; + pr_fallback(msk); + __mptcp_do_fallback(msk); +} + static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, struct mptcp_data_frag *dfrag, struct mptcp_sendmsg_info *info) @@ -1289,7 +1308,7 @@ alloc_skb: } /* Zero window and all data acked? Probe. */ - copy = mptcp_check_allowed_size(msk, data_seq, copy); + copy = mptcp_check_allowed_size(msk, ssk, data_seq, copy); if (copy == 0) { u64 snd_una = READ_ONCE(msk->snd_una); @@ -1360,6 +1379,8 @@ alloc_skb: out: if (READ_ONCE(msk->csum_enabled)) mptcp_update_data_checksum(skb, copy); + if (mptcp_subflow_ctx(ssk)->send_infinite_map) + mptcp_update_infinite_map(msk, ssk, mpext); trace_mptcp_sendmsg_frag(mpext); mptcp_subflow_ctx(ssk)->rel_write_seq += copy; return copy; @@ -1480,11 +1501,16 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) * to check that subflow has a non empty cwin. */ ssk = send_info[SSK_MODE_ACTIVE].ssk; - if (!ssk || !sk_stream_memory_free(ssk) || !tcp_sk(ssk)->snd_wnd) + if (!ssk || !sk_stream_memory_free(ssk)) return NULL; - burst = min_t(int, MPTCP_SEND_BURST_SIZE, tcp_sk(ssk)->snd_wnd); + burst = min_t(int, MPTCP_SEND_BURST_SIZE, mptcp_wnd_end(msk) - msk->snd_nxt); wmem = READ_ONCE(ssk->sk_wmem_queued); + if (!burst) { + msk->last_snd = NULL; + return ssk; + } + subflow = mptcp_subflow_ctx(ssk); subflow->avg_pacing_rate = div_u64((u64)subflow->avg_pacing_rate * wmem + READ_ONCE(ssk->sk_pacing_rate) * burst, @@ -2012,7 +2038,7 @@ static unsigned int mptcp_inq_hint(const struct sock *sk) } static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, - int nonblock, int flags, int *addr_len) + int flags, int *addr_len) { struct mptcp_sock *msk = mptcp_sk(sk); struct scm_timestamping_internal tss; @@ -2030,7 +2056,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, goto out_err; } - timeo = sock_rcvtimeo(sk, nonblock); + timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); len = min_t(size_t, len, INT_MAX); target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); @@ -2149,6 +2175,21 @@ static void mptcp_retransmit_timer(struct timer_list *t) sock_put(sk); } +static struct mptcp_subflow_context * +mp_fail_response_expect_subflow(struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow, *ret = NULL; + + mptcp_for_each_subflow(msk, subflow) { + if (READ_ONCE(subflow->mp_fail_response_expect)) { + ret = subflow; + break; + } + } + + return ret; +} + static void mptcp_timeout_timer(struct timer_list *t) { struct sock *sk = from_timer(sk, t, sk_timer); @@ -2465,6 +2506,7 @@ static void __mptcp_retrans(struct sock *sk) dfrag->already_sent = max(dfrag->already_sent, info.sent); tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, info.size_goal); + WRITE_ONCE(msk->allow_infinite_fallback, false); } release_sock(ssk); @@ -2476,6 +2518,23 @@ reset_timer: mptcp_reset_timer(sk); } +static void mptcp_mp_fail_no_response(struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow; + struct sock *ssk; + bool slow; + + subflow = mp_fail_response_expect_subflow(msk); + if (subflow) { + pr_debug("MP_FAIL doesn't respond, reset the subflow"); + + ssk = mptcp_subflow_tcp_sock(subflow); + slow = lock_sock_fast(ssk); + mptcp_subflow_reset(ssk); + unlock_sock_fast(ssk, slow); + } +} + static void mptcp_worker(struct work_struct *work) { struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work); @@ -2516,6 +2575,8 @@ static void mptcp_worker(struct work_struct *work) if (test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags)) __mptcp_retrans(sk); + mptcp_mp_fail_no_response(msk); + unlock: release_sock(sk); sock_put(sk); @@ -2539,6 +2600,7 @@ static int __mptcp_init_sock(struct sock *sk) msk->first = NULL; inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss; WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk))); + WRITE_ONCE(msk->allow_infinite_fallback, true); msk->recovery = false; mptcp_pm_data_init(msk); @@ -2733,7 +2795,7 @@ static void __mptcp_destroy_sock(struct sock *sk) /* join list will be eventually flushed (with rst) at sock lock release time*/ list_splice_init(&msk->conn_list, &conn_list); - sk_stop_timer(sk, &msk->sk.icsk_retransmit_timer); + mptcp_stop_timer(sk); sk_stop_timer(sk, &sk->sk_timer); msk->pm.status = 0; @@ -2841,7 +2903,7 @@ static int mptcp_disconnect(struct sock *sk, int flags) __mptcp_close_ssk(sk, ssk, subflow, MPTCP_CF_FASTCLOSE); } - sk_stop_timer(sk, &msk->sk.icsk_retransmit_timer); + mptcp_stop_timer(sk); sk_stop_timer(sk, &sk->sk_timer); if (mptcp_sk(sk)->token) @@ -2916,7 +2978,7 @@ struct sock *mptcp_sk_clone(const struct sock *sk, mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq); ack_seq++; WRITE_ONCE(msk->ack_seq, ack_seq); - WRITE_ONCE(msk->rcv_wnd_sent, ack_seq); + atomic64_set(&msk->rcv_wnd_sent, ack_seq); } sock_reset_flag(nsk, SOCK_RCU_FREE); @@ -3017,6 +3079,7 @@ void mptcp_destroy_common(struct mptcp_sock *msk) msk->rmem_fwd_alloc = 0; mptcp_token_destroy(msk); mptcp_pm_free_anno_list(msk); + mptcp_free_local_addr_list(msk); } static void mptcp_destroy(struct sock *sk) @@ -3092,15 +3155,19 @@ static void mptcp_release_cb(struct sock *sk) spin_lock_bh(&sk->sk_lock.slock); } - /* be sure to set the current sk state before tacking actions - * depending on sk_state - */ - if (__test_and_clear_bit(MPTCP_CONNECTED, &msk->cb_flags)) - __mptcp_set_connected(sk); if (__test_and_clear_bit(MPTCP_CLEAN_UNA, &msk->cb_flags)) __mptcp_clean_una_wakeup(sk); - if (__test_and_clear_bit(MPTCP_ERROR_REPORT, &msk->cb_flags)) - __mptcp_error_report(sk); + if (unlikely(&msk->cb_flags)) { + /* be sure to set the current sk state before tacking actions + * depending on sk_state, that is processing MPTCP_ERROR_REPORT + */ + if (__test_and_clear_bit(MPTCP_CONNECTED, &msk->cb_flags)) + __mptcp_set_connected(sk); + if (__test_and_clear_bit(MPTCP_ERROR_REPORT, &msk->cb_flags)) + __mptcp_error_report(sk); + if (__test_and_clear_bit(MPTCP_RESET_SCHEDULER, &msk->cb_flags)) + msk->last_snd = NULL; + } __mptcp_update_rmem(sk); } @@ -3204,9 +3271,9 @@ void mptcp_finish_connect(struct sock *ssk) WRITE_ONCE(msk->write_seq, subflow->idsn + 1); WRITE_ONCE(msk->snd_nxt, msk->write_seq); WRITE_ONCE(msk->ack_seq, ack_seq); - WRITE_ONCE(msk->rcv_wnd_sent, ack_seq); WRITE_ONCE(msk->can_ack, 1); WRITE_ONCE(msk->snd_una, msk->write_seq); + atomic64_set(&msk->rcv_wnd_sent, ack_seq); mptcp_pm_new_connection(msk, ssk, 0); @@ -3237,15 +3304,12 @@ bool mptcp_finish_join(struct sock *ssk) return false; } - if (!msk->pm.server_side) + if (!list_empty(&subflow->node)) goto out; if (!mptcp_pm_allow_new_subflow(msk)) goto err_prohibited; - if (WARN_ON_ONCE(!list_empty(&subflow->node))) - goto err_prohibited; - /* active connections are already on conn_list. * If we can't acquire msk socket lock here, let the release callback * handle it @@ -3271,6 +3335,7 @@ err_prohibited: } subflow->map_seq = READ_ONCE(msk->ack_seq); + WRITE_ONCE(msk->allow_infinite_fallback, false); out: mptcp_event(MPTCP_EVENT_SUB_ESTABLISHED, msk, ssk, GFP_ATOMIC); @@ -3703,8 +3768,8 @@ void __init mptcp_proto_init(void) for_each_possible_cpu(cpu) { delegated = per_cpu_ptr(&mptcp_delegated_actions, cpu); INIT_LIST_HEAD(&delegated->head); - netif_tx_napi_add(&mptcp_napi_dev, &delegated->napi, mptcp_napi_poll, - NAPI_POLL_WEIGHT); + netif_napi_add_tx(&mptcp_napi_dev, &delegated->napi, + mptcp_napi_poll); napi_enable(&delegated->napi); } diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 5655a63aa6a8..200f89f6d62f 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -11,6 +11,7 @@ #include <net/tcp.h> #include <net/inet_connection_sock.h> #include <uapi/linux/mptcp.h> +#include <net/genetlink.h> #define MPTCP_SUPPORTED_VERSION 1 @@ -124,6 +125,7 @@ #define MPTCP_RETRANSMIT 4 #define MPTCP_FLUSH_JOIN_LIST 5 #define MPTCP_CONNECTED 6 +#define MPTCP_RESET_SCHEDULER 7 static inline bool before64(__u64 seq1, __u64 seq2) { @@ -182,6 +184,14 @@ enum mptcp_pm_status { */ }; +enum mptcp_pm_type { + MPTCP_PM_TYPE_KERNEL = 0, + MPTCP_PM_TYPE_USERSPACE, + + __MPTCP_PM_TYPE_NR, + __MPTCP_PM_TYPE_MAX = __MPTCP_PM_TYPE_NR - 1, +}; + /* Status bits below MPTCP_PM_ALREADY_ESTABLISHED need pm worker actions */ #define MPTCP_PM_WORK_MASK ((1 << MPTCP_PM_ALREADY_ESTABLISHED) - 1) @@ -198,6 +208,7 @@ struct mptcp_pm_data { struct mptcp_addr_info local; struct mptcp_addr_info remote; struct list_head anno_list; + struct list_head userspace_pm_local_addr_list; spinlock_t lock; /*protects the whole PM data */ @@ -210,6 +221,7 @@ struct mptcp_pm_data { u8 add_addr_signaled; u8 add_addr_accepted; u8 local_addr_used; + u8 pm_type; u8 subflows; u8 status; DECLARE_BITMAP(id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); @@ -217,6 +229,14 @@ struct mptcp_pm_data { struct mptcp_rm_list rm_list_rx; }; +struct mptcp_pm_addr_entry { + struct list_head list; + struct mptcp_addr_info addr; + u8 flags; + int ifindex; + struct socket *lsk; +}; + struct mptcp_data_frag { struct list_head list; u64 data_seq; @@ -236,7 +256,7 @@ struct mptcp_sock { u64 write_seq; u64 snd_nxt; u64 ack_seq; - u64 rcv_wnd_sent; + atomic64_t rcv_wnd_sent; u64 rcv_data_fin_seq; int rmem_fwd_alloc; struct sock *last_snd; @@ -262,6 +282,7 @@ struct mptcp_sock { bool rcv_fastclose; bool use_64bit_ack; /* Set when we received a 64-bit DSN */ bool csum_enabled; + bool allow_infinite_fallback; u8 recvmsg_inq:1, cork:1, nodelay:1; @@ -439,6 +460,7 @@ struct mptcp_subflow_context { send_mp_prio : 1, send_mp_fail : 1, send_fastclose : 1, + send_infinite_map : 1, rx_eof : 1, can_ack : 1, /* only after processing the remote a key */ disposable : 1, /* ctx can be free at ulp release time */ @@ -446,6 +468,7 @@ struct mptcp_subflow_context { local_id_valid : 1, /* local_id is correctly initialized */ valid_csum_seen : 1; /* at least one csum validated */ enum mptcp_data_avail data_avail; + bool mp_fail_response_expect; u32 remote_nonce; u64 thmac; u32 local_nonce; @@ -572,6 +595,7 @@ unsigned int mptcp_get_add_addr_timeout(const struct net *net); int mptcp_is_checksum_enabled(const struct net *net); int mptcp_allow_join_id0(const struct net *net); unsigned int mptcp_stale_loss_cnt(const struct net *net); +int mptcp_get_pm_type(const struct net *net); void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow, struct mptcp_options_received *mp_opt); bool __mptcp_retransmit_pending_data(struct sock *sk); @@ -587,6 +611,9 @@ void mptcp_subflow_reset(struct sock *ssk); void mptcp_sock_graft(struct sock *sk, struct socket *parent); struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk); +bool mptcp_addresses_equal(const struct mptcp_addr_info *a, + const struct mptcp_addr_info *b, bool use_port); + /* called with sk socket lock held */ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, const struct mptcp_addr_info *remote); @@ -622,19 +649,6 @@ static inline void mptcp_subflow_tcp_fallback(struct sock *sk, inet_csk(sk)->icsk_af_ops = ctx->icsk_af_ops; } -static inline bool mptcp_has_another_subflow(struct sock *ssk) -{ - struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk), *tmp; - struct mptcp_sock *msk = mptcp_sk(subflow->conn); - - mptcp_for_each_subflow(msk, tmp) { - if (tmp != subflow) - return true; - } - - return false; -} - void __init mptcp_proto_init(void); #if IS_ENABLED(CONFIG_MPTCP_IPV6) int __init mptcp_proto_v6_init(void); @@ -729,6 +743,11 @@ __sum16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __wsum su void __init mptcp_pm_init(void); void mptcp_pm_data_init(struct mptcp_sock *msk); void mptcp_pm_data_reset(struct mptcp_sock *msk); +int mptcp_pm_parse_addr(struct nlattr *attr, struct genl_info *info, + struct mptcp_addr_info *addr); +int mptcp_pm_parse_entry(struct nlattr *attr, struct genl_info *info, + bool require_family, + struct mptcp_pm_addr_entry *entry); void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk); void mptcp_pm_nl_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk); void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int server_side); @@ -739,7 +758,7 @@ void mptcp_pm_subflow_established(struct mptcp_sock *msk); bool mptcp_pm_nl_check_work_pending(struct mptcp_sock *msk); void mptcp_pm_subflow_check_next(struct mptcp_sock *msk, const struct sock *ssk, const struct mptcp_subflow_context *subflow); -void mptcp_pm_add_addr_received(struct mptcp_sock *msk, +void mptcp_pm_add_addr_received(const struct sock *ssk, const struct mptcp_addr_info *addr); void mptcp_pm_add_addr_echoed(struct mptcp_sock *msk, const struct mptcp_addr_info *addr); @@ -749,6 +768,8 @@ void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); void mptcp_pm_mp_prio_received(struct sock *sk, u8 bkup); void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq); +bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, + const struct mptcp_pm_addr_entry *entry); void mptcp_pm_free_anno_list(struct mptcp_sock *msk); bool mptcp_pm_sport_in_anno_list(struct mptcp_sock *msk, const struct sock *sk); struct mptcp_pm_add_entry * @@ -757,19 +778,34 @@ mptcp_pm_del_add_timer(struct mptcp_sock *msk, struct mptcp_pm_add_entry * mptcp_lookup_anno_list_by_saddr(const struct mptcp_sock *msk, const struct mptcp_addr_info *addr); -int mptcp_pm_get_flags_and_ifindex_by_id(struct net *net, unsigned int id, +int mptcp_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, + unsigned int id, u8 *flags, int *ifindex); +int mptcp_userspace_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, + unsigned int id, + u8 *flags, int *ifindex); int mptcp_pm_announce_addr(struct mptcp_sock *msk, const struct mptcp_addr_info *addr, bool echo); int mptcp_pm_remove_addr(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); int mptcp_pm_remove_subflow(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); +void mptcp_pm_remove_addrs_and_subflows(struct mptcp_sock *msk, + struct list_head *rm_list); + +int mptcp_userspace_pm_append_new_local_addr(struct mptcp_sock *msk, + struct mptcp_pm_addr_entry *entry); +void mptcp_free_local_addr_list(struct mptcp_sock *msk); +int mptcp_nl_cmd_announce(struct sk_buff *skb, struct genl_info *info); +int mptcp_nl_cmd_remove(struct sk_buff *skb, struct genl_info *info); +int mptcp_nl_cmd_sf_create(struct sk_buff *skb, struct genl_info *info); +int mptcp_nl_cmd_sf_destroy(struct sk_buff *skb, struct genl_info *info); void mptcp_event(enum mptcp_event_type type, const struct mptcp_sock *msk, const struct sock *ssk, gfp_t gfp); -void mptcp_event_addr_announced(const struct mptcp_sock *msk, const struct mptcp_addr_info *info); +void mptcp_event_addr_announced(const struct sock *ssk, const struct mptcp_addr_info *info); void mptcp_event_addr_removed(const struct mptcp_sock *msk, u8 id); +bool mptcp_userspace_pm_active(const struct mptcp_sock *msk); static inline bool mptcp_pm_should_add_signal(struct mptcp_sock *msk) { @@ -792,6 +828,16 @@ static inline bool mptcp_pm_should_rm_signal(struct mptcp_sock *msk) return READ_ONCE(msk->pm.addr_signal) & BIT(MPTCP_RM_ADDR_SIGNAL); } +static inline bool mptcp_pm_is_userspace(const struct mptcp_sock *msk) +{ + return READ_ONCE(msk->pm.pm_type) == MPTCP_PM_TYPE_USERSPACE; +} + +static inline bool mptcp_pm_is_kernel(const struct mptcp_sock *msk) +{ + return READ_ONCE(msk->pm.pm_type) == MPTCP_PM_TYPE_KERNEL; +} + static inline unsigned int mptcp_add_addr_len(int family, bool echo, bool port) { u8 len = TCPOLEN_MPTCP_ADD_ADDR_BASE; @@ -822,9 +868,9 @@ bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, const struct sk_buff *skb, bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining, struct mptcp_rm_list *rm_list); int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc); +int mptcp_userspace_pm_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc); void __init mptcp_pm_nl_init(void); -void mptcp_pm_nl_data_init(struct mptcp_sock *msk); void mptcp_pm_nl_work(struct mptcp_sock *msk); void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); @@ -890,13 +936,28 @@ static inline void mptcp_do_fallback(struct sock *sk) #define pr_fallback(a) pr_debug("%s:fallback to TCP (msk=%p)", __func__, a) +static inline bool mptcp_check_infinite_map(struct sk_buff *skb) +{ + struct mptcp_ext *mpext; + + mpext = skb ? mptcp_get_ext(skb) : NULL; + if (mpext && mpext->infinite_map) + return true; + + return false; +} + +static inline bool is_active_ssk(struct mptcp_subflow_context *subflow) +{ + return (subflow->request_mptcp || subflow->request_join); +} + static inline bool subflow_simultaneous_connect(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); - struct sock *parent = subflow->conn; return sk->sk_state == TCP_ESTABLISHED && - !mptcp_sk(parent)->pm.server_side && + is_active_ssk(subflow) && !subflow->conn_finished; } diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index f949d22f52bd..423d3826ca1e 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -756,6 +756,18 @@ static int mptcp_setsockopt_v4(struct mptcp_sock *msk, int optname, return -EOPNOTSUPP; } +static int mptcp_setsockopt_sol_tcp_defer(struct mptcp_sock *msk, sockptr_t optval, + unsigned int optlen) +{ + struct socket *listener; + + listener = __mptcp_nmpc_socket(msk); + if (!listener) + return 0; /* TCP_DEFER_ACCEPT does not fail */ + + return tcp_setsockopt(listener->sk, SOL_TCP, TCP_DEFER_ACCEPT, optval, optlen); +} + static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname, sockptr_t optval, unsigned int optlen) { @@ -782,6 +794,8 @@ static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname, return mptcp_setsockopt_sol_tcp_cork(msk, optval, optlen); case TCP_NODELAY: return mptcp_setsockopt_sol_tcp_nodelay(msk, optval, optlen); + case TCP_DEFER_ACCEPT: + return mptcp_setsockopt_sol_tcp_defer(msk, optval, optlen); } return -EOPNOTSUPP; @@ -853,15 +867,11 @@ out: void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) { - struct sock *sk = &msk->sk.icsk_inet.sk; u32 flags = 0; - bool slow; u8 val; memset(info, 0, sizeof(*info)); - slow = lock_sock_fast(sk); - info->mptcpi_subflows = READ_ONCE(msk->pm.subflows); info->mptcpi_add_addr_signal = READ_ONCE(msk->pm.add_addr_signaled); info->mptcpi_add_addr_accepted = READ_ONCE(msk->pm.add_addr_accepted); @@ -882,8 +892,6 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) info->mptcpi_snd_una = READ_ONCE(msk->snd_una); info->mptcpi_rcv_nxt = READ_ONCE(msk->ack_seq); info->mptcpi_csum_enabled = READ_ONCE(msk->csum_enabled); - - unlock_sock_fast(sk, slow); } EXPORT_SYMBOL_GPL(mptcp_diag_fill_info); @@ -1148,6 +1156,7 @@ static int mptcp_getsockopt_sol_tcp(struct mptcp_sock *msk, int optname, case TCP_CONGESTION: case TCP_INFO: case TCP_CC_INFO: + case TCP_DEFER_ACCEPT: return mptcp_getsockopt_first_sf_only(msk, SOL_TCP, optname, optval, optlen); case TCP_INQ: diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index be76ada89d96..8841e8cd9ad8 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -62,7 +62,9 @@ static void subflow_generate_hmac(u64 key1, u64 key2, u32 nonce1, u32 nonce2, static bool mptcp_can_accept_new_subflow(const struct mptcp_sock *msk) { return mptcp_is_fully_established((void *)msk) && - READ_ONCE(msk->pm.accept_subflow); + ((mptcp_pm_is_userspace(msk) && + mptcp_userspace_pm_active(msk)) || + READ_ONCE(msk->pm.accept_subflow)); } /* validate received token and create truncated hmac and nonce for SYN-ACK */ @@ -441,6 +443,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) subflow->backup = mp_opt.backup; subflow->thmac = mp_opt.thmac; subflow->remote_nonce = mp_opt.nonce; + subflow->remote_id = mp_opt.join_id; pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u backup=%d", subflow, subflow->thmac, subflow->remote_nonce, subflow->backup); @@ -971,6 +974,7 @@ static enum mapping_status get_mapping_status(struct sock *ssk, { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); bool csum_reqd = READ_ONCE(msk->csum_enabled); + struct sock *sk = (struct sock *)msk; struct mptcp_ext *mpext; struct sk_buff *skb; u16 data_len; @@ -1009,7 +1013,12 @@ static enum mapping_status get_mapping_status(struct sock *ssk, data_len = mpext->data_len; if (data_len == 0) { + pr_debug("infinite mapping received"); MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX); + subflow->map_data_len = 0; + if (!sock_flag(ssk, SOCK_DEAD)) + sk_stop_timer(sk, &sk->sk_timer); + return MAPPING_INVALID; } @@ -1218,35 +1227,43 @@ no_data: return false; fallback: - /* RFC 8684 section 3.7. */ - if (subflow->send_mp_fail) { - if (mptcp_has_another_subflow(ssk)) { - while ((skb = skb_peek(&ssk->sk_receive_queue))) - sk_eat_skb(ssk, skb); + if (!__mptcp_check_fallback(msk)) { + /* RFC 8684 section 3.7. */ + if (subflow->send_mp_fail) { + if (!READ_ONCE(msk->allow_infinite_fallback)) { + ssk->sk_err = EBADMSG; + tcp_set_state(ssk, TCP_CLOSE); + subflow->reset_transient = 0; + subflow->reset_reason = MPTCP_RST_EMIDDLEBOX; + tcp_send_active_reset(ssk, GFP_ATOMIC); + while ((skb = skb_peek(&ssk->sk_receive_queue))) + sk_eat_skb(ssk, skb); + } else if (!sock_flag(ssk, SOCK_DEAD)) { + WRITE_ONCE(subflow->mp_fail_response_expect, true); + sk_reset_timer((struct sock *)msk, + &((struct sock *)msk)->sk_timer, + jiffies + TCP_RTO_MAX); + } + WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA); + return true; } - ssk->sk_err = EBADMSG; - tcp_set_state(ssk, TCP_CLOSE); - subflow->reset_transient = 0; - subflow->reset_reason = MPTCP_RST_EMIDDLEBOX; - tcp_send_active_reset(ssk, GFP_ATOMIC); - WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA); - return true; - } - if (!subflow_can_fallback(subflow)) { - /* fatal protocol error, close the socket. - * subflow_error_report() will introduce the appropriate barriers - */ - ssk->sk_err = EBADMSG; - tcp_set_state(ssk, TCP_CLOSE); - subflow->reset_transient = 0; - subflow->reset_reason = MPTCP_RST_EMPTCP; - tcp_send_active_reset(ssk, GFP_ATOMIC); - WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA); - return false; + if (!subflow_can_fallback(subflow) && subflow->map_data_len) { + /* fatal protocol error, close the socket. + * subflow_error_report() will introduce the appropriate barriers + */ + ssk->sk_err = EBADMSG; + tcp_set_state(ssk, TCP_CLOSE); + subflow->reset_transient = 0; + subflow->reset_reason = MPTCP_RST_EMPTCP; + tcp_send_active_reset(ssk, GFP_ATOMIC); + WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA); + return false; + } + + __mptcp_do_fallback(msk); } - __mptcp_do_fallback(msk); skb = skb_peek(&ssk->sk_receive_queue); subflow->map_valid = 1; subflow->map_seq = READ_ONCE(msk->ack_seq); @@ -1461,7 +1478,7 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, if (local_id) subflow_set_local_id(subflow, local_id); - mptcp_pm_get_flags_and_ifindex_by_id(sock_net(sk), local_id, + mptcp_pm_get_flags_and_ifindex_by_id(msk, local_id, &flags, &ifindex); subflow->remote_key = msk->remote_key; subflow->local_key = msk->local_key; @@ -1498,6 +1515,7 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, /* discard the subflow socket */ mptcp_sock_graft(ssk, sk->sk_socket); iput(SOCK_INODE(sf)); + WRITE_ONCE(msk->allow_infinite_fallback, false); return err; failed_unlink: |