From 315ca6d98ed3fd7abe235637c28dd2f9f0a77795 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Wed, 16 Nov 2016 13:29:48 -0800 Subject: RDS: TCP: set RDS_FLAG_RETRANSMITTED in cp_retrans list As noted in rds_recv_incoming() sequence numbers on data packets can decrease for the failover case, and the Rx path is equipped to recover from this, if the RDS_FLAG_RETRANSMITTED is set on the rds header of an incoming message with a suspect sequence number. The RDS_FLAG_RETRANSMITTED is predicated on the RDS_MSG_RETRANSMITTED flag in the rds_message, so make sure the flag is set on messages queued for retransmission. Signed-off-by: Sowmini Varadhan Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/tcp_send.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c index 89d09b481f47..dcf4742083ea 100644 --- a/net/rds/tcp_send.c +++ b/net/rds/tcp_send.c @@ -100,6 +100,9 @@ int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, set_bit(RDS_MSG_HAS_ACK_SEQ, &rm->m_flags); tc->t_last_expected_una = rm->m_ack_seq + 1; + if (test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) + rm->m_inc.i_hdr.h_flags |= RDS_FLAG_RETRANSMITTED; + rdsdebug("rm %p tcp nxt %u ack_seq %llu\n", rm, rds_tcp_snd_nxt(tc), (unsigned long long)rm->m_ack_seq); -- cgit v1.2.3 From 905dd4184e0732de41d6ee3c7b06e0cfdd9f0aad Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Wed, 16 Nov 2016 13:29:49 -0800 Subject: RDS: TCP: Track peer's connection generation number The RDS transport has to be able to distinguish between two types of failure events: (a) when the transport fails (e.g., TCP connection reset) but the RDS socket/connection layer on both sides stays the same (b) when the peer's RDS layer itself resets (e.g., due to module reload or machine reboot at the peer) In case (a) both sides must reconnect and continue the RDS messaging without any message loss or disruption to the message sequence numbers, and this is achieved by 
rds_send_path_reset(). In case (b) we should reset all rds_connection state to the new incarnation of the peer. Examples of state that needs to be reset are next expected rx sequence number from, or messages to be retransmitted to, the new incarnation of the peer. To achieve this, the RDS handshake probe added as part of commit 5916e2c1554f ("RDS: TCP: Enable multipath RDS for TCP") is enhanced so that sender and receiver of the RDS ping-probe will add a generation number as part of the RDS_EXTHDR_GEN_NUM extension header. Each peer stores local and remote generation numbers as part of each rds_connection. Changes in generation number will be detected via incoming handshake probe ping request or response and will allow the receiver to reset rds_connection state. Signed-off-by: Sowmini Varadhan Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/af_rds.c | 4 ++++ net/rds/connection.c | 2 ++ net/rds/message.c | 1 + net/rds/rds.h | 8 +++++++- net/rds/recv.c | 36 ++++++++++++++++++++++++++++++++++++ net/rds/send.c | 9 +++++++-- 6 files changed, 57 insertions(+), 3 deletions(-) diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index 6beaeb1138f3..2ac1e6194be3 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -605,10 +605,14 @@ static void rds_exit(void) } module_exit(rds_exit); +u32 rds_gen_num; + static int rds_init(void) { int ret; + net_get_random_once(&rds_gen_num, sizeof(rds_gen_num)); + ret = rds_bind_lock_init(); if (ret) goto out; diff --git a/net/rds/connection.c b/net/rds/connection.c index 13f459dad4ef..b86e188bde32 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -269,6 +269,8 @@ static struct rds_connection *__rds_conn_create(struct net *net, kmem_cache_free(rds_conn_slab, conn); conn = found; } else { + conn->c_my_gen_num = rds_gen_num; + conn->c_peer_gen_num = 0; hlist_add_head_rcu(&conn->c_hash_node, head); rds_cong_add_conn(conn); rds_conn_count++; diff --git a/net/rds/message.c b/net/rds/message.c index 
6cb91061556a..49bfb512d808 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -42,6 +42,7 @@ static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = { [RDS_EXTHDR_RDMA] = sizeof(struct rds_ext_header_rdma), [RDS_EXTHDR_RDMA_DEST] = sizeof(struct rds_ext_header_rdma_dest), [RDS_EXTHDR_NPATHS] = sizeof(u16), +[RDS_EXTHDR_GEN_NUM] = sizeof(u32), }; diff --git a/net/rds/rds.h b/net/rds/rds.h index 4121e1862444..ebbf909b87ec 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -151,6 +151,9 @@ struct rds_connection { struct rds_conn_path c_path[RDS_MPATH_WORKERS]; wait_queue_head_t c_hs_waitq; /* handshake waitq */ + + u32 c_my_gen_num; + u32 c_peer_gen_num; }; static inline @@ -243,7 +246,8 @@ struct rds_ext_header_rdma_dest { /* Extension header announcing number of paths. * Implicit length = 2 bytes. */ -#define RDS_EXTHDR_NPATHS 4 +#define RDS_EXTHDR_NPATHS 5 +#define RDS_EXTHDR_GEN_NUM 6 #define __RDS_EXTHDR_MAX 16 /* for now */ @@ -338,6 +342,7 @@ static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie) #define RDS_MSG_RETRANSMITTED 5 #define RDS_MSG_MAPPED 6 #define RDS_MSG_PAGEVEC 7 +#define RDS_MSG_FLUSH 8 struct rds_message { atomic_t m_refcount; @@ -664,6 +669,7 @@ void rds_cong_exit(void); struct rds_message *rds_cong_update_alloc(struct rds_connection *conn); /* conn.c */ +extern u32 rds_gen_num; int rds_conn_init(void); void rds_conn_exit(void); struct rds_connection *rds_conn_create(struct net *net, diff --git a/net/rds/recv.c b/net/rds/recv.c index cbfabdf3ff48..9d0666e5fe35 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -120,6 +120,36 @@ static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk, /* do nothing if no change in cong state */ } +static void rds_conn_peer_gen_update(struct rds_connection *conn, + u32 peer_gen_num) +{ + int i; + struct rds_message *rm, *tmp; + unsigned long flags; + + WARN_ON(conn->c_trans->t_type != RDS_TRANS_TCP); + if (peer_gen_num != 0) { + if (conn->c_peer_gen_num != 0 && + peer_gen_num != 
conn->c_peer_gen_num) { + for (i = 0; i < RDS_MPATH_WORKERS; i++) { + struct rds_conn_path *cp; + + cp = &conn->c_path[i]; + spin_lock_irqsave(&cp->cp_lock, flags); + cp->cp_next_tx_seq = 1; + cp->cp_next_rx_seq = 0; + list_for_each_entry_safe(rm, tmp, + &cp->cp_retrans, + m_conn_item) { + set_bit(RDS_MSG_FLUSH, &rm->m_flags); + } + spin_unlock_irqrestore(&cp->cp_lock, flags); + } + } + conn->c_peer_gen_num = peer_gen_num; + } +} + /* * Process all extension headers that come with this message. */ @@ -163,7 +193,9 @@ static void rds_recv_hs_exthdrs(struct rds_header *hdr, union { struct rds_ext_header_version version; u16 rds_npaths; + u32 rds_gen_num; } buffer; + u32 new_peer_gen_num = 0; while (1) { len = sizeof(buffer); @@ -176,6 +208,9 @@ static void rds_recv_hs_exthdrs(struct rds_header *hdr, conn->c_npaths = min_t(int, RDS_MPATH_WORKERS, buffer.rds_npaths); break; + case RDS_EXTHDR_GEN_NUM: + new_peer_gen_num = buffer.rds_gen_num; + break; default: pr_warn_ratelimited("ignoring unknown exthdr type " "0x%x\n", type); @@ -183,6 +218,7 @@ static void rds_recv_hs_exthdrs(struct rds_header *hdr, } /* if RDS_EXTHDR_NPATHS was not found, default to a single-path */ conn->c_npaths = max_t(int, conn->c_npaths, 1); + rds_conn_peer_gen_update(conn, new_peer_gen_num); } /* rds_start_mprds() will synchronously start multiple paths when appropriate. diff --git a/net/rds/send.c b/net/rds/send.c index 896626b9a0ef..77c8c6e613ad 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -259,8 +259,9 @@ restart: * connection. * Therefore, we never retransmit messages with RDMA ops. 
*/ - if (rm->rdma.op_active && - test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) { + if (test_bit(RDS_MSG_FLUSH, &rm->m_flags) || + (rm->rdma.op_active && + test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags))) { spin_lock_irqsave(&cp->cp_lock, flags); if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) list_move(&rm->m_conn_item, &to_be_dropped); @@ -1209,6 +1210,10 @@ rds_send_probe(struct rds_conn_path *cp, __be16 sport, rds_message_add_extension(&rm->m_inc.i_hdr, RDS_EXTHDR_NPATHS, &npaths, sizeof(npaths)); + rds_message_add_extension(&rm->m_inc.i_hdr, + RDS_EXTHDR_GEN_NUM, + &cp->cp_conn->c_my_gen_num, + sizeof(u32)); } spin_unlock_irqrestore(&cp->cp_lock, flags); -- cgit v1.2.3 From 1a0e100fb2c9667cea2a7d755faaa83569942f05 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Wed, 16 Nov 2016 13:29:50 -0800 Subject: RDS: TCP: Force every connection to be initiated by numerically smaller IP address When 2 RDS peers initiate an RDS-TCP connection simultaneously, there is a potential for "duelling syns" on either/both sides. See commit 241b271952eb ("RDS-TCP: Reset tcp callbacks if re-using an outgoing socket in rds_tcp_accept_one()") for a description of this condition, and the arbitration logic which ensures that the numerically larger IP address in the TCP connection is bound to the RDS_TCP_PORT ("canonical ordering"). The rds_connection should not be marked as RDS_CONN_UP until the arbitration logic has converged for the following reason. The sender may start transmitting RDS datagrams as soon as RDS_CONN_UP is set, and the sender removes all datagrams from the rds_connection's cp_retrans queue based on TCP acks. If the TCP ack was sent from a tcp socket that got reset as part of duel arbitration (but before data was delivered to the receiver's RDS socket layer), the sender may end up prematurely freeing the datagram, and the datagram is no longer reliably deliverable. 
This patch remedies that condition by making sure that, upon receipt of 3WH completion state change notification of TCP_ESTABLISHED in rds_tcp_state_change, we mark the rds_connection as RDS_CONN_UP if, and only if, the IP addresses and ports for the connection are canonically ordered. In all other cases, rds_tcp_state_change will force an rds_conn_path_drop(), and rds_queue_reconnect() on both peers will restart the connection to ensure canonical ordering. A side-effect of enforcing this condition in rds_tcp_state_change() is that rds_tcp_accept_one_path() can now be refactored for simplicity. It is also no longer possible to encounter an RDS_CONN_UP connection in the arbitration logic in rds_tcp_accept_one(). Signed-off-by: Sowmini Varadhan Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/connection.c | 1 + net/rds/tcp_connect.c | 14 +++++++++++++- net/rds/tcp_listen.c | 29 ++++++++++++----------------- 3 files changed, 26 insertions(+), 18 deletions(-) diff --git a/net/rds/connection.c b/net/rds/connection.c index b86e188bde32..fe9d31c0b22d 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -683,6 +683,7 @@ void rds_conn_path_connect_if_down(struct rds_conn_path *cp) !test_and_set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags)) queue_delayed_work(rds_wq, &cp->cp_conn_w, 0); } +EXPORT_SYMBOL_GPL(rds_conn_path_connect_if_down); void rds_conn_connect_if_down(struct rds_connection *conn) { diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index 05f61c533ed3..d6839d96d539 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -60,7 +60,19 @@ void rds_tcp_state_change(struct sock *sk) case TCP_SYN_RECV: break; case TCP_ESTABLISHED: - rds_connect_path_complete(cp, RDS_CONN_CONNECTING); + /* Force the peer to reconnect so that we have the + * TCP ports going from <smaller-ip>.<transient> to + * <larger-ip>.<RDS_TCP_PORT>. We avoid marking the + * RDS connection as RDS_CONN_UP until the reconnect, + * to avoid RDS datagram loss. 
+ */ + if (cp->cp_conn->c_laddr > cp->cp_conn->c_faddr && + rds_conn_path_transition(cp, RDS_CONN_CONNECTING, + RDS_CONN_ERROR)) { + rds_conn_path_drop(cp); + } else { + rds_connect_path_complete(cp, RDS_CONN_CONNECTING); + } break; case TCP_CLOSE_WAIT: case TCP_CLOSE: diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index c9c496844cd7..f74bab3ecdca 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -83,25 +83,20 @@ struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn) { int i; bool peer_is_smaller = (conn->c_faddr < conn->c_laddr); - int npaths = conn->c_npaths; - - if (npaths <= 1) { - struct rds_conn_path *cp = &conn->c_path[0]; - int ret; - - ret = rds_conn_path_transition(cp, RDS_CONN_DOWN, - RDS_CONN_CONNECTING); - if (!ret) - rds_conn_path_transition(cp, RDS_CONN_ERROR, - RDS_CONN_CONNECTING); - return cp->cp_transport_data; - } + int npaths = max_t(int, 1, conn->c_npaths); - /* for mprds, paths with cp_index > 0 MUST be initiated by the peer + /* for mprds, all paths MUST be initiated by the peer * with the smaller address. */ - if (!peer_is_smaller) + if (!peer_is_smaller) { + /* Make sure we initiate at least one path if this + * has not already been done; rds_start_mprds() will + * take care of additional paths, if necessary. + */ + if (npaths == 1) + rds_conn_path_connect_if_down(&conn->c_path[0]); return NULL; + } for (i = 0; i < npaths; i++) { struct rds_conn_path *cp = &conn->c_path[i]; @@ -171,8 +166,8 @@ int rds_tcp_accept_one(struct socket *sock) mutex_lock(&rs_tcp->t_conn_path_lock); cp = rs_tcp->t_cpath; conn_state = rds_conn_path_state(cp); - if (conn_state != RDS_CONN_CONNECTING && conn_state != RDS_CONN_UP && - conn_state != RDS_CONN_ERROR) + WARN_ON(conn_state == RDS_CONN_UP); + if (conn_state != RDS_CONN_CONNECTING && conn_state != RDS_CONN_ERROR) goto rst_nsk; if (rs_tcp->t_sock) { /* Need to resolve a duelling SYN between peers. -- cgit v1.2.3