aboutsummaryrefslogtreecommitdiff
path: root/net/openvswitch/datapath.c
diff options
context:
space:
mode:
authorGravatar Mark Gray <mark.d.gray@redhat.com> 2021-07-15 08:27:54 -0400
committerGravatar David S. Miller <davem@davemloft.net> 2021-07-16 11:06:33 -0700
commitb83d23a2a38b1770da0491257ae81d52307f7816 (patch)
treeaf242f1f73f6b6aa520dc393ec8db20cccd1cdfe /net/openvswitch/datapath.c
parentbnx2x: remove unused variable 'cur_data_offset' (diff)
downloadlinux-b83d23a2a38b1770da0491257ae81d52307f7816.tar.gz
linux-b83d23a2a38b1770da0491257ae81d52307f7816.tar.bz2
linux-b83d23a2a38b1770da0491257ae81d52307f7816.zip
openvswitch: Introduce per-cpu upcall dispatch
The Open vSwitch kernel module uses the upcall mechanism to send packets from kernel space to user space when it misses in the kernel space flow table. The upcall sends packets via a Netlink socket. Currently, a Netlink socket is created for every vport. In this way, there is a 1:1 mapping between a vport and a Netlink socket. When a packet is received by a vport, if it needs to be sent to user space, it is sent via the corresponding Netlink socket. This mechanism, with various iterations of the corresponding user space code, has seen some limitations and issues: * On systems with a large number of vports, there is a correspondingly large number of Netlink sockets which can limit scaling. (https://bugzilla.redhat.com/show_bug.cgi?id=1526306) * Packet reordering on upcalls. (https://bugzilla.redhat.com/show_bug.cgi?id=1844576) * A thundering herd issue. (https://bugzilla.redhat.com/show_bug.cgi?id=1834444) This patch introduces an alternative, feature-negotiated, upcall mode using a per-cpu dispatch rather than a per-vport dispatch. In this mode, the Netlink socket to be used for the upcall is selected based on the CPU of the thread that is executing the upcall. In this way, it resolves the issues above as: a) The number of Netlink sockets scales with the number of CPUs rather than the number of vports. b) Ordering per-flow is maintained as packets are distributed to CPUs based on mechanisms such as RSS and flows are distributed to a single user space thread. c) Packets from a flow can only wake up one user space thread. The corresponding user space code can be found at: https://mail.openvswitch.org/pipermail/ovs-dev/2021-July/385139.html Bugzilla: https://bugzilla.redhat.com/1844576 Signed-off-by: Mark Gray <mark.d.gray@redhat.com> Acked-by: Flavio Leitner <fbl@sysclose.org> Acked-by: Pravin B Shelar <pshelar@ovn.org> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/openvswitch/datapath.c')
-rw-r--r--net/openvswitch/datapath.c72
1 files changed, 70 insertions, 2 deletions
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index bc164b35e67d..7a4edafdc685 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -133,6 +133,8 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *,
static void ovs_dp_masks_rebalance(struct work_struct *work);
+static int ovs_dp_set_upcall_portids(struct datapath *, const struct nlattr *);
+
/* Must be called with rcu_read_lock or ovs_mutex. */
const char *ovs_dp_name(const struct datapath *dp)
{
@@ -166,6 +168,7 @@ static void destroy_dp_rcu(struct rcu_head *rcu)
free_percpu(dp->stats_percpu);
kfree(dp->ports);
ovs_meters_exit(dp);
+ kfree(dp->upcall_portids);
kfree(dp);
}
@@ -239,7 +242,12 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key)
memset(&upcall, 0, sizeof(upcall));
upcall.cmd = OVS_PACKET_CMD_MISS;
- upcall.portid = ovs_vport_find_upcall_portid(p, skb);
+
+ if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU)
+ upcall.portid = ovs_dp_get_upcall_portid(dp, smp_processor_id());
+ else
+ upcall.portid = ovs_vport_find_upcall_portid(p, skb);
+
upcall.mru = OVS_CB(skb)->mru;
error = ovs_dp_upcall(dp, skb, key, &upcall, 0);
if (unlikely(error))
@@ -1594,16 +1602,67 @@ static void ovs_dp_reset_user_features(struct sk_buff *skb,
DEFINE_STATIC_KEY_FALSE(tc_recirc_sharing_support);
+static int ovs_dp_set_upcall_portids(struct datapath *dp,
+ const struct nlattr *ids)
+{
+ struct dp_nlsk_pids *old, *dp_nlsk_pids;
+
+ if (!nla_len(ids) || nla_len(ids) % sizeof(u32))
+ return -EINVAL;
+
+ old = ovsl_dereference(dp->upcall_portids);
+
+ dp_nlsk_pids = kmalloc(sizeof(*dp_nlsk_pids) + nla_len(ids),
+ GFP_KERNEL);
+ if (!dp_nlsk_pids)
+ return -ENOMEM;
+
+ dp_nlsk_pids->n_pids = nla_len(ids) / sizeof(u32);
+ nla_memcpy(dp_nlsk_pids->pids, ids, nla_len(ids));
+
+ rcu_assign_pointer(dp->upcall_portids, dp_nlsk_pids);
+
+ kfree_rcu(old, rcu);
+
+ return 0;
+}
+
+u32 ovs_dp_get_upcall_portid(const struct datapath *dp, uint32_t cpu_id)
+{
+ struct dp_nlsk_pids *dp_nlsk_pids;
+
+ dp_nlsk_pids = rcu_dereference(dp->upcall_portids);
+
+ if (dp_nlsk_pids) {
+ if (cpu_id < dp_nlsk_pids->n_pids) {
+ return dp_nlsk_pids->pids[cpu_id];
+ } else if (dp_nlsk_pids->n_pids > 0 && cpu_id >= dp_nlsk_pids->n_pids) {
+ /* If the number of netlink PIDs is mismatched with the number of
+ * CPUs as seen by the kernel, log this and send the upcall to an
+ * arbitrary socket (0) in order to not drop packets
+ */
+ pr_info_ratelimited("cpu_id mismatch with handler threads");
+ return dp_nlsk_pids->pids[cpu_id % dp_nlsk_pids->n_pids];
+ } else {
+ return 0;
+ }
+ } else {
+ return 0;
+ }
+}
+
static int ovs_dp_change(struct datapath *dp, struct nlattr *a[])
{
u32 user_features = 0;
+ int err;
if (a[OVS_DP_ATTR_USER_FEATURES]) {
user_features = nla_get_u32(a[OVS_DP_ATTR_USER_FEATURES]);
if (user_features & ~(OVS_DP_F_VPORT_PIDS |
OVS_DP_F_UNALIGNED |
- OVS_DP_F_TC_RECIRC_SHARING))
+ OVS_DP_F_TC_RECIRC_SHARING |
+ OVS_DP_F_DISPATCH_UPCALL_PER_CPU))
return -EOPNOTSUPP;
#if !IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
@@ -1624,6 +1683,15 @@ static int ovs_dp_change(struct datapath *dp, struct nlattr *a[])
dp->user_features = user_features;
+ if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU &&
+ a[OVS_DP_ATTR_PER_CPU_PIDS]) {
+ /* Upcall Netlink Port IDs have been updated */
+ err = ovs_dp_set_upcall_portids(dp,
+ a[OVS_DP_ATTR_PER_CPU_PIDS]);
+ if (err)
+ return err;
+ }
+
if (dp->user_features & OVS_DP_F_TC_RECIRC_SHARING)
static_branch_enable(&tc_recirc_sharing_support);
else