author		Jakub Kicinski <kuba@kernel.org>	2023-02-07 18:20:02 -0800
committer	Jakub Kicinski <kuba@kernel.org>	2023-02-07 18:20:03 -0800
commit		cc74ca303a658dc4fb69e75df5f79d03d7a9b7e5 (patch)
tree		e446248b006239a53bc21ef49dc7c75262be8870
parent		Merge branch 'net-core-use-a-dedicated-kmem_cache-for-skb-head-allocs' (diff)
parent		lib/cpumask: update comment for cpumask_local_spread() (diff)
Merge branch 'sched-cpumask-improve-on-cpumask_local_spread-locality'
Yury Norov says:

====================
sched: cpumask: improve on cpumask_local_spread() locality

cpumask_local_spread() currently checks local node for presence of i'th
CPU, and then if it finds nothing makes a flat search among all non-local
CPUs. We can do it better by checking CPUs per NUMA hops.

This has significant performance implications on NUMA machines, for
example when using NUMA-aware allocated memory together with NUMA-aware
IRQ affinity hints.

Performance tests from patch 8 of this series for mellanox network
driver show:

TCP multi-stream, using 16 iperf3 instances pinned to 16 cores (with aRFS on).
Active cores: 64,65,72,73,80,81,88,89,96,97,104,105,112,113,120,121

+-------------------------+-----------+------------------+------------------+
|                         | BW (Gbps) | TX side CPU util | RX side CPU util |
+-------------------------+-----------+------------------+------------------+
| Baseline                | 52.3      | 6.4 %            | 17.9 %           |
+-------------------------+-----------+------------------+------------------+
| Applied on TX side only | 52.6      | 5.2 %            | 18.5 %           |
+-------------------------+-----------+------------------+------------------+
| Applied on RX side only | 94.9      | 11.9 %           | 27.2 %           |
+-------------------------+-----------+------------------+------------------+
| Applied on both sides   | 95.1      | 8.4 %            | 27.3 %           |
+-------------------------+-----------+------------------+------------------+

Bottleneck in RX side is released, reached linerate (~1.8x speedup).
~30% less cpu util on TX.
====================

Link: https://lore.kernel.org/r/20230121042436.2661843-1-yury.norov@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
-rw-r--r--	drivers/net/ethernet/mellanox/mlx5/core/eq.c	18
-rw-r--r--	include/linux/cpumask.h					20
-rw-r--r--	include/linux/find.h					33
-rw-r--r--	include/linux/topology.h				33
-rw-r--r--	kernel/sched/topology.c					90
-rw-r--r--	lib/cpumask.c						52
-rw-r--r--	lib/find_bit.c						 9
7 files changed, 230 insertions, 25 deletions
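At its core, the series replaces the flat search with a walk over CPUs hop by hop. A minimal sketch of the new usage pattern, assuming a hypothetical handle_cpu() callback and a caller-supplied NUMA node (this only illustrates the API introduced below, it is not code from the patches):

	#include <linux/cpumask.h>
	#include <linux/rcupdate.h>
	#include <linux/topology.h>

	/* Illustrative only: visit CPUs in order of increasing NUMA distance. */
	static void visit_cpus_by_numa_distance(int node)
	{
		const struct cpumask *mask, *prev = cpu_none_mask;
		int cpu;

		rcu_read_lock();
		for_each_numa_hop_mask(mask, node) {
			/* Only the CPUs this hop adds on top of the previous one. */
			for_each_cpu_andnot(cpu, mask, prev)
				handle_cpu(cpu);	/* hypothetical per-CPU work */
			prev = mask;
		}
		rcu_read_unlock();
	}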
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index 9b44557e7271..66ec7932f008 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -817,9 +817,12 @@ static void comp_irqs_release(struct mlx5_core_dev *dev)
static int comp_irqs_request(struct mlx5_core_dev *dev)
{
struct mlx5_eq_table *table = dev->priv.eq_table;
+ const struct cpumask *prev = cpu_none_mask;
+ const struct cpumask *mask;
int ncomp_eqs = table->num_comp_eqs;
u16 *cpus;
int ret;
+ int cpu;
int i;
ncomp_eqs = table->num_comp_eqs;
@@ -838,8 +841,19 @@ static int comp_irqs_request(struct mlx5_core_dev *dev)
ret = -ENOMEM;
goto free_irqs;
}
- for (i = 0; i < ncomp_eqs; i++)
- cpus[i] = cpumask_local_spread(i, dev->priv.numa_node);
+
+ i = 0;
+ rcu_read_lock();
+ for_each_numa_hop_mask(mask, dev->priv.numa_node) {
+ for_each_cpu_andnot(cpu, mask, prev) {
+ cpus[i] = cpu;
+ if (++i == ncomp_eqs)
+ goto spread_done;
+ }
+ prev = mask;
+ }
+spread_done:
+ rcu_read_unlock();
ret = mlx5_irqs_request_vectors(dev, cpus, ncomp_eqs, table->comp_irqs);
kfree(cpus);
if (ret < 0)
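Since the series also teaches cpumask_local_spread() itself to follow NUMA hops (see the lib/cpumask.c hunk below), the per-index form remains a correct, though slower, alternative to the loop above. A sketch, with 'node' standing in for dev->priv.numa_node:

	/* Illustrative only: same spread, one cpumask_local_spread() call per vector. */
	static void fill_cpus_flat(u16 *cpus, int ncomp_eqs, int node)
	{
		int i;

		for (i = 0; i < ncomp_eqs; i++)
			cpus[i] = cpumask_local_spread(i, node);
	}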
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index c2aa0aa26b45..7b16aede7ac5 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -391,6 +391,26 @@ unsigned int cpumask_nth_andnot(unsigned int cpu, const struct cpumask *srcp1,
nr_cpumask_bits, cpumask_check(cpu));
}
+/**
+ * cpumask_nth_and_andnot - get the Nth cpu set in 1st and 2nd cpumask, and clear in 3rd.
+ * @srcp1: the first cpumask to search
+ * @srcp2: the second cpumask to search
+ * @srcp3: the cpumask whose set bits are excluded
+ * @cpu: the N'th cpu to find, starting from 0
+ *
+ * Returns >= nr_cpu_ids if no such cpu exists.
+ */
+static __always_inline
+unsigned int cpumask_nth_and_andnot(unsigned int cpu, const struct cpumask *srcp1,
+ const struct cpumask *srcp2,
+ const struct cpumask *srcp3)
+{
+ return find_nth_and_andnot_bit(cpumask_bits(srcp1),
+ cpumask_bits(srcp2),
+ cpumask_bits(srcp3),
+ nr_cpumask_bits, cpumask_check(cpu));
+}
+
#define CPU_BITS_NONE \
{ \
[0 ... BITS_TO_LONGS(NR_CPUS)-1] = 0UL \
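A minimal sketch of what the new helper computes, assuming two hypothetical masks 'wider' and 'narrower' (for instance, the cpumasks of two consecutive NUMA hops):

	#include <linux/cpumask.h>
	#include <linux/errno.h>

	/* Illustrative only: Nth online CPU that 'wider' adds on top of 'narrower'. */
	static int nth_cpu_added_at_hop(unsigned int n, const struct cpumask *wider,
					const struct cpumask *narrower)
	{
		/* Nth online CPU set in 'wider' but clear in 'narrower'. */
		unsigned int cpu = cpumask_nth_and_andnot(n, cpu_online_mask, wider, narrower);

		return cpu < nr_cpu_ids ? cpu : -ENOENT;
	}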
diff --git a/include/linux/find.h b/include/linux/find.h
index ccaf61a0f5fd..4647864a5ffd 100644
--- a/include/linux/find.h
+++ b/include/linux/find.h
@@ -22,6 +22,9 @@ unsigned long __find_nth_and_bit(const unsigned long *addr1, const unsigned long
unsigned long size, unsigned long n);
unsigned long __find_nth_andnot_bit(const unsigned long *addr1, const unsigned long *addr2,
unsigned long size, unsigned long n);
+unsigned long __find_nth_and_andnot_bit(const unsigned long *addr1, const unsigned long *addr2,
+ const unsigned long *addr3, unsigned long size,
+ unsigned long n);
extern unsigned long _find_first_and_bit(const unsigned long *addr1,
const unsigned long *addr2, unsigned long size);
extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size);
@@ -255,6 +258,36 @@ unsigned long find_nth_andnot_bit(const unsigned long *addr1, const unsigned lon
return __find_nth_andnot_bit(addr1, addr2, size, n);
}
+/**
+ * find_nth_and_andnot_bit - find N'th set bit in 2 memory regions,
+ * excluding those set in 3rd region
+ * @addr1: The 1st address to start the search at
+ * @addr2: The 2nd address to start the search at
+ * @addr3: The 3rd address to start the search at
+ * @size: The maximum number of bits to search
+ * @n: The index of the set bit whose position is needed, counting from 0
+ *
+ * Returns the bit number of the N'th set bit.
+ * If no such bit exists, returns @size.
+ */
+static __always_inline
+unsigned long find_nth_and_andnot_bit(const unsigned long *addr1,
+ const unsigned long *addr2,
+ const unsigned long *addr3,
+ unsigned long size, unsigned long n)
+{
+ if (n >= size)
+ return size;
+
+ if (small_const_nbits(size)) {
+ unsigned long val = *addr1 & *addr2 & (~*addr3) & GENMASK(size - 1, 0);
+
+ return val ? fns(val, n) : size;
+ }
+
+ return __find_nth_and_andnot_bit(addr1, addr2, addr3, size, n);
+}
+
#ifndef find_first_and_bit
/**
* find_first_and_bit - find the first set bit in both memory regions
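The bitmap-level behaviour can be pinned down with a tiny example; the bitmap values below are made up purely for illustration:

	#include <linux/bitmap.h>

	static void find_nth_and_andnot_bit_demo(void)
	{
		DECLARE_BITMAP(a, 8) = { 0x53 };	/* bits 0, 1, 4, 6 */
		DECLARE_BITMAP(b, 8) = { 0xf3 };	/* bits 0, 1, 4, 5, 6, 7 */
		DECLARE_BITMAP(c, 8) = { 0x01 };	/* bit 0 */
		unsigned long bit;

		/* Bits set in both 'a' and 'b' but clear in 'c': 1, 4 and 6. */
		bit = find_nth_and_andnot_bit(a, b, c, 8, 1);	/* second such bit: 4 */
		bit = find_nth_and_andnot_bit(a, b, c, 8, 3);	/* only 3 exist: returns 8 */
		(void)bit;
	}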
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 4564faafd0e1..fea32377f7c7 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -245,5 +245,38 @@ static inline const struct cpumask *cpu_cpu_mask(int cpu)
return cpumask_of_node(cpu_to_node(cpu));
}
+#ifdef CONFIG_NUMA
+int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node);
+extern const struct cpumask *sched_numa_hop_mask(unsigned int node, unsigned int hops);
+#else
+static __always_inline int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
+{
+ return cpumask_nth(cpu, cpus);
+}
+
+static inline const struct cpumask *
+sched_numa_hop_mask(unsigned int node, unsigned int hops)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
+#endif /* CONFIG_NUMA */
+
+/**
+ * for_each_numa_hop_mask - iterate over cpumasks of increasing NUMA distance
+ * from a given node.
+ * @mask: the iteration variable.
+ * @node: the NUMA node to start the search from.
+ *
+ * Requires rcu_read_lock() to be held.
+ *
+ * Yields cpu_online_mask for @node == NUMA_NO_NODE.
+ */
+#define for_each_numa_hop_mask(mask, node) \
+ for (unsigned int __hops = 0; \
+ mask = (node != NUMA_NO_NODE || __hops) ? \
+ sched_numa_hop_mask(node, __hops) : \
+ cpu_online_mask, \
+ !IS_ERR_OR_NULL(mask); \
+ __hops++)
#endif /* _LINUX_TOPOLOGY_H */
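As a further illustration of the iterator's contract (RCU read lock held, iteration stops once sched_numa_hop_mask() returns an error pointer), a hypothetical helper that merely counts the hop levels visible from a node could look like this:

	static unsigned int count_numa_hops(int node)
	{
		const struct cpumask *mask;
		unsigned int hops = 0;

		rcu_read_lock();
		for_each_numa_hop_mask(mask, node)
			hops++;
		rcu_read_unlock();

		return hops;
	}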
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 8739c2a5a54e..1233affc106c 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -3,6 +3,8 @@
* Scheduler topology setup/handling methods
*/
+#include <linux/bsearch.h>
+
DEFINE_MUTEX(sched_domains_mutex);
/* Protected by sched_domains_mutex: */
@@ -2067,6 +2069,94 @@ unlock:
return found;
}
+struct __cmp_key {
+ const struct cpumask *cpus;
+ struct cpumask ***masks;
+ int node;
+ int cpu;
+ int w;
+};
+
+static int hop_cmp(const void *a, const void *b)
+{
+ struct cpumask **prev_hop = *((struct cpumask ***)b - 1);
+ struct cpumask **cur_hop = *(struct cpumask ***)b;
+ struct __cmp_key *k = (struct __cmp_key *)a;
+
+ if (cpumask_weight_and(k->cpus, cur_hop[k->node]) <= k->cpu)
+ return 1;
+
+ k->w = (b == k->masks) ? 0 : cpumask_weight_and(k->cpus, prev_hop[k->node]);
+ if (k->w <= k->cpu)
+ return 0;
+
+ return -1;
+}
+
+/*
+ * sched_numa_find_nth_cpu() - given the NUMA topology, find the Nth CPU in
+ * @cpus, in order of increasing NUMA distance from @node.
+ * @cpus: cpumask to pick a CPU from
+ * @cpu: index (N) of the CPU to find, counting from 0
+ * @node: the NUMA node to measure distance from
+ *
+ * Returns: the CPU number, or nr_cpu_ids when nothing is found.
+ */
+int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
+{
+ struct __cmp_key k = { .cpus = cpus, .node = node, .cpu = cpu };
+ struct cpumask ***hop_masks;
+ int hop, ret = nr_cpu_ids;
+
+ rcu_read_lock();
+
+ k.masks = rcu_dereference(sched_domains_numa_masks);
+ if (!k.masks)
+ goto unlock;
+
+ hop_masks = bsearch(&k, k.masks, sched_domains_numa_levels, sizeof(k.masks[0]), hop_cmp);
+ hop = hop_masks - k.masks;
+
+ ret = hop ?
+ cpumask_nth_and_andnot(cpu - k.w, cpus, k.masks[hop][node], k.masks[hop-1][node]) :
+ cpumask_nth_and(cpu, cpus, k.masks[0][node]);
+unlock:
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(sched_numa_find_nth_cpu);
+
+/**
+ * sched_numa_hop_mask() - Get the cpumask of CPUs at most @hops hops away from
+ * @node
+ * @node: The node to count hops from.
+ * @hops: Include CPUs up to that many hops away. 0 means local node.
+ *
+ * Return: On success, a pointer to a cpumask of CPUs at most @hops away from
+ * @node, an error value otherwise.
+ *
+ * Requires rcu_read_lock() to be held. The returned cpumask is only valid within that
+ * read-side section, copy it if required beyond that.
+ *
+ * Note that not all hops are equal in distance; see sched_init_numa() for how
+ * distances and masks are handled.
+ * Also note that this is a reflection of sched_domains_numa_masks, which may change
+ * during the lifetime of the system (offline nodes are taken out of the masks).
+ */
+const struct cpumask *sched_numa_hop_mask(unsigned int node, unsigned int hops)
+{
+ struct cpumask ***masks;
+
+ if (node >= nr_node_ids || hops >= sched_domains_numa_levels)
+ return ERR_PTR(-EINVAL);
+
+ masks = rcu_dereference(sched_domains_numa_masks);
+ if (!masks)
+ return ERR_PTR(-EBUSY);
+
+ return masks[hops][node];
+}
+EXPORT_SYMBOL_GPL(sched_numa_hop_mask);
+
#endif /* CONFIG_NUMA */
static int __sdt_alloc(const struct cpumask *cpu_map)
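The bsearch() in sched_numa_find_nth_cpu() relies on the hop masks being cumulative: masks[hop][node] covers every CPU within 'hop' hops, so the weight of (cpus & masks[hop][node]) never decreases as 'hop' grows, and hop_cmp() can report whether the requested index lies before, inside, or after a given hop. Written as a plain linear scan, the same computation looks roughly like the sketch below (illustrative only, mirroring the patched code rather than replacing it):

	static int nth_cpu_by_hop_linear(struct cpumask ***masks, int levels,
					 const struct cpumask *cpus, int cpu, int node)
	{
		int hop, prev_w = 0;

		for (hop = 0; hop < levels; hop++) {
			int w = cpumask_weight_and(cpus, masks[hop][node]);

			if (w > cpu)
				/* The answer lies among the CPUs this hop adds. */
				return hop ?
				       cpumask_nth_and_andnot(cpu - prev_w, cpus,
							      masks[hop][node],
							      masks[hop - 1][node]) :
				       cpumask_nth_and(cpu, cpus, masks[0][node]);
			prev_w = w;
		}

		return nr_cpu_ids;
	}

The binary search merely replaces this linear walk over hop levels with O(log(levels)) probes.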
diff --git a/lib/cpumask.c b/lib/cpumask.c
index c7c392514fd3..e7258836b60b 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -110,15 +110,33 @@ void __init free_bootmem_cpumask_var(cpumask_var_t mask)
#endif
/**
- * cpumask_local_spread - select the i'th cpu with local numa cpu's first
+ * cpumask_local_spread - select the i'th cpu based on NUMA distances
* @i: index number
* @node: local numa_node
*
- * This function selects an online CPU according to a numa aware policy;
- * local cpus are returned first, followed by non-local ones, then it
- * wraps around.
+ * Returns an online CPU according to a NUMA-aware policy; local CPUs are returned
+ * first, followed by non-local ones, then it wraps around.
*
- * It's not very efficient, but useful for setup.
+ * For those who want to enumerate all CPUs based on their NUMA distance,
+ * i.e. call this function in a loop, like:
+ *
+ * for (i = 0; i < num_online_cpus(); i++) {
+ * cpu = cpumask_local_spread(i, node);
+ * do_something(cpu);
+ * }
+ *
+ * There's a better alternative based on for_each()-like iterators:
+ *
+ * for_each_numa_hop_mask(mask, node) {
+ * for_each_cpu_andnot(cpu, mask, prev)
+ * do_something(cpu);
+ * prev = mask;
+ * }
+ *
+ * It's simpler and less verbose than the above. Complexity of iterator-based
+ * enumeration is O(sched_domains_numa_levels * nr_cpu_ids), while
+ * cpumask_local_spread() when called for each cpu is
+ * O(sched_domains_numa_levels * nr_cpu_ids * log(nr_cpu_ids)).
*/
unsigned int cpumask_local_spread(unsigned int i, int node)
{
@@ -127,24 +145,12 @@ unsigned int cpumask_local_spread(unsigned int i, int node)
/* Wrap: we always want a cpu. */
i %= num_online_cpus();
- if (node == NUMA_NO_NODE) {
- cpu = cpumask_nth(i, cpu_online_mask);
- if (cpu < nr_cpu_ids)
- return cpu;
- } else {
- /* NUMA first. */
- cpu = cpumask_nth_and(i, cpu_online_mask, cpumask_of_node(node));
- if (cpu < nr_cpu_ids)
- return cpu;
-
- i -= cpumask_weight_and(cpu_online_mask, cpumask_of_node(node));
-
- /* Skip NUMA nodes, done above. */
- cpu = cpumask_nth_andnot(i, cpu_online_mask, cpumask_of_node(node));
- if (cpu < nr_cpu_ids)
- return cpu;
- }
- BUG();
+ cpu = (node == NUMA_NO_NODE) ?
+ cpumask_nth(i, cpu_online_mask) :
+ sched_numa_find_nth_cpu(cpu_online_mask, i, node);
+
+ WARN_ON(cpu >= nr_cpu_ids);
+ return cpu;
}
EXPORT_SYMBOL(cpumask_local_spread);
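For completeness, the per-index calling pattern that the new kernel-doc text compares against, one cpumask_local_spread() lookup per queue or vector, is sketched here; 'nvec', 'node' and set_my_affinity_hint() are illustrative placeholders rather than real kernel APIs:

	#include <linux/cpumask.h>

	/* Illustrative only: one NUMA-aware lookup per vector. */
	static void spread_affinity_hints(unsigned int nvec, int node)
	{
		unsigned int i;

		for (i = 0; i < nvec; i++) {
			unsigned int cpu = cpumask_local_spread(i, node);

			set_my_affinity_hint(i, cpu);	/* hypothetical driver hook */
		}
	}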
diff --git a/lib/find_bit.c b/lib/find_bit.c
index 18bc0a7ac8ee..c10920e66788 100644
--- a/lib/find_bit.c
+++ b/lib/find_bit.c
@@ -155,6 +155,15 @@ unsigned long __find_nth_andnot_bit(const unsigned long *addr1, const unsigned l
}
EXPORT_SYMBOL(__find_nth_andnot_bit);
+unsigned long __find_nth_and_andnot_bit(const unsigned long *addr1,
+ const unsigned long *addr2,
+ const unsigned long *addr3,
+ unsigned long size, unsigned long n)
+{
+ return FIND_NTH_BIT(addr1[idx] & addr2[idx] & ~addr3[idx], size, n);
+}
+EXPORT_SYMBOL(__find_nth_and_andnot_bit);
+
#ifndef find_next_and_bit
unsigned long _find_next_and_bit(const unsigned long *addr1, const unsigned long *addr2,
unsigned long nbits, unsigned long start)