aboutsummaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
authorGravatar Jakub Kicinski <kuba@kernel.org> 2023-04-12 21:26:04 -0700
committerGravatar Jakub Kicinski <kuba@kernel.org> 2023-04-14 18:56:12 -0700
commit8c48eea3adf3119e0a3fc57bd31f6966f26ee784 (patch)
tree6948aaafccb06960f52ca48a671375e9d21724b6 /include
parentnet: skb: plumb napi state thru skb freeing paths (diff)
downloadlinux-8c48eea3adf3119e0a3fc57bd31f6966f26ee784.tar.gz
linux-8c48eea3adf3119e0a3fc57bd31f6966f26ee784.tar.bz2
linux-8c48eea3adf3119e0a3fc57bd31f6966f26ee784.zip
page_pool: allow caching from safely localized NAPI
Recent patches to mlx5 mentioned a regression when moving from driver local page pool to only using the generic page pool code. Page pool has two recycling paths (1) direct one, which runs in safe NAPI context (basically consumer context, so producing can be lockless); and (2) via a ptr_ring, which takes a spin lock because the freeing can happen from any CPU; producer and consumer may run concurrently. Since the page pool code was added, Eric introduced a revised version of deferred skb freeing. TCP skbs are now usually returned to the CPU which allocated them, and freed in softirq context. This places the freeing (producing of pages back to the pool) enticingly close to the allocation (consumer). If we can prove that we're freeing in the same softirq context in which the consumer NAPI will run - lockless use of the cache is perfectly fine, no need for the lock. Let drivers link the page pool to a NAPI instance. If the NAPI instance is scheduled on the same CPU on which we're freeing - place the pages in the direct cache. With that and patched bnxt (XDP enabled to engage the page pool, sigh, bnxt really needs page pool work :() I see a 2.6% perf boost with a TCP stream test (app on a different physical core than softirq). The CPU use of relevant functions decreases as expected: page_pool_refill_alloc_cache 1.17% -> 0% _raw_spin_lock 2.41% -> 0.98% Only consider lockless path to be safe when NAPI is scheduled - in practice this should cover majority if not all of steady state workloads. It's usually the NAPI kicking in that causes the skb flush. The main case we'll miss out on is when application runs on the same CPU as NAPI. In that case we don't use the deferred skb free path. Reviewed-by: Tariq Toukan <tariqt@nvidia.com> Acked-by: Jesper Dangaard Brouer <brouer@redhat.com> Tested-by: Dragos Tatulea <dtatulea@nvidia.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Diffstat (limited to 'include')
-rw-r--r--include/linux/netdevice.h3
-rw-r--r--include/linux/skbuff.h20
-rw-r--r--include/net/page_pool.h3
3 files changed, 18 insertions, 8 deletions
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 96d27d558b0c..203c0df2046c 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -360,8 +360,11 @@ struct napi_struct {
unsigned long gro_bitmask;
int (*poll)(struct napi_struct *, int);
#ifdef CONFIG_NETPOLL
+ /* CPU actively polling if netpoll is configured */
int poll_owner;
#endif
+ /* CPU on which NAPI has been scheduled for processing */
+ int list_owner;
struct net_device *dev;
struct gro_list gro_hash[GRO_HASH_BUCKETS];
struct sk_buff *skb;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 494a23a976b0..a823ec3aa326 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3386,6 +3386,18 @@ static inline void skb_frag_ref(struct sk_buff *skb, int f)
__skb_frag_ref(&skb_shinfo(skb)->frags[f]);
}
+static inline void
+napi_frag_unref(skb_frag_t *frag, bool recycle, bool napi_safe)
+{
+ struct page *page = skb_frag_page(frag);
+
+#ifdef CONFIG_PAGE_POOL
+ if (recycle && page_pool_return_skb_page(page, napi_safe))
+ return;
+#endif
+ put_page(page);
+}
+
/**
* __skb_frag_unref - release a reference on a paged fragment.
* @frag: the paged fragment
@@ -3396,13 +3408,7 @@ static inline void skb_frag_ref(struct sk_buff *skb, int f)
*/
static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle)
{
- struct page *page = skb_frag_page(frag);
-
-#ifdef CONFIG_PAGE_POOL
- if (recycle && page_pool_return_skb_page(page))
- return;
-#endif
- put_page(page);
+ napi_frag_unref(frag, recycle, false);
}
/**
diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index ddfa0b328677..91b808dade82 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -77,6 +77,7 @@ struct page_pool_params {
unsigned int pool_size;
int nid; /* Numa node id to allocate from pages from */
struct device *dev; /* device, for DMA pre-mapping purposes */
+ struct napi_struct *napi; /* Sole consumer of pages, otherwise NULL */
enum dma_data_direction dma_dir; /* DMA mapping direction */
unsigned int max_len; /* max DMA sync memory size */
unsigned int offset; /* DMA addr offset */
@@ -239,7 +240,7 @@ inline enum dma_data_direction page_pool_get_dma_dir(struct page_pool *pool)
return pool->p.dma_dir;
}
-bool page_pool_return_skb_page(struct page *page);
+bool page_pool_return_skb_page(struct page *page, bool napi_safe);
struct page_pool *page_pool_create(const struct page_pool_params *params);