cgroup/rstat: add cgroup_rstat_lock helpers and tracepoints

This commit enhances the ability to troubleshoot the global cgroup_rstat_lock by introducing wrapper helper functions for the lock along with associated tracepoints. Although global, the cgroup_rstat_lock helper APIs and tracepoints take arguments such as cgroup pointer and cpu_in_loop variable. This adjustment is made because flushing occurs per cgroup despite the lock being global. Hence, when troubleshooting, it's important to identify the relevant cgroup. The cpu_in_loop variable is necessary because the global lock may be released within the main flushing loop that traverses CPUs. In the tracepoints, the cpu_in_loop value is set to -1 when acquiring the main lock; otherwise, it denotes the CPU number processed last. The new feature in this patchset is detecting when lock is contended. The tracepoints are implemented with production in mind. For minimum overhead attach to cgroup:cgroup_rstat_lock_contended, which only gets activated when trylock detects lock is contended. A quick production check for issues could be done via this perf commands: perf record -g -e cgroup:cgroup_rstat_lock_contended Next natural question would be asking how long time do lock contenders wait for obtaining the lock. This can be answered by measuring the time between cgroup:cgroup_rstat_lock_contended and cgroup:cgroup_rstat_locked when args->contended is set. Like this bpftrace script: bpftrace -e ' tracepoint:cgroup:cgroup_rstat_lock_contended {@start[tid]=nsecs} tracepoint:cgroup:cgroup_rstat_locked { if (args->contended) { @wait_ns=hist(nsecs-@start[tid]); delete(@start[tid]);}} interval:s:1 {time("%H:%M:%S "); print(@wait_ns); }' Extending with time spend holding the lock will be more expensive as this also looks at all the non-contended cases. Like this bpftrace script: bpftrace -e ' tracepoint:cgroup:cgroup_rstat_lock_contended {@start[tid]=nsecs} tracepoint:cgroup:cgroup_rstat_locked { @locked[tid]=nsecs; if (args->contended) { @wait_ns=hist(nsecs-@start[tid]); delete(@start[tid]);}} tracepoint:cgroup:cgroup_rstat_unlock { @locked_ns=hist(nsecs-@locked[tid]); delete(@locked[tid]);} interval:s:1 {time("%H:%M:%S "); print(@wait_ns);print(@locked_ns); }' Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org> Signed-off-by: Tejun Heo <tj@kernel.org>
author: Jesper Dangaard Brouer <hawk@kernel.org> 2024-04-16 19:51:26 +0200
committer: Tejun Heo <tj@kernel.org> 2024-04-16 12:10:42 -1000
commit: fc29e04ae1ad4c99422c0b8ae4b43cfe99c70429 (patch)
tree: 0e253e08ac5881d265f82bf64faa2c8c5c1b558e /kernel/cgroup
parent: cgroup/pids: Remove superfluous zeroing (diff)
download: linux-fc29e04ae1ad4c99422c0b8ae4b43cfe99c70429.tar.gz
linux-fc29e04ae1ad4c99422c0b8ae4b43cfe99c70429.tar.bz2
linux-fc29e04ae1ad4c99422c0b8ae4b43cfe99c70429.zip
1 files changed, 39 insertions, 8 deletions
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 07e2284bb499..ff68c904e647 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -7,6 +7,8 @@
 #include <linux/btf.h>
 #include <linux/btf_ids.h>
 
+#include <trace/events/cgroup.h>
+
 static DEFINE_SPINLOCK(cgroup_rstat_lock);
 static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);
 
@@ -222,6 +224,35 @@ __weak noinline void bpf_rstat_flush(struct cgroup *cgrp,
 
 __bpf_hook_end();
 
+/*
+ * Helper functions for locking cgroup_rstat_lock.
+ *
+ * This makes it easier to diagnose locking issues and contention in
+ * production environments.  The parameter @cpu_in_loop indicate lock
+ * was released and re-taken when collection data from the CPUs. The
+ * value -1 is used when obtaining the main lock else this is the CPU
+ * number processed last.
+ */
+static inline void __cgroup_rstat_lock(struct cgroup *cgrp, int cpu_in_loop)
+	__acquires(&cgroup_rstat_lock)
+{
+	bool contended;
+
+	contended = !spin_trylock_irq(&cgroup_rstat_lock);
+	if (contended) {
+		trace_cgroup_rstat_lock_contended(cgrp, cpu_in_loop, contended);
+		spin_lock_irq(&cgroup_rstat_lock);
+	}
+	trace_cgroup_rstat_locked(cgrp, cpu_in_loop, contended);
+}
+
+static inline void __cgroup_rstat_unlock(struct cgroup *cgrp, int cpu_in_loop)
+	__releases(&cgroup_rstat_lock)
+{
+	trace_cgroup_rstat_unlock(cgrp, cpu_in_loop, false);
+	spin_unlock_irq(&cgroup_rstat_lock);
+}
+
 /* see cgroup_rstat_flush() */
 static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
 	__releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
@@ -248,10 +279,10 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
 
 		/* play nice and yield if necessary */
 		if (need_resched() || spin_needbreak(&cgroup_rstat_lock)) {
-			spin_unlock_irq(&cgroup_rstat_lock);
+			__cgroup_rstat_unlock(cgrp, cpu);
 			if (!cond_resched())
 				cpu_relax();
-			spin_lock_irq(&cgroup_rstat_lock);
+			__cgroup_rstat_lock(cgrp, cpu);
 		}
 	}
 }
@@ -273,9 +304,9 @@ __bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp)
 {
 	might_sleep();
 
-	spin_lock_irq(&cgroup_rstat_lock);
+	__cgroup_rstat_lock(cgrp, -1);
 	cgroup_rstat_flush_locked(cgrp);
-	spin_unlock_irq(&cgroup_rstat_lock);
+	__cgroup_rstat_unlock(cgrp, -1);
 }
 
 /**
@@ -291,17 +322,17 @@ void cgroup_rstat_flush_hold(struct cgroup *cgrp)
 	__acquires(&cgroup_rstat_lock)
 {
 	might_sleep();
-	spin_lock_irq(&cgroup_rstat_lock);
+	__cgroup_rstat_lock(cgrp, -1);
 	cgroup_rstat_flush_locked(cgrp);
 }
 
 /**
  * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold()
  */
-void cgroup_rstat_flush_release(void)
+void cgroup_rstat_flush_release(struct cgroup *cgrp)
 	__releases(&cgroup_rstat_lock)
 {
-	spin_unlock_irq(&cgroup_rstat_lock);
+	__cgroup_rstat_unlock(cgrp, -1);
 }
 
 int cgroup_rstat_init(struct cgroup *cgrp)
@@ -533,7 +564,7 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq)
 #ifdef CONFIG_SCHED_CORE
 		forceidle_time = cgrp->bstat.forceidle_sum;
 #endif
-		cgroup_rstat_flush_release();
+		cgroup_rstat_flush_release(cgrp);
 	} else {
 		root_cgroup_cputime(&bstat);
 		usage = bstat.cputime.sum_exec_runtime;
author	Jesper Dangaard Brouer <hawk@kernel.org>	2024-04-16 19:51:26 +0200
committer	Tejun Heo <tj@kernel.org>	2024-04-16 12:10:42 -1000
commit	fc29e04ae1ad4c99422c0b8ae4b43cfe99c70429 (patch)
tree	0e253e08ac5881d265f82bf64faa2c8c5c1b558e /kernel/cgroup
parent	cgroup/pids: Remove superfluous zeroing (diff)
download	linux-fc29e04ae1ad4c99422c0b8ae4b43cfe99c70429.tar.gz linux-fc29e04ae1ad4c99422c0b8ae4b43cfe99c70429.tar.bz2 linux-fc29e04ae1ad4c99422c0b8ae4b43cfe99c70429.zip