Diffstat (limited to 'kernel/cgroup/rstat.c')
-rw-r--r--  kernel/cgroup/rstat.c  118
1 file changed, 98 insertions(+), 20 deletions(-)
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 07e2284bb499..fb8b49437573 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -7,6 +7,8 @@
#include <linux/btf.h>
#include <linux/btf_ids.h>
+#include <trace/events/cgroup.h>
+
static DEFINE_SPINLOCK(cgroup_rstat_lock);
static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);
@@ -17,6 +19,60 @@ static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
return per_cpu_ptr(cgrp->rstat_cpu, cpu);
}
+/*
+ * Helper functions for rstat per CPU lock (cgroup_rstat_cpu_lock).
+ *
+ * This makes it easier to diagnose locking issues and contention in
+ * production environments. The parameter @fast_path determines which
+ * tracepoints are used, allowing "flush" related operations to be
+ * diagnosed without handling high-frequency fast-path "update" events.
+ */
+static __always_inline
+unsigned long _cgroup_rstat_cpu_lock(raw_spinlock_t *cpu_lock, int cpu,
+ struct cgroup *cgrp, const bool fast_path)
+{
+ unsigned long flags;
+ bool contended;
+
+ /*
+ * The _irqsave() is needed because cgroup_rstat_lock is
+ * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
+ * this lock with the _irq() suffix only disables interrupts on
+ * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
+ * interrupts on both configurations. The _irqsave() ensures
+ * that interrupts are always disabled and later restored.
+ */
+ contended = !raw_spin_trylock_irqsave(cpu_lock, flags);
+ if (contended) {
+ if (fast_path)
+ trace_cgroup_rstat_cpu_lock_contended_fastpath(cgrp, cpu, contended);
+ else
+ trace_cgroup_rstat_cpu_lock_contended(cgrp, cpu, contended);
+
+ raw_spin_lock_irqsave(cpu_lock, flags);
+ }
+
+ if (fast_path)
+ trace_cgroup_rstat_cpu_locked_fastpath(cgrp, cpu, contended);
+ else
+ trace_cgroup_rstat_cpu_locked(cgrp, cpu, contended);
+
+ return flags;
+}
+
+static __always_inline
+void _cgroup_rstat_cpu_unlock(raw_spinlock_t *cpu_lock, int cpu,
+ struct cgroup *cgrp, unsigned long flags,
+ const bool fast_path)
+{
+ if (fast_path)
+ trace_cgroup_rstat_cpu_unlock_fastpath(cgrp, cpu, false);
+ else
+ trace_cgroup_rstat_cpu_unlock(cgrp, cpu, false);
+
+ raw_spin_unlock_irqrestore(cpu_lock, flags);
+}
+
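The trace_cgroup_rstat_cpu_*() calls above rely on tracepoints declared in include/trace/events/cgroup.h, pulled in by the new include at the top of this patch. Below is a minimal sketch of how such an event class and one of its events might be declared, assuming a single class shared over (cgrp, cpu, contended); the exact fields and output format belong to the companion header change and may differ.

/*
 * Hypothetical sketch, not part of this diff: one event class shared by
 * the per-CPU lock events (and their _fastpath variants) as well as the
 * global cgroup_rstat_lock events used further down.
 */
DECLARE_EVENT_CLASS(cgroup_rstat,

	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),

	TP_ARGS(cgrp, cpu, contended),

	TP_STRUCT__entry(
		__field(int,	root)
		__field(int,	level)
		__field(u64,	id)
		__field(int,	cpu)
		__field(bool,	contended)
	),

	TP_fast_assign(
		__entry->root = cgrp->root->hierarchy_id;
		__entry->id = cgroup_id(cgrp);
		__entry->level = cgrp->level;
		__entry->cpu = cpu;
		__entry->contended = contended;
	),

	TP_printk("root=%d id=%llu level=%d cpu=%d lock contended:%d",
		  __entry->root, __entry->id, __entry->level,
		  __entry->cpu, __entry->contended)
);

/* Each named event referenced by the helpers would then be stamped out
 * from the class, e.g.: */
DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_lock_contended,

	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),

	TP_ARGS(cgrp, cpu, contended)
);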
/**
* cgroup_rstat_updated - keep track of updated rstat_cpu
* @cgrp: target cgroup
@@ -42,7 +98,7 @@ __bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next))
return;
- raw_spin_lock_irqsave(cpu_lock, flags);
+ flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, cgrp, true);
/* put @cgrp and all ancestors on the corresponding updated lists */
while (true) {
@@ -70,7 +126,7 @@ __bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
cgrp = parent;
}
- raw_spin_unlock_irqrestore(cpu_lock, flags);
+ _cgroup_rstat_cpu_unlock(cpu_lock, cpu, cgrp, flags, true);
}
/**
@@ -151,15 +207,7 @@ static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu)
struct cgroup *head = NULL, *parent, *child;
unsigned long flags;
- /*
- * The _irqsave() is needed because cgroup_rstat_lock is
- * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
- * this lock with the _irq() suffix only disables interrupts on
- * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
- * interrupts on both configurations. The _irqsave() ensures
- * that interrupts are always disabled and later restored.
- */
- raw_spin_lock_irqsave(cpu_lock, flags);
+ flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, root, false);
/* Return NULL if this subtree is not on-list */
if (!rstatc->updated_next)
@@ -196,7 +244,7 @@ static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu)
if (child != root)
head = cgroup_rstat_push_children(head, child, cpu);
unlock_ret:
- raw_spin_unlock_irqrestore(cpu_lock, flags);
+ _cgroup_rstat_cpu_unlock(cpu_lock, cpu, root, flags, false);
return head;
}
@@ -222,6 +270,35 @@ __weak noinline void bpf_rstat_flush(struct cgroup *cgrp,
__bpf_hook_end();
+/*
+ * Helper functions for locking cgroup_rstat_lock.
+ *
+ * This makes it easier to diagnose locking issues and contention in
+ * production environments. The parameter @cpu_in_loop indicates that
+ * the lock was released and re-taken while collecting data from the
+ * CPUs. The value -1 is used when obtaining the main lock; otherwise
+ * it is the number of the CPU processed last.
+ */
+static inline void __cgroup_rstat_lock(struct cgroup *cgrp, int cpu_in_loop)
+ __acquires(&cgroup_rstat_lock)
+{
+ bool contended;
+
+ contended = !spin_trylock_irq(&cgroup_rstat_lock);
+ if (contended) {
+ trace_cgroup_rstat_lock_contended(cgrp, cpu_in_loop, contended);
+ spin_lock_irq(&cgroup_rstat_lock);
+ }
+ trace_cgroup_rstat_locked(cgrp, cpu_in_loop, contended);
+}
+
+static inline void __cgroup_rstat_unlock(struct cgroup *cgrp, int cpu_in_loop)
+ __releases(&cgroup_rstat_lock)
+{
+ trace_cgroup_rstat_unlock(cgrp, cpu_in_loop, false);
+ spin_unlock_irq(&cgroup_rstat_lock);
+}
+
/* see cgroup_rstat_flush() */
static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
__releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
@@ -248,10 +325,10 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
/* play nice and yield if necessary */
if (need_resched() || spin_needbreak(&cgroup_rstat_lock)) {
- spin_unlock_irq(&cgroup_rstat_lock);
+ __cgroup_rstat_unlock(cgrp, cpu);
if (!cond_resched())
cpu_relax();
- spin_lock_irq(&cgroup_rstat_lock);
+ __cgroup_rstat_lock(cgrp, cpu);
}
}
}
@@ -273,9 +350,9 @@ __bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp)
{
might_sleep();
- spin_lock_irq(&cgroup_rstat_lock);
+ __cgroup_rstat_lock(cgrp, -1);
cgroup_rstat_flush_locked(cgrp);
- spin_unlock_irq(&cgroup_rstat_lock);
+ __cgroup_rstat_unlock(cgrp, -1);
}
/**
@@ -291,17 +368,18 @@ void cgroup_rstat_flush_hold(struct cgroup *cgrp)
__acquires(&cgroup_rstat_lock)
{
might_sleep();
- spin_lock_irq(&cgroup_rstat_lock);
+ __cgroup_rstat_lock(cgrp, -1);
cgroup_rstat_flush_locked(cgrp);
}
/**
* cgroup_rstat_flush_release - release cgroup_rstat_flush_hold()
+ * @cgrp: cgroup used by the tracepoint
*/
-void cgroup_rstat_flush_release(void)
+void cgroup_rstat_flush_release(struct cgroup *cgrp)
__releases(&cgroup_rstat_lock)
{
- spin_unlock_irq(&cgroup_rstat_lock);
+ __cgroup_rstat_unlock(cgrp, -1);
}
int cgroup_rstat_init(struct cgroup *cgrp)
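Changing cgroup_rstat_flush_release() to take the cgroup implies a matching update to its declaration in include/linux/cgroup.h and to its callers (the rstat.c caller is adjusted in the final hunk below). A minimal, hypothetical sketch of the hold/read/release pattern under the assumed new signature; example_read_flushed_stats() is not a real kernel function:

/*
 * Hypothetical caller sketch, not from this diff. Assumes the header
 * declaration becomes: void cgroup_rstat_flush_release(struct cgroup *cgrp);
 */
static void example_read_flushed_stats(struct cgroup *cgrp)
{
	cgroup_rstat_flush_hold(cgrp);		/* flush and keep cgroup_rstat_lock held */
	/* ... read cgrp->bstat or other freshly flushed counters ... */
	cgroup_rstat_flush_release(cgrp);	/* drops the lock, emits cgroup_rstat_unlock */
}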
@@ -533,7 +611,7 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq)
#ifdef CONFIG_SCHED_CORE
forceidle_time = cgrp->bstat.forceidle_sum;
#endif
- cgroup_rstat_flush_release();
+ cgroup_rstat_flush_release(cgrp);
} else {
root_cgroup_cputime(&bstat);
usage = bstat.cputime.sum_exec_runtime;
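With the tracepoints in place, contention on cgroup_rstat_lock can be observed from userspace without further kernel changes. As one hypothetical illustration (not part of this patch), a small BPF program could attach to the cgroup_rstat_lock_contended tracepoint and count contended acquisitions per cgroup; the map layout and the use of the kernfs node id as the key are assumptions of this sketch.

// SPDX-License-Identifier: GPL-2.0
/* Hypothetical BPF sketch: count contended acquisitions of
 * cgroup_rstat_lock per cgroup, keyed by kernfs node id. Assumes the
 * tracepoint carries (cgrp, cpu_in_loop, contended) as used above. */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 1024);
	__type(key, u64);	/* cgroup id (kernfs node id) */
	__type(value, u64);	/* number of contended acquisitions */
} contended_count SEC(".maps");

SEC("tp_btf/cgroup_rstat_lock_contended")
int BPF_PROG(rstat_lock_contended, struct cgroup *cgrp, int cpu, bool contended)
{
	u64 id = cgrp->kn->id;
	u64 one = 1, *cnt;

	cnt = bpf_map_lookup_elem(&contended_count, &id);
	if (cnt)
		__sync_fetch_and_add(cnt, 1);
	else
		bpf_map_update_elem(&contended_count, &id, &one, BPF_ANY);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";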