Diffstat (limited to 'kernel')
-rw-r--r--  kernel/bpf/cpumap.c                   35
-rw-r--r--  kernel/bpf/inode.c                     6
-rw-r--r--  kernel/bpf/verifier.c                 32
-rw-r--r--  kernel/irq/resend.c                    7
-rw-r--r--  kernel/kprobes.c                      14
-rw-r--r--  kernel/locking/rtmutex.c             170
-rw-r--r--  kernel/locking/rtmutex_api.c           2
-rw-r--r--  kernel/locking/rtmutex_common.h       47
-rw-r--r--  kernel/locking/ww_mutex.h             12
-rw-r--r--  kernel/power/hibernate.c               2
-rw-r--r--  kernel/signal.c                        4
-rw-r--r--  kernel/sys.c                          10
-rw-r--r--  kernel/trace/bpf_trace.c              17
-rw-r--r--  kernel/trace/ring_buffer.c            39
-rw-r--r--  kernel/trace/trace.c                  73
-rw-r--r--  kernel/trace/trace.h                  10
-rw-r--r--  kernel/trace/trace_events.c           14
-rw-r--r--  kernel/trace/trace_events_hist.c       3
-rw-r--r--  kernel/trace/trace_events_synth.c    104
-rw-r--r--  kernel/trace/trace_events_trigger.c    2
-rw-r--r--  kernel/trace/trace_irqsoff.c           3
-rw-r--r--  kernel/trace/trace_probe.c             8
-rw-r--r--  kernel/trace/trace_sched_wakeup.c      2
-rw-r--r--  kernel/trace/trace_seq.c               1
-rw-r--r--  kernel/trace/tracing_map.h             4
-rw-r--r--  kernel/workqueue.c                    43
26 files changed, 445 insertions, 219 deletions
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 6ae02be7a48e..286ab3db0fde 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -28,6 +28,7 @@
#include <linux/sched.h>
#include <linux/workqueue.h>
#include <linux/kthread.h>
+#include <linux/completion.h>
#include <trace/events/xdp.h>
#include <linux/btf_ids.h>
@@ -73,6 +74,7 @@ struct bpf_cpu_map_entry {
struct rcu_head rcu;
struct work_struct kthread_stop_wq;
+ struct completion kthread_running;
};
struct bpf_cpu_map {
@@ -129,11 +131,17 @@ static void __cpu_map_ring_cleanup(struct ptr_ring *ring)
* invoked cpu_map_kthread_stop(). Catch any broken behaviour
* gracefully and warn once.
*/
- struct xdp_frame *xdpf;
+ void *ptr;
- while ((xdpf = ptr_ring_consume(ring)))
- if (WARN_ON_ONCE(xdpf))
- xdp_return_frame(xdpf);
+ while ((ptr = ptr_ring_consume(ring))) {
+ WARN_ON_ONCE(1);
+ if (unlikely(__ptr_test_bit(0, &ptr))) {
+ __ptr_clear_bit(0, &ptr);
+ kfree_skb(ptr);
+ continue;
+ }
+ xdp_return_frame(ptr);
+ }
}
static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
@@ -153,7 +161,6 @@ static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
static void cpu_map_kthread_stop(struct work_struct *work)
{
struct bpf_cpu_map_entry *rcpu;
- int err;
rcpu = container_of(work, struct bpf_cpu_map_entry, kthread_stop_wq);
@@ -163,14 +170,7 @@ static void cpu_map_kthread_stop(struct work_struct *work)
rcu_barrier();
/* kthread_stop will wake_up_process and wait for it to complete */
- err = kthread_stop(rcpu->kthread);
- if (err) {
- /* kthread_stop may be called before cpu_map_kthread_run
- * is executed, so we need to release the memory related
- * to rcpu.
- */
- put_cpu_map_entry(rcpu);
- }
+ kthread_stop(rcpu->kthread);
}
static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu,
@@ -298,11 +298,11 @@ static int cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames,
return nframes;
}
-
static int cpu_map_kthread_run(void *data)
{
struct bpf_cpu_map_entry *rcpu = data;
+ complete(&rcpu->kthread_running);
set_current_state(TASK_INTERRUPTIBLE);
/* When kthread gives stop order, then rcpu have been disconnected
@@ -467,6 +467,7 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value,
goto free_ptr_ring;
/* Setup kthread */
+ init_completion(&rcpu->kthread_running);
rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa,
"cpumap/%d/map:%d", cpu,
map->id);
@@ -480,6 +481,12 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value,
kthread_bind(rcpu->kthread, cpu);
wake_up_process(rcpu->kthread);
+ /* Make sure kthread has been running, so kthread_stop() will not
+ * stop the kthread prematurely and all pending frames or skbs
+ * will be handled by the kthread before kthread_stop() returns.
+ */
+ wait_for_completion(&rcpu->kthread_running);
+
return rcpu;
free_prog:
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 4174f76133df..99d0625b6c82 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -118,9 +118,8 @@ static struct inode *bpf_get_inode(struct super_block *sb,
return ERR_PTR(-ENOSPC);
inode->i_ino = get_next_ino();
- inode->i_atime = current_time(inode);
+ inode->i_atime = inode_set_ctime_current(inode);
inode->i_mtime = inode->i_atime;
- inode->i_ctime = inode->i_atime;
inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
@@ -148,8 +147,7 @@ static void bpf_dentry_finalize(struct dentry *dentry, struct inode *inode,
d_instantiate(dentry, inode);
dget(dentry);
- dir->i_mtime = current_time(dir);
- dir->i_ctime = dir->i_mtime;
+ dir->i_mtime = inode_set_ctime_current(dir);
}
static int bpf_mkdir(struct mnt_idmap *idmap, struct inode *dir,
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 930b5555cfd3..02a021c524ab 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5573,16 +5573,17 @@ static int update_stack_depth(struct bpf_verifier_env *env,
* Since recursion is prevented by check_cfg() this algorithm
* only needs a local stack of MAX_CALL_FRAMES to remember callsites
*/
-static int check_max_stack_depth(struct bpf_verifier_env *env)
+static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
{
- int depth = 0, frame = 0, idx = 0, i = 0, subprog_end;
struct bpf_subprog_info *subprog = env->subprog_info;
struct bpf_insn *insn = env->prog->insnsi;
+ int depth = 0, frame = 0, i, subprog_end;
bool tail_call_reachable = false;
int ret_insn[MAX_CALL_FRAMES];
int ret_prog[MAX_CALL_FRAMES];
int j;
+ i = subprog[idx].start;
process_func:
/* protect against potential stack overflow that might happen when
* bpf2bpf calls get combined with tailcalls. Limit the caller's stack
@@ -5621,7 +5622,7 @@ process_func:
continue_func:
subprog_end = subprog[idx + 1].start;
for (; i < subprog_end; i++) {
- int next_insn;
+ int next_insn, sidx;
if (!bpf_pseudo_call(insn + i) && !bpf_pseudo_func(insn + i))
continue;
@@ -5631,14 +5632,14 @@ continue_func:
/* find the callee */
next_insn = i + insn[i].imm + 1;
- idx = find_subprog(env, next_insn);
- if (idx < 0) {
+ sidx = find_subprog(env, next_insn);
+ if (sidx < 0) {
WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
next_insn);
return -EFAULT;
}
- if (subprog[idx].is_async_cb) {
- if (subprog[idx].has_tail_call) {
+ if (subprog[sidx].is_async_cb) {
+ if (subprog[sidx].has_tail_call) {
verbose(env, "verifier bug. subprog has tail_call and async cb\n");
return -EFAULT;
}
@@ -5647,6 +5648,7 @@ continue_func:
continue;
}
i = next_insn;
+ idx = sidx;
if (subprog[idx].has_tail_call)
tail_call_reachable = true;
@@ -5682,6 +5684,22 @@ continue_func:
goto continue_func;
}
+static int check_max_stack_depth(struct bpf_verifier_env *env)
+{
+ struct bpf_subprog_info *si = env->subprog_info;
+ int ret;
+
+ for (int i = 0; i < env->subprog_cnt; i++) {
+ if (!i || si[i].is_async_cb) {
+ ret = check_max_stack_depth_subprog(env, i);
+ if (ret < 0)
+ return ret;
+ }
+ continue;
+ }
+ return 0;
+}
+
#ifndef CONFIG_BPF_JIT_ALWAYS_ON
static int get_callee_stack_depth(struct bpf_verifier_env *env,
const struct bpf_insn *insn, int idx)
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index edec335c0a7a..5f2c66860ac6 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -68,11 +68,16 @@ static int irq_sw_resend(struct irq_desc *desc)
*/
if (!desc->parent_irq)
return -EINVAL;
+
+ desc = irq_to_desc(desc->parent_irq);
+ if (!desc)
+ return -EINVAL;
}
/* Add to resend_list and activate the softirq: */
raw_spin_lock(&irq_resend_lock);
- hlist_add_head(&desc->resend_node, &irq_resend_list);
+ if (hlist_unhashed(&desc->resend_node))
+ hlist_add_head(&desc->resend_node, &irq_resend_list);
raw_spin_unlock(&irq_resend_lock);
tasklet_schedule(&resend_tasklet);
return 0;
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 1fc6095d502d..ca385b61d546 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1545,6 +1545,17 @@ static int check_ftrace_location(struct kprobe *p)
return 0;
}
+static bool is_cfi_preamble_symbol(unsigned long addr)
+{
+ char symbuf[KSYM_NAME_LEN];
+
+ if (lookup_symbol_name(addr, symbuf))
+ return false;
+
+ return str_has_prefix("__cfi_", symbuf) ||
+ str_has_prefix("__pfx_", symbuf);
+}
+
static int check_kprobe_address_safe(struct kprobe *p,
struct module **probed_mod)
{
@@ -1563,7 +1574,8 @@ static int check_kprobe_address_safe(struct kprobe *p,
within_kprobe_blacklist((unsigned long) p->addr) ||
jump_label_text_reserved(p->addr, p->addr) ||
static_call_text_reserved(p->addr, p->addr) ||
- find_bug((unsigned long)p->addr)) {
+ find_bug((unsigned long)p->addr) ||
+ is_cfi_preamble_symbol((unsigned long)p->addr)) {
ret = -EINVAL;
goto out;
}
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 728f434de2bb..21db0df0eb00 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -333,21 +333,43 @@ static __always_inline int __waiter_prio(struct task_struct *task)
return prio;
}
+/*
+ * Update the waiter->tree copy of the sort keys.
+ */
static __always_inline void
waiter_update_prio(struct rt_mutex_waiter *waiter, struct task_struct *task)
{
- waiter->prio = __waiter_prio(task);
- waiter->deadline = task->dl.deadline;
+ lockdep_assert_held(&waiter->lock->wait_lock);
+ lockdep_assert(RB_EMPTY_NODE(&waiter->tree.entry));
+
+ waiter->tree.prio = __waiter_prio(task);
+ waiter->tree.deadline = task->dl.deadline;
+}
+
+/*
+ * Update the waiter->pi_tree copy of the sort keys (from the tree copy).
+ */
+static __always_inline void
+waiter_clone_prio(struct rt_mutex_waiter *waiter, struct task_struct *task)
+{
+ lockdep_assert_held(&waiter->lock->wait_lock);
+ lockdep_assert_held(&task->pi_lock);
+ lockdep_assert(RB_EMPTY_NODE(&waiter->pi_tree.entry));
+
+ waiter->pi_tree.prio = waiter->tree.prio;
+ waiter->pi_tree.deadline = waiter->tree.deadline;
}
/*
- * Only use with rt_mutex_waiter_{less,equal}()
+ * Only use with rt_waiter_node_{less,equal}()
*/
+#define task_to_waiter_node(p) \
+ &(struct rt_waiter_node){ .prio = __waiter_prio(p), .deadline = (p)->dl.deadline }
#define task_to_waiter(p) \
- &(struct rt_mutex_waiter){ .prio = __waiter_prio(p), .deadline = (p)->dl.deadline }
+ &(struct rt_mutex_waiter){ .tree = *task_to_waiter_node(p) }
-static __always_inline int rt_mutex_waiter_less(struct rt_mutex_waiter *left,
- struct rt_mutex_waiter *right)
+static __always_inline int rt_waiter_node_less(struct rt_waiter_node *left,
+ struct rt_waiter_node *right)
{
if (left->prio < right->prio)
return 1;
@@ -364,8 +386,8 @@ static __always_inline int rt_mutex_waiter_less(struct rt_mutex_waiter *left,
return 0;
}
-static __always_inline int rt_mutex_waiter_equal(struct rt_mutex_waiter *left,
- struct rt_mutex_waiter *right)
+static __always_inline int rt_waiter_node_equal(struct rt_waiter_node *left,
+ struct rt_waiter_node *right)
{
if (left->prio != right->prio)
return 0;
@@ -385,7 +407,7 @@ static __always_inline int rt_mutex_waiter_equal(struct rt_mutex_waiter *left,
static inline bool rt_mutex_steal(struct rt_mutex_waiter *waiter,
struct rt_mutex_waiter *top_waiter)
{
- if (rt_mutex_waiter_less(waiter, top_waiter))
+ if (rt_waiter_node_less(&waiter->tree, &top_waiter->tree))
return true;
#ifdef RT_MUTEX_BUILD_SPINLOCKS
@@ -393,30 +415,30 @@ static inline bool rt_mutex_steal(struct rt_mutex_waiter *waiter,
* Note that RT tasks are excluded from same priority (lateral)
* steals to prevent the introduction of an unbounded latency.
*/
- if (rt_prio(waiter->prio) || dl_prio(waiter->prio))
+ if (rt_prio(waiter->tree.prio) || dl_prio(waiter->tree.prio))
return false;
- return rt_mutex_waiter_equal(waiter, top_waiter);
+ return rt_waiter_node_equal(&waiter->tree, &top_waiter->tree);
#else
return false;
#endif
}
#define __node_2_waiter(node) \
- rb_entry((node), struct rt_mutex_waiter, tree_entry)
+ rb_entry((node), struct rt_mutex_waiter, tree.entry)
static __always_inline bool __waiter_less(struct rb_node *a, const struct rb_node *b)
{
struct rt_mutex_waiter *aw = __node_2_waiter(a);
struct rt_mutex_waiter *bw = __node_2_waiter(b);
- if (rt_mutex_waiter_less(aw, bw))
+ if (rt_waiter_node_less(&aw->tree, &bw->tree))
return 1;
if (!build_ww_mutex())
return 0;
- if (rt_mutex_waiter_less(bw, aw))
+ if (rt_waiter_node_less(&bw->tree, &aw->tree))
return 0;
/* NOTE: relies on waiter->ww_ctx being set before insertion */
@@ -434,48 +456,58 @@ static __always_inline bool __waiter_less(struct rb_node *a, const struct rb_nod
static __always_inline void
rt_mutex_enqueue(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter)
{
- rb_add_cached(&waiter->tree_entry, &lock->waiters, __waiter_less);
+ lockdep_assert_held(&lock->wait_lock);
+
+ rb_add_cached(&waiter->tree.entry, &lock->waiters, __waiter_less);
}
static __always_inline void
rt_mutex_dequeue(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter)
{
- if (RB_EMPTY_NODE(&waiter->tree_entry))
+ lockdep_assert_held(&lock->wait_lock);
+
+ if (RB_EMPTY_NODE(&waiter->tree.entry))
return;
- rb_erase_cached(&waiter->tree_entry, &lock->waiters);
- RB_CLEAR_NODE(&waiter->tree_entry);
+ rb_erase_cached(&waiter->tree.entry, &lock->waiters);
+ RB_CLEAR_NODE(&waiter->tree.entry);
}
-#define __node_2_pi_waiter(node) \
- rb_entry((node), struct rt_mutex_waiter, pi_tree_entry)
+#define __node_2_rt_node(node) \
+ rb_entry((node), struct rt_waiter_node, entry)
-static __always_inline bool
-__pi_waiter_less(struct rb_node *a, const struct rb_node *b)
+static __always_inline bool __pi_waiter_less(struct rb_node *a, const struct rb_node *b)
{
- return rt_mutex_waiter_less(__node_2_pi_waiter(a), __node_2_pi_waiter(b));
+ return rt_waiter_node_less(__node_2_rt_node(a), __node_2_rt_node(b));
}
static __always_inline void
rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
{
- rb_add_cached(&waiter->pi_tree_entry, &task->pi_waiters, __pi_waiter_less);
+ lockdep_assert_held(&task->pi_lock);
+
+ rb_add_cached(&waiter->pi_tree.entry, &task->pi_waiters, __pi_waiter_less);
}
static __always_inline void
rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
{
- if (RB_EMPTY_NODE(&waiter->pi_tree_entry))
+ lockdep_assert_held(&task->pi_lock);
+
+ if (RB_EMPTY_NODE(&waiter->pi_tree.entry))
return;
- rb_erase_cached(&waiter->pi_tree_entry, &task->pi_waiters);
- RB_CLEAR_NODE(&waiter->pi_tree_entry);
+ rb_erase_cached(&waiter->pi_tree.entry, &task->pi_waiters);
+ RB_CLEAR_NODE(&waiter->pi_tree.entry);
}
-static __always_inline void rt_mutex_adjust_prio(struct task_struct *p)
+static __always_inline void rt_mutex_adjust_prio(struct rt_mutex_base *lock,
+ struct task_struct *p)
{
struct task_struct *pi_task = NULL;
+ lockdep_assert_held(&lock->wait_lock);
+ lockdep_assert(rt_mutex_owner(lock) == p);
lockdep_assert_held(&p->pi_lock);
if (task_has_pi_waiters(p))
@@ -571,9 +603,14 @@ static __always_inline struct rt_mutex_base *task_blocked_on_lock(struct task_st
* Chain walk basics and protection scope
*
* [R] refcount on task
- * [P] task->pi_lock held
+ * [Pn] task->pi_lock held
* [L] rtmutex->wait_lock held
*
+ * Normal locking order:
+ *
+ * rtmutex->wait_lock
+ * task->pi_lock
+ *
* Step Description Protected by
* function arguments:
* @task [R]
@@ -588,27 +625,32 @@ static __always_inline struct rt_mutex_base *task_blocked_on_lock(struct task_st
* again:
* loop_sanity_check();
* retry:
- * [1] lock(task->pi_lock); [R] acquire [P]
- * [2] waiter = task->pi_blocked_on; [P]
- * [3] check_exit_conditions_1(); [P]
- * [4] lock = waiter->lock; [P]
- * [5] if (!try_lock(lock->wait_lock)) { [P] try to acquire [L]
- * unlock(task->pi_lock); release [P]
+ * [1] lock(task->pi_lock); [R] acquire [P1]
+ * [2] waiter = task->pi_blocked_on; [P1]
+ * [3] check_exit_conditions_1(); [P1]
+ * [4] lock = waiter->lock; [P1]
+ * [5] if (!try_lock(lock->wait_lock)) { [P1] try to acquire [L]
+ * unlock(task->pi_lock); release [P1]
* goto retry;
* }
- * [6] check_exit_conditions_2(); [P] + [L]
- * [7] requeue_lock_waiter(lock, waiter); [P] + [L]
- * [8] unlock(task->pi_lock); release [P]
+ * [6] check_exit_conditions_2(); [P1] + [L]
+ * [7] requeue_lock_waiter(lock, waiter); [P1] + [L]
+ * [8] unlock(task->pi_lock); release [P1]
* put_task_struct(task); release [R]
* [9] check_exit_conditions_3(); [L]
* [10] task = owner(lock); [L]
* get_task_struct(task); [L] acquire [R]
- * lock(task->pi_lock); [L] acquire [P]
- * [11] requeue_pi_waiter(tsk, waiters(lock));[P] + [L]
- * [12] check_exit_conditions_4(); [P] + [L]
- * [13] unlock(task->pi_lock); release [P]
+ * lock(task->pi_lock); [L] acquire [P2]
+ * [11] requeue_pi_waiter(tsk, waiters(lock));[P2] + [L]
+ * [12] check_exit_conditions_4(); [P2] + [L]
+ * [13] unlock(task->pi_lock); release [P2]
* unlock(lock->wait_lock); release [L]
* goto again;
+ *
+ * Where P1 is the blocking task and P2 is the lock owner; going up one step
+ * the owner becomes the next blocked task etc..
+ *
*/
static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
enum rtmutex_chainwalk chwalk,
@@ -756,7 +798,7 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
* enabled we continue, but stop the requeueing in the chain
* walk.
*/
- if (rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
+ if (rt_waiter_node_equal(&waiter->tree, task_to_waiter_node(task))) {
if (!detect_deadlock)
goto out_unlock_pi;
else
@@ -764,13 +806,18 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
}
/*
- * [4] Get the next lock
+ * [4] Get the next lock; per holding task->pi_lock we can't unblock
+ * and guarantee @lock's existence.
*/
lock = waiter->lock;
/*
* [5] We need to trylock here as we are holding task->pi_lock,
* which is the reverse lock order versus the other rtmutex
* operations.
+ *
+ * Per the above, holding task->pi_lock guarantees lock exists, so
+ * inverting this lock order is infeasible from a life-time
+ * perspective.
*/
if (!raw_spin_trylock(&lock->wait_lock)) {
raw_spin_unlock_irq(&task->pi_lock);
@@ -874,17 +921,18 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
* or
*
* DL CBS enforcement advancing the effective deadline.
- *
- * Even though pi_waiters also uses these fields, and that tree is only
- * updated in [11], we can do this here, since we hold [L], which
- * serializes all pi_waiters access and rb_erase() does not care about
- * the values of the node being removed.
*/
waiter_update_prio(waiter, task);
rt_mutex_enqueue(lock, waiter);
- /* [8] Release the task */
+ /*
+ * [8] Release the (blocking) task in preparation for
+ * taking the owner task in [10].
+ *
+ * Since we hold lock->waiter_lock, task cannot unblock, even if we
+ * release task->pi_lock.
+ */
raw_spin_unlock(&task->pi_lock);
put_task_struct(task);
@@ -908,7 +956,12 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
return 0;
}
- /* [10] Grab the next task, i.e. the owner of @lock */
+ /*
+ * [10] Grab the next task, i.e. the owner of @lock
+ *
+ * Per holding lock->wait_lock and checking for !owner above, there
+ * must be an owner and it cannot go away.
+ */
task = get_task_struct(rt_mutex_owner(lock));
raw_spin_lock(&task->pi_lock);
@@ -921,8 +974,9 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
* and adjust the priority of the owner.
*/
rt_mutex_dequeue_pi(task, prerequeue_top_waiter);
+ waiter_clone_prio(waiter, task);
rt_mutex_enqueue_pi(task, waiter);
- rt_mutex_adjust_prio(task);
+ rt_mutex_adjust_prio(lock, task);
} else if (prerequeue_top_waiter == waiter) {
/*
@@ -937,8 +991,9 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
*/
rt_mutex_dequeue_pi(task, waiter);
waiter = rt_mutex_top_waiter(lock);
+ waiter_clone_prio(waiter, task);
rt_mutex_enqueue_pi(task, waiter);
- rt_mutex_adjust_prio(task);
+ rt_mutex_adjust_prio(lock, task);
} else {
/*
* Nothing changed. No need to do any priority
@@ -1154,6 +1209,7 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock,
waiter->task = task;
waiter->lock = lock;
waiter_update_prio(waiter, task);
+ waiter_clone_prio(waiter, task);
/* Get the top priority waiter on the lock */
if (rt_mutex_has_waiters(lock))
@@ -1187,7 +1243,7 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock,
rt_mutex_dequeue_pi(owner, top_waiter);
rt_mutex_enqueue_pi(owner, waiter);
- rt_mutex_adjust_prio(owner);
+ rt_mutex_adjust_prio(lock, owner);
if (owner->pi_blocked_on)
chain_walk = 1;
} else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
@@ -1234,6 +1290,8 @@ static void __sched mark_wakeup_next_waiter(struct rt_wake_q_head *wqh,
{
struct rt_mutex_waiter *waiter;
+ lockdep_assert_held(&lock->wait_lock);
+
raw_spin_lock(&current->pi_lock);
waiter = rt_mutex_top_waiter(lock);
@@ -1246,7 +1304,7 @@ static void __sched mark_wakeup_next_waiter(struct rt_wake_q_head *wqh,
* task unblocks.
*/
rt_mutex_dequeue_pi(current, waiter);
- rt_mutex_adjust_prio(current);
+ rt_mutex_adjust_prio(lock, current);
/*
* As we are waking up the top waiter, and the waiter stays
@@ -1482,7 +1540,7 @@ static void __sched remove_waiter(struct rt_mutex_base *lock,
if (rt_mutex_has_waiters(lock))
rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock));
- rt_mutex_adjust_prio(owner);
+ rt_mutex_adjust_prio(lock, owner);
/* Store the lock on which owner is blocked or NULL */
next_lock = task_blocked_on_lock(owner);
diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c
index cb9fdff76a8a..a6974d044593 100644
--- a/kernel/locking/rtmutex_api.c
+++ b/kernel/locking/rtmutex_api.c
@@ -459,7 +459,7 @@ void __sched rt_mutex_adjust_pi(struct task_struct *task)
raw_spin_lock_irqsave(&task->pi_lock, flags);
waiter = task->pi_blocked_on;
- if (!waiter || rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
+ if (!waiter || rt_waiter_node_equal(&waiter->tree, task_to_waiter_node(task))) {
raw_spin_unlock_irqrestore(&task->pi_lock, flags);
return;
}
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index c47e8361bfb5..1162e07cdaea 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -17,27 +17,44 @@
#include <linux/rtmutex.h>
#include <linux/sched/wake_q.h>
+
+/*
+ * This is a helper for the struct rt_mutex_waiter below. A waiter goes in two
+ * separate trees and they need their own copy of the sort keys because of
+ * different locking requirements.
+ *
+ * @entry: rbtree node to enqueue into the waiters tree
+ * @prio: Priority of the waiter
+ * @deadline: Deadline of the waiter if applicable
+ *
+ * See rt_waiter_node_less() and waiter_*_prio().
+ */
+struct rt_waiter_node {
+ struct rb_node entry;
+ int prio;
+ u64 deadline;
+};
+
/*
* This is the control structure for tasks blocked on a rt_mutex,
* which is allocated on the kernel stack on of the blocked task.
*
- * @tree_entry: pi node to enqueue into the mutex waiters tree
- * @pi_tree_entry: pi node to enqueue into the mutex owner waiters tree
+ * @tree: node to enqueue into the mutex waiters tree
+ * @pi_tree: node to enqueue into the mutex owner waiters tree
* @task: task reference to the blocked task
* @lock: Pointer to the rt_mutex on which the waiter blocks
* @wake_state: Wakeup state to use (TASK_NORMAL or TASK_RTLOCK_WAIT)
- * @prio: Priority of the waiter
- * @deadline: Deadline of the waiter if applicable
* @ww_ctx: WW context pointer
+ *
+ * @tree is ordered by @lock->wait_lock
+ * @pi_tree is ordered by rt_mutex_owner(@lock)->pi_lock
*/
struct rt_mutex_waiter {
- struct rb_node tree_entry;
- struct rb_node pi_tree_entry;
+ struct rt_waiter_node tree;
+ struct rt_waiter_node pi_tree;
struct task_struct *task;
struct rt_mutex_base *lock;
unsigned int wake_state;
- int prio;
- u64 deadline;
struct ww_acquire_ctx *ww_ctx;
};
@@ -105,7 +122,7 @@ static inline bool rt_mutex_waiter_is_top_waiter(struct rt_mutex_base *lock,
{
struct rb_node *leftmost = rb_first_cached(&lock->waiters);
- return rb_entry(leftmost, struct rt_mutex_waiter, tree_entry) == waiter;
+ return rb_entry(leftmost, struct rt_mutex_waiter, tree.entry) == waiter;
}
static inline struct rt_mutex_waiter *rt_mutex_top_waiter(struct rt_mutex_base *lock)
@@ -113,8 +130,10 @@ static inline struct rt_mutex_waiter *rt_mutex_top_waiter(struct rt_mutex_base *
struct rb_node *leftmost = rb_first_cached(&lock->waiters);
struct rt_mutex_waiter *w = NULL;
+ lockdep_assert_held(&lock->wait_lock);
+
if (leftmost) {
- w = rb_entry(leftmost, struct rt_mutex_waiter, tree_entry);
+ w = rb_entry(leftmost, struct rt_mutex_waiter, tree.entry);
BUG_ON(w->lock != lock);
}
return w;
@@ -127,8 +146,10 @@ static inline int task_has_pi_waiters(struct task_struct *p)
static inline struct rt_mutex_waiter *task_top_pi_waiter(struct task_struct *p)
{
+ lockdep_assert_held(&p->pi_lock);
+
return rb_entry(p->pi_waiters.rb_leftmost, struct rt_mutex_waiter,
- pi_tree_entry);
+ pi_tree.entry);
}
#define RT_MUTEX_HAS_WAITERS 1UL
@@ -190,8 +211,8 @@ static inline void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
static inline void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
{
debug_rt_mutex_init_waiter(waiter);
- RB_CLEAR_NODE(&waiter->pi_tree_entry);
- RB_CLEAR_NODE(&waiter->tree_entry);
+ RB_CLEAR_NODE(&waiter->pi_tree.entry);
+ RB_CLEAR_NODE(&waiter->tree.entry);
waiter->wake_state = TASK_NORMAL;
waiter->task = NULL;
}
diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h
index 56f139201f24..3ad2cc4823e5 100644
--- a/kernel/locking/ww_mutex.h
+++ b/kernel/locking/ww_mutex.h
@@ -96,25 +96,25 @@ __ww_waiter_first(struct rt_mutex *lock)
struct rb_node *n = rb_first(&lock->rtmutex.waiters.rb_root);
if (!n)
return NULL;
- return rb_entry(n, struct rt_mutex_waiter, tree_entry);
+ return rb_entry(n, struct rt_mutex_waiter, tree.entry);
}
static inline struct rt_mutex_waiter *
__ww_waiter_next(struct rt_mutex *lock, struct rt_mutex_waiter *w)
{
- struct rb_node *n = rb_next(&w->tree_entry);
+ struct rb_node *n = rb_next(&w->tree.entry);
if (!n)
return NULL;
- return rb_entry(n, struct rt_mutex_waiter, tree_entry);
+ return rb_entry(n, struct rt_mutex_waiter, tree.entry);
}
static inline struct rt_mutex_waiter *
__ww_waiter_prev(struct rt_mutex *lock, struct rt_mutex_waiter *w)
{
- struct rb_node *n = rb_prev(&w->tree_entry);
+ struct rb_node *n = rb_prev(&w->tree.entry);
if (!n)
return NULL;
- return rb_entry(n, struct rt_mutex_waiter, tree_entry);
+ return rb_entry(n, struct rt_mutex_waiter, tree.entry);
}
static inline struct rt_mutex_waiter *
@@ -123,7 +123,7 @@ __ww_waiter_last(struct rt_mutex *lock)
struct rb_node *n = rb_last(&lock->rtmutex.waiters.rb_root);
if (!n)
return NULL;
- return rb_entry(n, struct rt_mutex_waiter, tree_entry);
+ return rb_entry(n, struct rt_mutex_waiter, tree.entry);
}
static inline void
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index e1b4bfa938dd..2b4a946a6ff5 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -1166,7 +1166,7 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr,
int error;
if (!hibernation_available())
- return 0;
+ return n;
if (len && buf[len-1] == '\n')
len--;
diff --git a/kernel/signal.c b/kernel/signal.c
index b5370fe5c198..128e9bb3d1a2 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -562,6 +562,10 @@ bool unhandled_signal(struct task_struct *tsk, int sig)
if (handler != SIG_IGN && handler != SIG_DFL)
return false;
+ /* If dying, we handle all new signals by ignoring them */
+ if (fatal_signal_pending(tsk))
+ return false;
+
/* if ptraced, let the tracer determine */
return !tsk->ptrace;
}
diff --git a/kernel/sys.c b/kernel/sys.c
index 05f838929e72..2410e3999ebe 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2535,11 +2535,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
else
return -EINVAL;
break;
- case PR_GET_AUXV:
- if (arg4 || arg5)
- return -EINVAL;
- error = prctl_get_auxv((void __user *)arg2, arg3);
- break;
default:
return -EINVAL;
}
@@ -2694,6 +2689,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
case PR_SET_VMA:
error = prctl_set_vma(arg2, arg3, arg4, arg5);
break;
+ case PR_GET_AUXV:
+ if (arg4 || arg5)
+ return -EINVAL;
+ error = prctl_get_auxv((void __user *)arg2, arg3);
+ break;
#ifdef CONFIG_KSM
case PR_SET_MEMORY_MERGE:
if (arg3 || arg4 || arg5)
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 5f2dcabad202..bd1a42b23f3f 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -661,8 +661,7 @@ static DEFINE_PER_CPU(int, bpf_trace_nest_level);
BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
u64, flags, void *, data, u64, size)
{
- struct bpf_trace_sample_data *sds = this_cpu_ptr(&bpf_trace_sds);
- int nest_level = this_cpu_inc_return(bpf_trace_nest_level);
+ struct bpf_trace_sample_data *sds;
struct perf_raw_record raw = {
.frag = {
.size = size,
@@ -670,7 +669,11 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
},
};
struct perf_sample_data *sd;
- int err;
+ int nest_level, err;
+
+ preempt_disable();
+ sds = this_cpu_ptr(&bpf_trace_sds);
+ nest_level = this_cpu_inc_return(bpf_trace_nest_level);
if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(sds->sds))) {
err = -EBUSY;
@@ -688,9 +691,9 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
perf_sample_save_raw_data(sd, &raw);
err = __bpf_perf_event_output(regs, map, flags, sd);
-
out:
this_cpu_dec(bpf_trace_nest_level);
+ preempt_enable();
return err;
}
@@ -715,7 +718,6 @@ static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_misc_sds);
u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
{
- int nest_level = this_cpu_inc_return(bpf_event_output_nest_level);
struct perf_raw_frag frag = {
.copy = ctx_copy,
.size = ctx_size,
@@ -732,8 +734,12 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
};
struct perf_sample_data *sd;
struct pt_regs *regs;
+ int nest_level;
u64 ret;
+ preempt_disable();
+ nest_level = this_cpu_inc_return(bpf_event_output_nest_level);
+
if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(bpf_misc_sds.sds))) {
ret = -EBUSY;
goto out;
@@ -748,6 +754,7 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
ret = __bpf_perf_event_output(regs, map, flags, sd);
out:
this_cpu_dec(bpf_event_output_nest_level);
+ preempt_enable();
return ret;
}
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 14d8001140c8..52dea5dd5362 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -523,6 +523,8 @@ struct ring_buffer_per_cpu {
rb_time_t before_stamp;
u64 event_stamp[MAX_NEST];
u64 read_stamp;
+ /* pages removed since last reset */
+ unsigned long pages_removed;
/* ring buffer pages to update, > 0 to add, < 0 to remove */
long nr_pages_to_update;
struct list_head new_pages; /* new pages to add */
@@ -536,6 +538,7 @@ struct trace_buffer {
unsigned flags;
int cpus;
atomic_t record_disabled;
+ atomic_t resizing;
cpumask_var_t cpumask;
struct lock_class_key *reader_lock_key;
@@ -558,6 +561,7 @@ struct ring_buffer_iter {
struct buffer_page *head_page;
struct buffer_page *cache_reader_page;
unsigned long cache_read;
+ unsigned long cache_pages_removed;
u64 read_stamp;
u64 page_stamp;
struct ring_buffer_event *event;
@@ -946,6 +950,7 @@ static void rb_wake_up_waiters(struct irq_work *work)
/**
* ring_buffer_wake_waiters - wake up any waiters on this ring buffer
* @buffer: The ring buffer to wake waiters on
+ * @cpu: The CPU buffer to wake waiters on
*
* In the case of a file that represents a ring buffer is closing,
* it is prudent to wake up any waiters that are on this.
@@ -1956,6 +1961,8 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages)
to_remove = rb_list_head(to_remove)->next;
head_bit |= (unsigned long)to_remove & RB_PAGE_HEAD;
}
+ /* Read iterators need to reset themselves when some pages removed */
+ cpu_buffer->pages_removed += nr_removed;
next_page = rb_list_head(to_remove)->next;
@@ -1977,12 +1984,6 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages)
cpu_buffer->head_page = list_entry(next_page,
struct buffer_page, list);
- /*
- * change read pointer to make sure any read iterators reset
- * themselves
- */
- cpu_buffer->read = 0;
-
/* pages are removed, resume tracing and then free the pages */
atomic_dec(&cpu_buffer->record_disabled);
raw_spin_unlock_irq(&cpu_buffer->reader_lock);
@@ -2167,7 +2168,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
/* prevent another thread from changing buffer sizes */
mutex_lock(&buffer->mutex);
-
+ atomic_inc(&buffer->resizing);
if (cpu_id == RING_BUFFER_ALL_CPUS) {
/*
@@ -2322,6 +2323,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
atomic_dec(&buffer->record_disabled);
}
+ atomic_dec(&buffer->resizing);
mutex_unlock(&buffer->mutex);
return 0;
@@ -2342,6 +2344,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
}
}
out_err_unlock:
+ atomic_dec(&buffer->resizing);
mutex_unlock(&buffer->mutex);
return err;
}
@@ -3373,7 +3376,6 @@ void ring_buffer_nest_end(struct trace_buffer *buffer)
/**
* ring_buffer_unlock_commit - commit a reserved
* @buffer: The buffer to commit to
- * @event: The event pointer to commit.
*
* This commits the data to the ring buffer, and releases any locks held.
*
@@ -4392,6 +4394,7 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
iter->cache_reader_page = iter->head_page;
iter->cache_read = cpu_buffer->read;
+ iter->cache_pages_removed = cpu_buffer->pages_removed;
if (iter->head) {
iter->read_stamp = cpu_buffer->read_stamp;
@@ -4846,12 +4849,13 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
buffer = cpu_buffer->buffer;
/*
- * Check if someone performed a consuming read to
- * the buffer. A consuming read invalidates the iterator
- * and we need to reset the iterator in this case.
+ * Check if someone performed a consuming read to the buffer
+ * or removed some pages from the buffer. In these cases,
+ * iterator was invalidated and we need to reset it.
*/
if (unlikely(iter->cache_read != cpu_buffer->read ||
- iter->cache_reader_page != cpu_buffer->reader_page))
+ iter->cache_reader_page != cpu_buffer->reader_page ||
+ iter->cache_pages_removed != cpu_buffer->pages_removed))
rb_iter_reset(iter);
again:
@@ -5295,6 +5299,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
cpu_buffer->last_overrun = 0;
rb_head_page_activate(cpu_buffer);
+ cpu_buffer->pages_removed = 0;
}
/* Must have disabled the cpu buffer then done a synchronize_rcu */
@@ -5353,7 +5358,6 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
/**
* ring_buffer_reset_online_cpus - reset a ring buffer per CPU buffer
* @buffer: The ring buffer to reset a per cpu buffer of
- * @cpu: The CPU buffer to be reset
*/
void ring_buffer_reset_online_cpus(struct trace_buffer *buffer)
{
@@ -5541,6 +5545,15 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a,
if (local_read(&cpu_buffer_b->committing))
goto out_dec;
+ /*
+ * When resize is in progress, we cannot swap it because
+ * it will mess the state of the cpu buffer.
+ */
+ if (atomic_read(&buffer_a->resizing))
+ goto out_dec;
+ if (atomic_read(&buffer_b->resizing))
+ goto out_dec;
+
buffer_a->buffers[cpu] = cpu_buffer_b;
buffer_b->buffers[cpu] = cpu_buffer_a;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index be847d45d81c..8e64aaad5361 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1928,9 +1928,10 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
* place on this CPU. We fail to record, but we reset
* the max trace buffer (no one writes directly to it)
* and flag that it failed.
+ * Another reason is resize is in progress.
*/
trace_array_printk_buf(tr->max_buffer.buffer, _THIS_IP_,
- "Failed to swap buffers due to commit in progress\n");
+ "Failed to swap buffers due to commit or resize in progress\n");
}
WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY);
@@ -4212,8 +4213,15 @@ static void *s_start(struct seq_file *m, loff_t *pos)
* will point to the same string as current_trace->name.
*/
mutex_lock(&trace_types_lock);
- if (unlikely(tr->current_trace && iter->trace->name != tr->current_trace->name))
+ if (unlikely(tr->current_trace && iter->trace->name != tr->current_trace->name)) {
+ /* Close iter->trace before switching to the new current tracer */
+ if (iter->trace->close)
+ iter->trace->close(iter);
*iter->trace = *tr->current_trace;
+ /* Reopen the new current tracer */
+ if (iter->trace->open)
+ iter->trace->open(iter);
+ }
mutex_unlock(&trace_types_lock);
#ifdef CONFIG_TRACER_MAX_TRACE
@@ -5276,11 +5284,17 @@ int tracing_set_cpumask(struct trace_array *tr,
!cpumask_test_cpu(cpu, tracing_cpumask_new)) {
atomic_inc(&per_cpu_ptr(tr->array_buffer.data, cpu)->disabled);
ring_buffer_record_disable_cpu(tr->array_buffer.buffer, cpu);
+#ifdef CONFIG_TRACER_MAX_TRACE
+ ring_buffer_record_disable_cpu(tr->max_buffer.buffer, cpu);
+#endif
}
if (!cpumask_test_cpu(cpu, tr->tracing_cpumask) &&
cpumask_test_cpu(cpu, tracing_cpumask_new)) {
atomic_dec(&per_cpu_ptr(tr->array_buffer.data, cpu)->disabled);
ring_buffer_record_enable_cpu(tr->array_buffer.buffer, cpu);
+#ifdef CONFIG_TRACER_MAX_TRACE
+ ring_buffer_record_enable_cpu(tr->max_buffer.buffer, cpu);
+#endif
}
}
arch_spin_unlock(&tr->max_lock);
@@ -6704,10 +6718,36 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf,
#endif
+static int open_pipe_on_cpu(struct trace_array *tr, int cpu)
+{
+ if (cpu == RING_BUFFER_ALL_CPUS) {
+ if (cpumask_empty(tr->pipe_cpumask)) {
+ cpumask_setall(tr->pipe_cpumask);
+ return 0;
+ }
+ } else if (!cpumask_test_cpu(cpu, tr->pipe_cpumask)) {
+ cpumask_set_cpu(cpu, tr->pipe_cpumask);
+ return 0;
+ }
+ return -EBUSY;
+}
+
+static void close_pipe_on_cpu(struct trace_array *tr, int cpu)
+{
+ if (cpu == RING_BUFFER_ALL_CPUS) {
+ WARN_ON(!cpumask_full(tr->pipe_cpumask));
+ cpumask_clear(tr->pipe_cpumask);
+ } else {
+ WARN_ON(!cpumask_test_cpu(cpu, tr->pipe_cpumask));
+ cpumask_clear_cpu(cpu, tr->pipe_cpumask);
+ }
+}
+
static int tracing_open_pipe(struct inode *inode, struct file *filp)
{
struct trace_array *tr = inode->i_private;
struct trace_iterator *iter;
+ int cpu;
int ret;
ret = tracing_check_open_get_tr(tr);
@@ -6715,13 +6755,16 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
return ret;
mutex_lock(&trace_types_lock);
+ cpu = tracing_get_cpu(inode);
+ ret = open_pipe_on_cpu(tr, cpu);
+ if (ret)
+ goto fail_pipe_on_cpu;
/* create a buffer to store the information to pass to userspace */
iter = kzalloc(sizeof(*iter), GFP_KERNEL);
if (!iter) {
ret = -ENOMEM;
- __trace_array_put(tr);
- goto out;
+ goto fail_alloc_iter;
}
trace_seq_init(&iter->seq);
@@ -6744,7 +6787,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
iter->tr = tr;
iter->array_buffer = &tr->array_buffer;
- iter->cpu_file = tracing_get_cpu(inode);
+ iter->cpu_file = cpu;
mutex_init(&iter->mutex);
filp->private_data = iter;
@@ -6754,12 +6797,15 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
nonseekable_open(inode, filp);
tr->trace_ref++;
-out:
+
mutex_unlock(&trace_types_lock);
return ret;
fail:
kfree(iter);
+fail_alloc_iter:
+ close_pipe_on_cpu(tr, cpu);
+fail_pipe_on_cpu:
__trace_array_put(tr);
mutex_unlock(&trace_types_lock);
return ret;
@@ -6776,7 +6822,7 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
if (iter->trace->pipe_close)
iter->trace->pipe_close(iter);
-
+ close_pipe_on_cpu(tr, iter->cpu_file);
mutex_unlock(&trace_types_lock);
free_cpumask_var(iter->started);
@@ -9440,6 +9486,9 @@ static struct trace_array *trace_array_create(const char *name)
if (!alloc_cpumask_var(&tr->tracing_cpumask, GFP_KERNEL))
goto out_free_tr;
+ if (!alloc_cpumask_var(&tr->pipe_cpumask, GFP_KERNEL))
+ goto out_free_tr;
+
tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS;
cpumask_copy(tr->tracing_cpumask, cpu_all_mask);
@@ -9481,6 +9530,7 @@ static struct trace_array *trace_array_create(const char *name)
out_free_tr:
ftrace_free_ftrace_ops(tr);
free_trace_buffers(tr);
+ free_cpumask_var(tr->pipe_cpumask);
free_cpumask_var(tr->tracing_cpumask);
kfree(tr->name);
kfree(tr);
@@ -9583,6 +9633,7 @@ static int __remove_instance(struct trace_array *tr)
}
kfree(tr->topts);
+ free_cpumask_var(tr->pipe_cpumask);
free_cpumask_var(tr->tracing_cpumask);
kfree(tr->name);
kfree(tr);
@@ -10380,12 +10431,14 @@ __init static int tracer_alloc_buffers(void)
if (trace_create_savedcmd() < 0)
goto out_free_temp_buffer;
+ if (!alloc_cpumask_var(&global_trace.pipe_cpumask, GFP_KERNEL))
+ goto out_free_savedcmd;
+
/* TODO: make the number of buffers hot pluggable with CPUS */
if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) {
MEM_FAIL(1, "tracer: failed to allocate ring buffer!\n");
- goto out_free_savedcmd;
+ goto out_free_pipe_cpumask;
}
-
if (global_trace.buffer_disabled)
tracing_off();
@@ -10438,6 +10491,8 @@ __init static int tracer_alloc_buffers(void)
return 0;
+out_free_pipe_cpumask:
+ free_cpumask_var(global_trace.pipe_cpumask);
out_free_savedcmd:
free_saved_cmdlines_buffer(savedcmd);
out_free_temp_buffer:
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index e1edc2197fc8..73eaec158473 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -377,6 +377,8 @@ struct trace_array {
struct list_head events;
struct trace_event_file *trace_marker_file;
cpumask_var_t tracing_cpumask; /* only trace on set CPUs */
+ /* one per_cpu trace_pipe can be opened by only one user */
+ cpumask_var_t pipe_cpumask;
int ref;
int trace_ref;
#ifdef CONFIG_FUNCTION_TRACER
@@ -1295,6 +1297,14 @@ static inline void trace_branch_disable(void)
/* set ring buffers to default size if not already done so */
int tracing_update_buffers(void);
+union trace_synth_field {
+ u8 as_u8;
+ u16 as_u16;
+ u32 as_u32;
+ u64 as_u64;
+ struct trace_dynamic_info as_dynamic;
+};
+
struct ftrace_event_field {
struct list_head link;
const char *name;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 5d6ae4eae510..578f1f7d49a6 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -611,7 +611,6 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
{
struct trace_event_call *call = file->event_call;
struct trace_array *tr = file->tr;
- unsigned long file_flags = file->flags;
int ret = 0;
int disable;
@@ -635,6 +634,8 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
break;
disable = file->flags & EVENT_FILE_FL_SOFT_DISABLED;
clear_bit(EVENT_FILE_FL_SOFT_MODE_BIT, &file->flags);
+ /* Disable use of trace_buffered_event */
+ trace_buffered_event_disable();
} else
disable = !(file->flags & EVENT_FILE_FL_SOFT_MODE);
@@ -673,6 +674,8 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
if (atomic_inc_return(&file->sm_ref) > 1)
break;
set_bit(EVENT_FILE_FL_SOFT_MODE_BIT, &file->flags);
+ /* Enable use of trace_buffered_event */
+ trace_buffered_event_enable();
}
if (!(file->flags & EVENT_FILE_FL_ENABLED)) {
@@ -712,15 +715,6 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
break;
}
- /* Enable or disable use of trace_buffered_event */
- if ((file_flags & EVENT_FILE_FL_SOFT_DISABLED) !=
- (file->flags & EVENT_FILE_FL_SOFT_DISABLED)) {
- if (file->flags & EVENT_FILE_FL_SOFT_DISABLED)
- trace_buffered_event_enable();
- else
- trace_buffered_event_disable();
- }
-
return ret;
}
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index c8c61381eba4..d06938ae0717 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -6668,7 +6668,8 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops,
goto out_unreg;
if (has_hist_vars(hist_data) || hist_data->n_var_refs) {
- if (save_hist_vars(hist_data))
+ ret = save_hist_vars(hist_data);
+ if (ret)
goto out_unreg;
}
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index d6a70aff2410..9897d0bfcab7 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -127,7 +127,7 @@ static bool synth_event_match(const char *system, const char *event,
struct synth_trace_event {
struct trace_entry ent;
- u64 fields[];
+ union trace_synth_field fields[];
};
static int synth_event_define_fields(struct trace_event_call *call)
@@ -321,19 +321,19 @@ static const char *synth_field_fmt(char *type)
static void print_synth_event_num_val(struct trace_seq *s,
char *print_fmt, char *name,
- int size, u64 val, char *space)
+ int size, union trace_synth_field *val, char *space)
{
switch (size) {
case 1:
- trace_seq_printf(s, print_fmt, name, (u8)val, space);
+ trace_seq_printf(s, print_fmt, name, val->as_u8, space);
break;
case 2:
- trace_seq_printf(s, print_fmt, name, (u16)val, space);
+ trace_seq_printf(s, print_fmt, name, val->as_u16, space);
break;
case 4:
- trace_seq_printf(s, print_fmt, name, (u32)val, space);
+ trace_seq_printf(s, print_fmt, name, val->as_u32, space);
break;
default:
@@ -350,7 +350,7 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter,
struct trace_seq *s = &iter->seq;
struct synth_trace_event *entry;
struct synth_event *se;
- unsigned int i, n_u64;
+ unsigned int i, j, n_u64;
char print_fmt[32];
const char *fmt;
@@ -374,43 +374,28 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter,
/* parameter values */
if (se->fields[i]->is_string) {
if (se->fields[i]->is_dynamic) {
- u32 offset, data_offset;
- char *str_field;
-
- offset = (u32)entry->fields[n_u64];
- data_offset = offset & 0xffff;
-
- str_field = (char *)entry + data_offset;
+ union trace_synth_field *data = &entry->fields[n_u64];
trace_seq_printf(s, print_fmt, se->fields[i]->name,
STR_VAR_LEN_MAX,
- str_field,
+ (char *)entry + data->as_dynamic.offset,
i == se->n_fields - 1 ? "" : " ");
n_u64++;
} else {
trace_seq_printf(s, print_fmt, se->fields[i]->name,
STR_VAR_LEN_MAX,
- (char *)&entry->fields[n_u64],
+ (char *)&entry->fields[n_u64].as_u64,
i == se->n_fields - 1 ? "" : " ");
n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
}
} else if (se->fields[i]->is_stack) {
- u32 offset, data_offset, len;
- unsigned long *p, *end;
-
- offset = (u32)entry->fields[n_u64];
- data_offset = offset & 0xffff;
- len = offset >> 16;
-
- p = (void *)entry + data_offset;
- end = (void *)p + len - (sizeof(long) - 1);
+ union trace_synth_field *data = &entry->fields[n_u64];
+ unsigned long *p = (void *)entry + data->as_dynamic.offset;
trace_seq_printf(s, "%s=STACK:\n", se->fields[i]->name);
-
- for (; *p && p < end; p++)
- trace_seq_printf(s, "=> %pS\n", (void *)*p);
+ for (j = 1; j < data->as_dynamic.len / sizeof(long); j++)
+ trace_seq_printf(s, "=> %pS\n", (void *)p[j]);
n_u64++;
-
} else {
struct trace_print_flags __flags[] = {
__def_gfpflag_names, {-1, NULL} };
@@ -419,13 +404,13 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter,
print_synth_event_num_val(s, print_fmt,
se->fields[i]->name,
se->fields[i]->size,
- entry->fields[n_u64],
+ &entry->fields[n_u64],
space);
if (strcmp(se->fields[i]->type, "gfp_t") == 0) {
trace_seq_puts(s, " (");
trace_print_flags_seq(s, "|",
- entry->fields[n_u64],
+ entry->fields[n_u64].as_u64,
__flags);
trace_seq_putc(s, ')');
}
@@ -454,21 +439,16 @@ static unsigned int trace_string(struct synth_trace_event *entry,
int ret;
if (is_dynamic) {
- u32 data_offset;
+ union trace_synth_field *data = &entry->fields[*n_u64];
- data_offset = struct_size(entry, fields, event->n_u64);
- data_offset += data_size;
-
- len = fetch_store_strlen((unsigned long)str_val);
-
- data_offset |= len << 16;
- *(u32 *)&entry->fields[*n_u64] = data_offset;
+ data->as_dynamic.offset = struct_size(entry, fields, event->n_u64) + data_size;
+ data->as_dynamic.len = fetch_store_strlen((unsigned long)str_val);
ret = fetch_store_string((unsigned long)str_val, &entry->fields[*n_u64], entry);
(*n_u64)++;
} else {
- str_field = (char *)&entry->fields[*n_u64];
+ str_field = (char *)&entry->fields[*n_u64].as_u64;
#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
if ((unsigned long)str_val < TASK_SIZE)
@@ -492,6 +472,7 @@ static unsigned int trace_stack(struct synth_trace_event *entry,
unsigned int data_size,
unsigned int *n_u64)
{
+ union trace_synth_field *data = &entry->fields[*n_u64];
unsigned int len;
u32 data_offset;
void *data_loc;
@@ -504,10 +485,6 @@ static unsigned int trace_stack(struct synth_trace_event *entry,
break;
}
- /* Include the zero'd element if it fits */
- if (len < HIST_STACKTRACE_DEPTH)
- len++;
-
len *= sizeof(long);
/* Find the dynamic section to copy the stack into. */
@@ -515,8 +492,9 @@ static unsigned int trace_stack(struct synth_trace_event *entry,
memcpy(data_loc, stack, len);
/* Fill in the field that holds the offset/len combo */
- data_offset |= len << 16;
- *(u32 *)&entry->fields[*n_u64] = data_offset;
+
+ data->as_dynamic.offset = data_offset;
+ data->as_dynamic.len = len;
(*n_u64)++;
@@ -550,7 +528,8 @@ static notrace void trace_event_raw_event_synth(void *__data,
str_val = (char *)(long)var_ref_vals[val_idx];
if (event->dynamic_fields[i]->is_stack) {
- len = *((unsigned long *)str_val);
+ /* reserve one extra element for size */
+ len = *((unsigned long *)str_val) + 1;
len *= sizeof(unsigned long);
} else {
len = fetch_store_strlen((unsigned long)str_val);
@@ -592,19 +571,19 @@ static notrace void trace_event_raw_event_synth(void *__data,
switch (field->size) {
case 1:
- *(u8 *)&entry->fields[n_u64] = (u8)val;
+ entry->fields[n_u64].as_u8 = (u8)val;
break;
case 2:
- *(u16 *)&entry->fields[n_u64] = (u16)val;
+ entry->fields[n_u64].as_u16 = (u16)val;
break;
case 4:
- *(u32 *)&entry->fields[n_u64] = (u32)val;
+ entry->fields[n_u64].as_u32 = (u32)val;
break;
default:
- entry->fields[n_u64] = val;
+ entry->fields[n_u64].as_u64 = val;
break;
}
n_u64++;
@@ -1230,6 +1209,7 @@ EXPORT_SYMBOL_GPL(__synth_event_gen_cmd_start);
* synth_event_gen_cmd_array_start - Start synthetic event command from an array
* @cmd: A pointer to the dynevent_cmd struct representing the new event
* @name: The name of the synthetic event
+ * @mod: The module creating the event, NULL if not created from a module
* @fields: An array of type/name field descriptions
* @n_fields: The number of field descriptions contained in the fields array
*
@@ -1790,19 +1770,19 @@ int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...)
switch (field->size) {
case 1:
- *(u8 *)&state.entry->fields[n_u64] = (u8)val;
+ state.entry->fields[n_u64].as_u8 = (u8)val;
break;
case 2:
- *(u16 *)&state.entry->fields[n_u64] = (u16)val;
+ state.entry->fields[n_u64].as_u16 = (u16)val;
break;
case 4:
- *(u32 *)&state.entry->fields[n_u64] = (u32)val;
+ state.entry->fields[n_u64].as_u32 = (u32)val;
break;
default:
- state.entry->fields[n_u64] = val;
+ state.entry->fields[n_u64].as_u64 = val;
break;
}
n_u64++;
@@ -1883,19 +1863,19 @@ int synth_event_trace_array(struct trace_event_file *file, u64 *vals,
switch (field->size) {
case 1:
- *(u8 *)&state.entry->fields[n_u64] = (u8)val;
+ state.entry->fields[n_u64].as_u8 = (u8)val;
break;
case 2:
- *(u16 *)&state.entry->fields[n_u64] = (u16)val;
+ state.entry->fields[n_u64].as_u16 = (u16)val;
break;
case 4:
- *(u32 *)&state.entry->fields[n_u64] = (u32)val;
+ state.entry->fields[n_u64].as_u32 = (u32)val;
break;
default:
- state.entry->fields[n_u64] = val;
+ state.entry->fields[n_u64].as_u64 = val;
break;
}
n_u64++;
@@ -2030,19 +2010,19 @@ static int __synth_event_add_val(const char *field_name, u64 val,
} else {
switch (field->size) {
case 1:
- *(u8 *)&trace_state->entry->fields[field->offset] = (u8)val;
+ trace_state->entry->fields[field->offset].as_u8 = (u8)val;
break;
case 2:
- *(u16 *)&trace_state->entry->fields[field->offset] = (u16)val;
+ trace_state->entry->fields[field->offset].as_u16 = (u16)val;
break;
case 4:
- *(u32 *)&trace_state->entry->fields[field->offset] = (u32)val;
+ trace_state->entry->fields[field->offset].as_u32 = (u32)val;
break;
default:
- trace_state->entry->fields[field->offset] = val;
+ trace_state->entry->fields[field->offset].as_u64 = val;
break;
}
}
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index e535959939d3..46439e3bcec4 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -31,7 +31,9 @@ void trigger_data_free(struct event_trigger_data *data)
/**
* event_triggers_call - Call triggers associated with a trace event
* @file: The trace_event_file associated with the event
+ * @buffer: The ring buffer that the event is being written to
* @rec: The trace entry for the event, NULL for unconditional invocation
+ * @event: The event meta data in the ring buffer
*
* For each trigger associated with an event, invoke the trigger
* function registered with the associated trigger command. If rec is
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 590b3d51afae..ba37f768e2f2 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -231,7 +231,8 @@ static void irqsoff_trace_open(struct trace_iterator *iter)
{
if (is_graph(iter->tr))
graph_trace_open(iter);
-
+ else
+ iter->private = NULL;
}
static void irqsoff_trace_close(struct trace_iterator *iter)
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index b2b726bea1f9..c68a72707852 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -386,12 +386,12 @@ static const struct btf_type *find_btf_func_proto(const char *funcname)
/* Get BTF_KIND_FUNC type */
t = btf_type_by_id(btf, id);
- if (!btf_type_is_func(t))
+ if (!t || !btf_type_is_func(t))
return ERR_PTR(-ENOENT);
/* The type of BTF_KIND_FUNC is BTF_KIND_FUNC_PROTO */
t = btf_type_by_id(btf, t->type);
- if (!btf_type_is_func_proto(t))
+ if (!t || !btf_type_is_func_proto(t))
return ERR_PTR(-ENOENT);
return t;
@@ -443,7 +443,7 @@ static int parse_btf_arg(const char *varname, struct fetch_insn *code,
if (!ctx->params) {
params = find_btf_func_param(ctx->funcname, &ctx->nr_params,
ctx->flags & TPARG_FL_TPOINT);
- if (IS_ERR(params)) {
+ if (IS_ERR_OR_NULL(params)) {
trace_probe_log_err(ctx->offset, NO_BTF_ENTRY);
return PTR_ERR(params);
}
@@ -1273,7 +1273,7 @@ const char **traceprobe_expand_meta_args(int argc, const char *argv[],
params = find_btf_func_param(ctx->funcname, &nr_params,
ctx->flags & TPARG_FL_TPOINT);
- if (IS_ERR(params)) {
+ if (IS_ERR_OR_NULL(params)) {
if (args_idx != -1) {
/* $arg* requires BTF info */
trace_probe_log_err(0, NOSUP_BTFARG);
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 330aee1c1a49..0469a04a355f 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -168,6 +168,8 @@ static void wakeup_trace_open(struct trace_iterator *iter)
{
if (is_graph(iter->tr))
graph_trace_open(iter);
+ else
+ iter->private = NULL;
}
static void wakeup_trace_close(struct trace_iterator *iter)
diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c
index e5e299260d0c..bac06ee3b98b 100644
--- a/kernel/trace/trace_seq.c
+++ b/kernel/trace/trace_seq.c
@@ -131,6 +131,7 @@ EXPORT_SYMBOL_GPL(trace_seq_bitmask);
* trace_seq_vprintf - sequence printing of trace information
* @s: trace sequence descriptor
* @fmt: printf format string
+ * @args: Arguments for the format string
*
* The tracer may use either sequence operations or its own
* copy to user routines. To simplify formatting of a trace
diff --git a/kernel/trace/tracing_map.h b/kernel/trace/tracing_map.h
index 2c765ee2a4d4..99c37eeebc16 100644
--- a/kernel/trace/tracing_map.h
+++ b/kernel/trace/tracing_map.h
@@ -272,10 +272,6 @@ extern u64 tracing_map_read_sum(struct tracing_map_elt *elt, unsigned int i);
extern u64 tracing_map_read_var(struct tracing_map_elt *elt, unsigned int i);
extern u64 tracing_map_read_var_once(struct tracing_map_elt *elt, unsigned int i);
-extern void tracing_map_set_field_descr(struct tracing_map *map,
- unsigned int i,
- unsigned int key_offset,
- tracing_map_cmp_fn_t cmp_fn);
extern int
tracing_map_sort_entries(struct tracing_map *map,
struct tracing_map_sort_key *sort_keys,
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 02a8f402eeb5..800b4208dba9 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -52,6 +52,7 @@
#include <linux/sched/debug.h>
#include <linux/nmi.h>
#include <linux/kvm_para.h>
+#include <linux/delay.h>
#include "workqueue_internal.h"
@@ -338,8 +339,10 @@ static cpumask_var_t *wq_numa_possible_cpumask;
* Per-cpu work items which run for longer than the following threshold are
* automatically considered CPU intensive and excluded from concurrency
* management to prevent them from noticeably delaying other per-cpu work items.
+ * ULONG_MAX indicates that the user hasn't overridden it with a boot parameter.
+ * The actual value is initialized in wq_cpu_intensive_thresh_init().
*/
-static unsigned long wq_cpu_intensive_thresh_us = 10000;
+static unsigned long wq_cpu_intensive_thresh_us = ULONG_MAX;
module_param_named(cpu_intensive_thresh_us, wq_cpu_intensive_thresh_us, ulong, 0644);
static bool wq_disable_numa;
@@ -6513,6 +6516,42 @@ void __init workqueue_init_early(void)
!system_freezable_power_efficient_wq);
}
+static void __init wq_cpu_intensive_thresh_init(void)
+{
+ unsigned long thresh;
+ unsigned long bogo;
+
+ /* if the user set it to a specific value, keep it */
+ if (wq_cpu_intensive_thresh_us != ULONG_MAX)
+ return;
+
+ /*
+ * The default of 10ms is derived from the fact that most modern (as of
+ * 2023) processors can do a lot in 10ms and that it's just below what
+ * most consider human-perceivable. However, the kernel also runs on a
+ * lot slower CPUs including microcontrollers where the threshold is way
+ * too low.
+ *
+ * Let's scale up the threshold upto 1 second if BogoMips is below 4000.
+ * This is by no means accurate but it doesn't have to be. The mechanism
+ * is still useful even when the threshold is fully scaled up. Also, as
+ * the reports would usually be applicable to everyone, some machines
+ * operating on longer thresholds won't significantly diminish their
+ * usefulness.
+ */
+ thresh = 10 * USEC_PER_MSEC;
+
+ /* see init/calibrate.c for lpj -> BogoMIPS calculation */
+ bogo = max_t(unsigned long, loops_per_jiffy / 500000 * HZ, 1);
+ if (bogo < 4000)
+ thresh = min_t(unsigned long, thresh * 4000 / bogo, USEC_PER_SEC);
+
+ pr_debug("wq_cpu_intensive_thresh: lpj=%lu BogoMIPS=%lu thresh_us=%lu\n",
+ loops_per_jiffy, bogo, thresh);
+
+ wq_cpu_intensive_thresh_us = thresh;
+}
+
/**
* workqueue_init - bring workqueue subsystem fully online
*
@@ -6528,6 +6567,8 @@ void __init workqueue_init(void)
struct worker_pool *pool;
int cpu, bkt;
+ wq_cpu_intensive_thresh_init();
+
/*
* It'd be simpler to initialize NUMA in workqueue_init_early() but
* CPU to node mapping may not be available that early on some