aboutsummaryrefslogtreecommitdiff
path: root/kernel/cgroup/cpuset.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/cgroup/cpuset.c')
-rw-r--r--kernel/cgroup/cpuset.c158
1 files changed, 60 insertions, 98 deletions
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 4237c8748715..c12b9fdb22a4 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -202,6 +202,14 @@ struct cpuset {
};
/*
+ * Legacy hierarchy call to cgroup_transfer_tasks() is handled asynchrously
+ */
+struct cpuset_remove_tasks_struct {
+ struct work_struct work;
+ struct cpuset *cs;
+};
+
+/*
* Exclusive CPUs distributed out to sub-partitions of top_cpuset
*/
static cpumask_var_t subpartitions_cpus;
@@ -360,9 +368,10 @@ static inline void notify_partition_change(struct cpuset *cs, int old_prs)
}
static struct cpuset top_cpuset = {
- .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
- (1 << CS_MEM_EXCLUSIVE)),
+ .flags = BIT(CS_ONLINE) | BIT(CS_CPU_EXCLUSIVE) |
+ BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE),
.partition_root_state = PRS_ROOT,
+ .relax_domain_level = -1,
.remote_sibling = LIST_HEAD_INIT(top_cpuset.remote_sibling),
};
@@ -449,12 +458,6 @@ static DEFINE_SPINLOCK(callback_lock);
static struct workqueue_struct *cpuset_migrate_mm_wq;
-/*
- * CPU / memory hotplug is handled asynchronously.
- */
-static void cpuset_hotplug_workfn(struct work_struct *work);
-static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
-
static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
static inline void check_insane_mems_config(nodemask_t *nodes)
@@ -540,22 +543,10 @@ static void guarantee_online_cpus(struct task_struct *tsk,
rcu_read_lock();
cs = task_cs(tsk);
- while (!cpumask_intersects(cs->effective_cpus, pmask)) {
+ while (!cpumask_intersects(cs->effective_cpus, pmask))
cs = parent_cs(cs);
- if (unlikely(!cs)) {
- /*
- * The top cpuset doesn't have any online cpu as a
- * consequence of a race between cpuset_hotplug_work
- * and cpu hotplug notifier. But we know the top
- * cpuset's effective_cpus is on its way to be
- * identical to cpu_online_mask.
- */
- goto out_unlock;
- }
- }
- cpumask_and(pmask, pmask, cs->effective_cpus);
-out_unlock:
+ cpumask_and(pmask, pmask, cs->effective_cpus);
rcu_read_unlock();
}
@@ -1217,7 +1208,7 @@ static void rebuild_sched_domains_locked(void)
/*
* If we have raced with CPU hotplug, return early to avoid
* passing doms with offlined cpu to partition_sched_domains().
- * Anyways, cpuset_hotplug_workfn() will rebuild sched domains.
+ * Anyways, cpuset_handle_hotplug() will rebuild sched domains.
*
* With no CPUs in any subpartitions, top_cpuset's effective CPUs
* should be the same as the active CPUs, so checking only top_cpuset
@@ -1260,12 +1251,17 @@ static void rebuild_sched_domains_locked(void)
}
#endif /* CONFIG_SMP */
-void rebuild_sched_domains(void)
+static void rebuild_sched_domains_cpuslocked(void)
{
- cpus_read_lock();
mutex_lock(&cpuset_mutex);
rebuild_sched_domains_locked();
mutex_unlock(&cpuset_mutex);
+}
+
+void rebuild_sched_domains(void)
+{
+ cpus_read_lock();
+ rebuild_sched_domains_cpuslocked();
cpus_read_unlock();
}
@@ -2079,14 +2075,11 @@ write_error:
/*
* For partcmd_update without newmask, it is being called from
- * cpuset_hotplug_workfn() where cpus_read_lock() wasn't taken.
- * Update the load balance flag and scheduling domain if
- * cpus_read_trylock() is successful.
+ * cpuset_handle_hotplug(). Update the load balance flag and
+ * scheduling domain accordingly.
*/
- if ((cmd == partcmd_update) && !newmask && cpus_read_trylock()) {
+ if ((cmd == partcmd_update) && !newmask)
update_partition_sd_lb(cs, old_prs);
- cpus_read_unlock();
- }
notify_partition_change(cs, old_prs);
return 0;
@@ -2948,7 +2941,7 @@ bool current_cpuset_is_being_rebound(void)
static int update_relax_domain_level(struct cpuset *cs, s64 val)
{
#ifdef CONFIG_SMP
- if (val < -1 || val >= sched_domain_level_max)
+ if (val < -1 || val > sched_domain_level_max + 1)
return -EINVAL;
#endif
@@ -3599,8 +3592,8 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
* proceeding, so that we don't end up keep removing tasks added
* after execution capability is restored.
*
- * cpuset_hotplug_work calls back into cgroup core via
- * cgroup_transfer_tasks() and waiting for it from a cgroupfs
+ * cpuset_handle_hotplug may call back into cgroup core asynchronously
+ * via cgroup_transfer_tasks() and waiting for it from a cgroupfs
* operation like this one can lead to a deadlock through kernfs
* active_ref protection. Let's break the protection. Losing the
* protection is okay as we check whether @cs is online after
@@ -3609,7 +3602,6 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
*/
css_get(&cs->css);
kernfs_break_active_protection(of->kn);
- flush_work(&cpuset_hotplug_work);
cpus_read_lock();
mutex_lock(&cpuset_mutex);
@@ -3782,9 +3774,6 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
buf = strstrip(buf);
- /*
- * Convert "root" to ENABLED, and convert "member" to DISABLED.
- */
if (!strcmp(buf, "root"))
val = PRS_ROOT;
else if (!strcmp(buf, "member"))
@@ -4060,11 +4049,6 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
cs->effective_mems = parent->effective_mems;
cs->use_parent_ecpus = true;
parent->child_ecpus_count++;
- /*
- * Clear CS_SCHED_LOAD_BALANCE if parent is isolated
- */
- if (!is_sched_load_balance(parent))
- clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
}
/*
@@ -4318,8 +4302,6 @@ int __init cpuset_init(void)
nodes_setall(top_cpuset.effective_mems);
fmeter_init(&top_cpuset.fmeter);
- set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
- top_cpuset.relax_domain_level = -1;
INIT_LIST_HEAD(&remote_children);
BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));
@@ -4354,6 +4336,16 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
}
}
+static void cpuset_migrate_tasks_workfn(struct work_struct *work)
+{
+ struct cpuset_remove_tasks_struct *s;
+
+ s = container_of(work, struct cpuset_remove_tasks_struct, work);
+ remove_tasks_in_empty_cpuset(s->cs);
+ css_put(&s->cs->css);
+ kfree(s);
+}
+
static void
hotplug_update_tasks_legacy(struct cpuset *cs,
struct cpumask *new_cpus, nodemask_t *new_mems,
@@ -4383,12 +4375,21 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
/*
* Move tasks to the nearest ancestor with execution resources,
* This is full cgroup operation which will also call back into
- * cpuset. Should be done outside any lock.
+ * cpuset. Execute it asynchronously using workqueue.
*/
- if (is_empty) {
- mutex_unlock(&cpuset_mutex);
- remove_tasks_in_empty_cpuset(cs);
- mutex_lock(&cpuset_mutex);
+ if (is_empty && cs->css.cgroup->nr_populated_csets &&
+ css_tryget_online(&cs->css)) {
+ struct cpuset_remove_tasks_struct *s;
+
+ s = kzalloc(sizeof(*s), GFP_KERNEL);
+ if (WARN_ON_ONCE(!s)) {
+ css_put(&cs->css);
+ return;
+ }
+
+ s->cs = cs;
+ INIT_WORK(&s->work, cpuset_migrate_tasks_workfn);
+ schedule_work(&s->work);
}
}
@@ -4421,30 +4422,6 @@ void cpuset_force_rebuild(void)
force_rebuild = true;
}
-/*
- * Attempt to acquire a cpus_read_lock while a hotplug operation may be in
- * progress.
- * Return: true if successful, false otherwise
- *
- * To avoid circular lock dependency between cpuset_mutex and cpus_read_lock,
- * cpus_read_trylock() is used here to acquire the lock.
- */
-static bool cpuset_hotplug_cpus_read_trylock(void)
-{
- int retries = 0;
-
- while (!cpus_read_trylock()) {
- /*
- * CPU hotplug still in progress. Retry 5 times
- * with a 10ms wait before bailing out.
- */
- if (++retries > 5)
- return false;
- msleep(10);
- }
- return true;
-}
-
/**
* cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
* @cs: cpuset in interest
@@ -4493,13 +4470,11 @@ retry:
compute_partition_effective_cpumask(cs, &new_cpus);
if (remote && cpumask_empty(&new_cpus) &&
- partition_is_populated(cs, NULL) &&
- cpuset_hotplug_cpus_read_trylock()) {
+ partition_is_populated(cs, NULL)) {
remote_partition_disable(cs, tmp);
compute_effective_cpumask(&new_cpus, cs, parent);
remote = false;
cpuset_force_rebuild();
- cpus_read_unlock();
}
/*
@@ -4519,18 +4494,8 @@ retry:
else if (is_partition_valid(parent) && is_partition_invalid(cs))
partcmd = partcmd_update;
- /*
- * cpus_read_lock needs to be held before calling
- * update_parent_effective_cpumask(). To avoid circular lock
- * dependency between cpuset_mutex and cpus_read_lock,
- * cpus_read_trylock() is used here to acquire the lock.
- */
if (partcmd >= 0) {
- if (!cpuset_hotplug_cpus_read_trylock())
- goto update_tasks;
-
update_parent_effective_cpumask(cs, partcmd, NULL, tmp);
- cpus_read_unlock();
if ((partcmd == partcmd_invalidate) || is_partition_valid(cs)) {
compute_partition_effective_cpumask(cs, &new_cpus);
cpuset_force_rebuild();
@@ -4558,8 +4523,7 @@ unlock:
}
/**
- * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
- * @work: unused
+ * cpuset_handle_hotplug - handle CPU/memory hot{,un}plug for a cpuset
*
* This function is called after either CPU or memory configuration has
* changed and updates cpuset accordingly. The top_cpuset is always
@@ -4573,8 +4537,10 @@ unlock:
*
* Note that CPU offlining during suspend is ignored. We don't modify
* cpusets across suspend/resume cycles at all.
+ *
+ * CPU / memory hotplug is handled synchronously.
*/
-static void cpuset_hotplug_workfn(struct work_struct *work)
+static void cpuset_handle_hotplug(void)
{
static cpumask_t new_cpus;
static nodemask_t new_mems;
@@ -4585,6 +4551,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
if (on_dfl && !alloc_cpumasks(NULL, &tmp))
ptmp = &tmp;
+ lockdep_assert_cpus_held();
mutex_lock(&cpuset_mutex);
/* fetch the available cpus/mems and find out which changed how */
@@ -4666,7 +4633,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
/* rebuild sched domains if cpus_allowed has changed */
if (cpus_updated || force_rebuild) {
force_rebuild = false;
- rebuild_sched_domains();
+ rebuild_sched_domains_cpuslocked();
}
free_cpumasks(NULL, ptmp);
@@ -4679,12 +4646,7 @@ void cpuset_update_active_cpus(void)
* inside cgroup synchronization. Bounce actual hotplug processing
* to a work item to avoid reverse locking order.
*/
- schedule_work(&cpuset_hotplug_work);
-}
-
-void cpuset_wait_for_hotplug(void)
-{
- flush_work(&cpuset_hotplug_work);
+ cpuset_handle_hotplug();
}
/*
@@ -4695,7 +4657,7 @@ void cpuset_wait_for_hotplug(void)
static int cpuset_track_online_nodes(struct notifier_block *self,
unsigned long action, void *arg)
{
- schedule_work(&cpuset_hotplug_work);
+ cpuset_handle_hotplug();
return NOTIFY_OK;
}