aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--MAINTAINERS2
-rw-r--r--arch/arm/vdso/Makefile4
-rw-r--r--arch/arm64/kernel/vdso/Makefile4
-rw-r--r--arch/arm64/kernel/vdso32/Makefile3
-rw-r--r--arch/csky/kernel/vdso/Makefile4
-rw-r--r--arch/loongarch/vdso/Makefile4
-rw-r--r--arch/mips/vdso/Makefile4
-rw-r--r--arch/powerpc/kernel/vdso/Makefile2
-rw-r--r--arch/riscv/kernel/vdso/Makefile4
-rw-r--r--arch/s390/kernel/vdso32/Makefile3
-rw-r--r--arch/s390/kernel/vdso64/Makefile3
-rw-r--r--arch/x86/entry/vdso/Makefile5
-rw-r--r--include/linux/posix-timers.h17
-rw-r--r--kernel/signal.c21
-rw-r--r--kernel/time/posix-cpu-timers.c81
-rw-r--r--kernel/time/posix-timers.c4
-rw-r--r--kernel/time/tick-common.c12
-rw-r--r--kernel/time/tick-sched.c135
-rw-r--r--kernel/time/tick-sched.h67
-rw-r--r--lib/vdso/Makefile13
-rw-r--r--tools/testing/selftests/proc/proc-uptime-001.c25
-rw-r--r--tools/testing/selftests/proc/proc-uptime-002.c27
-rw-r--r--tools/testing/selftests/proc/proc-uptime.h28
-rw-r--r--tools/testing/selftests/timers/posix_timers.c77
24 files changed, 361 insertions, 188 deletions
diff --git a/MAINTAINERS b/MAINTAINERS
index 30cc8f683dc6..74cf4b8964a7 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -14741,7 +14741,7 @@ F: include/uapi/linux/nitro_enclaves.h
F: samples/nitro_enclaves/
NOHZ, DYNTICKS SUPPORT
-M: Frederic Weisbecker <fweisbec@gmail.com>
+M: Frederic Weisbecker <frederic@kernel.org>
M: Thomas Gleixner <tglx@linutronix.de>
M: Ingo Molnar <mingo@kernel.org>
L: linux-kernel@vger.kernel.org
diff --git a/arch/arm/vdso/Makefile b/arch/arm/vdso/Makefile
index a7ec06ce3785..515ca33b854c 100644
--- a/arch/arm/vdso/Makefile
+++ b/arch/arm/vdso/Makefile
@@ -1,8 +1,6 @@
# SPDX-License-Identifier: GPL-2.0
-# Absolute relocation type $(ARCH_REL_TYPE_ABS) needs to be defined before
-# the inclusion of generic Makefile.
-ARCH_REL_TYPE_ABS := R_ARM_JUMP_SLOT|R_ARM_GLOB_DAT|R_ARM_ABS32
+# Include the generic Makefile to check the built vdso.
include $(srctree)/lib/vdso/Makefile
hostprogs := vdsomunge
diff --git a/arch/arm64/kernel/vdso/Makefile b/arch/arm64/kernel/vdso/Makefile
index beaf9586338f..fe7a53c6781f 100644
--- a/arch/arm64/kernel/vdso/Makefile
+++ b/arch/arm64/kernel/vdso/Makefile
@@ -6,9 +6,7 @@
# Heavily based on the vDSO Makefiles for other archs.
#
-# Absolute relocation type $(ARCH_REL_TYPE_ABS) needs to be defined before
-# the inclusion of generic Makefile.
-ARCH_REL_TYPE_ABS := R_AARCH64_JUMP_SLOT|R_AARCH64_GLOB_DAT|R_AARCH64_ABS64
+# Include the generic Makefile to check the built vdso.
include $(srctree)/lib/vdso/Makefile
obj-vdso := vgettimeofday.o note.o sigreturn.o
diff --git a/arch/arm64/kernel/vdso32/Makefile b/arch/arm64/kernel/vdso32/Makefile
index f59bd1a4ead6..d014162c5c71 100644
--- a/arch/arm64/kernel/vdso32/Makefile
+++ b/arch/arm64/kernel/vdso32/Makefile
@@ -3,9 +3,6 @@
# Makefile for vdso32
#
-# Absolute relocation type $(ARCH_REL_TYPE_ABS) needs to be defined before
-# the inclusion of generic Makefile.
-ARCH_REL_TYPE_ABS := R_ARM_JUMP_SLOT|R_ARM_GLOB_DAT|R_ARM_ABS32
include $(srctree)/lib/vdso/Makefile
# Same as cc-*option, but using CC_COMPAT instead of CC
diff --git a/arch/csky/kernel/vdso/Makefile b/arch/csky/kernel/vdso/Makefile
index 0b6909f10667..299e4e41ebc5 100644
--- a/arch/csky/kernel/vdso/Makefile
+++ b/arch/csky/kernel/vdso/Makefile
@@ -1,8 +1,6 @@
# SPDX-License-Identifier: GPL-2.0-only
-# Absolute relocation type $(ARCH_REL_TYPE_ABS) needs to be defined before
-# the inclusion of generic Makefile.
-ARCH_REL_TYPE_ABS := R_CKCORE_ADDR32|R_CKCORE_JUMP_SLOT
+# Include the generic Makefile to check the built vdso.
include $(srctree)/lib/vdso/Makefile
# Symbols present in the vdso
diff --git a/arch/loongarch/vdso/Makefile b/arch/loongarch/vdso/Makefile
index d89e2ac75f7b..461240ab4436 100644
--- a/arch/loongarch/vdso/Makefile
+++ b/arch/loongarch/vdso/Makefile
@@ -1,9 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
# Objects to go into the VDSO.
-# Absolute relocation type $(ARCH_REL_TYPE_ABS) needs to be defined before
-# the inclusion of generic Makefile.
-ARCH_REL_TYPE_ABS := R_LARCH_32|R_LARCH_64|R_LARCH_MARK_LA|R_LARCH_JUMP_SLOT
+# Include the generic Makefile to check the built vdso.
include $(srctree)/lib/vdso/Makefile
obj-vdso-y := elf.o vgetcpu.o vgettimeofday.o sigreturn.o
diff --git a/arch/mips/vdso/Makefile b/arch/mips/vdso/Makefile
index 18af9474ed0e..eb56581f6d73 100644
--- a/arch/mips/vdso/Makefile
+++ b/arch/mips/vdso/Makefile
@@ -4,9 +4,7 @@
# Sanitizer runtimes are unavailable and cannot be linked here.
KCSAN_SANITIZE := n
-# Absolute relocation type $(ARCH_REL_TYPE_ABS) needs to be defined before
-# the inclusion of generic Makefile.
-ARCH_REL_TYPE_ABS := R_MIPS_JUMP_SLOT|R_MIPS_GLOB_DAT
+# Include the generic Makefile to check the built vdso.
include $(srctree)/lib/vdso/Makefile
obj-vdso-y := elf.o vgettimeofday.o sigreturn.o
diff --git a/arch/powerpc/kernel/vdso/Makefile b/arch/powerpc/kernel/vdso/Makefile
index 66f723f53be2..4c3f34485f08 100644
--- a/arch/powerpc/kernel/vdso/Makefile
+++ b/arch/powerpc/kernel/vdso/Makefile
@@ -2,7 +2,7 @@
# List of files in the vdso, has to be asm only for now
-ARCH_REL_TYPE_ABS := R_PPC_JUMP_SLOT|R_PPC_GLOB_DAT|R_PPC_ADDR32|R_PPC_ADDR24|R_PPC_ADDR16|R_PPC_ADDR16_LO|R_PPC_ADDR16_HI|R_PPC_ADDR16_HA|R_PPC_ADDR14|R_PPC_ADDR14_BRTAKEN|R_PPC_ADDR14_BRNTAKEN|R_PPC_REL24
+# Include the generic Makefile to check the built vdso.
include $(srctree)/lib/vdso/Makefile
obj-vdso32 = sigtramp32-32.o gettimeofday-32.o datapage-32.o cacheflush-32.o note-32.o getcpu-32.o
diff --git a/arch/riscv/kernel/vdso/Makefile b/arch/riscv/kernel/vdso/Makefile
index 06e6b27f3bcc..a04b3bc35ca2 100644
--- a/arch/riscv/kernel/vdso/Makefile
+++ b/arch/riscv/kernel/vdso/Makefile
@@ -1,9 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
# Copied from arch/tile/kernel/vdso/Makefile
-# Absolute relocation type $(ARCH_REL_TYPE_ABS) needs to be defined before
-# the inclusion of generic Makefile.
-ARCH_REL_TYPE_ABS := R_RISCV_32|R_RISCV_64|R_RISCV_JUMP_SLOT
+# Include the generic Makefile to check the built vdso.
include $(srctree)/lib/vdso/Makefile
# Symbols present in the vdso
vdso-syms = rt_sigreturn
diff --git a/arch/s390/kernel/vdso32/Makefile b/arch/s390/kernel/vdso32/Makefile
index 245bddfe9bc0..bafd3147eb4e 100644
--- a/arch/s390/kernel/vdso32/Makefile
+++ b/arch/s390/kernel/vdso32/Makefile
@@ -2,9 +2,8 @@
# List of files in the vdso
KCOV_INSTRUMENT := n
-ARCH_REL_TYPE_ABS := R_390_COPY|R_390_GLOB_DAT|R_390_JMP_SLOT|R_390_RELATIVE
-ARCH_REL_TYPE_ABS += R_390_GOT|R_390_PLT
+# Include the generic Makefile to check the built vdso.
include $(srctree)/lib/vdso/Makefile
obj-vdso32 = vdso_user_wrapper-32.o note-32.o
diff --git a/arch/s390/kernel/vdso64/Makefile b/arch/s390/kernel/vdso64/Makefile
index 34f9542636e9..a766d286e15f 100644
--- a/arch/s390/kernel/vdso64/Makefile
+++ b/arch/s390/kernel/vdso64/Makefile
@@ -2,9 +2,8 @@
# List of files in the vdso
KCOV_INSTRUMENT := n
-ARCH_REL_TYPE_ABS := R_390_COPY|R_390_GLOB_DAT|R_390_JMP_SLOT|R_390_RELATIVE
-ARCH_REL_TYPE_ABS += R_390_GOT|R_390_PLT
+# Include the generic Makefile to check the built vdso.
include $(srctree)/lib/vdso/Makefile
obj-vdso64 = vdso_user_wrapper.o note.o
obj-cvdso64 = vdso64_generic.o getcpu.o
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index 1506a22a4fb6..6a1821bd7d5e 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -3,10 +3,7 @@
# Building vDSO images for x86.
#
-# Absolute relocation type $(ARCH_REL_TYPE_ABS) needs to be defined before
-# the inclusion of generic Makefile.
-ARCH_REL_TYPE_ABS := R_X86_64_JUMP_SLOT|R_X86_64_GLOB_DAT|R_X86_64_RELATIVE|
-ARCH_REL_TYPE_ABS += R_386_GLOB_DAT|R_386_JMP_SLOT|R_386_RELATIVE
+# Include the generic Makefile to check the built vdso.
include $(srctree)/lib/vdso/Makefile
# Sanitizer runtimes are unavailable and cannot be linked here.
diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 2c6e99ca48af..d607f51404fc 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -4,6 +4,7 @@
#include <linux/spinlock.h>
#include <linux/list.h>
+#include <linux/mutex.h>
#include <linux/alarmtimer.h>
#include <linux/timerqueue.h>
@@ -62,16 +63,18 @@ static inline int clockid_to_fd(const clockid_t clk)
* cpu_timer - Posix CPU timer representation for k_itimer
* @node: timerqueue node to queue in the task/sig
* @head: timerqueue head on which this timer is queued
- * @task: Pointer to target task
+ * @pid: Pointer to target task PID
* @elist: List head for the expiry list
* @firing: Timer is currently firing
+ * @handling: Pointer to the task which handles expiry
*/
struct cpu_timer {
- struct timerqueue_node node;
- struct timerqueue_head *head;
- struct pid *pid;
- struct list_head elist;
- int firing;
+ struct timerqueue_node node;
+ struct timerqueue_head *head;
+ struct pid *pid;
+ struct list_head elist;
+ int firing;
+ struct task_struct __rcu *handling;
};
static inline bool cpu_timer_enqueue(struct timerqueue_head *head,
@@ -135,10 +138,12 @@ struct posix_cputimers {
/**
* posix_cputimers_work - Container for task work based posix CPU timer expiry
* @work: The task work to be scheduled
+ * @mutex: Mutex held around expiry in context of this task work
* @scheduled: @work has been scheduled already, no further processing
*/
struct posix_cputimers_work {
struct callback_head work;
+ struct mutex mutex;
unsigned int scheduled;
};
diff --git a/kernel/signal.c b/kernel/signal.c
index 8cb28f1df294..8f6330f0e9ca 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1003,8 +1003,7 @@ static void complete_signal(int sig, struct task_struct *p, enum pid_type type)
/*
* Now find a thread we can wake up to take the signal off the queue.
*
- * If the main thread wants the signal, it gets first crack.
- * Probably the least surprising to the average bear.
+ * Try the suggested task first (may or may not be the main thread).
*/
if (wants_signal(sig, p))
t = p;
@@ -1970,8 +1969,24 @@ int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type)
ret = -1;
rcu_read_lock();
+
+ /*
+ * This function is used by POSIX timers to deliver a timer signal.
+ * Where type is PIDTYPE_PID (such as for timers with SIGEV_THREAD_ID
+ * set), the signal must be delivered to the specific thread (queues
+ * into t->pending).
+ *
+ * Where type is not PIDTYPE_PID, signals must be delivered to the
+ * process. In this case, prefer to deliver to current if it is in
+ * the same thread group as the target process, which avoids
+ * unnecessarily waking up a potentially idle task.
+ */
t = pid_task(pid, type);
- if (!t || !likely(lock_task_sighand(t, &flags)))
+ if (!t)
+ goto ret;
+ if (type != PIDTYPE_PID && same_thread_group(t, current))
+ t = current;
+ if (!likely(lock_task_sighand(t, &flags)))
goto ret;
ret = 1; /* the signal is ignored */
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 2f5e9b34022c..e9c6f9d0e42c 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -846,6 +846,8 @@ static u64 collect_timerqueue(struct timerqueue_head *head,
return expires;
ctmr->firing = 1;
+ /* See posix_cpu_timer_wait_running() */
+ rcu_assign_pointer(ctmr->handling, current);
cpu_timer_dequeue(ctmr);
list_add_tail(&ctmr->elist, firing);
}
@@ -1161,7 +1163,49 @@ static void handle_posix_cpu_timers(struct task_struct *tsk);
#ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK
static void posix_cpu_timers_work(struct callback_head *work)
{
+ struct posix_cputimers_work *cw = container_of(work, typeof(*cw), work);
+
+ mutex_lock(&cw->mutex);
handle_posix_cpu_timers(current);
+ mutex_unlock(&cw->mutex);
+}
+
+/*
+ * Invoked from the posix-timer core when a cancel operation failed because
+ * the timer is marked firing. The caller holds rcu_read_lock(), which
+ * protects the timer and the task which is expiring it from being freed.
+ */
+static void posix_cpu_timer_wait_running(struct k_itimer *timr)
+{
+ struct task_struct *tsk = rcu_dereference(timr->it.cpu.handling);
+
+ /* Has the handling task completed expiry already? */
+ if (!tsk)
+ return;
+
+ /* Ensure that the task cannot go away */
+ get_task_struct(tsk);
+ /* Now drop the RCU protection so the mutex can be locked */
+ rcu_read_unlock();
+ /* Wait on the expiry mutex */
+ mutex_lock(&tsk->posix_cputimers_work.mutex);
+ /* Release it immediately again. */
+ mutex_unlock(&tsk->posix_cputimers_work.mutex);
+ /* Drop the task reference. */
+ put_task_struct(tsk);
+ /* Relock RCU so the callsite is balanced */
+ rcu_read_lock();
+}
+
+static void posix_cpu_timer_wait_running_nsleep(struct k_itimer *timr)
+{
+ /* Ensure that timr->it.cpu.handling task cannot go away */
+ rcu_read_lock();
+ spin_unlock_irq(&timr->it_lock);
+ posix_cpu_timer_wait_running(timr);
+ rcu_read_unlock();
+ /* @timr is on stack and is valid */
+ spin_lock_irq(&timr->it_lock);
}
/*
@@ -1177,6 +1221,7 @@ void clear_posix_cputimers_work(struct task_struct *p)
sizeof(p->posix_cputimers_work.work));
init_task_work(&p->posix_cputimers_work.work,
posix_cpu_timers_work);
+ mutex_init(&p->posix_cputimers_work.mutex);
p->posix_cputimers_work.scheduled = false;
}
@@ -1255,6 +1300,18 @@ static inline void __run_posix_cpu_timers(struct task_struct *tsk)
lockdep_posixtimer_exit();
}
+static void posix_cpu_timer_wait_running(struct k_itimer *timr)
+{
+ cpu_relax();
+}
+
+static void posix_cpu_timer_wait_running_nsleep(struct k_itimer *timr)
+{
+ spin_unlock_irq(&timr->it_lock);
+ cpu_relax();
+ spin_lock_irq(&timr->it_lock);
+}
+
static inline bool posix_cpu_timers_work_scheduled(struct task_struct *tsk)
{
return false;
@@ -1363,6 +1420,8 @@ static void handle_posix_cpu_timers(struct task_struct *tsk)
*/
if (likely(cpu_firing >= 0))
cpu_timer_fire(timer);
+ /* See posix_cpu_timer_wait_running() */
+ rcu_assign_pointer(timer->it.cpu.handling, NULL);
spin_unlock(&timer->it_lock);
}
}
@@ -1497,23 +1556,16 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
expires = cpu_timer_getexpires(&timer.it.cpu);
error = posix_cpu_timer_set(&timer, 0, &zero_it, &it);
if (!error) {
- /*
- * Timer is now unarmed, deletion can not fail.
- */
+ /* Timer is now unarmed, deletion can not fail. */
posix_cpu_timer_del(&timer);
+ } else {
+ while (error == TIMER_RETRY) {
+ posix_cpu_timer_wait_running_nsleep(&timer);
+ error = posix_cpu_timer_del(&timer);
+ }
}
- spin_unlock_irq(&timer.it_lock);
- while (error == TIMER_RETRY) {
- /*
- * We need to handle case when timer was or is in the
- * middle of firing. In other cases we already freed
- * resources.
- */
- spin_lock_irq(&timer.it_lock);
- error = posix_cpu_timer_del(&timer);
- spin_unlock_irq(&timer.it_lock);
- }
+ spin_unlock_irq(&timer.it_lock);
if ((it.it_value.tv_sec | it.it_value.tv_nsec) == 0) {
/*
@@ -1623,6 +1675,7 @@ const struct k_clock clock_posix_cpu = {
.timer_del = posix_cpu_timer_del,
.timer_get = posix_cpu_timer_get,
.timer_rearm = posix_cpu_timer_rearm,
+ .timer_wait_running = posix_cpu_timer_wait_running,
};
const struct k_clock clock_process = {
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 0c8a87a11b39..808a247205a9 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -846,6 +846,10 @@ static struct k_itimer *timer_wait_running(struct k_itimer *timer,
rcu_read_lock();
unlock_timer(timer, *flags);
+ /*
+ * kc->timer_wait_running() might drop RCU lock. So @timer
+ * cannot be touched anymore after the function returns!
+ */
if (!WARN_ON_ONCE(!kc->timer_wait_running))
kc->timer_wait_running(timer);
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 46789356f856..65b8658da829 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -218,9 +218,19 @@ static void tick_setup_device(struct tick_device *td,
* this cpu:
*/
if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) {
+ ktime_t next_p;
+ u32 rem;
+
tick_do_timer_cpu = cpu;
- tick_next_period = ktime_get();
+ next_p = ktime_get();
+ div_u64_rem(next_p, TICK_NSEC, &rem);
+ if (rem) {
+ next_p -= rem;
+ next_p += TICK_NSEC;
+ }
+
+ tick_next_period = next_p;
#ifdef CONFIG_NO_HZ_FULL
/*
* The boot CPU may be nohz_full, in which case set
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index a46506f7ec6d..52254679ec48 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -647,43 +647,67 @@ static void tick_nohz_update_jiffies(ktime_t now)
touch_softlockup_watchdog_sched();
}
-/*
- * Updates the per-CPU time idle statistics counters
- */
-static void
-update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time)
+static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
{
ktime_t delta;
- if (ts->idle_active) {
- delta = ktime_sub(now, ts->idle_entrytime);
- if (nr_iowait_cpu(cpu) > 0)
- ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
- else
- ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
- ts->idle_entrytime = now;
- }
+ if (WARN_ON_ONCE(!ts->idle_active))
+ return;
- if (last_update_time)
- *last_update_time = ktime_to_us(now);
+ delta = ktime_sub(now, ts->idle_entrytime);
-}
+ write_seqcount_begin(&ts->idle_sleeptime_seq);
+ if (nr_iowait_cpu(smp_processor_id()) > 0)
+ ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
+ else
+ ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
-static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
-{
- update_ts_time_stats(smp_processor_id(), ts, now, NULL);
+ ts->idle_entrytime = now;
ts->idle_active = 0;
+ write_seqcount_end(&ts->idle_sleeptime_seq);
sched_clock_idle_wakeup_event();
}
static void tick_nohz_start_idle(struct tick_sched *ts)
{
+ write_seqcount_begin(&ts->idle_sleeptime_seq);
ts->idle_entrytime = ktime_get();
ts->idle_active = 1;
+ write_seqcount_end(&ts->idle_sleeptime_seq);
+
sched_clock_idle_sleep_event();
}
+static u64 get_cpu_sleep_time_us(struct tick_sched *ts, ktime_t *sleeptime,
+ bool compute_delta, u64 *last_update_time)
+{
+ ktime_t now, idle;
+ unsigned int seq;
+
+ if (!tick_nohz_active)
+ return -1;
+
+ now = ktime_get();
+ if (last_update_time)
+ *last_update_time = ktime_to_us(now);
+
+ do {
+ seq = read_seqcount_begin(&ts->idle_sleeptime_seq);
+
+ if (ts->idle_active && compute_delta) {
+ ktime_t delta = ktime_sub(now, ts->idle_entrytime);
+
+ idle = ktime_add(*sleeptime, delta);
+ } else {
+ idle = *sleeptime;
+ }
+ } while (read_seqcount_retry(&ts->idle_sleeptime_seq, seq));
+
+ return ktime_to_us(idle);
+
+}
+
/**
* get_cpu_idle_time_us - get the total idle time of a CPU
* @cpu: CPU number to query
@@ -691,7 +715,10 @@ static void tick_nohz_start_idle(struct tick_sched *ts)
* counters if NULL.
*
* Return the cumulative idle time (since boot) for a given
- * CPU, in microseconds.
+ * CPU, in microseconds. Note this is partially broken due to
+ * the counter of iowait tasks that can be remotely updated without
+ * any synchronization. Therefore it is possible to observe backward
+ * values within two consecutive reads.
*
* This time is measured via accounting rather than sampling,
* and is as accurate as ktime_get() is.
@@ -701,27 +728,9 @@ static void tick_nohz_start_idle(struct tick_sched *ts)
u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
{
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
- ktime_t now, idle;
-
- if (!tick_nohz_active)
- return -1;
-
- now = ktime_get();
- if (last_update_time) {
- update_ts_time_stats(cpu, ts, now, last_update_time);
- idle = ts->idle_sleeptime;
- } else {
- if (ts->idle_active && !nr_iowait_cpu(cpu)) {
- ktime_t delta = ktime_sub(now, ts->idle_entrytime);
-
- idle = ktime_add(ts->idle_sleeptime, delta);
- } else {
- idle = ts->idle_sleeptime;
- }
- }
-
- return ktime_to_us(idle);
+ return get_cpu_sleep_time_us(ts, &ts->idle_sleeptime,
+ !nr_iowait_cpu(cpu), last_update_time);
}
EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
@@ -732,7 +741,10 @@ EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
* counters if NULL.
*
* Return the cumulative iowait time (since boot) for a given
- * CPU, in microseconds.
+ * CPU, in microseconds. Note this is partially broken due to
+ * the counter of iowait tasks that can be remotely updated without
+ * any synchronization. Therefore it is possible to observe backward
+ * values within two consecutive reads.
*
* This time is measured via accounting rather than sampling,
* and is as accurate as ktime_get() is.
@@ -742,26 +754,9 @@ EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
{
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
- ktime_t now, iowait;
- if (!tick_nohz_active)
- return -1;
-
- now = ktime_get();
- if (last_update_time) {
- update_ts_time_stats(cpu, ts, now, last_update_time);
- iowait = ts->iowait_sleeptime;
- } else {
- if (ts->idle_active && nr_iowait_cpu(cpu) > 0) {
- ktime_t delta = ktime_sub(now, ts->idle_entrytime);
-
- iowait = ktime_add(ts->iowait_sleeptime, delta);
- } else {
- iowait = ts->iowait_sleeptime;
- }
- }
-
- return ktime_to_us(iowait);
+ return get_cpu_sleep_time_us(ts, &ts->iowait_sleeptime,
+ nr_iowait_cpu(cpu), last_update_time);
}
EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
@@ -1094,10 +1089,16 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
return true;
}
-static void __tick_nohz_idle_stop_tick(struct tick_sched *ts)
+/**
+ * tick_nohz_idle_stop_tick - stop the idle tick from the idle task
+ *
+ * When the next event is more than a tick into the future, stop the idle tick
+ */
+void tick_nohz_idle_stop_tick(void)
{
- ktime_t expires;
+ struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
int cpu = smp_processor_id();
+ ktime_t expires;
/*
* If tick_nohz_get_sleep_length() ran tick_nohz_next_event(), the
@@ -1129,16 +1130,6 @@ static void __tick_nohz_idle_stop_tick(struct tick_sched *ts)
}
}
-/**
- * tick_nohz_idle_stop_tick - stop the idle tick from the idle task
- *
- * When the next event is more than a tick into the future, stop the idle tick
- */
-void tick_nohz_idle_stop_tick(void)
-{
- __tick_nohz_idle_stop_tick(this_cpu_ptr(&tick_cpu_sched));
-}
-
void tick_nohz_idle_retain_tick(void)
{
tick_nohz_retain_tick(this_cpu_ptr(&tick_cpu_sched));
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
index 504649513399..5ed5a9d41d5a 100644
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -22,65 +22,82 @@ enum tick_nohz_mode {
/**
* struct tick_sched - sched tick emulation and no idle tick control/stats
- * @sched_timer: hrtimer to schedule the periodic tick in high
- * resolution mode
- * @check_clocks: Notification mechanism about clocksource changes
- * @nohz_mode: Mode - one state of tick_nohz_mode
+ *
* @inidle: Indicator that the CPU is in the tick idle mode
* @tick_stopped: Indicator that the idle tick has been stopped
* @idle_active: Indicator that the CPU is actively in the tick idle mode;
* it is reset during irq handling phases.
- * @do_timer_lst: CPU was the last one doing do_timer before going idle
+ * @do_timer_last: CPU was the last one doing do_timer before going idle
* @got_idle_tick: Tick timer function has run with @inidle set
+ * @stalled_jiffies: Number of stalled jiffies detected across ticks
+ * @last_tick_jiffies: Value of jiffies seen on last tick
+ * @sched_timer: hrtimer to schedule the periodic tick in high
+ * resolution mode
* @last_tick: Store the last tick expiry time when the tick
* timer is modified for nohz sleeps. This is necessary
* to resume the tick timer operation in the timeline
* when the CPU returns from nohz sleep.
* @next_tick: Next tick to be fired when in dynticks mode.
* @idle_jiffies: jiffies at the entry to idle for idle time accounting
+ * @idle_waketime: Time when the idle was interrupted
+ * @idle_entrytime: Time when the idle call was entered
+ * @nohz_mode: Mode - one state of tick_nohz_mode
+ * @last_jiffies: Base jiffies snapshot when next event was last computed
+ * @timer_expires_base: Base time clock monotonic for @timer_expires
+ * @timer_expires: Anticipated timer expiration time (in case sched tick is stopped)
+ * @next_timer: Expiry time of next expiring timer for debugging purpose only
+ * @idle_expires: Next tick in idle, for debugging purpose only
* @idle_calls: Total number of idle calls
* @idle_sleeps: Number of idle calls, where the sched tick was stopped
- * @idle_entrytime: Time when the idle call was entered
- * @idle_waketime: Time when the idle was interrupted
* @idle_exittime: Time when the idle state was left
* @idle_sleeptime: Sum of the time slept in idle with sched tick stopped
* @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding
- * @timer_expires: Anticipated timer expiration time (in case sched tick is stopped)
- * @timer_expires_base: Base time clock monotonic for @timer_expires
- * @next_timer: Expiry time of next expiring timer for debugging purpose only
* @tick_dep_mask: Tick dependency mask - is set, if someone needs the tick
- * @last_tick_jiffies: Value of jiffies seen on last tick
- * @stalled_jiffies: Number of stalled jiffies detected across ticks
+ * @check_clocks: Notification mechanism about clocksource changes
*/
struct tick_sched {
- struct hrtimer sched_timer;
- unsigned long check_clocks;
- enum tick_nohz_mode nohz_mode;
-
+ /* Common flags */
unsigned int inidle : 1;
unsigned int tick_stopped : 1;
unsigned int idle_active : 1;
unsigned int do_timer_last : 1;
unsigned int got_idle_tick : 1;
+ /* Tick handling: jiffies stall check */
+ unsigned int stalled_jiffies;
+ unsigned long last_tick_jiffies;
+
+ /* Tick handling */
+ struct hrtimer sched_timer;
ktime_t last_tick;
ktime_t next_tick;
unsigned long idle_jiffies;
- unsigned long idle_calls;
- unsigned long idle_sleeps;
- ktime_t idle_entrytime;
ktime_t idle_waketime;
- ktime_t idle_exittime;
- ktime_t idle_sleeptime;
- ktime_t iowait_sleeptime;
+
+ /* Idle entry */
+ seqcount_t idle_sleeptime_seq;
+ ktime_t idle_entrytime;
+
+ /* Tick stop */
+ enum tick_nohz_mode nohz_mode;
unsigned long last_jiffies;
- u64 timer_expires;
u64 timer_expires_base;
+ u64 timer_expires;
u64 next_timer;
ktime_t idle_expires;
+ unsigned long idle_calls;
+ unsigned long idle_sleeps;
+
+ /* Idle exit */
+ ktime_t idle_exittime;
+ ktime_t idle_sleeptime;
+ ktime_t iowait_sleeptime;
+
+ /* Full dynticks handling */
atomic_t tick_dep_mask;
- unsigned long last_tick_jiffies;
- unsigned int stalled_jiffies;
+
+ /* Clocksource changes */
+ unsigned long check_clocks;
};
extern struct tick_sched *tick_get_tick_sched(int cpu);
diff --git a/lib/vdso/Makefile b/lib/vdso/Makefile
index e814061d6aa0..9f031eafc465 100644
--- a/lib/vdso/Makefile
+++ b/lib/vdso/Makefile
@@ -5,18 +5,13 @@ GENERIC_VDSO_DIR := $(dir $(GENERIC_VDSO_MK_PATH))
c-gettimeofday-$(CONFIG_GENERIC_GETTIMEOFDAY) := $(addprefix $(GENERIC_VDSO_DIR), gettimeofday.c)
-# This cmd checks that the vdso library does not contain absolute relocation
+# This cmd checks that the vdso library does not contain dynamic relocations.
# It has to be called after the linking of the vdso library and requires it
# as a parameter.
#
-# $(ARCH_REL_TYPE_ABS) is defined in the arch specific makefile and corresponds
-# to the absolute relocation types printed by "objdump -R" and accepted by the
-# dynamic linker.
-ifndef ARCH_REL_TYPE_ABS
-$(error ARCH_REL_TYPE_ABS is not set)
-endif
-
+# As a workaround for some GNU ld ports which produce unneeded R_*_NONE
+# dynamic relocations, ignore R_*_NONE.
quiet_cmd_vdso_check = VDSOCHK $@
- cmd_vdso_check = if $(OBJDUMP) -R $@ | grep -E -h "$(ARCH_REL_TYPE_ABS)"; \
+ cmd_vdso_check = if $(READELF) -rW $@ | grep -v _NONE | grep -q " R_\w*_"; \
then (echo >&2 "$@: dynamic relocations are not supported"; \
rm -f $@; /bin/false); fi
diff --git a/tools/testing/selftests/proc/proc-uptime-001.c b/tools/testing/selftests/proc/proc-uptime-001.c
index 781f7a50fc3f..f335eec5067e 100644
--- a/tools/testing/selftests/proc/proc-uptime-001.c
+++ b/tools/testing/selftests/proc/proc-uptime-001.c
@@ -13,7 +13,9 @@
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
-// Test that values in /proc/uptime increment monotonically.
+// Test that boottime value in /proc/uptime and CLOCK_BOOTTIME increment
+// monotonically. We don't test idle time monotonicity due to broken iowait
+// task counting, cf: comment above get_cpu_idle_time_us()
#undef NDEBUG
#include <assert.h>
#include <stdint.h>
@@ -25,20 +27,31 @@
int main(void)
{
- uint64_t start, u0, u1, i0, i1;
+ uint64_t start, u0, u1, c0, c1;
int fd;
fd = open("/proc/uptime", O_RDONLY);
assert(fd >= 0);
- proc_uptime(fd, &u0, &i0);
+ u0 = proc_uptime(fd);
start = u0;
+ c0 = clock_boottime();
+
do {
- proc_uptime(fd, &u1, &i1);
+ u1 = proc_uptime(fd);
+ c1 = clock_boottime();
+
+ /* Is /proc/uptime monotonic ? */
assert(u1 >= u0);
- assert(i1 >= i0);
+
+ /* Is CLOCK_BOOTTIME monotonic ? */
+ assert(c1 >= c0);
+
+ /* Is CLOCK_BOOTTIME VS /proc/uptime monotonic ? */
+ assert(c0 >= u0);
+
u0 = u1;
- i0 = i1;
+ c0 = c1;
} while (u1 - start < 100);
return 0;
diff --git a/tools/testing/selftests/proc/proc-uptime-002.c b/tools/testing/selftests/proc/proc-uptime-002.c
index 7d0aa22bdc12..ae453daa96c1 100644
--- a/tools/testing/selftests/proc/proc-uptime-002.c
+++ b/tools/testing/selftests/proc/proc-uptime-002.c
@@ -13,8 +13,10 @@
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
-// Test that values in /proc/uptime increment monotonically
-// while shifting across CPUs.
+// Test that boottime value in /proc/uptime and CLOCK_BOOTTIME increment
+// monotonically while shifting across CPUs. We don't test idle time
+// monotonicity due to broken iowait task counting, cf: comment above
+// get_cpu_idle_time_us()
#undef NDEBUG
#include <assert.h>
#include <errno.h>
@@ -42,10 +44,10 @@ static inline int sys_sched_setaffinity(pid_t pid, unsigned int len, unsigned lo
int main(void)
{
+ uint64_t u0, u1, c0, c1;
unsigned int len;
unsigned long *m;
unsigned int cpu;
- uint64_t u0, u1, i0, i1;
int fd;
/* find out "nr_cpu_ids" */
@@ -60,7 +62,9 @@ int main(void)
fd = open("/proc/uptime", O_RDONLY);
assert(fd >= 0);
- proc_uptime(fd, &u0, &i0);
+ u0 = proc_uptime(fd);
+ c0 = clock_boottime();
+
for (cpu = 0; cpu < len * 8; cpu++) {
memset(m, 0, len);
m[cpu / (8 * sizeof(unsigned long))] |= 1UL << (cpu % (8 * sizeof(unsigned long)));
@@ -68,11 +72,20 @@ int main(void)
/* CPU might not exist, ignore error */
sys_sched_setaffinity(0, len, m);
- proc_uptime(fd, &u1, &i1);
+ u1 = proc_uptime(fd);
+ c1 = clock_boottime();
+
+ /* Is /proc/uptime monotonic ? */
assert(u1 >= u0);
- assert(i1 >= i0);
+
+ /* Is CLOCK_BOOTTIME monotonic ? */
+ assert(c1 >= c0);
+
+ /* Is CLOCK_BOOTTIME VS /proc/uptime monotonic ? */
+ assert(c0 >= u0);
+
u0 = u1;
- i0 = i1;
+ c0 = c1;
}
return 0;
diff --git a/tools/testing/selftests/proc/proc-uptime.h b/tools/testing/selftests/proc/proc-uptime.h
index dc6a42b1d6b0..730cce4a3d73 100644
--- a/tools/testing/selftests/proc/proc-uptime.h
+++ b/tools/testing/selftests/proc/proc-uptime.h
@@ -19,10 +19,22 @@
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
+#include <time.h>
#include "proc.h"
-static void proc_uptime(int fd, uint64_t *uptime, uint64_t *idle)
+static uint64_t clock_boottime(void)
+{
+ struct timespec ts;
+ int err;
+
+ err = clock_gettime(CLOCK_BOOTTIME, &ts);
+ assert(err >= 0);
+
+ return (ts.tv_sec * 100) + (ts.tv_nsec / 10000000);
+}
+
+static uint64_t proc_uptime(int fd)
{
uint64_t val1, val2;
char buf[64], *p;
@@ -43,18 +55,6 @@ static void proc_uptime(int fd, uint64_t *uptime, uint64_t *idle)
assert(p[3] == ' ');
val2 = (p[1] - '0') * 10 + p[2] - '0';
- *uptime = val1 * 100 + val2;
-
- p += 4;
-
- val1 = xstrtoull(p, &p);
- assert(p[0] == '.');
- assert('0' <= p[1] && p[1] <= '9');
- assert('0' <= p[2] && p[2] <= '9');
- assert(p[3] == '\n');
-
- val2 = (p[1] - '0') * 10 + p[2] - '0';
- *idle = val1 * 100 + val2;
- assert(p + 4 == buf + rv);
+ return val1 * 100 + val2;
}
diff --git a/tools/testing/selftests/timers/posix_timers.c b/tools/testing/selftests/timers/posix_timers.c
index 0ba500056e63..8a17c0e8d82b 100644
--- a/tools/testing/selftests/timers/posix_timers.c
+++ b/tools/testing/selftests/timers/posix_timers.c
@@ -188,6 +188,80 @@ static int check_timer_create(int which)
return 0;
}
+int remain;
+__thread int got_signal;
+
+static void *distribution_thread(void *arg)
+{
+ while (__atomic_load_n(&remain, __ATOMIC_RELAXED));
+ return NULL;
+}
+
+static void distribution_handler(int nr)
+{
+ if (!__atomic_exchange_n(&got_signal, 1, __ATOMIC_RELAXED))
+ __atomic_fetch_sub(&remain, 1, __ATOMIC_RELAXED);
+}
+
+/*
+ * Test that all running threads _eventually_ receive CLOCK_PROCESS_CPUTIME_ID
+ * timer signals. This primarily tests that the kernel does not favour any one.
+ */
+static int check_timer_distribution(void)
+{
+ int err, i;
+ timer_t id;
+ const int nthreads = 10;
+ pthread_t threads[nthreads];
+ struct itimerspec val = {
+ .it_value.tv_sec = 0,
+ .it_value.tv_nsec = 1000 * 1000,
+ .it_interval.tv_sec = 0,
+ .it_interval.tv_nsec = 1000 * 1000,
+ };
+
+ printf("Check timer_create() per process signal distribution... ");
+ fflush(stdout);
+
+ remain = nthreads + 1; /* worker threads + this thread */
+ signal(SIGALRM, distribution_handler);
+ err = timer_create(CLOCK_PROCESS_CPUTIME_ID, NULL, &id);
+ if (err < 0) {
+ perror("Can't create timer\n");
+ return -1;
+ }
+ err = timer_settime(id, 0, &val, NULL);
+ if (err < 0) {
+ perror("Can't set timer\n");
+ return -1;
+ }
+
+ for (i = 0; i < nthreads; i++) {
+ if (pthread_create(&threads[i], NULL, distribution_thread, NULL)) {
+ perror("Can't create thread\n");
+ return -1;
+ }
+ }
+
+ /* Wait for all threads to receive the signal. */
+ while (__atomic_load_n(&remain, __ATOMIC_RELAXED));
+
+ for (i = 0; i < nthreads; i++) {
+ if (pthread_join(threads[i], NULL)) {
+ perror("Can't join thread\n");
+ return -1;
+ }
+ }
+
+ if (timer_delete(id)) {
+ perror("Can't delete timer\n");
+ return -1;
+ }
+
+ printf("[OK]\n");
+ return 0;
+}
+
int main(int argc, char **argv)
{
printf("Testing posix timers. False negative may happen on CPU execution \n");
@@ -217,5 +291,8 @@ int main(int argc, char **argv)
if (check_timer_create(CLOCK_PROCESS_CPUTIME_ID) < 0)
return ksft_exit_fail();
+ if (check_timer_distribution() < 0)
+ return ksft_exit_fail();
+
return ksft_exit_pass();
}