aboutsummaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig38
-rw-r--r--arch/x86/Kconfig.cpu13
-rw-r--r--arch/x86/entry/common.c17
-rw-r--r--arch/x86/entry/entry_32.S24
-rw-r--r--arch/x86/entry/entry_64.S36
-rw-r--r--arch/x86/entry/vdso/Makefile9
-rw-r--r--arch/x86/entry/vdso/vclock_gettime.c256
-rw-r--r--arch/x86/entry/vdso/vdso.lds.S2
-rw-r--r--arch/x86/entry/vdso/vdso32/vdso32.lds.S2
-rw-r--r--arch/x86/entry/vdso/vdsox32.lds.S1
-rw-r--r--arch/x86/entry/vdso/vma.c2
-rw-r--r--arch/x86/entry/vsyscall/Makefile2
-rw-r--r--arch/x86/entry/vsyscall/vsyscall_64.c37
-rw-r--r--arch/x86/entry/vsyscall/vsyscall_gtod.c83
-rw-r--r--arch/x86/events/core.c16
-rw-r--r--arch/x86/events/intel/ds.c9
-rw-r--r--arch/x86/events/intel/uncore.c1
-rw-r--r--arch/x86/events/perf_event.h21
-rw-r--r--arch/x86/hyperv/hv_init.c91
-rw-r--r--arch/x86/include/asm/apic.h5
-rw-r--r--arch/x86/include/asm/cpufeature.h4
-rw-r--r--arch/x86/include/asm/cpufeatures.h21
-rw-r--r--arch/x86/include/asm/fpu/xstate.h1
-rw-r--r--arch/x86/include/asm/hpet.h7
-rw-r--r--arch/x86/include/asm/hw_irq.h5
-rw-r--r--arch/x86/include/asm/hyperv-tlfs.h6
-rw-r--r--arch/x86/include/asm/intel-family.h2
-rw-r--r--arch/x86/include/asm/mshyperv.h81
-rw-r--r--arch/x86/include/asm/msr-index.h9
-rw-r--r--arch/x86/include/asm/processor.h3
-rw-r--r--arch/x86/include/asm/pvclock.h2
-rw-r--r--arch/x86/include/asm/time.h1
-rw-r--r--arch/x86/include/asm/vdso/gettimeofday.h261
-rw-r--r--arch/x86/include/asm/vdso/vsyscall.h44
-rw-r--r--arch/x86/include/asm/vgtod.h75
-rw-r--r--arch/x86/include/asm/vsyscall.h6
-rw-r--r--arch/x86/include/asm/vvar.h7
-rw-r--r--arch/x86/include/uapi/asm/perf_regs.h3
-rw-r--r--arch/x86/kernel/acpi/cstate.c15
-rw-r--r--arch/x86/kernel/apic/apic.c90
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c4
-rw-r--r--arch/x86/kernel/apic/io_apic.c50
-rw-r--r--arch/x86/kernel/apic/msi.c4
-rw-r--r--arch/x86/kernel/apic/vector.c4
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c2
-rw-r--r--arch/x86/kernel/cpu/Makefile2
-rw-r--r--arch/x86/kernel/cpu/aperfmperf.c12
-rw-r--r--arch/x86/kernel/cpu/bugs.c11
-rw-r--r--arch/x86/kernel/cpu/cacheinfo.c3
-rw-r--r--arch/x86/kernel/cpu/common.c58
-rw-r--r--arch/x86/kernel/cpu/cpuid-deps.c9
-rw-r--r--arch/x86/kernel/cpu/intel.c27
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c15
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c8
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c15
-rw-r--r--arch/x86/kernel/cpu/resctrl/rdtgroup.c35
-rw-r--r--arch/x86/kernel/cpu/scattered.c4
-rw-r--r--arch/x86/kernel/cpu/umwait.c200
-rw-r--r--arch/x86/kernel/cpu/vmware.c2
-rw-r--r--arch/x86/kernel/cpu/zhaoxin.c167
-rw-r--r--arch/x86/kernel/fpu/core.c52
-rw-r--r--arch/x86/kernel/fpu/init.c19
-rw-r--r--arch/x86/kernel/fpu/xstate.c11
-rw-r--r--arch/x86/kernel/ftrace.c10
-rw-r--r--arch/x86/kernel/head64.c20
-rw-r--r--arch/x86/kernel/hpet.c935
-rw-r--r--arch/x86/kernel/i8253.c25
-rw-r--r--arch/x86/kernel/idt.c3
-rw-r--r--arch/x86/kernel/irq.c2
-rw-r--r--arch/x86/kernel/jailhouse.c2
-rw-r--r--arch/x86/kernel/perf_regs.c7
-rw-r--r--arch/x86/kernel/ptrace.c20
-rw-r--r--arch/x86/kernel/pvclock.c1
-rw-r--r--arch/x86/kernel/smp.c2
-rw-r--r--arch/x86/kernel/time.c7
-rw-r--r--arch/x86/kernel/tsc.c57
-rw-r--r--arch/x86/kernel/tsc_msr.c4
-rw-r--r--arch/x86/kernel/unwind_orc.c26
-rw-r--r--arch/x86/kvm/cpuid.h2
-rw-r--r--arch/x86/kvm/lapic.c2
-rw-r--r--arch/x86/kvm/pmu.c4
-rw-r--r--arch/x86/kvm/vmx/nested.c30
-rw-r--r--arch/x86/kvm/x86.c19
-rw-r--r--arch/x86/lib/cache-smp.c3
-rw-r--r--arch/x86/mm/fault.c18
-rw-r--r--arch/x86/mm/init_64.c24
-rw-r--r--arch/x86/platform/efi/quirks.c2
87 files changed, 1866 insertions, 1381 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 2bbbd4d1ba31..fbabf59692ff 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -17,6 +17,7 @@ config X86_32
select HAVE_DEBUG_STACKOVERFLOW
select MODULES_USE_ELF_REL
select OLD_SIGACTION
+ select GENERIC_VDSO_32
config X86_64
def_bool y
@@ -121,6 +122,7 @@ config X86
select GENERIC_STRNCPY_FROM_USER
select GENERIC_STRNLEN_USER
select GENERIC_TIME_VSYSCALL
+ select GENERIC_GETTIMEOFDAY
select HARDLOCKUP_CHECK_TIMESTAMP if X86_64
select HAVE_ACPI_APEI if ACPI
select HAVE_ACPI_APEI_NMI if ACPI
@@ -202,6 +204,7 @@ config X86
select HAVE_SYSCALL_TRACEPOINTS
select HAVE_UNSTABLE_SCHED_CLOCK
select HAVE_USER_RETURN_NOTIFIER
+ select HAVE_GENERIC_VDSO
select HOTPLUG_SMT if SMP
select IRQ_FORCED_THREADING
select NEED_SG_DMA_LENGTH
@@ -2285,7 +2288,7 @@ config COMPAT_VDSO
choice
prompt "vsyscall table for legacy applications"
depends on X86_64
- default LEGACY_VSYSCALL_EMULATE
+ default LEGACY_VSYSCALL_XONLY
help
Legacy user code that does not know how to find the vDSO expects
to be able to issue three syscalls by calling fixed addresses in
@@ -2293,23 +2296,38 @@ choice
it can be used to assist security vulnerability exploitation.
This setting can be changed at boot time via the kernel command
- line parameter vsyscall=[emulate|none].
+ line parameter vsyscall=[emulate|xonly|none].
On a system with recent enough glibc (2.14 or newer) and no
static binaries, you can say None without a performance penalty
to improve security.
- If unsure, select "Emulate".
+ If unsure, select "Emulate execution only".
config LEGACY_VSYSCALL_EMULATE
- bool "Emulate"
+ bool "Full emulation"
help
- The kernel traps and emulates calls into the fixed
- vsyscall address mapping. This makes the mapping
- non-executable, but it still contains known contents,
- which could be used in certain rare security vulnerability
- exploits. This configuration is recommended when userspace
- still uses the vsyscall area.
+ The kernel traps and emulates calls into the fixed vsyscall
+ address mapping. This makes the mapping non-executable, but
+ it still contains readable known contents, which could be
+ used in certain rare security vulnerability exploits. This
+ configuration is recommended when using legacy userspace
+ that still uses vsyscalls along with legacy binary
+ instrumentation tools that require code to be readable.
+
+ An example of this type of legacy userspace is running
+ Pin on an old binary that still uses vsyscalls.
+
+ config LEGACY_VSYSCALL_XONLY
+ bool "Emulate execution only"
+ help
+ The kernel traps and emulates calls into the fixed vsyscall
+ address mapping and does not allow reads. This
+ configuration is recommended when userspace might use the
+ legacy vsyscall area but support for legacy binary
+ instrumentation of legacy code is not needed. It mitigates
+ certain uses of the vsyscall area as an ASLR-bypassing
+ buffer.
config LEGACY_VSYSCALL_NONE
bool "None"
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 6adce15268bd..8e29c991ba3e 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -480,3 +480,16 @@ config CPU_SUP_UMC_32
CPU might render the kernel unbootable.
If unsure, say N.
+
+config CPU_SUP_ZHAOXIN
+ default y
+ bool "Support Zhaoxin processors" if PROCESSOR_SELECT
+ help
+ This enables detection, tunings and quirks for Zhaoxin processors
+
+ You need this enabled if you want your kernel to run on a
+ Zhaoxin CPU. Disabling this option on other types of CPUs
+ makes the kernel a tiny bit smaller. Disabling it on a Zhaoxin
+ CPU might render the kernel unbootable.
+
+ If unsure, say N.
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 2418804e66b4..536b574b6161 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -72,23 +72,18 @@ static long syscall_trace_enter(struct pt_regs *regs)
struct thread_info *ti = current_thread_info();
unsigned long ret = 0;
- bool emulated = false;
u32 work;
if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
BUG_ON(regs != task_pt_regs(current));
- work = READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY;
+ work = READ_ONCE(ti->flags);
- if (unlikely(work & _TIF_SYSCALL_EMU))
- emulated = true;
-
- if ((emulated || (work & _TIF_SYSCALL_TRACE)) &&
- tracehook_report_syscall_entry(regs))
- return -1L;
-
- if (emulated)
- return -1L;
+ if (work & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU)) {
+ ret = tracehook_report_syscall_entry(regs);
+ if (ret || (work & _TIF_SYSCALL_EMU))
+ return -1L;
+ }
#ifdef CONFIG_SECCOMP
/*
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 7b23431be5cb..44c6e6f54bf7 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -1104,6 +1104,30 @@ ENTRY(irq_entries_start)
.endr
END(irq_entries_start)
+#ifdef CONFIG_X86_LOCAL_APIC
+ .align 8
+ENTRY(spurious_entries_start)
+ vector=FIRST_SYSTEM_VECTOR
+ .rept (NR_VECTORS - FIRST_SYSTEM_VECTOR)
+ pushl $(~vector+0x80) /* Note: always in signed byte range */
+ vector=vector+1
+ jmp common_spurious
+ .align 8
+ .endr
+END(spurious_entries_start)
+
+common_spurious:
+ ASM_CLAC
+ addl $-0x80, (%esp) /* Adjust vector into the [-256, -1] range */
+ SAVE_ALL switch_stacks=1
+ ENCODE_FRAME_POINTER
+ TRACE_IRQS_OFF
+ movl %esp, %eax
+ call smp_spurious_interrupt
+ jmp ret_from_intr
+ENDPROC(common_interrupt)
+#endif
+
/*
* the CPU automatically disables interrupts when executing an IRQ vector,
* so IRQ-flags tracing has to follow that:
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 11aa3b2afa4d..15f0749d0a15 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -375,6 +375,18 @@ ENTRY(irq_entries_start)
.endr
END(irq_entries_start)
+ .align 8
+ENTRY(spurious_entries_start)
+ vector=FIRST_SYSTEM_VECTOR
+ .rept (NR_VECTORS - FIRST_SYSTEM_VECTOR)
+ UNWIND_HINT_IRET_REGS
+ pushq $(~vector+0x80) /* Note: always in signed byte range */
+ jmp common_spurious
+ .align 8
+ vector=vector+1
+ .endr
+END(spurious_entries_start)
+
.macro DEBUG_ENTRY_ASSERT_IRQS_OFF
#ifdef CONFIG_DEBUG_ENTRY
pushq %rax
@@ -571,10 +583,20 @@ _ASM_NOKPROBE(interrupt_entry)
/* Interrupt entry/exit. */
- /*
- * The interrupt stubs push (~vector+0x80) onto the stack and
- * then jump to common_interrupt.
- */
+/*
+ * The interrupt stubs push (~vector+0x80) onto the stack and
+ * then jump to common_spurious/interrupt.
+ */
+common_spurious:
+ addq $-0x80, (%rsp) /* Adjust vector to [-256, -1] range */
+ call interrupt_entry
+ UNWIND_HINT_REGS indirect=1
+ call smp_spurious_interrupt /* rdi points to pt_regs */
+ jmp ret_from_intr
+END(common_spurious)
+_ASM_NOKPROBE(common_spurious)
+
+/* common_interrupt is a hotpath. Align it */
.p2align CONFIG_X86_L1_CACHE_SHIFT
common_interrupt:
addq $-0x80, (%rsp) /* Adjust vector to [-256, -1] range */
@@ -1670,11 +1692,17 @@ nmi_restore:
iretq
END(nmi)
+#ifndef CONFIG_IA32_EMULATION
+/*
+ * This handles SYSCALL from 32-bit code. There is no way to program
+ * MSRs to fully disable 32-bit SYSCALL.
+ */
ENTRY(ignore_sysret)
UNWIND_HINT_EMPTY
mov $-ENOSYS, %eax
sysret
END(ignore_sysret)
+#endif
ENTRY(rewind_stack_do_exit)
UNWIND_HINT_FUNC
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index 42fe42e82baf..39106111be86 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -3,6 +3,12 @@
# Building vDSO images for x86.
#
+# Absolute relocation type $(ARCH_REL_TYPE_ABS) needs to be defined before
+# the inclusion of generic Makefile.
+ARCH_REL_TYPE_ABS := R_X86_64_JUMP_SLOT|R_X86_64_GLOB_DAT|R_X86_64_RELATIVE|
+ARCH_REL_TYPE_ABS += R_386_GLOB_DAT|R_386_JMP_SLOT|R_386_RELATIVE
+include $(srctree)/lib/vdso/Makefile
+
KBUILD_CFLAGS += $(DISABLE_LTO)
KASAN_SANITIZE := n
UBSAN_SANITIZE := n
@@ -51,6 +57,7 @@ VDSO_LDFLAGS_vdso.lds = -m elf_x86_64 -soname linux-vdso.so.1 --no-undefined \
$(obj)/vdso64.so.dbg: $(obj)/vdso.lds $(vobjs) FORCE
$(call if_changed,vdso)
+ $(call if_changed,vdso_check)
HOST_EXTRACFLAGS += -I$(srctree)/tools/include -I$(srctree)/include/uapi -I$(srctree)/arch/$(SUBARCH)/include/uapi
hostprogs-y += vdso2c
@@ -121,6 +128,7 @@ $(obj)/%.so: $(obj)/%.so.dbg FORCE
$(obj)/vdsox32.so.dbg: $(obj)/vdsox32.lds $(vobjx32s) FORCE
$(call if_changed,vdso)
+ $(call if_changed,vdso_check)
CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds)
VDSO_LDFLAGS_vdso32.lds = -m elf_i386 -soname linux-gate.so.1
@@ -160,6 +168,7 @@ $(obj)/vdso32.so.dbg: FORCE \
$(obj)/vdso32/system_call.o \
$(obj)/vdso32/sigreturn.o
$(call if_changed,vdso)
+ $(call if_changed,vdso_check)
#
# The DSO images are built using a special linker script.
diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
index 4aed41f638bb..d9ff616bb0f6 100644
--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -1,251 +1,85 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * Copyright 2006 Andi Kleen, SUSE Labs.
- *
* Fast user context implementation of clock_gettime, gettimeofday, and time.
*
+ * Copyright 2006 Andi Kleen, SUSE Labs.
+ * Copyright 2019 ARM Limited
+ *
* 32 Bit compat layer by Stefani Seibold <stefani@seibold.net>
* sponsored by Rohde & Schwarz GmbH & Co. KG Munich/Germany
- *
- * The code should have no internal unresolved relocations.
- * Check with readelf after changing.
*/
-
-#include <uapi/linux/time.h>
-#include <asm/vgtod.h>
-#include <asm/vvar.h>
-#include <asm/unistd.h>
-#include <asm/msr.h>
-#include <asm/pvclock.h>
-#include <asm/mshyperv.h>
-#include <linux/math64.h>
#include <linux/time.h>
#include <linux/kernel.h>
+#include <linux/types.h>
-#define gtod (&VVAR(vsyscall_gtod_data))
+#include "../../../../lib/vdso/gettimeofday.c"
-extern int __vdso_clock_gettime(clockid_t clock, struct timespec *ts);
-extern int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz);
+extern int __vdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz);
extern time_t __vdso_time(time_t *t);
-#ifdef CONFIG_PARAVIRT_CLOCK
-extern u8 pvclock_page[PAGE_SIZE]
- __attribute__((visibility("hidden")));
-#endif
-
-#ifdef CONFIG_HYPERV_TSCPAGE
-extern u8 hvclock_page[PAGE_SIZE]
- __attribute__((visibility("hidden")));
-#endif
-
-#ifndef BUILD_VDSO32
-
-notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
+int __vdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz)
{
- long ret;
- asm ("syscall" : "=a" (ret), "=m" (*ts) :
- "0" (__NR_clock_gettime), "D" (clock), "S" (ts) :
- "rcx", "r11");
- return ret;
+ return __cvdso_gettimeofday(tv, tz);
}
-#else
+int gettimeofday(struct __kernel_old_timeval *, struct timezone *)
+ __attribute__((weak, alias("__vdso_gettimeofday")));
-notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
+time_t __vdso_time(time_t *t)
{
- long ret;
-
- asm (
- "mov %%ebx, %%edx \n"
- "mov %[clock], %%ebx \n"
- "call __kernel_vsyscall \n"
- "mov %%edx, %%ebx \n"
- : "=a" (ret), "=m" (*ts)
- : "0" (__NR_clock_gettime), [clock] "g" (clock), "c" (ts)
- : "edx");
- return ret;
+ return __cvdso_time(t);
}
-#endif
+time_t time(time_t *t) __attribute__((weak, alias("__vdso_time")));
-#ifdef CONFIG_PARAVIRT_CLOCK
-static notrace const struct pvclock_vsyscall_time_info *get_pvti0(void)
-{
- return (const struct pvclock_vsyscall_time_info *)&pvclock_page;
-}
-static notrace u64 vread_pvclock(void)
-{
- const struct pvclock_vcpu_time_info *pvti = &get_pvti0()->pvti;
- u32 version;
- u64 ret;
-
- /*
- * Note: The kernel and hypervisor must guarantee that cpu ID
- * number maps 1:1 to per-CPU pvclock time info.
- *
- * Because the hypervisor is entirely unaware of guest userspace
- * preemption, it cannot guarantee that per-CPU pvclock time
- * info is updated if the underlying CPU changes or that that
- * version is increased whenever underlying CPU changes.
- *
- * On KVM, we are guaranteed that pvti updates for any vCPU are
- * atomic as seen by *all* vCPUs. This is an even stronger
- * guarantee than we get with a normal seqlock.
- *
- * On Xen, we don't appear to have that guarantee, but Xen still
- * supplies a valid seqlock using the version field.
- *
- * We only do pvclock vdso timing at all if
- * PVCLOCK_TSC_STABLE_BIT is set, and we interpret that bit to
- * mean that all vCPUs have matching pvti and that the TSC is
- * synced, so we can just look at vCPU 0's pvti.
- */
-
- do {
- version = pvclock_read_begin(pvti);
-
- if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT)))
- return U64_MAX;
-
- ret = __pvclock_read_cycles(pvti, rdtsc_ordered());
- } while (pvclock_read_retry(pvti, version));
-
- return ret;
-}
-#endif
-#ifdef CONFIG_HYPERV_TSCPAGE
-static notrace u64 vread_hvclock(void)
-{
- const struct ms_hyperv_tsc_page *tsc_pg =
- (const struct ms_hyperv_tsc_page *)&hvclock_page;
+#if defined(CONFIG_X86_64) && !defined(BUILD_VDSO32_64)
+/* both 64-bit and x32 use these */
+extern int __vdso_clock_gettime(clockid_t clock, struct __kernel_timespec *ts);
+extern int __vdso_clock_getres(clockid_t clock, struct __kernel_timespec *res);
- return hv_read_tsc_page(tsc_pg);
-}
-#endif
-
-notrace static inline u64 vgetcyc(int mode)
+int __vdso_clock_gettime(clockid_t clock, struct __kernel_timespec *ts)
{
- if (mode == VCLOCK_TSC)
- return (u64)rdtsc_ordered();
-
- /*
- * For any memory-mapped vclock type, we need to make sure that gcc
- * doesn't cleverly hoist a load before the mode check. Otherwise we
- * might end up touching the memory-mapped page even if the vclock in
- * question isn't enabled, which will segfault. Hence the barriers.
- */
-#ifdef CONFIG_PARAVIRT_CLOCK
- if (mode == VCLOCK_PVCLOCK) {
- barrier();
- return vread_pvclock();
- }
-#endif
-#ifdef CONFIG_HYPERV_TSCPAGE
- if (mode == VCLOCK_HVCLOCK) {
- barrier();
- return vread_hvclock();
- }
-#endif
- return U64_MAX;
+ return __cvdso_clock_gettime(clock, ts);
}
-notrace static int do_hres(clockid_t clk, struct timespec *ts)
-{
- struct vgtod_ts *base = &gtod->basetime[clk];
- u64 cycles, last, sec, ns;
- unsigned int seq;
-
- do {
- seq = gtod_read_begin(gtod);
- cycles = vgetcyc(gtod->vclock_mode);
- ns = base->nsec;
- last = gtod->cycle_last;
- if (unlikely((s64)cycles < 0))
- return vdso_fallback_gettime(clk, ts);
- if (cycles > last)
- ns += (cycles - last) * gtod->mult;
- ns >>= gtod->shift;
- sec = base->sec;
- } while (unlikely(gtod_read_retry(gtod, seq)));
-
- /*
- * Do this outside the loop: a race inside the loop could result
- * in __iter_div_u64_rem() being extremely slow.
- */
- ts->tv_sec = sec + __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
- ts->tv_nsec = ns;
-
- return 0;
-}
+int clock_gettime(clockid_t, struct __kernel_timespec *)
+ __attribute__((weak, alias("__vdso_clock_gettime")));
-notrace static void do_coarse(clockid_t clk, struct timespec *ts)
+int __vdso_clock_getres(clockid_t clock,
+ struct __kernel_timespec *res)
{
- struct vgtod_ts *base = &gtod->basetime[clk];
- unsigned int seq;
-
- do {
- seq = gtod_read_begin(gtod);
- ts->tv_sec = base->sec;
- ts->tv_nsec = base->nsec;
- } while (unlikely(gtod_read_retry(gtod, seq)));
+ return __cvdso_clock_getres(clock, res);
}
+int clock_getres(clockid_t, struct __kernel_timespec *)
+ __attribute__((weak, alias("__vdso_clock_getres")));
-notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
+#else
+/* i386 only */
+extern int __vdso_clock_gettime(clockid_t clock, struct old_timespec32 *ts);
+extern int __vdso_clock_getres(clockid_t clock, struct old_timespec32 *res);
+
+int __vdso_clock_gettime(clockid_t clock, struct old_timespec32 *ts)
{
- unsigned int msk;
-
- /* Sort out negative (CPU/FD) and invalid clocks */
- if (unlikely((unsigned int) clock >= MAX_CLOCKS))
- return vdso_fallback_gettime(clock, ts);
-
- /*
- * Convert the clockid to a bitmask and use it to check which
- * clocks are handled in the VDSO directly.
- */
- msk = 1U << clock;
- if (likely(msk & VGTOD_HRES)) {
- return do_hres(clock, ts);
- } else if (msk & VGTOD_COARSE) {
- do_coarse(clock, ts);
- return 0;
- }
- return vdso_fallback_gettime(clock, ts);
+ return __cvdso_clock_gettime32(clock, ts);
}
-int clock_gettime(clockid_t, struct timespec *)
+int clock_gettime(clockid_t, struct old_timespec32 *)
__attribute__((weak, alias("__vdso_clock_gettime")));
-notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
+int __vdso_clock_gettime64(clockid_t clock, struct __kernel_timespec *ts)
{
- if (likely(tv != NULL)) {
- struct timespec *ts = (struct timespec *) tv;
-
- do_hres(CLOCK_REALTIME, ts);
- tv->tv_usec /= 1000;
- }
- if (unlikely(tz != NULL)) {
- tz->tz_minuteswest = gtod->tz_minuteswest;
- tz->tz_dsttime = gtod->tz_dsttime;
- }
-
- return 0;
+ return __cvdso_clock_gettime(clock, ts);
}
-int gettimeofday(struct timeval *, struct timezone *)
- __attribute__((weak, alias("__vdso_gettimeofday")));
-/*
- * This will break when the xtime seconds get inaccurate, but that is
- * unlikely
- */
-notrace time_t __vdso_time(time_t *t)
-{
- /* This is atomic on x86 so we don't need any locks. */
- time_t result = READ_ONCE(gtod->basetime[CLOCK_REALTIME].sec);
+int clock_gettime64(clockid_t, struct __kernel_timespec *)
+ __attribute__((weak, alias("__vdso_clock_gettime64")));
- if (t)
- *t = result;
- return result;
+int __vdso_clock_getres(clockid_t clock, struct old_timespec32 *res)
+{
+ return __cvdso_clock_getres_time32(clock, res);
}
-time_t time(time_t *t)
- __attribute__((weak, alias("__vdso_time")));
+
+int clock_getres(clockid_t, struct old_timespec32 *)
+ __attribute__((weak, alias("__vdso_clock_getres")));
+#endif
diff --git a/arch/x86/entry/vdso/vdso.lds.S b/arch/x86/entry/vdso/vdso.lds.S
index d3a2dce4cfa9..36b644e16272 100644
--- a/arch/x86/entry/vdso/vdso.lds.S
+++ b/arch/x86/entry/vdso/vdso.lds.S
@@ -25,6 +25,8 @@ VERSION {
__vdso_getcpu;
time;
__vdso_time;
+ clock_getres;
+ __vdso_clock_getres;
local: *;
};
}
diff --git a/arch/x86/entry/vdso/vdso32/vdso32.lds.S b/arch/x86/entry/vdso/vdso32/vdso32.lds.S
index 422764a81d32..c7720995ab1a 100644
--- a/arch/x86/entry/vdso/vdso32/vdso32.lds.S
+++ b/arch/x86/entry/vdso/vdso32/vdso32.lds.S
@@ -26,6 +26,8 @@ VERSION
__vdso_clock_gettime;
__vdso_gettimeofday;
__vdso_time;
+ __vdso_clock_getres;
+ __vdso_clock_gettime64;
};
LINUX_2.5 {
diff --git a/arch/x86/entry/vdso/vdsox32.lds.S b/arch/x86/entry/vdso/vdsox32.lds.S
index 05cd1c5c4a15..16a8050a4fb6 100644
--- a/arch/x86/entry/vdso/vdsox32.lds.S
+++ b/arch/x86/entry/vdso/vdsox32.lds.S
@@ -21,6 +21,7 @@ VERSION {
__vdso_gettimeofday;
__vdso_getcpu;
__vdso_time;
+ __vdso_clock_getres;
local: *;
};
}
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 8db1f594e8b1..349a61d8bf34 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -22,7 +22,7 @@
#include <asm/page.h>
#include <asm/desc.h>
#include <asm/cpufeature.h>
-#include <asm/mshyperv.h>
+#include <clocksource/hyperv_timer.h>
#if defined(CONFIG_X86_64)
unsigned int __read_mostly vdso64_enabled = 1;
diff --git a/arch/x86/entry/vsyscall/Makefile b/arch/x86/entry/vsyscall/Makefile
index 1ac4dd116c26..93c1b3e949a7 100644
--- a/arch/x86/entry/vsyscall/Makefile
+++ b/arch/x86/entry/vsyscall/Makefile
@@ -2,7 +2,5 @@
#
# Makefile for the x86 low level vsyscall code
#
-obj-y := vsyscall_gtod.o
-
obj-$(CONFIG_X86_VSYSCALL_EMULATION) += vsyscall_64.o vsyscall_emu_64.o
diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index d9d81ad7a400..07003f3f1bfc 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -42,9 +42,11 @@
#define CREATE_TRACE_POINTS
#include "vsyscall_trace.h"
-static enum { EMULATE, NONE } vsyscall_mode =
+static enum { EMULATE, XONLY, NONE } vsyscall_mode __ro_after_init =
#ifdef CONFIG_LEGACY_VSYSCALL_NONE
NONE;
+#elif defined(CONFIG_LEGACY_VSYSCALL_XONLY)
+ XONLY;
#else
EMULATE;
#endif
@@ -54,6 +56,8 @@ static int __init vsyscall_setup(char *str)
if (str) {
if (!strcmp("emulate", str))
vsyscall_mode = EMULATE;
+ else if (!strcmp("xonly", str))
+ vsyscall_mode = XONLY;
else if (!strcmp("none", str))
vsyscall_mode = NONE;
else
@@ -113,7 +117,8 @@ static bool write_ok_or_segv(unsigned long ptr, size_t size)
}
}
-bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
+bool emulate_vsyscall(unsigned long error_code,
+ struct pt_regs *regs, unsigned long address)
{
struct task_struct *tsk;
unsigned long caller;
@@ -122,6 +127,22 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
long ret;
unsigned long orig_dx;
+ /* Write faults or kernel-privilege faults never get fixed up. */
+ if ((error_code & (X86_PF_WRITE | X86_PF_USER)) != X86_PF_USER)
+ return false;
+
+ if (!(error_code & X86_PF_INSTR)) {
+ /* Failed vsyscall read */
+ if (vsyscall_mode == EMULATE)
+ return false;
+
+ /*
+ * User code tried and failed to read the vsyscall page.
+ */
+ warn_bad_vsyscall(KERN_INFO, regs, "vsyscall read attempt denied -- look up the vsyscall kernel parameter if you need a workaround");
+ return false;
+ }
+
/*
* No point in checking CS -- the only way to get here is a user mode
* trap to a high address, which means that we're in 64-bit user code.
@@ -284,7 +305,7 @@ static const char *gate_vma_name(struct vm_area_struct *vma)
static const struct vm_operations_struct gate_vma_ops = {
.name = gate_vma_name,
};
-static struct vm_area_struct gate_vma = {
+static struct vm_area_struct gate_vma __ro_after_init = {
.vm_start = VSYSCALL_ADDR,
.vm_end = VSYSCALL_ADDR + PAGE_SIZE,
.vm_page_prot = PAGE_READONLY_EXEC,
@@ -357,12 +378,20 @@ void __init map_vsyscall(void)
extern char __vsyscall_page;
unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page);
- if (vsyscall_mode != NONE) {
+ /*
+ * For full emulation, the page needs to exist for real. In
+ * execute-only mode, there is no PTE at all backing the vsyscall
+ * page.
+ */
+ if (vsyscall_mode == EMULATE) {
__set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall,
PAGE_KERNEL_VVAR);
set_vsyscall_pgtable_user_bits(swapper_pg_dir);
}
+ if (vsyscall_mode == XONLY)
+ gate_vma.vm_flags = VM_EXEC;
+
BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=
(unsigned long)VSYSCALL_ADDR);
}
diff --git a/arch/x86/entry/vsyscall/vsyscall_gtod.c b/arch/x86/entry/vsyscall/vsyscall_gtod.c
deleted file mode 100644
index cfcdba082feb..000000000000
--- a/arch/x86/entry/vsyscall/vsyscall_gtod.c
+++ /dev/null
@@ -1,83 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
- * Copyright 2003 Andi Kleen, SuSE Labs.
- *
- * Modified for x86 32 bit architecture by
- * Stefani Seibold <stefani@seibold.net>
- * sponsored by Rohde & Schwarz GmbH & Co. KG Munich/Germany
- *
- * Thanks to hpa@transmeta.com for some useful hint.
- * Special thanks to Ingo Molnar for his early experience with
- * a different vsyscall implementation for Linux/IA32 and for the name.
- *
- */
-
-#include <linux/timekeeper_internal.h>
-#include <asm/vgtod.h>
-#include <asm/vvar.h>
-
-int vclocks_used __read_mostly;
-
-DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data);
-
-void update_vsyscall_tz(void)
-{
- vsyscall_gtod_data.tz_minuteswest = sys_tz.tz_minuteswest;
- vsyscall_gtod_data.tz_dsttime = sys_tz.tz_dsttime;
-}
-
-void update_vsyscall(struct timekeeper *tk)
-{
- int vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode;
- struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data;
- struct vgtod_ts *base;
- u64 nsec;
-
- /* Mark the new vclock used. */
- BUILD_BUG_ON(VCLOCK_MAX >= 32);
- WRITE_ONCE(vclocks_used, READ_ONCE(vclocks_used) | (1 << vclock_mode));
-
- gtod_write_begin(vdata);
-
- /* copy vsyscall data */
- vdata->vclock_mode = vclock_mode;
- vdata->cycle_last = tk->tkr_mono.cycle_last;
- vdata->mask = tk->tkr_mono.mask;
- vdata->mult = tk->tkr_mono.mult;
- vdata->shift = tk->tkr_mono.shift;
-
- base = &vdata->basetime[CLOCK_REALTIME];
- base->sec = tk->xtime_sec;
- base->nsec = tk->tkr_mono.xtime_nsec;
-
- base = &vdata->basetime[CLOCK_TAI];
- base->sec = tk->xtime_sec + (s64)tk->tai_offset;
- base->nsec = tk->tkr_mono.xtime_nsec;
-
- base = &vdata->basetime[CLOCK_MONOTONIC];
- base->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
- nsec = tk->tkr_mono.xtime_nsec;
- nsec += ((u64)tk->wall_to_monotonic.tv_nsec << tk->tkr_mono.shift);
- while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) {
- nsec -= ((u64)NSEC_PER_SEC) << tk->tkr_mono.shift;
- base->sec++;
- }
- base->nsec = nsec;
-
- base = &vdata->basetime[CLOCK_REALTIME_COARSE];
- base->sec = tk->xtime_sec;
- base->nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
-
- base = &vdata->basetime[CLOCK_MONOTONIC_COARSE];
- base->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
- nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
- nsec += tk->wall_to_monotonic.tv_nsec;
- while (nsec >= NSEC_PER_SEC) {
- nsec -= NSEC_PER_SEC;
- base->sec++;
- }
- base->nsec = nsec;
-
- gtod_write_end(vdata);
-}
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index f315425d8468..3cd94a21bd53 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -561,14 +561,14 @@ int x86_pmu_hw_config(struct perf_event *event)
}
/* sample_regs_user never support XMM registers */
- if (unlikely(event->attr.sample_regs_user & PEBS_XMM_REGS))
+ if (unlikely(event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK))
return -EINVAL;
/*
* Besides the general purpose registers, XMM registers may
* be collected in PEBS on some platforms, e.g. Icelake
*/
- if (unlikely(event->attr.sample_regs_intr & PEBS_XMM_REGS)) {
- if (x86_pmu.pebs_no_xmm_regs)
+ if (unlikely(event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK)) {
+ if (!(event->pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS))
return -EINVAL;
if (!event->attr.precise_ip)
@@ -2402,13 +2402,13 @@ perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *re
return;
}
- if (perf_hw_regs(regs)) {
- if (perf_callchain_store(entry, regs->ip))
- return;
+ if (perf_callchain_store(entry, regs->ip))
+ return;
+
+ if (perf_hw_regs(regs))
unwind_start(&state, current, regs, NULL);
- } else {
+ else
unwind_start(&state, current, NULL, (void *)regs->sp);
- }
for (; !unwind_done(&state); unwind_next_frame(&state)) {
addr = unwind_get_return_address(&state);
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 7acc526b4ad2..505c73dc6a73 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -987,7 +987,7 @@ static u64 pebs_update_adaptive_cfg(struct perf_event *event)
pebs_data_cfg |= PEBS_DATACFG_GP;
if ((sample_type & PERF_SAMPLE_REGS_INTR) &&
- (attr->sample_regs_intr & PEBS_XMM_REGS))
+ (attr->sample_regs_intr & PERF_REG_EXTENDED_MASK))
pebs_data_cfg |= PEBS_DATACFG_XMMS;
if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
@@ -1964,10 +1964,9 @@ void __init intel_ds_init(void)
x86_pmu.bts = boot_cpu_has(X86_FEATURE_BTS);
x86_pmu.pebs = boot_cpu_has(X86_FEATURE_PEBS);
x86_pmu.pebs_buffer_size = PEBS_BUFFER_SIZE;
- if (x86_pmu.version <= 4) {
+ if (x86_pmu.version <= 4)
x86_pmu.pebs_no_isolation = 1;
- x86_pmu.pebs_no_xmm_regs = 1;
- }
+
if (x86_pmu.pebs) {
char pebs_type = x86_pmu.intel_cap.pebs_trap ? '+' : '-';
char *pebs_qual = "";
@@ -2020,9 +2019,9 @@ void __init intel_ds_init(void)
PERF_SAMPLE_TIME;
x86_pmu.flags |= PMU_FL_PEBS_ALL;
pebs_qual = "-baseline";
+ x86_get_pmu()->capabilities |= PERF_PMU_CAP_EXTENDED_REGS;
} else {
/* Only basic record supported */
- x86_pmu.pebs_no_xmm_regs = 1;
x86_pmu.large_pebs_flags &=
~(PERF_SAMPLE_ADDR |
PERF_SAMPLE_TIME |
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index 9e3fbd47cb56..089bfcdf2f7f 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -1400,6 +1400,7 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = {
X86_UNCORE_MODEL_MATCH(INTEL_FAM6_KABYLAKE_MOBILE, skl_uncore_init),
X86_UNCORE_MODEL_MATCH(INTEL_FAM6_KABYLAKE_DESKTOP, skl_uncore_init),
X86_UNCORE_MODEL_MATCH(INTEL_FAM6_ICELAKE_MOBILE, icl_uncore_init),
+ X86_UNCORE_MODEL_MATCH(INTEL_FAM6_ICELAKE_NNPI, icl_uncore_init),
{},
};
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index a6ac2f4f76fc..4e346856ee19 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -121,24 +121,6 @@ struct amd_nb {
(1ULL << PERF_REG_X86_R14) | \
(1ULL << PERF_REG_X86_R15))
-#define PEBS_XMM_REGS \
- ((1ULL << PERF_REG_X86_XMM0) | \
- (1ULL << PERF_REG_X86_XMM1) | \
- (1ULL << PERF_REG_X86_XMM2) | \
- (1ULL << PERF_REG_X86_XMM3) | \
- (1ULL << PERF_REG_X86_XMM4) | \
- (1ULL << PERF_REG_X86_XMM5) | \
- (1ULL << PERF_REG_X86_XMM6) | \
- (1ULL << PERF_REG_X86_XMM7) | \
- (1ULL << PERF_REG_X86_XMM8) | \
- (1ULL << PERF_REG_X86_XMM9) | \
- (1ULL << PERF_REG_X86_XMM10) | \
- (1ULL << PERF_REG_X86_XMM11) | \
- (1ULL << PERF_REG_X86_XMM12) | \
- (1ULL << PERF_REG_X86_XMM13) | \
- (1ULL << PERF_REG_X86_XMM14) | \
- (1ULL << PERF_REG_X86_XMM15))
-
/*
* Per register state.
*/
@@ -668,8 +650,7 @@ struct x86_pmu {
pebs_broken :1,
pebs_prec_dist :1,
pebs_no_tlb :1,
- pebs_no_isolation :1,
- pebs_no_xmm_regs :1;
+ pebs_no_isolation :1;
int pebs_record_size;
int pebs_buffer_size;
int max_pebs_events;
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 1608050e9df9..0e033ef11a9f 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -17,64 +17,13 @@
#include <linux/version.h>
#include <linux/vmalloc.h>
#include <linux/mm.h>
-#include <linux/clockchips.h>
#include <linux/hyperv.h>
#include <linux/slab.h>
#include <linux/cpuhotplug.h>
-
-#ifdef CONFIG_HYPERV_TSCPAGE
-
-static struct ms_hyperv_tsc_page *tsc_pg;
-
-struct ms_hyperv_tsc_page *hv_get_tsc_page(void)
-{
- return tsc_pg;
-}
-EXPORT_SYMBOL_GPL(hv_get_tsc_page);
-
-static u64 read_hv_clock_tsc(struct clocksource *arg)
-{
- u64 current_tick = hv_read_tsc_page(tsc_pg);
-
- if (current_tick == U64_MAX)
- rdmsrl(HV_X64_MSR_TIME_REF_COUNT, current_tick);
-
- return current_tick;
-}
-
-static struct clocksource hyperv_cs_tsc = {
- .name = "hyperv_clocksource_tsc_page",
- .rating = 400,
- .read = read_hv_clock_tsc,
- .mask = CLOCKSOURCE_MASK(64),
- .flags = CLOCK_SOURCE_IS_CONTINUOUS,
-};
-#endif
-
-static u64 read_hv_clock_msr(struct clocksource *arg)
-{
- u64 current_tick;
- /*
- * Read the partition counter to get the current tick count. This count
- * is set to 0 when the partition is created and is incremented in
- * 100 nanosecond units.
- */
- rdmsrl(HV_X64_MSR_TIME_REF_COUNT, current_tick);
- return current_tick;
-}
-
-static struct clocksource hyperv_cs_msr = {
- .name = "hyperv_clocksource_msr",
- .rating = 400,
- .read = read_hv_clock_msr,
- .mask = CLOCKSOURCE_MASK(64),
- .flags = CLOCK_SOURCE_IS_CONTINUOUS,
-};
+#include <clocksource/hyperv_timer.h>
void *hv_hypercall_pg;
EXPORT_SYMBOL_GPL(hv_hypercall_pg);
-struct clocksource *hyperv_cs;
-EXPORT_SYMBOL_GPL(hyperv_cs);
u32 *hv_vp_index;
EXPORT_SYMBOL_GPL(hv_vp_index);
@@ -343,42 +292,8 @@ void __init hyperv_init(void)
x86_init.pci.arch_init = hv_pci_init;
- /*
- * Register Hyper-V specific clocksource.
- */
-#ifdef CONFIG_HYPERV_TSCPAGE
- if (ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE) {
- union hv_x64_msr_hypercall_contents tsc_msr;
-
- tsc_pg = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL);
- if (!tsc_pg)
- goto register_msr_cs;
-
- hyperv_cs = &hyperv_cs_tsc;
-
- rdmsrl(HV_X64_MSR_REFERENCE_TSC, tsc_msr.as_uint64);
-
- tsc_msr.enable = 1;
- tsc_msr.guest_physical_address = vmalloc_to_pfn(tsc_pg);
-
- wrmsrl(HV_X64_MSR_REFERENCE_TSC, tsc_msr.as_uint64);
-
- hyperv_cs_tsc.archdata.vclock_mode = VCLOCK_HVCLOCK;
-
- clocksource_register_hz(&hyperv_cs_tsc, NSEC_PER_SEC/100);
- return;
- }
-register_msr_cs:
-#endif
- /*
- * For 32 bit guests just use the MSR based mechanism for reading
- * the partition counter.
- */
-
- hyperv_cs = &hyperv_cs_msr;
- if (ms_hyperv.features & HV_MSR_TIME_REF_COUNT_AVAILABLE)
- clocksource_register_hz(&hyperv_cs_msr, NSEC_PER_SEC/100);
-
+ /* Register Hyper-V specific clocksource */
+ hv_init_clocksource();
return;
remove_cpuhp_state:
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 1340fa53b575..050e5f9ebf81 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -53,7 +53,7 @@ extern unsigned int apic_verbosity;
extern int local_apic_timer_c2_ok;
extern int disable_apic;
-extern unsigned int lapic_timer_frequency;
+extern unsigned int lapic_timer_period;
extern enum apic_intr_mode_id apic_intr_mode;
enum apic_intr_mode_id {
@@ -155,7 +155,6 @@ static inline int apic_force_enable(unsigned long addr)
extern int apic_force_enable(unsigned long addr);
#endif
-extern void apic_bsp_setup(bool upmode);
extern void apic_ap_setup(void);
/*
@@ -175,6 +174,7 @@ extern void lapic_assign_system_vectors(void);
extern void lapic_assign_legacy_vector(unsigned int isairq, bool replace);
extern void lapic_online(void);
extern void lapic_offline(void);
+extern bool apic_needs_pit(void);
#else /* !CONFIG_X86_LOCAL_APIC */
static inline void lapic_shutdown(void) { }
@@ -188,6 +188,7 @@ static inline void init_bsp_APIC(void) { }
static inline void apic_intr_mode_init(void) { }
static inline void lapic_assign_system_vectors(void) { }
static inline void lapic_assign_legacy_vector(unsigned int i, bool r) { }
+static inline bool apic_needs_pit(void) { return true; }
#endif /* !CONFIG_X86_LOCAL_APIC */
#ifdef CONFIG_X86_X2APIC
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 1d337c51f7e6..58acda503817 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -22,8 +22,8 @@ enum cpuid_leafs
CPUID_LNX_3,
CPUID_7_0_EBX,
CPUID_D_1_EAX,
- CPUID_F_0_EDX,
- CPUID_F_1_EDX,
+ CPUID_LNX_4,
+ CPUID_7_1_EAX,
CPUID_8000_0008_EBX,
CPUID_6_EAX,
CPUID_8000_000A_EDX,
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 75f27ee2c263..998c2cc08363 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -239,12 +239,14 @@
#define X86_FEATURE_BMI1 ( 9*32+ 3) /* 1st group bit manipulation extensions */
#define X86_FEATURE_HLE ( 9*32+ 4) /* Hardware Lock Elision */
#define X86_FEATURE_AVX2 ( 9*32+ 5) /* AVX2 instructions */
+#define X86_FEATURE_FDP_EXCPTN_ONLY ( 9*32+ 6) /* "" FPU data pointer updated only on x87 exceptions */
#define X86_FEATURE_SMEP ( 9*32+ 7) /* Supervisor Mode Execution Protection */
#define X86_FEATURE_BMI2 ( 9*32+ 8) /* 2nd group bit manipulation extensions */
#define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB instructions */
#define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context ID */
#define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */
#define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */
+#define X86_FEATURE_ZERO_FCS_FDS ( 9*32+13) /* "" Zero out FPU CS and FPU DS */
#define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */
#define X86_FEATURE_RDT_A ( 9*32+15) /* Resource Director Technology Allocation */
#define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */
@@ -269,13 +271,19 @@
#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 instruction */
#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS instructions */
-/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (EDX), word 11 */
-#define X86_FEATURE_CQM_LLC (11*32+ 1) /* LLC QoS if 1 */
+/*
+ * Extended auxiliary flags: Linux defined - for features scattered in various
+ * CPUID levels like 0xf, etc.
+ *
+ * Reuse free bits when adding new feature flags!
+ */
+#define X86_FEATURE_CQM_LLC (11*32+ 0) /* LLC QoS if 1 */
+#define X86_FEATURE_CQM_OCCUP_LLC (11*32+ 1) /* LLC occupancy monitoring */
+#define X86_FEATURE_CQM_MBM_TOTAL (11*32+ 2) /* LLC Total MBM monitoring */
+#define X86_FEATURE_CQM_MBM_LOCAL (11*32+ 3) /* LLC Local MBM monitoring */
-/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (EDX), word 12 */
-#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring */
-#define X86_FEATURE_CQM_MBM_TOTAL (12*32+ 1) /* LLC Total MBM monitoring */
-#define X86_FEATURE_CQM_MBM_LOCAL (12*32+ 2) /* LLC Local MBM monitoring */
+/* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */
+#define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */
/* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */
#define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */
@@ -322,6 +330,7 @@
#define X86_FEATURE_UMIP (16*32+ 2) /* User Mode Instruction Protection */
#define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */
#define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */
+#define X86_FEATURE_WAITPKG (16*32+ 5) /* UMONITOR/UMWAIT/TPAUSE Instructions */
#define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */
#define X86_FEATURE_GFNI (16*32+ 8) /* Galois Field New Instructions */
#define X86_FEATURE_VAES (16*32+ 9) /* Vector AES */
diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index 7e42b285c856..c6136d79f8c0 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -47,7 +47,6 @@ extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
extern void __init update_regset_xstate_info(unsigned int size,
u64 xstate_mask);
-void fpu__xstate_clear_all_cpu_caps(void);
void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr);
const void *get_xsave_field_ptr(int xfeature_nr);
int using_compacted_format(void);
diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h
index 67385d56d4f4..6352dee37cda 100644
--- a/arch/x86/include/asm/hpet.h
+++ b/arch/x86/include/asm/hpet.h
@@ -75,16 +75,15 @@ extern unsigned int hpet_readl(unsigned int a);
extern void force_hpet_resume(void);
struct irq_data;
-struct hpet_dev;
+struct hpet_channel;
struct irq_domain;
extern void hpet_msi_unmask(struct irq_data *data);
extern void hpet_msi_mask(struct irq_data *data);
-extern void hpet_msi_write(struct hpet_dev *hdev, struct msi_msg *msg);
-extern void hpet_msi_read(struct hpet_dev *hdev, struct msi_msg *msg);
+extern void hpet_msi_write(struct hpet_channel *hc, struct msi_msg *msg);
extern struct irq_domain *hpet_create_irq_domain(int hpet_id);
extern int hpet_assign_irq(struct irq_domain *domain,
- struct hpet_dev *dev, int dev_num);
+ struct hpet_channel *hc, int dev_num);
#ifdef CONFIG_HPET_EMULATE_RTC
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 32e666e1231e..cbd97e22d2f3 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -150,8 +150,11 @@ extern char irq_entries_start[];
#define trace_irq_entries_start irq_entries_start
#endif
+extern char spurious_entries_start[];
+
#define VECTOR_UNUSED NULL
-#define VECTOR_RETRIGGERED ((void *)~0UL)
+#define VECTOR_SHUTDOWN ((void *)~0UL)
+#define VECTOR_RETRIGGERED ((void *)~1UL)
typedef struct irq_desc* vector_irq_t[NR_VECTORS];
DECLARE_PER_CPU(vector_irq_t, vector_irq);
diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h
index cdf44aa9a501..af78cd72b8f3 100644
--- a/arch/x86/include/asm/hyperv-tlfs.h
+++ b/arch/x86/include/asm/hyperv-tlfs.h
@@ -401,6 +401,12 @@ enum HV_GENERIC_SET_FORMAT {
#define HV_STATUS_INVALID_CONNECTION_ID 18
#define HV_STATUS_INSUFFICIENT_BUFFERS 19
+/*
+ * The Hyper-V TimeRefCount register and the TSC
+ * page provide a guest VM clock with 100ns tick rate
+ */
+#define HV_CLOCK_HZ (NSEC_PER_SEC/100)
+
typedef struct _HV_REFERENCE_TSC_PAGE {
__u32 tsc_sequence;
__u32 res1;
diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h
index 310118805f57..0278aa66ef62 100644
--- a/arch/x86/include/asm/intel-family.h
+++ b/arch/x86/include/asm/intel-family.h
@@ -56,6 +56,7 @@
#define INTEL_FAM6_ICELAKE_XEON_D 0x6C
#define INTEL_FAM6_ICELAKE_DESKTOP 0x7D
#define INTEL_FAM6_ICELAKE_MOBILE 0x7E
+#define INTEL_FAM6_ICELAKE_NNPI 0x9D
/* "Small Core" Processors (Atom) */
@@ -76,6 +77,7 @@
#define INTEL_FAM6_ATOM_GOLDMONT 0x5C /* Apollo Lake */
#define INTEL_FAM6_ATOM_GOLDMONT_X 0x5F /* Denverton */
#define INTEL_FAM6_ATOM_GOLDMONT_PLUS 0x7A /* Gemini Lake */
+
#define INTEL_FAM6_ATOM_TREMONT_X 0x86 /* Jacobsville */
/* Xeon Phi */
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index cc60e617931c..f4fa8a9d5d0b 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -105,6 +105,17 @@ static inline void vmbus_signal_eom(struct hv_message *msg, u32 old_msg_type)
#define hv_get_crash_ctl(val) \
rdmsrl(HV_X64_MSR_CRASH_CTL, val)
+#define hv_get_time_ref_count(val) \
+ rdmsrl(HV_X64_MSR_TIME_REF_COUNT, val)
+
+#define hv_get_reference_tsc(val) \
+ rdmsrl(HV_X64_MSR_REFERENCE_TSC, val)
+#define hv_set_reference_tsc(val) \
+ wrmsrl(HV_X64_MSR_REFERENCE_TSC, val)
+#define hv_set_clocksource_vdso(val) \
+ ((val).archdata.vclock_mode = VCLOCK_HVCLOCK)
+#define hv_get_raw_timer() rdtsc_ordered()
+
void hyperv_callback_vector(void);
void hyperv_reenlightenment_vector(void);
#ifdef CONFIG_TRACING
@@ -133,7 +144,6 @@ static inline void hv_disable_stimer0_percpu_irq(int irq) {}
#if IS_ENABLED(CONFIG_HYPERV)
-extern struct clocksource *hyperv_cs;
extern void *hv_hypercall_pg;
extern void __percpu **hyperv_pcpu_input_arg;
@@ -387,73 +397,4 @@ static inline int hyperv_flush_guest_mapping_range(u64 as,
}
#endif /* CONFIG_HYPERV */
-#ifdef CONFIG_HYPERV_TSCPAGE
-struct ms_hyperv_tsc_page *hv_get_tsc_page(void);
-static inline u64 hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg,
- u64 *cur_tsc)
-{
- u64 scale, offset;
- u32 sequence;
-
- /*
- * The protocol for reading Hyper-V TSC page is specified in Hypervisor
- * Top-Level Functional Specification ver. 3.0 and above. To get the
- * reference time we must do the following:
- * - READ ReferenceTscSequence
- * A special '0' value indicates the time source is unreliable and we
- * need to use something else. The currently published specification
- * versions (up to 4.0b) contain a mistake and wrongly claim '-1'
- * instead of '0' as the special value, see commit c35b82ef0294.
- * - ReferenceTime =
- * ((RDTSC() * ReferenceTscScale) >> 64) + ReferenceTscOffset
- * - READ ReferenceTscSequence again. In case its value has changed
- * since our first reading we need to discard ReferenceTime and repeat
- * the whole sequence as the hypervisor was updating the page in
- * between.
- */
- do {
- sequence = READ_ONCE(tsc_pg->tsc_sequence);
- if (!sequence)
- return U64_MAX;
- /*
- * Make sure we read sequence before we read other values from
- * TSC page.
- */
- smp_rmb();
-
- scale = READ_ONCE(tsc_pg->tsc_scale);
- offset = READ_ONCE(tsc_pg->tsc_offset);
- *cur_tsc = rdtsc_ordered();
-
- /*
- * Make sure we read sequence after we read all other values
- * from TSC page.
- */
- smp_rmb();
-
- } while (READ_ONCE(tsc_pg->tsc_sequence) != sequence);
-
- return mul_u64_u64_shr(*cur_tsc, scale, 64) + offset;
-}
-
-static inline u64 hv_read_tsc_page(const struct ms_hyperv_tsc_page *tsc_pg)
-{
- u64 cur_tsc;
-
- return hv_read_tsc_page_tsc(tsc_pg, &cur_tsc);
-}
-
-#else
-static inline struct ms_hyperv_tsc_page *hv_get_tsc_page(void)
-{
- return NULL;
-}
-
-static inline u64 hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg,
- u64 *cur_tsc)
-{
- BUG();
- return U64_MAX;
-}
-#endif
#endif
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 979ef971cc78..6b4fc2788078 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -61,6 +61,15 @@
#define MSR_PLATFORM_INFO_CPUID_FAULT_BIT 31
#define MSR_PLATFORM_INFO_CPUID_FAULT BIT_ULL(MSR_PLATFORM_INFO_CPUID_FAULT_BIT)
+#define MSR_IA32_UMWAIT_CONTROL 0xe1
+#define MSR_IA32_UMWAIT_CONTROL_C02_DISABLE BIT(0)
+#define MSR_IA32_UMWAIT_CONTROL_RESERVED BIT(1)
+/*
+ * The time field is bit[31:2], but representing a 32bit value with
+ * bit[1:0] zero.
+ */
+#define MSR_IA32_UMWAIT_CONTROL_TIME_MASK (~0x03U)
+
#define MSR_PKG_CST_CONFIG_CONTROL 0x000000e2
#define NHM_C3_AUTO_DEMOTE (1UL << 25)
#define NHM_C1_AUTO_DEMOTE (1UL << 26)
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index c34a35c78618..e57d2ca2ed87 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -144,7 +144,8 @@ enum cpuid_regs_idx {
#define X86_VENDOR_TRANSMETA 7
#define X86_VENDOR_NSC 8
#define X86_VENDOR_HYGON 9
-#define X86_VENDOR_NUM 10
+#define X86_VENDOR_ZHAOXIN 10
+#define X86_VENDOR_NUM 11
#define X86_VENDOR_UNKNOWN 0xff
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index b6033680d458..19b695ff2c68 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -2,7 +2,7 @@
#ifndef _ASM_X86_PVCLOCK_H
#define _ASM_X86_PVCLOCK_H
-#include <linux/clocksource.h>
+#include <asm/clocksource.h>
#include <asm/pvclock-abi.h>
/* some helper functions for xen and kvm pv clock sources */
diff --git a/arch/x86/include/asm/time.h b/arch/x86/include/asm/time.h
index cef818b16045..8ac563abb567 100644
--- a/arch/x86/include/asm/time.h
+++ b/arch/x86/include/asm/time.h
@@ -7,6 +7,7 @@
extern void hpet_time_init(void);
extern void time_init(void);
+extern bool pit_timer_init(void);
extern struct clock_event_device *global_clock_event;
diff --git a/arch/x86/include/asm/vdso/gettimeofday.h b/arch/x86/include/asm/vdso/gettimeofday.h
new file mode 100644
index 000000000000..ae91429129a6
--- /dev/null
+++ b/arch/x86/include/asm/vdso/gettimeofday.h
@@ -0,0 +1,261 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Fast user context implementation of clock_gettime, gettimeofday, and time.
+ *
+ * Copyright (C) 2019 ARM Limited.
+ * Copyright 2006 Andi Kleen, SUSE Labs.
+ * 32 Bit compat layer by Stefani Seibold <stefani@seibold.net>
+ * sponsored by Rohde & Schwarz GmbH & Co. KG Munich/Germany
+ */
+#ifndef __ASM_VDSO_GETTIMEOFDAY_H
+#define __ASM_VDSO_GETTIMEOFDAY_H
+
+#ifndef __ASSEMBLY__
+
+#include <uapi/linux/time.h>
+#include <asm/vgtod.h>
+#include <asm/vvar.h>
+#include <asm/unistd.h>
+#include <asm/msr.h>
+#include <asm/pvclock.h>
+#include <clocksource/hyperv_timer.h>
+
+#define __vdso_data (VVAR(_vdso_data))
+
+#define VDSO_HAS_TIME 1
+
+#define VDSO_HAS_CLOCK_GETRES 1
+
+/*
+ * Declare the memory-mapped vclock data pages. These come from hypervisors.
+ * If we ever reintroduce something like direct access to an MMIO clock like
+ * the HPET again, it will go here as well.
+ *
+ * A load from any of these pages will segfault if the clock in question is
+ * disabled, so appropriate compiler barriers and checks need to be used
+ * to prevent stray loads.
+ *
+ * These declarations MUST NOT be const. The compiler will assume that
+ * an extern const variable has genuinely constant contents, and the
+ * resulting code won't work, since the whole point is that these pages
+ * change over time, possibly while we're accessing them.
+ */
+
+#ifdef CONFIG_PARAVIRT_CLOCK
+/*
+ * This is the vCPU 0 pvclock page. We only use pvclock from the vDSO
+ * if the hypervisor tells us that all vCPUs can get valid data from the
+ * vCPU 0 page.
+ */
+extern struct pvclock_vsyscall_time_info pvclock_page
+ __attribute__((visibility("hidden")));
+#endif
+
+#ifdef CONFIG_HYPERV_TSCPAGE
+extern struct ms_hyperv_tsc_page hvclock_page
+ __attribute__((visibility("hidden")));
+#endif
+
+#ifndef BUILD_VDSO32
+
+static __always_inline
+long clock_gettime_fallback(clockid_t _clkid, struct __kernel_timespec *_ts)
+{
+ long ret;
+
+ asm ("syscall" : "=a" (ret), "=m" (*_ts) :
+ "0" (__NR_clock_gettime), "D" (_clkid), "S" (_ts) :
+ "rcx", "r11");
+
+ return ret;
+}
+
+static __always_inline
+long gettimeofday_fallback(struct __kernel_old_timeval *_tv,
+ struct timezone *_tz)
+{
+ long ret;
+
+ asm("syscall" : "=a" (ret) :
+ "0" (__NR_gettimeofday), "D" (_tv), "S" (_tz) : "memory");
+
+ return ret;
+}
+
+static __always_inline
+long clock_getres_fallback(clockid_t _clkid, struct __kernel_timespec *_ts)
+{
+ long ret;
+
+ asm ("syscall" : "=a" (ret), "=m" (*_ts) :
+ "0" (__NR_clock_getres), "D" (_clkid), "S" (_ts) :
+ "rcx", "r11");
+
+ return ret;
+}
+
+#else
+
+static __always_inline
+long clock_gettime_fallback(clockid_t _clkid, struct __kernel_timespec *_ts)
+{
+ long ret;
+
+ asm (
+ "mov %%ebx, %%edx \n"
+ "mov %[clock], %%ebx \n"
+ "call __kernel_vsyscall \n"
+ "mov %%edx, %%ebx \n"
+ : "=a" (ret), "=m" (*_ts)
+ : "0" (__NR_clock_gettime64), [clock] "g" (_clkid), "c" (_ts)
+ : "edx");
+
+ return ret;
+}
+
+static __always_inline
+long gettimeofday_fallback(struct __kernel_old_timeval *_tv,
+ struct timezone *_tz)
+{
+ long ret;
+
+ asm(
+ "mov %%ebx, %%edx \n"
+ "mov %2, %%ebx \n"
+ "call __kernel_vsyscall \n"
+ "mov %%edx, %%ebx \n"
+ : "=a" (ret)
+ : "0" (__NR_gettimeofday), "g" (_tv), "c" (_tz)
+ : "memory", "edx");
+
+ return ret;
+}
+
+static __always_inline long
+clock_getres_fallback(clockid_t _clkid, struct __kernel_timespec *_ts)
+{
+ long ret;
+
+ asm (
+ "mov %%ebx, %%edx \n"
+ "mov %[clock], %%ebx \n"
+ "call __kernel_vsyscall \n"
+ "mov %%edx, %%ebx \n"
+ : "=a" (ret), "=m" (*_ts)
+ : "0" (__NR_clock_getres_time64), [clock] "g" (_clkid), "c" (_ts)
+ : "edx");
+
+ return ret;
+}
+
+#endif
+
+#ifdef CONFIG_PARAVIRT_CLOCK
+static u64 vread_pvclock(void)
+{
+ const struct pvclock_vcpu_time_info *pvti = &pvclock_page.pvti;
+ u32 version;
+ u64 ret;
+
+ /*
+ * Note: The kernel and hypervisor must guarantee that cpu ID
+ * number maps 1:1 to per-CPU pvclock time info.
+ *
+ * Because the hypervisor is entirely unaware of guest userspace
+ * preemption, it cannot guarantee that per-CPU pvclock time
+ * info is updated if the underlying CPU changes or that that
+ * version is increased whenever underlying CPU changes.
+ *
+ * On KVM, we are guaranteed that pvti updates for any vCPU are
+ * atomic as seen by *all* vCPUs. This is an even stronger
+ * guarantee than we get with a normal seqlock.
+ *
+ * On Xen, we don't appear to have that guarantee, but Xen still
+ * supplies a valid seqlock using the version field.
+ *
+ * We only do pvclock vdso timing at all if
+ * PVCLOCK_TSC_STABLE_BIT is set, and we interpret that bit to
+ * mean that all vCPUs have matching pvti and that the TSC is
+ * synced, so we can just look at vCPU 0's pvti.
+ */
+
+ do {
+ version = pvclock_read_begin(pvti);
+
+ if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT)))
+ return U64_MAX;
+
+ ret = __pvclock_read_cycles(pvti, rdtsc_ordered());
+ } while (pvclock_read_retry(pvti, version));
+
+ return ret;
+}
+#endif
+
+#ifdef CONFIG_HYPERV_TSCPAGE
+static u64 vread_hvclock(void)
+{
+ return hv_read_tsc_page(&hvclock_page);
+}
+#endif
+
+static inline u64 __arch_get_hw_counter(s32 clock_mode)
+{
+ if (clock_mode == VCLOCK_TSC)
+ return (u64)rdtsc_ordered();
+ /*
+ * For any memory-mapped vclock type, we need to make sure that gcc
+ * doesn't cleverly hoist a load before the mode check. Otherwise we
+ * might end up touching the memory-mapped page even if the vclock in
+ * question isn't enabled, which will segfault. Hence the barriers.
+ */
+#ifdef CONFIG_PARAVIRT_CLOCK
+ if (clock_mode == VCLOCK_PVCLOCK) {
+ barrier();
+ return vread_pvclock();
+ }
+#endif
+#ifdef CONFIG_HYPERV_TSCPAGE
+ if (clock_mode == VCLOCK_HVCLOCK) {
+ barrier();
+ return vread_hvclock();
+ }
+#endif
+ return U64_MAX;
+}
+
+static __always_inline const struct vdso_data *__arch_get_vdso_data(void)
+{
+ return __vdso_data;
+}
+
+/*
+ * x86 specific delta calculation.
+ *
+ * The regular implementation assumes that clocksource reads are globally
+ * monotonic. The TSC can be slightly off across sockets which can cause
+ * the regular delta calculation (@cycles - @last) to return a huge time
+ * jump.
+ *
+ * Therefore it needs to be verified that @cycles are greater than
+ * @last. If not then use @last, which is the base time of the current
+ * conversion period.
+ *
+ * This variant also removes the masking of the subtraction because the
+ * clocksource mask of all VDSO capable clocksources on x86 is U64_MAX
+ * which would result in a pointless operation. The compiler cannot
+ * optimize it away as the mask comes from the vdso data and is not compile
+ * time constant.
+ */
+static __always_inline
+u64 vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult)
+{
+ if (cycles > last)
+ return (cycles - last) * mult;
+ return 0;
+}
+#define vdso_calc_delta vdso_calc_delta
+
+#endif /* !__ASSEMBLY__ */
+
+#endif /* __ASM_VDSO_GETTIMEOFDAY_H */
diff --git a/arch/x86/include/asm/vdso/vsyscall.h b/arch/x86/include/asm/vdso/vsyscall.h
new file mode 100644
index 000000000000..0026ab2123ce
--- /dev/null
+++ b/arch/x86/include/asm/vdso/vsyscall.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ASM_VDSO_VSYSCALL_H
+#define __ASM_VDSO_VSYSCALL_H
+
+#ifndef __ASSEMBLY__
+
+#include <linux/hrtimer.h>
+#include <linux/timekeeper_internal.h>
+#include <vdso/datapage.h>
+#include <asm/vgtod.h>
+#include <asm/vvar.h>
+
+int vclocks_used __read_mostly;
+
+DEFINE_VVAR(struct vdso_data, _vdso_data);
+/*
+ * Update the vDSO data page to keep in sync with kernel timekeeping.
+ */
+static __always_inline
+struct vdso_data *__x86_get_k_vdso_data(void)
+{
+ return _vdso_data;
+}
+#define __arch_get_k_vdso_data __x86_get_k_vdso_data
+
+static __always_inline
+int __x86_get_clock_mode(struct timekeeper *tk)
+{
+ int vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode;
+
+ /* Mark the new vclock used. */
+ BUILD_BUG_ON(VCLOCK_MAX >= 32);
+ WRITE_ONCE(vclocks_used, READ_ONCE(vclocks_used) | (1 << vclock_mode));
+
+ return vclock_mode;
+}
+#define __arch_get_clock_mode __x86_get_clock_mode
+
+/* The asm-generic header needs to be included after the definitions above */
+#include <asm-generic/vdso/vsyscall.h>
+
+#endif /* !__ASSEMBLY__ */
+
+#endif /* __ASM_VDSO_VSYSCALL_H */
diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h
index 913a133f8e6f..a2638c6124ed 100644
--- a/arch/x86/include/asm/vgtod.h
+++ b/arch/x86/include/asm/vgtod.h
@@ -3,7 +3,9 @@
#define _ASM_X86_VGTOD_H
#include <linux/compiler.h>
-#include <linux/clocksource.h>
+#include <asm/clocksource.h>
+#include <vdso/datapage.h>
+#include <vdso/helpers.h>
#include <uapi/linux/time.h>
@@ -13,81 +15,10 @@ typedef u64 gtod_long_t;
typedef unsigned long gtod_long_t;
#endif
-/*
- * There is one of these objects in the vvar page for each
- * vDSO-accelerated clockid. For high-resolution clocks, this encodes
- * the time corresponding to vsyscall_gtod_data.cycle_last. For coarse
- * clocks, this encodes the actual time.
- *
- * To confuse the reader, for high-resolution clocks, nsec is left-shifted
- * by vsyscall_gtod_data.shift.
- */
-struct vgtod_ts {
- u64 sec;
- u64 nsec;
-};
-
-#define VGTOD_BASES (CLOCK_TAI + 1)
-#define VGTOD_HRES (BIT(CLOCK_REALTIME) | BIT(CLOCK_MONOTONIC) | BIT(CLOCK_TAI))
-#define VGTOD_COARSE (BIT(CLOCK_REALTIME_COARSE) | BIT(CLOCK_MONOTONIC_COARSE))
-
-/*
- * vsyscall_gtod_data will be accessed by 32 and 64 bit code at the same time
- * so be carefull by modifying this structure.
- */
-struct vsyscall_gtod_data {
- unsigned int seq;
-
- int vclock_mode;
- u64 cycle_last;
- u64 mask;
- u32 mult;
- u32 shift;
-
- struct vgtod_ts basetime[VGTOD_BASES];
-
- int tz_minuteswest;
- int tz_dsttime;
-};
-extern struct vsyscall_gtod_data vsyscall_gtod_data;
-
extern int vclocks_used;
static inline bool vclock_was_used(int vclock)
{
return READ_ONCE(vclocks_used) & (1 << vclock);
}
-static inline unsigned int gtod_read_begin(const struct vsyscall_gtod_data *s)
-{
- unsigned int ret;
-
-repeat:
- ret = READ_ONCE(s->seq);
- if (unlikely(ret & 1)) {
- cpu_relax();
- goto repeat;
- }
- smp_rmb();
- return ret;
-}
-
-static inline int gtod_read_retry(const struct vsyscall_gtod_data *s,
- unsigned int start)
-{
- smp_rmb();
- return unlikely(s->seq != start);
-}
-
-static inline void gtod_write_begin(struct vsyscall_gtod_data *s)
-{
- ++s->seq;
- smp_wmb();
-}
-
-static inline void gtod_write_end(struct vsyscall_gtod_data *s)
-{
- smp_wmb();
- ++s->seq;
-}
-
#endif /* _ASM_X86_VGTOD_H */
diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h
index b986b2ca688a..ab60a71a8dcb 100644
--- a/arch/x86/include/asm/vsyscall.h
+++ b/arch/x86/include/asm/vsyscall.h
@@ -13,10 +13,12 @@ extern void set_vsyscall_pgtable_user_bits(pgd_t *root);
* Called on instruction fetch fault in vsyscall page.
* Returns true if handled.
*/
-extern bool emulate_vsyscall(struct pt_regs *regs, unsigned long address);
+extern bool emulate_vsyscall(unsigned long error_code,
+ struct pt_regs *regs, unsigned long address);
#else
static inline void map_vsyscall(void) {}
-static inline bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
+static inline bool emulate_vsyscall(unsigned long error_code,
+ struct pt_regs *regs, unsigned long address)
{
return false;
}
diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h
index e474f5c6e387..32f5d9a0b90e 100644
--- a/arch/x86/include/asm/vvar.h
+++ b/arch/x86/include/asm/vvar.h
@@ -32,19 +32,20 @@
extern char __vvar_page;
#define DECLARE_VVAR(offset, type, name) \
- extern type vvar_ ## name __attribute__((visibility("hidden")));
+ extern type vvar_ ## name[CS_BASES] \
+ __attribute__((visibility("hidden")));
#define VVAR(name) (vvar_ ## name)
#define DEFINE_VVAR(type, name) \
- type name \
+ type name[CS_BASES] \
__attribute__((section(".vvar_" #name), aligned(16))) __visible
#endif
/* DECLARE_VVAR(offset, type, name) */
-DECLARE_VVAR(128, struct vsyscall_gtod_data, vsyscall_gtod_data)
+DECLARE_VVAR(128, struct vdso_data, _vdso_data)
#undef DECLARE_VVAR
diff --git a/arch/x86/include/uapi/asm/perf_regs.h b/arch/x86/include/uapi/asm/perf_regs.h
index ac67bbea10ca..7c9d2bb3833b 100644
--- a/arch/x86/include/uapi/asm/perf_regs.h
+++ b/arch/x86/include/uapi/asm/perf_regs.h
@@ -52,4 +52,7 @@ enum perf_event_x86_regs {
/* These include both GPRs and XMMX registers */
PERF_REG_X86_XMM_MAX = PERF_REG_X86_XMM15 + 2,
};
+
+#define PERF_REG_EXTENDED_MASK (~((1ULL << PERF_REG_X86_XMM0) - 1))
+
#endif /* _ASM_X86_PERF_REGS_H */
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index a5e5484988fd..caf2edccbad2 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -64,6 +64,21 @@ void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags,
c->x86_stepping >= 0x0e))
flags->bm_check = 1;
}
+
+ if (c->x86_vendor == X86_VENDOR_ZHAOXIN) {
+ /*
+ * All Zhaoxin CPUs that support C3 share cache.
+ * And caches should not be flushed by software while
+ * entering C3 type state.
+ */
+ flags->bm_check = 1;
+ /*
+ * On all recent Zhaoxin platforms, ARB_DISABLE is a nop.
+ * So, set bm_control to zero to indicate that ARB_DISABLE
+ * is not required while entering C3 type state.
+ */
+ flags->bm_control = 0;
+ }
}
EXPORT_SYMBOL(acpi_processor_power_init_bm_check);
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 177aa8ef2afa..1bd91cb7b320 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -195,7 +195,7 @@ static struct resource lapic_resource = {
.flags = IORESOURCE_MEM | IORESOURCE_BUSY,
};
-unsigned int lapic_timer_frequency = 0;
+unsigned int lapic_timer_period = 0;
static void apic_pm_activate(void);
@@ -501,7 +501,7 @@ lapic_timer_set_periodic_oneshot(struct clock_event_device *evt, bool oneshot)
if (evt->features & CLOCK_EVT_FEAT_DUMMY)
return 0;
- __setup_APIC_LVTT(lapic_timer_frequency, oneshot, 1);
+ __setup_APIC_LVTT(lapic_timer_period, oneshot, 1);
return 0;
}
@@ -805,11 +805,11 @@ calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc)
static int __init lapic_init_clockevent(void)
{
- if (!lapic_timer_frequency)
+ if (!lapic_timer_period)
return -1;
/* Calculate the scaled math multiplication factor */
- lapic_clockevent.mult = div_sc(lapic_timer_frequency/APIC_DIVISOR,
+ lapic_clockevent.mult = div_sc(lapic_timer_period/APIC_DIVISOR,
TICK_NSEC, lapic_clockevent.shift);
lapic_clockevent.max_delta_ns =
clockevent_delta2ns(0x7FFFFFFF, &lapic_clockevent);
@@ -821,6 +821,33 @@ static int __init lapic_init_clockevent(void)
return 0;
}
+bool __init apic_needs_pit(void)
+{
+ /*
+ * If the frequencies are not known, PIT is required for both TSC
+ * and apic timer calibration.
+ */
+ if (!tsc_khz || !cpu_khz)
+ return true;
+
+ /* Is there an APIC at all? */
+ if (!boot_cpu_has(X86_FEATURE_APIC))
+ return true;
+
+ /* Deadline timer is based on TSC so no further PIT action required */
+ if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER))
+ return false;
+
+ /* APIC timer disabled? */
+ if (disable_apic_timer)
+ return true;
+ /*
+ * The APIC timer frequency is known already, no PIT calibration
+ * required. If unknown, let the PIT be initialized.
+ */
+ return lapic_timer_period == 0;
+}
+
static int __init calibrate_APIC_clock(void)
{
struct clock_event_device *levt = this_cpu_ptr(&lapic_events);
@@ -839,7 +866,7 @@ static int __init calibrate_APIC_clock(void)
*/
if (!lapic_init_clockevent()) {
apic_printk(APIC_VERBOSE, "lapic timer already calibrated %d\n",
- lapic_timer_frequency);
+ lapic_timer_period);
/*
* Direct calibration methods must have an always running
* local APIC timer, no need for broadcast timer.
@@ -884,13 +911,13 @@ static int __init calibrate_APIC_clock(void)
pm_referenced = !calibrate_by_pmtimer(lapic_cal_pm2 - lapic_cal_pm1,
&delta, &deltatsc);
- lapic_timer_frequency = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS;
+ lapic_timer_period = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS;
lapic_init_clockevent();
apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta);
apic_printk(APIC_VERBOSE, "..... mult: %u\n", lapic_clockevent.mult);
apic_printk(APIC_VERBOSE, "..... calibration result: %u\n",
- lapic_timer_frequency);
+ lapic_timer_period);
if (boot_cpu_has(X86_FEATURE_TSC)) {
apic_printk(APIC_VERBOSE, "..... CPU clock speed is "
@@ -901,13 +928,13 @@ static int __init calibrate_APIC_clock(void)
apic_printk(APIC_VERBOSE, "..... host bus clock speed is "
"%u.%04u MHz.\n",
- lapic_timer_frequency / (1000000 / HZ),
- lapic_timer_frequency % (1000000 / HZ));
+ lapic_timer_period / (1000000 / HZ),
+ lapic_timer_period % (1000000 / HZ));
/*
* Do a sanity check on the APIC calibration result
*/
- if (lapic_timer_frequency < (1000000 / HZ)) {
+ if (lapic_timer_period < (1000000 / HZ)) {
local_irq_enable();
pr_warning("APIC frequency too slow, disabling apic timer\n");
return -1;
@@ -1351,6 +1378,8 @@ void __init init_bsp_APIC(void)
apic_write(APIC_LVT1, value);
}
+static void __init apic_bsp_setup(bool upmode);
+
/* Init the interrupt delivery mode for the BSP */
void __init apic_intr_mode_init(void)
{
@@ -1464,7 +1493,8 @@ static void apic_pending_intr_clear(void)
if (queued) {
if (boot_cpu_has(X86_FEATURE_TSC) && cpu_khz) {
ntsc = rdtsc();
- max_loops = (cpu_khz << 10) - (ntsc - tsc);
+ max_loops = (long long)cpu_khz << 10;
+ max_loops -= ntsc - tsc;
} else {
max_loops--;
}
@@ -2040,21 +2070,32 @@ __visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs)
entering_irq();
trace_spurious_apic_entry(vector);
+ inc_irq_stat(irq_spurious_count);
+
+ /*
+ * If this is a spurious interrupt then do not acknowledge
+ */
+ if (vector == SPURIOUS_APIC_VECTOR) {
+ /* See SDM vol 3 */
+ pr_info("Spurious APIC interrupt (vector 0xFF) on CPU#%d, should never happen.\n",
+ smp_processor_id());
+ goto out;
+ }
+
/*
- * Check if this really is a spurious interrupt and ACK it
- * if it is a vectored one. Just in case...
- * Spurious interrupts should not be ACKed.
+ * If it is a vectored one, verify it's set in the ISR. If set,
+ * acknowledge it.
*/
v = apic_read(APIC_ISR + ((vector & ~0x1f) >> 1));
- if (v & (1 << (vector & 0x1f)))
+ if (v & (1 << (vector & 0x1f))) {
+ pr_info("Spurious interrupt (vector 0x%02x) on CPU#%d. Acked\n",
+ vector, smp_processor_id());
ack_APIC_irq();
-
- inc_irq_stat(irq_spurious_count);
-
- /* see sw-dev-man vol 3, chapter 7.4.13.5 */
- pr_info("spurious APIC interrupt through vector %02x on CPU#%d, "
- "should never happen.\n", vector, smp_processor_id());
-
+ } else {
+ pr_info("Spurious interrupt (vector 0x%02x) on CPU#%d. Not pending!\n",
+ vector, smp_processor_id());
+ }
+out:
trace_spurious_apic_exit(vector);
exiting_irq();
}
@@ -2415,11 +2456,8 @@ static void __init apic_bsp_up_setup(void)
/**
* apic_bsp_setup - Setup function for local apic and io-apic
* @upmode: Force UP mode (for APIC_init_uniprocessor)
- *
- * Returns:
- * apic_id of BSP APIC
*/
-void __init apic_bsp_setup(bool upmode)
+static void __init apic_bsp_setup(bool upmode)
{
connect_bsp_APIC();
if (upmode)
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index bf083c3f1d73..bbdca603f94a 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -78,7 +78,7 @@ flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector)
int cpu = smp_processor_id();
if (cpu < BITS_PER_LONG)
- clear_bit(cpu, &mask);
+ __clear_bit(cpu, &mask);
_flat_send_IPI_mask(mask, vector);
}
@@ -92,7 +92,7 @@ static void flat_send_IPI_allbutself(int vector)
unsigned long mask = cpumask_bits(cpu_online_mask)[0];
if (cpu < BITS_PER_LONG)
- clear_bit(cpu, &mask);
+ __clear_bit(cpu, &mask);
_flat_send_IPI_mask(mask, vector);
}
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 53aa234a6803..c7bb6c69f21c 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -58,6 +58,7 @@
#include <asm/acpi.h>
#include <asm/dma.h>
#include <asm/timer.h>
+#include <asm/time.h>
#include <asm/i8259.h>
#include <asm/setup.h>
#include <asm/irq_remapping.h>
@@ -1893,6 +1894,50 @@ static int ioapic_set_affinity(struct irq_data *irq_data,
return ret;
}
+/*
+ * Interrupt shutdown masks the ioapic pin, but the interrupt might already
+ * be in flight, but not yet serviced by the target CPU. That means
+ * __synchronize_hardirq() would return and claim that everything is calmed
+ * down. So free_irq() would proceed and deactivate the interrupt and free
+ * resources.
+ *
+ * Once the target CPU comes around to service it it will find a cleared
+ * vector and complain. While the spurious interrupt is harmless, the full
+ * release of resources might prevent the interrupt from being acknowledged
+ * which keeps the hardware in a weird state.
+ *
+ * Verify that the corresponding Remote-IRR bits are clear.
+ */
+static int ioapic_irq_get_chip_state(struct irq_data *irqd,
+ enum irqchip_irq_state which,
+ bool *state)
+{
+ struct mp_chip_data *mcd = irqd->chip_data;
+ struct IO_APIC_route_entry rentry;
+ struct irq_pin_list *p;
+
+ if (which != IRQCHIP_STATE_ACTIVE)
+ return -EINVAL;
+
+ *state = false;
+ raw_spin_lock(&ioapic_lock);
+ for_each_irq_pin(p, mcd->irq_2_pin) {
+ rentry = __ioapic_read_entry(p->apic, p->pin);
+ /*
+ * The remote IRR is only valid in level trigger mode. It's
+ * meaning is undefined for edge triggered interrupts and
+ * irrelevant because the IO-APIC treats them as fire and
+ * forget.
+ */
+ if (rentry.irr && rentry.trigger) {
+ *state = true;
+ break;
+ }
+ }
+ raw_spin_unlock(&ioapic_lock);
+ return 0;
+}
+
static struct irq_chip ioapic_chip __read_mostly = {
.name = "IO-APIC",
.irq_startup = startup_ioapic_irq,
@@ -1902,6 +1947,7 @@ static struct irq_chip ioapic_chip __read_mostly = {
.irq_eoi = ioapic_ack_level,
.irq_set_affinity = ioapic_set_affinity,
.irq_retrigger = irq_chip_retrigger_hierarchy,
+ .irq_get_irqchip_state = ioapic_irq_get_chip_state,
.flags = IRQCHIP_SKIP_SET_WAKE,
};
@@ -1914,6 +1960,7 @@ static struct irq_chip ioapic_ir_chip __read_mostly = {
.irq_eoi = ioapic_ir_ack_level,
.irq_set_affinity = ioapic_set_affinity,
.irq_retrigger = irq_chip_retrigger_hierarchy,
+ .irq_get_irqchip_state = ioapic_irq_get_chip_state,
.flags = IRQCHIP_SKIP_SET_WAKE,
};
@@ -2083,6 +2130,9 @@ static inline void __init check_timer(void)
unsigned long flags;
int no_pin1 = 0;
+ if (!global_clock_event)
+ return;
+
local_irq_save(flags);
/*
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index dad0dd759de2..7f7533462474 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -370,14 +370,14 @@ struct irq_domain *hpet_create_irq_domain(int hpet_id)
return d;
}
-int hpet_assign_irq(struct irq_domain *domain, struct hpet_dev *dev,
+int hpet_assign_irq(struct irq_domain *domain, struct hpet_channel *hc,
int dev_num)
{
struct irq_alloc_info info;
init_irq_alloc_info(&info, NULL);
info.type = X86_IRQ_ALLOC_TYPE_HPET;
- info.hpet_data = dev;
+ info.hpet_data = hc;
info.hpet_id = hpet_dev_id(domain);
info.hpet_index = dev_num;
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index e7cb78aed644..fdacb864c3dd 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -340,7 +340,7 @@ static void clear_irq_vector(struct irq_data *irqd)
trace_vector_clear(irqd->irq, vector, apicd->cpu, apicd->prev_vector,
apicd->prev_cpu);
- per_cpu(vector_irq, apicd->cpu)[vector] = VECTOR_UNUSED;
+ per_cpu(vector_irq, apicd->cpu)[vector] = VECTOR_SHUTDOWN;
irq_matrix_free(vector_matrix, apicd->cpu, vector, managed);
apicd->vector = 0;
@@ -349,7 +349,7 @@ static void clear_irq_vector(struct irq_data *irqd)
if (!vector)
return;
- per_cpu(vector_irq, apicd->prev_cpu)[vector] = VECTOR_UNUSED;
+ per_cpu(vector_irq, apicd->prev_cpu)[vector] = VECTOR_SHUTDOWN;
irq_matrix_free(vector_matrix, apicd->prev_cpu, vector, managed);
apicd->prev_vector = 0;
apicd->move_in_progress = 0;
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 7685444a106b..609e499387a1 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -50,7 +50,7 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
cpumask_copy(tmpmsk, mask);
/* If IPI should not be sent to self, clear current CPU */
if (apic_dest != APIC_DEST_ALLINC)
- cpumask_clear_cpu(smp_processor_id(), tmpmsk);
+ __cpumask_clear_cpu(smp_processor_id(), tmpmsk);
/* Collapse cpus in a cluster so a single IPI per cluster is sent */
for_each_cpu(cpu, tmpmsk) {
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 5102bf7c8192..4b4eb06e117c 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -24,6 +24,7 @@ obj-y += match.o
obj-y += bugs.o
obj-y += aperfmperf.o
obj-y += cpuid-deps.o
+obj-y += umwait.o
obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o
@@ -38,6 +39,7 @@ obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o
obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o
obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
+obj-$(CONFIG_CPU_SUP_ZHAOXIN) += zhaoxin.o
obj-$(CONFIG_X86_MCE) += mce/
obj-$(CONFIG_MTRR) += mtrr/
diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c
index e71a6ff8a67e..e2f319dc992d 100644
--- a/arch/x86/kernel/cpu/aperfmperf.c
+++ b/arch/x86/kernel/cpu/aperfmperf.c
@@ -13,6 +13,7 @@
#include <linux/percpu.h>
#include <linux/cpufreq.h>
#include <linux/smp.h>
+#include <linux/sched/isolation.h>
#include "cpu.h"
@@ -85,6 +86,9 @@ unsigned int aperfmperf_get_khz(int cpu)
if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
return 0;
+ if (!housekeeping_cpu(cpu, HK_FLAG_MISC))
+ return 0;
+
aperfmperf_snapshot_cpu(cpu, ktime_get(), true);
return per_cpu(samples.khz, cpu);
}
@@ -101,9 +105,12 @@ void arch_freq_prepare_all(void)
if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
return;
- for_each_online_cpu(cpu)
+ for_each_online_cpu(cpu) {
+ if (!housekeeping_cpu(cpu, HK_FLAG_MISC))
+ continue;
if (!aperfmperf_snapshot_cpu(cpu, now, false))
wait = true;
+ }
if (wait)
msleep(APERFMPERF_REFRESH_DELAY_MS);
@@ -117,6 +124,9 @@ unsigned int arch_freq_get_on_cpu(int cpu)
if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
return 0;
+ if (!housekeeping_cpu(cpu, HK_FLAG_MISC))
+ return 0;
+
if (aperfmperf_snapshot_cpu(cpu, ktime_get(), true))
return per_cpu(samples.khz, cpu);
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 03b4cc0ec3a7..66ca906aa790 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -836,6 +836,16 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void)
}
/*
+ * If SSBD is controlled by the SPEC_CTRL MSR, then set the proper
+ * bit in the mask to allow guests to use the mitigation even in the
+ * case where the host does not enable it.
+ */
+ if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) ||
+ static_cpu_has(X86_FEATURE_AMD_SSBD)) {
+ x86_spec_ctrl_mask |= SPEC_CTRL_SSBD;
+ }
+
+ /*
* We have three CPU feature flags that are in play here:
* - X86_BUG_SPEC_STORE_BYPASS - CPU is susceptible.
* - X86_FEATURE_SSBD - CPU is able to turn off speculative store bypass
@@ -852,7 +862,6 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void)
x86_amd_ssb_disable();
} else {
x86_spec_ctrl_base |= SPEC_CTRL_SSBD;
- x86_spec_ctrl_mask |= SPEC_CTRL_SSBD;
wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
}
}
diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c
index 395d46f78582..c7503be92f35 100644
--- a/arch/x86/kernel/cpu/cacheinfo.c
+++ b/arch/x86/kernel/cpu/cacheinfo.c
@@ -658,8 +658,7 @@ void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id)
if (c->x86 < 0x17) {
/* LLC is at the node level. */
per_cpu(cpu_llc_id, cpu) = node_id;
- } else if (c->x86 == 0x17 &&
- c->x86_model >= 0 && c->x86_model <= 0x1F) {
+ } else if (c->x86 == 0x17 && c->x86_model <= 0x1F) {
/*
* LLC is at the core complex level.
* Core complex ID is ApicId[3] for these processors.
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 2c57fffebf9b..dad20bc891d5 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -801,6 +801,30 @@ static void init_speculation_control(struct cpuinfo_x86 *c)
}
}
+static void init_cqm(struct cpuinfo_x86 *c)
+{
+ if (!cpu_has(c, X86_FEATURE_CQM_LLC)) {
+ c->x86_cache_max_rmid = -1;
+ c->x86_cache_occ_scale = -1;
+ return;
+ }
+
+ /* will be overridden if occupancy monitoring exists */
+ c->x86_cache_max_rmid = cpuid_ebx(0xf);
+
+ if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC) ||
+ cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL) ||
+ cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)) {
+ u32 eax, ebx, ecx, edx;
+
+ /* QoS sub-leaf, EAX=0Fh, ECX=1 */
+ cpuid_count(0xf, 1, &eax, &ebx, &ecx, &edx);
+
+ c->x86_cache_max_rmid = ecx;
+ c->x86_cache_occ_scale = ebx;
+ }
+}
+
void get_cpu_cap(struct cpuinfo_x86 *c)
{
u32 eax, ebx, ecx, edx;
@@ -823,6 +847,12 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
c->x86_capability[CPUID_7_0_EBX] = ebx;
c->x86_capability[CPUID_7_ECX] = ecx;
c->x86_capability[CPUID_7_EDX] = edx;
+
+ /* Check valid sub-leaf index before accessing it */
+ if (eax >= 1) {
+ cpuid_count(0x00000007, 1, &eax, &ebx, &ecx, &edx);
+ c->x86_capability[CPUID_7_1_EAX] = eax;
+ }
}
/* Extended state features: level 0x0000000d */
@@ -832,33 +862,6 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
c->x86_capability[CPUID_D_1_EAX] = eax;
}
- /* Additional Intel-defined flags: level 0x0000000F */
- if (c->cpuid_level >= 0x0000000F) {
-
- /* QoS sub-leaf, EAX=0Fh, ECX=0 */
- cpuid_count(0x0000000F, 0, &eax, &ebx, &ecx, &edx);
- c->x86_capability[CPUID_F_0_EDX] = edx;
-
- if (cpu_has(c, X86_FEATURE_CQM_LLC)) {
- /* will be overridden if occupancy monitoring exists */
- c->x86_cache_max_rmid = ebx;
-
- /* QoS sub-leaf, EAX=0Fh, ECX=1 */
- cpuid_count(0x0000000F, 1, &eax, &ebx, &ecx, &edx);
- c->x86_capability[CPUID_F_1_EDX] = edx;
-
- if ((cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC)) ||
- ((cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL)) ||
- (cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)))) {
- c->x86_cache_max_rmid = ecx;
- c->x86_cache_occ_scale = ebx;
- }
- } else {
- c->x86_cache_max_rmid = -1;
- c->x86_cache_occ_scale = -1;
- }
- }
-
/* AMD-defined flags: level 0x80000001 */
eax = cpuid_eax(0x80000000);
c->extended_cpuid_level = eax;
@@ -889,6 +892,7 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
init_scattered_cpuid_features(c);
init_speculation_control(c);
+ init_cqm(c);
/*
* Clear/Set all flags overridden by options, after probe.
diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
index 2c0bd38a44ab..b5353244749b 100644
--- a/arch/x86/kernel/cpu/cpuid-deps.c
+++ b/arch/x86/kernel/cpu/cpuid-deps.c
@@ -20,6 +20,7 @@ struct cpuid_dep {
* but it's difficult to tell that to the init reference checker.
*/
static const struct cpuid_dep cpuid_deps[] = {
+ { X86_FEATURE_FXSR, X86_FEATURE_FPU },
{ X86_FEATURE_XSAVEOPT, X86_FEATURE_XSAVE },
{ X86_FEATURE_XSAVEC, X86_FEATURE_XSAVE },
{ X86_FEATURE_XSAVES, X86_FEATURE_XSAVE },
@@ -27,7 +28,11 @@ static const struct cpuid_dep cpuid_deps[] = {
{ X86_FEATURE_PKU, X86_FEATURE_XSAVE },
{ X86_FEATURE_MPX, X86_FEATURE_XSAVE },
{ X86_FEATURE_XGETBV1, X86_FEATURE_XSAVE },
+ { X86_FEATURE_CMOV, X86_FEATURE_FXSR },
+ { X86_FEATURE_MMX, X86_FEATURE_FXSR },
+ { X86_FEATURE_MMXEXT, X86_FEATURE_MMX },
{ X86_FEATURE_FXSR_OPT, X86_FEATURE_FXSR },
+ { X86_FEATURE_XSAVE, X86_FEATURE_FXSR },
{ X86_FEATURE_XMM, X86_FEATURE_FXSR },
{ X86_FEATURE_XMM2, X86_FEATURE_XMM },
{ X86_FEATURE_XMM3, X86_FEATURE_XMM2 },
@@ -59,6 +64,10 @@ static const struct cpuid_dep cpuid_deps[] = {
{ X86_FEATURE_AVX512_4VNNIW, X86_FEATURE_AVX512F },
{ X86_FEATURE_AVX512_4FMAPS, X86_FEATURE_AVX512F },
{ X86_FEATURE_AVX512_VPOPCNTDQ, X86_FEATURE_AVX512F },
+ { X86_FEATURE_CQM_OCCUP_LLC, X86_FEATURE_CQM_LLC },
+ { X86_FEATURE_CQM_MBM_TOTAL, X86_FEATURE_CQM_LLC },
+ { X86_FEATURE_CQM_MBM_LOCAL, X86_FEATURE_CQM_LLC },
+ { X86_FEATURE_AVX512_BF16, X86_FEATURE_AVX512VL },
{}
};
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index f17c1a714779..8d6d92ebeb54 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -66,6 +66,32 @@ void check_mpx_erratum(struct cpuinfo_x86 *c)
}
}
+/*
+ * Processors which have self-snooping capability can handle conflicting
+ * memory type across CPUs by snooping its own cache. However, there exists
+ * CPU models in which having conflicting memory types still leads to
+ * unpredictable behavior, machine check errors, or hangs. Clear this
+ * feature to prevent its use on machines with known erratas.
+ */
+static void check_memory_type_self_snoop_errata(struct cpuinfo_x86 *c)
+{
+ switch (c->x86_model) {
+ case INTEL_FAM6_CORE_YONAH:
+ case INTEL_FAM6_CORE2_MEROM:
+ case INTEL_FAM6_CORE2_MEROM_L:
+ case INTEL_FAM6_CORE2_PENRYN:
+ case INTEL_FAM6_CORE2_DUNNINGTON:
+ case INTEL_FAM6_NEHALEM:
+ case INTEL_FAM6_NEHALEM_G:
+ case INTEL_FAM6_NEHALEM_EP:
+ case INTEL_FAM6_NEHALEM_EX:
+ case INTEL_FAM6_WESTMERE:
+ case INTEL_FAM6_WESTMERE_EP:
+ case INTEL_FAM6_SANDYBRIDGE:
+ setup_clear_cpu_cap(X86_FEATURE_SELFSNOOP);
+ }
+}
+
static bool ring3mwait_disabled __read_mostly;
static int __init ring3mwait_disable(char *__unused)
@@ -304,6 +330,7 @@ static void early_init_intel(struct cpuinfo_x86 *c)
}
check_mpx_erratum(c);
+ check_memory_type_self_snoop_errata(c);
/*
* Get the number of SMT siblings early from the extended topology
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index a813987b5552..cb0fdcaf1415 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -789,13 +789,16 @@ static struct syscore_ops mc_syscore_ops = {
.resume = mc_bp_resume,
};
-static int mc_cpu_online(unsigned int cpu)
+static int mc_cpu_starting(unsigned int cpu)
{
- struct device *dev;
-
- dev = get_cpu_device(cpu);
microcode_update_cpu(cpu);
pr_debug("CPU%d added\n", cpu);
+ return 0;
+}
+
+static int mc_cpu_online(unsigned int cpu)
+{
+ struct device *dev = get_cpu_device(cpu);
if (sysfs_create_group(&dev->kobj, &mc_attr_group))
pr_err("Failed to create group for CPU%d\n", cpu);
@@ -872,7 +875,9 @@ int __init microcode_init(void)
goto out_ucode_group;
register_syscore_ops(&mc_syscore_ops);
- cpuhp_setup_state_nocalls(CPUHP_AP_MICROCODE_LOADER, "x86/microcode:online",
+ cpuhp_setup_state_nocalls(CPUHP_AP_MICROCODE_LOADER, "x86/microcode:starting",
+ mc_cpu_starting, NULL);
+ cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/microcode:online",
mc_cpu_online, mc_cpu_down_prep);
pr_info("Microcode Update Driver: v%s.", DRIVER_VERSION);
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 7df29f08871b..062f77279ce3 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -17,6 +17,7 @@
#include <linux/irq.h>
#include <linux/kexec.h>
#include <linux/i8253.h>
+#include <linux/random.h>
#include <asm/processor.h>
#include <asm/hypervisor.h>
#include <asm/hyperv-tlfs.h>
@@ -80,6 +81,7 @@ __visible void __irq_entry hv_stimer0_vector_handler(struct pt_regs *regs)
inc_irq_stat(hyperv_stimer0_count);
if (hv_stimer0_handler)
hv_stimer0_handler();
+ add_interrupt_randomness(HYPERV_STIMER0_VECTOR, 0);
ack_APIC_irq();
exiting_irq();
@@ -89,7 +91,7 @@ __visible void __irq_entry hv_stimer0_vector_handler(struct pt_regs *regs)
int hv_setup_stimer0_irq(int *irq, int *vector, void (*handler)(void))
{
*vector = HYPERV_STIMER0_VECTOR;
- *irq = 0; /* Unused on x86/x64 */
+ *irq = -1; /* Unused on x86/x64 */
hv_stimer0_handler = handler;
return 0;
}
@@ -266,9 +268,9 @@ static void __init ms_hyperv_init_platform(void)
rdmsrl(HV_X64_MSR_APIC_FREQUENCY, hv_lapic_frequency);
hv_lapic_frequency = div_u64(hv_lapic_frequency, HZ);
- lapic_timer_frequency = hv_lapic_frequency;
+ lapic_timer_period = hv_lapic_frequency;
pr_info("Hyper-V: LAPIC Timer Frequency: %#x\n",
- lapic_timer_frequency);
+ lapic_timer_period);
}
register_nmi_handler(NMI_UNKNOWN, hv_nmi_unknown, NMI_FLAG_FIRST,
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 9356c1c9024d..aa5c064a6a22 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -743,7 +743,15 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
/* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */
cr0 = read_cr0() | X86_CR0_CD;
write_cr0(cr0);
- wbinvd();
+
+ /*
+ * Cache flushing is the most time-consuming step when programming
+ * the MTRRs. Fortunately, as per the Intel Software Development
+ * Manual, we can skip it if the processor supports cache self-
+ * snooping.
+ */
+ if (!static_cpu_has(X86_FEATURE_SELFSNOOP))
+ wbinvd();
/* Save value of CR4 and clear Page Global Enable (bit 7) */
if (boot_cpu_has(X86_FEATURE_PGE)) {
@@ -760,7 +768,10 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
/* Disable MTRRs, and set the default type to uncached */
mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi);
- wbinvd();
+
+ /* Again, only flush caches if we have to. */
+ if (!static_cpu_has(X86_FEATURE_SELFSNOOP))
+ wbinvd();
}
static void post_set(void) __releases(set_atomicity_lock)
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index 2131b8bbaad7..2f4824793798 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -796,8 +796,12 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
struct rdt_resource *r = of->kn->parent->priv;
- u32 sw_shareable = 0, hw_shareable = 0;
- u32 exclusive = 0, pseudo_locked = 0;
+ /*
+ * Use unsigned long even though only 32 bits are used to ensure
+ * test_bit() is used safely.
+ */
+ unsigned long sw_shareable = 0, hw_shareable = 0;
+ unsigned long exclusive = 0, pseudo_locked = 0;
struct rdt_domain *dom;
int i, hwb, swb, excl, psl;
enum rdtgrp_mode mode;
@@ -842,10 +846,10 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of,
}
for (i = r->cache.cbm_len - 1; i >= 0; i--) {
pseudo_locked = dom->plr ? dom->plr->cbm : 0;
- hwb = test_bit(i, (unsigned long *)&hw_shareable);
- swb = test_bit(i, (unsigned long *)&sw_shareable);
- excl = test_bit(i, (unsigned long *)&exclusive);
- psl = test_bit(i, (unsigned long *)&pseudo_locked);
+ hwb = test_bit(i, &hw_shareable);
+ swb = test_bit(i, &sw_shareable);
+ excl = test_bit(i, &exclusive);
+ psl = test_bit(i, &pseudo_locked);
if (hwb && swb)
seq_putc(seq, 'X');
else if (hwb && !swb)
@@ -2486,26 +2490,19 @@ out_destroy:
*/
static void cbm_ensure_valid(u32 *_val, struct rdt_resource *r)
{
- /*
- * Convert the u32 _val to an unsigned long required by all the bit
- * operations within this function. No more than 32 bits of this
- * converted value can be accessed because all bit operations are
- * additionally provided with cbm_len that is initialized during
- * hardware enumeration using five bits from the EAX register and
- * thus never can exceed 32 bits.
- */
- unsigned long *val = (unsigned long *)_val;
+ unsigned long val = *_val;
unsigned int cbm_len = r->cache.cbm_len;
unsigned long first_bit, zero_bit;
- if (*val == 0)
+ if (val == 0)
return;
- first_bit = find_first_bit(val, cbm_len);
- zero_bit = find_next_zero_bit(val, cbm_len, first_bit);
+ first_bit = find_first_bit(&val, cbm_len);
+ zero_bit = find_next_zero_bit(&val, cbm_len, first_bit);
/* Clear any remaining bits to ensure contiguous region */
- bitmap_clear(val, zero_bit, cbm_len - zero_bit);
+ bitmap_clear(&val, zero_bit, cbm_len - zero_bit);
+ *_val = (u32)val;
}
/*
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 94aa1c72ca98..adf9b71386ef 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -26,6 +26,10 @@ struct cpuid_bit {
static const struct cpuid_bit cpuid_bits[] = {
{ X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 },
{ X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 },
+ { X86_FEATURE_CQM_LLC, CPUID_EDX, 1, 0x0000000f, 0 },
+ { X86_FEATURE_CQM_OCCUP_LLC, CPUID_EDX, 0, 0x0000000f, 1 },
+ { X86_FEATURE_CQM_MBM_TOTAL, CPUID_EDX, 1, 0x0000000f, 1 },
+ { X86_FEATURE_CQM_MBM_LOCAL, CPUID_EDX, 2, 0x0000000f, 1 },
{ X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 },
{ X86_FEATURE_CAT_L2, CPUID_EBX, 2, 0x00000010, 0 },
{ X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 },
diff --git a/arch/x86/kernel/cpu/umwait.c b/arch/x86/kernel/cpu/umwait.c
new file mode 100644
index 000000000000..6a204e7336c1
--- /dev/null
+++ b/arch/x86/kernel/cpu/umwait.c
@@ -0,0 +1,200 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/syscore_ops.h>
+#include <linux/suspend.h>
+#include <linux/cpu.h>
+
+#include <asm/msr.h>
+
+#define UMWAIT_C02_ENABLE 0
+
+#define UMWAIT_CTRL_VAL(max_time, c02_disable) \
+ (((max_time) & MSR_IA32_UMWAIT_CONTROL_TIME_MASK) | \
+ ((c02_disable) & MSR_IA32_UMWAIT_CONTROL_C02_DISABLE))
+
+/*
+ * Cache IA32_UMWAIT_CONTROL MSR. This is a systemwide control. By default,
+ * umwait max time is 100000 in TSC-quanta and C0.2 is enabled
+ */
+static u32 umwait_control_cached = UMWAIT_CTRL_VAL(100000, UMWAIT_C02_ENABLE);
+
+/*
+ * Serialize access to umwait_control_cached and IA32_UMWAIT_CONTROL MSR in
+ * the sysfs write functions.
+ */
+static DEFINE_MUTEX(umwait_lock);
+
+static void umwait_update_control_msr(void * unused)
+{
+ lockdep_assert_irqs_disabled();
+ wrmsr(MSR_IA32_UMWAIT_CONTROL, READ_ONCE(umwait_control_cached), 0);
+}
+
+/*
+ * The CPU hotplug callback sets the control MSR to the global control
+ * value.
+ *
+ * Disable interrupts so the read of umwait_control_cached and the WRMSR
+ * are protected against a concurrent sysfs write. Otherwise the sysfs
+ * write could update the cached value after it had been read on this CPU
+ * and issue the IPI before the old value had been written. The IPI would
+ * interrupt, write the new value and after return from IPI the previous
+ * value would be written by this CPU.
+ *
+ * With interrupts disabled the upcoming CPU either sees the new control
+ * value or the IPI is updating this CPU to the new control value after
+ * interrupts have been reenabled.
+ */
+static int umwait_cpu_online(unsigned int cpu)
+{
+ local_irq_disable();
+ umwait_update_control_msr(NULL);
+ local_irq_enable();
+ return 0;
+}
+
+/*
+ * On resume, restore IA32_UMWAIT_CONTROL MSR on the boot processor which
+ * is the only active CPU at this time. The MSR is set up on the APs via the
+ * CPU hotplug callback.
+ *
+ * This function is invoked on resume from suspend and hibernation. On
+ * resume from suspend the restore should be not required, but we neither
+ * trust the firmware nor does it matter if the same value is written
+ * again.
+ */
+static void umwait_syscore_resume(void)
+{
+ umwait_update_control_msr(NULL);
+}
+
+static struct syscore_ops umwait_syscore_ops = {
+ .resume = umwait_syscore_resume,
+};
+
+/* sysfs interface */
+
+/*
+ * When bit 0 in IA32_UMWAIT_CONTROL MSR is 1, C0.2 is disabled.
+ * Otherwise, C0.2 is enabled.
+ */
+static inline bool umwait_ctrl_c02_enabled(u32 ctrl)
+{
+ return !(ctrl & MSR_IA32_UMWAIT_CONTROL_C02_DISABLE);
+}
+
+static inline u32 umwait_ctrl_max_time(u32 ctrl)
+{
+ return ctrl & MSR_IA32_UMWAIT_CONTROL_TIME_MASK;
+}
+
+static inline void umwait_update_control(u32 maxtime, bool c02_enable)
+{
+ u32 ctrl = maxtime & MSR_IA32_UMWAIT_CONTROL_TIME_MASK;
+
+ if (!c02_enable)
+ ctrl |= MSR_IA32_UMWAIT_CONTROL_C02_DISABLE;
+
+ WRITE_ONCE(umwait_control_cached, ctrl);
+ /* Propagate to all CPUs */
+ on_each_cpu(umwait_update_control_msr, NULL, 1);
+}
+
+static ssize_t
+enable_c02_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ u32 ctrl = READ_ONCE(umwait_control_cached);
+
+ return sprintf(buf, "%d\n", umwait_ctrl_c02_enabled(ctrl));
+}
+
+static ssize_t enable_c02_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ bool c02_enable;
+ u32 ctrl;
+ int ret;
+
+ ret = kstrtobool(buf, &c02_enable);
+ if (ret)
+ return ret;
+
+ mutex_lock(&umwait_lock);
+
+ ctrl = READ_ONCE(umwait_control_cached);
+ if (c02_enable != umwait_ctrl_c02_enabled(ctrl))
+ umwait_update_control(ctrl, c02_enable);
+
+ mutex_unlock(&umwait_lock);
+
+ return count;
+}
+static DEVICE_ATTR_RW(enable_c02);
+
+static ssize_t
+max_time_show(struct device *kobj, struct device_attribute *attr, char *buf)
+{
+ u32 ctrl = READ_ONCE(umwait_control_cached);
+
+ return sprintf(buf, "%u\n", umwait_ctrl_max_time(ctrl));
+}
+
+static ssize_t max_time_store(struct device *kobj,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ u32 max_time, ctrl;
+ int ret;
+
+ ret = kstrtou32(buf, 0, &max_time);
+ if (ret)
+ return ret;
+
+ /* bits[1:0] must be zero */
+ if (max_time & ~MSR_IA32_UMWAIT_CONTROL_TIME_MASK)
+ return -EINVAL;
+
+ mutex_lock(&umwait_lock);
+
+ ctrl = READ_ONCE(umwait_control_cached);
+ if (max_time != umwait_ctrl_max_time(ctrl))
+ umwait_update_control(max_time, umwait_ctrl_c02_enabled(ctrl));
+
+ mutex_unlock(&umwait_lock);
+
+ return count;
+}
+static DEVICE_ATTR_RW(max_time);
+
+static struct attribute *umwait_attrs[] = {
+ &dev_attr_enable_c02.attr,
+ &dev_attr_max_time.attr,
+ NULL
+};
+
+static struct attribute_group umwait_attr_group = {
+ .attrs = umwait_attrs,
+ .name = "umwait_control",
+};
+
+static int __init umwait_init(void)
+{
+ struct device *dev;
+ int ret;
+
+ if (!boot_cpu_has(X86_FEATURE_WAITPKG))
+ return -ENODEV;
+
+ ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "umwait:online",
+ umwait_cpu_online, NULL);
+
+ register_syscore_ops(&umwait_syscore_ops);
+
+ /*
+ * Add umwait control interface. Ignore failure, so at least the
+ * default values are set up in case the machine manages to boot.
+ */
+ dev = cpu_subsys.dev_root;
+ return sysfs_create_group(&dev->kobj, &umwait_attr_group);
+}
+device_initcall(umwait_init);
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 0eda91f8eeac..3c648476d4fb 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -157,7 +157,7 @@ static void __init vmware_platform_setup(void)
#ifdef CONFIG_X86_LOCAL_APIC
/* Skip lapic calibration since we know the bus frequency. */
- lapic_timer_frequency = ecx / HZ;
+ lapic_timer_period = ecx / HZ;
pr_info("Host bus clock speed read from hypervisor : %u Hz\n",
ecx);
#endif
diff --git a/arch/x86/kernel/cpu/zhaoxin.c b/arch/x86/kernel/cpu/zhaoxin.c
new file mode 100644
index 000000000000..8e6f2f4b4afe
--- /dev/null
+++ b/arch/x86/kernel/cpu/zhaoxin.c
@@ -0,0 +1,167 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/sched.h>
+#include <linux/sched/clock.h>
+
+#include <asm/cpufeature.h>
+
+#include "cpu.h"
+
+#define MSR_ZHAOXIN_FCR57 0x00001257
+
+#define ACE_PRESENT (1 << 6)
+#define ACE_ENABLED (1 << 7)
+#define ACE_FCR (1 << 7) /* MSR_ZHAOXIN_FCR */
+
+#define RNG_PRESENT (1 << 2)
+#define RNG_ENABLED (1 << 3)
+#define RNG_ENABLE (1 << 8) /* MSR_ZHAOXIN_RNG */
+
+#define X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW 0x00200000
+#define X86_VMX_FEATURE_PROC_CTLS_VNMI 0x00400000
+#define X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS 0x80000000
+#define X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC 0x00000001
+#define X86_VMX_FEATURE_PROC_CTLS2_EPT 0x00000002
+#define X86_VMX_FEATURE_PROC_CTLS2_VPID 0x00000020
+
+static void init_zhaoxin_cap(struct cpuinfo_x86 *c)
+{
+ u32 lo, hi;
+
+ /* Test for Extended Feature Flags presence */
+ if (cpuid_eax(0xC0000000) >= 0xC0000001) {
+ u32 tmp = cpuid_edx(0xC0000001);
+
+ /* Enable ACE unit, if present and disabled */
+ if ((tmp & (ACE_PRESENT | ACE_ENABLED)) == ACE_PRESENT) {
+ rdmsr(MSR_ZHAOXIN_FCR57, lo, hi);
+ /* Enable ACE unit */
+ lo |= ACE_FCR;
+ wrmsr(MSR_ZHAOXIN_FCR57, lo, hi);
+ pr_info("CPU: Enabled ACE h/w crypto\n");
+ }
+
+ /* Enable RNG unit, if present and disabled */
+ if ((tmp & (RNG_PRESENT | RNG_ENABLED)) == RNG_PRESENT) {
+ rdmsr(MSR_ZHAOXIN_FCR57, lo, hi);
+ /* Enable RNG unit */
+ lo |= RNG_ENABLE;
+ wrmsr(MSR_ZHAOXIN_FCR57, lo, hi);
+ pr_info("CPU: Enabled h/w RNG\n");
+ }
+
+ /*
+ * Store Extended Feature Flags as word 5 of the CPU
+ * capability bit array
+ */
+ c->x86_capability[CPUID_C000_0001_EDX] = cpuid_edx(0xC0000001);
+ }
+
+ if (c->x86 >= 0x6)
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+
+ cpu_detect_cache_sizes(c);
+}
+
+static void early_init_zhaoxin(struct cpuinfo_x86 *c)
+{
+ if (c->x86 >= 0x6)
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+#ifdef CONFIG_X86_64
+ set_cpu_cap(c, X86_FEATURE_SYSENTER32);
+#endif
+ if (c->x86_power & (1 << 8)) {
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+ set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
+ }
+
+ if (c->cpuid_level >= 0x00000001) {
+ u32 eax, ebx, ecx, edx;
+
+ cpuid(0x00000001, &eax, &ebx, &ecx, &edx);
+ /*
+ * If HTT (EDX[28]) is set EBX[16:23] contain the number of
+ * apicids which are reserved per package. Store the resulting
+ * shift value for the package management code.
+ */
+ if (edx & (1U << 28))
+ c->x86_coreid_bits = get_count_order((ebx >> 16) & 0xff);
+ }
+
+}
+
+static void zhaoxin_detect_vmx_virtcap(struct cpuinfo_x86 *c)
+{
+ u32 vmx_msr_low, vmx_msr_high, msr_ctl, msr_ctl2;
+
+ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high);
+ msr_ctl = vmx_msr_high | vmx_msr_low;
+
+ if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW)
+ set_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
+ if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_VNMI)
+ set_cpu_cap(c, X86_FEATURE_VNMI);
+ if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS) {
+ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
+ vmx_msr_low, vmx_msr_high);
+ msr_ctl2 = vmx_msr_high | vmx_msr_low;
+ if ((msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC) &&
+ (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW))
+ set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
+ if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_EPT)
+ set_cpu_cap(c, X86_FEATURE_EPT);
+ if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VPID)
+ set_cpu_cap(c, X86_FEATURE_VPID);
+ }
+}
+
+static void init_zhaoxin(struct cpuinfo_x86 *c)
+{
+ early_init_zhaoxin(c);
+ init_intel_cacheinfo(c);
+ detect_num_cpu_cores(c);
+#ifdef CONFIG_X86_32
+ detect_ht(c);
+#endif
+
+ if (c->cpuid_level > 9) {
+ unsigned int eax = cpuid_eax(10);
+
+ /*
+ * Check for version and the number of counters
+ * Version(eax[7:0]) can't be 0;
+ * Counters(eax[15:8]) should be greater than 1;
+ */
+ if ((eax & 0xff) && (((eax >> 8) & 0xff) > 1))
+ set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
+ }
+
+ if (c->x86 >= 0x6)
+ init_zhaoxin_cap(c);
+#ifdef CONFIG_X86_64
+ set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
+#endif
+
+ if (cpu_has(c, X86_FEATURE_VMX))
+ zhaoxin_detect_vmx_virtcap(c);
+}
+
+#ifdef CONFIG_X86_32
+static unsigned int
+zhaoxin_size_cache(struct cpuinfo_x86 *c, unsigned int size)
+{
+ return size;
+}
+#endif
+
+static const struct cpu_dev zhaoxin_cpu_dev = {
+ .c_vendor = "zhaoxin",
+ .c_ident = { " Shanghai " },
+ .c_early_init = early_init_zhaoxin,
+ .c_init = init_zhaoxin,
+#ifdef CONFIG_X86_32
+ .legacy_cache_size = zhaoxin_size_cache,
+#endif
+ .c_x86_vendor = X86_VENDOR_ZHAOXIN,
+};
+
+cpu_dev_register(zhaoxin_cpu_dev);
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 649fbc3fcf9f..12c70840980e 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -43,18 +43,6 @@ static DEFINE_PER_CPU(bool, in_kernel_fpu);
*/
DEFINE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx);
-static void kernel_fpu_disable(void)
-{
- WARN_ON_FPU(this_cpu_read(in_kernel_fpu));
- this_cpu_write(in_kernel_fpu, true);
-}
-
-static void kernel_fpu_enable(void)
-{
- WARN_ON_FPU(!this_cpu_read(in_kernel_fpu));
- this_cpu_write(in_kernel_fpu, false);
-}
-
static bool kernel_fpu_disabled(void)
{
return this_cpu_read(in_kernel_fpu);
@@ -94,42 +82,33 @@ bool irq_fpu_usable(void)
}
EXPORT_SYMBOL(irq_fpu_usable);
-static void __kernel_fpu_begin(void)
+void kernel_fpu_begin(void)
{
- struct fpu *fpu = &current->thread.fpu;
+ preempt_disable();
WARN_ON_FPU(!irq_fpu_usable());
+ WARN_ON_FPU(this_cpu_read(in_kernel_fpu));
- kernel_fpu_disable();
+ this_cpu_write(in_kernel_fpu, true);
- if (!(current->flags & PF_KTHREAD)) {
- if (!test_thread_flag(TIF_NEED_FPU_LOAD)) {
- set_thread_flag(TIF_NEED_FPU_LOAD);
- /*
- * Ignore return value -- we don't care if reg state
- * is clobbered.
- */
- copy_fpregs_to_fpstate(fpu);
- }
+ if (!(current->flags & PF_KTHREAD) &&
+ !test_thread_flag(TIF_NEED_FPU_LOAD)) {
+ set_thread_flag(TIF_NEED_FPU_LOAD);
+ /*
+ * Ignore return value -- we don't care if reg state
+ * is clobbered.
+ */
+ copy_fpregs_to_fpstate(&current->thread.fpu);
}
__cpu_invalidate_fpregs_state();
}
-
-static void __kernel_fpu_end(void)
-{
- kernel_fpu_enable();
-}
-
-void kernel_fpu_begin(void)
-{
- preempt_disable();
- __kernel_fpu_begin();
-}
EXPORT_SYMBOL_GPL(kernel_fpu_begin);
void kernel_fpu_end(void)
{
- __kernel_fpu_end();
+ WARN_ON_FPU(!this_cpu_read(in_kernel_fpu));
+
+ this_cpu_write(in_kernel_fpu, false);
preempt_enable();
}
EXPORT_SYMBOL_GPL(kernel_fpu_end);
@@ -155,7 +134,6 @@ void fpu__save(struct fpu *fpu)
trace_x86_fpu_after_save(fpu);
fpregs_unlock();
}
-EXPORT_SYMBOL_GPL(fpu__save);
/*
* Legacy x87 fpstate state init:
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index ef0030e3fe6b..6ce7e0a23268 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -204,12 +204,6 @@ static void __init fpu__init_system_xstate_size_legacy(void)
*/
if (!boot_cpu_has(X86_FEATURE_FPU)) {
- /*
- * Disable xsave as we do not support it if i387
- * emulation is enabled.
- */
- setup_clear_cpu_cap(X86_FEATURE_XSAVE);
- setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
fpu_kernel_xstate_size = sizeof(struct swregs_state);
} else {
if (boot_cpu_has(X86_FEATURE_FXSR))
@@ -252,17 +246,20 @@ static void __init fpu__init_parse_early_param(void)
char *argptr = arg;
int bit;
+#ifdef CONFIG_X86_32
if (cmdline_find_option_bool(boot_command_line, "no387"))
+#ifdef CONFIG_MATH_EMULATION
setup_clear_cpu_cap(X86_FEATURE_FPU);
+#else
+ pr_err("Option 'no387' required CONFIG_MATH_EMULATION enabled.\n");
+#endif
- if (cmdline_find_option_bool(boot_command_line, "nofxsr")) {
+ if (cmdline_find_option_bool(boot_command_line, "nofxsr"))
setup_clear_cpu_cap(X86_FEATURE_FXSR);
- setup_clear_cpu_cap(X86_FEATURE_FXSR_OPT);
- setup_clear_cpu_cap(X86_FEATURE_XMM);
- }
+#endif
if (cmdline_find_option_bool(boot_command_line, "noxsave"))
- fpu__xstate_clear_all_cpu_caps();
+ setup_clear_cpu_cap(X86_FEATURE_XSAVE);
if (cmdline_find_option_bool(boot_command_line, "noxsaveopt"))
setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 3c36dd1784db..7b4c52aa929f 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -68,15 +68,6 @@ static unsigned int xstate_comp_offsets[sizeof(xfeatures_mask)*8];
unsigned int fpu_user_xstate_size;
/*
- * Clear all of the X86_FEATURE_* bits that are unavailable
- * when the CPU has no XSAVE support.
- */
-void fpu__xstate_clear_all_cpu_caps(void)
-{
- setup_clear_cpu_cap(X86_FEATURE_XSAVE);
-}
-
-/*
* Return whether the system supports a given xfeature.
*
* Also return the name of the (most advanced) feature that the caller requested:
@@ -709,7 +700,7 @@ static void fpu__init_disable_system_xstate(void)
{
xfeatures_mask = 0;
cr4_clear_bits(X86_CR4_OSXSAVE);
- fpu__xstate_clear_all_cpu_caps();
+ setup_clear_cpu_cap(X86_FEATURE_XSAVE);
}
/*
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 0927bb158ffc..76228525acd0 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -22,6 +22,7 @@
#include <linux/init.h>
#include <linux/list.h>
#include <linux/module.h>
+#include <linux/memory.h>
#include <trace/syscall.h>
@@ -34,16 +35,25 @@
#ifdef CONFIG_DYNAMIC_FTRACE
int ftrace_arch_code_modify_prepare(void)
+ __acquires(&text_mutex)
{
+ /*
+ * Need to grab text_mutex to prevent a race from module loading
+ * and live kernel patching from changing the text permissions while
+ * ftrace has it set to "read/write".
+ */
+ mutex_lock(&text_mutex);
set_kernel_text_rw();
set_all_modules_text_rw();
return 0;
}
int ftrace_arch_code_modify_post_process(void)
+ __releases(&text_mutex)
{
set_all_modules_text_ro();
set_kernel_text_ro();
+ mutex_unlock(&text_mutex);
return 0;
}
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 16b1cbd3a61e..29ffa495bd1c 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -184,24 +184,25 @@ unsigned long __head __startup_64(unsigned long physaddr,
pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask();
if (la57) {
- p4d = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
+ p4d = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++],
+ physaddr);
i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
pgd[i + 0] = (pgdval_t)p4d + pgtable_flags;
pgd[i + 1] = (pgdval_t)p4d + pgtable_flags;
- i = (physaddr >> P4D_SHIFT) % PTRS_PER_P4D;
- p4d[i + 0] = (pgdval_t)pud + pgtable_flags;
- p4d[i + 1] = (pgdval_t)pud + pgtable_flags;
+ i = physaddr >> P4D_SHIFT;
+ p4d[(i + 0) % PTRS_PER_P4D] = (pgdval_t)pud + pgtable_flags;
+ p4d[(i + 1) % PTRS_PER_P4D] = (pgdval_t)pud + pgtable_flags;
} else {
i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
pgd[i + 0] = (pgdval_t)pud + pgtable_flags;
pgd[i + 1] = (pgdval_t)pud + pgtable_flags;
}
- i = (physaddr >> PUD_SHIFT) % PTRS_PER_PUD;
- pud[i + 0] = (pudval_t)pmd + pgtable_flags;
- pud[i + 1] = (pudval_t)pmd + pgtable_flags;
+ i = physaddr >> PUD_SHIFT;
+ pud[(i + 0) % PTRS_PER_PUD] = (pudval_t)pmd + pgtable_flags;
+ pud[(i + 1) % PTRS_PER_PUD] = (pudval_t)pmd + pgtable_flags;
pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL;
/* Filter out unsupported __PAGE_KERNEL_* bits: */
@@ -211,8 +212,9 @@ unsigned long __head __startup_64(unsigned long physaddr,
pmd_entry += physaddr;
for (i = 0; i < DIV_ROUND_UP(_end - _text, PMD_SIZE); i++) {
- int idx = i + (physaddr >> PMD_SHIFT) % PTRS_PER_PMD;
- pmd[idx] = pmd_entry + i * PMD_SIZE;
+ int idx = i + (physaddr >> PMD_SHIFT);
+
+ pmd[idx % PTRS_PER_PMD] = pmd_entry + i * PMD_SIZE;
}
/*
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index a0573f2e7763..c43e96a938d0 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -1,32 +1,44 @@
// SPDX-License-Identifier: GPL-2.0-only
-#include <linux/clocksource.h>
#include <linux/clockchips.h>
#include <linux/interrupt.h>
-#include <linux/irq.h>
#include <linux/export.h>
#include <linux/delay.h>
-#include <linux/errno.h>
-#include <linux/i8253.h>
-#include <linux/slab.h>
#include <linux/hpet.h>
-#include <linux/init.h>
#include <linux/cpu.h>
-#include <linux/pm.h>
-#include <linux/io.h>
+#include <linux/irq.h>
-#include <asm/cpufeature.h>
-#include <asm/irqdomain.h>
-#include <asm/fixmap.h>
#include <asm/hpet.h>
#include <asm/time.h>
-#define HPET_MASK CLOCKSOURCE_MASK(32)
+#undef pr_fmt
+#define pr_fmt(fmt) "hpet: " fmt
-#define HPET_DEV_USED_BIT 2
-#define HPET_DEV_USED (1 << HPET_DEV_USED_BIT)
-#define HPET_DEV_VALID 0x8
-#define HPET_DEV_FSB_CAP 0x1000
-#define HPET_DEV_PERI_CAP 0x2000
+enum hpet_mode {
+ HPET_MODE_UNUSED,
+ HPET_MODE_LEGACY,
+ HPET_MODE_CLOCKEVT,
+ HPET_MODE_DEVICE,
+};
+
+struct hpet_channel {
+ struct clock_event_device evt;
+ unsigned int num;
+ unsigned int cpu;
+ unsigned int irq;
+ unsigned int in_use;
+ enum hpet_mode mode;
+ unsigned int boot_cfg;
+ char name[10];
+};
+
+struct hpet_base {
+ unsigned int nr_channels;
+ unsigned int nr_clockevents;
+ unsigned int boot_cfg;
+ struct hpet_channel *channels;
+};
+
+#define HPET_MASK CLOCKSOURCE_MASK(32)
#define HPET_MIN_CYCLES 128
#define HPET_MIN_PROG_DELTA (HPET_MIN_CYCLES + (HPET_MIN_CYCLES >> 1))
@@ -39,22 +51,25 @@ u8 hpet_blockid; /* OS timer block num */
bool hpet_msi_disable;
#ifdef CONFIG_PCI_MSI
-static unsigned int hpet_num_timers;
+static DEFINE_PER_CPU(struct hpet_channel *, cpu_hpet_channel);
+static struct irq_domain *hpet_domain;
#endif
+
static void __iomem *hpet_virt_address;
-struct hpet_dev {
- struct clock_event_device evt;
- unsigned int num;
- int cpu;
- unsigned int irq;
- unsigned int flags;
- char name[10];
-};
+static struct hpet_base hpet_base;
+
+static bool hpet_legacy_int_enabled;
+static unsigned long hpet_freq;
-static inline struct hpet_dev *EVT_TO_HPET_DEV(struct clock_event_device *evtdev)
+bool boot_hpet_disable;
+bool hpet_force_user;
+static bool hpet_verbose;
+
+static inline
+struct hpet_channel *clockevent_to_channel(struct clock_event_device *evt)
{
- return container_of(evtdev, struct hpet_dev, evt);
+ return container_of(evt, struct hpet_channel, evt);
}
inline unsigned int hpet_readl(unsigned int a)
@@ -67,10 +82,6 @@ static inline void hpet_writel(unsigned int d, unsigned int a)
writel(d, hpet_virt_address + a);
}
-#ifdef CONFIG_X86_64
-#include <asm/pgtable.h>
-#endif
-
static inline void hpet_set_mapping(void)
{
hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE);
@@ -85,10 +96,6 @@ static inline void hpet_clear_mapping(void)
/*
* HPET command line enable / disable
*/
-bool boot_hpet_disable;
-bool hpet_force_user;
-static bool hpet_verbose;
-
static int __init hpet_setup(char *str)
{
while (str) {
@@ -120,13 +127,8 @@ static inline int is_hpet_capable(void)
return !boot_hpet_disable && hpet_address;
}
-/*
- * HPET timer interrupt enable / disable
- */
-static bool hpet_legacy_int_enabled;
-
/**
- * is_hpet_enabled - check whether the hpet timer interrupt is enabled
+ * is_hpet_enabled - Check whether the legacy HPET timer interrupt is enabled
*/
int is_hpet_enabled(void)
{
@@ -136,32 +138,36 @@ EXPORT_SYMBOL_GPL(is_hpet_enabled);
static void _hpet_print_config(const char *function, int line)
{
- u32 i, timers, l, h;
- printk(KERN_INFO "hpet: %s(%d):\n", function, line);
- l = hpet_readl(HPET_ID);
- h = hpet_readl(HPET_PERIOD);
- timers = ((l & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1;
- printk(KERN_INFO "hpet: ID: 0x%x, PERIOD: 0x%x\n", l, h);
- l = hpet_readl(HPET_CFG);
- h = hpet_readl(HPET_STATUS);
- printk(KERN_INFO "hpet: CFG: 0x%x, STATUS: 0x%x\n", l, h);
+ u32 i, id, period, cfg, status, channels, l, h;
+
+ pr_info("%s(%d):\n", function, line);
+
+ id = hpet_readl(HPET_ID);
+ period = hpet_readl(HPET_PERIOD);
+ pr_info("ID: 0x%x, PERIOD: 0x%x\n", id, period);
+
+ cfg = hpet_readl(HPET_CFG);
+ status = hpet_readl(HPET_STATUS);
+ pr_info("CFG: 0x%x, STATUS: 0x%x\n", cfg, status);
+
l = hpet_readl(HPET_COUNTER);
h = hpet_readl(HPET_COUNTER+4);
- printk(KERN_INFO "hpet: COUNTER_l: 0x%x, COUNTER_h: 0x%x\n", l, h);
+ pr_info("COUNTER_l: 0x%x, COUNTER_h: 0x%x\n", l, h);
+
+ channels = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1;
- for (i = 0; i < timers; i++) {
+ for (i = 0; i < channels; i++) {
l = hpet_readl(HPET_Tn_CFG(i));
h = hpet_readl(HPET_Tn_CFG(i)+4);
- printk(KERN_INFO "hpet: T%d: CFG_l: 0x%x, CFG_h: 0x%x\n",
- i, l, h);
+ pr_info("T%d: CFG_l: 0x%x, CFG_h: 0x%x\n", i, l, h);
+
l = hpet_readl(HPET_Tn_CMP(i));
h = hpet_readl(HPET_Tn_CMP(i)+4);
- printk(KERN_INFO "hpet: T%d: CMP_l: 0x%x, CMP_h: 0x%x\n",
- i, l, h);
+ pr_info("T%d: CMP_l: 0x%x, CMP_h: 0x%x\n", i, l, h);
+
l = hpet_readl(HPET_Tn_ROUTE(i));
h = hpet_readl(HPET_Tn_ROUTE(i)+4);
- printk(KERN_INFO "hpet: T%d ROUTE_l: 0x%x, ROUTE_h: 0x%x\n",
- i, l, h);
+ pr_info("T%d ROUTE_l: 0x%x, ROUTE_h: 0x%x\n", i, l, h);
}
}
@@ -172,31 +178,20 @@ do { \
} while (0)
/*
- * When the hpet driver (/dev/hpet) is enabled, we need to reserve
+ * When the HPET driver (/dev/hpet) is enabled, we need to reserve
* timer 0 and timer 1 in case of RTC emulation.
*/
#ifdef CONFIG_HPET
-static void hpet_reserve_msi_timers(struct hpet_data *hd);
-
-static void hpet_reserve_platform_timers(unsigned int id)
+static void __init hpet_reserve_platform_timers(void)
{
- struct hpet __iomem *hpet = hpet_virt_address;
- struct hpet_timer __iomem *timer = &hpet->hpet_timers[2];
- unsigned int nrtimers, i;
struct hpet_data hd;
-
- nrtimers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1;
+ unsigned int i;
memset(&hd, 0, sizeof(hd));
hd.hd_phys_address = hpet_address;
- hd.hd_address = hpet;
- hd.hd_nirqs = nrtimers;
- hpet_reserve_timer(&hd, 0);
-
-#ifdef CONFIG_HPET_EMULATE_RTC
- hpet_reserve_timer(&hd, 1);
-#endif
+ hd.hd_address = hpet_virt_address;
+ hd.hd_nirqs = hpet_base.nr_channels;
/*
* NOTE that hd_irq[] reflects IOAPIC input pins (LEGACY_8254
@@ -206,30 +201,52 @@ static void hpet_reserve_platform_timers(unsigned int id)
hd.hd_irq[0] = HPET_LEGACY_8254;
hd.hd_irq[1] = HPET_LEGACY_RTC;
- for (i = 2; i < nrtimers; timer++, i++) {
- hd.hd_irq[i] = (readl(&timer->hpet_config) &
- Tn_INT_ROUTE_CNF_MASK) >> Tn_INT_ROUTE_CNF_SHIFT;
- }
+ for (i = 0; i < hpet_base.nr_channels; i++) {
+ struct hpet_channel *hc = hpet_base.channels + i;
+
+ if (i >= 2)
+ hd.hd_irq[i] = hc->irq;
- hpet_reserve_msi_timers(&hd);
+ switch (hc->mode) {
+ case HPET_MODE_UNUSED:
+ case HPET_MODE_DEVICE:
+ hc->mode = HPET_MODE_DEVICE;
+ break;
+ case HPET_MODE_CLOCKEVT:
+ case HPET_MODE_LEGACY:
+ hpet_reserve_timer(&hd, hc->num);
+ break;
+ }
+ }
hpet_alloc(&hd);
+}
+static void __init hpet_select_device_channel(void)
+{
+ int i;
+
+ for (i = 0; i < hpet_base.nr_channels; i++) {
+ struct hpet_channel *hc = hpet_base.channels + i;
+
+ /* Associate the first unused channel to /dev/hpet */
+ if (hc->mode == HPET_MODE_UNUSED) {
+ hc->mode = HPET_MODE_DEVICE;
+ return;
+ }
+ }
}
+
#else
-static void hpet_reserve_platform_timers(unsigned int id) { }
+static inline void hpet_reserve_platform_timers(void) { }
+static inline void hpet_select_device_channel(void) {}
#endif
-/*
- * Common hpet info
- */
-static unsigned long hpet_freq;
-
-static struct clock_event_device hpet_clockevent;
-
+/* Common HPET functions */
static void hpet_stop_counter(void)
{
u32 cfg = hpet_readl(HPET_CFG);
+
cfg &= ~HPET_CFG_ENABLE;
hpet_writel(cfg, HPET_CFG);
}
@@ -243,6 +260,7 @@ static void hpet_reset_counter(void)
static void hpet_start_counter(void)
{
unsigned int cfg = hpet_readl(HPET_CFG);
+
cfg |= HPET_CFG_ENABLE;
hpet_writel(cfg, HPET_CFG);
}
@@ -274,24 +292,9 @@ static void hpet_enable_legacy_int(void)
hpet_legacy_int_enabled = true;
}
-static void hpet_legacy_clockevent_register(void)
-{
- /* Start HPET legacy interrupts */
- hpet_enable_legacy_int();
-
- /*
- * Start hpet with the boot cpu mask and make it
- * global after the IO_APIC has been initialized.
- */
- hpet_clockevent.cpumask = cpumask_of(boot_cpu_data.cpu_index);
- clockevents_config_and_register(&hpet_clockevent, hpet_freq,
- HPET_MIN_PROG_DELTA, 0x7FFFFFFF);
- global_clock_event = &hpet_clockevent;
- printk(KERN_DEBUG "hpet clockevent registered\n");
-}
-
-static int hpet_set_periodic(struct clock_event_device *evt, int timer)
+static int hpet_clkevt_set_state_periodic(struct clock_event_device *evt)
{
+ unsigned int channel = clockevent_to_channel(evt)->num;
unsigned int cfg, cmp, now;
uint64_t delta;
@@ -300,11 +303,11 @@ static int hpet_set_periodic(struct clock_event_device *evt, int timer)
delta >>= evt->shift;
now = hpet_readl(HPET_COUNTER);
cmp = now + (unsigned int)delta;
- cfg = hpet_readl(HPET_Tn_CFG(timer));
+ cfg = hpet_readl(HPET_Tn_CFG(channel));
cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL |
HPET_TN_32BIT;
- hpet_writel(cfg, HPET_Tn_CFG(timer));
- hpet_writel(cmp, HPET_Tn_CMP(timer));
+ hpet_writel(cfg, HPET_Tn_CFG(channel));
+ hpet_writel(cmp, HPET_Tn_CMP(channel));
udelay(1);
/*
* HPET on AMD 81xx needs a second write (with HPET_TN_SETVAL
@@ -313,52 +316,55 @@ static int hpet_set_periodic(struct clock_event_device *evt, int timer)
* (See AMD-8111 HyperTransport I/O Hub Data Sheet,
* Publication # 24674)
*/
- hpet_writel((unsigned int)delta, HPET_Tn_CMP(timer));
+ hpet_writel((unsigned int)delta, HPET_Tn_CMP(channel));
hpet_start_counter();
hpet_print_config();
return 0;
}
-static int hpet_set_oneshot(struct clock_event_device *evt, int timer)
+static int hpet_clkevt_set_state_oneshot(struct clock_event_device *evt)
{
+ unsigned int channel = clockevent_to_channel(evt)->num;
unsigned int cfg;
- cfg = hpet_readl(HPET_Tn_CFG(timer));
+ cfg = hpet_readl(HPET_Tn_CFG(channel));
cfg &= ~HPET_TN_PERIODIC;
cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
- hpet_writel(cfg, HPET_Tn_CFG(timer));
+ hpet_writel(cfg, HPET_Tn_CFG(channel));
return 0;
}
-static int hpet_shutdown(struct clock_event_device *evt, int timer)
+static int hpet_clkevt_set_state_shutdown(struct clock_event_device *evt)
{
+ unsigned int channel = clockevent_to_channel(evt)->num;
unsigned int cfg;
- cfg = hpet_readl(HPET_Tn_CFG(timer));
+ cfg = hpet_readl(HPET_Tn_CFG(channel));
cfg &= ~HPET_TN_ENABLE;
- hpet_writel(cfg, HPET_Tn_CFG(timer));
+ hpet_writel(cfg, HPET_Tn_CFG(channel));
return 0;
}
-static int hpet_resume(struct clock_event_device *evt)
+static int hpet_clkevt_legacy_resume(struct clock_event_device *evt)
{
hpet_enable_legacy_int();
hpet_print_config();
return 0;
}
-static int hpet_next_event(unsigned long delta,
- struct clock_event_device *evt, int timer)
+static int
+hpet_clkevt_set_next_event(unsigned long delta, struct clock_event_device *evt)
{
+ unsigned int channel = clockevent_to_channel(evt)->num;
u32 cnt;
s32 res;
cnt = hpet_readl(HPET_COUNTER);
cnt += (u32) delta;
- hpet_writel(cnt, HPET_Tn_CMP(timer));
+ hpet_writel(cnt, HPET_Tn_CMP(channel));
/*
* HPETs are a complete disaster. The compare register is
@@ -387,360 +393,250 @@ static int hpet_next_event(unsigned long delta,
return res < HPET_MIN_CYCLES ? -ETIME : 0;
}
-static int hpet_legacy_shutdown(struct clock_event_device *evt)
+static void hpet_init_clockevent(struct hpet_channel *hc, unsigned int rating)
{
- return hpet_shutdown(evt, 0);
-}
+ struct clock_event_device *evt = &hc->evt;
-static int hpet_legacy_set_oneshot(struct clock_event_device *evt)
-{
- return hpet_set_oneshot(evt, 0);
-}
+ evt->rating = rating;
+ evt->irq = hc->irq;
+ evt->name = hc->name;
+ evt->cpumask = cpumask_of(hc->cpu);
+ evt->set_state_oneshot = hpet_clkevt_set_state_oneshot;
+ evt->set_next_event = hpet_clkevt_set_next_event;
+ evt->set_state_shutdown = hpet_clkevt_set_state_shutdown;
-static int hpet_legacy_set_periodic(struct clock_event_device *evt)
-{
- return hpet_set_periodic(evt, 0);
+ evt->features = CLOCK_EVT_FEAT_ONESHOT;
+ if (hc->boot_cfg & HPET_TN_PERIODIC) {
+ evt->features |= CLOCK_EVT_FEAT_PERIODIC;
+ evt->set_state_periodic = hpet_clkevt_set_state_periodic;
+ }
}
-static int hpet_legacy_resume(struct clock_event_device *evt)
+static void __init hpet_legacy_clockevent_register(struct hpet_channel *hc)
{
- return hpet_resume(evt);
-}
+ /*
+ * Start HPET with the boot CPU's cpumask and make it global after
+ * the IO_APIC has been initialized.
+ */
+ hc->cpu = boot_cpu_data.cpu_index;
+ strncpy(hc->name, "hpet", sizeof(hc->name));
+ hpet_init_clockevent(hc, 50);
-static int hpet_legacy_next_event(unsigned long delta,
- struct clock_event_device *evt)
-{
- return hpet_next_event(delta, evt, 0);
-}
+ hc->evt.tick_resume = hpet_clkevt_legacy_resume;
-/*
- * The hpet clock event device
- */
-static struct clock_event_device hpet_clockevent = {
- .name = "hpet",
- .features = CLOCK_EVT_FEAT_PERIODIC |
- CLOCK_EVT_FEAT_ONESHOT,
- .set_state_periodic = hpet_legacy_set_periodic,
- .set_state_oneshot = hpet_legacy_set_oneshot,
- .set_state_shutdown = hpet_legacy_shutdown,
- .tick_resume = hpet_legacy_resume,
- .set_next_event = hpet_legacy_next_event,
- .irq = 0,
- .rating = 50,
-};
+ /*
+ * Legacy horrors and sins from the past. HPET used periodic mode
+ * unconditionally forever on the legacy channel 0. Removing the
+ * below hack and using the conditional in hpet_init_clockevent()
+ * makes at least Qemu and one hardware machine fail to boot.
+ * There are two issues which cause the boot failure:
+ *
+ * #1 After the timer delivery test in IOAPIC and the IOAPIC setup
+ * the next interrupt is not delivered despite the HPET channel
+ * being programmed correctly. Reprogramming the HPET after
+ * switching to IOAPIC makes it work again. After fixing this,
+ * the next issue surfaces:
+ *
+ * #2 Due to the unconditional periodic mode availability the Local
+ * APIC timer calibration can hijack the global clockevents
+ * event handler without causing damage. Using oneshot at this
+ * stage makes if hang because the HPET does not get
+ * reprogrammed due to the handler hijacking. Duh, stupid me!
+ *
+ * Both issues require major surgery and especially the kick HPET
+ * again after enabling IOAPIC results in really nasty hackery.
+ * This 'assume periodic works' magic has survived since HPET
+ * support got added, so it's questionable whether this should be
+ * fixed. Both Qemu and the failing hardware machine support
+ * periodic mode despite the fact that both don't advertise it in
+ * the configuration register and both need that extra kick after
+ * switching to IOAPIC. Seems to be a feature...
+ */
+ hc->evt.features |= CLOCK_EVT_FEAT_PERIODIC;
+ hc->evt.set_state_periodic = hpet_clkevt_set_state_periodic;
+
+ /* Start HPET legacy interrupts */
+ hpet_enable_legacy_int();
+
+ clockevents_config_and_register(&hc->evt, hpet_freq,
+ HPET_MIN_PROG_DELTA, 0x7FFFFFFF);
+ global_clock_event = &hc->evt;
+ pr_debug("Clockevent registered\n");
+}
/*
* HPET MSI Support
*/
#ifdef CONFIG_PCI_MSI
-static DEFINE_PER_CPU(struct hpet_dev *, cpu_hpet_dev);
-static struct hpet_dev *hpet_devs;
-static struct irq_domain *hpet_domain;
-
void hpet_msi_unmask(struct irq_data *data)
{
- struct hpet_dev *hdev = irq_data_get_irq_handler_data(data);
+ struct hpet_channel *hc = irq_data_get_irq_handler_data(data);
unsigned int cfg;
- /* unmask it */
- cfg = hpet_readl(HPET_Tn_CFG(hdev->num));
+ cfg = hpet_readl(HPET_Tn_CFG(hc->num));
cfg |= HPET_TN_ENABLE | HPET_TN_FSB;
- hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
+ hpet_writel(cfg, HPET_Tn_CFG(hc->num));
}
void hpet_msi_mask(struct irq_data *data)
{
- struct hpet_dev *hdev = irq_data_get_irq_handler_data(data);
+ struct hpet_channel *hc = irq_data_get_irq_handler_data(data);
unsigned int cfg;
- /* mask it */
- cfg = hpet_readl(HPET_Tn_CFG(hdev->num));
+ cfg = hpet_readl(HPET_Tn_CFG(hc->num));
cfg &= ~(HPET_TN_ENABLE | HPET_TN_FSB);
- hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
-}
-
-void hpet_msi_write(struct hpet_dev *hdev, struct msi_msg *msg)
-{
- hpet_writel(msg->data, HPET_Tn_ROUTE(hdev->num));
- hpet_writel(msg->address_lo, HPET_Tn_ROUTE(hdev->num) + 4);
+ hpet_writel(cfg, HPET_Tn_CFG(hc->num));
}
-void hpet_msi_read(struct hpet_dev *hdev, struct msi_msg *msg)
+void hpet_msi_write(struct hpet_channel *hc, struct msi_msg *msg)
{
- msg->data = hpet_readl(HPET_Tn_ROUTE(hdev->num));
- msg->address_lo = hpet_readl(HPET_Tn_ROUTE(hdev->num) + 4);
- msg->address_hi = 0;
+ hpet_writel(msg->data, HPET_Tn_ROUTE(hc->num));
+ hpet_writel(msg->address_lo, HPET_Tn_ROUTE(hc->num) + 4);
}
-static int hpet_msi_shutdown(struct clock_event_device *evt)
+static int hpet_clkevt_msi_resume(struct clock_event_device *evt)
{
- struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
-
- return hpet_shutdown(evt, hdev->num);
-}
-
-static int hpet_msi_set_oneshot(struct clock_event_device *evt)
-{
- struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
-
- return hpet_set_oneshot(evt, hdev->num);
-}
-
-static int hpet_msi_set_periodic(struct clock_event_device *evt)
-{
- struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
-
- return hpet_set_periodic(evt, hdev->num);
-}
-
-static int hpet_msi_resume(struct clock_event_device *evt)
-{
- struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
- struct irq_data *data = irq_get_irq_data(hdev->irq);
+ struct hpet_channel *hc = clockevent_to_channel(evt);
+ struct irq_data *data = irq_get_irq_data(hc->irq);
struct msi_msg msg;
/* Restore the MSI msg and unmask the interrupt */
irq_chip_compose_msi_msg(data, &msg);
- hpet_msi_write(hdev, &msg);
+ hpet_msi_write(hc, &msg);
hpet_msi_unmask(data);
return 0;
}
-static int hpet_msi_next_event(unsigned long delta,
- struct clock_event_device *evt)
-{
- struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
- return hpet_next_event(delta, evt, hdev->num);
-}
-
-static irqreturn_t hpet_interrupt_handler(int irq, void *data)
+static irqreturn_t hpet_msi_interrupt_handler(int irq, void *data)
{
- struct hpet_dev *dev = (struct hpet_dev *)data;
- struct clock_event_device *hevt = &dev->evt;
+ struct hpet_channel *hc = data;
+ struct clock_event_device *evt = &hc->evt;
- if (!hevt->event_handler) {
- printk(KERN_INFO "Spurious HPET timer interrupt on HPET timer %d\n",
- dev->num);
+ if (!evt->event_handler) {
+ pr_info("Spurious interrupt HPET channel %d\n", hc->num);
return IRQ_HANDLED;
}
- hevt->event_handler(hevt);
+ evt->event_handler(evt);
return IRQ_HANDLED;
}
-static int hpet_setup_irq(struct hpet_dev *dev)
+static int hpet_setup_msi_irq(struct hpet_channel *hc)
{
-
- if (request_irq(dev->irq, hpet_interrupt_handler,
+ if (request_irq(hc->irq, hpet_msi_interrupt_handler,
IRQF_TIMER | IRQF_NOBALANCING,
- dev->name, dev))
+ hc->name, hc))
return -1;
- disable_irq(dev->irq);
- irq_set_affinity(dev->irq, cpumask_of(dev->cpu));
- enable_irq(dev->irq);
+ disable_irq(hc->irq);
+ irq_set_affinity(hc->irq, cpumask_of(hc->cpu));
+ enable_irq(hc->irq);
- printk(KERN_DEBUG "hpet: %s irq %d for MSI\n",
- dev->name, dev->irq);
+ pr_debug("%s irq %u for MSI\n", hc->name, hc->irq);
return 0;
}
-/* This should be called in specific @cpu */
-static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu)
+/* Invoked from the hotplug callback on @cpu */
+static void init_one_hpet_msi_clockevent(struct hpet_channel *hc, int cpu)
{
- struct clock_event_device *evt = &hdev->evt;
-
- WARN_ON(cpu != smp_processor_id());
- if (!(hdev->flags & HPET_DEV_VALID))
- return;
-
- hdev->cpu = cpu;
- per_cpu(cpu_hpet_dev, cpu) = hdev;
- evt->name = hdev->name;
- hpet_setup_irq(hdev);
- evt->irq = hdev->irq;
+ struct clock_event_device *evt = &hc->evt;
- evt->rating = 110;
- evt->features = CLOCK_EVT_FEAT_ONESHOT;
- if (hdev->flags & HPET_DEV_PERI_CAP) {
- evt->features |= CLOCK_EVT_FEAT_PERIODIC;
- evt->set_state_periodic = hpet_msi_set_periodic;
- }
+ hc->cpu = cpu;
+ per_cpu(cpu_hpet_channel, cpu) = hc;
+ hpet_setup_msi_irq(hc);
- evt->set_state_shutdown = hpet_msi_shutdown;
- evt->set_state_oneshot = hpet_msi_set_oneshot;
- evt->tick_resume = hpet_msi_resume;
- evt->set_next_event = hpet_msi_next_event;
- evt->cpumask = cpumask_of(hdev->cpu);
+ hpet_init_clockevent(hc, 110);
+ evt->tick_resume = hpet_clkevt_msi_resume;
clockevents_config_and_register(evt, hpet_freq, HPET_MIN_PROG_DELTA,
0x7FFFFFFF);
}
-#ifdef CONFIG_HPET
-/* Reserve at least one timer for userspace (/dev/hpet) */
-#define RESERVE_TIMERS 1
-#else
-#define RESERVE_TIMERS 0
-#endif
-
-static void hpet_msi_capability_lookup(unsigned int start_timer)
+static struct hpet_channel *hpet_get_unused_clockevent(void)
{
- unsigned int id;
- unsigned int num_timers;
- unsigned int num_timers_used = 0;
- int i, irq;
-
- if (hpet_msi_disable)
- return;
-
- if (boot_cpu_has(X86_FEATURE_ARAT))
- return;
- id = hpet_readl(HPET_ID);
-
- num_timers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT);
- num_timers++; /* Value read out starts from 0 */
- hpet_print_config();
-
- hpet_domain = hpet_create_irq_domain(hpet_blockid);
- if (!hpet_domain)
- return;
-
- hpet_devs = kcalloc(num_timers, sizeof(struct hpet_dev), GFP_KERNEL);
- if (!hpet_devs)
- return;
-
- hpet_num_timers = num_timers;
-
- for (i = start_timer; i < num_timers - RESERVE_TIMERS; i++) {
- struct hpet_dev *hdev = &hpet_devs[num_timers_used];
- unsigned int cfg = hpet_readl(HPET_Tn_CFG(i));
-
- /* Only consider HPET timer with MSI support */
- if (!(cfg & HPET_TN_FSB_CAP))
- continue;
+ int i;
- hdev->flags = 0;
- if (cfg & HPET_TN_PERIODIC_CAP)
- hdev->flags |= HPET_DEV_PERI_CAP;
- sprintf(hdev->name, "hpet%d", i);
- hdev->num = i;
+ for (i = 0; i < hpet_base.nr_channels; i++) {
+ struct hpet_channel *hc = hpet_base.channels + i;
- irq = hpet_assign_irq(hpet_domain, hdev, hdev->num);
- if (irq <= 0)
+ if (hc->mode != HPET_MODE_CLOCKEVT || hc->in_use)
continue;
-
- hdev->irq = irq;
- hdev->flags |= HPET_DEV_FSB_CAP;
- hdev->flags |= HPET_DEV_VALID;
- num_timers_used++;
- if (num_timers_used == num_possible_cpus())
- break;
+ hc->in_use = 1;
+ return hc;
}
-
- printk(KERN_INFO "HPET: %d timers in total, %d timers will be used for per-cpu timer\n",
- num_timers, num_timers_used);
+ return NULL;
}
-#ifdef CONFIG_HPET
-static void hpet_reserve_msi_timers(struct hpet_data *hd)
+static int hpet_cpuhp_online(unsigned int cpu)
{
- int i;
-
- if (!hpet_devs)
- return;
+ struct hpet_channel *hc = hpet_get_unused_clockevent();
- for (i = 0; i < hpet_num_timers; i++) {
- struct hpet_dev *hdev = &hpet_devs[i];
+ if (hc)
+ init_one_hpet_msi_clockevent(hc, cpu);
+ return 0;
+}
- if (!(hdev->flags & HPET_DEV_VALID))
- continue;
+static int hpet_cpuhp_dead(unsigned int cpu)
+{
+ struct hpet_channel *hc = per_cpu(cpu_hpet_channel, cpu);
- hd->hd_irq[hdev->num] = hdev->irq;
- hpet_reserve_timer(hd, hdev->num);
- }
+ if (!hc)
+ return 0;
+ free_irq(hc->irq, hc);
+ hc->in_use = 0;
+ per_cpu(cpu_hpet_channel, cpu) = NULL;
+ return 0;
}
-#endif
-static struct hpet_dev *hpet_get_unused_timer(void)
+static void __init hpet_select_clockevents(void)
{
- int i;
+ unsigned int i;
- if (!hpet_devs)
- return NULL;
+ hpet_base.nr_clockevents = 0;
- for (i = 0; i < hpet_num_timers; i++) {
- struct hpet_dev *hdev = &hpet_devs[i];
+ /* No point if MSI is disabled or CPU has an Always Runing APIC Timer */
+ if (hpet_msi_disable || boot_cpu_has(X86_FEATURE_ARAT))
+ return;
- if (!(hdev->flags & HPET_DEV_VALID))
- continue;
- if (test_and_set_bit(HPET_DEV_USED_BIT,
- (unsigned long *)&hdev->flags))
- continue;
- return hdev;
- }
- return NULL;
-}
+ hpet_print_config();
-struct hpet_work_struct {
- struct delayed_work work;
- struct completion complete;
-};
+ hpet_domain = hpet_create_irq_domain(hpet_blockid);
+ if (!hpet_domain)
+ return;
-static void hpet_work(struct work_struct *w)
-{
- struct hpet_dev *hdev;
- int cpu = smp_processor_id();
- struct hpet_work_struct *hpet_work;
+ for (i = 0; i < hpet_base.nr_channels; i++) {
+ struct hpet_channel *hc = hpet_base.channels + i;
+ int irq;
- hpet_work = container_of(w, struct hpet_work_struct, work.work);
+ if (hc->mode != HPET_MODE_UNUSED)
+ continue;
- hdev = hpet_get_unused_timer();
- if (hdev)
- init_one_hpet_msi_clockevent(hdev, cpu);
+ /* Only consider HPET channel with MSI support */
+ if (!(hc->boot_cfg & HPET_TN_FSB_CAP))
+ continue;
- complete(&hpet_work->complete);
-}
+ sprintf(hc->name, "hpet%d", i);
-static int hpet_cpuhp_online(unsigned int cpu)
-{
- struct hpet_work_struct work;
-
- INIT_DELAYED_WORK_ONSTACK(&work.work, hpet_work);
- init_completion(&work.complete);
- /* FIXME: add schedule_work_on() */
- schedule_delayed_work_on(cpu, &work.work, 0);
- wait_for_completion(&work.complete);
- destroy_delayed_work_on_stack(&work.work);
- return 0;
-}
+ irq = hpet_assign_irq(hpet_domain, hc, hc->num);
+ if (irq <= 0)
+ continue;
-static int hpet_cpuhp_dead(unsigned int cpu)
-{
- struct hpet_dev *hdev = per_cpu(cpu_hpet_dev, cpu);
+ hc->irq = irq;
+ hc->mode = HPET_MODE_CLOCKEVT;
- if (!hdev)
- return 0;
- free_irq(hdev->irq, hdev);
- hdev->flags &= ~HPET_DEV_USED;
- per_cpu(cpu_hpet_dev, cpu) = NULL;
- return 0;
-}
-#else
+ if (++hpet_base.nr_clockevents == num_possible_cpus())
+ break;
+ }
-static void hpet_msi_capability_lookup(unsigned int start_timer)
-{
- return;
+ pr_info("%d channels of %d reserved for per-cpu timers\n",
+ hpet_base.nr_channels, hpet_base.nr_clockevents);
}
-#ifdef CONFIG_HPET
-static void hpet_reserve_msi_timers(struct hpet_data *hd)
-{
- return;
-}
-#endif
+#else
+
+static inline void hpet_select_clockevents(void) { }
#define hpet_cpuhp_online NULL
#define hpet_cpuhp_dead NULL
@@ -754,10 +650,10 @@ static void hpet_reserve_msi_timers(struct hpet_data *hd)
/*
* Reading the HPET counter is a very slow operation. If a large number of
* CPUs are trying to access the HPET counter simultaneously, it can cause
- * massive delay and slow down system performance dramatically. This may
+ * massive delays and slow down system performance dramatically. This may
* happen when HPET is the default clock source instead of TSC. For a
* really large system with hundreds of CPUs, the slowdown may be so
- * severe that it may actually crash the system because of a NMI watchdog
+ * severe, that it can actually crash the system because of a NMI watchdog
* soft lockup, for example.
*
* If multiple CPUs are trying to access the HPET counter at the same time,
@@ -766,10 +662,9 @@ static void hpet_reserve_msi_timers(struct hpet_data *hd)
*
* This special feature is only enabled on x86-64 systems. It is unlikely
* that 32-bit x86 systems will have enough CPUs to require this feature
- * with its associated locking overhead. And we also need 64-bit atomic
- * read.
+ * with its associated locking overhead. We also need 64-bit atomic read.
*
- * The lock and the hpet value are stored together and can be read in a
+ * The lock and the HPET value are stored together and can be read in a
* single atomic 64-bit read. It is explicitly assumed that arch_spinlock_t
* is 32 bits in size.
*/
@@ -858,15 +753,40 @@ static struct clocksource clocksource_hpet = {
.resume = hpet_resume_counter,
};
-static int hpet_clocksource_register(void)
+/*
+ * AMD SB700 based systems with spread spectrum enabled use a SMM based
+ * HPET emulation to provide proper frequency setting.
+ *
+ * On such systems the SMM code is initialized with the first HPET register
+ * access and takes some time to complete. During this time the config
+ * register reads 0xffffffff. We check for max 1000 loops whether the
+ * config register reads a non-0xffffffff value to make sure that the
+ * HPET is up and running before we proceed any further.
+ *
+ * A counting loop is safe, as the HPET access takes thousands of CPU cycles.
+ *
+ * On non-SB700 based machines this check is only done once and has no
+ * side effects.
+ */
+static bool __init hpet_cfg_working(void)
{
- u64 start, now;
- u64 t1;
+ int i;
+
+ for (i = 0; i < 1000; i++) {
+ if (hpet_readl(HPET_CFG) != 0xFFFFFFFF)
+ return true;
+ }
+
+ pr_warn("Config register invalid. Disabling HPET\n");
+ return false;
+}
+
+static bool __init hpet_counting(void)
+{
+ u64 start, now, t1;
- /* Start the counter */
hpet_restart_counter();
- /* Verify whether hpet counter works */
t1 = hpet_readl(HPET_COUNTER);
start = rdtsc();
@@ -877,30 +797,24 @@ static int hpet_clocksource_register(void)
* 1 GHz == 200us
*/
do {
- rep_nop();
+ if (t1 != hpet_readl(HPET_COUNTER))
+ return true;
now = rdtsc();
} while ((now - start) < 200000UL);
- if (t1 == hpet_readl(HPET_COUNTER)) {
- printk(KERN_WARNING
- "HPET counter not counting. HPET disabled\n");
- return -ENODEV;
- }
-
- clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq);
- return 0;
+ pr_warn("Counter not counting. HPET disabled\n");
+ return false;
}
-static u32 *hpet_boot_cfg;
-
/**
* hpet_enable - Try to setup the HPET timer. Returns 1 on success.
*/
int __init hpet_enable(void)
{
- u32 hpet_period, cfg, id;
+ u32 hpet_period, cfg, id, irq;
+ unsigned int i, channels;
+ struct hpet_channel *hc;
u64 freq;
- unsigned int i, last;
if (!is_hpet_capable())
return 0;
@@ -909,40 +823,22 @@ int __init hpet_enable(void)
if (!hpet_virt_address)
return 0;
+ /* Validate that the config register is working */
+ if (!hpet_cfg_working())
+ goto out_nohpet;
+
+ /* Validate that the counter is counting */
+ if (!hpet_counting())
+ goto out_nohpet;
+
/*
* Read the period and check for a sane value:
*/
hpet_period = hpet_readl(HPET_PERIOD);
-
- /*
- * AMD SB700 based systems with spread spectrum enabled use a
- * SMM based HPET emulation to provide proper frequency
- * setting. The SMM code is initialized with the first HPET
- * register access and takes some time to complete. During
- * this time the config register reads 0xffffffff. We check
- * for max. 1000 loops whether the config register reads a non
- * 0xffffffff value to make sure that HPET is up and running
- * before we go further. A counting loop is safe, as the HPET
- * access takes thousands of CPU cycles. On non SB700 based
- * machines this check is only done once and has no side
- * effects.
- */
- for (i = 0; hpet_readl(HPET_CFG) == 0xFFFFFFFF; i++) {
- if (i == 1000) {
- printk(KERN_WARNING
- "HPET config register value = 0xFFFFFFFF. "
- "Disabling HPET\n");
- goto out_nohpet;
- }
- }
-
if (hpet_period < HPET_MIN_PERIOD || hpet_period > HPET_MAX_PERIOD)
goto out_nohpet;
- /*
- * The period is a femto seconds value. Convert it to a
- * frequency.
- */
+ /* The period is a femtoseconds value. Convert it to a frequency. */
freq = FSEC_PER_SEC;
do_div(freq, hpet_period);
hpet_freq = freq;
@@ -954,72 +850,90 @@ int __init hpet_enable(void)
id = hpet_readl(HPET_ID);
hpet_print_config();
- last = (id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT;
+ /* This is the HPET channel number which is zero based */
+ channels = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1;
-#ifdef CONFIG_HPET_EMULATE_RTC
/*
* The legacy routing mode needs at least two channels, tick timer
* and the rtc emulation channel.
*/
- if (!last)
+ if (IS_ENABLED(CONFIG_HPET_EMULATE_RTC) && channels < 2)
goto out_nohpet;
-#endif
+ hc = kcalloc(channels, sizeof(*hc), GFP_KERNEL);
+ if (!hc) {
+ pr_warn("Disabling HPET.\n");
+ goto out_nohpet;
+ }
+ hpet_base.channels = hc;
+ hpet_base.nr_channels = channels;
+
+ /* Read, store and sanitize the global configuration */
cfg = hpet_readl(HPET_CFG);
- hpet_boot_cfg = kmalloc_array(last + 2, sizeof(*hpet_boot_cfg),
- GFP_KERNEL);
- if (hpet_boot_cfg)
- *hpet_boot_cfg = cfg;
- else
- pr_warn("HPET initial state will not be saved\n");
+ hpet_base.boot_cfg = cfg;
cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY);
hpet_writel(cfg, HPET_CFG);
if (cfg)
- pr_warn("Unrecognized bits %#x set in global cfg\n", cfg);
+ pr_warn("Global config: Unknown bits %#x\n", cfg);
+
+ /* Read, store and sanitize the per channel configuration */
+ for (i = 0; i < channels; i++, hc++) {
+ hc->num = i;
- for (i = 0; i <= last; ++i) {
cfg = hpet_readl(HPET_Tn_CFG(i));
- if (hpet_boot_cfg)
- hpet_boot_cfg[i + 1] = cfg;
+ hc->boot_cfg = cfg;
+ irq = (cfg & Tn_INT_ROUTE_CNF_MASK) >> Tn_INT_ROUTE_CNF_SHIFT;
+ hc->irq = irq;
+
cfg &= ~(HPET_TN_ENABLE | HPET_TN_LEVEL | HPET_TN_FSB);
hpet_writel(cfg, HPET_Tn_CFG(i));
+
cfg &= ~(HPET_TN_PERIODIC | HPET_TN_PERIODIC_CAP
| HPET_TN_64BIT_CAP | HPET_TN_32BIT | HPET_TN_ROUTE
| HPET_TN_FSB | HPET_TN_FSB_CAP);
if (cfg)
- pr_warn("Unrecognized bits %#x set in cfg#%u\n",
- cfg, i);
+ pr_warn("Channel #%u config: Unknown bits %#x\n", i, cfg);
}
hpet_print_config();
- if (hpet_clocksource_register())
- goto out_nohpet;
+ clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq);
if (id & HPET_ID_LEGSUP) {
- hpet_legacy_clockevent_register();
+ hpet_legacy_clockevent_register(&hpet_base.channels[0]);
+ hpet_base.channels[0].mode = HPET_MODE_LEGACY;
+ if (IS_ENABLED(CONFIG_HPET_EMULATE_RTC))
+ hpet_base.channels[1].mode = HPET_MODE_LEGACY;
return 1;
}
return 0;
out_nohpet:
+ kfree(hpet_base.channels);
+ hpet_base.channels = NULL;
+ hpet_base.nr_channels = 0;
hpet_clear_mapping();
hpet_address = 0;
return 0;
}
/*
- * Needs to be late, as the reserve_timer code calls kalloc !
+ * The late initialization runs after the PCI quirks have been invoked
+ * which might have detected a system on which the HPET can be enforced.
+ *
+ * Also, the MSI machinery is not working yet when the HPET is initialized
+ * early.
*
- * Not a problem on i386 as hpet_enable is called from late_time_init,
- * but on x86_64 it is necessary !
+ * If the HPET is enabled, then:
+ *
+ * 1) Reserve one channel for /dev/hpet if CONFIG_HPET=y
+ * 2) Reserve up to num_possible_cpus() channels as per CPU clockevents
+ * 3) Setup /dev/hpet if CONFIG_HPET=y
+ * 4) Register hotplug callbacks when clockevents are available
*/
static __init int hpet_late_init(void)
{
int ret;
- if (boot_hpet_disable)
- return -ENODEV;
-
if (!hpet_address) {
if (!force_hpet_address)
return -ENODEV;
@@ -1031,21 +945,14 @@ static __init int hpet_late_init(void)
if (!hpet_virt_address)
return -ENODEV;
- if (hpet_readl(HPET_ID) & HPET_ID_LEGSUP)
- hpet_msi_capability_lookup(2);
- else
- hpet_msi_capability_lookup(0);
-
- hpet_reserve_platform_timers(hpet_readl(HPET_ID));
+ hpet_select_device_channel();
+ hpet_select_clockevents();
+ hpet_reserve_platform_timers();
hpet_print_config();
- if (hpet_msi_disable)
+ if (!hpet_base.nr_clockevents)
return 0;
- if (boot_cpu_has(X86_FEATURE_ARAT))
- return 0;
-
- /* This notifier should be called after workqueue is ready */
ret = cpuhp_setup_state(CPUHP_AP_X86_HPET_ONLINE, "x86/hpet:online",
hpet_cpuhp_online, NULL);
if (ret)
@@ -1064,47 +971,47 @@ fs_initcall(hpet_late_init);
void hpet_disable(void)
{
- if (is_hpet_capable() && hpet_virt_address) {
- unsigned int cfg = hpet_readl(HPET_CFG), id, last;
-
- if (hpet_boot_cfg)
- cfg = *hpet_boot_cfg;
- else if (hpet_legacy_int_enabled) {
- cfg &= ~HPET_CFG_LEGACY;
- hpet_legacy_int_enabled = false;
- }
- cfg &= ~HPET_CFG_ENABLE;
- hpet_writel(cfg, HPET_CFG);
+ unsigned int i;
+ u32 cfg;
- if (!hpet_boot_cfg)
- return;
+ if (!is_hpet_capable() || !hpet_virt_address)
+ return;
- id = hpet_readl(HPET_ID);
- last = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT);
+ /* Restore boot configuration with the enable bit cleared */
+ cfg = hpet_base.boot_cfg;
+ cfg &= ~HPET_CFG_ENABLE;
+ hpet_writel(cfg, HPET_CFG);
- for (id = 0; id <= last; ++id)
- hpet_writel(hpet_boot_cfg[id + 1], HPET_Tn_CFG(id));
+ /* Restore the channel boot configuration */
+ for (i = 0; i < hpet_base.nr_channels; i++)
+ hpet_writel(hpet_base.channels[i].boot_cfg, HPET_Tn_CFG(i));
- if (*hpet_boot_cfg & HPET_CFG_ENABLE)
- hpet_writel(*hpet_boot_cfg, HPET_CFG);
- }
+ /* If the HPET was enabled at boot time, reenable it */
+ if (hpet_base.boot_cfg & HPET_CFG_ENABLE)
+ hpet_writel(hpet_base.boot_cfg, HPET_CFG);
}
#ifdef CONFIG_HPET_EMULATE_RTC
-/* HPET in LegacyReplacement Mode eats up RTC interrupt line. When, HPET
+/*
+ * HPET in LegacyReplacement mode eats up the RTC interrupt line. When HPET
* is enabled, we support RTC interrupt functionality in software.
+ *
* RTC has 3 kinds of interrupts:
- * 1) Update Interrupt - generate an interrupt, every sec, when RTC clock
- * is updated
- * 2) Alarm Interrupt - generate an interrupt at a specific time of day
- * 3) Periodic Interrupt - generate periodic interrupt, with frequencies
- * 2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2)
- * (1) and (2) above are implemented using polling at a frequency of
- * 64 Hz. The exact frequency is a tradeoff between accuracy and interrupt
- * overhead. (DEFAULT_RTC_INT_FREQ)
- * For (3), we use interrupts at 64Hz or user specified periodic
- * frequency, whichever is higher.
+ *
+ * 1) Update Interrupt - generate an interrupt, every second, when the
+ * RTC clock is updated
+ * 2) Alarm Interrupt - generate an interrupt at a specific time of day
+ * 3) Periodic Interrupt - generate periodic interrupt, with frequencies
+ * 2Hz-8192Hz (2Hz-64Hz for non-root user) (all frequencies in powers of 2)
+ *
+ * (1) and (2) above are implemented using polling at a frequency of 64 Hz:
+ * DEFAULT_RTC_INT_FREQ.
+ *
+ * The exact frequency is a tradeoff between accuracy and interrupt overhead.
+ *
+ * For (3), we use interrupts at 64 Hz, or the user specified periodic frequency,
+ * if it's higher.
*/
#include <linux/mc146818rtc.h>
#include <linux/rtc.h>
@@ -1125,7 +1032,7 @@ static unsigned long hpet_pie_limit;
static rtc_irq_handler irq_handler;
/*
- * Check that the hpet counter c1 is ahead of the c2
+ * Check that the HPET counter c1 is ahead of c2
*/
static inline int hpet_cnt_ahead(u32 c1, u32 c2)
{
@@ -1163,8 +1070,8 @@ void hpet_unregister_irq_handler(rtc_irq_handler handler)
EXPORT_SYMBOL_GPL(hpet_unregister_irq_handler);
/*
- * Timer 1 for RTC emulation. We use one shot mode, as periodic mode
- * is not supported by all HPET implementations for timer 1.
+ * Channel 1 for RTC emulation. We use one shot mode, as periodic mode
+ * is not supported by all HPET implementations for channel 1.
*
* hpet_rtc_timer_init() is called when the rtc is initialized.
*/
@@ -1177,10 +1084,11 @@ int hpet_rtc_timer_init(void)
return 0;
if (!hpet_default_delta) {
+ struct clock_event_device *evt = &hpet_base.channels[0].evt;
uint64_t clc;
- clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC;
- clc >>= hpet_clockevent.shift + DEFAULT_RTC_SHIFT;
+ clc = (uint64_t) evt->mult * NSEC_PER_SEC;
+ clc >>= evt->shift + DEFAULT_RTC_SHIFT;
hpet_default_delta = clc;
}
@@ -1209,6 +1117,7 @@ EXPORT_SYMBOL_GPL(hpet_rtc_timer_init);
static void hpet_disable_rtc_channel(void)
{
u32 cfg = hpet_readl(HPET_T1_CFG);
+
cfg &= ~HPET_TN_ENABLE;
hpet_writel(cfg, HPET_T1_CFG);
}
@@ -1250,8 +1159,7 @@ int hpet_set_rtc_irq_bit(unsigned long bit_mask)
}
EXPORT_SYMBOL_GPL(hpet_set_rtc_irq_bit);
-int hpet_set_alarm_time(unsigned char hrs, unsigned char min,
- unsigned char sec)
+int hpet_set_alarm_time(unsigned char hrs, unsigned char min, unsigned char sec)
{
if (!is_hpet_enabled())
return 0;
@@ -1271,15 +1179,18 @@ int hpet_set_periodic_freq(unsigned long freq)
if (!is_hpet_enabled())
return 0;
- if (freq <= DEFAULT_RTC_INT_FREQ)
+ if (freq <= DEFAULT_RTC_INT_FREQ) {
hpet_pie_limit = DEFAULT_RTC_INT_FREQ / freq;
- else {
- clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC;
+ } else {
+ struct clock_event_device *evt = &hpet_base.channels[0].evt;
+
+ clc = (uint64_t) evt->mult * NSEC_PER_SEC;
do_div(clc, freq);
- clc >>= hpet_clockevent.shift;
+ clc >>= evt->shift;
hpet_pie_delta = clc;
hpet_pie_limit = 0;
}
+
return 1;
}
EXPORT_SYMBOL_GPL(hpet_set_periodic_freq);
@@ -1317,8 +1228,7 @@ static void hpet_rtc_timer_reinit(void)
if (hpet_rtc_flags & RTC_PIE)
hpet_pie_count += lost_ints;
if (printk_ratelimit())
- printk(KERN_WARNING "hpet1: lost %d rtc interrupts\n",
- lost_ints);
+ pr_warn("Lost %d RTC interrupts\n", lost_ints);
}
}
@@ -1340,8 +1250,7 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
hpet_prev_update_sec = curr_time.tm_sec;
}
- if (hpet_rtc_flags & RTC_PIE &&
- ++hpet_pie_count >= hpet_pie_limit) {
+ if (hpet_rtc_flags & RTC_PIE && ++hpet_pie_count >= hpet_pie_limit) {
rtc_int_flag |= RTC_PF;
hpet_pie_count = 0;
}
@@ -1350,7 +1259,7 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
(curr_time.tm_sec == hpet_alarm_time.tm_sec) &&
(curr_time.tm_min == hpet_alarm_time.tm_min) &&
(curr_time.tm_hour == hpet_alarm_time.tm_hour))
- rtc_int_flag |= RTC_AF;
+ rtc_int_flag |= RTC_AF;
if (rtc_int_flag) {
rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8));
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index 0d307a657abb..2b7999a1a50a 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -8,6 +8,7 @@
#include <linux/timex.h>
#include <linux/i8253.h>
+#include <asm/apic.h>
#include <asm/hpet.h>
#include <asm/time.h>
#include <asm/smp.h>
@@ -18,10 +19,32 @@
*/
struct clock_event_device *global_clock_event;
-void __init setup_pit_timer(void)
+/*
+ * Modern chipsets can disable the PIT clock which makes it unusable. It
+ * would be possible to enable the clock but the registers are chipset
+ * specific and not discoverable. Avoid the whack a mole game.
+ *
+ * These platforms have discoverable TSC/CPU frequencies but this also
+ * requires to know the local APIC timer frequency as it normally is
+ * calibrated against the PIT interrupt.
+ */
+static bool __init use_pit(void)
+{
+ if (!IS_ENABLED(CONFIG_X86_TSC) || !boot_cpu_has(X86_FEATURE_TSC))
+ return true;
+
+ /* This also returns true when APIC is disabled */
+ return apic_needs_pit();
+}
+
+bool __init pit_timer_init(void)
{
+ if (!use_pit())
+ return false;
+
clockevent_i8253_init(true);
global_clock_event = &i8253_clockevent;
+ return true;
}
#ifndef CONFIG_X86_64
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index d2482bbbe3d0..87ef69a72c52 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -319,7 +319,8 @@ void __init idt_setup_apic_and_irq_gates(void)
#ifdef CONFIG_X86_LOCAL_APIC
for_each_clear_bit_from(i, system_vectors, NR_VECTORS) {
set_bit(i, system_vectors);
- set_intr_gate(i, spurious_interrupt);
+ entry = spurious_entries_start + 8 * (i - FIRST_SYSTEM_VECTOR);
+ set_intr_gate(i, entry);
}
#endif
}
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 9b68b5b00ac9..cc496eb7a8d2 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -247,7 +247,7 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
if (!handle_irq(desc, regs)) {
ack_APIC_irq();
- if (desc != VECTOR_RETRIGGERED) {
+ if (desc != VECTOR_RETRIGGERED && desc != VECTOR_SHUTDOWN) {
pr_emerg_ratelimited("%s: %d.%d No irq handler for vector\n",
__func__, smp_processor_id(),
vector);
diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c
index 1b2ee55a2dfb..ba95bc70460d 100644
--- a/arch/x86/kernel/jailhouse.c
+++ b/arch/x86/kernel/jailhouse.c
@@ -45,7 +45,7 @@ static void jailhouse_get_wallclock(struct timespec64 *now)
static void __init jailhouse_timer_init(void)
{
- lapic_timer_frequency = setup_data.apic_khz * (1000 / HZ);
+ lapic_timer_period = setup_data.apic_khz * (1000 / HZ);
}
static unsigned long jailhouse_get_tsc(void)
diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c
index 07c30ee17425..bb7e1132290b 100644
--- a/arch/x86/kernel/perf_regs.c
+++ b/arch/x86/kernel/perf_regs.c
@@ -74,6 +74,9 @@ u64 perf_reg_value(struct pt_regs *regs, int idx)
return regs_get_register(regs, pt_regs_offset[idx]);
}
+#define PERF_REG_X86_RESERVED (((1ULL << PERF_REG_X86_XMM0) - 1) & \
+ ~((1ULL << PERF_REG_X86_MAX) - 1))
+
#ifdef CONFIG_X86_32
#define REG_NOSUPPORT ((1ULL << PERF_REG_X86_R8) | \
(1ULL << PERF_REG_X86_R9) | \
@@ -86,7 +89,7 @@ u64 perf_reg_value(struct pt_regs *regs, int idx)
int perf_reg_validate(u64 mask)
{
- if (!mask || (mask & REG_NOSUPPORT))
+ if (!mask || (mask & (REG_NOSUPPORT | PERF_REG_X86_RESERVED)))
return -EINVAL;
return 0;
@@ -112,7 +115,7 @@ void perf_get_regs_user(struct perf_regs *regs_user,
int perf_reg_validate(u64 mask)
{
- if (!mask || (mask & REG_NOSUPPORT))
+ if (!mask || (mask & (REG_NOSUPPORT | PERF_REG_X86_RESERVED)))
return -EINVAL;
return 0;
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index cbac64659dc4..ee9099061d01 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -398,22 +398,12 @@ static int putreg(struct task_struct *child,
case offsetof(struct user_regs_struct,fs_base):
if (value >= TASK_SIZE_MAX)
return -EIO;
- /*
- * When changing the FS base, use do_arch_prctl_64()
- * to set the index to zero and to set the base
- * as requested.
- */
- if (child->thread.fsbase != value)
- return do_arch_prctl_64(child, ARCH_SET_FS, value);
+ x86_fsbase_write_task(child, value);
return 0;
case offsetof(struct user_regs_struct,gs_base):
- /*
- * Exactly the same here as the %fs handling above.
- */
if (value >= TASK_SIZE_MAX)
return -EIO;
- if (child->thread.gsbase != value)
- return do_arch_prctl_64(child, ARCH_SET_GS, value);
+ x86_gsbase_write_task(child, value);
return 0;
#endif
}
@@ -644,10 +634,9 @@ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n)
{
struct thread_struct *thread = &tsk->thread;
unsigned long val = 0;
- int index = n;
if (n < HBP_NUM) {
- index = array_index_nospec(index, HBP_NUM);
+ int index = array_index_nospec(n, HBP_NUM);
struct perf_event *bp = thread->ptrace_bps[index];
if (bp)
@@ -750,9 +739,6 @@ static int ioperm_get(struct task_struct *target,
void ptrace_disable(struct task_struct *child)
{
user_disable_single_step(child);
-#ifdef TIF_SYSCALL_EMU
- clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
-#endif
}
#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 0ff3e294d0e5..10125358b9c4 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -3,6 +3,7 @@
*/
+#include <linux/clocksource.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/notifier.h>
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 4693e2f3a03e..96421f97e75c 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -144,7 +144,7 @@ void native_send_call_func_ipi(const struct cpumask *mask)
}
cpumask_copy(allbutself, cpu_online_mask);
- cpumask_clear_cpu(smp_processor_id(), allbutself);
+ __cpumask_clear_cpu(smp_processor_id(), allbutself);
if (cpumask_equal(mask, allbutself) &&
cpumask_equal(cpu_online_mask, cpu_callout_mask))
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index 0e14f6c0d35e..07c0e960b3f3 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -82,8 +82,11 @@ static void __init setup_default_timer_irq(void)
/* Default timer init function */
void __init hpet_time_init(void)
{
- if (!hpet_enable())
- setup_pit_timer();
+ if (!hpet_enable()) {
+ if (!pit_timer_init())
+ return;
+ }
+
setup_default_timer_irq();
}
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 0b29e58f288e..59b57605e66c 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -632,31 +632,38 @@ unsigned long native_calibrate_tsc(void)
crystal_khz = ecx_hz / 1000;
- if (crystal_khz == 0) {
- switch (boot_cpu_data.x86_model) {
- case INTEL_FAM6_SKYLAKE_MOBILE:
- case INTEL_FAM6_SKYLAKE_DESKTOP:
- case INTEL_FAM6_KABYLAKE_MOBILE:
- case INTEL_FAM6_KABYLAKE_DESKTOP:
- crystal_khz = 24000; /* 24.0 MHz */
- break;
- case INTEL_FAM6_ATOM_GOLDMONT_X:
- crystal_khz = 25000; /* 25.0 MHz */
- break;
- case INTEL_FAM6_ATOM_GOLDMONT:
- crystal_khz = 19200; /* 19.2 MHz */
- break;
- }
- }
+ /*
+ * Denverton SoCs don't report crystal clock, and also don't support
+ * CPUID.0x16 for the calculation below, so hardcode the 25MHz crystal
+ * clock.
+ */
+ if (crystal_khz == 0 &&
+ boot_cpu_data.x86_model == INTEL_FAM6_ATOM_GOLDMONT_X)
+ crystal_khz = 25000;
- if (crystal_khz == 0)
- return 0;
/*
- * TSC frequency determined by CPUID is a "hardware reported"
+ * TSC frequency reported directly by CPUID is a "hardware reported"
* frequency and is the most accurate one so far we have. This
* is considered a known frequency.
*/
- setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
+ if (crystal_khz != 0)
+ setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
+
+ /*
+ * Some Intel SoCs like Skylake and Kabylake don't report the crystal
+ * clock, but we can easily calculate it to a high degree of accuracy
+ * by considering the crystal ratio and the CPU speed.
+ */
+ if (crystal_khz == 0 && boot_cpu_data.cpuid_level >= 0x16) {
+ unsigned int eax_base_mhz, ebx, ecx, edx;
+
+ cpuid(0x16, &eax_base_mhz, &ebx, &ecx, &edx);
+ crystal_khz = eax_base_mhz * 1000 *
+ eax_denominator / ebx_numerator;
+ }
+
+ if (crystal_khz == 0)
+ return 0;
/*
* For Atom SoCs TSC is the only reliable clocksource.
@@ -665,6 +672,16 @@ unsigned long native_calibrate_tsc(void)
if (boot_cpu_data.x86_model == INTEL_FAM6_ATOM_GOLDMONT)
setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
+#ifdef CONFIG_X86_LOCAL_APIC
+ /*
+ * The local APIC appears to be fed by the core crystal clock
+ * (which sounds entirely sensible). We can set the global
+ * lapic_timer_period here to avoid having to calibrate the APIC
+ * timer later.
+ */
+ lapic_timer_period = crystal_khz * 1000 / HZ;
+#endif
+
return crystal_khz * ebx_numerator / eax_denominator;
}
diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c
index 3d0e9aeea7c8..067858fe4db8 100644
--- a/arch/x86/kernel/tsc_msr.c
+++ b/arch/x86/kernel/tsc_msr.c
@@ -71,7 +71,7 @@ static const struct x86_cpu_id tsc_msr_cpu_ids[] = {
/*
* MSR-based CPU/TSC frequency discovery for certain CPUs.
*
- * Set global "lapic_timer_frequency" to bus_clock_cycles/jiffy
+ * Set global "lapic_timer_period" to bus_clock_cycles/jiffy
* Return processor base frequency in KHz, or 0 on failure.
*/
unsigned long cpu_khz_from_msr(void)
@@ -104,7 +104,7 @@ unsigned long cpu_khz_from_msr(void)
res = freq * ratio;
#ifdef CONFIG_X86_LOCAL_APIC
- lapic_timer_frequency = (freq * 1000) / HZ;
+ lapic_timer_period = (freq * 1000) / HZ;
#endif
/*
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index 33b66b5c5aec..72b997eaa1fc 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -82,9 +82,9 @@ static struct orc_entry *orc_find(unsigned long ip);
* But they are copies of the ftrace entries that are static and
* defined in ftrace_*.S, which do have orc entries.
*
- * If the undwinder comes across a ftrace trampoline, then find the
+ * If the unwinder comes across a ftrace trampoline, then find the
* ftrace function that was used to create it, and use that ftrace
- * function's orc entrie, as the placement of the return code in
+ * function's orc entry, as the placement of the return code in
* the stack will be identical.
*/
static struct orc_entry *orc_ftrace_find(unsigned long ip)
@@ -128,6 +128,16 @@ static struct orc_entry null_orc_entry = {
.type = ORC_TYPE_CALL
};
+/* Fake frame pointer entry -- used as a fallback for generated code */
+static struct orc_entry orc_fp_entry = {
+ .type = ORC_TYPE_CALL,
+ .sp_reg = ORC_REG_BP,
+ .sp_offset = 16,
+ .bp_reg = ORC_REG_PREV_SP,
+ .bp_offset = -16,
+ .end = 0,
+};
+
static struct orc_entry *orc_find(unsigned long ip)
{
static struct orc_entry *orc;
@@ -392,8 +402,16 @@ bool unwind_next_frame(struct unwind_state *state)
* calls and calls to noreturn functions.
*/
orc = orc_find(state->signal ? state->ip : state->ip - 1);
- if (!orc)
- goto err;
+ if (!orc) {
+ /*
+ * As a fallback, try to assume this code uses a frame pointer.
+ * This is useful for generated code, like BPF, which ORC
+ * doesn't know about. This is just a guess, so the rest of
+ * the unwind is no longer considered reliable.
+ */
+ orc = &orc_fp_entry;
+ state->error = true;
+ }
/* End-of-stack check for kernel threads: */
if (orc->sp_reg == ORC_REG_UNDEFINED) {
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 9a327d5b6d1f..d78a61408243 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -47,8 +47,6 @@ static const struct cpuid_reg reverse_cpuid[] = {
[CPUID_8000_0001_ECX] = {0x80000001, 0, CPUID_ECX},
[CPUID_7_0_EBX] = { 7, 0, CPUID_EBX},
[CPUID_D_1_EAX] = { 0xd, 1, CPUID_EAX},
- [CPUID_F_0_EDX] = { 0xf, 0, CPUID_EDX},
- [CPUID_F_1_EDX] = { 0xf, 1, CPUID_EDX},
[CPUID_8000_0008_EBX] = {0x80000008, 0, CPUID_EBX},
[CPUID_6_EAX] = { 6, 0, CPUID_EAX},
[CPUID_8000_000A_EDX] = {0x8000000a, 0, CPUID_EDX},
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index a21c440ff356..4dabc318adb8 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -2339,7 +2339,7 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
struct kvm_lapic *apic = vcpu->arch.apic;
u32 ppr;
- if (!apic_enabled(apic))
+ if (!kvm_apic_hw_enabled(apic))
return -1;
__apic_update_ppr(apic, &ppr);
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 132d149494d6..ab73a9a639ae 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -261,10 +261,10 @@ static int kvm_pmu_rdpmc_vmware(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
ctr_val = rdtsc();
break;
case VMWARE_BACKDOOR_PMC_REAL_TIME:
- ctr_val = ktime_get_boot_ns();
+ ctr_val = ktime_get_boottime_ns();
break;
case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
- ctr_val = ktime_get_boot_ns() +
+ ctr_val = ktime_get_boottime_ns() +
vcpu->kvm->arch.kvmclock_offset;
break;
default:
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 5f9c1a200201..46af3a5e9209 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -5240,9 +5240,6 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
vmx = to_vmx(vcpu);
vmcs12 = get_vmcs12(vcpu);
- if (nested_vmx_allowed(vcpu) && vmx->nested.enlightened_vmcs_enabled)
- kvm_state.flags |= KVM_STATE_NESTED_EVMCS;
-
if (nested_vmx_allowed(vcpu) &&
(vmx->nested.vmxon || vmx->nested.smm.vmxon)) {
kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr;
@@ -5251,6 +5248,9 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
if (vmx_has_valid_vmcs12(vcpu)) {
kvm_state.size += sizeof(user_vmx_nested_state->vmcs12);
+ if (vmx->nested.hv_evmcs)
+ kvm_state.flags |= KVM_STATE_NESTED_EVMCS;
+
if (is_guest_mode(vcpu) &&
nested_cpu_has_shadow_vmcs(vmcs12) &&
vmcs12->vmcs_link_pointer != -1ull)
@@ -5350,6 +5350,15 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
if (kvm_state->hdr.vmx.vmcs12_pa != -1ull)
return -EINVAL;
+ /*
+ * KVM_STATE_NESTED_EVMCS used to signal that KVM should
+ * enable eVMCS capability on vCPU. However, since then
+ * code was changed such that flag signals vmcs12 should
+ * be copied into eVMCS in guest memory.
+ *
+ * To preserve backwards compatability, allow user
+ * to set this flag even when there is no VMXON region.
+ */
if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS)
return -EINVAL;
} else {
@@ -5358,7 +5367,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa))
return -EINVAL;
- }
+ }
if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
@@ -5373,20 +5382,21 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
* nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags
* must be zero.
*/
- if (is_smm(vcpu) ? kvm_state->flags : kvm_state->hdr.vmx.smm.flags)
+ if (is_smm(vcpu) ?
+ (kvm_state->flags &
+ (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING))
+ : kvm_state->hdr.vmx.smm.flags)
return -EINVAL;
if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
!(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON))
return -EINVAL;
- vmx_leave_nested(vcpu);
- if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) {
- if (!nested_vmx_allowed(vcpu))
+ if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) &&
+ (!nested_vmx_allowed(vcpu) || !vmx->nested.enlightened_vmcs_enabled))
return -EINVAL;
- nested_enable_evmcs(vcpu, NULL);
- }
+ vmx_leave_nested(vcpu);
if (kvm_state->hdr.vmx.vmxon_pa == -1ull)
return 0;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9857992d4e58..63bb1ee8258e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -67,6 +67,7 @@
#include <asm/mshyperv.h>
#include <asm/hypervisor.h>
#include <asm/intel_pt.h>
+#include <clocksource/hyperv_timer.h>
#define CREATE_TRACE_POINTS
#include "trace.h"
@@ -1554,7 +1555,7 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
vcpu->arch.tsc_always_catchup = 1;
return 0;
} else {
- WARN(1, "user requested TSC rate below hardware speed\n");
+ pr_warn_ratelimited("user requested TSC rate below hardware speed\n");
return -1;
}
}
@@ -1564,8 +1565,8 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
user_tsc_khz, tsc_khz);
if (ratio == 0 || ratio >= kvm_max_tsc_scaling_ratio) {
- WARN_ONCE(1, "Invalid TSC scaling ratio - virtual-tsc-khz=%u\n",
- user_tsc_khz);
+ pr_warn_ratelimited("Invalid TSC scaling ratio - virtual-tsc-khz=%u\n",
+ user_tsc_khz);
return -1;
}
@@ -1728,7 +1729,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
offset = kvm_compute_tsc_offset(vcpu, data);
- ns = ktime_get_boot_ns();
+ ns = ktime_get_boottime_ns();
elapsed = ns - kvm->arch.last_tsc_nsec;
if (vcpu->arch.virtual_tsc_khz) {
@@ -2070,7 +2071,7 @@ u64 get_kvmclock_ns(struct kvm *kvm)
spin_lock(&ka->pvclock_gtod_sync_lock);
if (!ka->use_master_clock) {
spin_unlock(&ka->pvclock_gtod_sync_lock);
- return ktime_get_boot_ns() + ka->kvmclock_offset;
+ return ktime_get_boottime_ns() + ka->kvmclock_offset;
}
hv_clock.tsc_timestamp = ka->master_cycle_now;
@@ -2086,7 +2087,7 @@ u64 get_kvmclock_ns(struct kvm *kvm)
&hv_clock.tsc_to_system_mul);
ret = __pvclock_read_cycles(&hv_clock, rdtsc());
} else
- ret = ktime_get_boot_ns() + ka->kvmclock_offset;
+ ret = ktime_get_boottime_ns() + ka->kvmclock_offset;
put_cpu();
@@ -2185,7 +2186,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
}
if (!use_master_clock) {
host_tsc = rdtsc();
- kernel_ns = ktime_get_boot_ns();
+ kernel_ns = ktime_get_boottime_ns();
}
tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);
@@ -9015,7 +9016,7 @@ int kvm_arch_hardware_enable(void)
* before any KVM threads can be running. Unfortunately, we can't
* bring the TSCs fully up to date with real time, as we aren't yet far
* enough into CPU bringup that we know how much real time has actually
- * elapsed; our helper function, ktime_get_boot_ns() will be using boot
+ * elapsed; our helper function, ktime_get_boottime_ns() will be using boot
* variables that haven't been updated yet.
*
* So we simply find the maximum observed TSC above, then record the
@@ -9243,7 +9244,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
mutex_init(&kvm->arch.apic_map_lock);
spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
- kvm->arch.kvmclock_offset = -ktime_get_boot_ns();
+ kvm->arch.kvmclock_offset = -ktime_get_boottime_ns();
pvclock_update_vm_gtod_copy(kvm);
kvm->arch.guest_can_read_msr_platform_info = true;
diff --git a/arch/x86/lib/cache-smp.c b/arch/x86/lib/cache-smp.c
index 1811fa4a1b1a..7c48ff4ae8d1 100644
--- a/arch/x86/lib/cache-smp.c
+++ b/arch/x86/lib/cache-smp.c
@@ -15,6 +15,7 @@ EXPORT_SYMBOL(wbinvd_on_cpu);
int wbinvd_on_all_cpus(void)
{
- return on_each_cpu(__wbinvd, NULL, 1);
+ on_each_cpu(__wbinvd, NULL, 1);
+ return 0;
}
EXPORT_SYMBOL(wbinvd_on_all_cpus);
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 46df4c6aae46..58e4f1f00bbc 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -710,6 +710,10 @@ static void set_signal_archinfo(unsigned long address,
* To avoid leaking information about the kernel page
* table layout, pretend that user-mode accesses to
* kernel addresses are always protection faults.
+ *
+ * NB: This means that failed vsyscalls with vsyscall=none
+ * will have the PROT bit. This doesn't leak any
+ * information and does not appear to cause any problems.
*/
if (address >= TASK_SIZE_MAX)
error_code |= X86_PF_PROT;
@@ -1369,16 +1373,18 @@ void do_user_addr_fault(struct pt_regs *regs,
#ifdef CONFIG_X86_64
/*
- * Instruction fetch faults in the vsyscall page might need
- * emulation. The vsyscall page is at a high address
- * (>PAGE_OFFSET), but is considered to be part of the user
- * address space.
+ * Faults in the vsyscall page might need emulation. The
+ * vsyscall page is at a high address (>PAGE_OFFSET), but is
+ * considered to be part of the user address space.
*
* The vsyscall page does not have a "real" VMA, so do this
* emulation before we go searching for VMAs.
+ *
+ * PKRU never rejects instruction fetches, so we don't need
+ * to consider the PF_PK bit.
*/
- if ((hw_error_code & X86_PF_INSTR) && is_vsyscall_vaddr(address)) {
- if (emulate_vsyscall(regs, address))
+ if (is_vsyscall_vaddr(address)) {
+ if (emulate_vsyscall(hw_error_code, regs, address))
return;
}
#endif
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 693aaf28d5fe..0f01c7b1d217 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -671,23 +671,25 @@ static unsigned long __meminit
phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end,
unsigned long page_size_mask, bool init)
{
- unsigned long paddr_next, paddr_last = paddr_end;
- unsigned long vaddr = (unsigned long)__va(paddr);
- int i = p4d_index(vaddr);
+ unsigned long vaddr, vaddr_end, vaddr_next, paddr_next, paddr_last;
+
+ paddr_last = paddr_end;
+ vaddr = (unsigned long)__va(paddr);
+ vaddr_end = (unsigned long)__va(paddr_end);
if (!pgtable_l5_enabled())
return phys_pud_init((pud_t *) p4d_page, paddr, paddr_end,
page_size_mask, init);
- for (; i < PTRS_PER_P4D; i++, paddr = paddr_next) {
- p4d_t *p4d;
+ for (; vaddr < vaddr_end; vaddr = vaddr_next) {
+ p4d_t *p4d = p4d_page + p4d_index(vaddr);
pud_t *pud;
- vaddr = (unsigned long)__va(paddr);
- p4d = p4d_page + p4d_index(vaddr);
- paddr_next = (paddr & P4D_MASK) + P4D_SIZE;
+ vaddr_next = (vaddr & P4D_MASK) + P4D_SIZE;
+ paddr = __pa(vaddr);
if (paddr >= paddr_end) {
+ paddr_next = __pa(vaddr_next);
if (!after_bootmem &&
!e820__mapped_any(paddr & P4D_MASK, paddr_next,
E820_TYPE_RAM) &&
@@ -699,13 +701,13 @@ phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end,
if (!p4d_none(*p4d)) {
pud = pud_offset(p4d, 0);
- paddr_last = phys_pud_init(pud, paddr, paddr_end,
- page_size_mask, init);
+ paddr_last = phys_pud_init(pud, paddr, __pa(vaddr_end),
+ page_size_mask, init);
continue;
}
pud = alloc_low_page();
- paddr_last = phys_pud_init(pud, paddr, paddr_end,
+ paddr_last = phys_pud_init(pud, paddr, __pa(vaddr_end),
page_size_mask, init);
spin_lock(&init_mm.page_table_lock);
diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c
index 632b83885867..3b9fd679cea9 100644
--- a/arch/x86/platform/efi/quirks.c
+++ b/arch/x86/platform/efi/quirks.c
@@ -728,7 +728,7 @@ void efi_recover_from_page_fault(unsigned long phys_addr)
* Address range 0x0000 - 0x0fff is always mapped in the efi_pgd, so
* page faulting on these addresses isn't expected.
*/
- if (phys_addr >= 0x0000 && phys_addr <= 0x0fff)
+ if (phys_addr <= 0x0fff)
return;
/*