aboutsummaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig53
-rw-r--r--arch/x86/Makefile8
-rw-r--r--arch/x86/Makefile.um2
-rw-r--r--arch/x86/boot/.gitignore1
-rw-r--r--arch/x86/boot/Makefile44
-rw-r--r--arch/x86/boot/genimage.sh303
-rw-r--r--arch/x86/boot/mtools.conf.in3
-rw-r--r--arch/x86/crypto/curve25519-x86_64.c2
-rw-r--r--arch/x86/entry/Makefile10
-rw-r--r--arch/x86/entry/calling.h45
-rw-r--r--arch/x86/entry/common.c92
-rw-r--r--arch/x86/entry/entry_64.S9
-rw-r--r--arch/x86/entry/syscall_32.c20
-rw-r--r--arch/x86/entry/syscall_64.c17
-rw-r--r--arch/x86/entry/syscall_x32.c35
-rw-r--r--arch/x86/entry/syscalls/Makefile38
-rw-r--r--arch/x86/entry/syscalls/syscall_32.tbl3
-rw-r--r--arch/x86/entry/syscalls/syscall_64.tbl3
-rw-r--r--arch/x86/entry/syscalls/syscallhdr.sh35
-rw-r--r--arch/x86/entry/syscalls/syscalltbl.sh46
-rw-r--r--arch/x86/events/core.c28
-rw-r--r--arch/x86/events/intel/core.c23
-rw-r--r--arch/x86/events/intel/cstate.c23
-rw-r--r--arch/x86/events/intel/ds.c20
-rw-r--r--arch/x86/events/intel/lbr.c9
-rw-r--r--arch/x86/events/intel/uncore.c4
-rw-r--r--arch/x86/events/intel/uncore.h1
-rw-r--r--arch/x86/events/intel/uncore_snbep.c179
-rw-r--r--arch/x86/events/perf_event.h1
-rw-r--r--arch/x86/events/rapl.c6
-rw-r--r--arch/x86/hyperv/hv_init.c47
-rw-r--r--arch/x86/ia32/ia32_aout.c4
-rw-r--r--arch/x86/include/asm/Kbuild1
-rw-r--r--arch/x86/include/asm/asm.h27
-rw-r--r--arch/x86/include/asm/atomic.h2
-rw-r--r--arch/x86/include/asm/barrier.h7
-rw-r--r--arch/x86/include/asm/cpufeatures.h3
-rw-r--r--arch/x86/include/asm/crash.h6
-rw-r--r--arch/x86/include/asm/desc.h25
-rw-r--r--arch/x86/include/asm/elf.h4
-rw-r--r--arch/x86/include/asm/fpu/internal.h222
-rw-r--r--arch/x86/include/asm/fpu/signal.h2
-rw-r--r--arch/x86/include/asm/fpu/xstate.h78
-rw-r--r--arch/x86/include/asm/idtentry.h33
-rw-r--r--arch/x86/include/asm/intel-family.h3
-rw-r--r--arch/x86/include/asm/irq_vectors.h7
-rw-r--r--arch/x86/include/asm/jump_label.h79
-rw-r--r--arch/x86/include/asm/mce.h13
-rw-r--r--arch/x86/include/asm/msr-index.h4
-rw-r--r--arch/x86/include/asm/nops.h24
-rw-r--r--arch/x86/include/asm/page.h6
-rw-r--r--arch/x86/include/asm/page_64.h2
-rw-r--r--arch/x86/include/asm/perf_event.h1
-rw-r--r--arch/x86/include/asm/pgalloc.h2
-rw-r--r--arch/x86/include/asm/pgtable.h65
-rw-r--r--arch/x86/include/asm/pgtable_types.h2
-rw-r--r--arch/x86/include/asm/pkeys.h9
-rw-r--r--arch/x86/include/asm/pkru.h62
-rw-r--r--arch/x86/include/asm/preempt.h2
-rw-r--r--arch/x86/include/asm/processor.h10
-rw-r--r--arch/x86/include/asm/sev-common.h16
-rw-r--r--arch/x86/include/asm/sgx.h2
-rw-r--r--arch/x86/include/asm/sigframe.h2
-rw-r--r--arch/x86/include/asm/special_insns.h14
-rw-r--r--arch/x86/include/asm/stackprotector.h2
-rw-r--r--arch/x86/include/asm/syscall.h13
-rw-r--r--arch/x86/include/asm/syscall_wrapper.h10
-rw-r--r--arch/x86/include/asm/unaligned.h15
-rw-r--r--arch/x86/include/asm/unistd.h8
-rw-r--r--arch/x86/include/uapi/asm/auxvec.h4
-rw-r--r--arch/x86/include/uapi/asm/hwcap2.h6
-rw-r--r--arch/x86/kernel/Makefile1
-rw-r--r--arch/x86/kernel/acpi/boot.c118
-rw-r--r--arch/x86/kernel/acpi/cstate.c3
-rw-r--r--arch/x86/kernel/alternative.c6
-rw-r--r--arch/x86/kernel/amd_nb.c3
-rw-r--r--arch/x86/kernel/cpu/amd.c4
-rw-r--r--arch/x86/kernel/cpu/common.c77
-rw-r--r--arch/x86/kernel/cpu/cpu.h2
-rw-r--r--arch/x86/kernel/cpu/hygon.c4
-rw-r--r--arch/x86/kernel/cpu/intel.c46
-rw-r--r--arch/x86/kernel/cpu/mce/amd.c55
-rw-r--r--arch/x86/kernel/cpu/mce/apei.c3
-rw-r--r--arch/x86/kernel/cpu/mce/core.c13
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c3
-rw-r--r--arch/x86/kernel/cpu/resctrl/internal.h21
-rw-r--r--arch/x86/kernel/cpu/resctrl/pseudo_lock.c10
-rw-r--r--arch/x86/kernel/cpu/sgx/encl.c2
-rw-r--r--arch/x86/kernel/cpu/sgx/encl.h4
-rw-r--r--arch/x86/kernel/cpu/tsx.c37
-rw-r--r--arch/x86/kernel/crash.c13
-rw-r--r--arch/x86/kernel/dumpstack.c2
-rw-r--r--arch/x86/kernel/early-quirks.c2
-rw-r--r--arch/x86/kernel/fpu/core.c282
-rw-r--r--arch/x86/kernel/fpu/init.c15
-rw-r--r--arch/x86/kernel/fpu/regset.c223
-rw-r--r--arch/x86/kernel/fpu/signal.c446
-rw-r--r--arch/x86/kernel/fpu/xstate.c685
-rw-r--r--arch/x86/kernel/head_64.S6
-rw-r--r--arch/x86/kernel/idt.c45
-rw-r--r--arch/x86/kernel/jump_label.c81
-rw-r--r--arch/x86/kernel/kprobes/core.c26
-rw-r--r--arch/x86/kernel/machine_kexec_32.c15
-rw-r--r--arch/x86/kernel/machine_kexec_64.c33
-rw-r--r--arch/x86/kernel/process.c26
-rw-r--r--arch/x86/kernel/process_64.c28
-rw-r--r--arch/x86/kernel/ptrace.c2
-rw-r--r--arch/x86/kernel/reboot.c2
-rw-r--r--arch/x86/kernel/setup.c51
-rw-r--r--arch/x86/kernel/setup_percpu.c6
-rw-r--r--arch/x86/kernel/sev.c201
-rw-r--r--arch/x86/kernel/signal.c92
-rw-r--r--arch/x86/kernel/smpboot.c4
-rw-r--r--arch/x86/kernel/trace.c234
-rw-r--r--arch/x86/kernel/traps.c14
-rw-r--r--arch/x86/kernel/tsc.c4
-rw-r--r--arch/x86/kernel/umip.c10
-rw-r--r--arch/x86/kvm/Kconfig5
-rw-r--r--arch/x86/kvm/mmu/mmu.c2
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c2
-rw-r--r--arch/x86/kvm/svm/sev.c1
-rw-r--r--arch/x86/kvm/x86.c56
-rw-r--r--arch/x86/lib/insn-eval.c30
-rw-r--r--arch/x86/lib/retpoline.S4
-rw-r--r--arch/x86/math-emu/fpu_proto.h2
-rw-r--r--arch/x86/math-emu/load_store.c2
-rw-r--r--arch/x86/math-emu/reg_ld_str.c2
-rw-r--r--arch/x86/mm/extable.c2
-rw-r--r--arch/x86/mm/fault.c6
-rw-r--r--arch/x86/mm/init_32.c4
-rw-r--r--arch/x86/mm/init_64.c9
-rw-r--r--arch/x86/mm/pat/set_memory.c4
-rw-r--r--arch/x86/mm/pgtable.c36
-rw-r--r--arch/x86/mm/pkeys.c26
-rw-r--r--arch/x86/mm/tlb.c10
-rw-r--r--arch/x86/net/bpf_jit_comp.c49
-rw-r--r--arch/x86/pci/mmconfig-shared.c10
-rw-r--r--arch/x86/platform/efi/efi.c2
-rw-r--r--arch/x86/purgatory/purgatory.c2
-rw-r--r--arch/x86/realmode/Makefile1
-rw-r--r--arch/x86/um/sys_call_table_32.c14
-rw-r--r--arch/x86/um/sys_call_table_64.c15
-rw-r--r--arch/x86/xen/enlighten.c1
-rw-r--r--arch/x86/xen/enlighten_pv.c2
144 files changed, 2795 insertions, 2371 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 0045e1b44190..49270655e827 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -33,6 +33,7 @@ config X86_64
select NEED_DMA_MAP_STATE
select SWIOTLB
select ARCH_HAS_ELFCORE_COMPAT
+ select ZONE_DMA32
config FORCE_DYNAMIC_FTRACE
def_bool y
@@ -63,7 +64,7 @@ config X86
select ARCH_ENABLE_HUGEPAGE_MIGRATION if X86_64 && HUGETLB_PAGE && MIGRATION
select ARCH_ENABLE_MEMORY_HOTPLUG if X86_64 || (X86_32 && HIGHMEM)
select ARCH_ENABLE_MEMORY_HOTREMOVE if MEMORY_HOTPLUG
- select ARCH_ENABLE_SPLIT_PMD_PTLOCK if X86_64 || X86_PAE
+ select ARCH_ENABLE_SPLIT_PMD_PTLOCK if (PGTABLE_LEVELS > 2) && (X86_64 || X86_PAE)
select ARCH_ENABLE_THP_MIGRATION if X86_64 && TRANSPARENT_HUGEPAGE
select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI
select ARCH_HAS_CACHE_LINE_SIZE
@@ -93,6 +94,7 @@ config X86
select ARCH_HAS_SYSCALL_WRAPPER
select ARCH_HAS_UBSAN_SANITIZE_ALL
select ARCH_HAS_DEBUG_WX
+ select ARCH_HAS_ZONE_DMA_SET if EXPERT
select ARCH_HAVE_NMI_SAFE_CMPXCHG
select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
select ARCH_MIGHT_HAVE_PC_PARPORT
@@ -103,8 +105,8 @@ config X86
select ARCH_SUPPORTS_DEBUG_PAGEALLOC
select ARCH_SUPPORTS_NUMA_BALANCING if X86_64
select ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP if NR_CPUS <= 4096
- select ARCH_SUPPORTS_LTO_CLANG if X86_64
- select ARCH_SUPPORTS_LTO_CLANG_THIN if X86_64
+ select ARCH_SUPPORTS_LTO_CLANG
+ select ARCH_SUPPORTS_LTO_CLANG_THIN
select ARCH_USE_BUILTIN_BSWAP
select ARCH_USE_MEMTEST
select ARCH_USE_QUEUED_RWLOCKS
@@ -113,6 +115,7 @@ config X86
select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
select ARCH_WANT_DEFAULT_BPF_JIT if X86_64
select ARCH_WANTS_DYNAMIC_TASK_STRUCT
+ select ARCH_WANTS_NO_INSTR
select ARCH_WANT_HUGE_PMD_SHARE
select ARCH_WANT_LD_ORPHAN_WARN
select ARCH_WANTS_THP_SWAP if X86_64
@@ -343,9 +346,6 @@ config ARCH_SUSPEND_POSSIBLE
config ARCH_WANT_GENERAL_HUGETLB
def_bool y
-config ZONE_DMA32
- def_bool y if X86_64
-
config AUDIT_ARCH
def_bool y if X86_64
@@ -393,16 +393,6 @@ config CC_HAS_SANE_STACKPROTECTOR
menu "Processor type and features"
-config ZONE_DMA
- bool "DMA memory allocation support" if EXPERT
- default y
- help
- DMA memory allocation support allows devices with less than 32-bit
- addressing to allocate within the first 16MB of address space.
- Disable if no such devices will be used.
-
- If unsure, say Y.
-
config SMP
bool "Symmetric multi-processing support"
help
@@ -1597,7 +1587,7 @@ config NODES_SHIFT
default "10" if MAXSMP
default "6" if X86_64
default "3"
- depends on NEED_MULTIPLE_NODES
+ depends on NUMA
help
Specify the maximum number of NUMA Nodes available on the target
system. Increases memory reserved to accommodate various tables.
@@ -1693,35 +1683,6 @@ config X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK
Set whether the default state of memory_corruption_check is
on or off.
-config X86_RESERVE_LOW
- int "Amount of low memory, in kilobytes, to reserve for the BIOS"
- default 64
- range 4 640
- help
- Specify the amount of low memory to reserve for the BIOS.
-
- The first page contains BIOS data structures that the kernel
- must not use, so that page must always be reserved.
-
- By default we reserve the first 64K of physical RAM, as a
- number of BIOSes are known to corrupt that memory range
- during events such as suspend/resume or monitor cable
- insertion, so it must not be used by the kernel.
-
- You can set this to 4 if you are absolutely sure that you
- trust the BIOS to get all its memory reservations and usages
- right. If you know your BIOS have problems beyond the
- default 64K area, you can set this to 640 to avoid using the
- entire low memory range.
-
- If you have doubts about the BIOS (e.g. suspend/resume does
- not work or there's kernel crashes after certain hardware
- hotplug events) then you might want to enable
- X86_CHECK_BIOS_CORRUPTION=y to allow the kernel to check
- typical corruption patterns.
-
- Leave this to the default value of 64 if you are unsure.
-
config MATH_EMULATION
bool
depends on MODIFY_LDT_SYSCALL
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index cb5e8d39cac1..307fd0000a83 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -240,9 +240,6 @@ head-y += arch/x86/kernel/platform-quirks.o
libs-y += arch/x86/lib/
-# See arch/x86/Kbuild for content of core part of the kernel
-core-y += arch/x86/
-
# drivers-y are linked after core-y
drivers-$(CONFIG_MATH_EMULATION) += arch/x86/math-emu/
drivers-$(CONFIG_PCI) += arch/x86/pci/
@@ -257,7 +254,7 @@ drivers-$(CONFIG_FB) += arch/x86/video/
boot := arch/x86/boot
-BOOT_TARGETS = bzdisk fdimage fdimage144 fdimage288 isoimage
+BOOT_TARGETS = bzdisk fdimage fdimage144 fdimage288 hdimage isoimage
PHONY += bzImage $(BOOT_TARGETS)
@@ -315,8 +312,9 @@ define archhelp
echo ' fdimage - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)'
echo ' fdimage144 - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)'
echo ' fdimage288 - Create 2.8MB boot floppy image (arch/x86/boot/fdimage)'
+ echo ' hdimage - Create a BIOS/EFI hard disk image (arch/x86/boot/hdimage)'
echo ' isoimage - Create a boot CD-ROM image (arch/x86/boot/image.iso)'
- echo ' bzdisk/fdimage*/isoimage also accept:'
+ echo ' bzdisk/fdimage*/hdimage/isoimage also accept:'
echo ' FDARGS="..." arguments for the booted kernel'
echo ' FDINITRD=file initrd for the booted kernel'
echo ''
diff --git a/arch/x86/Makefile.um b/arch/x86/Makefile.um
index 1db7913795f5..b3c1ae084180 100644
--- a/arch/x86/Makefile.um
+++ b/arch/x86/Makefile.um
@@ -44,7 +44,7 @@ ELF_FORMAT := elf64-x86-64
# Not on all 64-bit distros /lib is a symlink to /lib64. PLD is an example.
-LINK-$(CONFIG_LD_SCRIPT_DYN) += -Wl,-rpath,/lib64
+LINK-$(CONFIG_LD_SCRIPT_DYN_RPATH) += -Wl,-rpath,/lib64
LINK-y += -m64
endif
diff --git a/arch/x86/boot/.gitignore b/arch/x86/boot/.gitignore
index 9cc7f1357b9b..1189be057ebd 100644
--- a/arch/x86/boot/.gitignore
+++ b/arch/x86/boot/.gitignore
@@ -11,3 +11,4 @@ setup.elf
fdimage
mtools.conf
image.iso
+hdimage
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index fe605205b4ce..dfbc26a8e924 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -29,7 +29,7 @@ KCOV_INSTRUMENT := n
SVGA_MODE := -DSVGA_MODE=NORMAL_VGA
targets := vmlinux.bin setup.bin setup.elf bzImage
-targets += fdimage fdimage144 fdimage288 image.iso mtools.conf
+targets += fdimage fdimage144 fdimage288 image.iso hdimage
subdir- := compressed
setup-y += a20.o bioscall.o cmdline.o copy.o cpu.o cpuflags.o cpucheck.o
@@ -115,47 +115,49 @@ $(obj)/compressed/vmlinux: FORCE
$(Q)$(MAKE) $(build)=$(obj)/compressed $@
# Set this if you want to pass append arguments to the
-# bzdisk/fdimage/isoimage kernel
+# bzdisk/fdimage/hdimage/isoimage kernel
FDARGS =
-# Set this if you want an initrd included with the
-# bzdisk/fdimage/isoimage kernel
+# Set this if you want one or more initrds included in the image
FDINITRD =
-image_cmdline = default linux $(FDARGS) $(if $(FDINITRD),initrd=initrd.img,)
+imgdeps = $(obj)/bzImage $(obj)/mtools.conf $(src)/genimage.sh
$(obj)/mtools.conf: $(src)/mtools.conf.in
sed -e 's|@OBJ@|$(obj)|g' < $< > $@
+targets += mtools.conf
+
+# genimage.sh requires bash, but it also has a bunch of other
+# external dependencies.
quiet_cmd_genimage = GENIMAGE $3
-cmd_genimage = sh $(srctree)/$(src)/genimage.sh $2 $3 $(obj)/bzImage \
- $(obj)/mtools.conf '$(image_cmdline)' $(FDINITRD)
+cmd_genimage = $(BASH) $(srctree)/$(src)/genimage.sh $2 $3 $(obj)/bzImage \
+ $(obj)/mtools.conf '$(FDARGS)' $(FDINITRD)
-PHONY += bzdisk fdimage fdimage144 fdimage288 isoimage bzlilo install
+PHONY += bzdisk fdimage fdimage144 fdimage288 hdimage isoimage install
# This requires write access to /dev/fd0
-bzdisk: $(obj)/bzImage $(obj)/mtools.conf
+# All images require syslinux to be installed; hdimage also requires
+# EDK2/OVMF if the kernel is compiled with the EFI stub.
+bzdisk: $(imgdeps)
$(call cmd,genimage,bzdisk,/dev/fd0)
-# These require being root or having syslinux 2.02 or higher installed
-fdimage fdimage144: $(obj)/bzImage $(obj)/mtools.conf
+fdimage fdimage144: $(imgdeps)
$(call cmd,genimage,fdimage144,$(obj)/fdimage)
@$(kecho) 'Kernel: $(obj)/fdimage is ready'
-fdimage288: $(obj)/bzImage $(obj)/mtools.conf
+fdimage288: $(imgdeps)
$(call cmd,genimage,fdimage288,$(obj)/fdimage)
@$(kecho) 'Kernel: $(obj)/fdimage is ready'
-isoimage: $(obj)/bzImage
+hdimage: $(imgdeps)
+ $(call cmd,genimage,hdimage,$(obj)/hdimage)
+ @$(kecho) 'Kernel: $(obj)/hdimage is ready'
+
+isoimage: $(imgdeps)
$(call cmd,genimage,isoimage,$(obj)/image.iso)
@$(kecho) 'Kernel: $(obj)/image.iso is ready'
-bzlilo:
- if [ -f $(INSTALL_PATH)/vmlinuz ]; then mv $(INSTALL_PATH)/vmlinuz $(INSTALL_PATH)/vmlinuz.old; fi
- if [ -f $(INSTALL_PATH)/System.map ]; then mv $(INSTALL_PATH)/System.map $(INSTALL_PATH)/System.old; fi
- cat $(obj)/bzImage > $(INSTALL_PATH)/vmlinuz
- cp System.map $(INSTALL_PATH)/
- if [ -x /sbin/lilo ]; then /sbin/lilo; else /etc/lilo/install; fi
-
install:
- sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $(obj)/bzImage \
+ $(CONFIG_SHELL) $(srctree)/$(src)/install.sh \
+ $(KERNELRELEASE) $(obj)/bzImage \
System.map "$(INSTALL_PATH)"
diff --git a/arch/x86/boot/genimage.sh b/arch/x86/boot/genimage.sh
index 6a10d52a4145..0673fdfc1a11 100644
--- a/arch/x86/boot/genimage.sh
+++ b/arch/x86/boot/genimage.sh
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
#
# This file is subject to the terms and conditions of the GNU General Public
# License. See the file "COPYING" in the main directory of this archive
@@ -8,15 +8,24 @@
#
# Adapted from code in arch/x86/boot/Makefile by H. Peter Anvin and others
#
-# "make fdimage/fdimage144/fdimage288/isoimage" script for x86 architecture
+# "make fdimage/fdimage144/fdimage288/hdimage/isoimage"
+# script for x86 architecture
#
# Arguments:
-# $1 - fdimage format
-# $2 - target image file
-# $3 - kernel bzImage file
-# $4 - mtool configuration file
-# $5 - kernel cmdline
-# $6 - inird image file
+# $1 - fdimage format
+# $2 - target image file
+# $3 - kernel bzImage file
+# $4 - mtools configuration file
+# $5 - kernel cmdline
+# $6+ - initrd image file(s)
+#
+# This script requires:
+# bash
+# syslinux
+# mtools (for fdimage* and hdimage)
+# edk2/OVMF (for hdimage)
+#
+# Otherwise try to stick to POSIX shell commands...
#
# Use "make V=1" to debug this script
@@ -26,105 +35,237 @@ case "${KBUILD_VERBOSE}" in
;;
esac
-verify () {
- if [ ! -f "$1" ]; then
- echo "" 1>&2
- echo " *** Missing file: $1" 1>&2
- echo "" 1>&2
- exit 1
+# Exit the top-level shell with an error
+topshell=$$
+trap 'exit 1' USR1
+die() {
+ echo "" 1>&2
+ echo " *** $*" 1>&2
+ echo "" 1>&2
+ kill -USR1 $topshell
+}
+
+# Verify the existence and readability of a file
+verify() {
+ if [ ! -f "$1" -o ! -r "$1" ]; then
+ die "Missing file: $1"
fi
}
+diskfmt="$1"
+FIMAGE="$2"
+FBZIMAGE="$3"
+MTOOLSRC="$4"
+KCMDLINE="$5"
+shift 5 # Remaining arguments = initrd files
+
+export MTOOLSRC
-export MTOOLSRC=$4
-FIMAGE=$2
-FBZIMAGE=$3
-KCMDLINE=$5
-FDINITRD=$6
+# common options for dd
+dd='dd iflag=fullblock'
# Make sure the files actually exist
verify "$FBZIMAGE"
-genbzdisk() {
- verify "$MTOOLSRC"
- mformat a:
- syslinux $FIMAGE
- echo "$KCMDLINE" | mcopy - a:syslinux.cfg
- if [ -f "$FDINITRD" ] ; then
- mcopy "$FDINITRD" a:initrd.img
+declare -a FDINITRDS
+irdpfx=' initrd='
+initrdopts_syslinux=''
+initrdopts_efi=''
+for f in "$@"; do
+ if [ -f "$f" -a -r "$f" ]; then
+ FDINITRDS=("${FDINITRDS[@]}" "$f")
+ fname="$(basename "$f")"
+ initrdopts_syslinux="${initrdopts_syslinux}${irdpfx}${fname}"
+ irdpfx=,
+ initrdopts_efi="${initrdopts_efi} initrd=${fname}"
fi
- mcopy $FBZIMAGE a:linux
+done
+
+# Read a $3-byte littleendian unsigned value at offset $2 from file $1
+le() {
+ local n=0
+ local m=1
+ for b in $(od -A n -v -j $2 -N $3 -t u1 "$1"); do
+ n=$((n + b*m))
+ m=$((m * 256))
+ done
+ echo $n
}
-genfdimage144() {
- verify "$MTOOLSRC"
- dd if=/dev/zero of=$FIMAGE bs=1024 count=1440 2> /dev/null
- mformat v:
- syslinux $FIMAGE
- echo "$KCMDLINE" | mcopy - v:syslinux.cfg
- if [ -f "$FDINITRD" ] ; then
- mcopy "$FDINITRD" v:initrd.img
- fi
- mcopy $FBZIMAGE v:linux
+# Get the EFI architecture name such that boot{name}.efi is the default
+# boot file name. Returns false with no output if the file is not an
+# EFI image or otherwise unknown.
+efiarch() {
+ [ -f "$1" ] || return
+ [ $(le "$1" 0 2) -eq 23117 ] || return # MZ magic
+ peoffs=$(le "$1" 60 4) # PE header offset
+ [ $peoffs -ge 64 ] || return
+ [ $(le "$1" $peoffs 4) -eq 17744 ] || return # PE magic
+ case $(le "$1" $((peoffs+4+20)) 2) in # PE type
+ 267) ;; # PE32
+ 523) ;; # PE32+
+ *) return 1 ;; # Invalid
+ esac
+ [ $(le "$1" $((peoffs+4+20+68)) 2) -eq 10 ] || return # EFI app
+ case $(le "$1" $((peoffs+4)) 2) in # Machine type
+ 332) echo i386 ;;
+ 450) echo arm ;;
+ 512) echo ia64 ;;
+ 20530) echo riscv32 ;;
+ 20580) echo riscv64 ;;
+ 20776) echo riscv128 ;;
+ 34404) echo x64 ;;
+ 43620) echo aa64 ;;
+ esac
}
-genfdimage288() {
- verify "$MTOOLSRC"
- dd if=/dev/zero of=$FIMAGE bs=1024 count=2880 2> /dev/null
- mformat w:
- syslinux $FIMAGE
- echo "$KCMDLINE" | mcopy - W:syslinux.cfg
- if [ -f "$FDINITRD" ] ; then
- mcopy "$FDINITRD" w:initrd.img
- fi
- mcopy $FBZIMAGE w:linux
+# Get the combined sizes in bytes of the files given, counting sparse
+# files as full length, and padding each file to a 4K block size
+filesizes() {
+ local t=0
+ local s
+ for s in $(ls -lnL "$@" 2>/dev/null | awk '/^-/{ print $5; }'); do
+ t=$((t + ((s+4095)/4096)*4096))
+ done
+ echo $t
}
-geniso() {
- tmp_dir=`dirname $FIMAGE`/isoimage
- rm -rf $tmp_dir
- mkdir $tmp_dir
- for i in lib lib64 share ; do
- for j in syslinux ISOLINUX ; do
- if [ -f /usr/$i/$j/isolinux.bin ] ; then
- isolinux=/usr/$i/$j/isolinux.bin
- fi
+# Expand directory names which should be in /usr/share into a list
+# of possible alternatives
+sharedirs() {
+ local dir file
+ for dir in /usr/share /usr/lib64 /usr/lib; do
+ for file; do
+ echo "$dir/$file"
+ echo "$dir/${file^^}"
done
- for j in syslinux syslinux/modules/bios ; do
- if [ -f /usr/$i/$j/ldlinux.c32 ]; then
- ldlinux=/usr/$i/$j/ldlinux.c32
- fi
+ done
+}
+efidirs() {
+ local dir file
+ for dir in /usr/share /boot /usr/lib64 /usr/lib; do
+ for file; do
+ echo "$dir/$file"
+ echo "$dir/${file^^}"
done
- if [ -n "$isolinux" -a -n "$ldlinux" ] ; then
- break
+ done
+}
+
+findsyslinux() {
+ local f="$(find -L $(sharedirs syslinux isolinux) \
+ -name "$1" -readable -type f -print -quit 2>/dev/null)"
+ if [ ! -f "$f" ]; then
+ die "Need a $1 file, please install syslinux/isolinux."
+ fi
+ echo "$f"
+ return 0
+}
+
+findovmf() {
+ local arch="$1"
+ shift
+ local -a names=(-false)
+ local name f
+ for name; do
+ names=("${names[@]}" -or -iname "$name")
+ done
+ for f in $(find -L $(efidirs edk2 ovmf) \
+ \( "${names[@]}" \) -readable -type f \
+ -print 2>/dev/null); do
+ if [ "$(efiarch "$f")" = "$arch" ]; then
+ echo "$f"
+ return 0
fi
done
- if [ -z "$isolinux" ] ; then
- echo 'Need an isolinux.bin file, please install syslinux/isolinux.'
- exit 1
+ die "Need a $1 file for $arch, please install EDK2/OVMF."
+}
+
+do_mcopy() {
+ if [ ${#FDINITRDS[@]} -gt 0 ]; then
+ mcopy "${FDINITRDS[@]}" "$1"
+ fi
+ if [ -n "$efishell" ]; then
+ mmd "$1"EFI "$1"EFI/Boot
+ mcopy "$efishell" "$1"EFI/Boot/boot${kefiarch}.efi
fi
- if [ -z "$ldlinux" ] ; then
- echo 'Need an ldlinux.c32 file, please install syslinux/isolinux.'
- exit 1
+ if [ -n "$kefiarch" ]; then
+ echo linux "$KCMDLINE$initrdopts_efi" | \
+ mcopy - "$1"startup.nsh
fi
- cp $isolinux $tmp_dir
- cp $ldlinux $tmp_dir
- cp $FBZIMAGE $tmp_dir/linux
- echo "$KCMDLINE" > $tmp_dir/isolinux.cfg
- if [ -f "$FDINITRD" ] ; then
- cp "$FDINITRD" $tmp_dir/initrd.img
+ echo default linux "$KCMDLINE$initrdopts_syslinux" | \
+ mcopy - "$1"syslinux.cfg
+ mcopy "$FBZIMAGE" "$1"linux
+}
+
+genbzdisk() {
+ verify "$MTOOLSRC"
+ mformat -v 'LINUX_BOOT' a:
+ syslinux "$FIMAGE"
+ do_mcopy a:
+}
+
+genfdimage144() {
+ verify "$MTOOLSRC"
+ $dd if=/dev/zero of="$FIMAGE" bs=1024 count=1440 2>/dev/null
+ mformat -v 'LINUX_BOOT' v:
+ syslinux "$FIMAGE"
+ do_mcopy v:
+}
+
+genfdimage288() {
+ verify "$MTOOLSRC"
+ $dd if=/dev/zero of="$FIMAGE" bs=1024 count=2880 2>/dev/null
+ mformat -v 'LINUX_BOOT' w:
+ syslinux "$FIMAGE"
+ do_mcopy w:
+}
+
+genhdimage() {
+ verify "$MTOOLSRC"
+ mbr="$(findsyslinux mbr.bin)"
+ kefiarch="$(efiarch "$FBZIMAGE")"
+ if [ -n "$kefiarch" ]; then
+ # The efishell provides command line handling
+ efishell="$(findovmf $kefiarch shell.efi shell${kefiarch}.efi)"
+ ptype='-T 0xef' # EFI system partition, no GPT
fi
- genisoimage -J -r -input-charset=utf-8 -quiet -o $FIMAGE \
- -b isolinux.bin -c boot.cat -no-emul-boot -boot-load-size 4 \
- -boot-info-table $tmp_dir
- isohybrid $FIMAGE 2>/dev/null || true
- rm -rf $tmp_dir
+ sizes=$(filesizes "$FBZIMAGE" "${FDINITRDS[@]}" "$efishell")
+ # Allow 1% + 1 MiB for filesystem and partition table overhead,
+ # syslinux, and config files
+ megs=$(((sizes + sizes/100 + 2*1024*1024 - 1)/(1024*1024)))
+ $dd if=/dev/zero of="$FIMAGE" bs=$((1024*1024)) count=$megs 2>/dev/null
+ mpartition -I -c -s 32 -h 64 -t $megs $ptype -b 512 -a h:
+ $dd if="$mbr" of="$FIMAGE" bs=440 count=1 conv=notrunc 2>/dev/null
+ mformat -v 'LINUX_BOOT' -s 32 -h 64 -t $megs h:
+ syslinux --offset $((512*512)) "$FIMAGE"
+ do_mcopy h:
+}
+
+geniso() {
+ tmp_dir="$(dirname "$FIMAGE")/isoimage"
+ rm -rf "$tmp_dir"
+ mkdir "$tmp_dir"
+ isolinux=$(findsyslinux isolinux.bin)
+ ldlinux=$(findsyslinux ldlinux.c32)
+ cp "$isolinux" "$ldlinux" "$tmp_dir"
+ cp "$FBZIMAGE" "$tmp_dir"/linux
+ echo default linux "$KCMDLINE" > "$tmp_dir"/isolinux.cfg
+ cp "${FDINITRDS[@]}" "$tmp_dir"/
+ genisoimage -J -r -appid 'LINUX_BOOT' -input-charset=utf-8 \
+ -quiet -o "$FIMAGE" -b isolinux.bin \
+ -c boot.cat -no-emul-boot -boot-load-size 4 \
+ -boot-info-table "$tmp_dir"
+ isohybrid "$FIMAGE" 2>/dev/null || true
+ rm -rf "$tmp_dir"
}
-case $1 in
+rm -f "$FIMAGE"
+
+case "$diskfmt" in
bzdisk) genbzdisk;;
fdimage144) genfdimage144;;
fdimage288) genfdimage288;;
+ hdimage) genhdimage;;
isoimage) geniso;;
- *) echo 'Unknown image format'; exit 1;
+ *) die "Unknown image format: $diskfmt";;
esac
diff --git a/arch/x86/boot/mtools.conf.in b/arch/x86/boot/mtools.conf.in
index efd6d2490c1d..9e2662d01364 100644
--- a/arch/x86/boot/mtools.conf.in
+++ b/arch/x86/boot/mtools.conf.in
@@ -14,4 +14,7 @@ drive v:
drive w:
file="@OBJ@/fdimage" cylinders=80 heads=2 sectors=36 filter
+# Hard disk
+drive h:
+ file="@OBJ@/hdimage" partition=1 mformat_only
diff --git a/arch/x86/crypto/curve25519-x86_64.c b/arch/x86/crypto/curve25519-x86_64.c
index 6706b6cb1d0f..38caf61cd5b7 100644
--- a/arch/x86/crypto/curve25519-x86_64.c
+++ b/arch/x86/crypto/curve25519-x86_64.c
@@ -1500,7 +1500,7 @@ static int __init curve25519_mod_init(void)
static void __exit curve25519_mod_exit(void)
{
if (IS_REACHABLE(CONFIG_CRYPTO_KPP) &&
- (boot_cpu_has(X86_FEATURE_BMI2) || boot_cpu_has(X86_FEATURE_ADX)))
+ static_branch_likely(&curve25519_use_bmi2_adx))
crypto_unregister_kpp(&curve25519_alg);
}
diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile
index 08bf95dbc911..7fec5dcf6438 100644
--- a/arch/x86/entry/Makefile
+++ b/arch/x86/entry/Makefile
@@ -8,18 +8,8 @@ UBSAN_SANITIZE := n
KCOV_INSTRUMENT := n
CFLAGS_REMOVE_common.o = $(CC_FLAGS_FTRACE)
-CFLAGS_REMOVE_syscall_64.o = $(CC_FLAGS_FTRACE)
-CFLAGS_REMOVE_syscall_32.o = $(CC_FLAGS_FTRACE)
-CFLAGS_REMOVE_syscall_x32.o = $(CC_FLAGS_FTRACE)
CFLAGS_common.o += -fno-stack-protector
-CFLAGS_syscall_64.o += -fno-stack-protector
-CFLAGS_syscall_32.o += -fno-stack-protector
-CFLAGS_syscall_x32.o += -fno-stack-protector
-
-CFLAGS_syscall_64.o += $(call cc-option,-Wno-override-init,)
-CFLAGS_syscall_32.o += $(call cc-option,-Wno-override-init,)
-CFLAGS_syscall_x32.o += $(call cc-option,-Wno-override-init,)
obj-y := entry_$(BITS).o thunk_$(BITS).o syscall_$(BITS).o
obj-y += common.o
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 07a9331d55e7..a4c061fb7c6e 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -6,6 +6,7 @@
#include <asm/percpu.h>
#include <asm/asm-offsets.h>
#include <asm/processor-flags.h>
+#include <asm/ptrace-abi.h>
/*
@@ -62,42 +63,7 @@ For 32-bit we have the following conventions - kernel is built with
* for assembly code:
*/
-/* The layout forms the "struct pt_regs" on the stack: */
-/*
- * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
- * unless syscall needs a complete, fully filled "struct pt_regs".
- */
-#define R15 0*8
-#define R14 1*8
-#define R13 2*8
-#define R12 3*8
-#define RBP 4*8
-#define RBX 5*8
-/* These regs are callee-clobbered. Always saved on kernel entry. */
-#define R11 6*8
-#define R10 7*8
-#define R9 8*8
-#define R8 9*8
-#define RAX 10*8
-#define RCX 11*8
-#define RDX 12*8
-#define RSI 13*8
-#define RDI 14*8
-/*
- * On syscall entry, this is syscall#. On CPU exception, this is error code.
- * On hw interrupt, it's IRQ number:
- */
-#define ORIG_RAX 15*8
-/* Return frame for iretq */
-#define RIP 16*8
-#define CS 17*8
-#define EFLAGS 18*8
-#define RSP 19*8
-#define SS 20*8
-
-#define SIZEOF_PTREGS 21*8
-
-.macro PUSH_AND_CLEAR_REGS rdx=%rdx rax=%rax save_ret=0
+.macro PUSH_REGS rdx=%rdx rax=%rax save_ret=0
.if \save_ret
pushq %rsi /* pt_regs->si */
movq 8(%rsp), %rsi /* temporarily store the return address in %rsi */
@@ -124,7 +90,9 @@ For 32-bit we have the following conventions - kernel is built with
.if \save_ret
pushq %rsi /* return address on top of stack */
.endif
+.endm
+.macro CLEAR_REGS
/*
* Sanitize registers of values that a speculation attack might
* otherwise want to exploit. The lower registers are likely clobbered
@@ -146,6 +114,11 @@ For 32-bit we have the following conventions - kernel is built with
.endm
+.macro PUSH_AND_CLEAR_REGS rdx=%rdx rax=%rax save_ret=0
+ PUSH_REGS rdx=\rdx, rax=\rax, save_ret=\save_ret
+ CLEAR_REGS
+.endm
+
.macro POP_REGS pop_rdi=1 skip_r11rcx=0
popq %r15
popq %r14
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 7b2542b13ebd..6c2826417b33 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -36,61 +36,97 @@
#include <asm/irq_stack.h>
#ifdef CONFIG_X86_64
-__visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs)
+
+static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr)
+{
+ /*
+ * Convert negative numbers to very high and thus out of range
+ * numbers for comparisons.
+ */
+ unsigned int unr = nr;
+
+ if (likely(unr < NR_syscalls)) {
+ unr = array_index_nospec(unr, NR_syscalls);
+ regs->ax = sys_call_table[unr](regs);
+ return true;
+ }
+ return false;
+}
+
+static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr)
+{
+ /*
+ * Adjust the starting offset of the table, and convert numbers
+ * < __X32_SYSCALL_BIT to very high and thus out of range
+ * numbers for comparisons.
+ */
+ unsigned int xnr = nr - __X32_SYSCALL_BIT;
+
+ if (IS_ENABLED(CONFIG_X86_X32_ABI) && likely(xnr < X32_NR_syscalls)) {
+ xnr = array_index_nospec(xnr, X32_NR_syscalls);
+ regs->ax = x32_sys_call_table[xnr](regs);
+ return true;
+ }
+ return false;
+}
+
+__visible noinstr void do_syscall_64(struct pt_regs *regs, int nr)
{
add_random_kstack_offset();
nr = syscall_enter_from_user_mode(regs, nr);
instrumentation_begin();
- if (likely(nr < NR_syscalls)) {
- nr = array_index_nospec(nr, NR_syscalls);
- regs->ax = sys_call_table[nr](regs);
-#ifdef CONFIG_X86_X32_ABI
- } else if (likely((nr & __X32_SYSCALL_BIT) &&
- (nr & ~__X32_SYSCALL_BIT) < X32_NR_syscalls)) {
- nr = array_index_nospec(nr & ~__X32_SYSCALL_BIT,
- X32_NR_syscalls);
- regs->ax = x32_sys_call_table[nr](regs);
-#endif
+
+ if (!do_syscall_x64(regs, nr) && !do_syscall_x32(regs, nr) && nr != -1) {
+ /* Invalid system call, but still a system call. */
+ regs->ax = __x64_sys_ni_syscall(regs);
}
+
instrumentation_end();
syscall_exit_to_user_mode(regs);
}
#endif
#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
-static __always_inline unsigned int syscall_32_enter(struct pt_regs *regs)
+static __always_inline int syscall_32_enter(struct pt_regs *regs)
{
if (IS_ENABLED(CONFIG_IA32_EMULATION))
current_thread_info()->status |= TS_COMPAT;
- return (unsigned int)regs->orig_ax;
+ return (int)regs->orig_ax;
}
/*
* Invoke a 32-bit syscall. Called with IRQs on in CONTEXT_KERNEL.
*/
-static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs,
- unsigned int nr)
+static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, int nr)
{
- if (likely(nr < IA32_NR_syscalls)) {
- nr = array_index_nospec(nr, IA32_NR_syscalls);
- regs->ax = ia32_sys_call_table[nr](regs);
+ /*
+ * Convert negative numbers to very high and thus out of range
+ * numbers for comparisons.
+ */
+ unsigned int unr = nr;
+
+ if (likely(unr < IA32_NR_syscalls)) {
+ unr = array_index_nospec(unr, IA32_NR_syscalls);
+ regs->ax = ia32_sys_call_table[unr](regs);
+ } else if (nr != -1) {
+ regs->ax = __ia32_sys_ni_syscall(regs);
}
}
/* Handles int $0x80 */
__visible noinstr void do_int80_syscall_32(struct pt_regs *regs)
{
- unsigned int nr = syscall_32_enter(regs);
+ int nr = syscall_32_enter(regs);
add_random_kstack_offset();
/*
- * Subtlety here: if ptrace pokes something larger than 2^32-1 into
- * orig_ax, the unsigned int return value truncates it. This may
- * or may not be necessary, but it matches the old asm behavior.
+ * Subtlety here: if ptrace pokes something larger than 2^31-1 into
+ * orig_ax, the int return value truncates it. This matches
+ * the semantics of syscall_get_nr().
*/
- nr = (unsigned int)syscall_enter_from_user_mode(regs, nr);
+ nr = syscall_enter_from_user_mode(regs, nr);
instrumentation_begin();
do_syscall_32_irqs_on(regs, nr);
@@ -101,7 +137,7 @@ __visible noinstr void do_int80_syscall_32(struct pt_regs *regs)
static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
{
- unsigned int nr = syscall_32_enter(regs);
+ int nr = syscall_32_enter(regs);
int res;
add_random_kstack_offset();
@@ -130,14 +166,13 @@ static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
/* User code screwed up. */
regs->ax = -EFAULT;
- instrumentation_end();
local_irq_disable();
+ instrumentation_end();
irqentry_exit_to_user_mode(regs);
return false;
}
- /* The case truncates any ptrace induced syscall nr > 2^32 -1 */
- nr = (unsigned int)syscall_enter_from_user_mode_work(regs, nr);
+ nr = syscall_enter_from_user_mode_work(regs, nr);
/* Now this is just like a normal syscall. */
do_syscall_32_irqs_on(regs, nr);
@@ -269,15 +304,16 @@ __visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs)
irqentry_state_t state = irqentry_enter(regs);
bool inhcall;
+ instrumentation_begin();
run_sysvec_on_irqstack_cond(__xen_pv_evtchn_do_upcall, regs);
inhcall = get_and_clear_inhcall();
if (inhcall && !WARN_ON_ONCE(state.exit_rcu)) {
- instrumentation_begin();
irqentry_exit_cond_resched();
instrumentation_end();
restore_inhcall(inhcall);
} else {
+ instrumentation_end();
irqentry_exit(regs, state);
}
}
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index a16a5294d55f..e38a4cf795d9 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -107,8 +107,9 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
PUSH_AND_CLEAR_REGS rax=$-ENOSYS
/* IRQs are off. */
- movq %rax, %rdi
- movq %rsp, %rsi
+ movq %rsp, %rdi
+ /* Sign extend the lower 32bit as syscall numbers are treated as int */
+ movslq %eax, %rsi
call do_syscall_64 /* returns with IRQs disabled */
/*
@@ -506,7 +507,7 @@ SYM_CODE_START(\asmsym)
movq %rsp, %rdi /* pt_regs pointer */
- call \cfunc
+ call kernel_\cfunc
/*
* No need to switch back to the IST stack. The current stack is either
@@ -517,7 +518,7 @@ SYM_CODE_START(\asmsym)
/* Switch to the regular task stack */
.Lfrom_usermode_switch_stack_\@:
- idtentry_body safe_stack_\cfunc, has_error_code=1
+ idtentry_body user_\cfunc, has_error_code=1
_ASM_NOKPROBE(\asmsym)
SYM_CODE_END(\asmsym)
diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c
index 86eb0d89d46f..8cfc9bc73e7f 100644
--- a/arch/x86/entry/syscall_32.c
+++ b/arch/x86/entry/syscall_32.c
@@ -5,21 +5,21 @@
#include <linux/sys.h>
#include <linux/cache.h>
#include <linux/syscalls.h>
-#include <asm/unistd.h>
#include <asm/syscall.h>
-#define __SYSCALL_I386(nr, sym) extern long __ia32_##sym(const struct pt_regs *);
+#ifdef CONFIG_IA32_EMULATION
+#define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, compat)
+#else
+#define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, native)
+#endif
+
+#define __SYSCALL(nr, sym) extern long __ia32_##sym(const struct pt_regs *);
#include <asm/syscalls_32.h>
-#undef __SYSCALL_I386
+#undef __SYSCALL
-#define __SYSCALL_I386(nr, sym) [nr] = __ia32_##sym,
+#define __SYSCALL(nr, sym) __ia32_##sym,
-__visible const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = {
- /*
- * Smells like a compiler bug -- it doesn't work
- * when the & below is removed.
- */
- [0 ... __NR_ia32_syscall_max] = &__ia32_sys_ni_syscall,
+__visible const sys_call_ptr_t ia32_sys_call_table[] = {
#include <asm/syscalls_32.h>
};
diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c
index 1594ec72bcbb..be120eec1fc9 100644
--- a/arch/x86/entry/syscall_64.c
+++ b/arch/x86/entry/syscall_64.c
@@ -5,23 +5,14 @@
#include <linux/sys.h>
#include <linux/cache.h>
#include <linux/syscalls.h>
-#include <asm/unistd.h>
#include <asm/syscall.h>
-#define __SYSCALL_X32(nr, sym)
-#define __SYSCALL_COMMON(nr, sym) __SYSCALL_64(nr, sym)
-
-#define __SYSCALL_64(nr, sym) extern long __x64_##sym(const struct pt_regs *);
+#define __SYSCALL(nr, sym) extern long __x64_##sym(const struct pt_regs *);
#include <asm/syscalls_64.h>
-#undef __SYSCALL_64
+#undef __SYSCALL
-#define __SYSCALL_64(nr, sym) [nr] = __x64_##sym,
+#define __SYSCALL(nr, sym) __x64_##sym,
-asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
- /*
- * Smells like a compiler bug -- it doesn't work
- * when the & below is removed.
- */
- [0 ... __NR_syscall_max] = &__x64_sys_ni_syscall,
+asmlinkage const sys_call_ptr_t sys_call_table[] = {
#include <asm/syscalls_64.h>
};
diff --git a/arch/x86/entry/syscall_x32.c b/arch/x86/entry/syscall_x32.c
index f2fe0a33bcfd..bdd0e03a1265 100644
--- a/arch/x86/entry/syscall_x32.c
+++ b/arch/x86/entry/syscall_x32.c
@@ -5,37 +5,14 @@
#include <linux/sys.h>
#include <linux/cache.h>
#include <linux/syscalls.h>
-#include <asm/unistd.h>
#include <asm/syscall.h>
-/*
- * Reuse the 64-bit entry points for the x32 versions that occupy different
- * slots in the syscall table.
- */
-#define __x32_sys_readv __x64_sys_readv
-#define __x32_sys_writev __x64_sys_writev
-#define __x32_sys_getsockopt __x64_sys_getsockopt
-#define __x32_sys_setsockopt __x64_sys_setsockopt
-#define __x32_sys_vmsplice __x64_sys_vmsplice
-#define __x32_sys_process_vm_readv __x64_sys_process_vm_readv
-#define __x32_sys_process_vm_writev __x64_sys_process_vm_writev
+#define __SYSCALL(nr, sym) extern long __x64_##sym(const struct pt_regs *);
+#include <asm/syscalls_x32.h>
+#undef __SYSCALL
-#define __SYSCALL_64(nr, sym)
+#define __SYSCALL(nr, sym) __x64_##sym,
-#define __SYSCALL_X32(nr, sym) extern long __x32_##sym(const struct pt_regs *);
-#define __SYSCALL_COMMON(nr, sym) extern long __x64_##sym(const struct pt_regs *);
-#include <asm/syscalls_64.h>
-#undef __SYSCALL_X32
-#undef __SYSCALL_COMMON
-
-#define __SYSCALL_X32(nr, sym) [nr] = __x32_##sym,
-#define __SYSCALL_COMMON(nr, sym) [nr] = __x64_##sym,
-
-asmlinkage const sys_call_ptr_t x32_sys_call_table[__NR_x32_syscall_max+1] = {
- /*
- * Smells like a compiler bug -- it doesn't work
- * when the & below is removed.
- */
- [0 ... __NR_x32_syscall_max] = &__x64_sys_ni_syscall,
-#include <asm/syscalls_64.h>
+asmlinkage const sys_call_ptr_t x32_sys_call_table[] = {
+#include <asm/syscalls_x32.h>
};
diff --git a/arch/x86/entry/syscalls/Makefile b/arch/x86/entry/syscalls/Makefile
index d8c4f6c9eadc..5b3efed0e4e8 100644
--- a/arch/x86/entry/syscalls/Makefile
+++ b/arch/x86/entry/syscalls/Makefile
@@ -9,47 +9,54 @@ _dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)') \
syscall32 := $(src)/syscall_32.tbl
syscall64 := $(src)/syscall_64.tbl
-syshdr := $(srctree)/$(src)/syscallhdr.sh
-systbl := $(srctree)/$(src)/syscalltbl.sh
+syshdr := $(srctree)/scripts/syscallhdr.sh
+systbl := $(srctree)/scripts/syscalltbl.sh
+offset :=
+prefix :=
quiet_cmd_syshdr = SYSHDR $@
- cmd_syshdr = $(CONFIG_SHELL) '$(syshdr)' '$<' '$@' \
- '$(syshdr_abi_$(basetarget))' \
- '$(syshdr_pfx_$(basetarget))' \
- '$(syshdr_offset_$(basetarget))'
+ cmd_syshdr = $(CONFIG_SHELL) $(syshdr) --abis $(abis) --emit-nr \
+ $(if $(offset),--offset $(offset)) \
+ $(if $(prefix),--prefix $(prefix)) \
+ $< $@
quiet_cmd_systbl = SYSTBL $@
- cmd_systbl = $(CONFIG_SHELL) '$(systbl)' $< $@
+ cmd_systbl = $(CONFIG_SHELL) $(systbl) --abis $(abis) $< $@
quiet_cmd_hypercalls = HYPERCALLS $@
cmd_hypercalls = $(CONFIG_SHELL) '$<' $@ $(filter-out $<, $(real-prereqs))
-syshdr_abi_unistd_32 := i386
+$(uapi)/unistd_32.h: abis := i386
$(uapi)/unistd_32.h: $(syscall32) $(syshdr) FORCE
$(call if_changed,syshdr)
-syshdr_abi_unistd_32_ia32 := i386
-syshdr_pfx_unistd_32_ia32 := ia32_
+$(out)/unistd_32_ia32.h: abis := i386
+$(out)/unistd_32_ia32.h: prefix := ia32_
$(out)/unistd_32_ia32.h: $(syscall32) $(syshdr) FORCE
$(call if_changed,syshdr)
-syshdr_abi_unistd_x32 := common,x32
-syshdr_offset_unistd_x32 := __X32_SYSCALL_BIT
+$(uapi)/unistd_x32.h: abis := common,x32
+$(uapi)/unistd_x32.h: offset := __X32_SYSCALL_BIT
$(uapi)/unistd_x32.h: $(syscall64) $(syshdr) FORCE
$(call if_changed,syshdr)
-syshdr_abi_unistd_64 := common,64
+$(uapi)/unistd_64.h: abis := common,64
$(uapi)/unistd_64.h: $(syscall64) $(syshdr) FORCE
$(call if_changed,syshdr)
-syshdr_abi_unistd_64_x32 := x32
-syshdr_pfx_unistd_64_x32 := x32_
+$(out)/unistd_64_x32.h: abis := x32
+$(out)/unistd_64_x32.h: prefix := x32_
$(out)/unistd_64_x32.h: $(syscall64) $(syshdr) FORCE
$(call if_changed,syshdr)
+$(out)/syscalls_32.h: abis := i386
$(out)/syscalls_32.h: $(syscall32) $(systbl) FORCE
$(call if_changed,systbl)
+$(out)/syscalls_64.h: abis := common,64
$(out)/syscalls_64.h: $(syscall64) $(systbl) FORCE
$(call if_changed,systbl)
+$(out)/syscalls_x32.h: abis := common,x32
+$(out)/syscalls_x32.h: $(syscall64) $(systbl) FORCE
+ $(call if_changed,systbl)
$(out)/xen-hypercalls.h: $(srctree)/scripts/xen-hypercalls.sh FORCE
$(call if_changed,hypercalls)
@@ -60,6 +67,7 @@ uapisyshdr-y += unistd_32.h unistd_64.h unistd_x32.h
syshdr-y += syscalls_32.h
syshdr-$(CONFIG_X86_64) += unistd_32_ia32.h unistd_64_x32.h
syshdr-$(CONFIG_X86_64) += syscalls_64.h
+syshdr-$(CONFIG_X86_X32) += syscalls_x32.h
syshdr-$(CONFIG_XEN) += xen-hypercalls.h
uapisyshdr-y := $(addprefix $(uapi)/, $(uapisyshdr-y))
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 4bbc267fb36b..ce763a12311c 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -447,7 +447,8 @@
440 i386 process_madvise sys_process_madvise
441 i386 epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2
442 i386 mount_setattr sys_mount_setattr
-# 443 reserved for quotactl_path
+443 i386 quotactl_fd sys_quotactl_fd
444 i386 landlock_create_ruleset sys_landlock_create_ruleset
445 i386 landlock_add_rule sys_landlock_add_rule
446 i386 landlock_restrict_self sys_landlock_restrict_self
+447 i386 memfd_secret sys_memfd_secret
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index ce18119ea0d0..f6b57799c1ea 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -364,10 +364,11 @@
440 common process_madvise sys_process_madvise
441 common epoll_pwait2 sys_epoll_pwait2
442 common mount_setattr sys_mount_setattr
-# 443 reserved for quotactl_path
+443 common quotactl_fd sys_quotactl_fd
444 common landlock_create_ruleset sys_landlock_create_ruleset
445 common landlock_add_rule sys_landlock_add_rule
446 common landlock_restrict_self sys_landlock_restrict_self
+447 common memfd_secret sys_memfd_secret
#
# Due to a historical design error, certain syscalls are numbered differently
diff --git a/arch/x86/entry/syscalls/syscallhdr.sh b/arch/x86/entry/syscalls/syscallhdr.sh
deleted file mode 100644
index cc1e63857427..000000000000
--- a/arch/x86/entry/syscalls/syscallhdr.sh
+++ /dev/null
@@ -1,35 +0,0 @@
-#!/bin/sh
-# SPDX-License-Identifier: GPL-2.0
-
-in="$1"
-out="$2"
-my_abis=`echo "($3)" | tr ',' '|'`
-prefix="$4"
-offset="$5"
-
-fileguard=_ASM_X86_`basename "$out" | sed \
- -e 'y/abcdefghijklmnopqrstuvwxyz/ABCDEFGHIJKLMNOPQRSTUVWXYZ/' \
- -e 's/[^A-Z0-9_]/_/g' -e 's/__/_/g'`
-grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | (
- echo "#ifndef ${fileguard}"
- echo "#define ${fileguard} 1"
- echo ""
-
- max=0
- while read nr abi name entry ; do
- if [ -z "$offset" ]; then
- echo "#define __NR_${prefix}${name} $nr"
- else
- echo "#define __NR_${prefix}${name} ($offset + $nr)"
- fi
-
- max=$nr
- done
-
- echo ""
- echo "#ifdef __KERNEL__"
- echo "#define __NR_${prefix}syscall_max $max"
- echo "#endif"
- echo ""
- echo "#endif /* ${fileguard} */"
-) > "$out"
diff --git a/arch/x86/entry/syscalls/syscalltbl.sh b/arch/x86/entry/syscalls/syscalltbl.sh
deleted file mode 100644
index 929bde120d6b..000000000000
--- a/arch/x86/entry/syscalls/syscalltbl.sh
+++ /dev/null
@@ -1,46 +0,0 @@
-#!/bin/bash
-# SPDX-License-Identifier: GPL-2.0
-
-in="$1"
-out="$2"
-
-syscall_macro() {
- local abi="$1"
- local nr="$2"
- local entry="$3"
-
- echo "__SYSCALL_${abi}($nr, $entry)"
-}
-
-emit() {
- local abi="$1"
- local nr="$2"
- local entry="$3"
- local compat="$4"
-
- if [ "$abi" != "I386" -a -n "$compat" ]; then
- echo "a compat entry ($abi: $compat) for a 64-bit syscall makes no sense" >&2
- exit 1
- fi
-
- if [ -z "$compat" ]; then
- if [ -n "$entry" ]; then
- syscall_macro "$abi" "$nr" "$entry"
- fi
- else
- echo "#ifdef CONFIG_X86_32"
- if [ -n "$entry" ]; then
- syscall_macro "$abi" "$nr" "$entry"
- fi
- echo "#else"
- syscall_macro "$abi" "$nr" "$compat"
- echo "#endif"
- fi
-}
-
-grep '^[0-9]' "$in" | sort -n | (
- while read nr abi name entry compat; do
- abi=`echo "$abi" | tr '[a-z]' '[A-Z]'`
- emit "$abi" "$nr" "$entry" "$compat"
- done
-) > "$out"
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 8f71dd72ef95..1eb45139fcc6 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -1626,6 +1626,8 @@ static void x86_pmu_del(struct perf_event *event, int flags)
if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
goto do_del;
+ __set_bit(event->hw.idx, cpuc->dirty);
+
/*
* Not a TXN, therefore cleanup properly.
*/
@@ -2474,6 +2476,31 @@ static int x86_pmu_event_init(struct perf_event *event)
return err;
}
+void perf_clear_dirty_counters(void)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ int i;
+
+ /* Don't need to clear the assigned counter. */
+ for (i = 0; i < cpuc->n_events; i++)
+ __clear_bit(cpuc->assign[i], cpuc->dirty);
+
+ if (bitmap_empty(cpuc->dirty, X86_PMC_IDX_MAX))
+ return;
+
+ for_each_set_bit(i, cpuc->dirty, X86_PMC_IDX_MAX) {
+ /* Metrics and fake events don't have corresponding HW counters. */
+ if (is_metric_idx(i) || (i == INTEL_PMC_IDX_FIXED_VLBR))
+ continue;
+ else if (i >= INTEL_PMC_IDX_FIXED)
+ wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + (i - INTEL_PMC_IDX_FIXED), 0);
+ else
+ wrmsrl(x86_pmu_event_addr(i), 0);
+ }
+
+ bitmap_zero(cpuc->dirty, X86_PMC_IDX_MAX);
+}
+
static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm)
{
if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
@@ -2497,7 +2524,6 @@ static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm)
static void x86_pmu_event_unmapped(struct perf_event *event, struct mm_struct *mm)
{
-
if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
return;
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index e28892270c58..fca7a6e2242f 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -280,6 +280,8 @@ static struct extra_reg intel_spr_extra_regs[] __read_mostly = {
INTEL_UEVENT_EXTRA_REG(0x012b, MSR_OFFCORE_RSP_1, 0x3fffffffffull, RSP_1),
INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
INTEL_UEVENT_EXTRA_REG(0x01c6, MSR_PEBS_FRONTEND, 0x7fff17, FE),
+ INTEL_UEVENT_EXTRA_REG(0x40ad, MSR_PEBS_FRONTEND, 0x7, FE),
+ INTEL_UEVENT_EXTRA_REG(0x04c2, MSR_PEBS_FRONTEND, 0x8, FE),
EVENT_EXTRA_END
};
@@ -4030,8 +4032,10 @@ spr_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
* The :ppp indicates the Precise Distribution (PDist) facility, which
* is only supported on the GP counter 0. If a :ppp event which is not
* available on the GP counter 0, error out.
+ * Exception: Instruction PDIR is only available on the fixed counter 0.
*/
- if (event->attr.precise_ip == 3) {
+ if ((event->attr.precise_ip == 3) &&
+ !constraint_match(&fixed0_constraint, event->hw.config)) {
if (c->idxmsk64 & BIT_ULL(0))
return &counter0_constraint;
@@ -6015,7 +6019,13 @@ __init int intel_pmu_init(void)
tsx_attr = hsw_tsx_events_attrs;
intel_pmu_pebs_data_source_skl(pmem);
- if (boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT)) {
+ /*
+ * Processors with CPUID.RTM_ALWAYS_ABORT have TSX deprecated by default.
+ * TSX force abort hooks are not required on these systems. Only deploy
+ * workaround when microcode has not enabled X86_FEATURE_RTM_ALWAYS_ABORT.
+ */
+ if (boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT) &&
+ !boot_cpu_has(X86_FEATURE_RTM_ALWAYS_ABORT)) {
x86_pmu.flags |= PMU_FL_TFA;
x86_pmu.get_event_constraints = tfa_get_event_constraints;
x86_pmu.enable_all = intel_tfa_pmu_enable_all;
@@ -6157,8 +6167,13 @@ __init int intel_pmu_init(void)
pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_CORE_IDX];
pmu->name = "cpu_core";
pmu->cpu_type = hybrid_big;
- pmu->num_counters = x86_pmu.num_counters + 2;
- pmu->num_counters_fixed = x86_pmu.num_counters_fixed + 1;
+ if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) {
+ pmu->num_counters = x86_pmu.num_counters + 2;
+ pmu->num_counters_fixed = x86_pmu.num_counters_fixed + 1;
+ } else {
+ pmu->num_counters = x86_pmu.num_counters;
+ pmu->num_counters_fixed = x86_pmu.num_counters_fixed;
+ }
pmu->max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, pmu->num_counters);
pmu->unconstrained = (struct event_constraint)
__EVENT_CONSTRAINT(0, (1ULL << pmu->num_counters) - 1,
diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c
index 433399069e27..c6262b154c3a 100644
--- a/arch/x86/events/intel/cstate.c
+++ b/arch/x86/events/intel/cstate.c
@@ -40,7 +40,7 @@
* Model specific counters:
* MSR_CORE_C1_RES: CORE C1 Residency Counter
* perf code: 0x00
- * Available model: SLM,AMT,GLM,CNL,TNT,ADL
+ * Available model: SLM,AMT,GLM,CNL,ICX,TNT,ADL
* Scope: Core (each processor core has a MSR)
* MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter
* perf code: 0x01
@@ -50,8 +50,8 @@
* MSR_CORE_C6_RESIDENCY: CORE C6 Residency Counter
* perf code: 0x02
* Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,
- * SKL,KNL,GLM,CNL,KBL,CML,ICL,TGL,
- * TNT,RKL,ADL
+ * SKL,KNL,GLM,CNL,KBL,CML,ICL,ICX,
+ * TGL,TNT,RKL,ADL
* Scope: Core
* MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter
* perf code: 0x03
@@ -61,7 +61,7 @@
* MSR_PKG_C2_RESIDENCY: Package C2 Residency Counter.
* perf code: 0x00
* Available model: SNB,IVB,HSW,BDW,SKL,KNL,GLM,CNL,
- * KBL,CML,ICL,TGL,TNT,RKL,ADL
+ * KBL,CML,ICL,ICX,TGL,TNT,RKL,ADL
* Scope: Package (physical package)
* MSR_PKG_C3_RESIDENCY: Package C3 Residency Counter.
* perf code: 0x01
@@ -72,8 +72,8 @@
* MSR_PKG_C6_RESIDENCY: Package C6 Residency Counter.
* perf code: 0x02
* Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,
- * SKL,KNL,GLM,CNL,KBL,CML,ICL,TGL,
- * TNT,RKL,ADL
+ * SKL,KNL,GLM,CNL,KBL,CML,ICL,ICX,
+ * TGL,TNT,RKL,ADL
* Scope: Package (physical package)
* MSR_PKG_C7_RESIDENCY: Package C7 Residency Counter.
* perf code: 0x03
@@ -566,6 +566,14 @@ static const struct cstate_model icl_cstates __initconst = {
BIT(PERF_CSTATE_PKG_C10_RES),
};
+static const struct cstate_model icx_cstates __initconst = {
+ .core_events = BIT(PERF_CSTATE_CORE_C1_RES) |
+ BIT(PERF_CSTATE_CORE_C6_RES),
+
+ .pkg_events = BIT(PERF_CSTATE_PKG_C2_RES) |
+ BIT(PERF_CSTATE_PKG_C6_RES),
+};
+
static const struct cstate_model adl_cstates __initconst = {
.core_events = BIT(PERF_CSTATE_CORE_C1_RES) |
BIT(PERF_CSTATE_CORE_C6_RES) |
@@ -664,6 +672,9 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, &icl_cstates),
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE, &icl_cstates),
+ X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &icx_cstates),
+ X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &icx_cstates),
+
X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, &icl_cstates),
X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, &icl_cstates),
X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE, &icl_cstates),
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 1ec8fd311f38..8647713276a7 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -1187,6 +1187,9 @@ static void intel_pmu_pebs_via_pt_enable(struct perf_event *event)
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct hw_perf_event *hwc = &event->hw;
struct debug_store *ds = cpuc->ds;
+ u64 value = ds->pebs_event_reset[hwc->idx];
+ u32 base = MSR_RELOAD_PMC0;
+ unsigned int idx = hwc->idx;
if (!is_pebs_pt(event))
return;
@@ -1196,7 +1199,12 @@ static void intel_pmu_pebs_via_pt_enable(struct perf_event *event)
cpuc->pebs_enabled |= PEBS_OUTPUT_PT;
- wrmsrl(MSR_RELOAD_PMC0 + hwc->idx, ds->pebs_event_reset[hwc->idx]);
+ if (hwc->idx >= INTEL_PMC_IDX_FIXED) {
+ base = MSR_RELOAD_FIXED_CTR0;
+ idx = hwc->idx - INTEL_PMC_IDX_FIXED;
+ value = ds->pebs_event_reset[MAX_PEBS_EVENTS + idx];
+ }
+ wrmsrl(base + idx, value);
}
void intel_pmu_pebs_enable(struct perf_event *event)
@@ -1204,6 +1212,7 @@ void intel_pmu_pebs_enable(struct perf_event *event)
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct hw_perf_event *hwc = &event->hw;
struct debug_store *ds = cpuc->ds;
+ unsigned int idx = hwc->idx;
hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
@@ -1222,19 +1231,18 @@ void intel_pmu_pebs_enable(struct perf_event *event)
}
}
+ if (idx >= INTEL_PMC_IDX_FIXED)
+ idx = MAX_PEBS_EVENTS + (idx - INTEL_PMC_IDX_FIXED);
+
/*
* Use auto-reload if possible to save a MSR write in the PMI.
* This must be done in pmu::start(), because PERF_EVENT_IOC_PERIOD.
*/
if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) {
- unsigned int idx = hwc->idx;
-
- if (idx >= INTEL_PMC_IDX_FIXED)
- idx = MAX_PEBS_EVENTS + (idx - INTEL_PMC_IDX_FIXED);
ds->pebs_event_reset[idx] =
(u64)(-hwc->sample_period) & x86_pmu.cntval_mask;
} else {
- ds->pebs_event_reset[hwc->idx] = 0;
+ ds->pebs_event_reset[idx] = 0;
}
intel_pmu_pebs_via_pt_enable(event);
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 4409d2cccfda..9e6d6eaeb4cb 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -491,7 +491,7 @@ static void intel_pmu_arch_lbr_xrstors(void *ctx)
{
struct x86_perf_task_context_arch_lbr_xsave *task_ctx = ctx;
- copy_kernel_to_dynamic_supervisor(&task_ctx->xsave, XFEATURE_MASK_LBR);
+ xrstors(&task_ctx->xsave, XFEATURE_MASK_LBR);
}
static __always_inline bool lbr_is_reset_in_cstate(void *ctx)
@@ -576,7 +576,7 @@ static void intel_pmu_arch_lbr_xsaves(void *ctx)
{
struct x86_perf_task_context_arch_lbr_xsave *task_ctx = ctx;
- copy_dynamic_supervisor_to_kernel(&task_ctx->xsave, XFEATURE_MASK_LBR);
+ xsaves(&task_ctx->xsave, XFEATURE_MASK_LBR);
}
static void __intel_pmu_lbr_save(void *ctx)
@@ -731,7 +731,8 @@ void reserve_lbr_buffers(void)
if (!kmem_cache || cpuc->lbr_xsave)
continue;
- cpuc->lbr_xsave = kmem_cache_alloc_node(kmem_cache, GFP_KERNEL,
+ cpuc->lbr_xsave = kmem_cache_alloc_node(kmem_cache,
+ GFP_KERNEL | __GFP_ZERO,
cpu_to_node(cpu));
}
}
@@ -992,7 +993,7 @@ static void intel_pmu_arch_lbr_read_xsave(struct cpu_hw_events *cpuc)
intel_pmu_store_lbr(cpuc, NULL);
return;
}
- copy_dynamic_supervisor_to_kernel(&xsave->xsave, XFEATURE_MASK_LBR);
+ xsaves(&xsave->xsave, XFEATURE_MASK_LBR);
intel_pmu_store_lbr(cpuc, xsave->lbr.entries);
}
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index df7b07d7fdcb..9bf4dbbc26e2 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -801,8 +801,6 @@ static void uncore_pmu_enable(struct pmu *pmu)
struct intel_uncore_box *box;
uncore_pmu = container_of(pmu, struct intel_uncore_pmu, pmu);
- if (!uncore_pmu)
- return;
box = uncore_pmu_to_box(uncore_pmu, smp_processor_id());
if (!box)
@@ -818,8 +816,6 @@ static void uncore_pmu_disable(struct pmu *pmu)
struct intel_uncore_box *box;
uncore_pmu = container_of(pmu, struct intel_uncore_pmu, pmu);
- if (!uncore_pmu)
- return;
box = uncore_pmu_to_box(uncore_pmu, smp_processor_id());
if (!box)
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index 291791002997..187d7287039c 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -92,6 +92,7 @@ struct intel_uncore_type {
/*
* Optional callbacks for managing mapping of Uncore units to PMONs
*/
+ int (*get_topology)(struct intel_uncore_type *type);
int (*set_mapping)(struct intel_uncore_type *type);
void (*cleanup_mapping)(struct intel_uncore_type *type);
};
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index 3a75a2c601c2..609c24aec71a 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -348,6 +348,13 @@
#define SKX_M2M_PCI_PMON_CTR0 0x200
#define SKX_M2M_PCI_PMON_BOX_CTL 0x258
+/* Memory Map registers device ID */
+#define SNR_ICX_MESH2IIO_MMAP_DID 0x9a2
+#define SNR_ICX_SAD_CONTROL_CFG 0x3f4
+
+/* Getting I/O stack id in SAD_COTROL_CFG notation */
+#define SAD_CONTROL_STACK_ID(data) (((data) >> 4) & 0x7)
+
/* SNR Ubox */
#define SNR_U_MSR_PMON_CTR0 0x1f98
#define SNR_U_MSR_PMON_CTL0 0x1f91
@@ -3682,12 +3689,19 @@ static inline u8 skx_iio_stack(struct intel_uncore_pmu *pmu, int die)
}
static umode_t
-skx_iio_mapping_visible(struct kobject *kobj, struct attribute *attr, int die)
+pmu_iio_mapping_visible(struct kobject *kobj, struct attribute *attr,
+ int die, int zero_bus_pmu)
{
struct intel_uncore_pmu *pmu = dev_to_uncore_pmu(kobj_to_dev(kobj));
- /* Root bus 0x00 is valid only for die 0 AND pmu_idx = 0. */
- return (!skx_iio_stack(pmu, die) && pmu->pmu_idx) ? 0 : attr->mode;
+ return (!skx_iio_stack(pmu, die) && pmu->pmu_idx != zero_bus_pmu) ? 0 : attr->mode;
+}
+
+static umode_t
+skx_iio_mapping_visible(struct kobject *kobj, struct attribute *attr, int die)
+{
+ /* Root bus 0x00 is valid only for pmu_idx = 0. */
+ return pmu_iio_mapping_visible(kobj, attr, die, 0);
}
static ssize_t skx_iio_mapping_show(struct device *dev,
@@ -3772,7 +3786,8 @@ static const struct attribute_group *skx_iio_attr_update[] = {
NULL,
};
-static int skx_iio_set_mapping(struct intel_uncore_type *type)
+static int
+pmu_iio_set_mapping(struct intel_uncore_type *type, struct attribute_group *ag)
{
char buf[64];
int ret;
@@ -3780,7 +3795,7 @@ static int skx_iio_set_mapping(struct intel_uncore_type *type)
struct attribute **attrs = NULL;
struct dev_ext_attribute *eas = NULL;
- ret = skx_iio_get_topology(type);
+ ret = type->get_topology(type);
if (ret < 0)
goto clear_attr_update;
@@ -3789,11 +3804,11 @@ static int skx_iio_set_mapping(struct intel_uncore_type *type)
/* One more for NULL. */
attrs = kcalloc((uncore_max_dies() + 1), sizeof(*attrs), GFP_KERNEL);
if (!attrs)
- goto err;
+ goto clear_topology;
eas = kcalloc(uncore_max_dies(), sizeof(*eas), GFP_KERNEL);
if (!eas)
- goto err;
+ goto clear_attrs;
for (die = 0; die < uncore_max_dies(); die++) {
sprintf(buf, "die%ld", die);
@@ -3807,20 +3822,27 @@ static int skx_iio_set_mapping(struct intel_uncore_type *type)
eas[die].var = (void *)die;
attrs[die] = &eas[die].attr.attr;
}
- skx_iio_mapping_group.attrs = attrs;
+ ag->attrs = attrs;
return 0;
err:
for (; die >= 0; die--)
kfree(eas[die].attr.attr.name);
kfree(eas);
+clear_attrs:
kfree(attrs);
+clear_topology:
kfree(type->topology);
clear_attr_update:
type->attr_update = NULL;
return ret;
}
+static int skx_iio_set_mapping(struct intel_uncore_type *type)
+{
+ return pmu_iio_set_mapping(type, &skx_iio_mapping_group);
+}
+
static void skx_iio_cleanup_mapping(struct intel_uncore_type *type)
{
struct attribute **attr = skx_iio_mapping_group.attrs;
@@ -3851,6 +3873,7 @@ static struct intel_uncore_type skx_uncore_iio = {
.ops = &skx_uncore_iio_ops,
.format_group = &skx_uncore_iio_format_group,
.attr_update = skx_iio_attr_update,
+ .get_topology = skx_iio_get_topology,
.set_mapping = skx_iio_set_mapping,
.cleanup_mapping = skx_iio_cleanup_mapping,
};
@@ -4393,6 +4416,91 @@ static const struct attribute_group snr_uncore_iio_format_group = {
.attrs = snr_uncore_iio_formats_attr,
};
+static umode_t
+snr_iio_mapping_visible(struct kobject *kobj, struct attribute *attr, int die)
+{
+ /* Root bus 0x00 is valid only for pmu_idx = 1. */
+ return pmu_iio_mapping_visible(kobj, attr, die, 1);
+}
+
+static struct attribute_group snr_iio_mapping_group = {
+ .is_visible = snr_iio_mapping_visible,
+};
+
+static const struct attribute_group *snr_iio_attr_update[] = {
+ &snr_iio_mapping_group,
+ NULL,
+};
+
+static int sad_cfg_iio_topology(struct intel_uncore_type *type, u8 *sad_pmon_mapping)
+{
+ u32 sad_cfg;
+ int die, stack_id, ret = -EPERM;
+ struct pci_dev *dev = NULL;
+
+ type->topology = kcalloc(uncore_max_dies(), sizeof(*type->topology),
+ GFP_KERNEL);
+ if (!type->topology)
+ return -ENOMEM;
+
+ while ((dev = pci_get_device(PCI_VENDOR_ID_INTEL, SNR_ICX_MESH2IIO_MMAP_DID, dev))) {
+ ret = pci_read_config_dword(dev, SNR_ICX_SAD_CONTROL_CFG, &sad_cfg);
+ if (ret) {
+ ret = pcibios_err_to_errno(ret);
+ break;
+ }
+
+ die = uncore_pcibus_to_dieid(dev->bus);
+ stack_id = SAD_CONTROL_STACK_ID(sad_cfg);
+ if (die < 0 || stack_id >= type->num_boxes) {
+ ret = -EPERM;
+ break;
+ }
+
+ /* Convert stack id from SAD_CONTROL to PMON notation. */
+ stack_id = sad_pmon_mapping[stack_id];
+
+ ((u8 *)&(type->topology[die].configuration))[stack_id] = dev->bus->number;
+ type->topology[die].segment = pci_domain_nr(dev->bus);
+ }
+
+ if (ret) {
+ kfree(type->topology);
+ type->topology = NULL;
+ }
+
+ return ret;
+}
+
+/*
+ * SNR has a static mapping of stack IDs from SAD_CONTROL_CFG notation to PMON
+ */
+enum {
+ SNR_QAT_PMON_ID,
+ SNR_CBDMA_DMI_PMON_ID,
+ SNR_NIS_PMON_ID,
+ SNR_DLB_PMON_ID,
+ SNR_PCIE_GEN3_PMON_ID
+};
+
+static u8 snr_sad_pmon_mapping[] = {
+ SNR_CBDMA_DMI_PMON_ID,
+ SNR_PCIE_GEN3_PMON_ID,
+ SNR_DLB_PMON_ID,
+ SNR_NIS_PMON_ID,
+ SNR_QAT_PMON_ID
+};
+
+static int snr_iio_get_topology(struct intel_uncore_type *type)
+{
+ return sad_cfg_iio_topology(type, snr_sad_pmon_mapping);
+}
+
+static int snr_iio_set_mapping(struct intel_uncore_type *type)
+{
+ return pmu_iio_set_mapping(type, &snr_iio_mapping_group);
+}
+
static struct intel_uncore_type snr_uncore_iio = {
.name = "iio",
.num_counters = 4,
@@ -4406,6 +4514,10 @@ static struct intel_uncore_type snr_uncore_iio = {
.msr_offset = SNR_IIO_MSR_OFFSET,
.ops = &ivbep_uncore_msr_ops,
.format_group = &snr_uncore_iio_format_group,
+ .attr_update = snr_iio_attr_update,
+ .get_topology = snr_iio_get_topology,
+ .set_mapping = snr_iio_set_mapping,
+ .cleanup_mapping = skx_iio_cleanup_mapping,
};
static struct intel_uncore_type snr_uncore_irp = {
@@ -4933,6 +5045,53 @@ static struct event_constraint icx_uncore_iio_constraints[] = {
EVENT_CONSTRAINT_END
};
+static umode_t
+icx_iio_mapping_visible(struct kobject *kobj, struct attribute *attr, int die)
+{
+ /* Root bus 0x00 is valid only for pmu_idx = 5. */
+ return pmu_iio_mapping_visible(kobj, attr, die, 5);
+}
+
+static struct attribute_group icx_iio_mapping_group = {
+ .is_visible = icx_iio_mapping_visible,
+};
+
+static const struct attribute_group *icx_iio_attr_update[] = {
+ &icx_iio_mapping_group,
+ NULL,
+};
+
+/*
+ * ICX has a static mapping of stack IDs from SAD_CONTROL_CFG notation to PMON
+ */
+enum {
+ ICX_PCIE1_PMON_ID,
+ ICX_PCIE2_PMON_ID,
+ ICX_PCIE3_PMON_ID,
+ ICX_PCIE4_PMON_ID,
+ ICX_PCIE5_PMON_ID,
+ ICX_CBDMA_DMI_PMON_ID
+};
+
+static u8 icx_sad_pmon_mapping[] = {
+ ICX_CBDMA_DMI_PMON_ID,
+ ICX_PCIE1_PMON_ID,
+ ICX_PCIE2_PMON_ID,
+ ICX_PCIE3_PMON_ID,
+ ICX_PCIE4_PMON_ID,
+ ICX_PCIE5_PMON_ID,
+};
+
+static int icx_iio_get_topology(struct intel_uncore_type *type)
+{
+ return sad_cfg_iio_topology(type, icx_sad_pmon_mapping);
+}
+
+static int icx_iio_set_mapping(struct intel_uncore_type *type)
+{
+ return pmu_iio_set_mapping(type, &icx_iio_mapping_group);
+}
+
static struct intel_uncore_type icx_uncore_iio = {
.name = "iio",
.num_counters = 4,
@@ -4947,6 +5106,10 @@ static struct intel_uncore_type icx_uncore_iio = {
.constraints = icx_uncore_iio_constraints,
.ops = &skx_uncore_iio_ops,
.format_group = &snr_uncore_iio_format_group,
+ .attr_update = icx_iio_attr_update,
+ .get_topology = icx_iio_get_topology,
+ .set_mapping = icx_iio_set_mapping,
+ .cleanup_mapping = skx_iio_cleanup_mapping,
};
static struct intel_uncore_type icx_uncore_irp = {
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index ad87cb36f7c8..2bf1c7ea2758 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -229,6 +229,7 @@ struct cpu_hw_events {
*/
struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */
unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+ unsigned long dirty[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
int enabled;
int n_events; /* the # of events in the below arrays */
diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c
index 84a1042c3b01..85feafacc445 100644
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -764,13 +764,14 @@ static struct rapl_model model_spr = {
.rapl_msrs = intel_rapl_spr_msrs,
};
-static struct rapl_model model_amd_fam17h = {
+static struct rapl_model model_amd_hygon = {
.events = BIT(PERF_RAPL_PKG),
.msr_power_unit = MSR_AMD_RAPL_POWER_UNIT,
.rapl_msrs = amd_rapl_msrs,
};
static const struct x86_cpu_id rapl_model_match[] __initconst = {
+ X86_MATCH_FEATURE(X86_FEATURE_RAPL, &model_amd_hygon),
X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE, &model_snb),
X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE_X, &model_snbep),
X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE, &model_snb),
@@ -803,9 +804,6 @@ static const struct x86_cpu_id rapl_model_match[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &model_skl),
X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &model_skl),
X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &model_spr),
- X86_MATCH_VENDOR_FAM(AMD, 0x17, &model_amd_fam17h),
- X86_MATCH_VENDOR_FAM(HYGON, 0x18, &model_amd_fam17h),
- X86_MATCH_VENDOR_FAM(AMD, 0x19, &model_amd_fam17h),
{},
};
MODULE_DEVICE_TABLE(x86cpu, rapl_model_match);
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index bb0ae4b5c00f..6952e219cba3 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -614,50 +614,3 @@ bool hv_is_isolation_supported(void)
return hv_get_isolation_type() != HV_ISOLATION_TYPE_NONE;
}
EXPORT_SYMBOL_GPL(hv_is_isolation_supported);
-
-/* Bit mask of the extended capability to query: see HV_EXT_CAPABILITY_xxx */
-bool hv_query_ext_cap(u64 cap_query)
-{
- /*
- * The address of the 'hv_extended_cap' variable will be used as an
- * output parameter to the hypercall below and so it should be
- * compatible with 'virt_to_phys'. Which means, it's address should be
- * directly mapped. Use 'static' to keep it compatible; stack variables
- * can be virtually mapped, making them imcompatible with
- * 'virt_to_phys'.
- * Hypercall input/output addresses should also be 8-byte aligned.
- */
- static u64 hv_extended_cap __aligned(8);
- static bool hv_extended_cap_queried;
- u64 status;
-
- /*
- * Querying extended capabilities is an extended hypercall. Check if the
- * partition supports extended hypercall, first.
- */
- if (!(ms_hyperv.priv_high & HV_ENABLE_EXTENDED_HYPERCALLS))
- return false;
-
- /* Extended capabilities do not change at runtime. */
- if (hv_extended_cap_queried)
- return hv_extended_cap & cap_query;
-
- status = hv_do_hypercall(HV_EXT_CALL_QUERY_CAPABILITIES, NULL,
- &hv_extended_cap);
-
- /*
- * The query extended capabilities hypercall should not fail under
- * any normal circumstances. Avoid repeatedly making the hypercall, on
- * error.
- */
- hv_extended_cap_queried = true;
- status &= HV_HYPERCALL_RESULT_MASK;
- if (status != HV_STATUS_SUCCESS) {
- pr_err("Hyper-V: Extended query capabilities hypercall failed 0x%llx\n",
- status);
- return false;
- }
-
- return hv_extended_cap & cap_query;
-}
-EXPORT_SYMBOL_GPL(hv_query_ext_cap);
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index a09fc37ead9d..5e5b9fc2747f 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -203,7 +203,7 @@ static int load_aout_binary(struct linux_binprm *bprm)
error = vm_mmap(bprm->file, N_TXTADDR(ex), ex.a_text,
PROT_READ | PROT_EXEC,
MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE |
- MAP_EXECUTABLE | MAP_32BIT,
+ MAP_32BIT,
fd_offset);
if (error != N_TXTADDR(ex))
@@ -212,7 +212,7 @@ static int load_aout_binary(struct linux_binprm *bprm)
error = vm_mmap(bprm->file, N_DATADDR(ex), ex.a_data,
PROT_READ | PROT_WRITE | PROT_EXEC,
MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE |
- MAP_EXECUTABLE | MAP_32BIT,
+ MAP_32BIT,
fd_offset + ex.a_text);
if (error != N_DATADDR(ex))
return error;
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
index b19ec8282d50..1e51650b79d7 100644
--- a/arch/x86/include/asm/Kbuild
+++ b/arch/x86/include/asm/Kbuild
@@ -3,6 +3,7 @@
generated-y += syscalls_32.h
generated-y += syscalls_64.h
+generated-y += syscalls_x32.h
generated-y += unistd_32_ia32.h
generated-y += unistd_64_x32.h
generated-y += xen-hypercalls.h
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index 0603c7423aca..3ad3da9a7d97 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -3,25 +3,26 @@
#define _ASM_X86_ASM_H
#ifdef __ASSEMBLY__
-# define __ASM_FORM(x) x
-# define __ASM_FORM_RAW(x) x
-# define __ASM_FORM_COMMA(x) x,
+# define __ASM_FORM(x, ...) x,## __VA_ARGS__
+# define __ASM_FORM_RAW(x, ...) x,## __VA_ARGS__
+# define __ASM_FORM_COMMA(x, ...) x,## __VA_ARGS__,
#else
#include <linux/stringify.h>
-
-# define __ASM_FORM(x) " " __stringify(x) " "
-# define __ASM_FORM_RAW(x) __stringify(x)
-# define __ASM_FORM_COMMA(x) " " __stringify(x) ","
+# define __ASM_FORM(x, ...) " " __stringify(x,##__VA_ARGS__) " "
+# define __ASM_FORM_RAW(x, ...) __stringify(x,##__VA_ARGS__)
+# define __ASM_FORM_COMMA(x, ...) " " __stringify(x,##__VA_ARGS__) ","
#endif
+#define _ASM_BYTES(x, ...) __ASM_FORM(.byte x,##__VA_ARGS__ ;)
+
#ifndef __x86_64__
/* 32 bit */
-# define __ASM_SEL(a,b) __ASM_FORM(a)
-# define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(a)
+# define __ASM_SEL(a,b) __ASM_FORM(a)
+# define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(a)
#else
/* 64 bit */
-# define __ASM_SEL(a,b) __ASM_FORM(b)
-# define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(b)
+# define __ASM_SEL(a,b) __ASM_FORM(b)
+# define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(b)
#endif
#define __ASM_SIZE(inst, ...) __ASM_SEL(inst##l##__VA_ARGS__, \
@@ -119,6 +120,8 @@
# define CC_OUT(c) [_cc_ ## c] "=qm"
#endif
+#ifdef __KERNEL__
+
/* Exception table entry */
#ifdef __ASSEMBLY__
# define _ASM_EXTABLE_HANDLE(from, to, handler) \
@@ -185,4 +188,6 @@ register unsigned long current_stack_pointer asm(_ASM_SP);
#define ASM_CALL_CONSTRAINT "+r" (current_stack_pointer)
#endif /* __ASSEMBLY__ */
+#endif /* __KERNEL__ */
+
#endif /* _ASM_X86_ASM_H */
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index f732741ad7c7..5e754e895767 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -269,6 +269,4 @@ static __always_inline int arch_atomic_fetch_xor(int i, atomic_t *v)
# include <asm/atomic64_64.h>
#endif
-#define ARCH_ATOMIC
-
#endif /* _ASM_X86_ATOMIC_H */
diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index 4819d5e5a335..3ba772a69cc8 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -54,11 +54,8 @@ static inline unsigned long array_index_mask_nospec(unsigned long index,
#define dma_rmb() barrier()
#define dma_wmb() barrier()
-#ifdef CONFIG_X86_32
-#define __smp_mb() asm volatile("lock; addl $0,-4(%%esp)" ::: "memory", "cc")
-#else
-#define __smp_mb() asm volatile("lock; addl $0,-4(%%rsp)" ::: "memory", "cc")
-#endif
+#define __smp_mb() asm volatile("lock; addl $0,-4(%%" _ASM_SP ")" ::: "memory", "cc")
+
#define __smp_rmb() dma_rmb()
#define __smp_wmb() barrier()
#define __smp_store_mb(var, value) do { (void)xchg(&var, value); } while (0)
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index ac37830ae941..d0ce5cfd3ac1 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -108,7 +108,7 @@
#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* Extended APICID (8 bits) */
#define X86_FEATURE_AMD_DCM ( 3*32+27) /* AMD multi-node processor */
#define X86_FEATURE_APERFMPERF ( 3*32+28) /* P-State hardware coordination feedback capability (APERF/MPERF MSRs) */
-/* free ( 3*32+29) */
+#define X86_FEATURE_RAPL ( 3*32+29) /* AMD/Hygon RAPL interface */
#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */
#define X86_FEATURE_TSC_KNOWN_FREQ ( 3*32+31) /* TSC has known frequency */
@@ -378,6 +378,7 @@
#define X86_FEATURE_AVX512_VP2INTERSECT (18*32+ 8) /* AVX-512 Intersect for D/Q */
#define X86_FEATURE_SRBDS_CTRL (18*32+ 9) /* "" SRBDS mitigation MSR available */
#define X86_FEATURE_MD_CLEAR (18*32+10) /* VERW clears CPU buffers */
+#define X86_FEATURE_RTM_ALWAYS_ABORT (18*32+11) /* "" RTM transaction always aborts */
#define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* "" TSX_FORCE_ABORT */
#define X86_FEATURE_SERIALIZE (18*32+14) /* SERIALIZE instruction */
#define X86_FEATURE_HYBRID_CPU (18*32+15) /* "" This part has CPUs of more than one type */
diff --git a/arch/x86/include/asm/crash.h b/arch/x86/include/asm/crash.h
index f58de66091e5..8b6bd63530dc 100644
--- a/arch/x86/include/asm/crash.h
+++ b/arch/x86/include/asm/crash.h
@@ -9,10 +9,4 @@ int crash_setup_memmap_entries(struct kimage *image,
struct boot_params *params);
void crash_smp_send_stop(void);
-#ifdef CONFIG_KEXEC_CORE
-void __init crash_reserve_low_1M(void);
-#else
-static inline void __init crash_reserve_low_1M(void) { }
-#endif
-
#endif /* _ASM_X86_CRASH_H */
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 476082a83d1c..ab97b22ac04a 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -9,6 +9,7 @@
#include <asm/irq_vectors.h>
#include <asm/cpu_entry_area.h>
+#include <linux/debug_locks.h>
#include <linux/smp.h>
#include <linux/percpu.h>
@@ -224,6 +225,26 @@ static inline void store_idt(struct desc_ptr *dtr)
asm volatile("sidt %0":"=m" (*dtr));
}
+static inline void native_gdt_invalidate(void)
+{
+ const struct desc_ptr invalid_gdt = {
+ .address = 0,
+ .size = 0
+ };
+
+ native_load_gdt(&invalid_gdt);
+}
+
+static inline void native_idt_invalidate(void)
+{
+ const struct desc_ptr invalid_idt = {
+ .address = 0,
+ .size = 0
+ };
+
+ native_load_idt(&invalid_idt);
+}
+
/*
* The LTR instruction marks the TSS GDT entry as busy. On 64-bit, the GDT is
* a read-only remapping. To prevent a page fault, the GDT is switched to the
@@ -421,12 +442,10 @@ extern bool idt_is_f00f_address(unsigned long address);
#ifdef CONFIG_X86_64
extern void idt_setup_early_pf(void);
-extern void idt_setup_ist_traps(void);
#else
static inline void idt_setup_early_pf(void) { }
-static inline void idt_setup_ist_traps(void) { }
#endif
-extern void idt_invalidate(void *addr);
+extern void idt_invalidate(void);
#endif /* _ASM_X86_DESC_H */
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 7d7500806af8..29fea180a665 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -312,6 +312,7 @@ do { \
NEW_AUX_ENT(AT_SYSINFO, VDSO_ENTRY); \
NEW_AUX_ENT(AT_SYSINFO_EHDR, VDSO_CURRENT_BASE); \
} \
+ NEW_AUX_ENT(AT_MINSIGSTKSZ, get_sigframe_size()); \
} while (0)
/*
@@ -328,6 +329,7 @@ extern unsigned long task_size_32bit(void);
extern unsigned long task_size_64bit(int full_addr_space);
extern unsigned long get_mmap_base(int is_legacy);
extern bool mmap_address_hint_valid(unsigned long addr, unsigned long len);
+extern unsigned long get_sigframe_size(void);
#ifdef CONFIG_X86_32
@@ -349,6 +351,7 @@ do { \
if (vdso64_enabled) \
NEW_AUX_ENT(AT_SYSINFO_EHDR, \
(unsigned long __force)current->mm->context.vdso); \
+ NEW_AUX_ENT(AT_MINSIGSTKSZ, get_sigframe_size()); \
} while (0)
/* As a historical oddity, the x32 and x86_64 vDSOs are controlled together. */
@@ -357,6 +360,7 @@ do { \
if (vdso64_enabled) \
NEW_AUX_ENT(AT_SYSINFO_EHDR, \
(unsigned long __force)current->mm->context.vdso); \
+ NEW_AUX_ENT(AT_MINSIGSTKSZ, get_sigframe_size()); \
} while (0)
#define AT_SYSINFO 32
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index fdee23ea4e17..5a18694a89b2 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -26,16 +26,17 @@
/*
* High level FPU state handling functions:
*/
-extern void fpu__prepare_read(struct fpu *fpu);
-extern void fpu__prepare_write(struct fpu *fpu);
-extern void fpu__save(struct fpu *fpu);
extern int fpu__restore_sig(void __user *buf, int ia32_frame);
extern void fpu__drop(struct fpu *fpu);
-extern int fpu__copy(struct task_struct *dst, struct task_struct *src);
extern void fpu__clear_user_states(struct fpu *fpu);
-extern void fpu__clear_all(struct fpu *fpu);
extern int fpu__exception_code(struct fpu *fpu, int trap_nr);
+extern void fpu_sync_fpstate(struct fpu *fpu);
+
+/* Clone and exit operations */
+extern int fpu_clone(struct task_struct *dst);
+extern void fpu_flush_thread(void);
+
/*
* Boot time FPU initialization functions:
*/
@@ -45,7 +46,6 @@ extern void fpu__init_cpu_xstate(void);
extern void fpu__init_system(struct cpuinfo_x86 *c);
extern void fpu__init_check_bugs(void);
extern void fpu__resume_cpu(void);
-extern u64 fpu__get_supported_xfeatures_mask(void);
/*
* Debugging facility:
@@ -86,23 +86,9 @@ extern void fpstate_init_soft(struct swregs_state *soft);
#else
static inline void fpstate_init_soft(struct swregs_state *soft) {}
#endif
+extern void save_fpregs_to_fpstate(struct fpu *fpu);
-static inline void fpstate_init_xstate(struct xregs_state *xsave)
-{
- /*
- * XRSTORS requires these bits set in xcomp_bv, or it will
- * trigger #GP:
- */
- xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | xfeatures_mask_all;
-}
-
-static inline void fpstate_init_fxstate(struct fxregs_state *fx)
-{
- fx->cwd = 0x37f;
- fx->mxcsr = MXCSR_DEFAULT;
-}
-extern void fpstate_sanitize_xstate(struct fpu *fpu);
-
+/* Returns 0 or the negated trap number, which results in -EFAULT for #PF */
#define user_insn(insn, output, input...) \
({ \
int err; \
@@ -110,14 +96,14 @@ extern void fpstate_sanitize_xstate(struct fpu *fpu);
might_fault(); \
\
asm volatile(ASM_STAC "\n" \
- "1:" #insn "\n\t" \
+ "1: " #insn "\n" \
"2: " ASM_CLAC "\n" \
".section .fixup,\"ax\"\n" \
- "3: movl $-1,%[err]\n" \
+ "3: negl %%eax\n" \
" jmp 2b\n" \
".previous\n" \
- _ASM_EXTABLE(1b, 3b) \
- : [err] "=r" (err), output \
+ _ASM_EXTABLE_FAULT(1b, 3b) \
+ : [err] "=a" (err), output \
: "0"(0), input); \
err; \
})
@@ -143,12 +129,12 @@ extern void fpstate_sanitize_xstate(struct fpu *fpu);
_ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_fprestore) \
: output : input)
-static inline int copy_fregs_to_user(struct fregs_state __user *fx)
+static inline int fnsave_to_user_sigframe(struct fregs_state __user *fx)
{
return user_insn(fnsave %[fx]; fwait, [fx] "=m" (*fx), "m" (*fx));
}
-static inline int copy_fxregs_to_user(struct fxregs_state __user *fx)
+static inline int fxsave_to_user_sigframe(struct fxregs_state __user *fx)
{
if (IS_ENABLED(CONFIG_X86_32))
return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx));
@@ -157,7 +143,7 @@ static inline int copy_fxregs_to_user(struct fxregs_state __user *fx)
}
-static inline void copy_kernel_to_fxregs(struct fxregs_state *fx)
+static inline void fxrstor(struct fxregs_state *fx)
{
if (IS_ENABLED(CONFIG_X86_32))
kernel_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
@@ -165,7 +151,7 @@ static inline void copy_kernel_to_fxregs(struct fxregs_state *fx)
kernel_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
}
-static inline int copy_kernel_to_fxregs_err(struct fxregs_state *fx)
+static inline int fxrstor_safe(struct fxregs_state *fx)
{
if (IS_ENABLED(CONFIG_X86_32))
return kernel_insn_err(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
@@ -173,7 +159,7 @@ static inline int copy_kernel_to_fxregs_err(struct fxregs_state *fx)
return kernel_insn_err(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
}
-static inline int copy_user_to_fxregs(struct fxregs_state __user *fx)
+static inline int fxrstor_from_user_sigframe(struct fxregs_state __user *fx)
{
if (IS_ENABLED(CONFIG_X86_32))
return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
@@ -181,27 +167,27 @@ static inline int copy_user_to_fxregs(struct fxregs_state __user *fx)
return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
}
-static inline void copy_kernel_to_fregs(struct fregs_state *fx)
+static inline void frstor(struct fregs_state *fx)
{
kernel_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
}
-static inline int copy_kernel_to_fregs_err(struct fregs_state *fx)
+static inline int frstor_safe(struct fregs_state *fx)
{
return kernel_insn_err(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
}
-static inline int copy_user_to_fregs(struct fregs_state __user *fx)
+static inline int frstor_from_user_sigframe(struct fregs_state __user *fx)
{
return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
}
-static inline void copy_fxregs_to_kernel(struct fpu *fpu)
+static inline void fxsave(struct fxregs_state *fx)
{
if (IS_ENABLED(CONFIG_X86_32))
- asm volatile( "fxsave %[fx]" : [fx] "=m" (fpu->state.fxsave));
+ asm volatile( "fxsave %[fx]" : [fx] "=m" (*fx));
else
- asm volatile("fxsaveq %[fx]" : [fx] "=m" (fpu->state.fxsave));
+ asm volatile("fxsaveq %[fx]" : [fx] "=m" (*fx));
}
/* These macros all use (%edi)/(%rdi) as the single memory argument. */
@@ -211,16 +197,20 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu)
#define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f"
#define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f"
+/*
+ * After this @err contains 0 on success or the negated trap number when
+ * the operation raises an exception. For faults this results in -EFAULT.
+ */
#define XSTATE_OP(op, st, lmask, hmask, err) \
asm volatile("1:" op "\n\t" \
"xor %[err], %[err]\n" \
"2:\n\t" \
".pushsection .fixup,\"ax\"\n\t" \
- "3: movl $-2,%[err]\n\t" \
+ "3: negl %%eax\n\t" \
"jmp 2b\n\t" \
".popsection\n\t" \
- _ASM_EXTABLE(1b, 3b) \
- : [err] "=r" (err) \
+ _ASM_EXTABLE_FAULT(1b, 3b) \
+ : [err] "=a" (err) \
: "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \
: "memory")
@@ -272,31 +262,9 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu)
* This function is called only during boot time when x86 caps are not set
* up and alternative can not be used yet.
*/
-static inline void copy_xregs_to_kernel_booting(struct xregs_state *xstate)
+static inline void os_xrstor_booting(struct xregs_state *xstate)
{
- u64 mask = xfeatures_mask_all;
- u32 lmask = mask;
- u32 hmask = mask >> 32;
- int err;
-
- WARN_ON(system_state != SYSTEM_BOOTING);
-
- if (boot_cpu_has(X86_FEATURE_XSAVES))
- XSTATE_OP(XSAVES, xstate, lmask, hmask, err);
- else
- XSTATE_OP(XSAVE, xstate, lmask, hmask, err);
-
- /* We should never fault when copying to a kernel buffer: */
- WARN_ON_FPU(err);
-}
-
-/*
- * This function is called only during boot time when x86 caps are not set
- * up and alternative can not be used yet.
- */
-static inline void copy_kernel_to_xregs_booting(struct xregs_state *xstate)
-{
- u64 mask = -1;
+ u64 mask = xfeatures_mask_fpstate();
u32 lmask = mask;
u32 hmask = mask >> 32;
int err;
@@ -317,8 +285,11 @@ static inline void copy_kernel_to_xregs_booting(struct xregs_state *xstate)
/*
* Save processor xstate to xsave area.
+ *
+ * Uses either XSAVE or XSAVEOPT or XSAVES depending on the CPU features
+ * and command line options. The choice is permanent until the next reboot.
*/
-static inline void copy_xregs_to_kernel(struct xregs_state *xstate)
+static inline void os_xsave(struct xregs_state *xstate)
{
u64 mask = xfeatures_mask_all;
u32 lmask = mask;
@@ -335,8 +306,10 @@ static inline void copy_xregs_to_kernel(struct xregs_state *xstate)
/*
* Restore processor xstate from xsave area.
+ *
+ * Uses XRSTORS when XSAVES is used, XRSTOR otherwise.
*/
-static inline void copy_kernel_to_xregs(struct xregs_state *xstate, u64 mask)
+static inline void os_xrstor(struct xregs_state *xstate, u64 mask)
{
u32 lmask = mask;
u32 hmask = mask >> 32;
@@ -354,9 +327,14 @@ static inline void copy_kernel_to_xregs(struct xregs_state *xstate, u64 mask)
* backward compatibility for old applications which don't understand
* compacted format of xsave area.
*/
-static inline int copy_xregs_to_user(struct xregs_state __user *buf)
+static inline int xsave_to_user_sigframe(struct xregs_state __user *buf)
{
- u64 mask = xfeatures_mask_user();
+ /*
+ * Include the features which are not xsaved/rstored by the kernel
+ * internally, e.g. PKRU. That's user space ABI and also required
+ * to allow the signal handler to modify PKRU.
+ */
+ u64 mask = xfeatures_mask_uabi();
u32 lmask = mask;
u32 hmask = mask >> 32;
int err;
@@ -379,7 +357,7 @@ static inline int copy_xregs_to_user(struct xregs_state __user *buf)
/*
* Restore xstate from user space xsave area.
*/
-static inline int copy_user_to_xregs(struct xregs_state __user *buf, u64 mask)
+static inline int xrstor_from_user_sigframe(struct xregs_state __user *buf, u64 mask)
{
struct xregs_state *xstate = ((__force struct xregs_state *)buf);
u32 lmask = mask;
@@ -397,13 +375,13 @@ static inline int copy_user_to_xregs(struct xregs_state __user *buf, u64 mask)
* Restore xstate from kernel space xsave area, return an error code instead of
* an exception.
*/
-static inline int copy_kernel_to_xregs_err(struct xregs_state *xstate, u64 mask)
+static inline int os_xrstor_safe(struct xregs_state *xstate, u64 mask)
{
u32 lmask = mask;
u32 hmask = mask >> 32;
int err;
- if (static_cpu_has(X86_FEATURE_XSAVES))
+ if (cpu_feature_enabled(X86_FEATURE_XSAVES))
XSTATE_OP(XRSTORS, xstate, lmask, hmask, err);
else
XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
@@ -411,36 +389,11 @@ static inline int copy_kernel_to_xregs_err(struct xregs_state *xstate, u64 mask)
return err;
}
-extern int copy_fpregs_to_fpstate(struct fpu *fpu);
+extern void __restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask);
-static inline void __copy_kernel_to_fpregs(union fpregs_state *fpstate, u64 mask)
+static inline void restore_fpregs_from_fpstate(union fpregs_state *fpstate)
{
- if (use_xsave()) {
- copy_kernel_to_xregs(&fpstate->xsave, mask);
- } else {
- if (use_fxsr())
- copy_kernel_to_fxregs(&fpstate->fxsave);
- else
- copy_kernel_to_fregs(&fpstate->fsave);
- }
-}
-
-static inline void copy_kernel_to_fpregs(union fpregs_state *fpstate)
-{
- /*
- * AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is
- * pending. Clear the x87 state here by setting it to fixed values.
- * "m" is a random variable that should be in L1.
- */
- if (unlikely(static_cpu_has_bug(X86_BUG_FXSAVE_LEAK))) {
- asm volatile(
- "fnclex\n\t"
- "emms\n\t"
- "fildl %P[addr]" /* set F?P to defined value */
- : : [addr] "m" (fpstate));
- }
-
- __copy_kernel_to_fpregs(fpstate, -1);
+ __restore_fpregs_from_fpstate(fpstate, xfeatures_mask_fpstate());
}
extern int copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size);
@@ -499,10 +452,8 @@ static inline void fpregs_activate(struct fpu *fpu)
trace_x86_fpu_regs_activated(fpu);
}
-/*
- * Internal helper, do not use directly. Use switch_fpu_return() instead.
- */
-static inline void __fpregs_load_activate(void)
+/* Internal helper for switch_fpu_return() and signal frame setup */
+static inline void fpregs_restore_userregs(void)
{
struct fpu *fpu = &current->thread.fpu;
int cpu = smp_processor_id();
@@ -511,7 +462,21 @@ static inline void __fpregs_load_activate(void)
return;
if (!fpregs_state_valid(fpu, cpu)) {
- copy_kernel_to_fpregs(&fpu->state);
+ u64 mask;
+
+ /*
+ * This restores _all_ xstate which has not been
+ * established yet.
+ *
+ * If PKRU is enabled, then the PKRU value is already
+ * correct because it was either set in switch_to() or in
+ * flush_thread(). So it is excluded because it might be
+ * not up to date in current->thread.fpu.xsave state.
+ */
+ mask = xfeatures_mask_restore_user() |
+ xfeatures_mask_supervisor();
+ __restore_fpregs_from_fpstate(&fpu->state, mask);
+
fpregs_activate(fpu);
fpu->last_cpu = cpu;
}
@@ -543,12 +508,17 @@ static inline void __fpregs_load_activate(void)
static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu)
{
if (static_cpu_has(X86_FEATURE_FPU) && !(current->flags & PF_KTHREAD)) {
- if (!copy_fpregs_to_fpstate(old_fpu))
- old_fpu->last_cpu = -1;
- else
- old_fpu->last_cpu = cpu;
+ save_fpregs_to_fpstate(old_fpu);
+ /*
+ * The save operation preserved register state, so the
+ * fpu_fpregs_owner_ctx is still @old_fpu. Store the
+ * current CPU number in @old_fpu, so the next return
+ * to user space can avoid the FPU register restore
+ * when is returns on the same CPU and still owns the
+ * context.
+ */
+ old_fpu->last_cpu = cpu;
- /* But leave fpu_fpregs_owner_ctx! */
trace_x86_fpu_regs_deactivated(old_fpu);
}
}
@@ -558,39 +528,13 @@ static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu)
*/
/*
- * Load PKRU from the FPU context if available. Delay loading of the
- * complete FPU state until the return to userland.
+ * Delay loading of the complete FPU state until the return to userland.
+ * PKRU is handled separately.
*/
static inline void switch_fpu_finish(struct fpu *new_fpu)
{
- u32 pkru_val = init_pkru_value;
- struct pkru_state *pk;
-
- if (!static_cpu_has(X86_FEATURE_FPU))
- return;
-
- set_thread_flag(TIF_NEED_FPU_LOAD);
-
- if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
- return;
-
- /*
- * PKRU state is switched eagerly because it needs to be valid before we
- * return to userland e.g. for a copy_to_user() operation.
- */
- if (!(current->flags & PF_KTHREAD)) {
- /*
- * If the PKRU bit in xsave.header.xfeatures is not set,
- * then the PKRU component was in init state, which means
- * XRSTOR will set PKRU to 0. If the bit is not set then
- * get_xsave_addr() will return NULL because the PKRU value
- * in memory is not valid. This means pkru_val has to be
- * set to 0 and not to init_pkru_value.
- */
- pk = get_xsave_addr(&new_fpu->state.xsave, XFEATURE_PKRU);
- pkru_val = pk ? pk->pkru : 0;
- }
- __write_pkru(pkru_val);
+ if (cpu_feature_enabled(X86_FEATURE_FPU))
+ set_thread_flag(TIF_NEED_FPU_LOAD);
}
#endif /* _ASM_X86_FPU_INTERNAL_H */
diff --git a/arch/x86/include/asm/fpu/signal.h b/arch/x86/include/asm/fpu/signal.h
index 7fb516b6893a..8b6631dffefd 100644
--- a/arch/x86/include/asm/fpu/signal.h
+++ b/arch/x86/include/asm/fpu/signal.h
@@ -29,6 +29,8 @@ unsigned long
fpu__alloc_mathframe(unsigned long sp, int ia32_frame,
unsigned long *buf_fx, unsigned long *size);
+unsigned long fpu__get_fpstate_size(void);
+
extern void fpu__init_prepare_fx_sw_frame(void);
#endif /* _ASM_X86_FPU_SIGNAL_H */
diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index 47a92232d595..109dfcc75299 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -6,6 +6,7 @@
#include <linux/types.h>
#include <asm/processor.h>
+#include <asm/fpu/api.h>
#include <asm/user.h>
/* Bit 63 of XCR0 is reserved for future expansion */
@@ -34,6 +35,14 @@
XFEATURE_MASK_BNDREGS | \
XFEATURE_MASK_BNDCSR)
+/*
+ * Features which are restored when returning to user space.
+ * PKRU is not restored on return to user space because PKRU
+ * is switched eagerly in switch_to() and flush_thread()
+ */
+#define XFEATURE_MASK_USER_RESTORE \
+ (XFEATURE_MASK_USER_SUPPORTED & ~XFEATURE_MASK_PKRU)
+
/* All currently supported supervisor features */
#define XFEATURE_MASK_SUPERVISOR_SUPPORTED (XFEATURE_MASK_PASID)
@@ -42,21 +51,21 @@
* and its size may be huge. Saving/restoring such supervisor state components
* at each context switch can cause high CPU and space overhead, which should
* be avoided. Such supervisor state components should only be saved/restored
- * on demand. The on-demand dynamic supervisor features are set in this mask.
+ * on demand. The on-demand supervisor features are set in this mask.
*
- * Unlike the existing supported supervisor features, a dynamic supervisor
+ * Unlike the existing supported supervisor features, an independent supervisor
* feature does not allocate a buffer in task->fpu, and the corresponding
* supervisor state component cannot be saved/restored at each context switch.
*
- * To support a dynamic supervisor feature, a developer should follow the
+ * To support an independent supervisor feature, a developer should follow the
* dos and don'ts as below:
* - Do dynamically allocate a buffer for the supervisor state component.
* - Do manually invoke the XSAVES/XRSTORS instruction to save/restore the
* state component to/from the buffer.
- * - Don't set the bit corresponding to the dynamic supervisor feature in
+ * - Don't set the bit corresponding to the independent supervisor feature in
* IA32_XSS at run time, since it has been set at boot time.
*/
-#define XFEATURE_MASK_DYNAMIC (XFEATURE_MASK_LBR)
+#define XFEATURE_MASK_INDEPENDENT (XFEATURE_MASK_LBR)
/*
* Unsupported supervisor features. When a supervisor feature in this mask is
@@ -66,7 +75,7 @@
/* All supervisor states including supported and unsupported states. */
#define XFEATURE_MASK_SUPERVISOR_ALL (XFEATURE_MASK_SUPERVISOR_SUPPORTED | \
- XFEATURE_MASK_DYNAMIC | \
+ XFEATURE_MASK_INDEPENDENT | \
XFEATURE_MASK_SUPERVISOR_UNSUPPORTED)
#ifdef CONFIG_X86_64
@@ -82,17 +91,42 @@ static inline u64 xfeatures_mask_supervisor(void)
return xfeatures_mask_all & XFEATURE_MASK_SUPERVISOR_SUPPORTED;
}
-static inline u64 xfeatures_mask_user(void)
+/*
+ * The xfeatures which are enabled in XCR0 and expected to be in ptrace
+ * buffers and signal frames.
+ */
+static inline u64 xfeatures_mask_uabi(void)
{
return xfeatures_mask_all & XFEATURE_MASK_USER_SUPPORTED;
}
-static inline u64 xfeatures_mask_dynamic(void)
+/*
+ * The xfeatures which are restored by the kernel when returning to user
+ * mode. This is not necessarily the same as xfeatures_mask_uabi() as the
+ * kernel does not manage all XCR0 enabled features via xsave/xrstor as
+ * some of them have to be switched eagerly on context switch and exec().
+ */
+static inline u64 xfeatures_mask_restore_user(void)
+{
+ return xfeatures_mask_all & XFEATURE_MASK_USER_RESTORE;
+}
+
+/*
+ * Like xfeatures_mask_restore_user() but additionally restors the
+ * supported supervisor states.
+ */
+static inline u64 xfeatures_mask_fpstate(void)
+{
+ return xfeatures_mask_all & \
+ (XFEATURE_MASK_USER_RESTORE | XFEATURE_MASK_SUPERVISOR_SUPPORTED);
+}
+
+static inline u64 xfeatures_mask_independent(void)
{
if (!boot_cpu_has(X86_FEATURE_ARCH_LBR))
- return XFEATURE_MASK_DYNAMIC & ~XFEATURE_MASK_LBR;
+ return XFEATURE_MASK_INDEPENDENT & ~XFEATURE_MASK_LBR;
- return XFEATURE_MASK_DYNAMIC;
+ return XFEATURE_MASK_INDEPENDENT;
}
extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
@@ -101,19 +135,21 @@ extern void __init update_regset_xstate_info(unsigned int size,
u64 xstate_mask);
void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr);
-const void *get_xsave_field_ptr(int xfeature_nr);
-int using_compacted_format(void);
int xfeature_size(int xfeature_nr);
-struct membuf;
-void copy_xstate_to_kernel(struct membuf to, struct xregs_state *xsave);
-int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf);
-int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf);
-void copy_supervisor_to_kernel(struct xregs_state *xsave);
-void copy_dynamic_supervisor_to_kernel(struct xregs_state *xstate, u64 mask);
-void copy_kernel_to_dynamic_supervisor(struct xregs_state *xstate, u64 mask);
+int copy_uabi_from_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf);
+int copy_sigframe_from_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf);
+void xsaves(struct xregs_state *xsave, u64 mask);
+void xrstors(struct xregs_state *xsave, u64 mask);
-/* Validate an xstate header supplied by userspace (ptrace or sigreturn) */
-int validate_user_xstate_header(const struct xstate_header *hdr);
+enum xstate_copy_mode {
+ XSTATE_COPY_FP,
+ XSTATE_COPY_FX,
+ XSTATE_COPY_XSAVE,
+};
+
+struct membuf;
+void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk,
+ enum xstate_copy_mode mode);
#endif
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index 73d45b0dfff2..1345088e9902 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -312,8 +312,8 @@ static __always_inline void __##func(struct pt_regs *regs)
*/
#define DECLARE_IDTENTRY_VC(vector, func) \
DECLARE_IDTENTRY_RAW_ERRORCODE(vector, func); \
- __visible noinstr void ist_##func(struct pt_regs *regs, unsigned long error_code); \
- __visible noinstr void safe_stack_##func(struct pt_regs *regs, unsigned long error_code)
+ __visible noinstr void kernel_##func(struct pt_regs *regs, unsigned long error_code); \
+ __visible noinstr void user_##func(struct pt_regs *regs, unsigned long error_code)
/**
* DEFINE_IDTENTRY_IST - Emit code for IST entry points
@@ -355,33 +355,24 @@ static __always_inline void __##func(struct pt_regs *regs)
DEFINE_IDTENTRY_RAW_ERRORCODE(func)
/**
- * DEFINE_IDTENTRY_VC_SAFE_STACK - Emit code for VMM communication handler
- which runs on a safe stack.
+ * DEFINE_IDTENTRY_VC_KERNEL - Emit code for VMM communication handler
+ when raised from kernel mode
* @func: Function name of the entry point
*
* Maps to DEFINE_IDTENTRY_RAW_ERRORCODE
*/
-#define DEFINE_IDTENTRY_VC_SAFE_STACK(func) \
- DEFINE_IDTENTRY_RAW_ERRORCODE(safe_stack_##func)
+#define DEFINE_IDTENTRY_VC_KERNEL(func) \
+ DEFINE_IDTENTRY_RAW_ERRORCODE(kernel_##func)
/**
- * DEFINE_IDTENTRY_VC_IST - Emit code for VMM communication handler
- which runs on the VC fall-back stack
+ * DEFINE_IDTENTRY_VC_USER - Emit code for VMM communication handler
+ when raised from user mode
* @func: Function name of the entry point
*
* Maps to DEFINE_IDTENTRY_RAW_ERRORCODE
*/
-#define DEFINE_IDTENTRY_VC_IST(func) \
- DEFINE_IDTENTRY_RAW_ERRORCODE(ist_##func)
-
-/**
- * DEFINE_IDTENTRY_VC - Emit code for VMM communication handler
- * @func: Function name of the entry point
- *
- * Maps to DEFINE_IDTENTRY_RAW_ERRORCODE
- */
-#define DEFINE_IDTENTRY_VC(func) \
- DEFINE_IDTENTRY_RAW_ERRORCODE(func)
+#define DEFINE_IDTENTRY_VC_USER(func) \
+ DEFINE_IDTENTRY_RAW_ERRORCODE(user_##func)
#else /* CONFIG_X86_64 */
@@ -504,7 +495,7 @@ __visible noinstr void func(struct pt_regs *regs, \
.align 8
SYM_CODE_START(irq_entries_start)
vector=FIRST_EXTERNAL_VECTOR
- .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
+ .rept NR_EXTERNAL_VECTORS
UNWIND_HINT_IRET_REGS
0 :
.byte 0x6a, vector
@@ -520,7 +511,7 @@ SYM_CODE_END(irq_entries_start)
.align 8
SYM_CODE_START(spurious_entries_start)
vector=FIRST_SYSTEM_VECTOR
- .rept (NR_VECTORS - FIRST_SYSTEM_VECTOR)
+ .rept NR_SYSTEM_VECTORS
UNWIND_HINT_IRET_REGS
0 :
.byte 0x6a, vector
diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h
index 955b06d6325a..27158436f322 100644
--- a/arch/x86/include/asm/intel-family.h
+++ b/arch/x86/include/asm/intel-family.h
@@ -102,7 +102,8 @@
#define INTEL_FAM6_TIGERLAKE_L 0x8C /* Willow Cove */
#define INTEL_FAM6_TIGERLAKE 0x8D /* Willow Cove */
-#define INTEL_FAM6_SAPPHIRERAPIDS_X 0x8F /* Willow Cove */
+
+#define INTEL_FAM6_SAPPHIRERAPIDS_X 0x8F /* Golden Cove */
#define INTEL_FAM6_ALDERLAKE 0x97 /* Golden Cove / Gracemont */
#define INTEL_FAM6_ALDERLAKE_L 0x9A /* Golden Cove / Gracemont */
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 889f8b1b5b7f..43dcb9284208 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -26,8 +26,8 @@
* This file enumerates the exact layout of them:
*/
+/* This is used as an interrupt vector when programming the APIC. */
#define NMI_VECTOR 0x02
-#define MCE_VECTOR 0x12
/*
* IDT vectors usable for external interrupt sources start at 0x20.
@@ -84,7 +84,7 @@
*/
#define IRQ_WORK_VECTOR 0xf6
-#define UV_BAU_MESSAGE 0xf5
+/* 0xf5 - unused, was UV_BAU_MESSAGE */
#define DEFERRED_ERROR_VECTOR 0xf4
/* Vector on which hypervisor callbacks will be delivered */
@@ -114,6 +114,9 @@
#define FIRST_SYSTEM_VECTOR NR_VECTORS
#endif
+#define NR_EXTERNAL_VECTORS (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
+#define NR_SYSTEM_VECTORS (NR_VECTORS - FIRST_SYSTEM_VECTOR)
+
/*
* Size the maximum number of interrupts.
*
diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h
index 610a05374c02..0449b125d27f 100644
--- a/arch/x86/include/asm/jump_label.h
+++ b/arch/x86/include/asm/jump_label.h
@@ -4,8 +4,6 @@
#define HAVE_JUMP_LABEL_BATCH
-#define JUMP_LABEL_NOP_SIZE 5
-
#include <asm/asm.h>
#include <asm/nops.h>
@@ -14,15 +12,35 @@
#include <linux/stringify.h>
#include <linux/types.h>
+#define JUMP_TABLE_ENTRY \
+ ".pushsection __jump_table, \"aw\" \n\t" \
+ _ASM_ALIGN "\n\t" \
+ ".long 1b - . \n\t" \
+ ".long %l[l_yes] - . \n\t" \
+ _ASM_PTR "%c0 + %c1 - .\n\t" \
+ ".popsection \n\t"
+
+#ifdef CONFIG_STACK_VALIDATION
+
+static __always_inline bool arch_static_branch(struct static_key *key, bool branch)
+{
+ asm_volatile_goto("1:"
+ "jmp %l[l_yes] # objtool NOPs this \n\t"
+ JUMP_TABLE_ENTRY
+ : : "i" (key), "i" (2 | branch) : : l_yes);
+
+ return false;
+l_yes:
+ return true;
+}
+
+#else
+
static __always_inline bool arch_static_branch(struct static_key * const key, const bool branch)
{
asm_volatile_goto("1:"
".byte " __stringify(BYTES_NOP5) "\n\t"
- ".pushsection __jump_table, \"aw\" \n\t"
- _ASM_ALIGN "\n\t"
- ".long 1b - ., %l[l_yes] - . \n\t"
- _ASM_PTR "%c0 + %c1 - .\n\t"
- ".popsection \n\t"
+ JUMP_TABLE_ENTRY
: : "i" (key), "i" (branch) : : l_yes);
return false;
@@ -30,16 +48,13 @@ l_yes:
return true;
}
+#endif /* STACK_VALIDATION */
+
static __always_inline bool arch_static_branch_jump(struct static_key * const key, const bool branch)
{
asm_volatile_goto("1:"
- ".byte 0xe9\n\t .long %l[l_yes] - 2f\n\t"
- "2:\n\t"
- ".pushsection __jump_table, \"aw\" \n\t"
- _ASM_ALIGN "\n\t"
- ".long 1b - ., %l[l_yes] - . \n\t"
- _ASM_PTR "%c0 + %c1 - .\n\t"
- ".popsection \n\t"
+ "jmp %l[l_yes]\n\t"
+ JUMP_TABLE_ENTRY
: : "i" (key), "i" (branch) : : l_yes);
return false;
@@ -47,41 +62,7 @@ l_yes:
return true;
}
-#else /* __ASSEMBLY__ */
-
-.macro STATIC_JUMP_IF_TRUE target, key, def
-.Lstatic_jump_\@:
- .if \def
- /* Equivalent to "jmp.d32 \target" */
- .byte 0xe9
- .long \target - .Lstatic_jump_after_\@
-.Lstatic_jump_after_\@:
- .else
- .byte BYTES_NOP5
- .endif
- .pushsection __jump_table, "aw"
- _ASM_ALIGN
- .long .Lstatic_jump_\@ - ., \target - .
- _ASM_PTR \key - .
- .popsection
-.endm
-
-.macro STATIC_JUMP_IF_FALSE target, key, def
-.Lstatic_jump_\@:
- .if \def
- .byte BYTES_NOP5
- .else
- /* Equivalent to "jmp.d32 \target" */
- .byte 0xe9
- .long \target - .Lstatic_jump_after_\@
-.Lstatic_jump_after_\@:
- .endif
- .pushsection __jump_table, "aw"
- _ASM_ALIGN
- .long .Lstatic_jump_\@ - ., \target - .
- _ASM_PTR \key + 1 - .
- .popsection
-.endm
+extern int arch_jump_entry_size(struct jump_entry *entry);
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index ddfb3cad8dff..0607ec4f5091 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -305,7 +305,7 @@ extern void apei_mce_report_mem_error(int corrected,
/* These may be used by multiple smca_hwid_mcatypes */
enum smca_bank_types {
SMCA_LS = 0, /* Load Store */
- SMCA_LS_V2, /* Load Store */
+ SMCA_LS_V2,
SMCA_IF, /* Instruction Fetch */
SMCA_L2_CACHE, /* L2 Cache */
SMCA_DE, /* Decoder Unit */
@@ -314,17 +314,22 @@ enum smca_bank_types {
SMCA_FP, /* Floating Point */
SMCA_L3_CACHE, /* L3 Cache */
SMCA_CS, /* Coherent Slave */
- SMCA_CS_V2, /* Coherent Slave */
+ SMCA_CS_V2,
SMCA_PIE, /* Power, Interrupts, etc. */
SMCA_UMC, /* Unified Memory Controller */
+ SMCA_UMC_V2,
SMCA_PB, /* Parameter Block */
SMCA_PSP, /* Platform Security Processor */
- SMCA_PSP_V2, /* Platform Security Processor */
+ SMCA_PSP_V2,
SMCA_SMU, /* System Management Unit */
- SMCA_SMU_V2, /* System Management Unit */
+ SMCA_SMU_V2,
SMCA_MP5, /* Microprocessor 5 Unit */
SMCA_NBIO, /* Northbridge IO Unit */
SMCA_PCIE, /* PCI Express Unit */
+ SMCA_PCIE_V2,
+ SMCA_XGMI_PCS, /* xGMI PCS Unit */
+ SMCA_XGMI_PHY, /* xGMI PHY Unit */
+ SMCA_WAFL_PHY, /* WAFL PHY Unit */
N_SMCA_BANK_TYPES
};
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 211ba3375ee9..a7c413432b33 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -772,6 +772,10 @@
#define MSR_TFA_RTM_FORCE_ABORT_BIT 0
#define MSR_TFA_RTM_FORCE_ABORT BIT_ULL(MSR_TFA_RTM_FORCE_ABORT_BIT)
+#define MSR_TFA_TSX_CPUID_CLEAR_BIT 1
+#define MSR_TFA_TSX_CPUID_CLEAR BIT_ULL(MSR_TFA_TSX_CPUID_CLEAR_BIT)
+#define MSR_TFA_SDV_ENABLE_RTM_BIT 2
+#define MSR_TFA_SDV_ENABLE_RTM BIT_ULL(MSR_TFA_SDV_ENABLE_RTM_BIT)
/* P4/Xeon+ specific */
#define MSR_IA32_MCG_EAX 0x00000180
diff --git a/arch/x86/include/asm/nops.h b/arch/x86/include/asm/nops.h
index c1e5e818ba16..c5573eaa5bb9 100644
--- a/arch/x86/include/asm/nops.h
+++ b/arch/x86/include/asm/nops.h
@@ -2,6 +2,8 @@
#ifndef _ASM_X86_NOPS_H
#define _ASM_X86_NOPS_H
+#include <asm/asm.h>
+
/*
* Define nops for use with alternative() and for tracing.
*/
@@ -57,20 +59,14 @@
#endif /* CONFIG_64BIT */
-#ifdef __ASSEMBLY__
-#define _ASM_MK_NOP(x) .byte x
-#else
-#define _ASM_MK_NOP(x) ".byte " __stringify(x) "\n"
-#endif
-
-#define ASM_NOP1 _ASM_MK_NOP(BYTES_NOP1)
-#define ASM_NOP2 _ASM_MK_NOP(BYTES_NOP2)
-#define ASM_NOP3 _ASM_MK_NOP(BYTES_NOP3)
-#define ASM_NOP4 _ASM_MK_NOP(BYTES_NOP4)
-#define ASM_NOP5 _ASM_MK_NOP(BYTES_NOP5)
-#define ASM_NOP6 _ASM_MK_NOP(BYTES_NOP6)
-#define ASM_NOP7 _ASM_MK_NOP(BYTES_NOP7)
-#define ASM_NOP8 _ASM_MK_NOP(BYTES_NOP8)
+#define ASM_NOP1 _ASM_BYTES(BYTES_NOP1)
+#define ASM_NOP2 _ASM_BYTES(BYTES_NOP2)
+#define ASM_NOP3 _ASM_BYTES(BYTES_NOP3)
+#define ASM_NOP4 _ASM_BYTES(BYTES_NOP4)
+#define ASM_NOP5 _ASM_BYTES(BYTES_NOP5)
+#define ASM_NOP6 _ASM_BYTES(BYTES_NOP6)
+#define ASM_NOP7 _ASM_BYTES(BYTES_NOP7)
+#define ASM_NOP8 _ASM_BYTES(BYTES_NOP8)
#define ASM_NOP_MAX 8
diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
index 7555b48803a8..4d5810c8fab7 100644
--- a/arch/x86/include/asm/page.h
+++ b/arch/x86/include/asm/page.h
@@ -34,9 +34,9 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr,
copy_page(to, from);
}
-#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \
- alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
-#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+#define alloc_zeroed_user_highpage_movable(vma, vaddr) \
+ alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, vma, vaddr)
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE
#ifndef __pa
#define __pa(x) __phys_addr((unsigned long)(x))
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index ca840fec7776..4bde0dc66100 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -75,7 +75,7 @@ void copy_page(void *to, void *from);
*
* With page table isolation enabled, we map the LDT in ... [stay tuned]
*/
-static inline unsigned long task_size_max(void)
+static __always_inline unsigned long task_size_max(void)
{
unsigned long ret;
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 544f41a179fb..8fc1b5003713 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -478,6 +478,7 @@ struct x86_pmu_lbr {
extern void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap);
extern void perf_check_microcode(void);
+extern void perf_clear_dirty_counters(void);
extern int x86_perf_rdpmc_index(struct perf_event *event);
#else
static inline void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index 62ad61d6fefc..c7ec5bb88334 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -84,8 +84,6 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
set_pmd(pmd, __pmd(((pteval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE));
}
-#define pmd_pgtable(pmd) pmd_page(pmd)
-
#if CONFIG_PGTABLE_LEVELS > 2
extern void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index b1099f2d9800..448cd01eb3ec 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -23,7 +23,7 @@
#ifndef __ASSEMBLY__
#include <asm/x86_init.h>
-#include <asm/fpu/xstate.h>
+#include <asm/pkru.h>
#include <asm/fpu/api.h>
#include <asm-generic/pgtable_uffd.h>
@@ -126,35 +126,6 @@ static inline int pte_dirty(pte_t pte)
return pte_flags(pte) & _PAGE_DIRTY;
}
-
-static inline u32 read_pkru(void)
-{
- if (boot_cpu_has(X86_FEATURE_OSPKE))
- return rdpkru();
- return 0;
-}
-
-static inline void write_pkru(u32 pkru)
-{
- struct pkru_state *pk;
-
- if (!boot_cpu_has(X86_FEATURE_OSPKE))
- return;
-
- pk = get_xsave_addr(&current->thread.fpu.state.xsave, XFEATURE_PKRU);
-
- /*
- * The PKRU value in xstate needs to be in sync with the value that is
- * written to the CPU. The FPU restore on return to userland would
- * otherwise load the previous value again.
- */
- fpregs_lock();
- if (pk)
- pk->pkru = pkru;
- __write_pkru(pkru);
- fpregs_unlock();
-}
-
static inline int pte_young(pte_t pte)
{
return pte_flags(pte) & _PAGE_ACCESSED;
@@ -865,9 +836,9 @@ static inline int pud_present(pud_t pud)
return pud_flags(pud) & _PAGE_PRESENT;
}
-static inline unsigned long pud_page_vaddr(pud_t pud)
+static inline pmd_t *pud_pgtable(pud_t pud)
{
- return (unsigned long)__va(pud_val(pud) & pud_pfn_mask(pud));
+ return (pmd_t *)__va(pud_val(pud) & pud_pfn_mask(pud));
}
/*
@@ -906,9 +877,9 @@ static inline int p4d_present(p4d_t p4d)
return p4d_flags(p4d) & _PAGE_PRESENT;
}
-static inline unsigned long p4d_page_vaddr(p4d_t p4d)
+static inline pud_t *p4d_pgtable(p4d_t p4d)
{
- return (unsigned long)__va(p4d_val(p4d) & p4d_pfn_mask(p4d));
+ return (pud_t *)__va(p4d_val(p4d) & p4d_pfn_mask(p4d));
}
/*
@@ -1360,32 +1331,6 @@ static inline pmd_t pmd_swp_clear_uffd_wp(pmd_t pmd)
}
#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */
-#define PKRU_AD_BIT 0x1
-#define PKRU_WD_BIT 0x2
-#define PKRU_BITS_PER_PKEY 2
-
-#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
-extern u32 init_pkru_value;
-#else
-#define init_pkru_value 0
-#endif
-
-static inline bool __pkru_allows_read(u32 pkru, u16 pkey)
-{
- int pkru_pkey_bits = pkey * PKRU_BITS_PER_PKEY;
- return !(pkru & (PKRU_AD_BIT << pkru_pkey_bits));
-}
-
-static inline bool __pkru_allows_write(u32 pkru, u16 pkey)
-{
- int pkru_pkey_bits = pkey * PKRU_BITS_PER_PKEY;
- /*
- * Access-disable disables writes too so we need to check
- * both bits here.
- */
- return !(pkru & ((PKRU_AD_BIT|PKRU_WD_BIT) << pkru_pkey_bits));
-}
-
static inline u16 pte_flags_pkey(unsigned long pte_flags)
{
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index f24d7ef8fffa..40497a9020c6 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -7,8 +7,6 @@
#include <asm/page_types.h>
-#define FIRST_USER_ADDRESS 0UL
-
#define _PAGE_BIT_PRESENT 0 /* is present */
#define _PAGE_BIT_RW 1 /* writeable */
#define _PAGE_BIT_USER 2 /* userspace addressable */
diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h
index 2ff9b98812b7..5c7bcaa79623 100644
--- a/arch/x86/include/asm/pkeys.h
+++ b/arch/x86/include/asm/pkeys.h
@@ -9,14 +9,14 @@
* will be necessary to ensure that the types that store key
* numbers and masks have sufficient capacity.
*/
-#define arch_max_pkey() (boot_cpu_has(X86_FEATURE_OSPKE) ? 16 : 1)
+#define arch_max_pkey() (cpu_feature_enabled(X86_FEATURE_OSPKE) ? 16 : 1)
extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
unsigned long init_val);
static inline bool arch_pkeys_enabled(void)
{
- return boot_cpu_has(X86_FEATURE_OSPKE);
+ return cpu_feature_enabled(X86_FEATURE_OSPKE);
}
/*
@@ -26,7 +26,7 @@ static inline bool arch_pkeys_enabled(void)
extern int __execute_only_pkey(struct mm_struct *mm);
static inline int execute_only_pkey(struct mm_struct *mm)
{
- if (!boot_cpu_has(X86_FEATURE_OSPKE))
+ if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
return ARCH_DEFAULT_PKEY;
return __execute_only_pkey(mm);
@@ -37,7 +37,7 @@ extern int __arch_override_mprotect_pkey(struct vm_area_struct *vma,
static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma,
int prot, int pkey)
{
- if (!boot_cpu_has(X86_FEATURE_OSPKE))
+ if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
return 0;
return __arch_override_mprotect_pkey(vma, prot, pkey);
@@ -124,7 +124,6 @@ extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
unsigned long init_val);
extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
unsigned long init_val);
-extern void copy_init_pkru_to_fpregs(void);
static inline int vma_pkey(struct vm_area_struct *vma)
{
diff --git a/arch/x86/include/asm/pkru.h b/arch/x86/include/asm/pkru.h
new file mode 100644
index 000000000000..ccc539faa5bb
--- /dev/null
+++ b/arch/x86/include/asm/pkru.h
@@ -0,0 +1,62 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_PKRU_H
+#define _ASM_X86_PKRU_H
+
+#include <asm/fpu/xstate.h>
+
+#define PKRU_AD_BIT 0x1
+#define PKRU_WD_BIT 0x2
+#define PKRU_BITS_PER_PKEY 2
+
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+extern u32 init_pkru_value;
+#define pkru_get_init_value() READ_ONCE(init_pkru_value)
+#else
+#define init_pkru_value 0
+#define pkru_get_init_value() 0
+#endif
+
+static inline bool __pkru_allows_read(u32 pkru, u16 pkey)
+{
+ int pkru_pkey_bits = pkey * PKRU_BITS_PER_PKEY;
+ return !(pkru & (PKRU_AD_BIT << pkru_pkey_bits));
+}
+
+static inline bool __pkru_allows_write(u32 pkru, u16 pkey)
+{
+ int pkru_pkey_bits = pkey * PKRU_BITS_PER_PKEY;
+ /*
+ * Access-disable disables writes too so we need to check
+ * both bits here.
+ */
+ return !(pkru & ((PKRU_AD_BIT|PKRU_WD_BIT) << pkru_pkey_bits));
+}
+
+static inline u32 read_pkru(void)
+{
+ if (cpu_feature_enabled(X86_FEATURE_OSPKE))
+ return rdpkru();
+ return 0;
+}
+
+static inline void write_pkru(u32 pkru)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
+ return;
+ /*
+ * WRPKRU is relatively expensive compared to RDPKRU.
+ * Avoid WRPKRU when it would not change the value.
+ */
+ if (pkru != rdpkru())
+ wrpkru(pkru);
+}
+
+static inline void pkru_write_default(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
+ return;
+
+ wrpkru(pkru_get_init_value());
+}
+
+#endif
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index f8cb8af4de5c..fe5efbcba824 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -44,7 +44,7 @@ static __always_inline void preempt_count_set(int pc)
#define init_task_preempt_count(p) do { } while (0)
#define init_idle_preempt_count(p, cpu) do { \
- per_cpu(__preempt_count, (cpu)) = PREEMPT_ENABLED; \
+ per_cpu(__preempt_count, (cpu)) = PREEMPT_DISABLED; \
} while (0)
/*
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 556b2b17c3e2..f3020c54e2cb 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -518,6 +518,15 @@ struct thread_struct {
unsigned int sig_on_uaccess_err:1;
+ /*
+ * Protection Keys Register for Userspace. Loaded immediately on
+ * context switch. Store it in thread_struct to avoid a lookup in
+ * the tasks's FPU xstate buffer. This value is only valid when a
+ * task is scheduled out. For 'current' the authoritative source of
+ * PKRU is the hardware itself.
+ */
+ u32 pkru;
+
/* Floating point and extended processor state */
struct fpu fpu;
/*
@@ -663,6 +672,7 @@ extern void load_direct_gdt(int);
extern void load_fixmap_gdt(int);
extern void load_percpu_segment(int);
extern void cpu_init(void);
+extern void cpu_init_secondary(void);
extern void cpu_init_exception_handling(void);
extern void cr4_init(void);
diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h
index 629c3df243f0..2cef6c5a52c2 100644
--- a/arch/x86/include/asm/sev-common.h
+++ b/arch/x86/include/asm/sev-common.h
@@ -9,8 +9,13 @@
#define __ASM_X86_SEV_COMMON_H
#define GHCB_MSR_INFO_POS 0
-#define GHCB_MSR_INFO_MASK (BIT_ULL(12) - 1)
+#define GHCB_DATA_LOW 12
+#define GHCB_MSR_INFO_MASK (BIT_ULL(GHCB_DATA_LOW) - 1)
+#define GHCB_DATA(v) \
+ (((unsigned long)(v) & ~GHCB_MSR_INFO_MASK) >> GHCB_DATA_LOW)
+
+/* SEV Information Request/Response */
#define GHCB_MSR_SEV_INFO_RESP 0x001
#define GHCB_MSR_SEV_INFO_REQ 0x002
#define GHCB_MSR_VER_MAX_POS 48
@@ -28,6 +33,7 @@
#define GHCB_MSR_PROTO_MAX(v) (((v) >> GHCB_MSR_VER_MAX_POS) & GHCB_MSR_VER_MAX_MASK)
#define GHCB_MSR_PROTO_MIN(v) (((v) >> GHCB_MSR_VER_MIN_POS) & GHCB_MSR_VER_MIN_MASK)
+/* CPUID Request/Response */
#define GHCB_MSR_CPUID_REQ 0x004
#define GHCB_MSR_CPUID_RESP 0x005
#define GHCB_MSR_CPUID_FUNC_POS 32
@@ -45,6 +51,14 @@
(((unsigned long)reg & GHCB_MSR_CPUID_REG_MASK) << GHCB_MSR_CPUID_REG_POS) | \
(((unsigned long)fn) << GHCB_MSR_CPUID_FUNC_POS))
+/* AP Reset Hold */
+#define GHCB_MSR_AP_RESET_HOLD_REQ 0x006
+#define GHCB_MSR_AP_RESET_HOLD_RESP 0x007
+
+/* GHCB Hypervisor Feature Request/Response */
+#define GHCB_MSR_HV_FT_REQ 0x080
+#define GHCB_MSR_HV_FT_RESP 0x081
+
#define GHCB_MSR_TERM_REQ 0x100
#define GHCB_MSR_TERM_REASON_SET_POS 12
#define GHCB_MSR_TERM_REASON_SET_MASK 0xf
diff --git a/arch/x86/include/asm/sgx.h b/arch/x86/include/asm/sgx.h
index 9c31e0ebc55b..05f3e21f01a7 100644
--- a/arch/x86/include/asm/sgx.h
+++ b/arch/x86/include/asm/sgx.h
@@ -13,7 +13,7 @@
/*
* This file contains both data structures defined by SGX architecture and Linux
* defined software data structures and functions. The two should not be mixed
- * together for better readibility. The architectural definitions come first.
+ * together for better readability. The architectural definitions come first.
*/
/* The SGX specific CPUID function. */
diff --git a/arch/x86/include/asm/sigframe.h b/arch/x86/include/asm/sigframe.h
index 84eab2724875..5b1ed650b124 100644
--- a/arch/x86/include/asm/sigframe.h
+++ b/arch/x86/include/asm/sigframe.h
@@ -85,4 +85,6 @@ struct rt_sigframe_x32 {
#endif /* CONFIG_X86_64 */
+void __init init_sigframe_size(void);
+
#endif /* _ASM_X86_SIGFRAME_H */
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index 2acd6cb62328..f3fbb84ff8a7 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -104,25 +104,13 @@ static inline void wrpkru(u32 pkru)
: : "a" (pkru), "c"(ecx), "d"(edx));
}
-static inline void __write_pkru(u32 pkru)
-{
- /*
- * WRPKRU is relatively expensive compared to RDPKRU.
- * Avoid WRPKRU when it would not change the value.
- */
- if (pkru == rdpkru())
- return;
-
- wrpkru(pkru);
-}
-
#else
static inline u32 rdpkru(void)
{
return 0;
}
-static inline void __write_pkru(u32 pkru)
+static inline void wrpkru(u32 pkru)
{
}
#endif
diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
index b6ffe58c70fa..24a8d6c4fb18 100644
--- a/arch/x86/include/asm/stackprotector.h
+++ b/arch/x86/include/asm/stackprotector.h
@@ -11,7 +11,7 @@
* The same segment is shared by percpu area and stack canary. On
* x86_64, percpu symbols are zero based and %gs (64-bit) points to the
* base of percpu area. The first occupant of the percpu area is always
- * fixed_percpu_data which contains stack_canary at the approproate
+ * fixed_percpu_data which contains stack_canary at the appropriate
* offset. On x86_32, the stack canary is just a regular percpu
* variable.
*
diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
index 7cbf733d11af..f7e2d82d24fb 100644
--- a/arch/x86/include/asm/syscall.h
+++ b/arch/x86/include/asm/syscall.h
@@ -21,13 +21,12 @@ extern const sys_call_ptr_t sys_call_table[];
#if defined(CONFIG_X86_32)
#define ia32_sys_call_table sys_call_table
-#endif
-
-#if defined(CONFIG_IA32_EMULATION)
+#else
+/*
+ * These may not exist, but still put the prototypes in so we
+ * can use IS_ENABLED().
+ */
extern const sys_call_ptr_t ia32_sys_call_table[];
-#endif
-
-#ifdef CONFIG_X86_X32_ABI
extern const sys_call_ptr_t x32_sys_call_table[];
#endif
@@ -160,7 +159,7 @@ static inline int syscall_get_arch(struct task_struct *task)
? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;
}
-void do_syscall_64(unsigned long nr, struct pt_regs *regs);
+void do_syscall_64(struct pt_regs *regs, int nr);
void do_int80_syscall_32(struct pt_regs *regs);
long do_fast_syscall_32(struct pt_regs *regs);
diff --git a/arch/x86/include/asm/syscall_wrapper.h b/arch/x86/include/asm/syscall_wrapper.h
index 80c08c7d5e72..6a2827d0681f 100644
--- a/arch/x86/include/asm/syscall_wrapper.h
+++ b/arch/x86/include/asm/syscall_wrapper.h
@@ -17,7 +17,7 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs);
* __x64_sys_*() - 64-bit native syscall
* __ia32_sys_*() - 32-bit native syscall or common compat syscall
* __ia32_compat_sys_*() - 32-bit compat syscall
- * __x32_compat_sys_*() - 64-bit X32 compat syscall
+ * __x64_compat_sys_*() - 64-bit X32 compat syscall
*
* The registers are decoded according to the ABI:
* 64-bit: RDI, RSI, RDX, R10, R8, R9
@@ -166,17 +166,17 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs);
* with x86_64 obviously do not need such care.
*/
#define __X32_COMPAT_SYS_STUB0(name) \
- __SYS_STUB0(x32, compat_sys_##name)
+ __SYS_STUB0(x64, compat_sys_##name)
#define __X32_COMPAT_SYS_STUBx(x, name, ...) \
- __SYS_STUBx(x32, compat_sys##name, \
+ __SYS_STUBx(x64, compat_sys##name, \
SC_X86_64_REGS_TO_ARGS(x, __VA_ARGS__))
#define __X32_COMPAT_COND_SYSCALL(name) \
- __COND_SYSCALL(x32, compat_sys_##name)
+ __COND_SYSCALL(x64, compat_sys_##name)
#define __X32_COMPAT_SYS_NI(name) \
- __SYS_NI(x32, compat_sys_##name)
+ __SYS_NI(x64, compat_sys_##name)
#else /* CONFIG_X86_X32 */
#define __X32_COMPAT_SYS_STUB0(name)
#define __X32_COMPAT_SYS_STUBx(x, name, ...)
diff --git a/arch/x86/include/asm/unaligned.h b/arch/x86/include/asm/unaligned.h
deleted file mode 100644
index 9c754a7447aa..000000000000
--- a/arch/x86/include/asm/unaligned.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_X86_UNALIGNED_H
-#define _ASM_X86_UNALIGNED_H
-
-/*
- * The x86 can do unaligned accesses itself.
- */
-
-#include <linux/unaligned/access_ok.h>
-#include <linux/unaligned/generic.h>
-
-#define get_unaligned __get_unaligned_le
-#define put_unaligned __put_unaligned_le
-
-#endif /* _ASM_X86_UNALIGNED_H */
diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h
index c1c3d31b15c0..80e9d5206a71 100644
--- a/arch/x86/include/asm/unistd.h
+++ b/arch/x86/include/asm/unistd.h
@@ -13,7 +13,7 @@
# define __ARCH_WANT_SYS_OLD_MMAP
# define __ARCH_WANT_SYS_OLD_SELECT
-# define __NR_ia32_syscall_max __NR_syscall_max
+# define IA32_NR_syscalls (__NR_syscalls)
# else
@@ -26,12 +26,12 @@
# define __ARCH_WANT_COMPAT_SYS_PWRITEV64
# define __ARCH_WANT_COMPAT_SYS_PREADV64V2
# define __ARCH_WANT_COMPAT_SYS_PWRITEV64V2
+# define X32_NR_syscalls (__NR_x32_syscalls)
+# define IA32_NR_syscalls (__NR_ia32_syscalls)
# endif
-# define NR_syscalls (__NR_syscall_max + 1)
-# define X32_NR_syscalls (__NR_x32_syscall_max + 1)
-# define IA32_NR_syscalls (__NR_ia32_syscall_max + 1)
+# define NR_syscalls (__NR_syscalls)
# define __ARCH_WANT_NEW_STAT
# define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/include/uapi/asm/auxvec.h b/arch/x86/include/uapi/asm/auxvec.h
index 580e3c567046..6beb55bbefa4 100644
--- a/arch/x86/include/uapi/asm/auxvec.h
+++ b/arch/x86/include/uapi/asm/auxvec.h
@@ -12,9 +12,9 @@
/* entries in ARCH_DLINFO: */
#if defined(CONFIG_IA32_EMULATION) || !defined(CONFIG_X86_64)
-# define AT_VECTOR_SIZE_ARCH 2
+# define AT_VECTOR_SIZE_ARCH 3
#else /* else it's non-compat x86-64 */
-# define AT_VECTOR_SIZE_ARCH 1
+# define AT_VECTOR_SIZE_ARCH 2
#endif
#endif /* _ASM_X86_AUXVEC_H */
diff --git a/arch/x86/include/uapi/asm/hwcap2.h b/arch/x86/include/uapi/asm/hwcap2.h
index 5fdfcb47000f..054604aba9f0 100644
--- a/arch/x86/include/uapi/asm/hwcap2.h
+++ b/arch/x86/include/uapi/asm/hwcap2.h
@@ -2,10 +2,12 @@
#ifndef _ASM_X86_HWCAP2_H
#define _ASM_X86_HWCAP2_H
+#include <linux/const.h>
+
/* MONITOR/MWAIT enabled in Ring 3 */
-#define HWCAP2_RING3MWAIT (1 << 0)
+#define HWCAP2_RING3MWAIT _BITUL(0)
/* Kernel allows FSGSBASE instructions available in Ring 3 */
-#define HWCAP2_FSGSBASE BIT(1)
+#define HWCAP2_FSGSBASE _BITUL(1)
#endif
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 0f66682ac02a..3e625c61f008 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -102,6 +102,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += ftrace_$(BITS).o
obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o
obj-$(CONFIG_X86_TSC) += trace_clock.o
+obj-$(CONFIG_TRACING) += trace.o
obj-$(CONFIG_CRASH_CORE) += crash_core_$(BITS).o
obj-$(CONFIG_KEXEC_CORE) += machine_kexec_$(BITS).o
obj-$(CONFIG_KEXEC_CORE) += relocate_kernel_$(BITS).o crash.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index e90310cbe73a..e55e0c1fad8c 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -5,6 +5,7 @@
* Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
* Copyright (C) 2001 Jun Nakajima <jun.nakajima@intel.com>
*/
+#define pr_fmt(fmt) "ACPI: " fmt
#include <linux/init.h>
#include <linux/acpi.h>
@@ -42,8 +43,6 @@ EXPORT_SYMBOL(acpi_disabled);
# include <asm/proto.h>
#endif /* X86 */
-#define PREFIX "ACPI: "
-
int acpi_noirq; /* skip ACPI IRQ initialization */
static int acpi_nobgrt; /* skip ACPI BGRT */
int acpi_pci_disabled; /* skip ACPI PCI scan and IRQ initialization */
@@ -130,15 +129,14 @@ static int __init acpi_parse_madt(struct acpi_table_header *table)
madt = (struct acpi_table_madt *)table;
if (!madt) {
- printk(KERN_WARNING PREFIX "Unable to map MADT\n");
+ pr_warn("Unable to map MADT\n");
return -ENODEV;
}
if (madt->address) {
acpi_lapic_addr = (u64) madt->address;
- printk(KERN_DEBUG PREFIX "Local APIC address 0x%08x\n",
- madt->address);
+ pr_debug("Local APIC address 0x%08x\n", madt->address);
}
default_acpi_madt_oem_check(madt->header.oem_id,
@@ -161,7 +159,7 @@ static int acpi_register_lapic(int id, u32 acpiid, u8 enabled)
int cpu;
if (id >= MAX_LOCAL_APIC) {
- printk(KERN_INFO PREFIX "skipped apicid that is too big\n");
+ pr_info("skipped apicid that is too big\n");
return -EINVAL;
}
@@ -213,13 +211,13 @@ acpi_parse_x2apic(union acpi_subtable_headers *header, const unsigned long end)
*/
if (!apic->apic_id_valid(apic_id)) {
if (enabled)
- pr_warn(PREFIX "x2apic entry ignored\n");
+ pr_warn("x2apic entry ignored\n");
return 0;
}
acpi_register_lapic(apic_id, processor->uid, enabled);
#else
- printk(KERN_WARNING PREFIX "x2apic entry ignored\n");
+ pr_warn("x2apic entry ignored\n");
#endif
return 0;
@@ -306,7 +304,7 @@ acpi_parse_x2apic_nmi(union acpi_subtable_headers *header,
acpi_table_print_madt_entry(&header->common);
if (x2apic_nmi->lint != 1)
- printk(KERN_WARNING PREFIX "NMI not connected to LINT 1!\n");
+ pr_warn("NMI not connected to LINT 1!\n");
return 0;
}
@@ -324,7 +322,7 @@ acpi_parse_lapic_nmi(union acpi_subtable_headers * header, const unsigned long e
acpi_table_print_madt_entry(&header->common);
if (lapic_nmi->lint != 1)
- printk(KERN_WARNING PREFIX "NMI not connected to LINT 1!\n");
+ pr_warn("NMI not connected to LINT 1!\n");
return 0;
}
@@ -514,14 +512,14 @@ acpi_parse_int_src_ovr(union acpi_subtable_headers * header,
if (intsrc->source_irq == 0) {
if (acpi_skip_timer_override) {
- printk(PREFIX "BIOS IRQ0 override ignored.\n");
+ pr_warn("BIOS IRQ0 override ignored.\n");
return 0;
}
if ((intsrc->global_irq == 2) && acpi_fix_pin2_polarity
&& (intsrc->inti_flags & ACPI_MADT_POLARITY_MASK)) {
intsrc->inti_flags &= ~ACPI_MADT_POLARITY_MASK;
- printk(PREFIX "BIOS IRQ0 pin2 override: forcing polarity to high active.\n");
+ pr_warn("BIOS IRQ0 pin2 override: forcing polarity to high active.\n");
}
}
@@ -597,7 +595,7 @@ void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger)
if (old == new)
return;
- printk(PREFIX "setting ELCR to %04x (from %04x)\n", new, old);
+ pr_warn("setting ELCR to %04x (from %04x)\n", new, old);
outb(new, 0x4d0);
outb(new >> 8, 0x4d1);
}
@@ -754,7 +752,7 @@ int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, u32 acpi_id,
cpu = acpi_register_lapic(physid, acpi_id, ACPI_MADT_ENABLED);
if (cpu < 0) {
- pr_info(PREFIX "Unable to map lapic to logical cpu number\n");
+ pr_info("Unable to map lapic to logical cpu number\n");
return cpu;
}
@@ -870,8 +868,7 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table)
struct acpi_table_hpet *hpet_tbl = (struct acpi_table_hpet *)table;
if (hpet_tbl->address.space_id != ACPI_SPACE_MEM) {
- printk(KERN_WARNING PREFIX "HPET timers must be located in "
- "memory.\n");
+ pr_warn("HPET timers must be located in memory.\n");
return -1;
}
@@ -883,9 +880,7 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table)
* want to allocate a resource there.
*/
if (!hpet_address) {
- printk(KERN_WARNING PREFIX
- "HPET id: %#x base: %#lx is invalid\n",
- hpet_tbl->id, hpet_address);
+ pr_warn("HPET id: %#x base: %#lx is invalid\n", hpet_tbl->id, hpet_address);
return 0;
}
#ifdef CONFIG_X86_64
@@ -896,21 +891,17 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table)
*/
if (hpet_address == 0xfed0000000000000UL) {
if (!hpet_force_user) {
- printk(KERN_WARNING PREFIX "HPET id: %#x "
- "base: 0xfed0000000000000 is bogus\n "
- "try hpet=force on the kernel command line to "
- "fix it up to 0xfed00000.\n", hpet_tbl->id);
+ pr_warn("HPET id: %#x base: 0xfed0000000000000 is bogus, try hpet=force on the kernel command line to fix it up to 0xfed00000.\n",
+ hpet_tbl->id);
hpet_address = 0;
return 0;
}
- printk(KERN_WARNING PREFIX
- "HPET id: %#x base: 0xfed0000000000000 fixed up "
- "to 0xfed00000.\n", hpet_tbl->id);
+ pr_warn("HPET id: %#x base: 0xfed0000000000000 fixed up to 0xfed00000.\n",
+ hpet_tbl->id);
hpet_address >>= 32;
}
#endif
- printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n",
- hpet_tbl->id, hpet_address);
+ pr_info("HPET id: %#x base: %#lx\n", hpet_tbl->id, hpet_address);
/*
* Allocate and initialize the HPET firmware resource for adding into
@@ -955,24 +946,24 @@ late_initcall(hpet_insert_resource);
static int __init acpi_parse_fadt(struct acpi_table_header *table)
{
if (!(acpi_gbl_FADT.boot_flags & ACPI_FADT_LEGACY_DEVICES)) {
- pr_debug("ACPI: no legacy devices present\n");
+ pr_debug("no legacy devices present\n");
x86_platform.legacy.devices.pnpbios = 0;
}
if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID &&
!(acpi_gbl_FADT.boot_flags & ACPI_FADT_8042) &&
x86_platform.legacy.i8042 != X86_LEGACY_I8042_PLATFORM_ABSENT) {
- pr_debug("ACPI: i8042 controller is absent\n");
+ pr_debug("i8042 controller is absent\n");
x86_platform.legacy.i8042 = X86_LEGACY_I8042_FIRMWARE_ABSENT;
}
if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_CMOS_RTC) {
- pr_debug("ACPI: not registering RTC platform device\n");
+ pr_debug("not registering RTC platform device\n");
x86_platform.legacy.rtc = 0;
}
if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_VGA) {
- pr_debug("ACPI: probing for VGA not safe\n");
+ pr_debug("probing for VGA not safe\n");
x86_platform.legacy.no_vga = 1;
}
@@ -997,8 +988,7 @@ static int __init acpi_parse_fadt(struct acpi_table_header *table)
pmtmr_ioport = acpi_gbl_FADT.pm_timer_block;
}
if (pmtmr_ioport)
- printk(KERN_INFO PREFIX "PM-Timer IO Port: %#x\n",
- pmtmr_ioport);
+ pr_info("PM-Timer IO Port: %#x\n", pmtmr_ioport);
#endif
return 0;
}
@@ -1024,8 +1014,7 @@ static int __init early_acpi_parse_madt_lapic_addr_ovr(void)
count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE,
acpi_parse_lapic_addr_ovr, 0);
if (count < 0) {
- printk(KERN_ERR PREFIX
- "Error parsing LAPIC address override entry\n");
+ pr_err("Error parsing LAPIC address override entry\n");
return count;
}
@@ -1057,8 +1046,7 @@ static int __init acpi_parse_madt_lapic_entries(void)
sizeof(struct acpi_table_madt),
madt_proc, ARRAY_SIZE(madt_proc), MAX_LOCAL_APIC);
if (ret < 0) {
- printk(KERN_ERR PREFIX
- "Error parsing LAPIC/X2APIC entries\n");
+ pr_err("Error parsing LAPIC/X2APIC entries\n");
return ret;
}
@@ -1066,11 +1054,11 @@ static int __init acpi_parse_madt_lapic_entries(void)
x2count = madt_proc[1].count;
}
if (!count && !x2count) {
- printk(KERN_ERR PREFIX "No LAPIC entries present\n");
+ pr_err("No LAPIC entries present\n");
/* TBD: Cleanup to allow fallback to MPS */
return -ENODEV;
} else if (count < 0 || x2count < 0) {
- printk(KERN_ERR PREFIX "Error parsing LAPIC entry\n");
+ pr_err("Error parsing LAPIC entry\n");
/* TBD: Cleanup to allow fallback to MPS */
return count;
}
@@ -1080,7 +1068,7 @@ static int __init acpi_parse_madt_lapic_entries(void)
count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_NMI,
acpi_parse_lapic_nmi, 0);
if (count < 0 || x2count < 0) {
- printk(KERN_ERR PREFIX "Error parsing LAPIC NMI entry\n");
+ pr_err("Error parsing LAPIC NMI entry\n");
/* TBD: Cleanup to allow fallback to MPS */
return count;
}
@@ -1139,7 +1127,7 @@ static void __init mp_config_acpi_legacy_irqs(void)
}
if (idx != mp_irq_entries) {
- printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
+ pr_debug("ACPI: IRQ%d used by override.\n", i);
continue; /* IRQ already used */
}
@@ -1179,26 +1167,24 @@ static int __init acpi_parse_madt_ioapic_entries(void)
* if "noapic" boot option, don't look for IO-APICs
*/
if (skip_ioapic_setup) {
- printk(KERN_INFO PREFIX "Skipping IOAPIC probe "
- "due to 'noapic' option.\n");
+ pr_info("Skipping IOAPIC probe due to 'noapic' option.\n");
return -ENODEV;
}
count = acpi_table_parse_madt(ACPI_MADT_TYPE_IO_APIC, acpi_parse_ioapic,
MAX_IO_APICS);
if (!count) {
- printk(KERN_ERR PREFIX "No IOAPIC entries present\n");
+ pr_err("No IOAPIC entries present\n");
return -ENODEV;
} else if (count < 0) {
- printk(KERN_ERR PREFIX "Error parsing IOAPIC entry\n");
+ pr_err("Error parsing IOAPIC entry\n");
return count;
}
count = acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE,
acpi_parse_int_src_ovr, nr_irqs);
if (count < 0) {
- printk(KERN_ERR PREFIX
- "Error parsing interrupt source overrides entry\n");
+ pr_err("Error parsing interrupt source overrides entry\n");
/* TBD: Cleanup to allow fallback to MPS */
return count;
}
@@ -1218,7 +1204,7 @@ static int __init acpi_parse_madt_ioapic_entries(void)
count = acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE,
acpi_parse_nmi_src, nr_irqs);
if (count < 0) {
- printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n");
+ pr_err("Error parsing NMI SRC entry\n");
/* TBD: Cleanup to allow fallback to MPS */
return count;
}
@@ -1251,8 +1237,7 @@ static void __init early_acpi_process_madt(void)
/*
* Dell Precision Workstation 410, 610 come here.
*/
- printk(KERN_ERR PREFIX
- "Invalid BIOS MADT, disabling ACPI\n");
+ pr_err("Invalid BIOS MADT, disabling ACPI\n");
disable_acpi();
}
}
@@ -1289,8 +1274,7 @@ static void __init acpi_process_madt(void)
/*
* Dell Precision Workstation 410, 610 come here.
*/
- printk(KERN_ERR PREFIX
- "Invalid BIOS MADT, disabling ACPI\n");
+ pr_err("Invalid BIOS MADT, disabling ACPI\n");
disable_acpi();
}
} else {
@@ -1300,8 +1284,7 @@ static void __init acpi_process_madt(void)
* Boot with "acpi=off" to use MPS on such a system.
*/
if (smp_found_config) {
- printk(KERN_WARNING PREFIX
- "No APIC-table, disabling MPS\n");
+ pr_warn("No APIC-table, disabling MPS\n");
smp_found_config = 0;
}
}
@@ -1311,11 +1294,9 @@ static void __init acpi_process_madt(void)
* processors, where MPS only supports physical.
*/
if (acpi_lapic && acpi_ioapic)
- printk(KERN_INFO "Using ACPI (MADT) for SMP configuration "
- "information\n");
+ pr_info("Using ACPI (MADT) for SMP configuration information\n");
else if (acpi_lapic)
- printk(KERN_INFO "Using ACPI for processor (LAPIC) "
- "configuration information\n");
+ pr_info("Using ACPI for processor (LAPIC) configuration information\n");
#endif
return;
}
@@ -1323,8 +1304,7 @@ static void __init acpi_process_madt(void)
static int __init disable_acpi_irq(const struct dmi_system_id *d)
{
if (!acpi_force) {
- printk(KERN_NOTICE "%s detected: force use of acpi=noirq\n",
- d->ident);
+ pr_notice("%s detected: force use of acpi=noirq\n", d->ident);
acpi_noirq_set();
}
return 0;
@@ -1333,8 +1313,7 @@ static int __init disable_acpi_irq(const struct dmi_system_id *d)
static int __init disable_acpi_pci(const struct dmi_system_id *d)
{
if (!acpi_force) {
- printk(KERN_NOTICE "%s detected: force use of pci=noacpi\n",
- d->ident);
+ pr_notice("%s detected: force use of pci=noacpi\n", d->ident);
acpi_disable_pci();
}
return 0;
@@ -1343,11 +1322,10 @@ static int __init disable_acpi_pci(const struct dmi_system_id *d)
static int __init dmi_disable_acpi(const struct dmi_system_id *d)
{
if (!acpi_force) {
- printk(KERN_NOTICE "%s detected: acpi off\n", d->ident);
+ pr_notice("%s detected: acpi off\n", d->ident);
disable_acpi();
} else {
- printk(KERN_NOTICE
- "Warning: DMI blacklist says broken, but acpi forced\n");
+ pr_notice("Warning: DMI blacklist says broken, but acpi forced\n");
}
return 0;
}
@@ -1574,9 +1552,9 @@ int __init early_acpi_boot_init(void)
*/
if (acpi_blacklisted()) {
if (acpi_force) {
- printk(KERN_WARNING PREFIX "acpi=force override\n");
+ pr_warn("acpi=force override\n");
} else {
- printk(KERN_WARNING PREFIX "Disabling ACPI support\n");
+ pr_warn("Disabling ACPI support\n");
disable_acpi();
return 1;
}
@@ -1692,9 +1670,7 @@ int __init acpi_mps_check(void)
#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_X86_MPPARSE)
/* mptable code is not built-in*/
if (acpi_disabled || acpi_noirq) {
- printk(KERN_WARNING "MPS support code is not built-in.\n"
- "Using acpi=off or acpi=noirq or pci=noacpi "
- "may have problem\n");
+ pr_warn("MPS support code is not built-in, using acpi=off or acpi=noirq or pci=noacpi may have problem\n");
return 1;
}
#endif
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index 49ae4e1ac9cd..7de599eba7f0 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -197,7 +197,8 @@ static int __init ffh_cstate_init(void)
struct cpuinfo_x86 *c = &boot_cpu_data;
if (c->x86_vendor != X86_VENDOR_INTEL &&
- c->x86_vendor != X86_VENDOR_AMD)
+ c->x86_vendor != X86_VENDOR_AMD &&
+ c->x86_vendor != X86_VENDOR_HYGON)
return -1;
cpu_cstate_entry = alloc_percpu(struct cstate_entry);
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 6fe5b44fcbc9..e9da3dc71254 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -75,7 +75,7 @@ do { \
} \
} while (0)
-const unsigned char x86nops[] =
+static const unsigned char x86nops[] =
{
BYTES_NOP1,
BYTES_NOP2,
@@ -301,8 +301,8 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
instr, instr, a->instrlen,
replacement, a->replacementlen);
- DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr);
- DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement);
+ DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr);
+ DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement);
memcpy(insn_buff, replacement, a->replacementlen);
insn_buff_sz = a->replacementlen;
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 09083094eb57..23dda362dc0f 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -25,6 +25,7 @@
#define PCI_DEVICE_ID_AMD_17H_M60H_DF_F4 0x144c
#define PCI_DEVICE_ID_AMD_17H_M70H_DF_F4 0x1444
#define PCI_DEVICE_ID_AMD_19H_DF_F4 0x1654
+#define PCI_DEVICE_ID_AMD_19H_M50H_DF_F4 0x166e
/* Protect the PCI config register pairs used for SMN and DF indirect access. */
static DEFINE_MUTEX(smn_mutex);
@@ -57,6 +58,7 @@ static const struct pci_device_id amd_nb_misc_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F3) },
{}
};
@@ -72,6 +74,7 @@ static const struct pci_device_id amd_nb_link_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F4) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) },
{}
};
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index c06ac56eae4d..b7c003013d41 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -646,6 +646,10 @@ static void early_init_amd(struct cpuinfo_x86 *c)
if (c->x86_power & BIT(12))
set_cpu_cap(c, X86_FEATURE_ACC_POWER);
+ /* Bit 14 indicates the Runtime Average Power Limit interface. */
+ if (c->x86_power & BIT(14))
+ set_cpu_cap(c, X86_FEATURE_RAPL);
+
#ifdef CONFIG_X86_64
set_cpu_cap(c, X86_FEATURE_SYSCALL32);
#else
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index a1b756c49a93..64b805bd6a54 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -58,6 +58,7 @@
#include <asm/intel-family.h>
#include <asm/cpu_device_id.h>
#include <asm/uv/uv.h>
+#include <asm/sigframe.h>
#include "cpu.h"
@@ -465,27 +466,22 @@ static bool pku_disabled;
static __always_inline void setup_pku(struct cpuinfo_x86 *c)
{
- struct pkru_state *pk;
+ if (c == &boot_cpu_data) {
+ if (pku_disabled || !cpu_feature_enabled(X86_FEATURE_PKU))
+ return;
+ /*
+ * Setting CR4.PKE will cause the X86_FEATURE_OSPKE cpuid
+ * bit to be set. Enforce it.
+ */
+ setup_force_cpu_cap(X86_FEATURE_OSPKE);
- /* check the boot processor, plus compile options for PKU: */
- if (!cpu_feature_enabled(X86_FEATURE_PKU))
- return;
- /* checks the actual processor's cpuid bits: */
- if (!cpu_has(c, X86_FEATURE_PKU))
- return;
- if (pku_disabled)
+ } else if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) {
return;
+ }
cr4_set_bits(X86_CR4_PKE);
- pk = get_xsave_addr(&init_fpstate.xsave, XFEATURE_PKRU);
- if (pk)
- pk->pkru = init_pkru_value;
- /*
- * Setting X86_CR4_PKE will cause the X86_FEATURE_OSPKE
- * cpuid bit to be set. We need to ensure that we
- * update that bit in this CPU's "cpu_info".
- */
- set_cpu_cap(c, X86_FEATURE_OSPKE);
+ /* Load the default PKRU value */
+ pkru_write_default();
}
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
@@ -1332,6 +1328,8 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
fpu__init_system(c);
+ init_sigframe_size();
+
#ifdef CONFIG_X86_32
/*
* Regardless of whether PCID is enumerated, the SDM says
@@ -1717,9 +1715,8 @@ void print_cpu_info(struct cpuinfo_x86 *c)
}
/*
- * clearcpuid= was already parsed in fpu__init_parse_early_param.
- * But we need to keep a dummy __setup around otherwise it would
- * show up as an environment variable for init.
+ * clearcpuid= was already parsed in cpu_parse_early_param(). This dummy
+ * function prevents it from becoming an environment variable for init.
*/
static __init int setup_clearcpuid(char *arg)
{
@@ -1773,10 +1770,16 @@ void syscall_init(void)
wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
#endif
- /* Flags to clear on syscall */
+ /*
+ * Flags to clear on syscall; clear as much as possible
+ * to minimize user space-kernel interference.
+ */
wrmsrl(MSR_SYSCALL_MASK,
- X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|
- X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT);
+ X86_EFLAGS_CF|X86_EFLAGS_PF|X86_EFLAGS_AF|
+ X86_EFLAGS_ZF|X86_EFLAGS_SF|X86_EFLAGS_TF|
+ X86_EFLAGS_IF|X86_EFLAGS_DF|X86_EFLAGS_OF|
+ X86_EFLAGS_IOPL|X86_EFLAGS_NT|X86_EFLAGS_RF|
+ X86_EFLAGS_AC|X86_EFLAGS_ID);
}
#else /* CONFIG_X86_64 */
@@ -1938,13 +1941,12 @@ void cpu_init_exception_handling(void)
/*
* cpu_init() initializes state that is per-CPU. Some data is already
- * initialized (naturally) in the bootstrap process, such as the GDT
- * and IDT. We reload them nevertheless, this function acts as a
- * 'CPU state barrier', nothing should get across.
+ * initialized (naturally) in the bootstrap process, such as the GDT. We
+ * reload it nevertheless, this function acts as a 'CPU state barrier',
+ * nothing should get across.
*/
void cpu_init(void)
{
- struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw);
struct task_struct *cur = current;
int cpu = raw_smp_processor_id();
@@ -1957,8 +1959,6 @@ void cpu_init(void)
early_cpu_to_node(cpu) != NUMA_NO_NODE)
set_numa_node(early_cpu_to_node(cpu));
#endif
- setup_getcpu(cpu);
-
pr_debug("Initializing CPU#%d\n", cpu);
if (IS_ENABLED(CONFIG_X86_64) || cpu_feature_enabled(X86_FEATURE_VME) ||
@@ -1970,7 +1970,6 @@ void cpu_init(void)
* and set up the GDT descriptor:
*/
switch_to_new_gdt(cpu);
- load_current_idt();
if (IS_ENABLED(CONFIG_X86_64)) {
loadsegment(fs, 0);
@@ -1990,12 +1989,6 @@ void cpu_init(void)
initialize_tlbstate_and_flush();
enter_lazy_tlb(&init_mm, cur);
- /* Initialize the TSS. */
- tss_setup_ist(tss);
- tss_setup_io_bitmap(tss);
- set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
-
- load_TR_desc();
/*
* sp0 points to the entry trampoline stack regardless of what task
* is running.
@@ -2017,6 +2010,18 @@ void cpu_init(void)
load_fixmap_gdt(cpu);
}
+#ifdef CONFIG_SMP
+void cpu_init_secondary(void)
+{
+ /*
+ * Relies on the BP having set-up the IDT tables, which are loaded
+ * on this CPU in cpu_init_exception_handling().
+ */
+ cpu_init_exception_handling();
+ cpu_init();
+}
+#endif
+
/*
* The microcode loader calls this upon late microcode load to recheck features,
* only when microcode has been updated. Caller holds microcode_mutex and CPU
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 67944128876d..95521302630d 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -48,6 +48,7 @@ extern const struct cpu_dev *const __x86_cpu_dev_start[],
enum tsx_ctrl_states {
TSX_CTRL_ENABLE,
TSX_CTRL_DISABLE,
+ TSX_CTRL_RTM_ALWAYS_ABORT,
TSX_CTRL_NOT_SUPPORTED,
};
@@ -56,6 +57,7 @@ extern __ro_after_init enum tsx_ctrl_states tsx_ctrl_state;
extern void __init tsx_init(void);
extern void tsx_enable(void);
extern void tsx_disable(void);
+extern void tsx_clear_cpuid(void);
#else
static inline void tsx_init(void) { }
#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c
index 0bd6c74e3ba1..6d50136f7ab9 100644
--- a/arch/x86/kernel/cpu/hygon.c
+++ b/arch/x86/kernel/cpu/hygon.c
@@ -260,6 +260,10 @@ static void early_init_hygon(struct cpuinfo_x86 *c)
if (c->x86_power & BIT(12))
set_cpu_cap(c, X86_FEATURE_ACC_POWER);
+ /* Bit 14 indicates the Runtime Average Power Limit interface. */
+ if (c->x86_power & BIT(14))
+ set_cpu_cap(c, X86_FEATURE_RAPL);
+
#ifdef CONFIG_X86_64
set_cpu_cap(c, X86_FEATURE_SYSCALL32);
#endif
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 8adffc17fa8b..8321c43554a1 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -10,6 +10,7 @@
#include <linux/thread_info.h>
#include <linux/init.h>
#include <linux/uaccess.h>
+#include <linux/delay.h>
#include <asm/cpufeature.h>
#include <asm/msr.h>
@@ -41,6 +42,7 @@ enum split_lock_detect_state {
sld_off = 0,
sld_warn,
sld_fatal,
+ sld_ratelimit,
};
/*
@@ -717,8 +719,10 @@ static void init_intel(struct cpuinfo_x86 *c)
if (tsx_ctrl_state == TSX_CTRL_ENABLE)
tsx_enable();
- if (tsx_ctrl_state == TSX_CTRL_DISABLE)
+ else if (tsx_ctrl_state == TSX_CTRL_DISABLE)
tsx_disable();
+ else if (tsx_ctrl_state == TSX_CTRL_RTM_ALWAYS_ABORT)
+ tsx_clear_cpuid();
split_lock_init();
bus_lock_init();
@@ -997,13 +1001,30 @@ static const struct {
{ "off", sld_off },
{ "warn", sld_warn },
{ "fatal", sld_fatal },
+ { "ratelimit:", sld_ratelimit },
};
+static struct ratelimit_state bld_ratelimit;
+
static inline bool match_option(const char *arg, int arglen, const char *opt)
{
- int len = strlen(opt);
+ int len = strlen(opt), ratelimit;
+
+ if (strncmp(arg, opt, len))
+ return false;
+
+ /*
+ * Min ratelimit is 1 bus lock/sec.
+ * Max ratelimit is 1000 bus locks/sec.
+ */
+ if (sscanf(arg, "ratelimit:%d", &ratelimit) == 1 &&
+ ratelimit > 0 && ratelimit <= 1000) {
+ ratelimit_state_init(&bld_ratelimit, HZ, ratelimit);
+ ratelimit_set_flags(&bld_ratelimit, RATELIMIT_MSG_ON_RELEASE);
+ return true;
+ }
- return len == arglen && !strncmp(arg, opt, len);
+ return len == arglen;
}
static bool split_lock_verify_msr(bool on)
@@ -1082,6 +1103,15 @@ static void sld_update_msr(bool on)
static void split_lock_init(void)
{
+ /*
+ * #DB for bus lock handles ratelimit and #AC for split lock is
+ * disabled.
+ */
+ if (sld_state == sld_ratelimit) {
+ split_lock_verify_msr(false);
+ return;
+ }
+
if (cpu_model_supports_sld)
split_lock_verify_msr(sld_state != sld_off);
}
@@ -1154,6 +1184,12 @@ void handle_bus_lock(struct pt_regs *regs)
switch (sld_state) {
case sld_off:
break;
+ case sld_ratelimit:
+ /* Enforce no more than bld_ratelimit bus locks/sec. */
+ while (!__ratelimit(&bld_ratelimit))
+ msleep(20);
+ /* Warn on the bus lock. */
+ fallthrough;
case sld_warn:
pr_warn_ratelimited("#DB: %s/%d took a bus_lock trap at address: 0x%lx\n",
current->comm, current->pid, regs->ip);
@@ -1259,6 +1295,10 @@ static void sld_state_show(void)
" from non-WB" : "");
}
break;
+ case sld_ratelimit:
+ if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT))
+ pr_info("#DB: setting system wide bus lock rate limit to %u/sec\n", bld_ratelimit.burst);
+ break;
}
}
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index e486f96b3cb3..08831acc1d03 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -77,27 +77,29 @@ struct smca_bank_name {
};
static struct smca_bank_name smca_names[] = {
- [SMCA_LS] = { "load_store", "Load Store Unit" },
- [SMCA_LS_V2] = { "load_store", "Load Store Unit" },
- [SMCA_IF] = { "insn_fetch", "Instruction Fetch Unit" },
- [SMCA_L2_CACHE] = { "l2_cache", "L2 Cache" },
- [SMCA_DE] = { "decode_unit", "Decode Unit" },
- [SMCA_RESERVED] = { "reserved", "Reserved" },
- [SMCA_EX] = { "execution_unit", "Execution Unit" },
- [SMCA_FP] = { "floating_point", "Floating Point Unit" },
- [SMCA_L3_CACHE] = { "l3_cache", "L3 Cache" },
- [SMCA_CS] = { "coherent_slave", "Coherent Slave" },
- [SMCA_CS_V2] = { "coherent_slave", "Coherent Slave" },
- [SMCA_PIE] = { "pie", "Power, Interrupts, etc." },
- [SMCA_UMC] = { "umc", "Unified Memory Controller" },
- [SMCA_PB] = { "param_block", "Parameter Block" },
- [SMCA_PSP] = { "psp", "Platform Security Processor" },
- [SMCA_PSP_V2] = { "psp", "Platform Security Processor" },
- [SMCA_SMU] = { "smu", "System Management Unit" },
- [SMCA_SMU_V2] = { "smu", "System Management Unit" },
- [SMCA_MP5] = { "mp5", "Microprocessor 5 Unit" },
- [SMCA_NBIO] = { "nbio", "Northbridge IO Unit" },
- [SMCA_PCIE] = { "pcie", "PCI Express Unit" },
+ [SMCA_LS ... SMCA_LS_V2] = { "load_store", "Load Store Unit" },
+ [SMCA_IF] = { "insn_fetch", "Instruction Fetch Unit" },
+ [SMCA_L2_CACHE] = { "l2_cache", "L2 Cache" },
+ [SMCA_DE] = { "decode_unit", "Decode Unit" },
+ [SMCA_RESERVED] = { "reserved", "Reserved" },
+ [SMCA_EX] = { "execution_unit", "Execution Unit" },
+ [SMCA_FP] = { "floating_point", "Floating Point Unit" },
+ [SMCA_L3_CACHE] = { "l3_cache", "L3 Cache" },
+ [SMCA_CS ... SMCA_CS_V2] = { "coherent_slave", "Coherent Slave" },
+ [SMCA_PIE] = { "pie", "Power, Interrupts, etc." },
+
+ /* UMC v2 is separate because both of them can exist in a single system. */
+ [SMCA_UMC] = { "umc", "Unified Memory Controller" },
+ [SMCA_UMC_V2] = { "umc_v2", "Unified Memory Controller v2" },
+ [SMCA_PB] = { "param_block", "Parameter Block" },
+ [SMCA_PSP ... SMCA_PSP_V2] = { "psp", "Platform Security Processor" },
+ [SMCA_SMU ... SMCA_SMU_V2] = { "smu", "System Management Unit" },
+ [SMCA_MP5] = { "mp5", "Microprocessor 5 Unit" },
+ [SMCA_NBIO] = { "nbio", "Northbridge IO Unit" },
+ [SMCA_PCIE ... SMCA_PCIE_V2] = { "pcie", "PCI Express Unit" },
+ [SMCA_XGMI_PCS] = { "xgmi_pcs", "Ext Global Memory Interconnect PCS Unit" },
+ [SMCA_XGMI_PHY] = { "xgmi_phy", "Ext Global Memory Interconnect PHY Unit" },
+ [SMCA_WAFL_PHY] = { "wafl_phy", "WAFL PHY Unit" },
};
static const char *smca_get_name(enum smca_bank_types t)
@@ -155,6 +157,7 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
/* Unified Memory Controller MCA type */
{ SMCA_UMC, HWID_MCATYPE(0x96, 0x0) },
+ { SMCA_UMC_V2, HWID_MCATYPE(0x96, 0x1) },
/* Parameter Block MCA type */
{ SMCA_PB, HWID_MCATYPE(0x05, 0x0) },
@@ -175,6 +178,16 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
/* PCI Express Unit MCA type */
{ SMCA_PCIE, HWID_MCATYPE(0x46, 0x0) },
+ { SMCA_PCIE_V2, HWID_MCATYPE(0x46, 0x1) },
+
+ /* xGMI PCS MCA type */
+ { SMCA_XGMI_PCS, HWID_MCATYPE(0x50, 0x0) },
+
+ /* xGMI PHY MCA type */
+ { SMCA_XGMI_PHY, HWID_MCATYPE(0x259, 0x0) },
+
+ /* WAFL PHY MCA type */
+ { SMCA_WAFL_PHY, HWID_MCATYPE(0x267, 0x0) },
};
struct smca_bank smca_banks[MAX_NR_BANKS];
diff --git a/arch/x86/kernel/cpu/mce/apei.c b/arch/x86/kernel/cpu/mce/apei.c
index b58b85380ddb..0e3ae64d3b76 100644
--- a/arch/x86/kernel/cpu/mce/apei.c
+++ b/arch/x86/kernel/cpu/mce/apei.c
@@ -36,7 +36,8 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err)
mce_setup(&m);
m.bank = -1;
/* Fake a memory read error with unknown channel */
- m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f;
+ m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | MCI_STATUS_MISCV | 0x9f;
+ m.misc = (MCI_MISC_ADDR_PHYS << 6) | PAGE_SHIFT;
if (severity >= GHES_SEV_RECOVERABLE)
m.status |= MCI_STATUS_UC;
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index bf7fe87a7e88..22791aadc085 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -1257,19 +1257,28 @@ static void kill_me_maybe(struct callback_head *cb)
{
struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me);
int flags = MF_ACTION_REQUIRED;
+ int ret;
pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr);
if (!p->mce_ripv)
flags |= MF_MUST_KILL;
- if (!memory_failure(p->mce_addr >> PAGE_SHIFT, flags) &&
- !(p->mce_kflags & MCE_IN_KERNEL_COPYIN)) {
+ ret = memory_failure(p->mce_addr >> PAGE_SHIFT, flags);
+ if (!ret && !(p->mce_kflags & MCE_IN_KERNEL_COPYIN)) {
set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page);
sync_core();
return;
}
+ /*
+ * -EHWPOISON from memory_failure() means that it already sent SIGBUS
+ * to the current process with the proper error info, so no need to
+ * send SIGBUS here again.
+ */
+ if (ret == -EHWPOISON)
+ return;
+
if (p->mce_vaddr != (void __user *)-1l) {
force_sig_mceerr(BUS_MCEERR_AR, p->mce_vaddr, PAGE_SHIFT);
} else {
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index c268c2730048..cc8f1773deca 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -17,6 +17,7 @@
#include <linux/irq.h>
#include <linux/kexec.h>
#include <linux/i8253.h>
+#include <linux/panic_notifier.h>
#include <linux/random.h>
#include <asm/processor.h>
#include <asm/hypervisor.h>
@@ -236,7 +237,7 @@ static void __init hv_smp_prepare_cpus(unsigned int max_cpus)
for_each_present_cpu(i) {
if (i == 0)
continue;
- ret = hv_call_add_logical_proc(numa_cpu_node(i), i, cpu_physical_id(i));
+ ret = hv_call_add_logical_proc(numa_cpu_node(i), i, i);
BUG_ON(ret);
}
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index c4d320d02fd5..6a5f60a37219 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -70,6 +70,7 @@ DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key);
* struct mon_evt - Entry in the event list of a resource
* @evtid: event id
* @name: name of the event
+ * @list: entry in &rdt_resource->evt_list
*/
struct mon_evt {
u32 evtid;
@@ -78,10 +79,13 @@ struct mon_evt {
};
/**
- * struct mon_data_bits - Monitoring details for each event file
- * @rid: Resource id associated with the event file.
+ * union mon_data_bits - Monitoring details for each event file
+ * @priv: Used to store monitoring event data in @u
+ * as kernfs private data
+ * @rid: Resource id associated with the event file
* @evtid: Event id associated with the event file
* @domid: The domain to which the event file belongs
+ * @u: Name of the bit fields struct
*/
union mon_data_bits {
void *priv;
@@ -119,6 +123,7 @@ enum rdt_group_type {
* @RDT_MODE_PSEUDO_LOCKSETUP: Resource group will be used for Pseudo-Locking
* @RDT_MODE_PSEUDO_LOCKED: No sharing of this resource group's allocations
* allowed AND the allocations are Cache Pseudo-Locked
+ * @RDT_NUM_MODES: Total number of modes
*
* The mode of a resource group enables control over the allowed overlap
* between allocations associated with different resource groups (classes
@@ -142,7 +147,7 @@ enum rdtgrp_mode {
/**
* struct mongroup - store mon group's data in resctrl fs.
- * @mon_data_kn kernlfs node for the mon_data directory
+ * @mon_data_kn: kernfs node for the mon_data directory
* @parent: parent rdtgrp
* @crdtgrp_list: child rdtgroup node list
* @rmid: rmid for this rdtgroup
@@ -282,11 +287,11 @@ struct rftype {
/**
* struct mbm_state - status for each MBM counter in each domain
* @chunks: Total data moved (multiply by rdt_group.mon_scale to get bytes)
- * @prev_msr Value of IA32_QM_CTR for this RMID last time we read it
+ * @prev_msr: Value of IA32_QM_CTR for this RMID last time we read it
* @prev_bw_msr:Value of previous IA32_QM_CTR for bandwidth counting
- * @prev_bw The most recent bandwidth in MBps
- * @delta_bw Difference between the current and previous bandwidth
- * @delta_comp Indicates whether to compute the delta_bw
+ * @prev_bw: The most recent bandwidth in MBps
+ * @delta_bw: Difference between the current and previous bandwidth
+ * @delta_comp: Indicates whether to compute the delta_bw
*/
struct mbm_state {
u64 chunks;
@@ -456,11 +461,13 @@ struct rdt_parse_data {
* @data_width: Character width of data when displaying
* @domains: All domains for this resource
* @cache: Cache allocation related data
+ * @membw: If the component has bandwidth controls, their properties.
* @format_str: Per resource format string to show domain value
* @parse_ctrlval: Per resource function pointer to parse control values
* @evt_list: List of monitoring events
* @num_rmid: Number of RMIDs available
* @mon_scale: cqm counter * mon_scale = occupancy in bytes
+ * @mbm_width: Monitor width, to detect and correct for overflow.
* @fflags: flags to choose base and info files
*/
struct rdt_resource {
diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
index 05a89e33fde2..2207916cae65 100644
--- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
+++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
@@ -49,6 +49,7 @@ static struct class *pseudo_lock_class;
/**
* get_prefetch_disable_bits - prefetch disable bits of supported platforms
+ * @void: It takes no parameters.
*
* Capture the list of platforms that have been validated to support
* pseudo-locking. This includes testing to ensure pseudo-locked regions
@@ -162,7 +163,7 @@ static struct rdtgroup *region_find_by_minor(unsigned int minor)
}
/**
- * pseudo_lock_pm_req - A power management QoS request list entry
+ * struct pseudo_lock_pm_req - A power management QoS request list entry
* @list: Entry within the @pm_reqs list for a pseudo-locked region
* @req: PM QoS request
*/
@@ -184,6 +185,7 @@ static void pseudo_lock_cstates_relax(struct pseudo_lock_region *plr)
/**
* pseudo_lock_cstates_constrain - Restrict cores from entering C6
+ * @plr: Pseudo-locked region
*
* To prevent the cache from being affected by power management entering
* C6 has to be avoided. This is accomplished by requesting a latency
@@ -196,6 +198,8 @@ static void pseudo_lock_cstates_relax(struct pseudo_lock_region *plr)
* the ACPI latencies need to be considered while keeping in mind that C2
* may be set to map to deeper sleep states. In this case the latency
* requirement needs to prevent entering C2 also.
+ *
+ * Return: 0 on success, <0 on failure
*/
static int pseudo_lock_cstates_constrain(struct pseudo_lock_region *plr)
{
@@ -520,7 +524,7 @@ static int pseudo_lock_fn(void *_rdtgrp)
/**
* rdtgroup_monitor_in_progress - Test if monitoring in progress
- * @r: resource group being queried
+ * @rdtgrp: resource group being queried
*
* Return: 1 if monitor groups have been created for this resource
* group, 0 otherwise.
@@ -1140,6 +1144,8 @@ out:
/**
* pseudo_lock_measure_cycles - Trigger latency measure to pseudo-locked region
+ * @rdtgrp: Resource group to which the pseudo-locked region belongs.
+ * @sel: Selector of which measurement to perform on a pseudo-locked region.
*
* The measurement of latency to access a pseudo-locked region should be
* done from a cpu that is associated with that pseudo-locked region.
diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
index 3be203297988..001808e3901c 100644
--- a/arch/x86/kernel/cpu/sgx/encl.c
+++ b/arch/x86/kernel/cpu/sgx/encl.c
@@ -383,7 +383,7 @@ const struct vm_operations_struct sgx_vm_ops = {
/**
* sgx_encl_release - Destroy an enclave instance
- * @kref: address of a kref inside &sgx_encl
+ * @ref: address of a kref inside &sgx_encl
*
* Used together with kref_put(). Frees all the resources associated with the
* enclave and the instance itself.
diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h
index 6e74f85b6264..fec43ca65065 100644
--- a/arch/x86/kernel/cpu/sgx/encl.h
+++ b/arch/x86/kernel/cpu/sgx/encl.h
@@ -91,8 +91,8 @@ static inline int sgx_encl_find(struct mm_struct *mm, unsigned long addr,
{
struct vm_area_struct *result;
- result = find_vma(mm, addr);
- if (!result || result->vm_ops != &sgx_vm_ops || addr < result->vm_start)
+ result = vma_lookup(mm, addr);
+ if (!result || result->vm_ops != &sgx_vm_ops)
return -EINVAL;
*vma = result;
diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c
index e2ad30e474f8..9c7a5f049292 100644
--- a/arch/x86/kernel/cpu/tsx.c
+++ b/arch/x86/kernel/cpu/tsx.c
@@ -2,7 +2,7 @@
/*
* Intel Transactional Synchronization Extensions (TSX) control.
*
- * Copyright (C) 2019 Intel Corporation
+ * Copyright (C) 2019-2021 Intel Corporation
*
* Author:
* Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
@@ -84,13 +84,46 @@ static enum tsx_ctrl_states x86_get_tsx_auto_mode(void)
return TSX_CTRL_ENABLE;
}
+void tsx_clear_cpuid(void)
+{
+ u64 msr;
+
+ /*
+ * MSR_TFA_TSX_CPUID_CLEAR bit is only present when both CPUID
+ * bits RTM_ALWAYS_ABORT and TSX_FORCE_ABORT are present.
+ */
+ if (boot_cpu_has(X86_FEATURE_RTM_ALWAYS_ABORT) &&
+ boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT)) {
+ rdmsrl(MSR_TSX_FORCE_ABORT, msr);
+ msr |= MSR_TFA_TSX_CPUID_CLEAR;
+ wrmsrl(MSR_TSX_FORCE_ABORT, msr);
+ }
+}
+
void __init tsx_init(void)
{
char arg[5] = {};
int ret;
- if (!tsx_ctrl_is_supported())
+ /*
+ * Hardware will always abort a TSX transaction if both CPUID bits
+ * RTM_ALWAYS_ABORT and TSX_FORCE_ABORT are set. In this case, it is
+ * better not to enumerate CPUID.RTM and CPUID.HLE bits. Clear them
+ * here.
+ */
+ if (boot_cpu_has(X86_FEATURE_RTM_ALWAYS_ABORT) &&
+ boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT)) {
+ tsx_ctrl_state = TSX_CTRL_RTM_ALWAYS_ABORT;
+ tsx_clear_cpuid();
+ setup_clear_cpu_cap(X86_FEATURE_RTM);
+ setup_clear_cpu_cap(X86_FEATURE_HLE);
return;
+ }
+
+ if (!tsx_ctrl_is_supported()) {
+ tsx_ctrl_state = TSX_CTRL_NOT_SUPPORTED;
+ return;
+ }
ret = cmdline_find_option(boot_command_line, "tsx", arg, sizeof(arg));
if (ret >= 0) {
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 54ce999ed321..e8326a8d1c5d 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -70,19 +70,6 @@ static inline void cpu_crash_vmclear_loaded_vmcss(void)
rcu_read_unlock();
}
-/*
- * When the crashkernel option is specified, only use the low
- * 1M for the real mode trampoline.
- */
-void __init crash_reserve_low_1M(void)
-{
- if (cmdline_find_option(boot_command_line, "crashkernel", NULL, 0) < 0)
- return;
-
- memblock_reserve(0, 1<<20);
- pr_info("Reserving the low 1M of memory for crashkernel\n");
-}
-
#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 299c20f0a38b..ea4fe192189d 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -69,7 +69,7 @@ static void printk_stack_address(unsigned long address, int reliable,
const char *log_lvl)
{
touch_nmi_watchdog();
- printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address);
+ printk("%s %s%pBb\n", log_lvl, reliable ? "" : "? ", (void *)address);
}
static int copy_code(struct pt_regs *regs, u8 *buf, unsigned long src,
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 6edd1e2ee8af..38837dad46e6 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -549,9 +549,11 @@ static const struct pci_device_id intel_early_ids[] __initconst = {
INTEL_CNL_IDS(&gen9_early_ops),
INTEL_ICL_11_IDS(&gen11_early_ops),
INTEL_EHL_IDS(&gen11_early_ops),
+ INTEL_JSL_IDS(&gen11_early_ops),
INTEL_TGL_12_IDS(&gen11_early_ops),
INTEL_RKL_IDS(&gen11_early_ops),
INTEL_ADLS_IDS(&gen11_early_ops),
+ INTEL_ADLP_IDS(&gen11_early_ops),
};
struct resource intel_graphics_stolen_res __ro_after_init = DEFINE_RES_MEM(0, 0);
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 571220ac8bea..7ada7bd03a32 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -23,7 +23,7 @@
* Represents the initial FPU state. It's mostly (but not completely) zeroes,
* depending on the FPU hardware format:
*/
-union fpregs_state init_fpstate __read_mostly;
+union fpregs_state init_fpstate __ro_after_init;
/*
* Track whether the kernel is using the FPU state
@@ -83,19 +83,23 @@ bool irq_fpu_usable(void)
EXPORT_SYMBOL(irq_fpu_usable);
/*
- * These must be called with preempt disabled. Returns
- * 'true' if the FPU state is still intact and we can
- * keep registers active.
+ * Save the FPU register state in fpu->state. The register state is
+ * preserved.
*
- * The legacy FNSAVE instruction cleared all FPU state
- * unconditionally, so registers are essentially destroyed.
- * Modern FPU state can be kept in registers, if there are
- * no pending FP exceptions.
+ * Must be called with fpregs_lock() held.
+ *
+ * The legacy FNSAVE instruction clears all FPU state unconditionally, so
+ * register state has to be reloaded. That might be a pointless exercise
+ * when the FPU is going to be used by another task right after that. But
+ * this only affects 20+ years old 32bit systems and avoids conditionals all
+ * over the place.
+ *
+ * FXSAVE and all XSAVE variants preserve the FPU register state.
*/
-int copy_fpregs_to_fpstate(struct fpu *fpu)
+void save_fpregs_to_fpstate(struct fpu *fpu)
{
if (likely(use_xsave())) {
- copy_xregs_to_kernel(&fpu->state.xsave);
+ os_xsave(&fpu->state.xsave);
/*
* AVX512 state is tracked here because its use is
@@ -103,23 +107,49 @@ int copy_fpregs_to_fpstate(struct fpu *fpu)
*/
if (fpu->state.xsave.header.xfeatures & XFEATURE_MASK_AVX512)
fpu->avx512_timestamp = jiffies;
- return 1;
+ return;
}
if (likely(use_fxsr())) {
- copy_fxregs_to_kernel(fpu);
- return 1;
+ fxsave(&fpu->state.fxsave);
+ return;
}
/*
* Legacy FPU register saving, FNSAVE always clears FPU registers,
- * so we have to mark them inactive:
+ * so we have to reload them from the memory state.
*/
asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->state.fsave));
+ frstor(&fpu->state.fsave);
+}
+EXPORT_SYMBOL(save_fpregs_to_fpstate);
- return 0;
+void __restore_fpregs_from_fpstate(union fpregs_state *fpstate, u64 mask)
+{
+ /*
+ * AMD K7/K8 and later CPUs up to Zen don't save/restore
+ * FDP/FIP/FOP unless an exception is pending. Clear the x87 state
+ * here by setting it to fixed values. "m" is a random variable
+ * that should be in L1.
+ */
+ if (unlikely(static_cpu_has_bug(X86_BUG_FXSAVE_LEAK))) {
+ asm volatile(
+ "fnclex\n\t"
+ "emms\n\t"
+ "fildl %P[addr]" /* set F?P to defined value */
+ : : [addr] "m" (fpstate));
+ }
+
+ if (use_xsave()) {
+ os_xrstor(&fpstate->xsave, mask);
+ } else {
+ if (use_fxsr())
+ fxrstor(&fpstate->fxsave);
+ else
+ frstor(&fpstate->fsave);
+ }
}
-EXPORT_SYMBOL(copy_fpregs_to_fpstate);
+EXPORT_SYMBOL_GPL(__restore_fpregs_from_fpstate);
void kernel_fpu_begin_mask(unsigned int kfpu_mask)
{
@@ -133,11 +163,7 @@ void kernel_fpu_begin_mask(unsigned int kfpu_mask)
if (!(current->flags & PF_KTHREAD) &&
!test_thread_flag(TIF_NEED_FPU_LOAD)) {
set_thread_flag(TIF_NEED_FPU_LOAD);
- /*
- * Ignore return value -- we don't care if reg state
- * is clobbered.
- */
- copy_fpregs_to_fpstate(&current->thread.fpu);
+ save_fpregs_to_fpstate(&current->thread.fpu);
}
__cpu_invalidate_fpregs_state();
@@ -160,27 +186,38 @@ void kernel_fpu_end(void)
EXPORT_SYMBOL_GPL(kernel_fpu_end);
/*
- * Save the FPU state (mark it for reload if necessary):
- *
- * This only ever gets called for the current task.
+ * Sync the FPU register state to current's memory register state when the
+ * current task owns the FPU. The hardware register state is preserved.
*/
-void fpu__save(struct fpu *fpu)
+void fpu_sync_fpstate(struct fpu *fpu)
{
WARN_ON_FPU(fpu != &current->thread.fpu);
fpregs_lock();
trace_x86_fpu_before_save(fpu);
- if (!test_thread_flag(TIF_NEED_FPU_LOAD)) {
- if (!copy_fpregs_to_fpstate(fpu)) {
- copy_kernel_to_fpregs(&fpu->state);
- }
- }
+ if (!test_thread_flag(TIF_NEED_FPU_LOAD))
+ save_fpregs_to_fpstate(fpu);
trace_x86_fpu_after_save(fpu);
fpregs_unlock();
}
+static inline void fpstate_init_xstate(struct xregs_state *xsave)
+{
+ /*
+ * XRSTORS requires these bits set in xcomp_bv, or it will
+ * trigger #GP:
+ */
+ xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | xfeatures_mask_all;
+}
+
+static inline void fpstate_init_fxstate(struct fxregs_state *fx)
+{
+ fx->cwd = 0x37f;
+ fx->mxcsr = MXCSR_DEFAULT;
+}
+
/*
* Legacy x87 fpstate state init:
*/
@@ -210,18 +247,18 @@ void fpstate_init(union fpregs_state *state)
}
EXPORT_SYMBOL_GPL(fpstate_init);
-int fpu__copy(struct task_struct *dst, struct task_struct *src)
+/* Clone current's FPU state on fork */
+int fpu_clone(struct task_struct *dst)
{
+ struct fpu *src_fpu = &current->thread.fpu;
struct fpu *dst_fpu = &dst->thread.fpu;
- struct fpu *src_fpu = &src->thread.fpu;
+ /* The new task's FPU state cannot be valid in the hardware. */
dst_fpu->last_cpu = -1;
- if (!static_cpu_has(X86_FEATURE_FPU))
+ if (!cpu_feature_enabled(X86_FEATURE_FPU))
return 0;
- WARN_ON_FPU(src_fpu != &current->thread.fpu);
-
/*
* Don't let 'init optimized' areas of the XSAVE area
* leak into the child task:
@@ -229,20 +266,16 @@ int fpu__copy(struct task_struct *dst, struct task_struct *src)
memset(&dst_fpu->state.xsave, 0, fpu_kernel_xstate_size);
/*
- * If the FPU registers are not current just memcpy() the state.
- * Otherwise save current FPU registers directly into the child's FPU
- * context, without any memory-to-memory copying.
- *
- * ( The function 'fails' in the FNSAVE case, which destroys
- * register contents so we have to load them back. )
+ * If the FPU registers are not owned by current just memcpy() the
+ * state. Otherwise save the FPU registers directly into the
+ * child's FPU context, without any memory-to-memory copying.
*/
fpregs_lock();
if (test_thread_flag(TIF_NEED_FPU_LOAD))
memcpy(&dst_fpu->state, &src_fpu->state, fpu_kernel_xstate_size);
- else if (!copy_fpregs_to_fpstate(dst_fpu))
- copy_kernel_to_fpregs(&dst_fpu->state);
-
+ else
+ save_fpregs_to_fpstate(dst_fpu);
fpregs_unlock();
set_tsk_thread_flag(dst, TIF_NEED_FPU_LOAD);
@@ -254,63 +287,6 @@ int fpu__copy(struct task_struct *dst, struct task_struct *src)
}
/*
- * Activate the current task's in-memory FPU context,
- * if it has not been used before:
- */
-static void fpu__initialize(struct fpu *fpu)
-{
- WARN_ON_FPU(fpu != &current->thread.fpu);
-
- set_thread_flag(TIF_NEED_FPU_LOAD);
- fpstate_init(&fpu->state);
- trace_x86_fpu_init_state(fpu);
-}
-
-/*
- * This function must be called before we read a task's fpstate.
- *
- * There's two cases where this gets called:
- *
- * - for the current task (when coredumping), in which case we have
- * to save the latest FPU registers into the fpstate,
- *
- * - or it's called for stopped tasks (ptrace), in which case the
- * registers were already saved by the context-switch code when
- * the task scheduled out.
- *
- * If the task has used the FPU before then save it.
- */
-void fpu__prepare_read(struct fpu *fpu)
-{
- if (fpu == &current->thread.fpu)
- fpu__save(fpu);
-}
-
-/*
- * This function must be called before we write a task's fpstate.
- *
- * Invalidate any cached FPU registers.
- *
- * After this function call, after registers in the fpstate are
- * modified and the child task has woken up, the child task will
- * restore the modified FPU state from the modified context. If we
- * didn't clear its cached status here then the cached in-registers
- * state pending on its former CPU could be restored, corrupting
- * the modifications.
- */
-void fpu__prepare_write(struct fpu *fpu)
-{
- /*
- * Only stopped child tasks can be used to modify the FPU
- * state in the fpstate buffer:
- */
- WARN_ON_FPU(fpu == &current->thread.fpu);
-
- /* Invalidate any cached state: */
- __fpu_invalidate_fpregs_state(fpu);
-}
-
-/*
* Drops current FPU state: deactivates the fpregs and
* the fpstate. NOTE: it still leaves previous contents
* in the fpregs in the eager-FPU case.
@@ -340,61 +316,97 @@ void fpu__drop(struct fpu *fpu)
* Clear FPU registers by setting them up from the init fpstate.
* Caller must do fpregs_[un]lock() around it.
*/
-static inline void copy_init_fpstate_to_fpregs(u64 features_mask)
+static inline void restore_fpregs_from_init_fpstate(u64 features_mask)
{
if (use_xsave())
- copy_kernel_to_xregs(&init_fpstate.xsave, features_mask);
- else if (static_cpu_has(X86_FEATURE_FXSR))
- copy_kernel_to_fxregs(&init_fpstate.fxsave);
+ os_xrstor(&init_fpstate.xsave, features_mask);
+ else if (use_fxsr())
+ fxrstor(&init_fpstate.fxsave);
else
- copy_kernel_to_fregs(&init_fpstate.fsave);
+ frstor(&init_fpstate.fsave);
- if (boot_cpu_has(X86_FEATURE_OSPKE))
- copy_init_pkru_to_fpregs();
+ pkru_write_default();
+}
+
+static inline unsigned int init_fpstate_copy_size(void)
+{
+ if (!use_xsave())
+ return fpu_kernel_xstate_size;
+
+ /* XSAVE(S) just needs the legacy and the xstate header part */
+ return sizeof(init_fpstate.xsave);
}
/*
- * Clear the FPU state back to init state.
- *
- * Called by sys_execve(), by the signal handler code and by various
- * error paths.
+ * Reset current->fpu memory state to the init values.
+ */
+static void fpu_reset_fpstate(void)
+{
+ struct fpu *fpu = &current->thread.fpu;
+
+ fpregs_lock();
+ fpu__drop(fpu);
+ /*
+ * This does not change the actual hardware registers. It just
+ * resets the memory image and sets TIF_NEED_FPU_LOAD so a
+ * subsequent return to usermode will reload the registers from the
+ * task's memory image.
+ *
+ * Do not use fpstate_init() here. Just copy init_fpstate which has
+ * the correct content already except for PKRU.
+ *
+ * PKRU handling does not rely on the xstate when restoring for
+ * user space as PKRU is eagerly written in switch_to() and
+ * flush_thread().
+ */
+ memcpy(&fpu->state, &init_fpstate, init_fpstate_copy_size());
+ set_thread_flag(TIF_NEED_FPU_LOAD);
+ fpregs_unlock();
+}
+
+/*
+ * Reset current's user FPU states to the init states. current's
+ * supervisor states, if any, are not modified by this function. The
+ * caller guarantees that the XSTATE header in memory is intact.
*/
-static void fpu__clear(struct fpu *fpu, bool user_only)
+void fpu__clear_user_states(struct fpu *fpu)
{
WARN_ON_FPU(fpu != &current->thread.fpu);
- if (!static_cpu_has(X86_FEATURE_FPU)) {
- fpu__drop(fpu);
- fpu__initialize(fpu);
+ fpregs_lock();
+ if (!cpu_feature_enabled(X86_FEATURE_FPU)) {
+ fpu_reset_fpstate();
+ fpregs_unlock();
return;
}
- fpregs_lock();
-
- if (user_only) {
- if (!fpregs_state_valid(fpu, smp_processor_id()) &&
- xfeatures_mask_supervisor())
- copy_kernel_to_xregs(&fpu->state.xsave,
- xfeatures_mask_supervisor());
- copy_init_fpstate_to_fpregs(xfeatures_mask_user());
- } else {
- copy_init_fpstate_to_fpregs(xfeatures_mask_all);
+ /*
+ * Ensure that current's supervisor states are loaded into their
+ * corresponding registers.
+ */
+ if (xfeatures_mask_supervisor() &&
+ !fpregs_state_valid(fpu, smp_processor_id())) {
+ os_xrstor(&fpu->state.xsave, xfeatures_mask_supervisor());
}
+ /* Reset user states in registers. */
+ restore_fpregs_from_init_fpstate(xfeatures_mask_restore_user());
+
+ /*
+ * Now all FPU registers have their desired values. Inform the FPU
+ * state machine that current's FPU registers are in the hardware
+ * registers. The memory image does not need to be updated because
+ * any operation relying on it has to save the registers first when
+ * current's FPU is marked active.
+ */
fpregs_mark_activate();
fpregs_unlock();
}
-void fpu__clear_user_states(struct fpu *fpu)
+void fpu_flush_thread(void)
{
- fpu__clear(fpu, true);
+ fpu_reset_fpstate();
}
-
-void fpu__clear_all(struct fpu *fpu)
-{
- fpu__clear(fpu, false);
-}
-
/*
* Load FPU context before returning to userspace.
*/
@@ -403,7 +415,7 @@ void switch_fpu_return(void)
if (!static_cpu_has(X86_FEATURE_FPU))
return;
- __fpregs_load_activate();
+ fpregs_restore_userregs();
}
EXPORT_SYMBOL_GPL(switch_fpu_return);
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 701f196d7c68..64e29927cc32 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -89,7 +89,7 @@ static void fpu__init_system_early_generic(struct cpuinfo_x86 *c)
/*
* Boot time FPU feature detection code:
*/
-unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu;
+unsigned int mxcsr_feature_mask __ro_after_init = 0xffffffffu;
EXPORT_SYMBOL_GPL(mxcsr_feature_mask);
static void __init fpu__init_system_mxcsr(void)
@@ -135,7 +135,7 @@ static void __init fpu__init_system_generic(void)
* This is inherent to the XSAVE architecture which puts all state
* components into a single, continuous memory block:
*/
-unsigned int fpu_kernel_xstate_size;
+unsigned int fpu_kernel_xstate_size __ro_after_init;
EXPORT_SYMBOL_GPL(fpu_kernel_xstate_size);
/* Get alignment of the TYPE. */
@@ -216,17 +216,6 @@ static void __init fpu__init_system_xstate_size_legacy(void)
fpu_user_xstate_size = fpu_kernel_xstate_size;
}
-/*
- * Find supported xfeatures based on cpu features and command-line input.
- * This must be called after fpu__init_parse_early_param() is called and
- * xfeatures_mask is enumerated.
- */
-u64 __init fpu__get_supported_xfeatures_mask(void)
-{
- return XFEATURE_MASK_USER_SUPPORTED |
- XFEATURE_MASK_SUPERVISOR_SUPPORTED;
-}
-
/* Legacy code to initialize eager fpu mode. */
static void __init fpu__init_system_ctx_switch(void)
{
diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c
index c413756ba89f..66ed317ebc0d 100644
--- a/arch/x86/kernel/fpu/regset.c
+++ b/arch/x86/kernel/fpu/regset.c
@@ -2,11 +2,13 @@
/*
* FPU register's regset abstraction, for ptrace, core dumps, etc.
*/
+#include <linux/sched/task_stack.h>
+#include <linux/vmalloc.h>
+
#include <asm/fpu/internal.h>
#include <asm/fpu/signal.h>
#include <asm/fpu/regset.h>
#include <asm/fpu/xstate.h>
-#include <linux/sched/task_stack.h>
/*
* The xstateregs_active() routine is the same as the regset_fpregs_active() routine,
@@ -26,18 +28,58 @@ int regset_xregset_fpregs_active(struct task_struct *target, const struct user_r
return 0;
}
+/*
+ * The regset get() functions are invoked from:
+ *
+ * - coredump to dump the current task's fpstate. If the current task
+ * owns the FPU then the memory state has to be synchronized and the
+ * FPU register state preserved. Otherwise fpstate is already in sync.
+ *
+ * - ptrace to dump fpstate of a stopped task, in which case the registers
+ * have already been saved to fpstate on context switch.
+ */
+static void sync_fpstate(struct fpu *fpu)
+{
+ if (fpu == &current->thread.fpu)
+ fpu_sync_fpstate(fpu);
+}
+
+/*
+ * Invalidate cached FPU registers before modifying the stopped target
+ * task's fpstate.
+ *
+ * This forces the target task on resume to restore the FPU registers from
+ * modified fpstate. Otherwise the task might skip the restore and operate
+ * with the cached FPU registers which discards the modifications.
+ */
+static void fpu_force_restore(struct fpu *fpu)
+{
+ /*
+ * Only stopped child tasks can be used to modify the FPU
+ * state in the fpstate buffer:
+ */
+ WARN_ON_FPU(fpu == &current->thread.fpu);
+
+ __fpu_invalidate_fpregs_state(fpu);
+}
+
int xfpregs_get(struct task_struct *target, const struct user_regset *regset,
struct membuf to)
{
struct fpu *fpu = &target->thread.fpu;
- if (!boot_cpu_has(X86_FEATURE_FXSR))
+ if (!cpu_feature_enabled(X86_FEATURE_FXSR))
return -ENODEV;
- fpu__prepare_read(fpu);
- fpstate_sanitize_xstate(fpu);
+ sync_fpstate(fpu);
+
+ if (!use_xsave()) {
+ return membuf_write(&to, &fpu->state.fxsave,
+ sizeof(fpu->state.fxsave));
+ }
- return membuf_write(&to, &fpu->state.fxsave, sizeof(struct fxregs_state));
+ copy_xstate_to_uabi_buf(to, target, XSTATE_COPY_FX);
+ return 0;
}
int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
@@ -45,62 +87,52 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
const void *kbuf, const void __user *ubuf)
{
struct fpu *fpu = &target->thread.fpu;
+ struct user32_fxsr_struct newstate;
int ret;
- if (!boot_cpu_has(X86_FEATURE_FXSR))
+ BUILD_BUG_ON(sizeof(newstate) != sizeof(struct fxregs_state));
+
+ if (!cpu_feature_enabled(X86_FEATURE_FXSR))
return -ENODEV;
- fpu__prepare_write(fpu);
- fpstate_sanitize_xstate(fpu);
+ /* No funny business with partial or oversized writes is permitted. */
+ if (pos != 0 || count != sizeof(newstate))
+ return -EINVAL;
- ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
- &fpu->state.fxsave, 0, -1);
+ ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &newstate, 0, -1);
+ if (ret)
+ return ret;
- /*
- * mxcsr reserved bits must be masked to zero for security reasons.
- */
- fpu->state.fxsave.mxcsr &= mxcsr_feature_mask;
+ /* Do not allow an invalid MXCSR value. */
+ if (newstate.mxcsr & ~mxcsr_feature_mask)
+ return -EINVAL;
- /*
- * update the header bits in the xsave header, indicating the
- * presence of FP and SSE state.
- */
- if (boot_cpu_has(X86_FEATURE_XSAVE))
+ fpu_force_restore(fpu);
+
+ /* Copy the state */
+ memcpy(&fpu->state.fxsave, &newstate, sizeof(newstate));
+
+ /* Clear xmm8..15 */
+ BUILD_BUG_ON(sizeof(fpu->state.fxsave.xmm_space) != 16 * 16);
+ memset(&fpu->state.fxsave.xmm_space[8], 0, 8 * 16);
+
+ /* Mark FP and SSE as in use when XSAVE is enabled */
+ if (use_xsave())
fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FPSSE;
- return ret;
+ return 0;
}
int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
struct membuf to)
{
- struct fpu *fpu = &target->thread.fpu;
- struct xregs_state *xsave;
-
- if (!boot_cpu_has(X86_FEATURE_XSAVE))
+ if (!cpu_feature_enabled(X86_FEATURE_XSAVE))
return -ENODEV;
- xsave = &fpu->state.xsave;
-
- fpu__prepare_read(fpu);
+ sync_fpstate(&target->thread.fpu);
- if (using_compacted_format()) {
- copy_xstate_to_kernel(to, xsave);
- return 0;
- } else {
- fpstate_sanitize_xstate(fpu);
- /*
- * Copy the 48 bytes defined by the software into the xsave
- * area in the thread struct, so that we can copy the whole
- * area to user using one user_regset_copyout().
- */
- memcpy(&xsave->i387.sw_reserved, xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes));
-
- /*
- * Copy the xstate memory layout.
- */
- return membuf_write(&to, xsave, fpu_user_xstate_size);
- }
+ copy_xstate_to_uabi_buf(to, target, XSTATE_COPY_XSAVE);
+ return 0;
}
int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
@@ -108,44 +140,34 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
const void *kbuf, const void __user *ubuf)
{
struct fpu *fpu = &target->thread.fpu;
- struct xregs_state *xsave;
+ struct xregs_state *tmpbuf = NULL;
int ret;
- if (!boot_cpu_has(X86_FEATURE_XSAVE))
+ if (!cpu_feature_enabled(X86_FEATURE_XSAVE))
return -ENODEV;
/*
* A whole standard-format XSAVE buffer is needed:
*/
- if ((pos != 0) || (count < fpu_user_xstate_size))
+ if (pos != 0 || count != fpu_user_xstate_size)
return -EFAULT;
- xsave = &fpu->state.xsave;
-
- fpu__prepare_write(fpu);
+ if (!kbuf) {
+ tmpbuf = vmalloc(count);
+ if (!tmpbuf)
+ return -ENOMEM;
- if (using_compacted_format()) {
- if (kbuf)
- ret = copy_kernel_to_xstate(xsave, kbuf);
- else
- ret = copy_user_to_xstate(xsave, ubuf);
- } else {
- ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1);
- if (!ret)
- ret = validate_user_xstate_header(&xsave->header);
+ if (copy_from_user(tmpbuf, ubuf, count)) {
+ ret = -EFAULT;
+ goto out;
+ }
}
- /*
- * mxcsr reserved bits must be masked to zero for security reasons.
- */
- xsave->i387.mxcsr &= mxcsr_feature_mask;
-
- /*
- * In case of failure, mark all states as init:
- */
- if (ret)
- fpstate_init(&fpu->state);
+ fpu_force_restore(fpu);
+ ret = copy_uabi_from_kernel_to_xstate(&fpu->state.xsave, kbuf ?: tmpbuf);
+out:
+ vfree(tmpbuf);
return ret;
}
@@ -221,10 +243,10 @@ static inline u32 twd_fxsr_to_i387(struct fxregs_state *fxsave)
* FXSR floating point environment conversions.
*/
-void
-convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk)
+static void __convert_from_fxsr(struct user_i387_ia32_struct *env,
+ struct task_struct *tsk,
+ struct fxregs_state *fxsave)
{
- struct fxregs_state *fxsave = &tsk->thread.fpu.state.fxsave;
struct _fpreg *to = (struct _fpreg *) &env->st_space[0];
struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0];
int i;
@@ -258,6 +280,12 @@ convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk)
memcpy(&to[i], &from[i], sizeof(to[0]));
}
+void
+convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk)
+{
+ __convert_from_fxsr(env, tsk, &tsk->thread.fpu.state.fxsave);
+}
+
void convert_to_fxsr(struct fxregs_state *fxsave,
const struct user_i387_ia32_struct *env)
@@ -290,25 +318,29 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset,
{
struct fpu *fpu = &target->thread.fpu;
struct user_i387_ia32_struct env;
+ struct fxregs_state fxsave, *fx;
- fpu__prepare_read(fpu);
+ sync_fpstate(fpu);
- if (!boot_cpu_has(X86_FEATURE_FPU))
+ if (!cpu_feature_enabled(X86_FEATURE_FPU))
return fpregs_soft_get(target, regset, to);
- if (!boot_cpu_has(X86_FEATURE_FXSR)) {
+ if (!cpu_feature_enabled(X86_FEATURE_FXSR)) {
return membuf_write(&to, &fpu->state.fsave,
sizeof(struct fregs_state));
}
- fpstate_sanitize_xstate(fpu);
+ if (use_xsave()) {
+ struct membuf mb = { .p = &fxsave, .left = sizeof(fxsave) };
- if (to.left == sizeof(env)) {
- convert_from_fxsr(to.p, target);
- return 0;
+ /* Handle init state optimized xstate correctly */
+ copy_xstate_to_uabi_buf(mb, target, XSTATE_COPY_FP);
+ fx = &fxsave;
+ } else {
+ fx = &fpu->state.fxsave;
}
- convert_from_fxsr(&env, target);
+ __convert_from_fxsr(&env, target, fx);
return membuf_write(&to, &env, sizeof(env));
}
@@ -320,31 +352,32 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
struct user_i387_ia32_struct env;
int ret;
- fpu__prepare_write(fpu);
- fpstate_sanitize_xstate(fpu);
+ /* No funny business with partial or oversized writes is permitted. */
+ if (pos != 0 || count != sizeof(struct user_i387_ia32_struct))
+ return -EINVAL;
- if (!boot_cpu_has(X86_FEATURE_FPU))
+ if (!cpu_feature_enabled(X86_FEATURE_FPU))
return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf);
- if (!boot_cpu_has(X86_FEATURE_FXSR))
- return user_regset_copyin(&pos, &count, &kbuf, &ubuf,
- &fpu->state.fsave, 0,
- -1);
+ ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &env, 0, -1);
+ if (ret)
+ return ret;
- if (pos > 0 || count < sizeof(env))
- convert_from_fxsr(&env, target);
+ fpu_force_restore(fpu);
- ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &env, 0, -1);
- if (!ret)
- convert_to_fxsr(&target->thread.fpu.state.fxsave, &env);
+ if (cpu_feature_enabled(X86_FEATURE_FXSR))
+ convert_to_fxsr(&fpu->state.fxsave, &env);
+ else
+ memcpy(&fpu->state.fsave, &env, sizeof(env));
/*
- * update the header bit in the xsave header, indicating the
+ * Update the header bit in the xsave header, indicating the
* presence of FP.
*/
- if (boot_cpu_has(X86_FEATURE_XSAVE))
+ if (cpu_feature_enabled(X86_FEATURE_XSAVE))
fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FP;
- return ret;
+
+ return 0;
}
#endif /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index ec3ae3054792..445c57c9c539 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -15,29 +15,30 @@
#include <asm/sigframe.h>
#include <asm/trace/fpu.h>
-static struct _fpx_sw_bytes fx_sw_reserved, fx_sw_reserved_ia32;
+static struct _fpx_sw_bytes fx_sw_reserved __ro_after_init;
+static struct _fpx_sw_bytes fx_sw_reserved_ia32 __ro_after_init;
/*
* Check for the presence of extended state information in the
* user fpstate pointer in the sigcontext.
*/
-static inline int check_for_xstate(struct fxregs_state __user *buf,
- void __user *fpstate,
- struct _fpx_sw_bytes *fx_sw)
+static inline int check_xstate_in_sigframe(struct fxregs_state __user *fxbuf,
+ struct _fpx_sw_bytes *fx_sw)
{
int min_xstate_size = sizeof(struct fxregs_state) +
sizeof(struct xstate_header);
+ void __user *fpstate = fxbuf;
unsigned int magic2;
- if (__copy_from_user(fx_sw, &buf->sw_reserved[0], sizeof(*fx_sw)))
- return -1;
+ if (__copy_from_user(fx_sw, &fxbuf->sw_reserved[0], sizeof(*fx_sw)))
+ return -EFAULT;
/* Check for the first magic field and other error scenarios. */
if (fx_sw->magic1 != FP_XSTATE_MAGIC1 ||
fx_sw->xstate_size < min_xstate_size ||
fx_sw->xstate_size > fpu_user_xstate_size ||
fx_sw->xstate_size > fx_sw->extended_size)
- return -1;
+ goto setfx;
/*
* Check for the presence of second magic word at the end of memory
@@ -45,10 +46,18 @@ static inline int check_for_xstate(struct fxregs_state __user *buf,
* fpstate layout with out copying the extended state information
* in the memory layout.
*/
- if (__get_user(magic2, (__u32 __user *)(fpstate + fx_sw->xstate_size))
- || magic2 != FP_XSTATE_MAGIC2)
- return -1;
+ if (__get_user(magic2, (__u32 __user *)(fpstate + fx_sw->xstate_size)))
+ return -EFAULT;
+ if (likely(magic2 == FP_XSTATE_MAGIC2))
+ return 0;
+setfx:
+ trace_x86_fpu_xstate_check_failed(&current->thread.fpu);
+
+ /* Set the parameters for fx only state */
+ fx_sw->magic1 = 0;
+ fx_sw->xstate_size = sizeof(struct fxregs_state);
+ fx_sw->xfeatures = XFEATURE_MASK_FPSSE;
return 0;
}
@@ -64,7 +73,7 @@ static inline int save_fsave_header(struct task_struct *tsk, void __user *buf)
fpregs_lock();
if (!test_thread_flag(TIF_NEED_FPU_LOAD))
- copy_fxregs_to_kernel(&tsk->thread.fpu);
+ fxsave(&tsk->thread.fpu.state.fxsave);
fpregs_unlock();
convert_from_fxsr(&env, tsk);
@@ -129,11 +138,11 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf)
int err;
if (use_xsave())
- err = copy_xregs_to_user(buf);
+ err = xsave_to_user_sigframe(buf);
else if (use_fxsr())
- err = copy_fxregs_to_user((struct fxregs_state __user *) buf);
+ err = fxsave_to_user_sigframe((struct fxregs_state __user *) buf);
else
- err = copy_fregs_to_user((struct fregs_state __user *) buf);
+ err = fnsave_to_user_sigframe((struct fregs_state __user *) buf);
if (unlikely(err) && __clear_user(buf, fpu_user_xstate_size))
err = -EFAULT;
@@ -188,7 +197,7 @@ retry:
*/
fpregs_lock();
if (test_thread_flag(TIF_NEED_FPU_LOAD))
- __fpregs_load_activate();
+ fpregs_restore_userregs();
pagefault_disable();
ret = copy_fpregs_to_sigframe(buf_fx);
@@ -211,280 +220,202 @@ retry:
return 0;
}
-static inline void
-sanitize_restored_user_xstate(union fpregs_state *state,
- struct user_i387_ia32_struct *ia32_env,
- u64 user_xfeatures, int fx_only)
+static int __restore_fpregs_from_user(void __user *buf, u64 xrestore,
+ bool fx_only)
{
- struct xregs_state *xsave = &state->xsave;
- struct xstate_header *header = &xsave->header;
-
if (use_xsave()) {
- /*
- * Note: we don't need to zero the reserved bits in the
- * xstate_header here because we either didn't copy them at all,
- * or we checked earlier that they aren't set.
- */
+ u64 init_bv = xfeatures_mask_uabi() & ~xrestore;
+ int ret;
- /*
- * 'user_xfeatures' might have bits clear which are
- * set in header->xfeatures. This represents features that
- * were in init state prior to a signal delivery, and need
- * to be reset back to the init state. Clear any user
- * feature bits which are set in the kernel buffer to get
- * them back to the init state.
- *
- * Supervisor state is unchanged by input from userspace.
- * Ensure supervisor state bits stay set and supervisor
- * state is not modified.
- */
- if (fx_only)
- header->xfeatures = XFEATURE_MASK_FPSSE;
+ if (likely(!fx_only))
+ ret = xrstor_from_user_sigframe(buf, xrestore);
else
- header->xfeatures &= user_xfeatures |
- xfeatures_mask_supervisor();
- }
-
- if (use_fxsr()) {
- /*
- * mscsr reserved bits must be masked to zero for security
- * reasons.
- */
- xsave->i387.mxcsr &= mxcsr_feature_mask;
+ ret = fxrstor_from_user_sigframe(buf);
- if (ia32_env)
- convert_to_fxsr(&state->fxsave, ia32_env);
+ if (!ret && unlikely(init_bv))
+ os_xrstor(&init_fpstate.xsave, init_bv);
+ return ret;
+ } else if (use_fxsr()) {
+ return fxrstor_from_user_sigframe(buf);
+ } else {
+ return frstor_from_user_sigframe(buf);
}
}
/*
- * Restore the extended state if present. Otherwise, restore the FP/SSE state.
+ * Attempt to restore the FPU registers directly from user memory.
+ * Pagefaults are handled and any errors returned are fatal.
*/
-static int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_only)
+static int restore_fpregs_from_user(void __user *buf, u64 xrestore,
+ bool fx_only, unsigned int size)
{
- u64 init_bv;
- int r;
+ struct fpu *fpu = &current->thread.fpu;
+ int ret;
- if (use_xsave()) {
- if (fx_only) {
- init_bv = xfeatures_mask_user() & ~XFEATURE_MASK_FPSSE;
-
- r = copy_user_to_fxregs(buf);
- if (!r)
- copy_kernel_to_xregs(&init_fpstate.xsave, init_bv);
- return r;
- } else {
- init_bv = xfeatures_mask_user() & ~xbv;
-
- r = copy_user_to_xregs(buf, xbv);
- if (!r && unlikely(init_bv))
- copy_kernel_to_xregs(&init_fpstate.xsave, init_bv);
- return r;
- }
- } else if (use_fxsr()) {
- return copy_user_to_fxregs(buf);
- } else
- return copy_user_to_fregs(buf);
+retry:
+ fpregs_lock();
+ pagefault_disable();
+ ret = __restore_fpregs_from_user(buf, xrestore, fx_only);
+ pagefault_enable();
+
+ if (unlikely(ret)) {
+ /*
+ * The above did an FPU restore operation, restricted to
+ * the user portion of the registers, and failed, but the
+ * microcode might have modified the FPU registers
+ * nevertheless.
+ *
+ * If the FPU registers do not belong to current, then
+ * invalidate the FPU register state otherwise the task
+ * might preempt current and return to user space with
+ * corrupted FPU registers.
+ */
+ if (test_thread_flag(TIF_NEED_FPU_LOAD))
+ __cpu_invalidate_fpregs_state();
+ fpregs_unlock();
+
+ /* Try to handle #PF, but anything else is fatal. */
+ if (ret != -EFAULT)
+ return -EINVAL;
+
+ ret = fault_in_pages_readable(buf, size);
+ if (!ret)
+ goto retry;
+ return ret;
+ }
+
+ /*
+ * Restore supervisor states: previous context switch etc has done
+ * XSAVES and saved the supervisor states in the kernel buffer from
+ * which they can be restored now.
+ *
+ * It would be optimal to handle this with a single XRSTORS, but
+ * this does not work because the rest of the FPU registers have
+ * been restored from a user buffer directly.
+ */
+ if (test_thread_flag(TIF_NEED_FPU_LOAD) && xfeatures_mask_supervisor())
+ os_xrstor(&fpu->state.xsave, xfeatures_mask_supervisor());
+
+ fpregs_mark_activate();
+ fpregs_unlock();
+ return 0;
}
-static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
+static int __fpu_restore_sig(void __user *buf, void __user *buf_fx,
+ bool ia32_fxstate)
{
- struct user_i387_ia32_struct *envp = NULL;
int state_size = fpu_kernel_xstate_size;
- int ia32_fxstate = (buf != buf_fx);
struct task_struct *tsk = current;
struct fpu *fpu = &tsk->thread.fpu;
struct user_i387_ia32_struct env;
u64 user_xfeatures = 0;
- int fx_only = 0;
- int ret = 0;
-
- ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) ||
- IS_ENABLED(CONFIG_IA32_EMULATION));
-
- if (!buf) {
- fpu__clear_user_states(fpu);
- return 0;
- }
-
- if (!access_ok(buf, size)) {
- ret = -EACCES;
- goto out;
- }
-
- if (!static_cpu_has(X86_FEATURE_FPU)) {
- ret = fpregs_soft_set(current, NULL, 0,
- sizeof(struct user_i387_ia32_struct),
- NULL, buf);
- goto out;
- }
+ bool fx_only = false;
+ int ret;
if (use_xsave()) {
struct _fpx_sw_bytes fx_sw_user;
- if (unlikely(check_for_xstate(buf_fx, buf_fx, &fx_sw_user))) {
- /*
- * Couldn't find the extended state information in the
- * memory layout. Restore just the FP/SSE and init all
- * the other extended state.
- */
- state_size = sizeof(struct fxregs_state);
- fx_only = 1;
- trace_x86_fpu_xstate_check_failed(fpu);
- } else {
- state_size = fx_sw_user.xstate_size;
- user_xfeatures = fx_sw_user.xfeatures;
- }
- }
-
- if ((unsigned long)buf_fx % 64)
- fx_only = 1;
-
- if (!ia32_fxstate) {
- /*
- * Attempt to restore the FPU registers directly from user
- * memory. For that to succeed, the user access cannot cause
- * page faults. If it does, fall back to the slow path below,
- * going through the kernel buffer with the enabled pagefault
- * handler.
- */
- fpregs_lock();
- pagefault_disable();
- ret = copy_user_to_fpregs_zeroing(buf_fx, user_xfeatures, fx_only);
- pagefault_enable();
- if (!ret) {
-
- /*
- * Restore supervisor states: previous context switch
- * etc has done XSAVES and saved the supervisor states
- * in the kernel buffer from which they can be restored
- * now.
- *
- * We cannot do a single XRSTORS here - which would
- * be nice - because the rest of the FPU registers are
- * being restored from a user buffer directly. The
- * single XRSTORS happens below, when the user buffer
- * has been copied to the kernel one.
- */
- if (test_thread_flag(TIF_NEED_FPU_LOAD) &&
- xfeatures_mask_supervisor())
- copy_kernel_to_xregs(&fpu->state.xsave,
- xfeatures_mask_supervisor());
- fpregs_mark_activate();
- fpregs_unlock();
- return 0;
- }
- /*
- * The above did an FPU restore operation, restricted to
- * the user portion of the registers, and failed, but the
- * microcode might have modified the FPU registers
- * nevertheless.
- *
- * If the FPU registers do not belong to current, then
- * invalidate the FPU register state otherwise the task might
- * preempt current and return to user space with corrupted
- * FPU registers.
- *
- * In case current owns the FPU registers then no further
- * action is required. The fixup below will handle it
- * correctly.
- */
- if (test_thread_flag(TIF_NEED_FPU_LOAD))
- __cpu_invalidate_fpregs_state();
+ ret = check_xstate_in_sigframe(buf_fx, &fx_sw_user);
+ if (unlikely(ret))
+ return ret;
- fpregs_unlock();
+ fx_only = !fx_sw_user.magic1;
+ state_size = fx_sw_user.xstate_size;
+ user_xfeatures = fx_sw_user.xfeatures;
} else {
+ user_xfeatures = XFEATURE_MASK_FPSSE;
+ }
+
+ if (likely(!ia32_fxstate)) {
/*
- * For 32-bit frames with fxstate, copy the fxstate so it can
- * be reconstructed later.
+ * Attempt to restore the FPU registers directly from user
+ * memory. For that to succeed, the user access cannot cause page
+ * faults. If it does, fall back to the slow path below, going
+ * through the kernel buffer with the enabled pagefault handler.
*/
- ret = __copy_from_user(&env, buf, sizeof(env));
- if (ret)
- goto out;
- envp = &env;
+ return restore_fpregs_from_user(buf_fx, user_xfeatures, fx_only,
+ state_size);
}
/*
+ * Copy the legacy state because the FP portion of the FX frame has
+ * to be ignored for histerical raisins. The legacy state is folded
+ * in once the larger state has been copied.
+ */
+ ret = __copy_from_user(&env, buf, sizeof(env));
+ if (ret)
+ return ret;
+
+ /*
* By setting TIF_NEED_FPU_LOAD it is ensured that our xstate is
* not modified on context switch and that the xstate is considered
* to be loaded again on return to userland (overriding last_cpu avoids
* the optimisation).
*/
fpregs_lock();
-
if (!test_thread_flag(TIF_NEED_FPU_LOAD)) {
-
/*
- * Supervisor states are not modified by user space input. Save
- * current supervisor states first and invalidate the FPU regs.
+ * If supervisor states are available then save the
+ * hardware state in current's fpstate so that the
+ * supervisor state is preserved. Save the full state for
+ * simplicity. There is no point in optimizing this by only
+ * saving the supervisor states and then shuffle them to
+ * the right place in memory. It's ia32 mode. Shrug.
*/
if (xfeatures_mask_supervisor())
- copy_supervisor_to_kernel(&fpu->state.xsave);
+ os_xsave(&fpu->state.xsave);
set_thread_flag(TIF_NEED_FPU_LOAD);
}
__fpu_invalidate_fpregs_state(fpu);
+ __cpu_invalidate_fpregs_state();
fpregs_unlock();
if (use_xsave() && !fx_only) {
- u64 init_bv = xfeatures_mask_user() & ~user_xfeatures;
-
- ret = copy_user_to_xstate(&fpu->state.xsave, buf_fx);
+ ret = copy_sigframe_from_user_to_xstate(&fpu->state.xsave, buf_fx);
if (ret)
- goto out;
+ return ret;
+ } else {
+ if (__copy_from_user(&fpu->state.fxsave, buf_fx,
+ sizeof(fpu->state.fxsave)))
+ return -EFAULT;
- sanitize_restored_user_xstate(&fpu->state, envp, user_xfeatures,
- fx_only);
+ /* Reject invalid MXCSR values. */
+ if (fpu->state.fxsave.mxcsr & ~mxcsr_feature_mask)
+ return -EINVAL;
- fpregs_lock();
- if (unlikely(init_bv))
- copy_kernel_to_xregs(&init_fpstate.xsave, init_bv);
+ /* Enforce XFEATURE_MASK_FPSSE when XSAVE is enabled */
+ if (use_xsave())
+ fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FPSSE;
+ }
+
+ /* Fold the legacy FP storage */
+ convert_to_fxsr(&fpu->state.fxsave, &env);
+ fpregs_lock();
+ if (use_xsave()) {
/*
- * Restore previously saved supervisor xstates along with
- * copied-in user xstates.
+ * Remove all UABI feature bits not set in user_xfeatures
+ * from the memory xstate header which makes the full
+ * restore below bring them into init state. This works for
+ * fx_only mode as well because that has only FP and SSE
+ * set in user_xfeatures.
+ *
+ * Preserve supervisor states!
*/
- ret = copy_kernel_to_xregs_err(&fpu->state.xsave,
- user_xfeatures | xfeatures_mask_supervisor());
+ u64 mask = user_xfeatures | xfeatures_mask_supervisor();
- } else if (use_fxsr()) {
- ret = __copy_from_user(&fpu->state.fxsave, buf_fx, state_size);
- if (ret) {
- ret = -EFAULT;
- goto out;
- }
-
- sanitize_restored_user_xstate(&fpu->state, envp, user_xfeatures,
- fx_only);
-
- fpregs_lock();
- if (use_xsave()) {
- u64 init_bv;
-
- init_bv = xfeatures_mask_user() & ~XFEATURE_MASK_FPSSE;
- copy_kernel_to_xregs(&init_fpstate.xsave, init_bv);
- }
-
- ret = copy_kernel_to_fxregs_err(&fpu->state.fxsave);
+ fpu->state.xsave.header.xfeatures &= mask;
+ ret = os_xrstor_safe(&fpu->state.xsave, xfeatures_mask_all);
} else {
- ret = __copy_from_user(&fpu->state.fsave, buf_fx, state_size);
- if (ret)
- goto out;
-
- fpregs_lock();
- ret = copy_kernel_to_fregs_err(&fpu->state.fsave);
+ ret = fxrstor_safe(&fpu->state.fxsave);
}
- if (!ret)
+
+ if (likely(!ret))
fpregs_mark_activate();
- else
- fpregs_deactivate(fpu);
- fpregs_unlock();
-out:
- if (ret)
- fpu__clear_user_states(fpu);
+ fpregs_unlock();
return ret;
}
-
static inline int xstate_sigframe_size(void)
{
return use_xsave() ? fpu_user_xstate_size + FP_XSTATE_MAGIC2_SIZE :
@@ -496,15 +427,47 @@ static inline int xstate_sigframe_size(void)
*/
int fpu__restore_sig(void __user *buf, int ia32_frame)
{
+ unsigned int size = xstate_sigframe_size();
+ struct fpu *fpu = &current->thread.fpu;
void __user *buf_fx = buf;
- int size = xstate_sigframe_size();
+ bool ia32_fxstate = false;
+ int ret;
+
+ if (unlikely(!buf)) {
+ fpu__clear_user_states(fpu);
+ return 0;
+ }
+ ia32_frame &= (IS_ENABLED(CONFIG_X86_32) ||
+ IS_ENABLED(CONFIG_IA32_EMULATION));
+
+ /*
+ * Only FXSR enabled systems need the FX state quirk.
+ * FRSTOR does not need it and can use the fast path.
+ */
if (ia32_frame && use_fxsr()) {
buf_fx = buf + sizeof(struct fregs_state);
size += sizeof(struct fregs_state);
+ ia32_fxstate = true;
}
- return __fpu__restore_sig(buf, buf_fx, size);
+ if (!access_ok(buf, size)) {
+ ret = -EACCES;
+ goto out;
+ }
+
+ if (!IS_ENABLED(CONFIG_X86_64) && !cpu_feature_enabled(X86_FEATURE_FPU)) {
+ ret = fpregs_soft_set(current, NULL, 0,
+ sizeof(struct user_i387_ia32_struct),
+ NULL, buf);
+ } else {
+ ret = __fpu_restore_sig(buf, buf_fx, ia32_fxstate);
+ }
+
+out:
+ if (unlikely(ret))
+ fpu__clear_user_states(fpu);
+ return ret;
}
unsigned long
@@ -523,6 +486,25 @@ fpu__alloc_mathframe(unsigned long sp, int ia32_frame,
return sp;
}
+
+unsigned long fpu__get_fpstate_size(void)
+{
+ unsigned long ret = xstate_sigframe_size();
+
+ /*
+ * This space is needed on (most) 32-bit kernels, or when a 32-bit
+ * app is running on a 64-bit kernel. To keep things simple, just
+ * assume the worst case and always include space for 'freg_state',
+ * even for 64-bit apps on 64-bit kernels. This wastes a bit of
+ * space, but keeps the code simple.
+ */
+ if ((IS_ENABLED(CONFIG_IA32_EMULATION) ||
+ IS_ENABLED(CONFIG_X86_32)) && use_fxsr())
+ ret += sizeof(struct fregs_state);
+
+ return ret;
+}
+
/*
* Prepare the SW reserved portion of the fxsave memory layout, indicating
* the presence of the extended state information in the memory layout
@@ -536,7 +518,7 @@ void fpu__init_prepare_fx_sw_frame(void)
fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1;
fx_sw_reserved.extended_size = size;
- fx_sw_reserved.xfeatures = xfeatures_mask_user();
+ fx_sw_reserved.xfeatures = xfeatures_mask_uabi();
fx_sw_reserved.xstate_size = fpu_user_xstate_size;
if (IS_ENABLED(CONFIG_IA32_EMULATION) ||
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index d0eef963aad1..c8def1b7f8fb 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -59,19 +59,24 @@ static short xsave_cpuid_features[] __initdata = {
* This represents the full set of bits that should ever be set in a kernel
* XSAVE buffer, both supervisor and user xstates.
*/
-u64 xfeatures_mask_all __read_mostly;
-
-static unsigned int xstate_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1};
-static unsigned int xstate_sizes[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1};
-static unsigned int xstate_comp_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1};
-static unsigned int xstate_supervisor_only_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1};
+u64 xfeatures_mask_all __ro_after_init;
+EXPORT_SYMBOL_GPL(xfeatures_mask_all);
+
+static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init =
+ { [ 0 ... XFEATURE_MAX - 1] = -1};
+static unsigned int xstate_sizes[XFEATURE_MAX] __ro_after_init =
+ { [ 0 ... XFEATURE_MAX - 1] = -1};
+static unsigned int xstate_comp_offsets[XFEATURE_MAX] __ro_after_init =
+ { [ 0 ... XFEATURE_MAX - 1] = -1};
+static unsigned int xstate_supervisor_only_offsets[XFEATURE_MAX] __ro_after_init =
+ { [ 0 ... XFEATURE_MAX - 1] = -1};
/*
* The XSAVE area of kernel can be in standard or compacted format;
* it is always in standard format for user mode. This is the user
* mode standard format size used for signal and ptrace frames.
*/
-unsigned int fpu_user_xstate_size;
+unsigned int fpu_user_xstate_size __ro_after_init;
/*
* Return whether the system supports a given xfeature.
@@ -125,103 +130,13 @@ static bool xfeature_is_supervisor(int xfeature_nr)
}
/*
- * When executing XSAVEOPT (or other optimized XSAVE instructions), if
- * a processor implementation detects that an FPU state component is still
- * (or is again) in its initialized state, it may clear the corresponding
- * bit in the header.xfeatures field, and can skip the writeout of registers
- * to the corresponding memory layout.
- *
- * This means that when the bit is zero, the state component might still contain
- * some previous - non-initialized register state.
- *
- * Before writing xstate information to user-space we sanitize those components,
- * to always ensure that the memory layout of a feature will be in the init state
- * if the corresponding header bit is zero. This is to ensure that user-space doesn't
- * see some stale state in the memory layout during signal handling, debugging etc.
- */
-void fpstate_sanitize_xstate(struct fpu *fpu)
-{
- struct fxregs_state *fx = &fpu->state.fxsave;
- int feature_bit;
- u64 xfeatures;
-
- if (!use_xsaveopt())
- return;
-
- xfeatures = fpu->state.xsave.header.xfeatures;
-
- /*
- * None of the feature bits are in init state. So nothing else
- * to do for us, as the memory layout is up to date.
- */
- if ((xfeatures & xfeatures_mask_all) == xfeatures_mask_all)
- return;
-
- /*
- * FP is in init state
- */
- if (!(xfeatures & XFEATURE_MASK_FP)) {
- fx->cwd = 0x37f;
- fx->swd = 0;
- fx->twd = 0;
- fx->fop = 0;
- fx->rip = 0;
- fx->rdp = 0;
- memset(fx->st_space, 0, sizeof(fx->st_space));
- }
-
- /*
- * SSE is in init state
- */
- if (!(xfeatures & XFEATURE_MASK_SSE))
- memset(fx->xmm_space, 0, sizeof(fx->xmm_space));
-
- /*
- * First two features are FPU and SSE, which above we handled
- * in a special way already:
- */
- feature_bit = 0x2;
- xfeatures = (xfeatures_mask_user() & ~xfeatures) >> 2;
-
- /*
- * Update all the remaining memory layouts according to their
- * standard xstate layout, if their header bit is in the init
- * state:
- */
- while (xfeatures) {
- if (xfeatures & 0x1) {
- int offset = xstate_comp_offsets[feature_bit];
- int size = xstate_sizes[feature_bit];
-
- memcpy((void *)fx + offset,
- (void *)&init_fpstate.xsave + offset,
- size);
- }
-
- xfeatures >>= 1;
- feature_bit++;
- }
-}
-
-/*
* Enable the extended processor state save/restore feature.
* Called once per CPU onlining.
*/
void fpu__init_cpu_xstate(void)
{
- u64 unsup_bits;
-
if (!boot_cpu_has(X86_FEATURE_XSAVE) || !xfeatures_mask_all)
return;
- /*
- * Unsupported supervisor xstates should not be found in
- * the xfeatures mask.
- */
- unsup_bits = xfeatures_mask_all & XFEATURE_MASK_SUPERVISOR_UNSUPPORTED;
- WARN_ONCE(unsup_bits, "x86/fpu: Found unsupported supervisor xstates: 0x%llx\n",
- unsup_bits);
-
- xfeatures_mask_all &= ~XFEATURE_MASK_SUPERVISOR_UNSUPPORTED;
cr4_set_bits(X86_CR4_OSXSAVE);
@@ -230,14 +145,14 @@ void fpu__init_cpu_xstate(void)
* managed by XSAVE{C, OPT, S} and XRSTOR{S}. Only XSAVE user
* states can be set here.
*/
- xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_user());
+ xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_uabi());
/*
* MSR_IA32_XSS sets supervisor states managed by XSAVES.
*/
if (boot_cpu_has(X86_FEATURE_XSAVES)) {
wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() |
- xfeatures_mask_dynamic());
+ xfeatures_mask_independent());
}
}
@@ -441,12 +356,35 @@ static void __init print_xstate_offset_size(void)
}
/*
+ * All supported features have either init state all zeros or are
+ * handled in setup_init_fpu() individually. This is an explicit
+ * feature list and does not use XFEATURE_MASK*SUPPORTED to catch
+ * newly added supported features at build time and make people
+ * actually look at the init state for the new feature.
+ */
+#define XFEATURES_INIT_FPSTATE_HANDLED \
+ (XFEATURE_MASK_FP | \
+ XFEATURE_MASK_SSE | \
+ XFEATURE_MASK_YMM | \
+ XFEATURE_MASK_OPMASK | \
+ XFEATURE_MASK_ZMM_Hi256 | \
+ XFEATURE_MASK_Hi16_ZMM | \
+ XFEATURE_MASK_PKRU | \
+ XFEATURE_MASK_BNDREGS | \
+ XFEATURE_MASK_BNDCSR | \
+ XFEATURE_MASK_PASID)
+
+/*
* setup the xstate image representing the init state
*/
static void __init setup_init_fpu_buf(void)
{
static int on_boot_cpu __initdata = 1;
+ BUILD_BUG_ON((XFEATURE_MASK_USER_SUPPORTED |
+ XFEATURE_MASK_SUPERVISOR_SUPPORTED) !=
+ XFEATURES_INIT_FPSTATE_HANDLED);
+
WARN_ON_FPU(!on_boot_cpu);
on_boot_cpu = 0;
@@ -463,13 +401,25 @@ static void __init setup_init_fpu_buf(void)
/*
* Init all the features state with header.xfeatures being 0x0
*/
- copy_kernel_to_xregs_booting(&init_fpstate.xsave);
+ os_xrstor_booting(&init_fpstate.xsave);
/*
- * Dump the init state again. This is to identify the init state
- * of any feature which is not represented by all zero's.
+ * All components are now in init state. Read the state back so
+ * that init_fpstate contains all non-zero init state. This only
+ * works with XSAVE, but not with XSAVEOPT and XSAVES because
+ * those use the init optimization which skips writing data for
+ * components in init state.
+ *
+ * XSAVE could be used, but that would require to reshuffle the
+ * data when XSAVES is available because XSAVES uses xstate
+ * compaction. But doing so is a pointless exercise because most
+ * components have an all zeros init state except for the legacy
+ * ones (FP and SSE). Those can be saved with FXSAVE into the
+ * legacy area. Adding new features requires to ensure that init
+ * state is all zeroes or if not to add the necessary handling
+ * here.
*/
- copy_xregs_to_kernel_booting(&init_fpstate.xsave);
+ fxsave(&init_fpstate.fxsave);
}
static int xfeature_uncompacted_offset(int xfeature_nr)
@@ -500,25 +450,11 @@ int xfeature_size(int xfeature_nr)
return eax;
}
-/*
- * 'XSAVES' implies two different things:
- * 1. saving of supervisor/system state
- * 2. using the compacted format
- *
- * Use this function when dealing with the compacted format so
- * that it is obvious which aspect of 'XSAVES' is being handled
- * by the calling code.
- */
-int using_compacted_format(void)
-{
- return boot_cpu_has(X86_FEATURE_XSAVES);
-}
-
/* Validate an xstate header supplied by userspace (ptrace or sigreturn) */
-int validate_user_xstate_header(const struct xstate_header *hdr)
+static int validate_user_xstate_header(const struct xstate_header *hdr)
{
/* No unknown or supervisor features may be set */
- if (hdr->xfeatures & ~xfeatures_mask_user())
+ if (hdr->xfeatures & ~xfeatures_mask_uabi())
return -EINVAL;
/* Userspace must use the uncompacted format */
@@ -616,7 +552,7 @@ static void check_xstate_against_struct(int nr)
* how large the XSAVE buffer needs to be. We are recalculating
* it to be safe.
*
- * Dynamic XSAVE features allocate their own buffers and are not
+ * Independent XSAVE features allocate their own buffers and are not
* covered by these checks. Only the size of the buffer for task->fpu
* is checked here.
*/
@@ -632,9 +568,9 @@ static void do_extra_xstate_size_checks(void)
check_xstate_against_struct(i);
/*
* Supervisor state components can be managed only by
- * XSAVES, which is compacted-format only.
+ * XSAVES.
*/
- if (!using_compacted_format())
+ if (!cpu_feature_enabled(X86_FEATURE_XSAVES))
XSTATE_WARN_ON(xfeature_is_supervisor(i));
/* Align from the end of the previous feature */
@@ -644,9 +580,9 @@ static void do_extra_xstate_size_checks(void)
* The offset of a given state in the non-compacted
* format is given to us in a CPUID leaf. We check
* them for being ordered (increasing offsets) in
- * setup_xstate_features().
+ * setup_xstate_features(). XSAVES uses compacted format.
*/
- if (!using_compacted_format())
+ if (!cpu_feature_enabled(X86_FEATURE_XSAVES))
paranoid_xstate_size = xfeature_uncompacted_offset(i);
/*
* The compacted-format offset always depends on where
@@ -682,18 +618,18 @@ static unsigned int __init get_xsaves_size(void)
}
/*
- * Get the total size of the enabled xstates without the dynamic supervisor
+ * Get the total size of the enabled xstates without the independent supervisor
* features.
*/
-static unsigned int __init get_xsaves_size_no_dynamic(void)
+static unsigned int __init get_xsaves_size_no_independent(void)
{
- u64 mask = xfeatures_mask_dynamic();
+ u64 mask = xfeatures_mask_independent();
unsigned int size;
if (!mask)
return get_xsaves_size();
- /* Disable dynamic features. */
+ /* Disable independent features. */
wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor());
/*
@@ -702,7 +638,7 @@ static unsigned int __init get_xsaves_size_no_dynamic(void)
*/
size = get_xsaves_size();
- /* Re-enable dynamic features so XSAVES will work on them again. */
+ /* Re-enable independent features so XSAVES will work on them again. */
wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | mask);
return size;
@@ -745,7 +681,7 @@ static int __init init_xstate_size(void)
xsave_size = get_xsave_size();
if (boot_cpu_has(X86_FEATURE_XSAVES))
- possible_xstate_size = get_xsaves_size_no_dynamic();
+ possible_xstate_size = get_xsaves_size_no_independent();
else
possible_xstate_size = xsave_size;
@@ -786,6 +722,7 @@ void __init fpu__init_system_xstate(void)
{
unsigned int eax, ebx, ecx, edx;
static int on_boot_cpu __initdata = 1;
+ u64 xfeatures;
int err;
int i;
@@ -820,7 +757,7 @@ void __init fpu__init_system_xstate(void)
cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx);
xfeatures_mask_all |= ecx + ((u64)edx << 32);
- if ((xfeatures_mask_user() & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
+ if ((xfeatures_mask_uabi() & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
/*
* This indicates that something really unexpected happened
* with the enumeration. Disable XSAVE and try to continue
@@ -839,7 +776,11 @@ void __init fpu__init_system_xstate(void)
xfeatures_mask_all &= ~BIT_ULL(i);
}
- xfeatures_mask_all &= fpu__get_supported_xfeatures_mask();
+ xfeatures_mask_all &= XFEATURE_MASK_USER_SUPPORTED |
+ XFEATURE_MASK_SUPERVISOR_SUPPORTED;
+
+ /* Store it for paranoia check at the end */
+ xfeatures = xfeatures_mask_all;
/* Enable xstate instructions to be able to continue with initialization: */
fpu__init_cpu_xstate();
@@ -851,14 +792,24 @@ void __init fpu__init_system_xstate(void)
* Update info used for ptrace frames; use standard-format size and no
* supervisor xstates:
*/
- update_regset_xstate_info(fpu_user_xstate_size, xfeatures_mask_user());
+ update_regset_xstate_info(fpu_user_xstate_size, xfeatures_mask_uabi());
fpu__init_prepare_fx_sw_frame();
setup_init_fpu_buf();
setup_xstate_comp_offsets();
setup_supervisor_only_offsets();
- print_xstate_offset_size();
+ /*
+ * Paranoia check whether something in the setup modified the
+ * xfeatures mask.
+ */
+ if (xfeatures != xfeatures_mask_all) {
+ pr_err("x86/fpu: xfeatures modified from 0x%016llx to 0x%016llx during init, disabling XSAVE\n",
+ xfeatures, xfeatures_mask_all);
+ goto out_disable;
+ }
+
+ print_xstate_offset_size();
pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n",
xfeatures_mask_all,
fpu_kernel_xstate_size,
@@ -878,16 +829,16 @@ void fpu__resume_cpu(void)
/*
* Restore XCR0 on xsave capable CPUs:
*/
- if (boot_cpu_has(X86_FEATURE_XSAVE))
- xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_user());
+ if (cpu_feature_enabled(X86_FEATURE_XSAVE))
+ xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_uabi());
/*
* Restore IA32_XSS. The same CPUID bit enumerates support
* of XSAVES and MSR_IA32_XSS.
*/
- if (boot_cpu_has(X86_FEATURE_XSAVES)) {
+ if (cpu_feature_enabled(X86_FEATURE_XSAVES)) {
wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() |
- xfeatures_mask_dynamic());
+ xfeatures_mask_independent());
}
}
@@ -955,36 +906,6 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
}
EXPORT_SYMBOL_GPL(get_xsave_addr);
-/*
- * This wraps up the common operations that need to occur when retrieving
- * data from xsave state. It first ensures that the current task was
- * using the FPU and retrieves the data in to a buffer. It then calculates
- * the offset of the requested field in the buffer.
- *
- * This function is safe to call whether the FPU is in use or not.
- *
- * Note that this only works on the current task.
- *
- * Inputs:
- * @xfeature_nr: state which is defined in xsave.h (e.g. XFEATURE_FP,
- * XFEATURE_SSE, etc...)
- * Output:
- * address of the state in the xsave area or NULL if the state
- * is not present or is in its 'init state'.
- */
-const void *get_xsave_field_ptr(int xfeature_nr)
-{
- struct fpu *fpu = &current->thread.fpu;
-
- /*
- * fpu__save() takes the CPU's xstate registers
- * and saves them off to the 'fpu memory buffer.
- */
- fpu__save(fpu);
-
- return get_xsave_addr(&fpu->state.xsave, xfeature_nr);
-}
-
#ifdef CONFIG_ARCH_HAS_PKEYS
/*
@@ -992,17 +913,16 @@ const void *get_xsave_field_ptr(int xfeature_nr)
* rights for @pkey to @init_val.
*/
int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
- unsigned long init_val)
+ unsigned long init_val)
{
- u32 old_pkru;
- int pkey_shift = (pkey * PKRU_BITS_PER_PKEY);
- u32 new_pkru_bits = 0;
+ u32 old_pkru, new_pkru_bits = 0;
+ int pkey_shift;
/*
* This check implies XSAVE support. OSPKE only gets
* set if we enable XSAVE and we enable PKU in XCR0.
*/
- if (!boot_cpu_has(X86_FEATURE_OSPKE))
+ if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
return -EINVAL;
/*
@@ -1010,7 +930,8 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
* values originating from in-kernel users. Complain
* if a bad value is observed.
*/
- WARN_ON_ONCE(pkey >= arch_max_pkey());
+ if (WARN_ON_ONCE(pkey >= arch_max_pkey()))
+ return -EINVAL;
/* Set the bits we need in PKRU: */
if (init_val & PKEY_DISABLE_ACCESS)
@@ -1019,6 +940,7 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
new_pkru_bits |= PKRU_WD_BIT;
/* Shift the bits in to the correct place in PKRU for pkey: */
+ pkey_shift = pkey * PKRU_BITS_PER_PKEY;
new_pkru_bits <<= pkey_shift;
/* Get old PKRU and mask off any old bits in place: */
@@ -1032,170 +954,178 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
}
#endif /* ! CONFIG_ARCH_HAS_PKEYS */
-/*
- * Weird legacy quirk: SSE and YMM states store information in the
- * MXCSR and MXCSR_FLAGS fields of the FP area. That means if the FP
- * area is marked as unused in the xfeatures header, we need to copy
- * MXCSR and MXCSR_FLAGS if either SSE or YMM are in use.
- */
-static inline bool xfeatures_mxcsr_quirk(u64 xfeatures)
-{
- if (!(xfeatures & (XFEATURE_MASK_SSE|XFEATURE_MASK_YMM)))
- return false;
-
- if (xfeatures & XFEATURE_MASK_FP)
- return false;
-
- return true;
-}
-
-static void fill_gap(struct membuf *to, unsigned *last, unsigned offset)
-{
- if (*last >= offset)
- return;
- membuf_write(to, (void *)&init_fpstate.xsave + *last, offset - *last);
- *last = offset;
-}
-
-static void copy_part(struct membuf *to, unsigned *last, unsigned offset,
- unsigned size, void *from)
+static void copy_feature(bool from_xstate, struct membuf *to, void *xstate,
+ void *init_xstate, unsigned int size)
{
- fill_gap(to, last, offset);
- membuf_write(to, from, size);
- *last = offset + size;
+ membuf_write(to, from_xstate ? xstate : init_xstate, size);
}
-/*
- * Convert from kernel XSAVES compacted format to standard format and copy
- * to a kernel-space ptrace buffer.
+/**
+ * copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer
+ * @to: membuf descriptor
+ * @tsk: The task from which to copy the saved xstate
+ * @copy_mode: The requested copy mode
+ *
+ * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming
+ * format, i.e. from the kernel internal hardware dependent storage format
+ * to the requested @mode. UABI XSTATE is always uncompacted!
*
- * It supports partial copy but pos always starts from zero. This is called
- * from xstateregs_get() and there we check the CPU has XSAVES.
+ * It supports partial copy but @to.pos always starts from zero.
*/
-void copy_xstate_to_kernel(struct membuf to, struct xregs_state *xsave)
+void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk,
+ enum xstate_copy_mode copy_mode)
{
+ const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr);
+ struct xregs_state *xsave = &tsk->thread.fpu.state.xsave;
+ struct xregs_state *xinit = &init_fpstate.xsave;
struct xstate_header header;
- const unsigned off_mxcsr = offsetof(struct fxregs_state, mxcsr);
- unsigned size = to.left;
- unsigned last = 0;
+ unsigned int zerofrom;
int i;
- /*
- * The destination is a ptrace buffer; we put in only user xstates:
- */
memset(&header, 0, sizeof(header));
header.xfeatures = xsave->header.xfeatures;
- header.xfeatures &= xfeatures_mask_user();
-
- if (header.xfeatures & XFEATURE_MASK_FP)
- copy_part(&to, &last, 0, off_mxcsr, &xsave->i387);
- if (header.xfeatures & (XFEATURE_MASK_SSE | XFEATURE_MASK_YMM))
- copy_part(&to, &last, off_mxcsr,
- MXCSR_AND_FLAGS_SIZE, &xsave->i387.mxcsr);
- if (header.xfeatures & XFEATURE_MASK_FP)
- copy_part(&to, &last, offsetof(struct fxregs_state, st_space),
- 128, &xsave->i387.st_space);
- if (header.xfeatures & XFEATURE_MASK_SSE)
- copy_part(&to, &last, xstate_offsets[XFEATURE_SSE],
- 256, &xsave->i387.xmm_space);
- /*
- * Fill xsave->i387.sw_reserved value for ptrace frame:
- */
- copy_part(&to, &last, offsetof(struct fxregs_state, sw_reserved),
- 48, xstate_fx_sw_bytes);
- /*
- * Copy xregs_state->header:
- */
- copy_part(&to, &last, offsetof(struct xregs_state, header),
- sizeof(header), &header);
- for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
- /*
- * Copy only in-use xstates:
- */
- if ((header.xfeatures >> i) & 1) {
- void *src = __raw_xsave_addr(xsave, i);
+ /* Mask out the feature bits depending on copy mode */
+ switch (copy_mode) {
+ case XSTATE_COPY_FP:
+ header.xfeatures &= XFEATURE_MASK_FP;
+ break;
- copy_part(&to, &last, xstate_offsets[i],
- xstate_sizes[i], src);
- }
+ case XSTATE_COPY_FX:
+ header.xfeatures &= XFEATURE_MASK_FP | XFEATURE_MASK_SSE;
+ break;
+ case XSTATE_COPY_XSAVE:
+ header.xfeatures &= xfeatures_mask_uabi();
+ break;
}
- fill_gap(&to, &last, size);
-}
-/*
- * Convert from a ptrace standard-format kernel buffer to kernel XSAVES format
- * and copy to the target thread. This is called from xstateregs_set().
- */
-int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf)
-{
- unsigned int offset, size;
- int i;
- struct xstate_header hdr;
+ /* Copy FP state up to MXCSR */
+ copy_feature(header.xfeatures & XFEATURE_MASK_FP, &to, &xsave->i387,
+ &xinit->i387, off_mxcsr);
- offset = offsetof(struct xregs_state, header);
- size = sizeof(hdr);
+ /* Copy MXCSR when SSE or YMM are set in the feature mask */
+ copy_feature(header.xfeatures & (XFEATURE_MASK_SSE | XFEATURE_MASK_YMM),
+ &to, &xsave->i387.mxcsr, &xinit->i387.mxcsr,
+ MXCSR_AND_FLAGS_SIZE);
- memcpy(&hdr, kbuf + offset, size);
+ /* Copy the remaining FP state */
+ copy_feature(header.xfeatures & XFEATURE_MASK_FP,
+ &to, &xsave->i387.st_space, &xinit->i387.st_space,
+ sizeof(xsave->i387.st_space));
- if (validate_user_xstate_header(&hdr))
- return -EINVAL;
+ /* Copy the SSE state - shared with YMM, but independently managed */
+ copy_feature(header.xfeatures & XFEATURE_MASK_SSE,
+ &to, &xsave->i387.xmm_space, &xinit->i387.xmm_space,
+ sizeof(xsave->i387.xmm_space));
- for (i = 0; i < XFEATURE_MAX; i++) {
- u64 mask = ((u64)1 << i);
+ if (copy_mode != XSTATE_COPY_XSAVE)
+ goto out;
- if (hdr.xfeatures & mask) {
- void *dst = __raw_xsave_addr(xsave, i);
+ /* Zero the padding area */
+ membuf_zero(&to, sizeof(xsave->i387.padding));
- offset = xstate_offsets[i];
- size = xstate_sizes[i];
+ /* Copy xsave->i387.sw_reserved */
+ membuf_write(&to, xstate_fx_sw_bytes, sizeof(xsave->i387.sw_reserved));
- memcpy(dst, kbuf + offset, size);
- }
- }
+ /* Copy the user space relevant state of @xsave->header */
+ membuf_write(&to, &header, sizeof(header));
- if (xfeatures_mxcsr_quirk(hdr.xfeatures)) {
- offset = offsetof(struct fxregs_state, mxcsr);
- size = MXCSR_AND_FLAGS_SIZE;
- memcpy(&xsave->i387.mxcsr, kbuf + offset, size);
- }
+ zerofrom = offsetof(struct xregs_state, extended_state_area);
- /*
- * The state that came in from userspace was user-state only.
- * Mask all the user states out of 'xfeatures':
- */
- xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR_ALL;
+ for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
+ /*
+ * The ptrace buffer is in non-compacted XSAVE format.
+ * In non-compacted format disabled features still occupy
+ * state space, but there is no state to copy from in the
+ * compacted init_fpstate. The gap tracking will zero this
+ * later.
+ */
+ if (!(xfeatures_mask_uabi() & BIT_ULL(i)))
+ continue;
- /*
- * Add back in the features that came in from userspace:
- */
- xsave->header.xfeatures |= hdr.xfeatures;
+ /*
+ * If there was a feature or alignment gap, zero the space
+ * in the destination buffer.
+ */
+ if (zerofrom < xstate_offsets[i])
+ membuf_zero(&to, xstate_offsets[i] - zerofrom);
+
+ if (i == XFEATURE_PKRU) {
+ struct pkru_state pkru = {0};
+ /*
+ * PKRU is not necessarily up to date in the
+ * thread's XSAVE buffer. Fill this part from the
+ * per-thread storage.
+ */
+ pkru.pkru = tsk->thread.pkru;
+ membuf_write(&to, &pkru, sizeof(pkru));
+ } else {
+ copy_feature(header.xfeatures & BIT_ULL(i), &to,
+ __raw_xsave_addr(xsave, i),
+ __raw_xsave_addr(xinit, i),
+ xstate_sizes[i]);
+ }
+ /*
+ * Keep track of the last copied state in the non-compacted
+ * target buffer for gap zeroing.
+ */
+ zerofrom = xstate_offsets[i] + xstate_sizes[i];
+ }
+out:
+ if (to.left)
+ membuf_zero(&to, to.left);
+}
+
+static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size,
+ const void *kbuf, const void __user *ubuf)
+{
+ if (kbuf) {
+ memcpy(dst, kbuf + offset, size);
+ } else {
+ if (copy_from_user(dst, ubuf + offset, size))
+ return -EFAULT;
+ }
return 0;
}
-/*
- * Convert from a ptrace or sigreturn standard-format user-space buffer to
- * kernel XSAVES format and copy to the target thread. This is called from
- * xstateregs_set(), as well as potentially from the sigreturn() and
- * rt_sigreturn() system calls.
- */
-int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf)
+
+static int copy_uabi_to_xstate(struct xregs_state *xsave, const void *kbuf,
+ const void __user *ubuf)
{
unsigned int offset, size;
- int i;
struct xstate_header hdr;
+ u64 mask;
+ int i;
offset = offsetof(struct xregs_state, header);
- size = sizeof(hdr);
-
- if (__copy_from_user(&hdr, ubuf + offset, size))
+ if (copy_from_buffer(&hdr, offset, sizeof(hdr), kbuf, ubuf))
return -EFAULT;
if (validate_user_xstate_header(&hdr))
return -EINVAL;
+ /* Validate MXCSR when any of the related features is in use */
+ mask = XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM;
+ if (hdr.xfeatures & mask) {
+ u32 mxcsr[2];
+
+ offset = offsetof(struct fxregs_state, mxcsr);
+ if (copy_from_buffer(mxcsr, offset, sizeof(mxcsr), kbuf, ubuf))
+ return -EFAULT;
+
+ /* Reserved bits in MXCSR must be zero. */
+ if (mxcsr[0] & ~mxcsr_feature_mask)
+ return -EINVAL;
+
+ /* SSE and YMM require MXCSR even when FP is not in use. */
+ if (!(hdr.xfeatures & XFEATURE_MASK_FP)) {
+ xsave->i387.mxcsr = mxcsr[0];
+ xsave->i387.mxcsr_mask = mxcsr[1];
+ }
+ }
+
for (i = 0; i < XFEATURE_MAX; i++) {
u64 mask = ((u64)1 << i);
@@ -1205,18 +1135,11 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf)
offset = xstate_offsets[i];
size = xstate_sizes[i];
- if (__copy_from_user(dst, ubuf + offset, size))
+ if (copy_from_buffer(dst, offset, size, kbuf, ubuf))
return -EFAULT;
}
}
- if (xfeatures_mxcsr_quirk(hdr.xfeatures)) {
- offset = offsetof(struct fxregs_state, mxcsr);
- size = MXCSR_AND_FLAGS_SIZE;
- if (__copy_from_user(&xsave->i387.mxcsr, ubuf + offset, size))
- return -EFAULT;
- }
-
/*
* The state that came in from userspace was user-state only.
* Mask all the user states out of 'xfeatures':
@@ -1232,130 +1155,94 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf)
}
/*
- * Save only supervisor states to the kernel buffer. This blows away all
- * old states, and is intended to be used only in __fpu__restore_sig(), where
- * user states are restored from the user buffer.
+ * Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S]
+ * format and copy to the target thread. This is called from
+ * xstateregs_set().
*/
-void copy_supervisor_to_kernel(struct xregs_state *xstate)
+int copy_uabi_from_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf)
{
- struct xstate_header *header;
- u64 max_bit, min_bit;
- u32 lmask, hmask;
- int err, i;
-
- if (WARN_ON(!boot_cpu_has(X86_FEATURE_XSAVES)))
- return;
-
- if (!xfeatures_mask_supervisor())
- return;
-
- max_bit = __fls(xfeatures_mask_supervisor());
- min_bit = __ffs(xfeatures_mask_supervisor());
-
- lmask = xfeatures_mask_supervisor();
- hmask = xfeatures_mask_supervisor() >> 32;
- XSTATE_OP(XSAVES, xstate, lmask, hmask, err);
+ return copy_uabi_to_xstate(xsave, kbuf, NULL);
+}
- /* We should never fault when copying to a kernel buffer: */
- if (WARN_ON_FPU(err))
- return;
+/*
+ * Convert from a sigreturn standard-format user-space buffer to kernel
+ * XSAVE[S] format and copy to the target thread. This is called from the
+ * sigreturn() and rt_sigreturn() system calls.
+ */
+int copy_sigframe_from_user_to_xstate(struct xregs_state *xsave,
+ const void __user *ubuf)
+{
+ return copy_uabi_to_xstate(xsave, NULL, ubuf);
+}
- /*
- * At this point, the buffer has only supervisor states and must be
- * converted back to normal kernel format.
- */
- header = &xstate->header;
- header->xcomp_bv |= xfeatures_mask_all;
+static bool validate_xsaves_xrstors(u64 mask)
+{
+ u64 xchk;
+ if (WARN_ON_FPU(!cpu_feature_enabled(X86_FEATURE_XSAVES)))
+ return false;
/*
- * This only moves states up in the buffer. Start with
- * the last state and move backwards so that states are
- * not overwritten until after they are moved. Note:
- * memmove() allows overlapping src/dst buffers.
+ * Validate that this is either a task->fpstate related component
+ * subset or an independent one.
*/
- for (i = max_bit; i >= min_bit; i--) {
- u8 *xbuf = (u8 *)xstate;
+ if (mask & xfeatures_mask_independent())
+ xchk = ~xfeatures_mask_independent();
+ else
+ xchk = ~xfeatures_mask_all;
- if (!((header->xfeatures >> i) & 1))
- continue;
+ if (WARN_ON_ONCE(!mask || mask & xchk))
+ return false;
- /* Move xfeature 'i' into its normal location */
- memmove(xbuf + xstate_comp_offsets[i],
- xbuf + xstate_supervisor_only_offsets[i],
- xstate_sizes[i]);
- }
+ return true;
}
/**
- * copy_dynamic_supervisor_to_kernel() - Save dynamic supervisor states to
- * an xsave area
- * @xstate: A pointer to an xsave area
- * @mask: Represent the dynamic supervisor features saved into the xsave area
+ * xsaves - Save selected components to a kernel xstate buffer
+ * @xstate: Pointer to the buffer
+ * @mask: Feature mask to select the components to save
*
- * Only the dynamic supervisor states sets in the mask are saved into the xsave
- * area (See the comment in XFEATURE_MASK_DYNAMIC for the details of dynamic
- * supervisor feature). Besides the dynamic supervisor states, the legacy
- * region and XSAVE header are also saved into the xsave area. The supervisor
- * features in the XFEATURE_MASK_SUPERVISOR_SUPPORTED and
- * XFEATURE_MASK_SUPERVISOR_UNSUPPORTED are not saved.
+ * The @xstate buffer must be 64 byte aligned and correctly initialized as
+ * XSAVES does not write the full xstate header. Before first use the
+ * buffer should be zeroed otherwise a consecutive XRSTORS from that buffer
+ * can #GP.
*
- * The xsave area must be 64-bytes aligned.
+ * The feature mask must either be a subset of the independent features or
+ * a subset of the task->fpstate related features.
*/
-void copy_dynamic_supervisor_to_kernel(struct xregs_state *xstate, u64 mask)
+void xsaves(struct xregs_state *xstate, u64 mask)
{
- u64 dynamic_mask = xfeatures_mask_dynamic() & mask;
- u32 lmask, hmask;
int err;
- if (WARN_ON_FPU(!boot_cpu_has(X86_FEATURE_XSAVES)))
+ if (!validate_xsaves_xrstors(mask))
return;
- if (WARN_ON_FPU(!dynamic_mask))
- return;
-
- lmask = dynamic_mask;
- hmask = dynamic_mask >> 32;
-
- XSTATE_OP(XSAVES, xstate, lmask, hmask, err);
-
- /* Should never fault when copying to a kernel buffer */
- WARN_ON_FPU(err);
+ XSTATE_OP(XSAVES, xstate, (u32)mask, (u32)(mask >> 32), err);
+ WARN_ON_ONCE(err);
}
/**
- * copy_kernel_to_dynamic_supervisor() - Restore dynamic supervisor states from
- * an xsave area
- * @xstate: A pointer to an xsave area
- * @mask: Represent the dynamic supervisor features restored from the xsave area
+ * xrstors - Restore selected components from a kernel xstate buffer
+ * @xstate: Pointer to the buffer
+ * @mask: Feature mask to select the components to restore
+ *
+ * The @xstate buffer must be 64 byte aligned and correctly initialized
+ * otherwise XRSTORS from that buffer can #GP.
*
- * Only the dynamic supervisor states sets in the mask are restored from the
- * xsave area (See the comment in XFEATURE_MASK_DYNAMIC for the details of
- * dynamic supervisor feature). Besides the dynamic supervisor states, the
- * legacy region and XSAVE header are also restored from the xsave area. The
- * supervisor features in the XFEATURE_MASK_SUPERVISOR_SUPPORTED and
- * XFEATURE_MASK_SUPERVISOR_UNSUPPORTED are not restored.
+ * Proper usage is to restore the state which was saved with
+ * xsaves() into @xstate.
*
- * The xsave area must be 64-bytes aligned.
+ * The feature mask must either be a subset of the independent features or
+ * a subset of the task->fpstate related features.
*/
-void copy_kernel_to_dynamic_supervisor(struct xregs_state *xstate, u64 mask)
+void xrstors(struct xregs_state *xstate, u64 mask)
{
- u64 dynamic_mask = xfeatures_mask_dynamic() & mask;
- u32 lmask, hmask;
int err;
- if (WARN_ON_FPU(!boot_cpu_has(X86_FEATURE_XSAVES)))
- return;
-
- if (WARN_ON_FPU(!dynamic_mask))
+ if (!validate_xsaves_xrstors(mask))
return;
- lmask = dynamic_mask;
- hmask = dynamic_mask >> 32;
-
- XSTATE_OP(XRSTORS, xstate, lmask, hmask, err);
-
- /* Should never fault when copying from a kernel buffer */
- WARN_ON_FPU(err);
+ XSTATE_OP(XRSTORS, xstate, (u32)mask, (u32)(mask >> 32), err);
+ WARN_ON_ONCE(err);
}
#ifdef CONFIG_PROC_PID_ARCH_STATUS
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 04bddaaba8e2..d8b3ebd2bb85 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -62,7 +62,7 @@ SYM_CODE_START_NOALIGN(startup_64)
*/
/* Set up the stack for verify_cpu(), similar to initial_stack below */
- leaq (__end_init_task - SIZEOF_PTREGS)(%rip), %rsp
+ leaq (__end_init_task - FRAME_SIZE)(%rip), %rsp
leaq _text(%rip), %rdi
pushq %rsi
@@ -343,10 +343,10 @@ SYM_DATA(initial_vc_handler, .quad handle_vc_boot_ghcb)
#endif
/*
- * The SIZEOF_PTREGS gap is a convention which helps the in-kernel unwinder
+ * The FRAME_SIZE gap is a convention which helps the in-kernel unwinder
* reliably detect the end of the stack.
*/
-SYM_DATA(initial_stack, .quad init_thread_union + THREAD_SIZE - SIZEOF_PTREGS)
+SYM_DATA(initial_stack, .quad init_thread_union + THREAD_SIZE - FRAME_SIZE)
__FINITDATA
__INIT
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index d552f177eca0..df0fa695bb09 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -35,12 +35,16 @@
#define SYSG(_vector, _addr) \
G(_vector, _addr, DEFAULT_STACK, GATE_INTERRUPT, DPL3, __KERNEL_CS)
+#ifdef CONFIG_X86_64
/*
* Interrupt gate with interrupt stack. The _ist index is the index in
* the tss.ist[] array, but for the descriptor it needs to start at 1.
*/
#define ISTG(_vector, _addr, _ist) \
G(_vector, _addr, _ist + 1, GATE_INTERRUPT, DPL0, __KERNEL_CS)
+#else
+#define ISTG(_vector, _addr, _ist) INTG(_vector, _addr)
+#endif
/* Task gate */
#define TSKG(_vector, _gdt) \
@@ -74,7 +78,7 @@ static const __initconst struct idt_data early_idts[] = {
*/
static const __initconst struct idt_data def_idts[] = {
INTG(X86_TRAP_DE, asm_exc_divide_error),
- INTG(X86_TRAP_NMI, asm_exc_nmi),
+ ISTG(X86_TRAP_NMI, asm_exc_nmi, IST_INDEX_NMI),
INTG(X86_TRAP_BR, asm_exc_bounds),
INTG(X86_TRAP_UD, asm_exc_invalid_op),
INTG(X86_TRAP_NM, asm_exc_device_not_available),
@@ -91,12 +95,16 @@ static const __initconst struct idt_data def_idts[] = {
#ifdef CONFIG_X86_32
TSKG(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS),
#else
- INTG(X86_TRAP_DF, asm_exc_double_fault),
+ ISTG(X86_TRAP_DF, asm_exc_double_fault, IST_INDEX_DF),
#endif
- INTG(X86_TRAP_DB, asm_exc_debug),
+ ISTG(X86_TRAP_DB, asm_exc_debug, IST_INDEX_DB),
#ifdef CONFIG_X86_MCE
- INTG(X86_TRAP_MC, asm_exc_machine_check),
+ ISTG(X86_TRAP_MC, asm_exc_machine_check, IST_INDEX_MCE),
+#endif
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ ISTG(X86_TRAP_VC, asm_exc_vmm_communication, IST_INDEX_VC),
#endif
SYSG(X86_TRAP_OF, asm_exc_overflow),
@@ -221,22 +229,6 @@ static const __initconst struct idt_data early_pf_idts[] = {
INTG(X86_TRAP_PF, asm_exc_page_fault),
};
-/*
- * The exceptions which use Interrupt stacks. They are setup after
- * cpu_init() when the TSS has been initialized.
- */
-static const __initconst struct idt_data ist_idts[] = {
- ISTG(X86_TRAP_DB, asm_exc_debug, IST_INDEX_DB),
- ISTG(X86_TRAP_NMI, asm_exc_nmi, IST_INDEX_NMI),
- ISTG(X86_TRAP_DF, asm_exc_double_fault, IST_INDEX_DF),
-#ifdef CONFIG_X86_MCE
- ISTG(X86_TRAP_MC, asm_exc_machine_check, IST_INDEX_MCE),
-#endif
-#ifdef CONFIG_AMD_MEM_ENCRYPT
- ISTG(X86_TRAP_VC, asm_exc_vmm_communication, IST_INDEX_VC),
-#endif
-};
-
/**
* idt_setup_early_pf - Initialize the idt table with early pagefault handler
*
@@ -254,14 +246,6 @@ void __init idt_setup_early_pf(void)
idt_setup_from_table(idt_table, early_pf_idts,
ARRAY_SIZE(early_pf_idts), true);
}
-
-/**
- * idt_setup_ist_traps - Initialize the idt table with traps using IST
- */
-void __init idt_setup_ist_traps(void)
-{
- idt_setup_from_table(idt_table, ist_idts, ARRAY_SIZE(ist_idts), true);
-}
#endif
static void __init idt_map_in_cea(void)
@@ -331,11 +315,10 @@ void __init idt_setup_early_handler(void)
/**
* idt_invalidate - Invalidate interrupt descriptor table
- * @addr: The virtual address of the 'invalid' IDT
*/
-void idt_invalidate(void *addr)
+void idt_invalidate(void)
{
- struct desc_ptr idt = { .address = (unsigned long) addr, .size = 0 };
+ static const struct desc_ptr idt = { .address = 0, .size = 0 };
load_idt(&idt);
}
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
index 6a2eb62c85e6..674906fad43b 100644
--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
@@ -15,50 +15,75 @@
#include <asm/kprobes.h>
#include <asm/alternative.h>
#include <asm/text-patching.h>
+#include <asm/insn.h>
-static void bug_at(const void *ip, int line)
+int arch_jump_entry_size(struct jump_entry *entry)
{
- /*
- * The location is not an op that we were expecting.
- * Something went wrong. Crash the box, as something could be
- * corrupting the kernel.
- */
- pr_crit("jump_label: Fatal kernel bug, unexpected op at %pS [%p] (%5ph) %d\n", ip, ip, ip, line);
- BUG();
+ struct insn insn = {};
+
+ insn_decode_kernel(&insn, (void *)jump_entry_code(entry));
+ BUG_ON(insn.length != 2 && insn.length != 5);
+
+ return insn.length;
}
-static const void *
-__jump_label_set_jump_code(struct jump_entry *entry, enum jump_label_type type)
+struct jump_label_patch {
+ const void *code;
+ int size;
+};
+
+static struct jump_label_patch
+__jump_label_patch(struct jump_entry *entry, enum jump_label_type type)
{
- const void *expect, *code;
+ const void *expect, *code, *nop;
const void *addr, *dest;
- int line;
+ int size;
addr = (void *)jump_entry_code(entry);
dest = (void *)jump_entry_target(entry);
- code = text_gen_insn(JMP32_INSN_OPCODE, addr, dest);
+ size = arch_jump_entry_size(entry);
+ switch (size) {
+ case JMP8_INSN_SIZE:
+ code = text_gen_insn(JMP8_INSN_OPCODE, addr, dest);
+ nop = x86_nops[size];
+ break;
- if (type == JUMP_LABEL_JMP) {
- expect = x86_nops[5]; line = __LINE__;
- } else {
- expect = code; line = __LINE__;
+ case JMP32_INSN_SIZE:
+ code = text_gen_insn(JMP32_INSN_OPCODE, addr, dest);
+ nop = x86_nops[size];
+ break;
+
+ default: BUG();
}
- if (memcmp(addr, expect, JUMP_LABEL_NOP_SIZE))
- bug_at(addr, line);
+ if (type == JUMP_LABEL_JMP)
+ expect = nop;
+ else
+ expect = code;
+
+ if (memcmp(addr, expect, size)) {
+ /*
+ * The location is not an op that we were expecting.
+ * Something went wrong. Crash the box, as something could be
+ * corrupting the kernel.
+ */
+ pr_crit("jump_label: Fatal kernel bug, unexpected op at %pS [%p] (%5ph != %5ph)) size:%d type:%d\n",
+ addr, addr, addr, expect, size, type);
+ BUG();
+ }
if (type == JUMP_LABEL_NOP)
- code = x86_nops[5];
+ code = nop;
- return code;
+ return (struct jump_label_patch){.code = code, .size = size};
}
static inline void __jump_label_transform(struct jump_entry *entry,
enum jump_label_type type,
int init)
{
- const void *opcode = __jump_label_set_jump_code(entry, type);
+ const struct jump_label_patch jlp = __jump_label_patch(entry, type);
/*
* As long as only a single processor is running and the code is still
@@ -72,12 +97,11 @@ static inline void __jump_label_transform(struct jump_entry *entry,
* always nop being the 'currently valid' instruction
*/
if (init || system_state == SYSTEM_BOOTING) {
- text_poke_early((void *)jump_entry_code(entry), opcode,
- JUMP_LABEL_NOP_SIZE);
+ text_poke_early((void *)jump_entry_code(entry), jlp.code, jlp.size);
return;
}
- text_poke_bp((void *)jump_entry_code(entry), opcode, JUMP_LABEL_NOP_SIZE, NULL);
+ text_poke_bp((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL);
}
static void __ref jump_label_transform(struct jump_entry *entry,
@@ -98,7 +122,7 @@ void arch_jump_label_transform(struct jump_entry *entry,
bool arch_jump_label_transform_queue(struct jump_entry *entry,
enum jump_label_type type)
{
- const void *opcode;
+ struct jump_label_patch jlp;
if (system_state == SYSTEM_BOOTING) {
/*
@@ -109,9 +133,8 @@ bool arch_jump_label_transform_queue(struct jump_entry *entry,
}
mutex_lock(&text_mutex);
- opcode = __jump_label_set_jump_code(entry, type);
- text_poke_queue((void *)jump_entry_code(entry),
- opcode, JUMP_LABEL_NOP_SIZE, NULL);
+ jlp = __jump_label_patch(entry, type);
+ text_poke_queue((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL);
mutex_unlock(&text_mutex);
return true;
}
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index d3d65545cb8b..b6e046e4b289 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -422,12 +422,6 @@ void *alloc_insn_page(void)
return page;
}
-/* Recover page to RW mode before releasing it */
-void free_insn_page(void *page)
-{
- module_memfree(page);
-}
-
/* Kprobe x86 instruction emulation - only regs->ip or IF flag modifiers */
static void kprobe_emulate_ifmodifiers(struct kprobe *p, struct pt_regs *regs)
@@ -674,7 +668,7 @@ static int prepare_emulation(struct kprobe *p, struct insn *insn)
break;
if (insn->addr_bytes != sizeof(unsigned long))
- return -EOPNOTSUPP; /* Don't support differnt size */
+ return -EOPNOTSUPP; /* Don't support different size */
if (X86_MODRM_MOD(opcode) != 3)
return -EOPNOTSUPP; /* TODO: support memory addressing */
@@ -1102,24 +1096,6 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
restore_previous_kprobe(kcb);
else
reset_current_kprobe();
- } else if (kcb->kprobe_status == KPROBE_HIT_ACTIVE ||
- kcb->kprobe_status == KPROBE_HIT_SSDONE) {
- /*
- * We increment the nmissed count for accounting,
- * we can also use npre/npostfault count for accounting
- * these specific fault cases.
- */
- kprobes_inc_nmissed_count(cur);
-
- /*
- * We come here because instructions in the pre/post
- * handler caused the page_fault, this could happen
- * if handler tries to access user space by
- * copy_from_user(), get_user() etc. Let the
- * user-specified handler try to fix it first.
- */
- if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
- return 1;
}
return 0;
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index 64b00b0d7fe8..1b373d79cedc 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -23,17 +23,6 @@
#include <asm/set_memory.h>
#include <asm/debugreg.h>
-static void set_gdt(void *newgdt, __u16 limit)
-{
- struct desc_ptr curgdt;
-
- /* ia32 supports unaligned loads & stores */
- curgdt.size = limit;
- curgdt.address = (unsigned long)newgdt;
-
- load_gdt(&curgdt);
-}
-
static void load_segments(void)
{
#define __STR(X) #X
@@ -232,8 +221,8 @@ void machine_kexec(struct kimage *image)
* The gdt & idt are now invalid.
* If you want to load them you must set up your own idt & gdt.
*/
- idt_invalidate(phys_to_virt(0));
- set_gdt(phys_to_virt(0), 0);
+ native_idt_invalidate();
+ native_gdt_invalidate();
/* now call it */
image->start = relocate_kernel_ptr((unsigned long)image->head,
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index c078b0d3ab0e..131f30fdcfbd 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -256,35 +256,6 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
return init_transition_pgtable(image, level4p);
}
-static void set_idt(void *newidt, u16 limit)
-{
- struct desc_ptr curidt;
-
- /* x86-64 supports unaligned loads & stores */
- curidt.size = limit;
- curidt.address = (unsigned long)newidt;
-
- __asm__ __volatile__ (
- "lidtq %0\n"
- : : "m" (curidt)
- );
-};
-
-
-static void set_gdt(void *newgdt, u16 limit)
-{
- struct desc_ptr curgdt;
-
- /* x86-64 supports unaligned loads & stores */
- curgdt.size = limit;
- curgdt.address = (unsigned long)newgdt;
-
- __asm__ __volatile__ (
- "lgdtq %0\n"
- : : "m" (curgdt)
- );
-};
-
static void load_segments(void)
{
__asm__ __volatile__ (
@@ -379,8 +350,8 @@ void machine_kexec(struct kimage *image)
* The gdt & idt are now invalid.
* If you want to load them you must set up your own idt & gdt.
*/
- set_gdt(phys_to_virt(0), 0);
- set_idt(phys_to_virt(0), 0);
+ native_idt_invalidate();
+ native_gdt_invalidate();
/* now call it */
image->start = relocate_kernel((unsigned long)image->head,
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 5e1f38179f49..1d9463e3096b 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -87,8 +87,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
#ifdef CONFIG_VM86
dst->thread.vm86 = NULL;
#endif
-
- return fpu__copy(dst, src);
+ return fpu_clone(dst);
}
/*
@@ -157,11 +156,18 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg,
/* Kernel thread ? */
if (unlikely(p->flags & PF_KTHREAD)) {
+ p->thread.pkru = pkru_get_init_value();
memset(childregs, 0, sizeof(struct pt_regs));
kthread_frame_init(frame, sp, arg);
return 0;
}
+ /*
+ * Clone current's PKRU value from hardware. tsk->thread.pkru
+ * is only valid when scheduled out.
+ */
+ p->thread.pkru = read_pkru();
+
frame->bx = 0;
*childregs = *current_pt_regs();
childregs->ax = 0;
@@ -199,6 +205,15 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg,
return ret;
}
+static void pkru_flush_thread(void)
+{
+ /*
+ * If PKRU is enabled the default PKRU value has to be loaded into
+ * the hardware right here (similar to context switch).
+ */
+ pkru_write_default();
+}
+
void flush_thread(void)
{
struct task_struct *tsk = current;
@@ -206,7 +221,8 @@ void flush_thread(void)
flush_ptrace_hw_breakpoint(tsk);
memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
- fpu__clear_all(&tsk->thread.fpu);
+ fpu_flush_thread();
+ pkru_flush_thread();
}
void disable_TSC(void)
@@ -931,7 +947,7 @@ unsigned long get_wchan(struct task_struct *p)
unsigned long start, bottom, top, sp, fp, ip, ret = 0;
int count = 0;
- if (p == current || p->state == TASK_RUNNING)
+ if (p == current || task_is_running(p))
return 0;
if (!try_get_task_stack(p))
@@ -975,7 +991,7 @@ unsigned long get_wchan(struct task_struct *p)
goto out;
}
fp = READ_ONCE_NOCHECK(*(unsigned long *)fp);
- } while (count++ < 16 && p->state != TASK_RUNNING);
+ } while (count++ < 16 && !task_is_running(p));
out:
put_task_stack(p);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index d08307df69ad..ec0d836a13b1 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -41,6 +41,7 @@
#include <linux/syscalls.h>
#include <asm/processor.h>
+#include <asm/pkru.h>
#include <asm/fpu/internal.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
@@ -136,7 +137,7 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode,
log_lvl, d3, d6, d7);
}
- if (boot_cpu_has(X86_FEATURE_OSPKE))
+ if (cpu_feature_enabled(X86_FEATURE_OSPKE))
printk("%sPKRU: %08x\n", log_lvl, read_pkru());
}
@@ -339,6 +340,29 @@ static __always_inline void load_seg_legacy(unsigned short prev_index,
}
}
+/*
+ * Store prev's PKRU value and load next's PKRU value if they differ. PKRU
+ * is not XSTATE managed on context switch because that would require a
+ * lookup in the task's FPU xsave buffer and require to keep that updated
+ * in various places.
+ */
+static __always_inline void x86_pkru_load(struct thread_struct *prev,
+ struct thread_struct *next)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
+ return;
+
+ /* Stash the prev task's value: */
+ prev->pkru = rdpkru();
+
+ /*
+ * PKRU writes are slightly expensive. Avoid them when not
+ * strictly necessary:
+ */
+ if (prev->pkru != next->pkru)
+ wrpkru(next->pkru);
+}
+
static __always_inline void x86_fsgsbase_load(struct thread_struct *prev,
struct thread_struct *next)
{
@@ -588,6 +612,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
x86_fsgsbase_load(prev, next);
+ x86_pkru_load(prev, next);
+
/*
* Switch the PDA and FPU contexts.
*/
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 87a4143aa7d7..4c208ea3bd9f 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -911,7 +911,7 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 value)
* syscall with TS_COMPAT still set.
*/
regs->orig_ax = value;
- if (syscall_get_nr(child, regs) >= 0)
+ if (syscall_get_nr(child, regs) != -1)
child->thread_info.status |= TS_I386_REGS_POKED;
break;
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index b29657b76e3f..ebfb91108232 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -669,7 +669,7 @@ static void native_machine_emergency_restart(void)
break;
case BOOT_TRIPLE:
- idt_invalidate(NULL);
+ idt_invalidate();
__asm__ __volatile__("int3");
/* We're probably dead after this, but... */
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 1e720626069a..bff3a784aec5 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -14,6 +14,7 @@
#include <linux/initrd.h>
#include <linux/iscsi_ibft.h>
#include <linux/memblock.h>
+#include <linux/panic_notifier.h>
#include <linux/pci.h>
#include <linux/root_dev.h>
#include <linux/hugetlb.h>
@@ -695,30 +696,6 @@ static void __init e820_add_kernel_range(void)
e820__range_add(start, size, E820_TYPE_RAM);
}
-static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10;
-
-static int __init parse_reservelow(char *p)
-{
- unsigned long long size;
-
- if (!p)
- return -EINVAL;
-
- size = memparse(p, &p);
-
- if (size < 4096)
- size = 4096;
-
- if (size > 640*1024)
- size = 640*1024;
-
- reserve_low = size;
-
- return 0;
-}
-
-early_param("reservelow", parse_reservelow);
-
static void __init early_reserve_memory(void)
{
/*
@@ -870,10 +847,7 @@ void __init setup_arch(char **cmdline_p)
if (!boot_params.hdr.root_flags)
root_mountflags &= ~MS_RDONLY;
- init_mm.start_code = (unsigned long) _text;
- init_mm.end_code = (unsigned long) _etext;
- init_mm.end_data = (unsigned long) _edata;
- init_mm.brk = _brk_end;
+ setup_initial_init_mm(_text, _etext, _edata, (void *)_brk_end);
code_resource.start = __pa_symbol(_text);
code_resource.end = __pa_symbol(_etext)-1;
@@ -1084,17 +1058,18 @@ void __init setup_arch(char **cmdline_p)
#endif
/*
- * Find free memory for the real mode trampoline and place it
- * there.
- * If there is not enough free memory under 1M, on EFI-enabled
- * systems there will be additional attempt to reclaim the memory
- * for the real mode trampoline at efi_free_boot_services().
+ * Find free memory for the real mode trampoline and place it there. If
+ * there is not enough free memory under 1M, on EFI-enabled systems
+ * there will be additional attempt to reclaim the memory for the real
+ * mode trampoline at efi_free_boot_services().
+ *
+ * Unconditionally reserve the entire first 1M of RAM because BIOSes
+ * are known to corrupt low memory and several hundred kilobytes are not
+ * worth complex detection what memory gets clobbered. Windows does the
+ * same thing for very similar reasons.
*
- * Unconditionally reserve the entire first 1M of RAM because
- * BIOSes are know to corrupt low memory and several
- * hundred kilobytes are not worth complex detection what memory gets
- * clobbered. Moreover, on machines with SandyBridge graphics or in
- * setups that use crashkernel the entire 1M is reserved anyway.
+ * Moreover, on machines with SandyBridge graphics or in setups that use
+ * crashkernel the entire 1M is reserved anyway.
*/
reserve_real_mode();
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 0941d2f44f2a..78a32b956e81 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -66,7 +66,7 @@ EXPORT_SYMBOL(__per_cpu_offset);
*/
static bool __init pcpu_need_numa(void)
{
-#ifdef CONFIG_NEED_MULTIPLE_NODES
+#ifdef CONFIG_NUMA
pg_data_t *last = NULL;
unsigned int cpu;
@@ -101,7 +101,7 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
unsigned long align)
{
const unsigned long goal = __pa(MAX_DMA_ADDRESS);
-#ifdef CONFIG_NEED_MULTIPLE_NODES
+#ifdef CONFIG_NUMA
int node = early_cpu_to_node(cpu);
void *ptr;
@@ -140,7 +140,7 @@ static void __init pcpu_fc_free(void *ptr, size_t size)
static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
{
-#ifdef CONFIG_NEED_MULTIPLE_NODES
+#ifdef CONFIG_NUMA
if (early_cpu_to_node(from) == early_cpu_to_node(to))
return LOCAL_DISTANCE;
else
diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
index 651b81cd648e..a6895e440bc3 100644
--- a/arch/x86/kernel/sev.c
+++ b/arch/x86/kernel/sev.c
@@ -7,12 +7,11 @@
* Author: Joerg Roedel <jroedel@suse.de>
*/
-#define pr_fmt(fmt) "SEV-ES: " fmt
+#define pr_fmt(fmt) "SEV: " fmt
#include <linux/sched/debug.h> /* For show_regs() */
#include <linux/percpu-defs.h>
#include <linux/mem_encrypt.h>
-#include <linux/lockdep.h>
#include <linux/printk.h>
#include <linux/mm_types.h>
#include <linux/set_memory.h>
@@ -192,11 +191,19 @@ void noinstr __sev_es_ist_exit(void)
this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], *(unsigned long *)ist);
}
-static __always_inline struct ghcb *sev_es_get_ghcb(struct ghcb_state *state)
+/*
+ * Nothing shall interrupt this code path while holding the per-CPU
+ * GHCB. The backup GHCB is only for NMIs interrupting this path.
+ *
+ * Callers must disable local interrupts around it.
+ */
+static noinstr struct ghcb *__sev_get_ghcb(struct ghcb_state *state)
{
struct sev_es_runtime_data *data;
struct ghcb *ghcb;
+ WARN_ON(!irqs_disabled());
+
data = this_cpu_read(runtime_data);
ghcb = &data->ghcb_page;
@@ -213,7 +220,9 @@ static __always_inline struct ghcb *sev_es_get_ghcb(struct ghcb_state *state)
data->ghcb_active = false;
data->backup_ghcb_active = false;
+ instrumentation_begin();
panic("Unable to handle #VC exception! GHCB and Backup GHCB are already in use");
+ instrumentation_end();
}
/* Mark backup_ghcb active before writing to it */
@@ -258,17 +267,24 @@ static int vc_fetch_insn_kernel(struct es_em_ctxt *ctxt,
static enum es_result __vc_decode_user_insn(struct es_em_ctxt *ctxt)
{
char buffer[MAX_INSN_SIZE];
- int res;
+ int insn_bytes;
- res = insn_fetch_from_user_inatomic(ctxt->regs, buffer);
- if (!res) {
+ insn_bytes = insn_fetch_from_user_inatomic(ctxt->regs, buffer);
+ if (insn_bytes == 0) {
+ /* Nothing could be copied */
ctxt->fi.vector = X86_TRAP_PF;
ctxt->fi.error_code = X86_PF_INSTR | X86_PF_USER;
ctxt->fi.cr2 = ctxt->regs->ip;
return ES_EXCEPTION;
+ } else if (insn_bytes == -EINVAL) {
+ /* Effective RIP could not be calculated */
+ ctxt->fi.vector = X86_TRAP_GP;
+ ctxt->fi.error_code = 0;
+ ctxt->fi.cr2 = 0;
+ return ES_EXCEPTION;
}
- if (!insn_decode_from_regs(&ctxt->insn, ctxt->regs, buffer, res))
+ if (!insn_decode_from_regs(&ctxt->insn, ctxt->regs, buffer, insn_bytes))
return ES_DECODE_FAILED;
if (ctxt->insn.immediate.got)
@@ -479,11 +495,13 @@ static enum es_result vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt
/* Include code shared with pre-decompression boot stage */
#include "sev-shared.c"
-static __always_inline void sev_es_put_ghcb(struct ghcb_state *state)
+static noinstr void __sev_put_ghcb(struct ghcb_state *state)
{
struct sev_es_runtime_data *data;
struct ghcb *ghcb;
+ WARN_ON(!irqs_disabled());
+
data = this_cpu_read(runtime_data);
ghcb = &data->ghcb_page;
@@ -507,7 +525,7 @@ void noinstr __sev_es_nmi_complete(void)
struct ghcb_state state;
struct ghcb *ghcb;
- ghcb = sev_es_get_ghcb(&state);
+ ghcb = __sev_get_ghcb(&state);
vc_ghcb_invalidate(ghcb);
ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_NMI_COMPLETE);
@@ -517,7 +535,7 @@ void noinstr __sev_es_nmi_complete(void)
sev_es_wr_ghcb_msr(__pa_nodebug(ghcb));
VMGEXIT();
- sev_es_put_ghcb(&state);
+ __sev_put_ghcb(&state);
}
static u64 get_jump_table_addr(void)
@@ -529,7 +547,7 @@ static u64 get_jump_table_addr(void)
local_irq_save(flags);
- ghcb = sev_es_get_ghcb(&state);
+ ghcb = __sev_get_ghcb(&state);
vc_ghcb_invalidate(ghcb);
ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_JUMP_TABLE);
@@ -543,7 +561,7 @@ static u64 get_jump_table_addr(void)
ghcb_sw_exit_info_2_is_valid(ghcb))
ret = ghcb->save.sw_exit_info_2;
- sev_es_put_ghcb(&state);
+ __sev_put_ghcb(&state);
local_irq_restore(flags);
@@ -668,7 +686,7 @@ static void sev_es_ap_hlt_loop(void)
struct ghcb_state state;
struct ghcb *ghcb;
- ghcb = sev_es_get_ghcb(&state);
+ ghcb = __sev_get_ghcb(&state);
while (true) {
vc_ghcb_invalidate(ghcb);
@@ -685,7 +703,7 @@ static void sev_es_ap_hlt_loop(void)
break;
}
- sev_es_put_ghcb(&state);
+ __sev_put_ghcb(&state);
}
/*
@@ -775,7 +793,7 @@ void __init sev_es_init_vc_handling(void)
sev_es_setup_play_dead();
/* Secondary CPUs use the runtime #VC handler */
- initial_vc_handler = (unsigned long)safe_stack_exc_vmm_communication;
+ initial_vc_handler = (unsigned long)kernel_exc_vmm_communication;
}
static void __init vc_early_forward_exception(struct es_em_ctxt *ctxt)
@@ -1213,14 +1231,6 @@ static enum es_result vc_handle_trap_ac(struct ghcb *ghcb,
return ES_EXCEPTION;
}
-static __always_inline void vc_handle_trap_db(struct pt_regs *regs)
-{
- if (user_mode(regs))
- noist_exc_debug(regs);
- else
- exc_debug(regs);
-}
-
static enum es_result vc_handle_exitcode(struct es_em_ctxt *ctxt,
struct ghcb *ghcb,
unsigned long exit_code)
@@ -1316,44 +1326,15 @@ static __always_inline bool on_vc_fallback_stack(struct pt_regs *regs)
return (sp >= __this_cpu_ist_bottom_va(VC2) && sp < __this_cpu_ist_top_va(VC2));
}
-/*
- * Main #VC exception handler. It is called when the entry code was able to
- * switch off the IST to a safe kernel stack.
- *
- * With the current implementation it is always possible to switch to a safe
- * stack because #VC exceptions only happen at known places, like intercepted
- * instructions or accesses to MMIO areas/IO ports. They can also happen with
- * code instrumentation when the hypervisor intercepts #DB, but the critical
- * paths are forbidden to be instrumented, so #DB exceptions currently also
- * only happen in safe places.
- */
-DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication)
+static bool vc_raw_handle_exception(struct pt_regs *regs, unsigned long error_code)
{
- irqentry_state_t irq_state;
struct ghcb_state state;
struct es_em_ctxt ctxt;
enum es_result result;
struct ghcb *ghcb;
+ bool ret = true;
- /*
- * Handle #DB before calling into !noinstr code to avoid recursive #DB.
- */
- if (error_code == SVM_EXIT_EXCP_BASE + X86_TRAP_DB) {
- vc_handle_trap_db(regs);
- return;
- }
-
- irq_state = irqentry_nmi_enter(regs);
- lockdep_assert_irqs_disabled();
- instrumentation_begin();
-
- /*
- * This is invoked through an interrupt gate, so IRQs are disabled. The
- * code below might walk page-tables for user or kernel addresses, so
- * keep the IRQs disabled to protect us against concurrent TLB flushes.
- */
-
- ghcb = sev_es_get_ghcb(&state);
+ ghcb = __sev_get_ghcb(&state);
vc_ghcb_invalidate(ghcb);
result = vc_init_em_ctxt(&ctxt, regs, error_code);
@@ -1361,7 +1342,7 @@ DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication)
if (result == ES_OK)
result = vc_handle_exitcode(&ctxt, ghcb, error_code);
- sev_es_put_ghcb(&state);
+ __sev_put_ghcb(&state);
/* Done - now check the result */
switch (result) {
@@ -1369,17 +1350,20 @@ DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication)
vc_finish_insn(&ctxt);
break;
case ES_UNSUPPORTED:
- pr_err_ratelimited("Unsupported exit-code 0x%02lx in early #VC exception (IP: 0x%lx)\n",
+ pr_err_ratelimited("Unsupported exit-code 0x%02lx in #VC exception (IP: 0x%lx)\n",
error_code, regs->ip);
- goto fail;
+ ret = false;
+ break;
case ES_VMM_ERROR:
pr_err_ratelimited("Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n",
error_code, regs->ip);
- goto fail;
+ ret = false;
+ break;
case ES_DECODE_FAILED:
pr_err_ratelimited("Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n",
error_code, regs->ip);
- goto fail;
+ ret = false;
+ break;
case ES_EXCEPTION:
vc_forward_exception(&ctxt);
break;
@@ -1395,24 +1379,52 @@ DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication)
BUG();
}
-out:
- instrumentation_end();
- irqentry_nmi_exit(regs, irq_state);
+ return ret;
+}
- return;
+static __always_inline bool vc_is_db(unsigned long error_code)
+{
+ return error_code == SVM_EXIT_EXCP_BASE + X86_TRAP_DB;
+}
-fail:
- if (user_mode(regs)) {
- /*
- * Do not kill the machine if user-space triggered the
- * exception. Send SIGBUS instead and let user-space deal with
- * it.
- */
- force_sig_fault(SIGBUS, BUS_OBJERR, (void __user *)0);
- } else {
- pr_emerg("PANIC: Unhandled #VC exception in kernel space (result=%d)\n",
- result);
+/*
+ * Runtime #VC exception handler when raised from kernel mode. Runs in NMI mode
+ * and will panic when an error happens.
+ */
+DEFINE_IDTENTRY_VC_KERNEL(exc_vmm_communication)
+{
+ irqentry_state_t irq_state;
+
+ /*
+ * With the current implementation it is always possible to switch to a
+ * safe stack because #VC exceptions only happen at known places, like
+ * intercepted instructions or accesses to MMIO areas/IO ports. They can
+ * also happen with code instrumentation when the hypervisor intercepts
+ * #DB, but the critical paths are forbidden to be instrumented, so #DB
+ * exceptions currently also only happen in safe places.
+ *
+ * But keep this here in case the noinstr annotations are violated due
+ * to bug elsewhere.
+ */
+ if (unlikely(on_vc_fallback_stack(regs))) {
+ instrumentation_begin();
+ panic("Can't handle #VC exception from unsupported context\n");
+ instrumentation_end();
+ }
+ /*
+ * Handle #DB before calling into !noinstr code to avoid recursive #DB.
+ */
+ if (vc_is_db(error_code)) {
+ exc_debug(regs);
+ return;
+ }
+
+ irq_state = irqentry_nmi_enter(regs);
+
+ instrumentation_begin();
+
+ if (!vc_raw_handle_exception(regs, error_code)) {
/* Show some debug info */
show_regs(regs);
@@ -1423,23 +1435,38 @@ fail:
panic("Returned from Terminate-Request to Hypervisor\n");
}
- goto out;
+ instrumentation_end();
+ irqentry_nmi_exit(regs, irq_state);
}
-/* This handler runs on the #VC fall-back stack. It can cause further #VC exceptions */
-DEFINE_IDTENTRY_VC_IST(exc_vmm_communication)
+/*
+ * Runtime #VC exception handler when raised from user mode. Runs in IRQ mode
+ * and will kill the current task with SIGBUS when an error happens.
+ */
+DEFINE_IDTENTRY_VC_USER(exc_vmm_communication)
{
+ /*
+ * Handle #DB before calling into !noinstr code to avoid recursive #DB.
+ */
+ if (vc_is_db(error_code)) {
+ noist_exc_debug(regs);
+ return;
+ }
+
+ irqentry_enter_from_user_mode(regs);
instrumentation_begin();
- panic("Can't handle #VC exception from unsupported context\n");
- instrumentation_end();
-}
-DEFINE_IDTENTRY_VC(exc_vmm_communication)
-{
- if (likely(!on_vc_fallback_stack(regs)))
- safe_stack_exc_vmm_communication(regs, error_code);
- else
- ist_exc_vmm_communication(regs, error_code);
+ if (!vc_raw_handle_exception(regs, error_code)) {
+ /*
+ * Do not kill the machine if user-space triggered the
+ * exception. Send SIGBUS instead and let user-space deal with
+ * it.
+ */
+ force_sig_fault(SIGBUS, BUS_OBJERR, (void __user *)0);
+ }
+
+ instrumentation_end();
+ irqentry_exit_to_user_mode(regs);
}
bool __init handle_vc_boot_ghcb(struct pt_regs *regs)
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index a06cb107c0e8..f4d21e470083 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -212,6 +212,11 @@ do { \
* Set up a signal frame.
*/
+/* x86 ABI requires 16-byte alignment */
+#define FRAME_ALIGNMENT 16UL
+
+#define MAX_FRAME_PADDING (FRAME_ALIGNMENT - 1)
+
/*
* Determine which stack to use..
*/
@@ -222,9 +227,9 @@ static unsigned long align_sigframe(unsigned long sp)
* Align the stack pointer according to the i386 ABI,
* i.e. so that on function entry ((sp + 4) & 15) == 0.
*/
- sp = ((sp + 4) & -16ul) - 4;
+ sp = ((sp + 4) & -FRAME_ALIGNMENT) - 4;
#else /* !CONFIG_X86_32 */
- sp = round_down(sp, 16) - 8;
+ sp = round_down(sp, FRAME_ALIGNMENT) - 8;
#endif
return sp;
}
@@ -234,10 +239,11 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
void __user **fpstate)
{
/* Default to using normal stack */
+ bool nested_altstack = on_sig_stack(regs->sp);
+ bool entering_altstack = false;
unsigned long math_size = 0;
unsigned long sp = regs->sp;
unsigned long buf_fx = 0;
- int onsigstack = on_sig_stack(sp);
int ret;
/* redzone */
@@ -246,15 +252,23 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
/* This is the X/Open sanctioned signal stack switching. */
if (ka->sa.sa_flags & SA_ONSTACK) {
- if (sas_ss_flags(sp) == 0)
+ /*
+ * This checks nested_altstack via sas_ss_flags(). Sensible
+ * programs use SS_AUTODISARM, which disables that check, and
+ * programs that don't use SS_AUTODISARM get compatible.
+ */
+ if (sas_ss_flags(sp) == 0) {
sp = current->sas_ss_sp + current->sas_ss_size;
+ entering_altstack = true;
+ }
} else if (IS_ENABLED(CONFIG_X86_32) &&
- !onsigstack &&
+ !nested_altstack &&
regs->ss != __USER_DS &&
!(ka->sa.sa_flags & SA_RESTORER) &&
ka->sa.sa_restorer) {
/* This is the legacy signal stack switching. */
sp = (unsigned long) ka->sa.sa_restorer;
+ entering_altstack = true;
}
sp = fpu__alloc_mathframe(sp, IS_ENABLED(CONFIG_X86_32),
@@ -267,8 +281,15 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
* If we are on the alternate signal stack and would overflow it, don't.
* Return an always-bogus address instead so we will die with SIGSEGV.
*/
- if (onsigstack && !likely(on_sig_stack(sp)))
+ if (unlikely((nested_altstack || entering_altstack) &&
+ !__on_sig_stack(sp))) {
+
+ if (show_unhandled_signals && printk_ratelimit())
+ pr_info("%s[%d] overflowed sigaltstack\n",
+ current->comm, task_pid_nr(current));
+
return (void __user *)-1L;
+ }
/* save i387 and extended state */
ret = copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size);
@@ -663,6 +684,61 @@ badframe:
return 0;
}
+/*
+ * There are four different struct types for signal frame: sigframe_ia32,
+ * rt_sigframe_ia32, rt_sigframe_x32, and rt_sigframe. Use the worst case
+ * -- the largest size. It means the size for 64-bit apps is a bit more
+ * than needed, but this keeps the code simple.
+ */
+#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
+# define MAX_FRAME_SIGINFO_UCTXT_SIZE sizeof(struct sigframe_ia32)
+#else
+# define MAX_FRAME_SIGINFO_UCTXT_SIZE sizeof(struct rt_sigframe)
+#endif
+
+/*
+ * The FP state frame contains an XSAVE buffer which must be 64-byte aligned.
+ * If a signal frame starts at an unaligned address, extra space is required.
+ * This is the max alignment padding, conservatively.
+ */
+#define MAX_XSAVE_PADDING 63UL
+
+/*
+ * The frame data is composed of the following areas and laid out as:
+ *
+ * -------------------------
+ * | alignment padding |
+ * -------------------------
+ * | (f)xsave frame |
+ * -------------------------
+ * | fsave header |
+ * -------------------------
+ * | alignment padding |
+ * -------------------------
+ * | siginfo + ucontext |
+ * -------------------------
+ */
+
+/* max_frame_size tells userspace the worst case signal stack size. */
+static unsigned long __ro_after_init max_frame_size;
+
+void __init init_sigframe_size(void)
+{
+ max_frame_size = MAX_FRAME_SIGINFO_UCTXT_SIZE + MAX_FRAME_PADDING;
+
+ max_frame_size += fpu__get_fpstate_size() + MAX_XSAVE_PADDING;
+
+ /* Userspace expects an aligned size. */
+ max_frame_size = round_up(max_frame_size, FRAME_ALIGNMENT);
+
+ pr_info("max sigframe size: %lu\n", max_frame_size);
+}
+
+unsigned long get_sigframe_size(void)
+{
+ return max_frame_size;
+}
+
static inline int is_ia32_compat_frame(struct ksignal *ksig)
{
return IS_ENABLED(CONFIG_IA32_EMULATION) &&
@@ -713,7 +789,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
save_v86_state((struct kernel_vm86_regs *) regs, VM86_SIGNAL);
/* Are we from a system call? */
- if (syscall_get_nr(current, regs) >= 0) {
+ if (syscall_get_nr(current, regs) != -1) {
/* If so, check system call restarting.. */
switch (syscall_get_error(current, regs)) {
case -ERESTART_RESTARTBLOCK:
@@ -793,7 +869,7 @@ void arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal)
}
/* Did we come from a system call? */
- if (syscall_get_nr(current, regs) >= 0) {
+ if (syscall_get_nr(current, regs) != -1) {
/* Restart the system call - no handlers present */
switch (syscall_get_error(current, regs)) {
case -ERESTARTNOHAND:
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 7770245cc7fa..9320285a5e29 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -232,11 +232,9 @@ static void notrace start_secondary(void *unused)
load_cr3(swapper_pg_dir);
__flush_tlb_all();
#endif
- cpu_init_exception_handling();
- cpu_init();
+ cpu_init_secondary();
rcu_cpu_starting(raw_smp_processor_id());
x86_cpuinit.early_percpu_clock_init();
- preempt_disable();
smp_callin();
enable_start_cpu0 = 0;
diff --git a/arch/x86/kernel/trace.c b/arch/x86/kernel/trace.c
new file mode 100644
index 000000000000..6b73b6f92ad3
--- /dev/null
+++ b/arch/x86/kernel/trace.c
@@ -0,0 +1,234 @@
+#include <asm/trace/irq_vectors.h>
+#include <linux/trace.h>
+
+#if defined(CONFIG_OSNOISE_TRACER) && defined(CONFIG_X86_LOCAL_APIC)
+/*
+ * trace_intel_irq_entry - record intel specific IRQ entry
+ */
+static void trace_intel_irq_entry(void *data, int vector)
+{
+ osnoise_trace_irq_entry(vector);
+}
+
+/*
+ * trace_intel_irq_exit - record intel specific IRQ exit
+ */
+static void trace_intel_irq_exit(void *data, int vector)
+{
+ char *vector_desc = (char *) data;
+
+ osnoise_trace_irq_exit(vector, vector_desc);
+}
+
+/*
+ * register_intel_irq_tp - Register intel specific IRQ entry tracepoints
+ */
+int osnoise_arch_register(void)
+{
+ int ret;
+
+ ret = register_trace_local_timer_entry(trace_intel_irq_entry, NULL);
+ if (ret)
+ goto out_err;
+
+ ret = register_trace_local_timer_exit(trace_intel_irq_exit, "local_timer");
+ if (ret)
+ goto out_timer_entry;
+
+#ifdef CONFIG_X86_THERMAL_VECTOR
+ ret = register_trace_thermal_apic_entry(trace_intel_irq_entry, NULL);
+ if (ret)
+ goto out_timer_exit;
+
+ ret = register_trace_thermal_apic_exit(trace_intel_irq_exit, "thermal_apic");
+ if (ret)
+ goto out_thermal_entry;
+#endif /* CONFIG_X86_THERMAL_VECTOR */
+
+#ifdef CONFIG_X86_MCE_AMD
+ ret = register_trace_deferred_error_apic_entry(trace_intel_irq_entry, NULL);
+ if (ret)
+ goto out_thermal_exit;
+
+ ret = register_trace_deferred_error_apic_exit(trace_intel_irq_exit, "deferred_error");
+ if (ret)
+ goto out_deferred_entry;
+#endif
+
+#ifdef CONFIG_X86_MCE_THRESHOLD
+ ret = register_trace_threshold_apic_entry(trace_intel_irq_entry, NULL);
+ if (ret)
+ goto out_deferred_exit;
+
+ ret = register_trace_threshold_apic_exit(trace_intel_irq_exit, "threshold_apic");
+ if (ret)
+ goto out_threshold_entry;
+#endif /* CONFIG_X86_MCE_THRESHOLD */
+
+#ifdef CONFIG_SMP
+ ret = register_trace_call_function_single_entry(trace_intel_irq_entry, NULL);
+ if (ret)
+ goto out_threshold_exit;
+
+ ret = register_trace_call_function_single_exit(trace_intel_irq_exit,
+ "call_function_single");
+ if (ret)
+ goto out_call_function_single_entry;
+
+ ret = register_trace_call_function_entry(trace_intel_irq_entry, NULL);
+ if (ret)
+ goto out_call_function_single_exit;
+
+ ret = register_trace_call_function_exit(trace_intel_irq_exit, "call_function");
+ if (ret)
+ goto out_call_function_entry;
+
+ ret = register_trace_reschedule_entry(trace_intel_irq_entry, NULL);
+ if (ret)
+ goto out_call_function_exit;
+
+ ret = register_trace_reschedule_exit(trace_intel_irq_exit, "reschedule");
+ if (ret)
+ goto out_reschedule_entry;
+#endif /* CONFIG_SMP */
+
+#ifdef CONFIG_IRQ_WORK
+ ret = register_trace_irq_work_entry(trace_intel_irq_entry, NULL);
+ if (ret)
+ goto out_reschedule_exit;
+
+ ret = register_trace_irq_work_exit(trace_intel_irq_exit, "irq_work");
+ if (ret)
+ goto out_irq_work_entry;
+#endif
+
+ ret = register_trace_x86_platform_ipi_entry(trace_intel_irq_entry, NULL);
+ if (ret)
+ goto out_irq_work_exit;
+
+ ret = register_trace_x86_platform_ipi_exit(trace_intel_irq_exit, "x86_platform_ipi");
+ if (ret)
+ goto out_x86_ipi_entry;
+
+ ret = register_trace_error_apic_entry(trace_intel_irq_entry, NULL);
+ if (ret)
+ goto out_x86_ipi_exit;
+
+ ret = register_trace_error_apic_exit(trace_intel_irq_exit, "error_apic");
+ if (ret)
+ goto out_error_apic_entry;
+
+ ret = register_trace_spurious_apic_entry(trace_intel_irq_entry, NULL);
+ if (ret)
+ goto out_error_apic_exit;
+
+ ret = register_trace_spurious_apic_exit(trace_intel_irq_exit, "spurious_apic");
+ if (ret)
+ goto out_spurious_apic_entry;
+
+ return 0;
+
+out_spurious_apic_entry:
+ unregister_trace_spurious_apic_entry(trace_intel_irq_entry, NULL);
+out_error_apic_exit:
+ unregister_trace_error_apic_exit(trace_intel_irq_exit, "error_apic");
+out_error_apic_entry:
+ unregister_trace_error_apic_entry(trace_intel_irq_entry, NULL);
+out_x86_ipi_exit:
+ unregister_trace_x86_platform_ipi_exit(trace_intel_irq_exit, "x86_platform_ipi");
+out_x86_ipi_entry:
+ unregister_trace_x86_platform_ipi_entry(trace_intel_irq_entry, NULL);
+out_irq_work_exit:
+
+#ifdef CONFIG_IRQ_WORK
+ unregister_trace_irq_work_exit(trace_intel_irq_exit, "irq_work");
+out_irq_work_entry:
+ unregister_trace_irq_work_entry(trace_intel_irq_entry, NULL);
+out_reschedule_exit:
+#endif
+
+#ifdef CONFIG_SMP
+ unregister_trace_reschedule_exit(trace_intel_irq_exit, "reschedule");
+out_reschedule_entry:
+ unregister_trace_reschedule_entry(trace_intel_irq_entry, NULL);
+out_call_function_exit:
+ unregister_trace_call_function_exit(trace_intel_irq_exit, "call_function");
+out_call_function_entry:
+ unregister_trace_call_function_entry(trace_intel_irq_entry, NULL);
+out_call_function_single_exit:
+ unregister_trace_call_function_single_exit(trace_intel_irq_exit, "call_function_single");
+out_call_function_single_entry:
+ unregister_trace_call_function_single_entry(trace_intel_irq_entry, NULL);
+out_threshold_exit:
+#endif
+
+#ifdef CONFIG_X86_MCE_THRESHOLD
+ unregister_trace_threshold_apic_exit(trace_intel_irq_exit, "threshold_apic");
+out_threshold_entry:
+ unregister_trace_threshold_apic_entry(trace_intel_irq_entry, NULL);
+out_deferred_exit:
+#endif
+
+#ifdef CONFIG_X86_MCE_AMD
+ unregister_trace_deferred_error_apic_exit(trace_intel_irq_exit, "deferred_error");
+out_deferred_entry:
+ unregister_trace_deferred_error_apic_entry(trace_intel_irq_entry, NULL);
+out_thermal_exit:
+#endif /* CONFIG_X86_MCE_AMD */
+
+#ifdef CONFIG_X86_THERMAL_VECTOR
+ unregister_trace_thermal_apic_exit(trace_intel_irq_exit, "thermal_apic");
+out_thermal_entry:
+ unregister_trace_thermal_apic_entry(trace_intel_irq_entry, NULL);
+out_timer_exit:
+#endif /* CONFIG_X86_THERMAL_VECTOR */
+
+ unregister_trace_local_timer_exit(trace_intel_irq_exit, "local_timer");
+out_timer_entry:
+ unregister_trace_local_timer_entry(trace_intel_irq_entry, NULL);
+out_err:
+ return -EINVAL;
+}
+
+void osnoise_arch_unregister(void)
+{
+ unregister_trace_spurious_apic_exit(trace_intel_irq_exit, "spurious_apic");
+ unregister_trace_spurious_apic_entry(trace_intel_irq_entry, NULL);
+ unregister_trace_error_apic_exit(trace_intel_irq_exit, "error_apic");
+ unregister_trace_error_apic_entry(trace_intel_irq_entry, NULL);
+ unregister_trace_x86_platform_ipi_exit(trace_intel_irq_exit, "x86_platform_ipi");
+ unregister_trace_x86_platform_ipi_entry(trace_intel_irq_entry, NULL);
+
+#ifdef CONFIG_IRQ_WORK
+ unregister_trace_irq_work_exit(trace_intel_irq_exit, "irq_work");
+ unregister_trace_irq_work_entry(trace_intel_irq_entry, NULL);
+#endif
+
+#ifdef CONFIG_SMP
+ unregister_trace_reschedule_exit(trace_intel_irq_exit, "reschedule");
+ unregister_trace_reschedule_entry(trace_intel_irq_entry, NULL);
+ unregister_trace_call_function_exit(trace_intel_irq_exit, "call_function");
+ unregister_trace_call_function_entry(trace_intel_irq_entry, NULL);
+ unregister_trace_call_function_single_exit(trace_intel_irq_exit, "call_function_single");
+ unregister_trace_call_function_single_entry(trace_intel_irq_entry, NULL);
+#endif
+
+#ifdef CONFIG_X86_MCE_THRESHOLD
+ unregister_trace_threshold_apic_exit(trace_intel_irq_exit, "threshold_apic");
+ unregister_trace_threshold_apic_entry(trace_intel_irq_entry, NULL);
+#endif
+
+#ifdef CONFIG_X86_MCE_AMD
+ unregister_trace_deferred_error_apic_exit(trace_intel_irq_exit, "deferred_error");
+ unregister_trace_deferred_error_apic_entry(trace_intel_irq_entry, NULL);
+#endif
+
+#ifdef CONFIG_X86_THERMAL_VECTOR
+ unregister_trace_thermal_apic_exit(trace_intel_irq_exit, "thermal_apic");
+ unregister_trace_thermal_apic_entry(trace_intel_irq_entry, NULL);
+#endif /* CONFIG_X86_THERMAL_VECTOR */
+
+ unregister_trace_local_timer_exit(trace_intel_irq_exit, "local_timer");
+ unregister_trace_local_timer_entry(trace_intel_irq_entry, NULL);
+}
+#endif /* CONFIG_OSNOISE_TRAECR && CONFIG_X86_LOCAL_APIC */
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 853ea7a80806..a58800973aed 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -1046,9 +1046,10 @@ static void math_error(struct pt_regs *regs, int trapnr)
}
/*
- * Save the info for the exception handler and clear the error.
+ * Synchronize the FPU register state to the memory register state
+ * if necessary. This allows the exception handler to inspect it.
*/
- fpu__save(fpu);
+ fpu_sync_fpstate(fpu);
task->thread.trap_nr = trapnr;
task->thread.error_code = 0;
@@ -1160,12 +1161,9 @@ void __init trap_init(void)
/* Init GHCB memory pages when running as an SEV-ES guest */
sev_es_init_vc_handling();
+ /* Initialize TSS before setting up traps so ISTs work */
+ cpu_init_exception_handling();
+ /* Setup traps as cpu_init() might #GP */
idt_setup_traps();
-
- /*
- * Should be a barrier for any external CPU state:
- */
cpu_init();
-
- idt_setup_ist_traps();
}
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 57ec01192180..2e076a459a0c 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -1128,6 +1128,7 @@ static int tsc_cs_enable(struct clocksource *cs)
static struct clocksource clocksource_tsc_early = {
.name = "tsc-early",
.rating = 299,
+ .uncertainty_margin = 32 * NSEC_PER_MSEC,
.read = read_tsc,
.mask = CLOCKSOURCE_MASK(64),
.flags = CLOCK_SOURCE_IS_CONTINUOUS |
@@ -1152,7 +1153,8 @@ static struct clocksource clocksource_tsc = {
.mask = CLOCKSOURCE_MASK(64),
.flags = CLOCK_SOURCE_IS_CONTINUOUS |
CLOCK_SOURCE_VALID_FOR_HRES |
- CLOCK_SOURCE_MUST_VERIFY,
+ CLOCK_SOURCE_MUST_VERIFY |
+ CLOCK_SOURCE_VERIFY_PERCPU,
.vdso_clock_mode = VDSO_CLOCKMODE_TSC,
.enable = tsc_cs_enable,
.resume = tsc_resume,
diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c
index 8daa70b0d2da..576b47e7523d 100644
--- a/arch/x86/kernel/umip.c
+++ b/arch/x86/kernel/umip.c
@@ -346,14 +346,12 @@ bool fixup_umip_exception(struct pt_regs *regs)
if (!regs)
return false;
- nr_copied = insn_fetch_from_user(regs, buf);
-
/*
- * The insn_fetch_from_user above could have failed if user code
- * is protected by a memory protection key. Give up on emulation
- * in such a case. Should we issue a page fault?
+ * Give up on emulation if fetching the instruction failed. Should a
+ * page fault or a #GP be issued?
*/
- if (!nr_copied)
+ nr_copied = insn_fetch_from_user(regs, buf);
+ if (nr_copied <= 0)
return false;
if (!insn_decode_from_regs(&insn, regs, buf, nr_copied))
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 7a78b88c0f1a..ac69894eab88 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -22,8 +22,6 @@ config KVM
tristate "Kernel-based Virtual Machine (KVM) support"
depends on HAVE_KVM
depends on HIGH_RES_TIMERS
- # for TASKSTATS/TASK_DELAY_ACCT:
- depends on NET && MULTIUSER
depends on X86_LOCAL_APIC
select PREEMPT_NOTIFIERS
select MMU_NOTIFIER
@@ -36,8 +34,7 @@ config KVM
select KVM_ASYNC_PF
select USER_RETURN_NOTIFIER
select KVM_MMIO
- select TASKSTATS
- select TASK_DELAY_ACCT
+ select SCHED_INFO
select PERF_EVENTS
select HAVE_KVM_MSI
select HAVE_KVM_CPU_RELAX_INTERCEPT
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index b888385d1933..66f7f5bc3482 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -2452,7 +2452,7 @@ static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
* page is available, while the caller may end up allocating as many as
* four pages, e.g. for PAE roots or for 5-level paging. Temporarily
* exceeding the (arbitrary by default) limit will not harm the host,
- * being too agressive may unnecessarily kill the guest, and getting an
+ * being too aggressive may unnecessarily kill the guest, and getting an
* exact count is far more trouble than it's worth, especially in the
* page fault paths.
*/
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index caac4ddb46df..0853370bd811 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1018,7 +1018,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
if (!is_shadow_present_pte(iter.old_spte)) {
/*
- * If SPTE has been forzen by another thread, just
+ * If SPTE has been frozen by another thread, just
* give up and retry, avoiding unnecessary page table
* allocation and free.
*/
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 02d60d7f903d..6710d9ee2e4b 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -19,6 +19,7 @@
#include <linux/trace_events.h>
#include <asm/fpu/internal.h>
+#include <asm/pkru.h>
#include <asm/trapnr.h>
#include "x86.h"
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c471b1375f36..a4fd10604f72 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -66,6 +66,7 @@
#include <asm/msr.h>
#include <asm/desc.h>
#include <asm/mce.h>
+#include <asm/pkru.h>
#include <linux/kernel_stat.h>
#include <asm/fpu/internal.h> /* Ugh! */
#include <asm/pvclock.h>
@@ -939,7 +940,7 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
(kvm_read_cr4_bits(vcpu, X86_CR4_PKE) ||
(vcpu->arch.xcr0 & XFEATURE_MASK_PKRU)) &&
vcpu->arch.pkru != vcpu->arch.host_pkru)
- __write_pkru(vcpu->arch.pkru);
+ write_pkru(vcpu->arch.pkru);
}
EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state);
@@ -953,7 +954,7 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
(vcpu->arch.xcr0 & XFEATURE_MASK_PKRU))) {
vcpu->arch.pkru = rdpkru();
if (vcpu->arch.pkru != vcpu->arch.host_pkru)
- __write_pkru(vcpu->arch.host_pkru);
+ write_pkru(vcpu->arch.host_pkru);
}
if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {
@@ -4704,20 +4705,21 @@ static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu)
*/
valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
while (valid) {
+ u32 size, offset, ecx, edx;
u64 xfeature_mask = valid & -valid;
int xfeature_nr = fls64(xfeature_mask) - 1;
- void *src = get_xsave_addr(xsave, xfeature_nr);
-
- if (src) {
- u32 size, offset, ecx, edx;
- cpuid_count(XSTATE_CPUID, xfeature_nr,
- &size, &offset, &ecx, &edx);
- if (xfeature_nr == XFEATURE_PKRU)
- memcpy(dest + offset, &vcpu->arch.pkru,
- sizeof(vcpu->arch.pkru));
- else
- memcpy(dest + offset, src, size);
+ void *src;
+
+ cpuid_count(XSTATE_CPUID, xfeature_nr,
+ &size, &offset, &ecx, &edx);
+ if (xfeature_nr == XFEATURE_PKRU) {
+ memcpy(dest + offset, &vcpu->arch.pkru,
+ sizeof(vcpu->arch.pkru));
+ } else {
+ src = get_xsave_addr(xsave, xfeature_nr);
+ if (src)
+ memcpy(dest + offset, src, size);
}
valid -= xfeature_mask;
@@ -4747,18 +4749,20 @@ static void load_xsave(struct kvm_vcpu *vcpu, u8 *src)
*/
valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
while (valid) {
+ u32 size, offset, ecx, edx;
u64 xfeature_mask = valid & -valid;
int xfeature_nr = fls64(xfeature_mask) - 1;
- void *dest = get_xsave_addr(xsave, xfeature_nr);
-
- if (dest) {
- u32 size, offset, ecx, edx;
- cpuid_count(XSTATE_CPUID, xfeature_nr,
- &size, &offset, &ecx, &edx);
- if (xfeature_nr == XFEATURE_PKRU)
- memcpy(&vcpu->arch.pkru, src + offset,
- sizeof(vcpu->arch.pkru));
- else
+
+ cpuid_count(XSTATE_CPUID, xfeature_nr,
+ &size, &offset, &ecx, &edx);
+
+ if (xfeature_nr == XFEATURE_PKRU) {
+ memcpy(&vcpu->arch.pkru, src + offset,
+ sizeof(vcpu->arch.pkru));
+ } else {
+ void *dest = get_xsave_addr(xsave, xfeature_nr);
+
+ if (dest)
memcpy(dest, src + offset, size);
}
@@ -9887,7 +9891,7 @@ static void kvm_save_current_fpu(struct fpu *fpu)
memcpy(&fpu->state, &current->thread.fpu.state,
fpu_kernel_xstate_size);
else
- copy_fpregs_to_fpstate(fpu);
+ save_fpregs_to_fpstate(fpu);
}
/* Swap (qemu) user FPU context for the guest FPU context. */
@@ -9903,7 +9907,7 @@ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
*/
if (vcpu->arch.guest_fpu)
/* PKRU is separately restored in kvm_x86_ops.run. */
- __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state,
+ __restore_fpregs_from_fpstate(&vcpu->arch.guest_fpu->state,
~XFEATURE_MASK_PKRU);
fpregs_mark_activate();
@@ -9924,7 +9928,7 @@ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
if (vcpu->arch.guest_fpu)
kvm_save_current_fpu(vcpu->arch.guest_fpu);
- copy_kernel_to_fpregs(&vcpu->arch.user_fpu->state);
+ restore_fpregs_from_fpstate(&vcpu->arch.user_fpu->state);
fpregs_mark_activate();
fpregs_unlock();
diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c
index a67afd74232c..a1d24fdc07cf 100644
--- a/arch/x86/lib/insn-eval.c
+++ b/arch/x86/lib/insn-eval.c
@@ -1417,7 +1417,7 @@ void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs)
}
}
-static unsigned long insn_get_effective_ip(struct pt_regs *regs)
+static int insn_get_effective_ip(struct pt_regs *regs, unsigned long *ip)
{
unsigned long seg_base = 0;
@@ -1430,10 +1430,12 @@ static unsigned long insn_get_effective_ip(struct pt_regs *regs)
if (!user_64bit_mode(regs)) {
seg_base = insn_get_seg_base(regs, INAT_SEG_REG_CS);
if (seg_base == -1L)
- return 0;
+ return -EINVAL;
}
- return seg_base + regs->ip;
+ *ip = seg_base + regs->ip;
+
+ return 0;
}
/**
@@ -1446,18 +1448,17 @@ static unsigned long insn_get_effective_ip(struct pt_regs *regs)
*
* Returns:
*
- * Number of instruction bytes copied.
- *
- * 0 if nothing was copied.
+ * - number of instruction bytes copied.
+ * - 0 if nothing was copied.
+ * - -EINVAL if the linear address of the instruction could not be calculated
*/
int insn_fetch_from_user(struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE])
{
unsigned long ip;
int not_copied;
- ip = insn_get_effective_ip(regs);
- if (!ip)
- return 0;
+ if (insn_get_effective_ip(regs, &ip))
+ return -EINVAL;
not_copied = copy_from_user(buf, (void __user *)ip, MAX_INSN_SIZE);
@@ -1475,18 +1476,17 @@ int insn_fetch_from_user(struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE])
*
* Returns:
*
- * Number of instruction bytes copied.
- *
- * 0 if nothing was copied.
+ * - number of instruction bytes copied.
+ * - 0 if nothing was copied.
+ * - -EINVAL if the linear address of the instruction could not be calculated.
*/
int insn_fetch_from_user_inatomic(struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE])
{
unsigned long ip;
int not_copied;
- ip = insn_get_effective_ip(regs);
- if (!ip)
- return 0;
+ if (insn_get_effective_ip(regs, &ip))
+ return -EINVAL;
not_copied = __copy_from_user_inatomic(buf, (void __user *)ip, MAX_INSN_SIZE);
diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S
index 4d32cb06ffd5..ec9922cba30a 100644
--- a/arch/x86/lib/retpoline.S
+++ b/arch/x86/lib/retpoline.S
@@ -58,12 +58,16 @@ SYM_FUNC_START_NOALIGN(__x86_indirect_alt_call_\reg)
2: .skip 5-(2b-1b), 0x90
SYM_FUNC_END(__x86_indirect_alt_call_\reg)
+STACK_FRAME_NON_STANDARD(__x86_indirect_alt_call_\reg)
+
SYM_FUNC_START_NOALIGN(__x86_indirect_alt_jmp_\reg)
ANNOTATE_RETPOLINE_SAFE
1: jmp *%\reg
2: .skip 5-(2b-1b), 0x90
SYM_FUNC_END(__x86_indirect_alt_jmp_\reg)
+STACK_FRAME_NON_STANDARD(__x86_indirect_alt_jmp_\reg)
+
.endm
/*
diff --git a/arch/x86/math-emu/fpu_proto.h b/arch/x86/math-emu/fpu_proto.h
index 70d35c200945..94c4023092f3 100644
--- a/arch/x86/math-emu/fpu_proto.h
+++ b/arch/x86/math-emu/fpu_proto.h
@@ -144,7 +144,7 @@ extern int FPU_store_int16(FPU_REG *st0_ptr, u_char st0_tag, short __user *d);
extern int FPU_store_bcd(FPU_REG *st0_ptr, u_char st0_tag, u_char __user *d);
extern int FPU_round_to_int(FPU_REG *r, u_char tag);
extern u_char __user *fldenv(fpu_addr_modes addr_modes, u_char __user *s);
-extern void frstor(fpu_addr_modes addr_modes, u_char __user *data_address);
+extern void FPU_frstor(fpu_addr_modes addr_modes, u_char __user *data_address);
extern u_char __user *fstenv(fpu_addr_modes addr_modes, u_char __user *d);
extern void fsave(fpu_addr_modes addr_modes, u_char __user *data_address);
extern int FPU_tagof(FPU_REG *ptr);
diff --git a/arch/x86/math-emu/load_store.c b/arch/x86/math-emu/load_store.c
index f15263e158e8..4092df79de4f 100644
--- a/arch/x86/math-emu/load_store.c
+++ b/arch/x86/math-emu/load_store.c
@@ -240,7 +240,7 @@ int FPU_load_store(u_char type, fpu_addr_modes addr_modes,
fix-up operations. */
return 1;
case 022: /* frstor m94/108byte */
- frstor(addr_modes, (u_char __user *) data_address);
+ FPU_frstor(addr_modes, (u_char __user *) data_address);
/* Ensure that the values just loaded are not changed by
fix-up operations. */
return 1;
diff --git a/arch/x86/math-emu/reg_ld_str.c b/arch/x86/math-emu/reg_ld_str.c
index 7ca6417c0c8d..7e4521fbe7da 100644
--- a/arch/x86/math-emu/reg_ld_str.c
+++ b/arch/x86/math-emu/reg_ld_str.c
@@ -1117,7 +1117,7 @@ u_char __user *fldenv(fpu_addr_modes addr_modes, u_char __user *s)
return s;
}
-void frstor(fpu_addr_modes addr_modes, u_char __user *data_address)
+void FPU_frstor(fpu_addr_modes addr_modes, u_char __user *data_address)
{
int i, regnr;
u_char __user *s = fldenv(addr_modes, data_address);
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 121921b2927c..e1664e9f969c 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -65,7 +65,7 @@ __visible bool ex_handler_fprestore(const struct exception_table_entry *fixup,
WARN_ONCE(1, "Bad FPU state detected at %pB, reinitializing FPU registers.",
(void *)instruction_pointer(regs));
- __copy_kernel_to_fpregs(&init_fpstate, -1);
+ __restore_fpregs_from_fpstate(&init_fpstate, xfeatures_mask_fpstate());
return true;
}
EXPORT_SYMBOL_GPL(ex_handler_fprestore);
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 6bda7f67d737..b2eefdefc108 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -875,7 +875,7 @@ static inline bool bad_area_access_from_pkeys(unsigned long error_code,
/* This code is always called on the current mm */
bool foreign = false;
- if (!boot_cpu_has(X86_FEATURE_OSPKE))
+ if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
return false;
if (error_code & X86_PF_PK)
return true;
@@ -1186,7 +1186,7 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
return;
/* kprobes don't want to hook the spurious faults: */
- if (kprobe_page_fault(regs, X86_TRAP_PF))
+ if (WARN_ON_ONCE(kprobe_page_fault(regs, X86_TRAP_PF)))
return;
/*
@@ -1239,7 +1239,7 @@ void do_user_addr_fault(struct pt_regs *regs,
}
/* kprobes don't want to hook the spurious faults: */
- if (unlikely(kprobe_page_fault(regs, X86_TRAP_PF)))
+ if (WARN_ON_ONCE(kprobe_page_fault(regs, X86_TRAP_PF)))
return;
/*
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 21ffb03f6c72..74b78840182d 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -651,7 +651,7 @@ void __init find_low_pfn_range(void)
highmem_pfn_init();
}
-#ifndef CONFIG_NEED_MULTIPLE_NODES
+#ifndef CONFIG_NUMA
void __init initmem_init(void)
{
#ifdef CONFIG_HIGHMEM
@@ -677,7 +677,7 @@ void __init initmem_init(void)
setup_bootmem_allocator();
}
-#endif /* !CONFIG_NEED_MULTIPLE_NODES */
+#endif /* !CONFIG_NUMA */
void __init setup_bootmem_allocator(void)
{
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index e527d829e1ed..ddeaba947eb3 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -33,6 +33,7 @@
#include <linux/nmi.h>
#include <linux/gfp.h>
#include <linux/kcore.h>
+#include <linux/bootmem_info.h>
#include <asm/processor.h>
#include <asm/bios_ebda.h>
@@ -193,8 +194,8 @@ static void sync_global_pgds_l4(unsigned long start, unsigned long end)
spin_lock(pgt_lock);
if (!p4d_none(*p4d_ref) && !p4d_none(*p4d))
- BUG_ON(p4d_page_vaddr(*p4d)
- != p4d_page_vaddr(*p4d_ref));
+ BUG_ON(p4d_pgtable(*p4d)
+ != p4d_pgtable(*p4d_ref));
if (p4d_none(*p4d))
set_p4d(p4d, *p4d_ref);
@@ -1269,7 +1270,7 @@ static struct kcore_list kcore_vsyscall;
static void __init register_page_bootmem_info(void)
{
-#ifdef CONFIG_NUMA
+#if defined(CONFIG_NUMA) || defined(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP)
int i;
for_each_online_node(i)
@@ -1623,7 +1624,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
return err;
}
-#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HAVE_BOOTMEM_INFO_NODE)
+#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
void register_page_bootmem_memmap(unsigned long section_nr,
struct page *start_page, unsigned long nr_pages)
{
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 156cd235659f..ad8a5c586a35 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -1134,7 +1134,7 @@ static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd,
unsigned long start, unsigned long end)
{
if (unmap_pte_range(pmd, start, end))
- if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
+ if (try_to_free_pmd_page(pud_pgtable(*pud)))
pud_clear(pud);
}
@@ -1178,7 +1178,7 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
* Try again to free the PMD page if haven't succeeded above.
*/
if (!pud_none(*pud))
- if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
+ if (try_to_free_pmd_page(pud_pgtable(*pud)))
pud_clear(pud);
}
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index d27cf69e811d..3364fe62b903 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -682,6 +682,7 @@ int p4d_clear_huge(p4d_t *p4d)
}
#endif
+#if CONFIG_PGTABLE_LEVELS > 3
/**
* pud_set_huge - setup kernel PUD mapping
*
@@ -721,6 +722,23 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
}
/**
+ * pud_clear_huge - clear kernel PUD mapping when it is set
+ *
+ * Returns 1 on success and 0 on failure (no PUD map is found).
+ */
+int pud_clear_huge(pud_t *pud)
+{
+ if (pud_large(*pud)) {
+ pud_clear(pud);
+ return 1;
+ }
+
+ return 0;
+}
+#endif
+
+#if CONFIG_PGTABLE_LEVELS > 2
+/**
* pmd_set_huge - setup kernel PMD mapping
*
* See text over pud_set_huge() above.
@@ -751,21 +769,6 @@ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
}
/**
- * pud_clear_huge - clear kernel PUD mapping when it is set
- *
- * Returns 1 on success and 0 on failure (no PUD map is found).
- */
-int pud_clear_huge(pud_t *pud)
-{
- if (pud_large(*pud)) {
- pud_clear(pud);
- return 1;
- }
-
- return 0;
-}
-
-/**
* pmd_clear_huge - clear kernel PMD mapping when it is set
*
* Returns 1 on success and 0 on failure (no PMD map is found).
@@ -779,6 +782,7 @@ int pmd_clear_huge(pmd_t *pmd)
return 0;
}
+#endif
#ifdef CONFIG_X86_64
/**
@@ -797,7 +801,7 @@ int pud_free_pmd_page(pud_t *pud, unsigned long addr)
pte_t *pte;
int i;
- pmd = (pmd_t *)pud_page_vaddr(*pud);
+ pmd = pud_pgtable(*pud);
pmd_sv = (pmd_t *)__get_free_page(GFP_KERNEL);
if (!pmd_sv)
return 0;
diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c
index a2332eef66e9..e44e938885b7 100644
--- a/arch/x86/mm/pkeys.c
+++ b/arch/x86/mm/pkeys.c
@@ -10,7 +10,6 @@
#include <asm/cpufeature.h> /* boot_cpu_has, ... */
#include <asm/mmu_context.h> /* vma_pkey() */
-#include <asm/fpu/internal.h> /* init_fpstate */
int __execute_only_pkey(struct mm_struct *mm)
{
@@ -125,22 +124,6 @@ u32 init_pkru_value = PKRU_AD_KEY( 1) | PKRU_AD_KEY( 2) | PKRU_AD_KEY( 3) |
PKRU_AD_KEY(10) | PKRU_AD_KEY(11) | PKRU_AD_KEY(12) |
PKRU_AD_KEY(13) | PKRU_AD_KEY(14) | PKRU_AD_KEY(15);
-/*
- * Called from the FPU code when creating a fresh set of FPU
- * registers. This is called from a very specific context where
- * we know the FPU registers are safe for use and we can use PKRU
- * directly.
- */
-void copy_init_pkru_to_fpregs(void)
-{
- u32 init_pkru_value_snapshot = READ_ONCE(init_pkru_value);
- /*
- * Override the PKRU state that came from 'init_fpstate'
- * with the baseline from the process.
- */
- write_pkru(init_pkru_value_snapshot);
-}
-
static ssize_t init_pkru_read_file(struct file *file, char __user *user_buf,
size_t count, loff_t *ppos)
{
@@ -154,7 +137,6 @@ static ssize_t init_pkru_read_file(struct file *file, char __user *user_buf,
static ssize_t init_pkru_write_file(struct file *file,
const char __user *user_buf, size_t count, loff_t *ppos)
{
- struct pkru_state *pk;
char buf[32];
ssize_t len;
u32 new_init_pkru;
@@ -177,10 +159,6 @@ static ssize_t init_pkru_write_file(struct file *file,
return -EINVAL;
WRITE_ONCE(init_pkru_value, new_init_pkru);
- pk = get_xsave_addr(&init_fpstate.xsave, XFEATURE_PKRU);
- if (!pk)
- return -EINVAL;
- pk->pkru = new_init_pkru;
return count;
}
@@ -192,6 +170,10 @@ static const struct file_operations fops_init_pkru = {
static int __init create_init_pkru_value(void)
{
+ /* Do not expose the file if pkeys are not supported. */
+ if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
+ return 0;
+
debugfs_create_file("init_pkru", S_IRUSR | S_IWUSR,
arch_debugfs_dir, NULL, &fops_init_pkru);
return 0;
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 78804680e923..cfe6b1e85fa6 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -14,6 +14,7 @@
#include <asm/nospec-branch.h>
#include <asm/cache.h>
#include <asm/apic.h>
+#include <asm/perf_event.h>
#include "mm_internal.h"
@@ -404,9 +405,14 @@ static inline void cr4_update_pce_mm(struct mm_struct *mm)
{
if (static_branch_unlikely(&rdpmc_always_available_key) ||
(!static_branch_unlikely(&rdpmc_never_available_key) &&
- atomic_read(&mm->context.perf_rdpmc_allowed)))
+ atomic_read(&mm->context.perf_rdpmc_allowed))) {
+ /*
+ * Clear the existing dirty counters to
+ * prevent the leak for an RDPMC task.
+ */
+ perf_clear_dirty_counters();
cr4_set_bits_irqsoff(X86_CR4_PCE);
- else
+ } else
cr4_clear_bits_irqsoff(X86_CR4_PCE);
}
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 2a2e290fa5d8..4b951458c9fc 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -31,7 +31,7 @@ static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
}
#define EMIT(bytes, len) \
- do { prog = emit_code(prog, bytes, len); cnt += len; } while (0)
+ do { prog = emit_code(prog, bytes, len); } while (0)
#define EMIT1(b1) EMIT(b1, 1)
#define EMIT2(b1, b2) EMIT((b1) + ((b2) << 8), 2)
@@ -239,7 +239,6 @@ struct jit_context {
static void push_callee_regs(u8 **pprog, bool *callee_regs_used)
{
u8 *prog = *pprog;
- int cnt = 0;
if (callee_regs_used[0])
EMIT1(0x53); /* push rbx */
@@ -255,7 +254,6 @@ static void push_callee_regs(u8 **pprog, bool *callee_regs_used)
static void pop_callee_regs(u8 **pprog, bool *callee_regs_used)
{
u8 *prog = *pprog;
- int cnt = 0;
if (callee_regs_used[3])
EMIT2(0x41, 0x5F); /* pop r15 */
@@ -277,13 +275,12 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf,
bool tail_call_reachable, bool is_subprog)
{
u8 *prog = *pprog;
- int cnt = X86_PATCH_SIZE;
/* BPF trampoline can be made to work without these nops,
* but let's waste 5 bytes for now and optimize later
*/
- memcpy(prog, x86_nops[5], cnt);
- prog += cnt;
+ memcpy(prog, x86_nops[5], X86_PATCH_SIZE);
+ prog += X86_PATCH_SIZE;
if (!ebpf_from_cbpf) {
if (tail_call_reachable && !is_subprog)
EMIT2(0x31, 0xC0); /* xor eax, eax */
@@ -303,7 +300,6 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf,
static int emit_patch(u8 **pprog, void *func, void *ip, u8 opcode)
{
u8 *prog = *pprog;
- int cnt = 0;
s64 offset;
offset = func - (ip + X86_PATCH_SIZE);
@@ -423,7 +419,6 @@ static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used,
int off1 = 42;
int off2 = 31;
int off3 = 9;
- int cnt = 0;
/* count the additional bytes used for popping callee regs from stack
* that need to be taken into account for each of the offsets that
@@ -513,7 +508,6 @@ static void emit_bpf_tail_call_direct(struct bpf_jit_poke_descriptor *poke,
int pop_bytes = 0;
int off1 = 20;
int poke_off;
- int cnt = 0;
/* count the additional bytes used for popping callee regs to stack
* that need to be taken into account for jump offset that is used for
@@ -576,6 +570,9 @@ static void bpf_tail_call_direct_fixup(struct bpf_prog *prog)
for (i = 0; i < prog->aux->size_poke_tab; i++) {
poke = &prog->aux->poke_tab[i];
+ if (poke->aux && poke->aux != prog->aux)
+ continue;
+
WARN_ON_ONCE(READ_ONCE(poke->tailcall_target_stable));
if (poke->reason != BPF_POKE_REASON_TAIL_CALL)
@@ -615,7 +612,6 @@ static void emit_mov_imm32(u8 **pprog, bool sign_propagate,
{
u8 *prog = *pprog;
u8 b1, b2, b3;
- int cnt = 0;
/*
* Optimization: if imm32 is positive, use 'mov %eax, imm32'
@@ -655,7 +651,6 @@ static void emit_mov_imm64(u8 **pprog, u32 dst_reg,
const u32 imm32_hi, const u32 imm32_lo)
{
u8 *prog = *pprog;
- int cnt = 0;
if (is_uimm32(((u64)imm32_hi << 32) | (u32)imm32_lo)) {
/*
@@ -678,7 +673,6 @@ static void emit_mov_imm64(u8 **pprog, u32 dst_reg,
static void emit_mov_reg(u8 **pprog, bool is64, u32 dst_reg, u32 src_reg)
{
u8 *prog = *pprog;
- int cnt = 0;
if (is64) {
/* mov dst, src */
@@ -697,7 +691,6 @@ static void emit_mov_reg(u8 **pprog, bool is64, u32 dst_reg, u32 src_reg)
static void emit_insn_suffix(u8 **pprog, u32 ptr_reg, u32 val_reg, int off)
{
u8 *prog = *pprog;
- int cnt = 0;
if (is_imm8(off)) {
/* 1-byte signed displacement.
@@ -720,7 +713,6 @@ static void emit_insn_suffix(u8 **pprog, u32 ptr_reg, u32 val_reg, int off)
static void maybe_emit_mod(u8 **pprog, u32 dst_reg, u32 src_reg, bool is64)
{
u8 *prog = *pprog;
- int cnt = 0;
if (is64)
EMIT1(add_2mod(0x48, dst_reg, src_reg));
@@ -733,7 +725,6 @@ static void maybe_emit_mod(u8 **pprog, u32 dst_reg, u32 src_reg, bool is64)
static void emit_ldx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
{
u8 *prog = *pprog;
- int cnt = 0;
switch (size) {
case BPF_B:
@@ -764,7 +755,6 @@ static void emit_ldx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
{
u8 *prog = *pprog;
- int cnt = 0;
switch (size) {
case BPF_B:
@@ -799,7 +789,6 @@ static int emit_atomic(u8 **pprog, u8 atomic_op,
u32 dst_reg, u32 src_reg, s16 off, u8 bpf_size)
{
u8 *prog = *pprog;
- int cnt = 0;
EMIT1(0xF0); /* lock prefix */
@@ -869,10 +858,10 @@ static void detect_reg_usage(struct bpf_insn *insn, int insn_cnt,
}
}
-static int emit_nops(u8 **pprog, int len)
+static void emit_nops(u8 **pprog, int len)
{
u8 *prog = *pprog;
- int i, noplen, cnt = 0;
+ int i, noplen;
while (len > 0) {
noplen = len;
@@ -886,8 +875,6 @@ static int emit_nops(u8 **pprog, int len)
}
*pprog = prog;
-
- return cnt;
}
#define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp)))
@@ -902,7 +889,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
bool tail_call_seen = false;
bool seen_exit = false;
u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
- int i, cnt = 0, excnt = 0;
+ int i, excnt = 0;
int ilen, proglen = 0;
u8 *prog = temp;
int err;
@@ -1297,7 +1284,7 @@ st: if (is_imm8(insn->off))
emit_ldx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off);
if (BPF_MODE(insn->code) == BPF_PROBE_MEM) {
struct exception_table_entry *ex;
- u8 *_insn = image + proglen;
+ u8 *_insn = image + proglen + (start_of_ldx - temp);
s64 delta;
/* populate jmp_offset for JMP above */
@@ -1576,7 +1563,7 @@ emit_cond_jmp: /* Convert BPF opcode to x86 */
nops);
return -EFAULT;
}
- cnt += emit_nops(&prog, nops);
+ emit_nops(&prog, nops);
}
EMIT2(jmp_cond, jmp_offset);
} else if (is_simm32(jmp_offset)) {
@@ -1622,7 +1609,7 @@ emit_cond_jmp: /* Convert BPF opcode to x86 */
nops);
return -EFAULT;
}
- cnt += emit_nops(&prog, nops);
+ emit_nops(&prog, nops);
}
break;
}
@@ -1647,7 +1634,7 @@ emit_jmp:
nops);
return -EFAULT;
}
- cnt += emit_nops(&prog, INSN_SZ_DIFF - 2);
+ emit_nops(&prog, INSN_SZ_DIFF - 2);
}
EMIT2(0xEB, jmp_offset);
} else if (is_simm32(jmp_offset)) {
@@ -1754,7 +1741,6 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
{
u8 *prog = *pprog;
u8 *jmp_insn;
- int cnt = 0;
/* arg1: mov rdi, progs[i] */
emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p);
@@ -1822,7 +1808,6 @@ static void emit_align(u8 **pprog, u32 align)
static int emit_cond_near_jump(u8 **pprog, void *func, void *ip, u8 jmp_cond)
{
u8 *prog = *pprog;
- int cnt = 0;
s64 offset;
offset = func - (ip + 2 + 4);
@@ -1854,7 +1839,7 @@ static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog,
u8 **branches)
{
u8 *prog = *pprog;
- int i, cnt = 0;
+ int i;
/* The first fmod_ret program will receive a garbage return value.
* Set this to 0 to avoid confusing the program.
@@ -1950,7 +1935,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
struct bpf_tramp_progs *tprogs,
void *orig_call)
{
- int ret, i, cnt = 0, nr_args = m->nr_args;
+ int ret, i, nr_args = m->nr_args;
int stack_size = nr_args * 8;
struct bpf_tramp_progs *fentry = &tprogs[BPF_TRAMP_FENTRY];
struct bpf_tramp_progs *fexit = &tprogs[BPF_TRAMP_FEXIT];
@@ -2095,8 +2080,6 @@ static int emit_fallback_jump(u8 **pprog)
*/
err = emit_jump(&prog, __x86_indirect_thunk_rdx, prog);
#else
- int cnt = 0;
-
EMIT2(0xFF, 0xE2); /* jmp rdx */
#endif
*pprog = prog;
@@ -2106,7 +2089,7 @@ static int emit_fallback_jump(u8 **pprog)
static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs)
{
u8 *jg_reloc, *prog = *pprog;
- int pivot, err, jg_bytes = 1, cnt = 0;
+ int pivot, err, jg_bytes = 1;
s64 jg_offset;
if (a == b) {
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index de6bf0e7e8f8..758cbfe55daa 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -461,7 +461,7 @@ static bool __ref is_mmconf_reserved(check_reserved_t is_reserved,
}
if (size < (16UL<<20) && size != old_size)
- return 0;
+ return false;
if (dev)
dev_info(dev, "MMCONFIG at %pR reserved in %s\n",
@@ -493,7 +493,7 @@ static bool __ref is_mmconf_reserved(check_reserved_t is_reserved,
&cfg->res, (unsigned long) cfg->address);
}
- return 1;
+ return true;
}
static bool __ref
@@ -501,7 +501,7 @@ pci_mmcfg_check_reserved(struct device *dev, struct pci_mmcfg_region *cfg, int e
{
if (!early && !acpi_disabled) {
if (is_mmconf_reserved(is_acpi_reserved, cfg, dev, 0))
- return 1;
+ return true;
if (dev)
dev_info(dev, FW_INFO
@@ -522,14 +522,14 @@ pci_mmcfg_check_reserved(struct device *dev, struct pci_mmcfg_region *cfg, int e
* _CBA method, just assume it's reserved.
*/
if (pci_mmcfg_running_state)
- return 1;
+ return true;
/* Don't try to do this check unless configuration
type 1 is available. how about type 2 ?*/
if (raw_pci_ops)
return is_mmconf_reserved(e820__mapped_all, cfg, dev, 1);
- return 0;
+ return false;
}
static void __init pci_mmcfg_reject_broken(int early)
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 8a26e705cb06..147c30a81f15 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -468,7 +468,7 @@ void __init efi_init(void)
*/
if (!efi_runtime_supported())
- pr_info("No EFI runtime due to 32/64-bit mismatch with kernel\n");
+ pr_err("No EFI runtime due to 32/64-bit mismatch with kernel\n");
if (!efi_runtime_supported() || efi_runtime_disabled()) {
efi_memmap_unmap();
diff --git a/arch/x86/purgatory/purgatory.c b/arch/x86/purgatory/purgatory.c
index f03b64d9cb51..7558139920f8 100644
--- a/arch/x86/purgatory/purgatory.c
+++ b/arch/x86/purgatory/purgatory.c
@@ -9,6 +9,8 @@
*/
#include <linux/bug.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
#include <crypto/sha2.h>
#include <asm/purgatory.h>
diff --git a/arch/x86/realmode/Makefile b/arch/x86/realmode/Makefile
index 6b1f3a4eeb44..a0b491ae2de8 100644
--- a/arch/x86/realmode/Makefile
+++ b/arch/x86/realmode/Makefile
@@ -10,7 +10,6 @@
# Sanitizer runtimes are unavailable and cannot be linked here.
KASAN_SANITIZE := n
KCSAN_SANITIZE := n
-OBJECT_FILES_NON_STANDARD := y
subdir- := rm
diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c
index 2ed81e581755..0575decb5e54 100644
--- a/arch/x86/um/sys_call_table_32.c
+++ b/arch/x86/um/sys_call_table_32.c
@@ -7,7 +7,6 @@
#include <linux/linkage.h>
#include <linux/sys.h>
#include <linux/cache.h>
-#include <asm/unistd.h>
#include <asm/syscall.h>
#define __NO_STUBS
@@ -26,20 +25,17 @@
#define old_mmap sys_old_mmap
-#define __SYSCALL_I386(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
+#define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, native)
+
+#define __SYSCALL(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
#include <asm/syscalls_32.h>
-#undef __SYSCALL_I386
-#define __SYSCALL_I386(nr, sym) [ nr ] = sym,
+#undef __SYSCALL
+#define __SYSCALL(nr, sym) sym,
extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
const sys_call_ptr_t sys_call_table[] ____cacheline_aligned = {
- /*
- * Smells like a compiler bug -- it doesn't work
- * when the & below is removed.
- */
- [0 ... __NR_syscall_max] = &sys_ni_syscall,
#include <asm/syscalls_32.h>
};
diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c
index 2e8544dafbb0..95725b5a41ac 100644
--- a/arch/x86/um/sys_call_table_64.c
+++ b/arch/x86/um/sys_call_table_64.c
@@ -7,7 +7,6 @@
#include <linux/linkage.h>
#include <linux/sys.h>
#include <linux/cache.h>
-#include <asm/unistd.h>
#include <asm/syscall.h>
#define __NO_STUBS
@@ -36,23 +35,15 @@
#define stub_execveat sys_execveat
#define stub_rt_sigreturn sys_rt_sigreturn
-#define __SYSCALL_X32(nr, sym)
-#define __SYSCALL_COMMON(nr, sym) __SYSCALL_64(nr, sym)
-
-#define __SYSCALL_64(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
+#define __SYSCALL(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
#include <asm/syscalls_64.h>
-#undef __SYSCALL_64
-#define __SYSCALL_64(nr, sym) [ nr ] = sym,
+#undef __SYSCALL
+#define __SYSCALL(nr, sym) sym,
extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
const sys_call_ptr_t sys_call_table[] ____cacheline_aligned = {
- /*
- * Smells like a compiler bug -- it doesn't work
- * when the & below is removed.
- */
- [0 ... __NR_syscall_max] = &sys_ni_syscall,
#include <asm/syscalls_64.h>
};
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index aa9f50fccc5d..c79bd0af2e8c 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -6,6 +6,7 @@
#include <linux/cpu.h>
#include <linux/kexec.h>
#include <linux/slab.h>
+#include <linux/panic_notifier.h>
#include <xen/xen.h>
#include <xen/features.h>
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index e87699aa2dc8..03149422dce2 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -592,8 +592,10 @@ DEFINE_IDTENTRY_RAW(xenpv_exc_debug)
DEFINE_IDTENTRY_RAW(exc_xen_unknown_trap)
{
/* This should never happen and there is no way to handle it. */
+ instrumentation_begin();
pr_err("Unknown trap in Xen PV mode.");
BUG();
+ instrumentation_end();
}
#ifdef CONFIG_X86_MCE