From abef0695f9665c3dbe9473f964c4da3c1f7c5d3f Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Wed, 13 Sep 2023 15:48:12 +0100
Subject: arm64/sve: Remove ZCR pseudo register from cpufeature code

For reasons that are not currently apparent during cpufeature enumeration
we maintain a pseudo register for ZCR which records the maximum supported
vector length using the value that would be written to ZCR_EL1.LEN to
configure it. This is not exposed to userspace and is not sufficient for
detecting unsupportable configurations, we need the more detailed checks in
vec_update_vq_map() for that since we can't cope with missing vector
lengths on late CPUs and KVM requires an exactly matching set of supported
vector lengths as EL1 can enumerate VLs directly with the hardware.

Remove the code, replacing the usage in sve_setup() with a query of the
vq_map.

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20230913-arm64-vec-len-cpufeature-v1-1-cc69b0600a8a@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/cpu.h    |  3 ---
 arch/arm64/include/asm/fpsimd.h |  1 -
 arch/arm64/kernel/cpufeature.c  | 30 ++++++------------------------
 arch/arm64/kernel/fpsimd.c      | 35 +++--------------------------------
 4 files changed, 9 insertions(+), 60 deletions(-)

diff --git a/arch/arm64/include/asm/cpu.h b/arch/arm64/include/asm/cpu.h
index e749838b9c5d..f85650dc561c 100644
--- a/arch/arm64/include/asm/cpu.h
+++ b/arch/arm64/include/asm/cpu.h
@@ -64,9 +64,6 @@ struct cpuinfo_arm64 {
 
 	struct cpuinfo_32bit	aarch32;
 
-	/* pseudo-ZCR for recording maximum ZCR_EL1 LEN value: */
-	u64		reg_zcr;
-
 	/* pseudo-SMCR for recording maximum SMCR_EL1 LEN value: */
 	u64		reg_smcr;
 };
diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h
index 8df46f186c64..9e5d3a0812b6 100644
--- a/arch/arm64/include/asm/fpsimd.h
+++ b/arch/arm64/include/asm/fpsimd.h
@@ -128,7 +128,6 @@ extern void sme_kernel_enable(const struct arm64_cpu_capabilities *__unused);
 extern void sme2_kernel_enable(const struct arm64_cpu_capabilities *__unused);
 extern void fa64_kernel_enable(const struct arm64_cpu_capabilities *__unused);
 
-extern u64 read_zcr_features(void);
 extern u64 read_smcr_features(void);
 
 /*
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 444a73c2e638..5783e9349563 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -611,12 +611,6 @@ static const struct arm64_ftr_bits ftr_id_dfr1[] = {
 	ARM64_FTR_END,
 };
 
-static const struct arm64_ftr_bits ftr_zcr[] = {
-	ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE,
-		ZCR_ELx_LEN_SHIFT, ZCR_ELx_LEN_WIDTH, 0),	/* LEN */
-	ARM64_FTR_END,
-};
-
 static const struct arm64_ftr_bits ftr_smcr[] = {
 	ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE,
 		SMCR_ELx_LEN_SHIFT, SMCR_ELx_LEN_WIDTH, 0),	/* LEN */
@@ -736,7 +730,6 @@ static const struct __ftr_reg_entry {
 	ARM64_FTR_REG(SYS_ID_AA64MMFR3_EL1, ftr_id_aa64mmfr3),
 
 	/* Op1 = 0, CRn = 1, CRm = 2 */
-	ARM64_FTR_REG(SYS_ZCR_EL1, ftr_zcr),
 	ARM64_FTR_REG(SYS_SMCR_EL1, ftr_smcr),
 
 	/* Op1 = 1, CRn = 0, CRm = 0 */
@@ -1040,8 +1033,7 @@ void __init init_cpu_features(struct cpuinfo_arm64 *info)
 
 	if (IS_ENABLED(CONFIG_ARM64_SVE) &&
 	    id_aa64pfr0_sve(read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1))) {
-		info->reg_zcr = read_zcr_features();
-		init_cpu_ftr_reg(SYS_ZCR_EL1, info->reg_zcr);
+		sve_kernel_enable(NULL);
 		vec_init_vq_map(ARM64_VEC_SVE);
 	}
 
@@ -1289,15 +1281,13 @@ void update_cpu_features(int cpu,
 	taint |= check_update_ftr_reg(SYS_ID_AA64SMFR0_EL1, cpu,
 				      info->reg_id_aa64smfr0, boot->reg_id_aa64smfr0);
 
+	/* Probe vector lengths */
 	if (IS_ENABLED(CONFIG_ARM64_SVE) &&
 	    id_aa64pfr0_sve(read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1))) {
-		info->reg_zcr = read_zcr_features();
-		taint |= check_update_ftr_reg(SYS_ZCR_EL1, cpu,
-					info->reg_zcr, boot->reg_zcr);
-
-		/* Probe vector lengths */
-		if (!system_capabilities_finalized())
+		if (!system_capabilities_finalized()) {
+			sve_kernel_enable(NULL);
 			vec_update_vq_map(ARM64_VEC_SVE);
+		}
 	}
 
 	if (IS_ENABLED(CONFIG_ARM64_SME) &&
@@ -3153,19 +3143,11 @@ static void verify_local_elf_hwcaps(void)
 
 static void verify_sve_features(void)
 {
-	u64 safe_zcr = read_sanitised_ftr_reg(SYS_ZCR_EL1);
-	u64 zcr = read_zcr_features();
-
-	unsigned int safe_len = safe_zcr & ZCR_ELx_LEN_MASK;
-	unsigned int len = zcr & ZCR_ELx_LEN_MASK;
-
-	if (len < safe_len || vec_verify_vq_map(ARM64_VEC_SVE)) {
+	if (vec_verify_vq_map(ARM64_VEC_SVE)) {
 		pr_crit("CPU%d: SVE: vector length support mismatch\n",
 			smp_processor_id());
 		cpu_die_early();
 	}
-
-	/* Add checks on other ZCR bits here if necessary */
 }
 
 static void verify_sme_features(void)
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index 91e44ac7150f..35bf40c2f972 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -1170,32 +1170,12 @@ void sve_kernel_enable(const struct arm64_cpu_capabilities *__always_unused p)
 	isb();
 }
 
-/*
- * Read the pseudo-ZCR used by cpufeatures to identify the supported SVE
- * vector length.
- *
- * Use only if SVE is present.
- * This function clobbers the SVE vector length.
- */
-u64 read_zcr_features(void)
-{
-	/*
-	 * Set the maximum possible VL, and write zeroes to all other
-	 * bits to see if they stick.
-	 */
-	sve_kernel_enable(NULL);
-	write_sysreg_s(ZCR_ELx_LEN_MASK, SYS_ZCR_EL1);
-
-	/* Return LEN value that would be written to get the maximum VL */
-	return sve_vq_from_vl(sve_get_vl()) - 1;
-}
-
 void __init sve_setup(void)
 {
 	struct vl_info *info = &vl_info[ARM64_VEC_SVE];
-	u64 zcr;
 	DECLARE_BITMAP(tmp_map, SVE_VQ_MAX);
 	unsigned long b;
+	int max_bit;
 
 	if (!system_supports_sve())
 		return;
@@ -1208,17 +1188,8 @@ void __init sve_setup(void)
 	if (WARN_ON(!test_bit(__vq_to_bit(SVE_VQ_MIN), info->vq_map)))
 		set_bit(__vq_to_bit(SVE_VQ_MIN), info->vq_map);
 
-	zcr = read_sanitised_ftr_reg(SYS_ZCR_EL1);
-	info->max_vl = sve_vl_from_vq((zcr & ZCR_ELx_LEN_MASK) + 1);
-
-	/*
-	 * Sanity-check that the max VL we determined through CPU features
-	 * corresponds properly to sve_vq_map.  If not, do our best:
-	 */
-	if (WARN_ON(info->max_vl != find_supported_vector_length(ARM64_VEC_SVE,
-								 info->max_vl)))
-		info->max_vl = find_supported_vector_length(ARM64_VEC_SVE,
-							    info->max_vl);
+	max_bit = find_first_bit(info->vq_map, SVE_VQ_MAX);
+	info->max_vl = sve_vl_from_vq(__bit_to_vq(max_bit));
 
 	/*
 	 * For the default VL, pick the maximum supported value <= 64.
-- 
cgit v1.2.3


From 391208485c3ad50ff4f2ddd9f7d4a1d77341acbb Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Wed, 13 Sep 2023 15:48:13 +0100
Subject: arm64/sve: Remove SMCR pseudo register from cpufeature code

For reasons that are not currently apparent during cpufeature enumeration
we maintain a pseudo register for SMCR which records the maximum supported
vector length using the value that would be written to SMCR_EL1.LEN to
configure it. This is not exposed to userspace and is not sufficient for
detecting unsupportable configurations, we need the more detailed checks in
vec_update_vq_map() for that since we can't cope with missing vector
lengths on late CPUs and KVM requires an exactly matching set of supported
vector lengths as EL1 can enumerate VLs directly with the hardware.

Remove the code, replacing the usage in sme_setup() with a query of the
vq_map.

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20230913-arm64-vec-len-cpufeature-v1-2-cc69b0600a8a@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/cpu.h   |  3 ---
 arch/arm64/kernel/cpufeature.c | 28 +++++-----------------------
 arch/arm64/kernel/fpsimd.c     | 40 +++++-----------------------------------
 3 files changed, 10 insertions(+), 61 deletions(-)

diff --git a/arch/arm64/include/asm/cpu.h b/arch/arm64/include/asm/cpu.h
index f85650dc561c..f3034099fd95 100644
--- a/arch/arm64/include/asm/cpu.h
+++ b/arch/arm64/include/asm/cpu.h
@@ -63,9 +63,6 @@ struct cpuinfo_arm64 {
 	u64		reg_id_aa64smfr0;
 
 	struct cpuinfo_32bit	aarch32;
-
-	/* pseudo-SMCR for recording maximum SMCR_EL1 LEN value: */
-	u64		reg_smcr;
 };
 
 DECLARE_PER_CPU(struct cpuinfo_arm64, cpu_data);
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 5783e9349563..ea36dc532b3b 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -611,12 +611,6 @@ static const struct arm64_ftr_bits ftr_id_dfr1[] = {
 	ARM64_FTR_END,
 };
 
-static const struct arm64_ftr_bits ftr_smcr[] = {
-	ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE,
-		SMCR_ELx_LEN_SHIFT, SMCR_ELx_LEN_WIDTH, 0),	/* LEN */
-	ARM64_FTR_END,
-};
-
 /*
  * Common ftr bits for a 32bit register with all hidden, strict
  * attributes, with 4bit feature fields and a default safe value of
@@ -729,9 +723,6 @@ static const struct __ftr_reg_entry {
 	ARM64_FTR_REG(SYS_ID_AA64MMFR2_EL1, ftr_id_aa64mmfr2),
 	ARM64_FTR_REG(SYS_ID_AA64MMFR3_EL1, ftr_id_aa64mmfr3),
 
-	/* Op1 = 0, CRn = 1, CRm = 2 */
-	ARM64_FTR_REG(SYS_SMCR_EL1, ftr_smcr),
-
 	/* Op1 = 1, CRn = 0, CRm = 0 */
 	ARM64_FTR_REG(SYS_GMID_EL1, ftr_gmid),
 
@@ -1039,14 +1030,14 @@ void __init init_cpu_features(struct cpuinfo_arm64 *info)
 
 	if (IS_ENABLED(CONFIG_ARM64_SME) &&
 	    id_aa64pfr1_sme(read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1))) {
-		info->reg_smcr = read_smcr_features();
+		sme_kernel_enable(NULL);
+
 		/*
 		 * We mask out SMPS since even if the hardware
 		 * supports priorities the kernel does not at present
 		 * and we block access to them.
 		 */
 		info->reg_smidr = read_cpuid(SMIDR_EL1) & ~SMIDR_EL1_SMPS;
-		init_cpu_ftr_reg(SYS_SMCR_EL1, info->reg_smcr);
 		vec_init_vq_map(ARM64_VEC_SME);
 	}
 
@@ -1292,15 +1283,14 @@ void update_cpu_features(int cpu,
 
 	if (IS_ENABLED(CONFIG_ARM64_SME) &&
 	    id_aa64pfr1_sme(read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1))) {
-		info->reg_smcr = read_smcr_features();
+		sme_kernel_enable(NULL);
+
 		/*
 		 * We mask out SMPS since even if the hardware
 		 * supports priorities the kernel does not at present
 		 * and we block access to them.
 		 */
 		info->reg_smidr = read_cpuid(SMIDR_EL1) & ~SMIDR_EL1_SMPS;
-		taint |= check_update_ftr_reg(SYS_SMCR_EL1, cpu,
-					info->reg_smcr, boot->reg_smcr);
 
 		/* Probe vector lengths */
 		if (!system_capabilities_finalized())
@@ -3152,19 +3142,11 @@ static void verify_sve_features(void)
 
 static void verify_sme_features(void)
 {
-	u64 safe_smcr = read_sanitised_ftr_reg(SYS_SMCR_EL1);
-	u64 smcr = read_smcr_features();
-
-	unsigned int safe_len = safe_smcr & SMCR_ELx_LEN_MASK;
-	unsigned int len = smcr & SMCR_ELx_LEN_MASK;
-
-	if (len < safe_len || vec_verify_vq_map(ARM64_VEC_SME)) {
+	if (vec_verify_vq_map(ARM64_VEC_SME)) {
 		pr_crit("CPU%d: SME: vector length support mismatch\n",
 			smp_processor_id());
 		cpu_die_early();
 	}
-
-	/* Add checks on other SMCR bits here if necessary */
 }
 
 static void verify_hyp_capabilities(void)
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index 35bf40c2f972..04c801001767 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -1304,32 +1304,10 @@ void fa64_kernel_enable(const struct arm64_cpu_capabilities *__always_unused p)
 		       SYS_SMCR_EL1);
 }
 
-/*
- * Read the pseudo-SMCR used by cpufeatures to identify the supported
- * vector length.
- *
- * Use only if SME is present.
- * This function clobbers the SME vector length.
- */
-u64 read_smcr_features(void)
-{
-	sme_kernel_enable(NULL);
-
-	/*
-	 * Set the maximum possible VL.
-	 */
-	write_sysreg_s(read_sysreg_s(SYS_SMCR_EL1) | SMCR_ELx_LEN_MASK,
-		       SYS_SMCR_EL1);
-
-	/* Return LEN value that would be written to get the maximum VL */
-	return sve_vq_from_vl(sme_get_vl()) - 1;
-}
-
 void __init sme_setup(void)
 {
 	struct vl_info *info = &vl_info[ARM64_VEC_SME];
-	u64 smcr;
-	int min_bit;
+	int min_bit, max_bit;
 
 	if (!system_supports_sme())
 		return;
@@ -1338,24 +1316,16 @@ void __init sme_setup(void)
 	 * SME doesn't require any particular vector length be
 	 * supported but it does require at least one.  We should have
 	 * disabled the feature entirely while bringing up CPUs but
-	 * let's double check here.
+	 * let's double check here.  The bitmap is SVE_VQ_MAP sized for
+	 * sharing with SVE.
 	 */
 	WARN_ON(bitmap_empty(info->vq_map, SVE_VQ_MAX));
 
 	min_bit = find_last_bit(info->vq_map, SVE_VQ_MAX);
 	info->min_vl = sve_vl_from_vq(__bit_to_vq(min_bit));
 
-	smcr = read_sanitised_ftr_reg(SYS_SMCR_EL1);
-	info->max_vl = sve_vl_from_vq((smcr & SMCR_ELx_LEN_MASK) + 1);
-
-	/*
-	 * Sanity-check that the max VL we determined through CPU features
-	 * corresponds properly to sme_vq_map.  If not, do our best:
-	 */
-	if (WARN_ON(info->max_vl != find_supported_vector_length(ARM64_VEC_SME,
-								 info->max_vl)))
-		info->max_vl = find_supported_vector_length(ARM64_VEC_SME,
-							    info->max_vl);
+	max_bit = find_first_bit(info->vq_map, SVE_VQ_MAX);
+	info->max_vl = sve_vl_from_vq(__bit_to_vq(max_bit));
 
 	WARN_ON(info->min_vl > info->max_vl);
 
-- 
cgit v1.2.3


From a02026bf9da13cd44fb444857d5aebc934e1af5a Mon Sep 17 00:00:00 2001
From: Douglas Anderson <dianders@chromium.org>
Date: Wed, 6 Sep 2023 09:02:56 -0700
Subject: irqchip/gic-v3: Enable support for SGIs to act as NMIs

As of commit 6abbd6988971 ("irqchip/gic, gic-v3: Make SGIs use
handle_percpu_devid_irq()") SGIs are treated the same as PPIs/EPPIs
and use handle_percpu_devid_irq() by default. Unfortunately,
handle_percpu_devid_irq() isn't NMI safe, and so to run in an NMI
context those should use handle_percpu_devid_fasteoi_nmi().

In order to accomplish this, we just have to make room for SGIs in the
array of refcounts that keeps track of which interrupts are set as
NMI. We also rename the array and create a new indexing scheme that
accounts for SGIs.

Also, enable NMI support prior to gic_smp_init() as allocation of SGIs
as IRQs/NMIs happen as part of this routine.

Co-developed-by: Sumit Garg <sumit.garg@linaro.org>
Signed-off-by: Sumit Garg <sumit.garg@linaro.org>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Chen-Yu Tsai <wenst@chromium.org>
Signed-off-by: Douglas Anderson <dianders@chromium.org>
Acked-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20230906090246.v13.1.I1223c11c88937bd0cbd9b086d4ef216985797302@changeid
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 drivers/irqchip/irq-gic-v3.c | 59 ++++++++++++++++++++++++++++++--------------
 1 file changed, 41 insertions(+), 18 deletions(-)

diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c
index eedfa8e9f077..787ccc880b22 100644
--- a/drivers/irqchip/irq-gic-v3.c
+++ b/drivers/irqchip/irq-gic-v3.c
@@ -78,6 +78,13 @@ static DEFINE_STATIC_KEY_TRUE(supports_deactivate_key);
 #define GIC_LINE_NR	min(GICD_TYPER_SPIS(gic_data.rdists.gicd_typer), 1020U)
 #define GIC_ESPI_NR	GICD_TYPER_ESPIS(gic_data.rdists.gicd_typer)
 
+/*
+ * There are 16 SGIs, though we only actually use 8 in Linux. The other 8 SGIs
+ * are potentially stolen by the secure side. Some code, especially code dealing
+ * with hwirq IDs, is simplified by accounting for all 16.
+ */
+#define SGI_NR		16
+
 /*
  * The behaviours of RPR and PMR registers differ depending on the value of
  * SCR_EL3.FIQ, and the behaviour of non-secure priority registers of the
@@ -125,8 +132,8 @@ EXPORT_SYMBOL(gic_nonsecure_priorities);
 		__priority;						\
 	})
 
-/* ppi_nmi_refs[n] == number of cpus having ppi[n + 16] set as NMI */
-static refcount_t *ppi_nmi_refs;
+/* rdist_nmi_refs[n] == number of cpus having the rdist interrupt n set as NMI */
+static refcount_t *rdist_nmi_refs;
 
 static struct gic_kvm_info gic_v3_kvm_info __initdata;
 static DEFINE_PER_CPU(bool, has_rss);
@@ -519,9 +526,22 @@ static u32 __gic_get_ppi_index(irq_hw_number_t hwirq)
 	}
 }
 
-static u32 gic_get_ppi_index(struct irq_data *d)
+static u32 __gic_get_rdist_index(irq_hw_number_t hwirq)
+{
+	switch (__get_intid_range(hwirq)) {
+	case SGI_RANGE:
+	case PPI_RANGE:
+		return hwirq;
+	case EPPI_RANGE:
+		return hwirq - EPPI_BASE_INTID + 32;
+	default:
+		unreachable();
+	}
+}
+
+static u32 gic_get_rdist_index(struct irq_data *d)
 {
-	return __gic_get_ppi_index(d->hwirq);
+	return __gic_get_rdist_index(d->hwirq);
 }
 
 static int gic_irq_nmi_setup(struct irq_data *d)
@@ -545,11 +565,14 @@ static int gic_irq_nmi_setup(struct irq_data *d)
 
 	/* desc lock should already be held */
 	if (gic_irq_in_rdist(d)) {
-		u32 idx = gic_get_ppi_index(d);
+		u32 idx = gic_get_rdist_index(d);
 
-		/* Setting up PPI as NMI, only switch handler for first NMI */
-		if (!refcount_inc_not_zero(&ppi_nmi_refs[idx])) {
-			refcount_set(&ppi_nmi_refs[idx], 1);
+		/*
+		 * Setting up a percpu interrupt as NMI, only switch handler
+		 * for first NMI
+		 */
+		if (!refcount_inc_not_zero(&rdist_nmi_refs[idx])) {
+			refcount_set(&rdist_nmi_refs[idx], 1);
 			desc->handle_irq = handle_percpu_devid_fasteoi_nmi;
 		}
 	} else {
@@ -582,10 +605,10 @@ static void gic_irq_nmi_teardown(struct irq_data *d)
 
 	/* desc lock should already be held */
 	if (gic_irq_in_rdist(d)) {
-		u32 idx = gic_get_ppi_index(d);
+		u32 idx = gic_get_rdist_index(d);
 
 		/* Tearing down NMI, only switch handler for last NMI */
-		if (refcount_dec_and_test(&ppi_nmi_refs[idx]))
+		if (refcount_dec_and_test(&rdist_nmi_refs[idx]))
 			desc->handle_irq = handle_percpu_devid_irq;
 	} else {
 		desc->handle_irq = handle_fasteoi_irq;
@@ -1279,10 +1302,10 @@ static void gic_cpu_init(void)
 	rbase = gic_data_rdist_sgi_base();
 
 	/* Configure SGIs/PPIs as non-secure Group-1 */
-	for (i = 0; i < gic_data.ppi_nr + 16; i += 32)
+	for (i = 0; i < gic_data.ppi_nr + SGI_NR; i += 32)
 		writel_relaxed(~0, rbase + GICR_IGROUPR0 + i / 8);
 
-	gic_cpu_config(rbase, gic_data.ppi_nr + 16, gic_redist_wait_for_rwp);
+	gic_cpu_config(rbase, gic_data.ppi_nr + SGI_NR, gic_redist_wait_for_rwp);
 
 	/* initialise system registers */
 	gic_cpu_sys_reg_init();
@@ -1939,12 +1962,13 @@ static void gic_enable_nmi_support(void)
 		return;
 	}
 
-	ppi_nmi_refs = kcalloc(gic_data.ppi_nr, sizeof(*ppi_nmi_refs), GFP_KERNEL);
-	if (!ppi_nmi_refs)
+	rdist_nmi_refs = kcalloc(gic_data.ppi_nr + SGI_NR,
+				 sizeof(*rdist_nmi_refs), GFP_KERNEL);
+	if (!rdist_nmi_refs)
 		return;
 
-	for (i = 0; i < gic_data.ppi_nr; i++)
-		refcount_set(&ppi_nmi_refs[i], 0);
+	for (i = 0; i < gic_data.ppi_nr + SGI_NR; i++)
+		refcount_set(&rdist_nmi_refs[i], 0);
 
 	pr_info("Pseudo-NMIs enabled using %s ICC_PMR_EL1 synchronisation\n",
 		gic_has_relaxed_pmr_sync() ? "relaxed" : "forced");
@@ -2061,6 +2085,7 @@ static int __init gic_init_bases(phys_addr_t dist_phys_base,
 
 	gic_dist_init();
 	gic_cpu_init();
+	gic_enable_nmi_support();
 	gic_smp_init();
 	gic_cpu_pm_init();
 
@@ -2073,8 +2098,6 @@ static int __init gic_init_bases(phys_addr_t dist_phys_base,
 			gicv2m_init(handle, gic_data.domain);
 	}
 
-	gic_enable_nmi_support();
-
 	return 0;
 
 out_free:
-- 
cgit v1.2.3


From d0c14a7d36f035aeae1bdd6f4afc6488400ed5cf Mon Sep 17 00:00:00 2001
From: Douglas Anderson <dianders@chromium.org>
Date: Wed, 6 Sep 2023 09:02:57 -0700
Subject: arm64: idle: Tag the arm64 idle functions as __cpuidle

As per the (somewhat recent) comment before the definition of
`__cpuidle`, the tag is like `noinstr` but also marks a function so it
can be identified by cpu_in_idle(). Let's add these markings to arm64
cpuidle functions

With this change we get useful backtraces like:

  NMI backtrace for cpu N skipped: idling at cpu_do_idle+0x94/0x98

instead of useless backtraces when dumping all processors using
nmi_cpu_backtrace().

NOTE: this patch won't make cpu_in_idle() work perfectly for arm64,
but it doesn't hurt and does catch some cases. Specifically an example
that wasn't caught in my testing looked like this:

 gic_cpu_sys_reg_init+0x1f8/0x314
 gic_cpu_pm_notifier+0x40/0x78
 raw_notifier_call_chain+0x5c/0x134
 cpu_pm_notify+0x38/0x64
 cpu_pm_exit+0x20/0x2c
 psci_enter_idle_state+0x48/0x70
 cpuidle_enter_state+0xb8/0x260
 cpuidle_enter+0x44/0x5c
 do_idle+0x188/0x30c

Acked-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Stephen Boyd <swboyd@chromium.org>
Acked-by: Sumit Garg <sumit.garg@linaro.org>
Tested-by: Chen-Yu Tsai <wenst@chromium.org>
Signed-off-by: Douglas Anderson <dianders@chromium.org>
Link: https://lore.kernel.org/r/20230906090246.v13.2.I4baba13e220bdd24d11400c67f137c35f07f82c7@changeid
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/idle.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/kernel/idle.c b/arch/arm64/kernel/idle.c
index c1125753fe9b..05cfb347ec26 100644
--- a/arch/arm64/kernel/idle.c
+++ b/arch/arm64/kernel/idle.c
@@ -20,7 +20,7 @@
  *	ensure that interrupts are not masked at the PMR (because the core will
  *	not wake up if we block the wake up signal in the interrupt controller).
  */
-void noinstr cpu_do_idle(void)
+void __cpuidle cpu_do_idle(void)
 {
 	struct arm_cpuidle_irq_context context;
 
@@ -35,7 +35,7 @@ void noinstr cpu_do_idle(void)
 /*
  * This is our default idle handler.
  */
-void noinstr arch_cpu_idle(void)
+void __cpuidle arch_cpu_idle(void)
 {
 	/*
 	 * This should do all the clock switching and wait for interrupt
-- 
cgit v1.2.3


From 2b2d0a7a96ab36ed6d963e29b6211b184ef81596 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Wed, 6 Sep 2023 09:02:58 -0700
Subject: arm64: smp: Remove dedicated wakeup IPI

To enable NMI backtrace and KGDB's NMI cpu roundup, we need to free up
at least one dedicated IPI.

On arm64 the IPI_WAKEUP IPI is only used for the ACPI parking protocol,
which itself is only used on some very early ARMv8 systems which
couldn't implement PSCI.

Remove the IPI_WAKEUP IPI, and rely on the IPI_RESCHEDULE IPI to wake
CPUs from the parked state. This will cause a tiny amonut of redundant
work to check the thread flags, but this is miniscule in relation to the
cost of taking and handling the IPI in the first place. We can safely
handle redundant IPI_RESCHEDULE IPIs, so there should be no functional
impact as a result of this change.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Stephen Boyd <swboyd@chromium.org>
Reviewed-by: Sumit Garg <sumit.garg@linaro.org>
Tested-by: Chen-Yu Tsai <wenst@chromium.org>
Signed-off-by: Douglas Anderson <dianders@chromium.org>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20230906090246.v13.3.I7209db47ef8ec151d3de61f59005bbc59fe8f113@changeid
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/smp.h              |  4 ++--
 arch/arm64/kernel/acpi_parking_protocol.c |  2 +-
 arch/arm64/kernel/smp.c                   | 28 +++++++++++-----------------
 3 files changed, 14 insertions(+), 20 deletions(-)

diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h
index 9b31e6d0da17..efb13112b408 100644
--- a/arch/arm64/include/asm/smp.h
+++ b/arch/arm64/include/asm/smp.h
@@ -89,9 +89,9 @@ extern void arch_send_call_function_single_ipi(int cpu);
 extern void arch_send_call_function_ipi_mask(const struct cpumask *mask);
 
 #ifdef CONFIG_ARM64_ACPI_PARKING_PROTOCOL
-extern void arch_send_wakeup_ipi_mask(const struct cpumask *mask);
+extern void arch_send_wakeup_ipi(unsigned int cpu);
 #else
-static inline void arch_send_wakeup_ipi_mask(const struct cpumask *mask)
+static inline void arch_send_wakeup_ipi(unsigned int cpu)
 {
 	BUILD_BUG();
 }
diff --git a/arch/arm64/kernel/acpi_parking_protocol.c b/arch/arm64/kernel/acpi_parking_protocol.c
index b1990e38aed0..e1be29e608b7 100644
--- a/arch/arm64/kernel/acpi_parking_protocol.c
+++ b/arch/arm64/kernel/acpi_parking_protocol.c
@@ -103,7 +103,7 @@ static int acpi_parking_protocol_cpu_boot(unsigned int cpu)
 		       &mailbox->entry_point);
 	writel_relaxed(cpu_entry->gic_cpu_id, &mailbox->cpu_id);
 
-	arch_send_wakeup_ipi_mask(cpumask_of(cpu));
+	arch_send_wakeup_ipi(cpu);
 
 	return 0;
 }
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 960b98b43506..a5848f1ef817 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -72,7 +72,6 @@ enum ipi_msg_type {
 	IPI_CPU_CRASH_STOP,
 	IPI_TIMER,
 	IPI_IRQ_WORK,
-	IPI_WAKEUP,
 	NR_IPI
 };
 
@@ -764,7 +763,6 @@ static const char *ipi_types[NR_IPI] __tracepoint_string = {
 	[IPI_CPU_CRASH_STOP]	= "CPU stop (for crash dump) interrupts",
 	[IPI_TIMER]		= "Timer broadcast interrupts",
 	[IPI_IRQ_WORK]		= "IRQ work interrupts",
-	[IPI_WAKEUP]		= "CPU wake-up interrupts",
 };
 
 static void smp_cross_call(const struct cpumask *target, unsigned int ipinr);
@@ -797,13 +795,6 @@ void arch_send_call_function_single_ipi(int cpu)
 	smp_cross_call(cpumask_of(cpu), IPI_CALL_FUNC);
 }
 
-#ifdef CONFIG_ARM64_ACPI_PARKING_PROTOCOL
-void arch_send_wakeup_ipi_mask(const struct cpumask *mask)
-{
-	smp_cross_call(mask, IPI_WAKEUP);
-}
-#endif
-
 #ifdef CONFIG_IRQ_WORK
 void arch_irq_work_raise(void)
 {
@@ -897,14 +888,6 @@ static void do_handle_IPI(int ipinr)
 		break;
 #endif
 
-#ifdef CONFIG_ARM64_ACPI_PARKING_PROTOCOL
-	case IPI_WAKEUP:
-		WARN_ONCE(!acpi_parking_protocol_valid(cpu),
-			  "CPU%u: Wake-up IPI outside the ACPI parking protocol\n",
-			  cpu);
-		break;
-#endif
-
 	default:
 		pr_crit("CPU%u: Unknown IPI message 0x%x\n", cpu, ipinr);
 		break;
@@ -979,6 +962,17 @@ void arch_smp_send_reschedule(int cpu)
 	smp_cross_call(cpumask_of(cpu), IPI_RESCHEDULE);
 }
 
+#ifdef CONFIG_ARM64_ACPI_PARKING_PROTOCOL
+void arch_send_wakeup_ipi(unsigned int cpu)
+{
+	/*
+	 * We use a scheduler IPI to wake the CPU as this avoids the need for a
+	 * dedicated IPI and we can safely handle spurious scheduler IPIs.
+	 */
+	arch_smp_send_reschedule(cpu);
+}
+#endif
+
 #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
 void tick_broadcast(const struct cpumask *mask)
 {
-- 
cgit v1.2.3


From 331a1b3a836c0f38165dcec168c0a03b93cf0c17 Mon Sep 17 00:00:00 2001
From: Douglas Anderson <dianders@chromium.org>
Date: Wed, 6 Sep 2023 09:02:59 -0700
Subject: arm64: smp: Add arch support for backtrace using pseudo-NMI

Enable arch_trigger_cpumask_backtrace() support on arm64. This enables
things much like they are enabled on arm32 (including some of the
funky logic around NR_IPI, nr_ipi, and MAX_IPI) but with the
difference that, unlike arm32, we'll try to enable the backtrace to
use pseudo-NMI.

NOTE: this patch is a squash of the little bit of code adding the
ability to mark an IPI to try to use pseudo-NMI plus the little bit of
code to hook things up for kgdb. This approach was decided upon in the
discussion of v9 [1].

This patch depends on commit 8d539b84f1e3 ("nmi_backtrace: allow
excluding an arbitrary CPU") since that commit changed the prototype
of arch_trigger_cpumask_backtrace(), which this patch implements.

[1] https://lore.kernel.org/r/ZORY51mF4alI41G1@FVFF77S0Q05N

Co-developed-by: Sumit Garg <sumit.garg@linaro.org>
Signed-off-by: Sumit Garg <sumit.garg@linaro.org>
Co-developed-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Stephen Boyd <swboyd@chromium.org>
Reviewed-by: Misono Tomohiro <misono.tomohiro@fujitsu.com>
Tested-by: Chen-Yu Tsai <wenst@chromium.org>
Signed-off-by: Douglas Anderson <dianders@chromium.org>
Link: https://lore.kernel.org/r/20230906090246.v13.4.Ie6c132b96ebbbcddbf6954b9469ed40a6960343c@changeid
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/irq.h |  3 ++
 arch/arm64/kernel/smp.c      | 86 ++++++++++++++++++++++++++++++++++++++------
 2 files changed, 78 insertions(+), 11 deletions(-)

diff --git a/arch/arm64/include/asm/irq.h b/arch/arm64/include/asm/irq.h
index fac08e18bcd5..50ce8b697ff3 100644
--- a/arch/arm64/include/asm/irq.h
+++ b/arch/arm64/include/asm/irq.h
@@ -6,6 +6,9 @@
 
 #include <asm-generic/irq.h>
 
+void arch_trigger_cpumask_backtrace(const cpumask_t *mask, int exclude_cpu);
+#define arch_trigger_cpumask_backtrace arch_trigger_cpumask_backtrace
+
 struct pt_regs;
 
 int set_handle_irq(void (*handle_irq)(struct pt_regs *));
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index a5848f1ef817..28c904ca499a 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -33,6 +33,7 @@
 #include <linux/kernel_stat.h>
 #include <linux/kexec.h>
 #include <linux/kvm_host.h>
+#include <linux/nmi.h>
 
 #include <asm/alternative.h>
 #include <asm/atomic.h>
@@ -72,12 +73,18 @@ enum ipi_msg_type {
 	IPI_CPU_CRASH_STOP,
 	IPI_TIMER,
 	IPI_IRQ_WORK,
-	NR_IPI
+	NR_IPI,
+	/*
+	 * Any enum >= NR_IPI and < MAX_IPI is special and not tracable
+	 * with trace_ipi_*
+	 */
+	IPI_CPU_BACKTRACE = NR_IPI,
+	MAX_IPI
 };
 
 static int ipi_irq_base __read_mostly;
 static int nr_ipi __read_mostly = NR_IPI;
-static struct irq_desc *ipi_desc[NR_IPI] __read_mostly;
+static struct irq_desc *ipi_desc[MAX_IPI] __read_mostly;
 
 static void ipi_setup(int cpu);
 
@@ -845,6 +852,22 @@ static void __noreturn ipi_cpu_crash_stop(unsigned int cpu, struct pt_regs *regs
 #endif
 }
 
+static void arm64_backtrace_ipi(cpumask_t *mask)
+{
+	__ipi_send_mask(ipi_desc[IPI_CPU_BACKTRACE], mask);
+}
+
+void arch_trigger_cpumask_backtrace(const cpumask_t *mask, int exclude_cpu)
+{
+	/*
+	 * NOTE: though nmi_trigger_cpumask_backtrace() has "nmi_" in the name,
+	 * nothing about it truly needs to be implemented using an NMI, it's
+	 * just that it's _allowed_ to work with NMIs. If ipi_should_be_nmi()
+	 * returned false our backtrace attempt will just use a regular IPI.
+	 */
+	nmi_trigger_cpumask_backtrace(mask, exclude_cpu, arm64_backtrace_ipi);
+}
+
 /*
  * Main handler for inter-processor interrupts
  */
@@ -888,6 +911,14 @@ static void do_handle_IPI(int ipinr)
 		break;
 #endif
 
+	case IPI_CPU_BACKTRACE:
+		/*
+		 * NOTE: in some cases this _won't_ be NMI context. See the
+		 * comment in arch_trigger_cpumask_backtrace().
+		 */
+		nmi_cpu_backtrace(get_irq_regs());
+		break;
+
 	default:
 		pr_crit("CPU%u: Unknown IPI message 0x%x\n", cpu, ipinr);
 		break;
@@ -909,6 +940,19 @@ static void smp_cross_call(const struct cpumask *target, unsigned int ipinr)
 	__ipi_send_mask(ipi_desc[ipinr], target);
 }
 
+static bool ipi_should_be_nmi(enum ipi_msg_type ipi)
+{
+	if (!system_uses_irq_prio_masking())
+		return false;
+
+	switch (ipi) {
+	case IPI_CPU_BACKTRACE:
+		return true;
+	default:
+		return false;
+	}
+}
+
 static void ipi_setup(int cpu)
 {
 	int i;
@@ -916,8 +960,14 @@ static void ipi_setup(int cpu)
 	if (WARN_ON_ONCE(!ipi_irq_base))
 		return;
 
-	for (i = 0; i < nr_ipi; i++)
-		enable_percpu_irq(ipi_irq_base + i, 0);
+	for (i = 0; i < nr_ipi; i++) {
+		if (ipi_should_be_nmi(i)) {
+			prepare_percpu_nmi(ipi_irq_base + i);
+			enable_percpu_nmi(ipi_irq_base + i, 0);
+		} else {
+			enable_percpu_irq(ipi_irq_base + i, 0);
+		}
+	}
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -928,8 +978,14 @@ static void ipi_teardown(int cpu)
 	if (WARN_ON_ONCE(!ipi_irq_base))
 		return;
 
-	for (i = 0; i < nr_ipi; i++)
-		disable_percpu_irq(ipi_irq_base + i);
+	for (i = 0; i < nr_ipi; i++) {
+		if (ipi_should_be_nmi(i)) {
+			disable_percpu_nmi(ipi_irq_base + i);
+			teardown_percpu_nmi(ipi_irq_base + i);
+		} else {
+			disable_percpu_irq(ipi_irq_base + i);
+		}
+	}
 }
 #endif
 
@@ -937,15 +993,23 @@ void __init set_smp_ipi_range(int ipi_base, int n)
 {
 	int i;
 
-	WARN_ON(n < NR_IPI);
-	nr_ipi = min(n, NR_IPI);
+	WARN_ON(n < MAX_IPI);
+	nr_ipi = min(n, MAX_IPI);
 
 	for (i = 0; i < nr_ipi; i++) {
 		int err;
 
-		err = request_percpu_irq(ipi_base + i, ipi_handler,
-					 "IPI", &cpu_number);
-		WARN_ON(err);
+		if (ipi_should_be_nmi(i)) {
+			err = request_percpu_nmi(ipi_base + i, ipi_handler,
+						 "IPI", &cpu_number);
+			WARN(err, "Could not request IPI %d as NMI, err=%d\n",
+			     i, err);
+		} else {
+			err = request_percpu_irq(ipi_base + i, ipi_handler,
+						 "IPI", &cpu_number);
+			WARN(err, "Could not request IPI %d as IRQ, err=%d\n",
+			     i, err);
+		}
 
 		ipi_desc[i] = irq_to_desc(ipi_base + i);
 		irq_set_status_flags(ipi_base + i, IRQ_HIDDEN);
-- 
cgit v1.2.3


From d7402513c935ad87413b01aa51a7ada0ad2f0163 Mon Sep 17 00:00:00 2001
From: Douglas Anderson <dianders@chromium.org>
Date: Wed, 6 Sep 2023 09:03:00 -0700
Subject: arm64: smp: IPI_CPU_STOP and IPI_CPU_CRASH_STOP should try for NMI

There's no reason why IPI_CPU_STOP and IPI_CPU_CRASH_STOP can't be
handled as NMI. They are very simple and everything in them is
NMI-safe. Mark them as things to use NMI for if NMI is available.

Suggested-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Stephen Boyd <swboyd@chromium.org>
Reviewed-by: Misono Tomohiro <misono.tomohiro@fujitsu.com>
Reviewed-by: Sumit Garg <sumit.garg@linaro.org>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Chen-Yu Tsai <wenst@chromium.org>
Signed-off-by: Douglas Anderson <dianders@chromium.org>
Link: https://lore.kernel.org/r/20230906090246.v13.5.Ifadbfd45b22c52edcb499034dd4783d096343260@changeid
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/smp.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 28c904ca499a..800c59cf9b64 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -946,6 +946,8 @@ static bool ipi_should_be_nmi(enum ipi_msg_type ipi)
 		return false;
 
 	switch (ipi) {
+	case IPI_CPU_STOP:
+	case IPI_CPU_CRASH_STOP:
 	case IPI_CPU_BACKTRACE:
 		return true;
 	default:
-- 
cgit v1.2.3


From 2f5cd0c7ffde0ec7779f27e5c4ed30e131b66393 Mon Sep 17 00:00:00 2001
From: Douglas Anderson <dianders@chromium.org>
Date: Wed, 6 Sep 2023 09:03:01 -0700
Subject: arm64: kgdb: Implement kgdb_roundup_cpus() to enable pseudo-NMI
 roundup

Up until now we've been using the generic (weak) implementation for
kgdb_roundup_cpus() when using kgdb on arm64. Let's move to a custom
one. The advantage here is that, when pseudo-NMI is enabled on a
device, we'll be able to round up CPUs using pseudo-NMI. This allows
us to debug CPUs that are stuck with interrupts disabled. If
pseudo-NMIs are not enabled then we'll fallback to just using an IPI,
which is still slightly better than the generic implementation since
it avoids the potential situation described in the generic
kgdb_call_nmi_hook().

Co-developed-by: Sumit Garg <sumit.garg@linaro.org>
Signed-off-by: Sumit Garg <sumit.garg@linaro.org>
Reviewed-by: Daniel Thompson <daniel.thompson@linaro.org>
Reviewed-by: Stephen Boyd <swboyd@chromium.org>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Chen-Yu Tsai <wenst@chromium.org>
Signed-off-by: Douglas Anderson <dianders@chromium.org>
Link: https://lore.kernel.org/r/20230906090246.v13.6.I2ef26d1b3bfbed2d10a281942b0da7d9854de05e@changeid
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/smp.c | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 800c59cf9b64..1a53e57c81d0 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -32,6 +32,7 @@
 #include <linux/irq_work.h>
 #include <linux/kernel_stat.h>
 #include <linux/kexec.h>
+#include <linux/kgdb.h>
 #include <linux/kvm_host.h>
 #include <linux/nmi.h>
 
@@ -79,6 +80,7 @@ enum ipi_msg_type {
 	 * with trace_ipi_*
 	 */
 	IPI_CPU_BACKTRACE = NR_IPI,
+	IPI_KGDB_ROUNDUP,
 	MAX_IPI
 };
 
@@ -868,6 +870,22 @@ void arch_trigger_cpumask_backtrace(const cpumask_t *mask, int exclude_cpu)
 	nmi_trigger_cpumask_backtrace(mask, exclude_cpu, arm64_backtrace_ipi);
 }
 
+#ifdef CONFIG_KGDB
+void kgdb_roundup_cpus(void)
+{
+	int this_cpu = raw_smp_processor_id();
+	int cpu;
+
+	for_each_online_cpu(cpu) {
+		/* No need to roundup ourselves */
+		if (cpu == this_cpu)
+			continue;
+
+		__ipi_send_single(ipi_desc[IPI_KGDB_ROUNDUP], cpu);
+	}
+}
+#endif
+
 /*
  * Main handler for inter-processor interrupts
  */
@@ -919,6 +937,10 @@ static void do_handle_IPI(int ipinr)
 		nmi_cpu_backtrace(get_irq_regs());
 		break;
 
+	case IPI_KGDB_ROUNDUP:
+		kgdb_nmicallback(cpu, get_irq_regs());
+		break;
+
 	default:
 		pr_crit("CPU%u: Unknown IPI message 0x%x\n", cpu, ipinr);
 		break;
@@ -949,6 +971,7 @@ static bool ipi_should_be_nmi(enum ipi_msg_type ipi)
 	case IPI_CPU_STOP:
 	case IPI_CPU_CRASH_STOP:
 	case IPI_CPU_BACKTRACE:
+	case IPI_KGDB_ROUNDUP:
 		return true;
 	default:
 		return false;
-- 
cgit v1.2.3


From 62817d5ba25d2361488184f85d628b5b540e254b Mon Sep 17 00:00:00 2001
From: Douglas Anderson <dianders@chromium.org>
Date: Wed, 6 Sep 2023 09:03:02 -0700
Subject: arm64: smp: Mark IPI globals as __ro_after_init

Mark the three IPI-related globals in smp.c as "__ro_after_init" since
they are only ever set in set_smp_ipi_range(), which is marked
"__init". This is a better and more secure marking than the old
"__read_mostly".

Suggested-by: Stephen Boyd <swboyd@chromium.org>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Tested-by: Chen-Yu Tsai <wenst@chromium.org>
Signed-off-by: Douglas Anderson <dianders@chromium.org>
Reviewed-by: Stephen Boyd <swboyd@chromium.org>
Link: https://lore.kernel.org/r/20230906090246.v13.7.I625d393afd71e1766ef73d3bfaac0b347a4afd19@changeid
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/smp.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 1a53e57c81d0..814d9aa93b21 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -84,9 +84,9 @@ enum ipi_msg_type {
 	MAX_IPI
 };
 
-static int ipi_irq_base __read_mostly;
-static int nr_ipi __read_mostly = NR_IPI;
-static struct irq_desc *ipi_desc[MAX_IPI] __read_mostly;
+static int ipi_irq_base __ro_after_init;
+static int nr_ipi __ro_after_init = NR_IPI;
+static struct irq_desc *ipi_desc[MAX_IPI] __ro_after_init;
 
 static void ipi_setup(int cpu);
 
-- 
cgit v1.2.3


From 11a7a42ea76e61a8d2f7374ecdd95460dec4413f Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Fri, 22 Sep 2023 14:42:55 +0100
Subject: kselftest/arm64: Validate SVCR in streaming SVE stress test

In the ZA and ZT test programs we explicitly validate that PSTATE.ZA is as
expected on each loop but we do not do the equivalent for our streaming
SVE test, add a check that we are still in streaming mode on every loop
in case that goes wrong.

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20230922-arm64-ssve-validate-svcr-v1-1-f518960eaeda@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/fp/sve-test.S | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/tools/testing/selftests/arm64/fp/sve-test.S b/tools/testing/selftests/arm64/fp/sve-test.S
index 4328895dfc87..547d077e3517 100644
--- a/tools/testing/selftests/arm64/fp/sve-test.S
+++ b/tools/testing/selftests/arm64/fp/sve-test.S
@@ -473,6 +473,13 @@ function _start
 //	mov	x8, #__NR_sched_yield	// Encourage preemption
 //	svc	#0
 
+#ifdef SSVE
+	mrs	x0, S3_3_C4_C2_2	// SVCR should have ZA=0,SM=1
+	and	x1, x0, #3
+	cmp	x1, #1
+	b.ne	svcr_barf
+#endif
+
 	mov	x21, #0
 0:	mov	x0, x21
 	bl	check_zreg
@@ -553,3 +560,15 @@ function vl_barf
 	mov	x1, #1
 	svc	#0
 endfunction
+
+function svcr_barf
+	mov	x10, x0
+
+	puts	"Bad SVCR: "
+	mov	x0, x10
+	bl	putdecn
+
+	mov	x8, #__NR_exit
+	mov	x1, #1
+	svc	#0
+endfunction
-- 
cgit v1.2.3


From 5d5b4e8c2d9ec12a5cc013b82c1873cd387e0ddf Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Fri, 15 Sep 2023 13:18:06 +0100
Subject: arm64/sve: Report FEAT_SVE_B16B16 to userspace

SVE 2.1 introduced a new feature FEAT_SVE_B16B16 which adds instructions
supporting the BFloat16 floating point format. Report this to userspace
through the ID registers and hwcap.

Reported-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20230915-arm64-zfr-b16b16-el0-v1-1-f9aba807bdb5@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 Documentation/arch/arm64/cpu-feature-registers.rst | 2 ++
 Documentation/arch/arm64/elf_hwcaps.rst            | 3 +++
 arch/arm64/include/asm/hwcap.h                     | 1 +
 arch/arm64/include/uapi/asm/hwcap.h                | 1 +
 arch/arm64/kernel/cpufeature.c                     | 3 +++
 arch/arm64/kernel/cpuinfo.c                        | 1 +
 arch/arm64/tools/sysreg                            | 6 +++++-
 7 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/Documentation/arch/arm64/cpu-feature-registers.rst b/Documentation/arch/arm64/cpu-feature-registers.rst
index de6d8a4790e2..44f9bd78539d 100644
--- a/Documentation/arch/arm64/cpu-feature-registers.rst
+++ b/Documentation/arch/arm64/cpu-feature-registers.rst
@@ -268,6 +268,8 @@ infrastructure:
      +------------------------------+---------+---------+
      | SHA3                         | [35-32] |    y    |
      +------------------------------+---------+---------+
+     | B16B16                       | [27-24] |    y    |
+     +------------------------------+---------+---------+
      | BF16                         | [23-20] |    y    |
      +------------------------------+---------+---------+
      | BitPerm                      | [19-16] |    y    |
diff --git a/Documentation/arch/arm64/elf_hwcaps.rst b/Documentation/arch/arm64/elf_hwcaps.rst
index 76ff9d7398fd..2ad0a369d96a 100644
--- a/Documentation/arch/arm64/elf_hwcaps.rst
+++ b/Documentation/arch/arm64/elf_hwcaps.rst
@@ -308,6 +308,9 @@ HWCAP2_MOPS
 HWCAP2_HBC
     Functionality implied by ID_AA64ISAR2_EL1.BC == 0b0001.
 
+HWCAP2_SVE_B16B16
+    Functionality implied by ID_AA64ZFR0_EL1.B16B16 == 0b0001.
+
 4. Unused AT_HWCAP bits
 -----------------------
 
diff --git a/arch/arm64/include/asm/hwcap.h b/arch/arm64/include/asm/hwcap.h
index 521267478d18..210a41f8b10a 100644
--- a/arch/arm64/include/asm/hwcap.h
+++ b/arch/arm64/include/asm/hwcap.h
@@ -139,6 +139,7 @@
 #define KERNEL_HWCAP_SME_F16F16		__khwcap2_feature(SME_F16F16)
 #define KERNEL_HWCAP_MOPS		__khwcap2_feature(MOPS)
 #define KERNEL_HWCAP_HBC		__khwcap2_feature(HBC)
+#define KERNEL_HWCAP_SVE_B16B16		__khwcap2_feature(SVE_B16B16)
 
 /*
  * This yields a mask that user programs can use to figure out what
diff --git a/arch/arm64/include/uapi/asm/hwcap.h b/arch/arm64/include/uapi/asm/hwcap.h
index 53026f45a509..6faf549077c5 100644
--- a/arch/arm64/include/uapi/asm/hwcap.h
+++ b/arch/arm64/include/uapi/asm/hwcap.h
@@ -104,5 +104,6 @@
 #define HWCAP2_SME_F16F16	(1UL << 42)
 #define HWCAP2_MOPS		(1UL << 43)
 #define HWCAP2_HBC		(1UL << 44)
+#define HWCAP2_SVE_B16B16	(1UL << 45)
 
 #endif /* _UAPI__ASM_HWCAP_H */
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 444a73c2e638..a013dfd5b6e9 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -278,6 +278,8 @@ static const struct arm64_ftr_bits ftr_id_aa64zfr0[] = {
 		       FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_SM4_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE),
 		       FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_SHA3_SHIFT, 4, 0),
+	ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE),
+		       FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_B16B16_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE),
 		       FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_BF16_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE),
@@ -2821,6 +2823,7 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = {
 	HWCAP_CAP(ID_AA64ZFR0_EL1, AES, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEAES),
 	HWCAP_CAP(ID_AA64ZFR0_EL1, AES, PMULL128, CAP_HWCAP, KERNEL_HWCAP_SVEPMULL),
 	HWCAP_CAP(ID_AA64ZFR0_EL1, BitPerm, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEBITPERM),
+	HWCAP_CAP(ID_AA64ZFR0_EL1, B16B16, IMP, CAP_HWCAP, KERNEL_HWCAP_SVE_B16B16),
 	HWCAP_CAP(ID_AA64ZFR0_EL1, BF16, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEBF16),
 	HWCAP_CAP(ID_AA64ZFR0_EL1, BF16, EBF16, CAP_HWCAP, KERNEL_HWCAP_SVE_EBF16),
 	HWCAP_CAP(ID_AA64ZFR0_EL1, SHA3, IMP, CAP_HWCAP, KERNEL_HWCAP_SVESHA3),
diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c
index 98fda8500535..ea2a31988103 100644
--- a/arch/arm64/kernel/cpuinfo.c
+++ b/arch/arm64/kernel/cpuinfo.c
@@ -127,6 +127,7 @@ static const char *const hwcap_str[] = {
 	[KERNEL_HWCAP_SME_F16F16]	= "smef16f16",
 	[KERNEL_HWCAP_MOPS]		= "mops",
 	[KERNEL_HWCAP_HBC]		= "hbc",
+	[KERNEL_HWCAP_SVE_B16B16]	= "sveb16b16",
 };
 
 #ifdef CONFIG_COMPAT
diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg
index 76ce150e7347..bb69ab34202b 100644
--- a/arch/arm64/tools/sysreg
+++ b/arch/arm64/tools/sysreg
@@ -1026,7 +1026,11 @@ UnsignedEnum	35:32	SHA3
 	0b0000	NI
 	0b0001	IMP
 EndEnum
-Res0	31:24
+Res0	31:28
+UnsignedEnum	27:24	B16B16
+	0b0000	NI
+	0b0001	IMP
+EndEnum
 UnsignedEnum	23:20	BF16
 	0b0000	NI
 	0b0001	IMP
-- 
cgit v1.2.3


From 3accaef1f61e0c5531ae9e92ff0d66981cdf1516 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Fri, 15 Sep 2023 13:18:07 +0100
Subject: kselftest/arm64: Verify HWCAP2_SVE_B16B16

Validate that SVE B16B16 support is reported correctly and consistently to
userspace.

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20230915-arm64-zfr-b16b16-el0-v1-2-f9aba807bdb5@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/abi/hwcap.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/tools/testing/selftests/arm64/abi/hwcap.c b/tools/testing/selftests/arm64/abi/hwcap.c
index e3d262831d91..d8a144213d04 100644
--- a/tools/testing/selftests/arm64/abi/hwcap.c
+++ b/tools/testing/selftests/arm64/abi/hwcap.c
@@ -226,6 +226,12 @@ static void sveaes_sigill(void)
 	asm volatile(".inst 0x4522e400" : : : "z0");
 }
 
+static void sveb16b16_sigill(void)
+{
+	/* BFADD ZA.H[W0, 0], {Z0.H-Z1.H} */
+	asm volatile(".inst 0xC1E41C00" : : : );
+}
+
 static void svepmull_sigill(void)
 {
 	/* PMULLB Z0.Q, Z0.D, Z0.D */
@@ -493,6 +499,13 @@ static const struct hwcap_data {
 		.cpuinfo = "sveaes",
 		.sigill_fn = sveaes_sigill,
 	},
+	{
+		.name = "SVE2 B16B16",
+		.at_hwcap = AT_HWCAP2,
+		.hwcap_bit = HWCAP2_SVE_B16B16,
+		.cpuinfo = "sveb16b16",
+		.sigill_fn = sveb16b16_sigill,
+	},
 	{
 		.name = "SVE2 PMULL",
 		.at_hwcap = AT_HWCAP2,
-- 
cgit v1.2.3


From bfc653aa89cb05796d7b4e046600accb442c9b7a Mon Sep 17 00:00:00 2001
From: Besar Wicaksono <bwicaksono@nvidia.com>
Date: Mon, 21 Aug 2023 18:16:08 -0500
Subject: perf: arm_cspmu: Separate Arm and vendor module

Arm Coresight PMU driver consists of main standard code and
vendor backend code. Both are currently built as a single module.
This patch adds vendor registration API to separate the two to
keep things modular. The main driver requests each known backend
module during initialization and defer device binding process.
The backend module then registers an init callback to the main
driver and continue the device driver binding process.

Signed-off-by: Besar Wicaksono <bwicaksono@nvidia.com>
Reviewed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Reviewed-and-tested-by: Ilkka Koskinen <ilkka@os.amperecomputing.com>
Link: https://lore.kernel.org/r/20230821231608.50911-1-bwicaksono@nvidia.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/perf/arm_cspmu/Kconfig        |   9 +-
 drivers/perf/arm_cspmu/Makefile       |   6 +-
 drivers/perf/arm_cspmu/arm_cspmu.c    | 168 +++++++++++++++++++++++++++-------
 drivers/perf/arm_cspmu/arm_cspmu.h    |  25 ++++-
 drivers/perf/arm_cspmu/nvidia_cspmu.c |  34 ++++++-
 drivers/perf/arm_cspmu/nvidia_cspmu.h |  17 ----
 6 files changed, 199 insertions(+), 60 deletions(-)
 delete mode 100644 drivers/perf/arm_cspmu/nvidia_cspmu.h

diff --git a/drivers/perf/arm_cspmu/Kconfig b/drivers/perf/arm_cspmu/Kconfig
index 25d25ded0983..d5f787d22234 100644
--- a/drivers/perf/arm_cspmu/Kconfig
+++ b/drivers/perf/arm_cspmu/Kconfig
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 #
-# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
 config ARM_CORESIGHT_PMU_ARCH_SYSTEM_PMU
 	tristate "ARM Coresight Architecture PMU"
@@ -10,3 +10,10 @@ config ARM_CORESIGHT_PMU_ARCH_SYSTEM_PMU
 	  based on ARM CoreSight PMU architecture. Note that this PMU
 	  architecture does not have relationship with the ARM CoreSight
 	  Self-Hosted Tracing.
+
+config NVIDIA_CORESIGHT_PMU_ARCH_SYSTEM_PMU
+	tristate "NVIDIA Coresight Architecture PMU"
+	depends on ARM_CORESIGHT_PMU_ARCH_SYSTEM_PMU
+	help
+	  Provides NVIDIA specific attributes for performance monitoring unit
+	  (PMU) devices based on ARM CoreSight PMU architecture.
diff --git a/drivers/perf/arm_cspmu/Makefile b/drivers/perf/arm_cspmu/Makefile
index fedb17df982d..0309d2ff264a 100644
--- a/drivers/perf/arm_cspmu/Makefile
+++ b/drivers/perf/arm_cspmu/Makefile
@@ -1,6 +1,8 @@
-# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # SPDX-License-Identifier: GPL-2.0
 
 obj-$(CONFIG_ARM_CORESIGHT_PMU_ARCH_SYSTEM_PMU) += arm_cspmu_module.o
-arm_cspmu_module-y := arm_cspmu.o nvidia_cspmu.o
+arm_cspmu_module-y := arm_cspmu.o
+
+obj-$(CONFIG_NVIDIA_CORESIGHT_PMU_ARCH_SYSTEM_PMU) += nvidia_cspmu.o
diff --git a/drivers/perf/arm_cspmu/arm_cspmu.c b/drivers/perf/arm_cspmu/arm_cspmu.c
index e2b7827c4563..c59f1e5a35a3 100644
--- a/drivers/perf/arm_cspmu/arm_cspmu.c
+++ b/drivers/perf/arm_cspmu/arm_cspmu.c
@@ -16,7 +16,7 @@
  * The user should refer to the vendor technical documentation to get details
  * about the supported events.
  *
- * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  *
  */
 
@@ -26,11 +26,11 @@
 #include <linux/interrupt.h>
 #include <linux/io-64-nonatomic-lo-hi.h>
 #include <linux/module.h>
+#include <linux/mutex.h>
 #include <linux/perf_event.h>
 #include <linux/platform_device.h>
 
 #include "arm_cspmu.h"
-#include "nvidia_cspmu.h"
 
 #define PMUNAME "arm_cspmu"
 #define DRVNAME "arm-cs-arch-pmu"
@@ -112,11 +112,10 @@
  */
 #define HILOHI_MAX_POLL	1000
 
-/* JEDEC-assigned JEP106 identification code */
-#define ARM_CSPMU_IMPL_ID_NVIDIA		0x36B
-
 static unsigned long arm_cspmu_cpuhp_state;
 
+static DEFINE_MUTEX(arm_cspmu_lock);
+
 static struct acpi_apmt_node *arm_cspmu_apmt_node(struct device *dev)
 {
 	return *(struct acpi_apmt_node **)dev_get_platdata(dev);
@@ -373,27 +372,37 @@ static struct attribute_group arm_cspmu_cpumask_attr_group = {
 	.attrs = arm_cspmu_cpumask_attrs,
 };
 
-struct impl_match {
-	u32 pmiidr;
-	u32 mask;
-	int (*impl_init_ops)(struct arm_cspmu *cspmu);
-};
-
-static const struct impl_match impl_match[] = {
+static struct arm_cspmu_impl_match impl_match[] = {
 	{
-	  .pmiidr = ARM_CSPMU_IMPL_ID_NVIDIA,
-	  .mask = ARM_CSPMU_PMIIDR_IMPLEMENTER,
-	  .impl_init_ops = nv_cspmu_init_ops
+		.module_name	= "nvidia_cspmu",
+		.pmiidr_val	= ARM_CSPMU_IMPL_ID_NVIDIA,
+		.pmiidr_mask	= ARM_CSPMU_PMIIDR_IMPLEMENTER,
+		.module		= NULL,
+		.impl_init_ops	= NULL,
 	},
-	{}
+	{0}
 };
 
+static struct arm_cspmu_impl_match *arm_cspmu_impl_match_get(u32 pmiidr)
+{
+	struct arm_cspmu_impl_match *match = impl_match;
+
+	for (; match->pmiidr_val; match++) {
+		u32 mask = match->pmiidr_mask;
+
+		if ((match->pmiidr_val & mask) == (pmiidr & mask))
+			return match;
+	}
+
+	return NULL;
+}
+
 static int arm_cspmu_init_impl_ops(struct arm_cspmu *cspmu)
 {
-	int ret;
+	int ret = 0;
 	struct arm_cspmu_impl_ops *impl_ops = &cspmu->impl.ops;
 	struct acpi_apmt_node *apmt_node = arm_cspmu_apmt_node(cspmu->dev);
-	const struct impl_match *match = impl_match;
+	struct arm_cspmu_impl_match *match;
 
 	/*
 	 * Get PMU implementer and product id from APMT node.
@@ -405,17 +414,36 @@ static int arm_cspmu_init_impl_ops(struct arm_cspmu *cspmu)
 				       readl(cspmu->base0 + PMIIDR);
 
 	/* Find implementer specific attribute ops. */
-	for (; match->pmiidr; match++) {
-		const u32 mask = match->mask;
+	match = arm_cspmu_impl_match_get(cspmu->impl.pmiidr);
+
+	/* Load implementer module and initialize the callbacks. */
+	if (match) {
+		mutex_lock(&arm_cspmu_lock);
+
+		if (match->impl_init_ops) {
+			/* Prevent unload until PMU registration is done. */
+			if (try_module_get(match->module)) {
+				cspmu->impl.module = match->module;
+				cspmu->impl.match = match;
+				ret = match->impl_init_ops(cspmu);
+				if (ret)
+					module_put(match->module);
+			} else {
+				WARN(1, "arm_cspmu failed to get module: %s\n",
+					match->module_name);
+				ret = -EINVAL;
+			}
+		} else {
+			request_module_nowait(match->module_name);
+			ret = -EPROBE_DEFER;
+		}
 
-		if ((match->pmiidr & mask) == (cspmu->impl.pmiidr & mask)) {
-			ret = match->impl_init_ops(cspmu);
-			if (ret)
-				return ret;
+		mutex_unlock(&arm_cspmu_lock);
 
-			break;
-		}
-	}
+		if (ret)
+			return ret;
+	} else
+		cspmu->impl.module = THIS_MODULE;
 
 	/* Use default callbacks if implementer doesn't provide one. */
 	CHECK_DEFAULT_IMPL_OPS(impl_ops, get_event_attrs);
@@ -478,11 +506,6 @@ arm_cspmu_alloc_attr_group(struct arm_cspmu *cspmu)
 	struct attribute_group **attr_groups = NULL;
 	struct device *dev = cspmu->dev;
 	const struct arm_cspmu_impl_ops *impl_ops = &cspmu->impl.ops;
-	int ret;
-
-	ret = arm_cspmu_init_impl_ops(cspmu);
-	if (ret)
-		return NULL;
 
 	cspmu->identifier = impl_ops->get_identifier(cspmu);
 	cspmu->name = impl_ops->get_name(cspmu);
@@ -1149,7 +1172,7 @@ static int arm_cspmu_register_pmu(struct arm_cspmu *cspmu)
 
 	cspmu->pmu = (struct pmu){
 		.task_ctx_nr	= perf_invalid_context,
-		.module		= THIS_MODULE,
+		.module		= cspmu->impl.module,
 		.pmu_enable	= arm_cspmu_enable,
 		.pmu_disable	= arm_cspmu_disable,
 		.event_init	= arm_cspmu_event_init,
@@ -1196,11 +1219,17 @@ static int arm_cspmu_device_probe(struct platform_device *pdev)
 	if (ret)
 		return ret;
 
-	ret = arm_cspmu_register_pmu(cspmu);
+	ret = arm_cspmu_init_impl_ops(cspmu);
 	if (ret)
 		return ret;
 
-	return 0;
+	ret = arm_cspmu_register_pmu(cspmu);
+
+	/* Matches arm_cspmu_init_impl_ops() above. */
+	if (cspmu->impl.module != THIS_MODULE)
+		module_put(cspmu->impl.module);
+
+	return ret;
 }
 
 static int arm_cspmu_device_remove(struct platform_device *pdev)
@@ -1300,6 +1329,75 @@ static void __exit arm_cspmu_exit(void)
 	cpuhp_remove_multi_state(arm_cspmu_cpuhp_state);
 }
 
+int arm_cspmu_impl_register(const struct arm_cspmu_impl_match *impl_match)
+{
+	struct arm_cspmu_impl_match *match;
+	int ret = 0;
+
+	match = arm_cspmu_impl_match_get(impl_match->pmiidr_val);
+
+	if (match) {
+		mutex_lock(&arm_cspmu_lock);
+
+		if (!match->impl_init_ops) {
+			match->module = impl_match->module;
+			match->impl_init_ops = impl_match->impl_init_ops;
+		} else {
+			/* Broken match table may contain non-unique entries */
+			WARN(1, "arm_cspmu backend already registered for module: %s, pmiidr: 0x%x, mask: 0x%x\n",
+				match->module_name,
+				match->pmiidr_val,
+				match->pmiidr_mask);
+
+			ret = -EINVAL;
+		}
+
+		mutex_unlock(&arm_cspmu_lock);
+
+		if (!ret)
+			ret = driver_attach(&arm_cspmu_driver.driver);
+	} else {
+		pr_err("arm_cspmu reg failed, unable to find a match for pmiidr: 0x%x\n",
+			impl_match->pmiidr_val);
+
+		ret = -EINVAL;
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(arm_cspmu_impl_register);
+
+static int arm_cspmu_match_device(struct device *dev, const void *match)
+{
+	struct arm_cspmu *cspmu = platform_get_drvdata(to_platform_device(dev));
+
+	return (cspmu && cspmu->impl.match == match) ? 1 : 0;
+}
+
+void arm_cspmu_impl_unregister(const struct arm_cspmu_impl_match *impl_match)
+{
+	struct device *dev;
+	struct arm_cspmu_impl_match *match;
+
+	match = arm_cspmu_impl_match_get(impl_match->pmiidr_val);
+
+	if (WARN_ON(!match))
+		return;
+
+	/* Unbind the driver from all matching backend devices. */
+	while ((dev = driver_find_device(&arm_cspmu_driver.driver, NULL,
+			match, arm_cspmu_match_device)))
+		device_release_driver(dev);
+
+	mutex_lock(&arm_cspmu_lock);
+
+	match->module = NULL;
+	match->impl_init_ops = NULL;
+
+	mutex_unlock(&arm_cspmu_lock);
+}
+EXPORT_SYMBOL_GPL(arm_cspmu_impl_unregister);
+
 module_init(arm_cspmu_init);
 module_exit(arm_cspmu_exit);
 
diff --git a/drivers/perf/arm_cspmu/arm_cspmu.h b/drivers/perf/arm_cspmu/arm_cspmu.h
index 83df53d1c132..7936a90ded7f 100644
--- a/drivers/perf/arm_cspmu/arm_cspmu.h
+++ b/drivers/perf/arm_cspmu/arm_cspmu.h
@@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0
  *
  * ARM CoreSight Architecture PMU driver.
- * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  *
  */
 
@@ -69,6 +69,9 @@
 #define ARM_CSPMU_PMIIDR_IMPLEMENTER	GENMASK(11, 0)
 #define ARM_CSPMU_PMIIDR_PRODUCTID	GENMASK(31, 20)
 
+/* JEDEC-assigned JEP106 identification code */
+#define ARM_CSPMU_IMPL_ID_NVIDIA	0x36B
+
 struct arm_cspmu;
 
 /* This tracks the events assigned to each counter in the PMU. */
@@ -106,9 +109,23 @@ struct arm_cspmu_impl_ops {
 					 struct attribute *attr, int unused);
 };
 
+/* Vendor/implementer registration parameter. */
+struct arm_cspmu_impl_match {
+	/* Backend module. */
+	struct module *module;
+	const char *module_name;
+	/* PMIIDR value/mask. */
+	u32 pmiidr_val;
+	u32 pmiidr_mask;
+	/* Callback to vendor backend to init arm_cspmu_impl::ops. */
+	int (*impl_init_ops)(struct arm_cspmu *cspmu);
+};
+
 /* Vendor/implementer descriptor. */
 struct arm_cspmu_impl {
 	u32 pmiidr;
+	struct module *module;
+	struct arm_cspmu_impl_match *match;
 	struct arm_cspmu_impl_ops ops;
 	void *ctx;
 };
@@ -147,4 +164,10 @@ ssize_t arm_cspmu_sysfs_format_show(struct device *dev,
 				    struct device_attribute *attr,
 				    char *buf);
 
+/* Register vendor backend. */
+int arm_cspmu_impl_register(const struct arm_cspmu_impl_match *impl_match);
+
+/* Unregister vendor backend. */
+void arm_cspmu_impl_unregister(const struct arm_cspmu_impl_match *impl_match);
+
 #endif /* __ARM_CSPMU_H__ */
diff --git a/drivers/perf/arm_cspmu/nvidia_cspmu.c b/drivers/perf/arm_cspmu/nvidia_cspmu.c
index 72ef80caa3c8..0382b702f092 100644
--- a/drivers/perf/arm_cspmu/nvidia_cspmu.c
+++ b/drivers/perf/arm_cspmu/nvidia_cspmu.c
@@ -1,14 +1,15 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  *
  */
 
 /* Support for NVIDIA specific attributes. */
 
+#include <linux/module.h>
 #include <linux/topology.h>
 
-#include "nvidia_cspmu.h"
+#include "arm_cspmu.h"
 
 #define NV_PCIE_PORT_COUNT           10ULL
 #define NV_PCIE_FILTER_ID_MASK       GENMASK_ULL(NV_PCIE_PORT_COUNT - 1, 0)
@@ -351,7 +352,7 @@ static char *nv_cspmu_format_name(const struct arm_cspmu *cspmu,
 	return name;
 }
 
-int nv_cspmu_init_ops(struct arm_cspmu *cspmu)
+static int nv_cspmu_init_ops(struct arm_cspmu *cspmu)
 {
 	u32 prodid;
 	struct nv_cspmu_ctx *ctx;
@@ -395,6 +396,31 @@ int nv_cspmu_init_ops(struct arm_cspmu *cspmu)
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(nv_cspmu_init_ops);
+
+/* Match all NVIDIA Coresight PMU devices */
+static const struct arm_cspmu_impl_match nv_cspmu_param = {
+	.pmiidr_val	= ARM_CSPMU_IMPL_ID_NVIDIA,
+	.module		= THIS_MODULE,
+	.impl_init_ops	= nv_cspmu_init_ops
+};
+
+static int __init nvidia_cspmu_init(void)
+{
+	int ret;
+
+	ret = arm_cspmu_impl_register(&nv_cspmu_param);
+	if (ret)
+		pr_err("nvidia_cspmu backend registration error: %d\n", ret);
+
+	return ret;
+}
+
+static void __exit nvidia_cspmu_exit(void)
+{
+	arm_cspmu_impl_unregister(&nv_cspmu_param);
+}
+
+module_init(nvidia_cspmu_init);
+module_exit(nvidia_cspmu_exit);
 
 MODULE_LICENSE("GPL v2");
diff --git a/drivers/perf/arm_cspmu/nvidia_cspmu.h b/drivers/perf/arm_cspmu/nvidia_cspmu.h
deleted file mode 100644
index 71e18f0dc50b..000000000000
--- a/drivers/perf/arm_cspmu/nvidia_cspmu.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0
- *
- * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- *
- */
-
-/* Support for NVIDIA specific attributes. */
-
-#ifndef __NVIDIA_CSPMU_H__
-#define __NVIDIA_CSPMU_H__
-
-#include "arm_cspmu.h"
-
-/* Allocate NVIDIA descriptor. */
-int nv_cspmu_init_ops(struct arm_cspmu *cspmu);
-
-#endif /* __NVIDIA_CSPMU_H__ */
-- 
cgit v1.2.3


From 8c282414ca6209977cb6d6cc66470ca2d1e56bf6 Mon Sep 17 00:00:00 2001
From: Ilkka Koskinen <ilkka@os.amperecomputing.com>
Date: Wed, 13 Sep 2023 16:39:38 -0700
Subject: perf: arm_cspmu: Split 64-bit write to 32-bit writes

Split the 64-bit register accesses if 64-bit access is not supported
by the PMU.

Signed-off-by: Ilkka Koskinen <ilkka@os.amperecomputing.com>
Reviewed-by: Besar Wicaksono <bwicaksono@nvidia.com>
Reviewed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Link: https://lore.kernel.org/r/20230913233941.9814-2-ilkka@os.amperecomputing.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/perf/arm_cspmu/arm_cspmu.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/perf/arm_cspmu/arm_cspmu.c b/drivers/perf/arm_cspmu/arm_cspmu.c
index c59f1e5a35a3..5f4f04135a22 100644
--- a/drivers/perf/arm_cspmu/arm_cspmu.c
+++ b/drivers/perf/arm_cspmu/arm_cspmu.c
@@ -719,7 +719,10 @@ static void arm_cspmu_write_counter(struct perf_event *event, u64 val)
 	if (use_64b_counter_reg(cspmu)) {
 		offset = counter_offset(sizeof(u64), event->hw.idx);
 
-		writeq(val, cspmu->base1 + offset);
+		if (cspmu->has_atomic_dword)
+			writeq(val, cspmu->base1 + offset);
+		else
+			lo_hi_writeq(val, cspmu->base1 + offset);
 	} else {
 		offset = counter_offset(sizeof(u32), event->hw.idx);
 
-- 
cgit v1.2.3


From 0a7603ab242e9bab530227cf0d0d344d4e334acc Mon Sep 17 00:00:00 2001
From: Ilkka Koskinen <ilkka@os.amperecomputing.com>
Date: Wed, 13 Sep 2023 16:39:39 -0700
Subject: perf: arm_cspmu: Support implementation specific filters

ARM Coresight PMU architecture specification [1] defines PMEVTYPER and
PMEVFILT* registers as optional in Chapter 2.1. Moreover, implementers may
choose to use PMIMPDEF* registers (offset: 0xD80-> 0xDFF) to filter the
events. Add support for those by adding implementation specific filter
callback function.

[1] https://developer.arm.com/documentation/ihi0091/latest

Signed-off-by: Ilkka Koskinen <ilkka@os.amperecomputing.com>
Reviewed-by: Besar Wicaksono <bwicaksono@nvidia.com>
Reviewed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Link: https://lore.kernel.org/r/20230913233941.9814-3-ilkka@os.amperecomputing.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/perf/arm_cspmu/arm_cspmu.c | 12 ++++++++----
 drivers/perf/arm_cspmu/arm_cspmu.h |  3 +++
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/drivers/perf/arm_cspmu/arm_cspmu.c b/drivers/perf/arm_cspmu/arm_cspmu.c
index 5f4f04135a22..2ecbf8d7714b 100644
--- a/drivers/perf/arm_cspmu/arm_cspmu.c
+++ b/drivers/perf/arm_cspmu/arm_cspmu.c
@@ -116,6 +116,9 @@ static unsigned long arm_cspmu_cpuhp_state;
 
 static DEFINE_MUTEX(arm_cspmu_lock);
 
+static void arm_cspmu_set_ev_filter(struct arm_cspmu *cspmu,
+				    struct hw_perf_event *hwc, u32 filter);
+
 static struct acpi_apmt_node *arm_cspmu_apmt_node(struct device *dev)
 {
 	return *(struct acpi_apmt_node **)dev_get_platdata(dev);
@@ -454,6 +457,7 @@ static int arm_cspmu_init_impl_ops(struct arm_cspmu *cspmu)
 	CHECK_DEFAULT_IMPL_OPS(impl_ops, event_type);
 	CHECK_DEFAULT_IMPL_OPS(impl_ops, event_filter);
 	CHECK_DEFAULT_IMPL_OPS(impl_ops, event_attr_is_visible);
+	CHECK_DEFAULT_IMPL_OPS(impl_ops, set_ev_filter);
 
 	return 0;
 }
@@ -815,9 +819,9 @@ static inline void arm_cspmu_set_event(struct arm_cspmu *cspmu,
 	writel(hwc->config, cspmu->base0 + offset);
 }
 
-static inline void arm_cspmu_set_ev_filter(struct arm_cspmu *cspmu,
-					   struct hw_perf_event *hwc,
-					   u32 filter)
+static void arm_cspmu_set_ev_filter(struct arm_cspmu *cspmu,
+					struct hw_perf_event *hwc,
+					u32 filter)
 {
 	u32 offset = PMEVFILTR + (4 * hwc->idx);
 
@@ -849,7 +853,7 @@ static void arm_cspmu_start(struct perf_event *event, int pmu_flags)
 		arm_cspmu_set_cc_filter(cspmu, filter);
 	} else {
 		arm_cspmu_set_event(cspmu, hwc);
-		arm_cspmu_set_ev_filter(cspmu, hwc, filter);
+		cspmu->impl.ops.set_ev_filter(cspmu, hwc, filter);
 	}
 
 	hwc->state = 0;
diff --git a/drivers/perf/arm_cspmu/arm_cspmu.h b/drivers/perf/arm_cspmu/arm_cspmu.h
index 7936a90ded7f..e0cca5b4aab9 100644
--- a/drivers/perf/arm_cspmu/arm_cspmu.h
+++ b/drivers/perf/arm_cspmu/arm_cspmu.h
@@ -104,6 +104,9 @@ struct arm_cspmu_impl_ops {
 	u32 (*event_type)(const struct perf_event *event);
 	/* Decode filter value from configs */
 	u32 (*event_filter)(const struct perf_event *event);
+	/* Set event filter */
+	void (*set_ev_filter)(struct arm_cspmu *cspmu,
+			      struct hw_perf_event *hwc, u32 filter);
 	/* Hide/show unsupported events */
 	umode_t (*event_attr_is_visible)(struct kobject *kobj,
 					 struct attribute *attr, int unused);
-- 
cgit v1.2.3


From 647d5c5a9e7672e285f54f0e141ee759e69382f2 Mon Sep 17 00:00:00 2001
From: Ilkka Koskinen <ilkka@os.amperecomputing.com>
Date: Wed, 13 Sep 2023 16:39:40 -0700
Subject: perf: arm_cspmu: Support implementation specific validation

Some platforms may use e.g. different filtering mechanism and, thus,
may need different way to validate the events and group.

Signed-off-by: Ilkka Koskinen <ilkka@os.amperecomputing.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Link: https://lore.kernel.org/r/20230913233941.9814-4-ilkka@os.amperecomputing.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/perf/arm_cspmu/arm_cspmu.c | 8 +++++++-
 drivers/perf/arm_cspmu/arm_cspmu.h | 3 +++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/drivers/perf/arm_cspmu/arm_cspmu.c b/drivers/perf/arm_cspmu/arm_cspmu.c
index 2ecbf8d7714b..1ba00d640352 100644
--- a/drivers/perf/arm_cspmu/arm_cspmu.c
+++ b/drivers/perf/arm_cspmu/arm_cspmu.c
@@ -576,7 +576,7 @@ static void arm_cspmu_disable(struct pmu *pmu)
 static int arm_cspmu_get_event_idx(struct arm_cspmu_hw_events *hw_events,
 				struct perf_event *event)
 {
-	int idx;
+	int idx, ret;
 	struct arm_cspmu *cspmu = to_arm_cspmu(event->pmu);
 
 	if (supports_cycle_counter(cspmu)) {
@@ -610,6 +610,12 @@ static int arm_cspmu_get_event_idx(struct arm_cspmu_hw_events *hw_events,
 	if (idx >= cspmu->num_logical_ctrs)
 		return -EAGAIN;
 
+	if (cspmu->impl.ops.validate_event) {
+		ret = cspmu->impl.ops.validate_event(cspmu, event);
+		if (ret)
+			return ret;
+	}
+
 	set_bit(idx, hw_events->used_ctrs);
 
 	return idx;
diff --git a/drivers/perf/arm_cspmu/arm_cspmu.h b/drivers/perf/arm_cspmu/arm_cspmu.h
index e0cca5b4aab9..a30c8372214c 100644
--- a/drivers/perf/arm_cspmu/arm_cspmu.h
+++ b/drivers/perf/arm_cspmu/arm_cspmu.h
@@ -107,6 +107,9 @@ struct arm_cspmu_impl_ops {
 	/* Set event filter */
 	void (*set_ev_filter)(struct arm_cspmu *cspmu,
 			      struct hw_perf_event *hwc, u32 filter);
+	/* Implementation specific event validation */
+	int (*validate_event)(struct arm_cspmu *cspmu,
+			      struct perf_event *event);
 	/* Hide/show unsupported events */
 	umode_t (*event_attr_is_visible)(struct kobject *kobj,
 					 struct attribute *attr, int unused);
-- 
cgit v1.2.3


From a07a594152173a3dd3bdd12fc7d73dbba54cdbca Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 2 Oct 2023 18:00:36 +0100
Subject: arm64: smp: avoid NMI IPIs with broken MediaTek FW

Some MediaTek devices have broken firmware which corrupts some GICR
registers behind the back of the OS, and pseudo-NMIs cannot be used on
these devices. For more details see commit:

  44bd78dd2b8897f5 ("irqchip/gic-v3: Disable pseudo NMIs on Mediatek devices w/ firmware issues")

We did not take this problem into account in commit:

  331a1b3a836c0f38 ("arm64: smp: Add arch support for backtrace using pseudo-NMI")

Since that commit arm64's SMP code will try to setup some IPIs as
pseudo-NMIs, even on systems with broken FW. The GICv3 code will
(rightly) reject attempts to request interrupts as pseudo-NMIs,
resulting in boot-time failures.

Avoid the problem by taking the broken FW into account when deciding to
request IPIs as pseudo-NMIs. The GICv3 driver maintains a static_key
named "supports_pseudo_nmis" which is false on systems with broken FW,
and we can consult this within ipi_should_be_nmi().

Fixes: 331a1b3a836c ("arm64: smp: Add arch support for backtrace using pseudo-NMI")
Reported-by: Chen-Yu Tsai <wenst@chromium.org>
Closes: https://issuetracker.google.com/issues/197061987#comment68
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Douglas Anderson <dianders@chromium.org>
Tested-by: Douglas Anderson <dianders@chromium.org>
Reviewed-by: Marc Zyngier <maz@kernel.org>
Tested-by: Chen-Yu Tsai <wenst@chromium.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/smp.c      | 5 ++++-
 drivers/irqchip/irq-gic-v3.c | 2 +-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 814d9aa93b21..061c69160f90 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -964,7 +964,10 @@ static void smp_cross_call(const struct cpumask *target, unsigned int ipinr)
 
 static bool ipi_should_be_nmi(enum ipi_msg_type ipi)
 {
-	if (!system_uses_irq_prio_masking())
+	DECLARE_STATIC_KEY_FALSE(supports_pseudo_nmis);
+
+	if (!system_uses_irq_prio_masking() ||
+	    !static_branch_likely(&supports_pseudo_nmis))
 		return false;
 
 	switch (ipi) {
diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c
index 787ccc880b22..737da1b9aabf 100644
--- a/drivers/irqchip/irq-gic-v3.c
+++ b/drivers/irqchip/irq-gic-v3.c
@@ -106,7 +106,7 @@ static DEFINE_STATIC_KEY_TRUE(supports_deactivate_key);
  * - Figure 4-7 Secure read of the priority field for a Non-secure Group 1
  *   interrupt.
  */
-static DEFINE_STATIC_KEY_FALSE(supports_pseudo_nmis);
+DEFINE_STATIC_KEY_FALSE(supports_pseudo_nmis);
 
 DEFINE_STATIC_KEY_FALSE(gic_nonsecure_priorities);
 EXPORT_SYMBOL(gic_nonsecure_priorities);
-- 
cgit v1.2.3


From ef31b8ce313eaf891bf705d5db754e549351816f Mon Sep 17 00:00:00 2001
From: Douglas Anderson <dianders@chromium.org>
Date: Mon, 2 Oct 2023 09:45:30 -0700
Subject: arm64: smp: Don't directly call arch_smp_send_reschedule() for wakeup

In commit 2b2d0a7a96ab ("arm64: smp: Remove dedicated wakeup IPI") we
started using a scheduler IPI to avoid a dedicated reschedule. When we
did this, we used arch_smp_send_reschedule() directly rather than
calling smp_send_reschedule(). The only difference is that calling
arch_smp_send_reschedule() directly avoids tracing. Presumably we
_don't_ want to avoid tracing here, so switch to
smp_send_reschedule().

Fixes: 2b2d0a7a96ab ("arm64: smp: Remove dedicated wakeup IPI")
Signed-off-by: Douglas Anderson <dianders@chromium.org>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/smp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 061c69160f90..16ead57a583d 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -1061,7 +1061,7 @@ void arch_send_wakeup_ipi(unsigned int cpu)
 	 * We use a scheduler IPI to wake the CPU as this avoids the need for a
 	 * dedicated IPI and we can safely handle spurious scheduler IPIs.
 	 */
-	arch_smp_send_reschedule(cpu);
+	smp_send_reschedule(cpu);
 }
 #endif
 
-- 
cgit v1.2.3


From 53a810ad3c5cde674cac71e629e6d10bfc9d838c Mon Sep 17 00:00:00 2001
From: Ilkka Koskinen <ilkka@os.amperecomputing.com>
Date: Wed, 13 Sep 2023 16:39:41 -0700
Subject: perf: arm_cspmu: ampere_cspmu: Add support for Ampere SoC PMU

Ampere SoC PMU follows CoreSight PMU architecture. It uses implementation
specific registers to filter events rather than PMEVFILTnR registers.

Signed-off-by: Ilkka Koskinen <ilkka@os.amperecomputing.com>
Link: https://lore.kernel.org/r/20230913233941.9814-5-ilkka@os.amperecomputing.com
[will: Include linux/io.h in ampere_cspmu.c for writel()]
Signed-off-by: Will Deacon <will@kernel.org>
---
 Documentation/admin-guide/perf/ampere_cspmu.rst |  29 +++
 drivers/perf/arm_cspmu/Kconfig                  |  10 +
 drivers/perf/arm_cspmu/Makefile                 |   2 +
 drivers/perf/arm_cspmu/ampere_cspmu.c           | 272 ++++++++++++++++++++++++
 drivers/perf/arm_cspmu/arm_cspmu.c              |   8 +
 drivers/perf/arm_cspmu/arm_cspmu.h              |   1 +
 6 files changed, 322 insertions(+)
 create mode 100644 Documentation/admin-guide/perf/ampere_cspmu.rst
 create mode 100644 drivers/perf/arm_cspmu/ampere_cspmu.c

diff --git a/Documentation/admin-guide/perf/ampere_cspmu.rst b/Documentation/admin-guide/perf/ampere_cspmu.rst
new file mode 100644
index 000000000000..94f93f5aee6c
--- /dev/null
+++ b/Documentation/admin-guide/perf/ampere_cspmu.rst
@@ -0,0 +1,29 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+============================================
+Ampere SoC Performance Monitoring Unit (PMU)
+============================================
+
+Ampere SoC PMU is a generic PMU IP that follows Arm CoreSight PMU architecture.
+Therefore, the driver is implemented as a submodule of arm_cspmu driver. At the
+first phase it's used for counting MCU events on AmpereOne.
+
+
+MCU PMU events
+--------------
+
+The PMU driver supports setting filters for "rank", "bank", and "threshold".
+Note, that the filters are per PMU instance rather than per event.
+
+
+Example for perf tool use::
+
+  / # perf list ampere
+
+    ampere_mcu_pmu_0/act_sent/                         [Kernel PMU event]
+    <...>
+    ampere_mcu_pmu_1/rd_sent/                          [Kernel PMU event]
+    <...>
+
+  / # perf stat -a -e ampere_mcu_pmu_0/act_sent,bank=5,rank=3,threshold=2/,ampere_mcu_pmu_1/rd_sent/ \
+        sleep 1
diff --git a/drivers/perf/arm_cspmu/Kconfig b/drivers/perf/arm_cspmu/Kconfig
index d5f787d22234..6f4e28fc84a2 100644
--- a/drivers/perf/arm_cspmu/Kconfig
+++ b/drivers/perf/arm_cspmu/Kconfig
@@ -17,3 +17,13 @@ config NVIDIA_CORESIGHT_PMU_ARCH_SYSTEM_PMU
 	help
 	  Provides NVIDIA specific attributes for performance monitoring unit
 	  (PMU) devices based on ARM CoreSight PMU architecture.
+
+config AMPERE_CORESIGHT_PMU_ARCH_SYSTEM_PMU
+       tristate "Ampere Coresight Architecture PMU"
+       depends on  ARM_CORESIGHT_PMU_ARCH_SYSTEM_PMU
+	help
+	  Provides Ampere specific attributes for performance monitoring unit
+	  (PMU) devices based on ARM CoreSight PMU architecture.
+
+	  In the first phase, the driver enables support on MCU PMU used in
+	  AmpereOne SoC family.
diff --git a/drivers/perf/arm_cspmu/Makefile b/drivers/perf/arm_cspmu/Makefile
index 0309d2ff264a..220a734efd54 100644
--- a/drivers/perf/arm_cspmu/Makefile
+++ b/drivers/perf/arm_cspmu/Makefile
@@ -3,6 +3,8 @@
 # SPDX-License-Identifier: GPL-2.0
 
 obj-$(CONFIG_ARM_CORESIGHT_PMU_ARCH_SYSTEM_PMU) += arm_cspmu_module.o
+
 arm_cspmu_module-y := arm_cspmu.o
 
 obj-$(CONFIG_NVIDIA_CORESIGHT_PMU_ARCH_SYSTEM_PMU) += nvidia_cspmu.o
+obj-$(CONFIG_AMPERE_CORESIGHT_PMU_ARCH_SYSTEM_PMU) += ampere_cspmu.o
diff --git a/drivers/perf/arm_cspmu/ampere_cspmu.c b/drivers/perf/arm_cspmu/ampere_cspmu.c
new file mode 100644
index 000000000000..f146a455e838
--- /dev/null
+++ b/drivers/perf/arm_cspmu/ampere_cspmu.c
@@ -0,0 +1,272 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Ampere SoC PMU (Performance Monitor Unit)
+ *
+ * Copyright (c) 2023, Ampere Computing LLC
+ */
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/topology.h>
+
+#include "arm_cspmu.h"
+
+#define PMAUXR0		0xD80
+#define PMAUXR1		0xD84
+#define PMAUXR2		0xD88
+#define PMAUXR3		0xD8C
+
+#define to_ampere_cspmu_ctx(cspmu)	((struct ampere_cspmu_ctx *)(cspmu->impl.ctx))
+
+struct ampere_cspmu_ctx {
+	const char *name;
+	struct attribute **event_attr;
+	struct attribute **format_attr;
+};
+
+static DEFINE_IDA(mcu_pmu_ida);
+
+#define SOC_PMU_EVENT_ATTR_EXTRACTOR(_name, _config, _start, _end)        \
+	static inline u32 get_##_name(const struct perf_event *event)     \
+	{                                                                 \
+		return FIELD_GET(GENMASK_ULL(_end, _start),               \
+				 event->attr._config);                    \
+	}                                                                 \
+
+SOC_PMU_EVENT_ATTR_EXTRACTOR(event, config, 0, 8);
+SOC_PMU_EVENT_ATTR_EXTRACTOR(threshold, config1, 0, 7);
+SOC_PMU_EVENT_ATTR_EXTRACTOR(rank, config1, 8, 23);
+SOC_PMU_EVENT_ATTR_EXTRACTOR(bank, config1, 24, 55);
+
+static struct attribute *ampereone_mcu_pmu_event_attrs[] = {
+	ARM_CSPMU_EVENT_ATTR(cycle_count,		0x00),
+	ARM_CSPMU_EVENT_ATTR(act_sent,			0x01),
+	ARM_CSPMU_EVENT_ATTR(pre_sent,			0x02),
+	ARM_CSPMU_EVENT_ATTR(rd_sent,			0x03),
+	ARM_CSPMU_EVENT_ATTR(rda_sent,			0x04),
+	ARM_CSPMU_EVENT_ATTR(wr_sent,			0x05),
+	ARM_CSPMU_EVENT_ATTR(wra_sent,			0x06),
+	ARM_CSPMU_EVENT_ATTR(pd_entry_vld,		0x07),
+	ARM_CSPMU_EVENT_ATTR(sref_entry_vld,		0x08),
+	ARM_CSPMU_EVENT_ATTR(prea_sent,			0x09),
+	ARM_CSPMU_EVENT_ATTR(pre_sb_sent,		0x0a),
+	ARM_CSPMU_EVENT_ATTR(ref_sent,			0x0b),
+	ARM_CSPMU_EVENT_ATTR(rfm_sent,			0x0c),
+	ARM_CSPMU_EVENT_ATTR(ref_sb_sent,		0x0d),
+	ARM_CSPMU_EVENT_ATTR(rfm_sb_sent,		0x0e),
+	ARM_CSPMU_EVENT_ATTR(rd_rda_sent,		0x0f),
+	ARM_CSPMU_EVENT_ATTR(wr_wra_sent,		0x10),
+	ARM_CSPMU_EVENT_ATTR(raw_hazard,		0x11),
+	ARM_CSPMU_EVENT_ATTR(war_hazard,		0x12),
+	ARM_CSPMU_EVENT_ATTR(waw_hazard,		0x13),
+	ARM_CSPMU_EVENT_ATTR(rar_hazard,		0x14),
+	ARM_CSPMU_EVENT_ATTR(raw_war_waw_hazard,	0x15),
+	ARM_CSPMU_EVENT_ATTR(hprd_lprd_wr_req_vld,	0x16),
+	ARM_CSPMU_EVENT_ATTR(lprd_req_vld,		0x17),
+	ARM_CSPMU_EVENT_ATTR(hprd_req_vld,		0x18),
+	ARM_CSPMU_EVENT_ATTR(hprd_lprd_req_vld,		0x19),
+	ARM_CSPMU_EVENT_ATTR(prefetch_tgt,		0x1a),
+	ARM_CSPMU_EVENT_ATTR(wr_req_vld,		0x1b),
+	ARM_CSPMU_EVENT_ATTR(partial_wr_req_vld,	0x1c),
+	ARM_CSPMU_EVENT_ATTR(rd_retry,			0x1d),
+	ARM_CSPMU_EVENT_ATTR(wr_retry,			0x1e),
+	ARM_CSPMU_EVENT_ATTR(retry_gnt,			0x1f),
+	ARM_CSPMU_EVENT_ATTR(rank_change,		0x20),
+	ARM_CSPMU_EVENT_ATTR(dir_change,		0x21),
+	ARM_CSPMU_EVENT_ATTR(rank_dir_change,		0x22),
+	ARM_CSPMU_EVENT_ATTR(rank_active,		0x23),
+	ARM_CSPMU_EVENT_ATTR(rank_idle,			0x24),
+	ARM_CSPMU_EVENT_ATTR(rank_pd,			0x25),
+	ARM_CSPMU_EVENT_ATTR(rank_sref,			0x26),
+	ARM_CSPMU_EVENT_ATTR(queue_fill_gt_thresh,	0x27),
+	ARM_CSPMU_EVENT_ATTR(queue_rds_gt_thresh,	0x28),
+	ARM_CSPMU_EVENT_ATTR(queue_wrs_gt_thresh,	0x29),
+	ARM_CSPMU_EVENT_ATTR(phy_updt_complt,		0x2a),
+	ARM_CSPMU_EVENT_ATTR(tz_fail,			0x2b),
+	ARM_CSPMU_EVENT_ATTR(dram_errc,			0x2c),
+	ARM_CSPMU_EVENT_ATTR(dram_errd,			0x2d),
+	ARM_CSPMU_EVENT_ATTR(read_data_return,		0x32),
+	ARM_CSPMU_EVENT_ATTR(chi_wr_data_delta,		0x33),
+	ARM_CSPMU_EVENT_ATTR(zq_start,			0x34),
+	ARM_CSPMU_EVENT_ATTR(zq_latch,			0x35),
+	ARM_CSPMU_EVENT_ATTR(wr_fifo_full,		0x36),
+	ARM_CSPMU_EVENT_ATTR(info_fifo_full,		0x37),
+	ARM_CSPMU_EVENT_ATTR(cmd_fifo_full,		0x38),
+	ARM_CSPMU_EVENT_ATTR(dfi_nop,			0x39),
+	ARM_CSPMU_EVENT_ATTR(dfi_cmd,			0x3a),
+	ARM_CSPMU_EVENT_ATTR(rd_run_len,		0x3b),
+	ARM_CSPMU_EVENT_ATTR(wr_run_len,		0x3c),
+
+	ARM_CSPMU_EVENT_ATTR(cycles, ARM_CSPMU_EVT_CYCLES_DEFAULT),
+	NULL,
+};
+
+static struct attribute *ampereone_mcu_format_attrs[] = {
+	ARM_CSPMU_FORMAT_EVENT_ATTR,
+	ARM_CSPMU_FORMAT_ATTR(threshold, "config1:0-7"),
+	ARM_CSPMU_FORMAT_ATTR(rank, "config1:8-23"),
+	ARM_CSPMU_FORMAT_ATTR(bank, "config1:24-55"),
+	NULL,
+};
+
+static struct attribute **
+ampere_cspmu_get_event_attrs(const struct arm_cspmu *cspmu)
+{
+	const struct ampere_cspmu_ctx *ctx = to_ampere_cspmu_ctx(cspmu);
+
+	return ctx->event_attr;
+}
+
+static struct attribute **
+ampere_cspmu_get_format_attrs(const struct arm_cspmu *cspmu)
+{
+	const struct ampere_cspmu_ctx *ctx = to_ampere_cspmu_ctx(cspmu);
+
+	return ctx->format_attr;
+}
+
+static const char *
+ampere_cspmu_get_name(const struct arm_cspmu *cspmu)
+{
+	const struct ampere_cspmu_ctx *ctx = to_ampere_cspmu_ctx(cspmu);
+
+	return ctx->name;
+}
+
+static u32 ampere_cspmu_event_filter(const struct perf_event *event)
+{
+	/*
+	 * PMEVFILTR or PMCCFILTR aren't used in Ampere SoC PMU but are marked
+	 * as RES0. Make sure, PMCCFILTR is written zero.
+	 */
+	return 0;
+}
+
+static void ampere_cspmu_set_ev_filter(struct arm_cspmu *cspmu,
+				       struct hw_perf_event *hwc,
+				       u32 filter)
+{
+	struct perf_event *event;
+	unsigned int idx;
+	u32 threshold, rank, bank;
+
+	/*
+	 * At this point, all the events have the same filter settings.
+	 * Therefore, take the first event and use its configuration.
+	 */
+	idx = find_first_bit(cspmu->hw_events.used_ctrs,
+			     cspmu->cycle_counter_logical_idx);
+
+	event = cspmu->hw_events.events[idx];
+
+	threshold	= get_threshold(event);
+	rank		= get_rank(event);
+	bank		= get_bank(event);
+
+	writel(threshold, cspmu->base0 + PMAUXR0);
+	writel(rank, cspmu->base0 + PMAUXR1);
+	writel(bank, cspmu->base0 + PMAUXR2);
+}
+
+static int ampere_cspmu_validate_configs(struct perf_event *event,
+					 struct perf_event *event2)
+{
+	if (get_threshold(event) != get_threshold(event2) ||
+	    get_rank(event) != get_rank(event2) ||
+	    get_bank(event) != get_bank(event2))
+		return -EINVAL;
+
+	return 0;
+}
+
+static int ampere_cspmu_validate_event(struct arm_cspmu *cspmu,
+				       struct perf_event *new)
+{
+	struct perf_event *curr, *leader = new->group_leader;
+	unsigned int idx;
+	int ret;
+
+	ret = ampere_cspmu_validate_configs(new, leader);
+	if (ret)
+		return ret;
+
+	/* We compare the global filter settings to the existing events */
+	idx = find_first_bit(cspmu->hw_events.used_ctrs,
+			     cspmu->cycle_counter_logical_idx);
+
+	/* This is the first event, thus any configuration is fine */
+	if (idx == cspmu->cycle_counter_logical_idx)
+		return 0;
+
+	curr = cspmu->hw_events.events[idx];
+
+	return ampere_cspmu_validate_configs(curr, new);
+}
+
+static char *ampere_cspmu_format_name(const struct arm_cspmu *cspmu,
+				      const char *name_pattern)
+{
+	struct device *dev = cspmu->dev;
+	int id;
+
+	id = ida_alloc(&mcu_pmu_ida, GFP_KERNEL);
+	if (id < 0)
+		return ERR_PTR(id);
+
+	return devm_kasprintf(dev, GFP_KERNEL, name_pattern, id);
+}
+
+static int ampere_cspmu_init_ops(struct arm_cspmu *cspmu)
+{
+	struct device *dev = cspmu->dev;
+	struct ampere_cspmu_ctx *ctx;
+	struct arm_cspmu_impl_ops *impl_ops = &cspmu->impl.ops;
+
+	ctx = devm_kzalloc(dev, sizeof(struct ampere_cspmu_ctx), GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+
+	ctx->event_attr	= ampereone_mcu_pmu_event_attrs;
+	ctx->format_attr = ampereone_mcu_format_attrs;
+	ctx->name = ampere_cspmu_format_name(cspmu, "ampere_mcu_pmu_%d");
+	if (IS_ERR_OR_NULL(ctx->name))
+		return ctx->name ? PTR_ERR(ctx->name) : -ENOMEM;
+
+	cspmu->impl.ctx = ctx;
+
+	impl_ops->event_filter		= ampere_cspmu_event_filter;
+	impl_ops->set_ev_filter		= ampere_cspmu_set_ev_filter;
+	impl_ops->validate_event	= ampere_cspmu_validate_event;
+	impl_ops->get_name		= ampere_cspmu_get_name;
+	impl_ops->get_event_attrs	= ampere_cspmu_get_event_attrs;
+	impl_ops->get_format_attrs	= ampere_cspmu_get_format_attrs;
+
+	return 0;
+}
+
+/* Match all Ampere Coresight PMU devices */
+static const struct arm_cspmu_impl_match ampere_cspmu_param = {
+	.pmiidr_val	= ARM_CSPMU_IMPL_ID_AMPERE,
+	.module		= THIS_MODULE,
+	.impl_init_ops	= ampere_cspmu_init_ops
+};
+
+static int __init ampere_cspmu_init(void)
+{
+	int ret;
+
+	ret = arm_cspmu_impl_register(&ampere_cspmu_param);
+	if (ret)
+		pr_err("ampere_cspmu backend registration error: %d\n", ret);
+
+	return ret;
+}
+
+static void __exit ampere_cspmu_exit(void)
+{
+	arm_cspmu_impl_unregister(&ampere_cspmu_param);
+}
+
+module_init(ampere_cspmu_init);
+module_exit(ampere_cspmu_exit);
+
+MODULE_LICENSE("GPL");
diff --git a/drivers/perf/arm_cspmu/arm_cspmu.c b/drivers/perf/arm_cspmu/arm_cspmu.c
index 1ba00d640352..0e3fe00d741d 100644
--- a/drivers/perf/arm_cspmu/arm_cspmu.c
+++ b/drivers/perf/arm_cspmu/arm_cspmu.c
@@ -383,6 +383,14 @@ static struct arm_cspmu_impl_match impl_match[] = {
 		.module		= NULL,
 		.impl_init_ops	= NULL,
 	},
+	{
+		.module_name	= "ampere_cspmu",
+		.pmiidr_val	= ARM_CSPMU_IMPL_ID_AMPERE,
+		.pmiidr_mask	= ARM_CSPMU_PMIIDR_IMPLEMENTER,
+		.module		= NULL,
+		.impl_init_ops	= NULL,
+	},
+
 	{0}
 };
 
diff --git a/drivers/perf/arm_cspmu/arm_cspmu.h b/drivers/perf/arm_cspmu/arm_cspmu.h
index a30c8372214c..2fe723555a6b 100644
--- a/drivers/perf/arm_cspmu/arm_cspmu.h
+++ b/drivers/perf/arm_cspmu/arm_cspmu.h
@@ -71,6 +71,7 @@
 
 /* JEDEC-assigned JEP106 identification code */
 #define ARM_CSPMU_IMPL_ID_NVIDIA	0x36B
+#define ARM_CSPMU_IMPL_ID_AMPERE	0xA16
 
 struct arm_cspmu;
 
-- 
cgit v1.2.3


From 0abe7f61c28d62ee0530c31589e6ea209aa82cbd Mon Sep 17 00:00:00 2001
From: Ilkka Koskinen <ilkka@os.amperecomputing.com>
Date: Thu, 12 Oct 2023 00:41:03 -0700
Subject: docs/perf: Add ampere_cspmu to toctree to fix a build warning

Add ampere_cspmu to toctree in order to address the following warning
produced when building documents:

	Documentation/admin-guide/perf/ampere_cspmu.rst: WARNING: document isn't included in any toctree

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Closes: https://lore.kernel.org/all/20231011172250.5a6498e5@canb.auug.org.au/
Fixes: 53a810ad3c5c ("perf: arm_cspmu: ampere_cspmu: Add support for Ampere SoC PMU")
Signed-off-by: Ilkka Koskinen <ilkka@os.amperecomputing.com>
Link: https://lore.kernel.org/r/20231012074103.3772114-1-ilkka@os.amperecomputing.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 Documentation/admin-guide/perf/index.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Documentation/admin-guide/perf/index.rst b/Documentation/admin-guide/perf/index.rst
index f60be04e4e33..a2e6f2c81146 100644
--- a/Documentation/admin-guide/perf/index.rst
+++ b/Documentation/admin-guide/perf/index.rst
@@ -22,3 +22,4 @@ Performance monitor support
    nvidia-pmu
    meson-ddr-pmu
    cxl
+   ampere_cspmu
-- 
cgit v1.2.3


From 65033574ade97afccba074d837fd269903a83a9a Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Thu, 5 Oct 2023 16:40:30 +0100
Subject: arm64: swiotlb: Reduce the default size if no ZONE_DMA bouncing
 needed

With CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC enabled, the arm64 kernel still
allocates the default SWIOTLB buffer (64MB) even if ZONE_DMA is disabled
or all the RAM fits into this zone. However, this potentially wastes a
non-negligible amount of memory on platforms with little RAM.

Reduce the SWIOTLB size to 1MB per 1GB of RAM if only needed for
kmalloc() buffer bouncing.

Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Suggested-by: Ross Burton <ross.burton@arm.com>
Cc: Ross Burton <ross.burton@arm.com>
Cc: Will Deacon <will@kernel.org>
Reviewed-by: Robin Murphy <robin.murphy@arm.com>
---
 arch/arm64/mm/init.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 8a0f8604348b..8deec68028ac 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -16,6 +16,7 @@
 #include <linux/nodemask.h>
 #include <linux/initrd.h>
 #include <linux/gfp.h>
+#include <linux/math.h>
 #include <linux/memblock.h>
 #include <linux/sort.h>
 #include <linux/of.h>
@@ -493,8 +494,16 @@ void __init mem_init(void)
 {
 	bool swiotlb = max_pfn > PFN_DOWN(arm64_dma_phys_limit);
 
-	if (IS_ENABLED(CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC))
+	if (IS_ENABLED(CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC) && !swiotlb) {
+		/*
+		 * If no bouncing needed for ZONE_DMA, reduce the swiotlb
+		 * buffer for kmalloc() bouncing to 1MB per 1GB of RAM.
+		 */
+		unsigned long size =
+			DIV_ROUND_UP(memblock_phys_mem_size(), 1024);
+		swiotlb_adjust_size(min(swiotlb_size_or_default(), size));
 		swiotlb = true;
+	}
 
 	swiotlb_init(swiotlb, SWIOTLB_VERBOSE);
 
-- 
cgit v1.2.3


From 338a835f40a849cd89b993e342bd9fbd5684825c Mon Sep 17 00:00:00 2001
From: Joey Gouly <joey.gouly@arm.com>
Date: Tue, 19 Sep 2023 17:27:56 +0100
Subject: arm64: add FEAT_LRCPC3 HWCAP

FEAT_LRCPC3 adds more instructions to support the Release Consistency model.
Add a HWCAP so that userspace can make decisions about instructions it can use.

Signed-off-by: Joey Gouly <joey.gouly@arm.com>
Cc: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20230919162757.2707023-2-joey.gouly@arm.com
[catalin.marinas@arm.com: change the HWCAP number]
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 Documentation/arch/arm64/elf_hwcaps.rst | 3 +++
 arch/arm64/include/asm/hwcap.h          | 1 +
 arch/arm64/include/uapi/asm/hwcap.h     | 1 +
 arch/arm64/kernel/cpufeature.c          | 1 +
 arch/arm64/kernel/cpuinfo.c             | 1 +
 arch/arm64/tools/sysreg                 | 1 +
 6 files changed, 8 insertions(+)

diff --git a/Documentation/arch/arm64/elf_hwcaps.rst b/Documentation/arch/arm64/elf_hwcaps.rst
index 2ad0a369d96a..a453f8430f7f 100644
--- a/Documentation/arch/arm64/elf_hwcaps.rst
+++ b/Documentation/arch/arm64/elf_hwcaps.rst
@@ -311,6 +311,9 @@ HWCAP2_HBC
 HWCAP2_SVE_B16B16
     Functionality implied by ID_AA64ZFR0_EL1.B16B16 == 0b0001.
 
+HWCAP2_LRCPC3
+    Functionality implied by ID_AA64ISAR1_EL1.LRCPC == 0b0011.
+
 4. Unused AT_HWCAP bits
 -----------------------
 
diff --git a/arch/arm64/include/asm/hwcap.h b/arch/arm64/include/asm/hwcap.h
index 210a41f8b10a..1c65f10619a0 100644
--- a/arch/arm64/include/asm/hwcap.h
+++ b/arch/arm64/include/asm/hwcap.h
@@ -140,6 +140,7 @@
 #define KERNEL_HWCAP_MOPS		__khwcap2_feature(MOPS)
 #define KERNEL_HWCAP_HBC		__khwcap2_feature(HBC)
 #define KERNEL_HWCAP_SVE_B16B16		__khwcap2_feature(SVE_B16B16)
+#define KERNEL_HWCAP_LRCPC3		__khwcap2_feature(LRCPC3)
 
 /*
  * This yields a mask that user programs can use to figure out what
diff --git a/arch/arm64/include/uapi/asm/hwcap.h b/arch/arm64/include/uapi/asm/hwcap.h
index 6faf549077c5..0f37944e98b6 100644
--- a/arch/arm64/include/uapi/asm/hwcap.h
+++ b/arch/arm64/include/uapi/asm/hwcap.h
@@ -105,5 +105,6 @@
 #define HWCAP2_MOPS		(1UL << 43)
 #define HWCAP2_HBC		(1UL << 44)
 #define HWCAP2_SVE_B16B16	(1UL << 45)
+#define HWCAP2_LRCPC3		(1UL << 46)
 
 #endif /* _UAPI__ASM_HWCAP_H */
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index a013dfd5b6e9..a83934291037 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -2809,6 +2809,7 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = {
 	HWCAP_CAP(ID_AA64ISAR1_EL1, FCMA, IMP, CAP_HWCAP, KERNEL_HWCAP_FCMA),
 	HWCAP_CAP(ID_AA64ISAR1_EL1, LRCPC, IMP, CAP_HWCAP, KERNEL_HWCAP_LRCPC),
 	HWCAP_CAP(ID_AA64ISAR1_EL1, LRCPC, LRCPC2, CAP_HWCAP, KERNEL_HWCAP_ILRCPC),
+	HWCAP_CAP(ID_AA64ISAR1_EL1, LRCPC, LRCPC3, CAP_HWCAP, KERNEL_HWCAP_LRCPC3),
 	HWCAP_CAP(ID_AA64ISAR1_EL1, FRINTTS, IMP, CAP_HWCAP, KERNEL_HWCAP_FRINT),
 	HWCAP_CAP(ID_AA64ISAR1_EL1, SB, IMP, CAP_HWCAP, KERNEL_HWCAP_SB),
 	HWCAP_CAP(ID_AA64ISAR1_EL1, BF16, IMP, CAP_HWCAP, KERNEL_HWCAP_BF16),
diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c
index ea2a31988103..0ba7e6cd8fee 100644
--- a/arch/arm64/kernel/cpuinfo.c
+++ b/arch/arm64/kernel/cpuinfo.c
@@ -128,6 +128,7 @@ static const char *const hwcap_str[] = {
 	[KERNEL_HWCAP_MOPS]		= "mops",
 	[KERNEL_HWCAP_HBC]		= "hbc",
 	[KERNEL_HWCAP_SVE_B16B16]	= "sveb16b16",
+	[KERNEL_HWCAP_LRCPC3]		= "lrcpc3",
 };
 
 #ifdef CONFIG_COMPAT
diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg
index bb69ab34202b..4794556e67e9 100644
--- a/arch/arm64/tools/sysreg
+++ b/arch/arm64/tools/sysreg
@@ -1309,6 +1309,7 @@ UnsignedEnum	23:20	LRCPC
 	0b0000	NI
 	0b0001	IMP
 	0b0010	LRCPC2
+	0b0011	LRCPC3
 EndEnum
 UnsignedEnum	19:16	FCMA
 	0b0000	NI
-- 
cgit v1.2.3


From 80652cc0c0485da593c1639a6355dcdab95364e6 Mon Sep 17 00:00:00 2001
From: Joey Gouly <joey.gouly@arm.com>
Date: Tue, 19 Sep 2023 17:27:57 +0100
Subject: selftests/arm64: add HWCAP2_LRCPC3 test

Add a test for the newly added HWCAP2_LRCPC3.

Signed-off-by: Joey Gouly <joey.gouly@arm.com>
Cc: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20230919162757.2707023-3-joey.gouly@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/abi/hwcap.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/tools/testing/selftests/arm64/abi/hwcap.c b/tools/testing/selftests/arm64/abi/hwcap.c
index d8a144213d04..a60f23794944 100644
--- a/tools/testing/selftests/arm64/abi/hwcap.c
+++ b/tools/testing/selftests/arm64/abi/hwcap.c
@@ -295,6 +295,19 @@ static void uscat_sigbus(void)
 	asm volatile(".inst 0xb820003f" : : : );
 }
 
+static void lrcpc3_sigill(void)
+{
+	int data[2] = { 1, 2 };
+
+	register int *src asm ("x0") = data;
+	register int data0 asm ("w2") = 0;
+	register int data1 asm ("w3") = 0;
+
+	/* LDIAPP w2, w3, [x0] */
+	asm volatile(".inst 0x99431802"
+	              : "=r" (data0), "=r" (data1) : "r" (src) :);
+}
+
 static const struct hwcap_data {
 	const char *name;
 	unsigned long at_hwcap;
@@ -354,6 +367,13 @@ static const struct hwcap_data {
 		.cpuinfo = "ilrcpc",
 		.sigill_fn = ilrcpc_sigill,
 	},
+	{
+		.name = "LRCPC3",
+		.at_hwcap = AT_HWCAP2,
+		.hwcap_bit = HWCAP2_LRCPC3,
+		.cpuinfo = "lrcpc3",
+		.sigill_fn = lrcpc3_sigill,
+	},
 	{
 		.name = "LSE",
 		.at_hwcap = AT_HWCAP,
-- 
cgit v1.2.3


From 94d0657f9f0d311489606589133ebf49e28104d8 Mon Sep 17 00:00:00 2001
From: Joey Gouly <joey.gouly@arm.com>
Date: Tue, 3 Oct 2023 13:45:43 +0100
Subject: arm64: add FEAT_LSE128 HWCAP

Add HWCAP for FEAT_LSE128 (128-bit Atomic instructions).

Signed-off-by: Joey Gouly <joey.gouly@arm.com>
Cc: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20231003124544.858804-2-joey.gouly@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 Documentation/arch/arm64/elf_hwcaps.rst | 3 +++
 arch/arm64/include/asm/hwcap.h          | 1 +
 arch/arm64/include/uapi/asm/hwcap.h     | 1 +
 arch/arm64/kernel/cpufeature.c          | 1 +
 arch/arm64/kernel/cpuinfo.c             | 1 +
 arch/arm64/tools/sysreg                 | 1 +
 6 files changed, 8 insertions(+)

diff --git a/Documentation/arch/arm64/elf_hwcaps.rst b/Documentation/arch/arm64/elf_hwcaps.rst
index a453f8430f7f..4b8399ac592b 100644
--- a/Documentation/arch/arm64/elf_hwcaps.rst
+++ b/Documentation/arch/arm64/elf_hwcaps.rst
@@ -314,6 +314,9 @@ HWCAP2_SVE_B16B16
 HWCAP2_LRCPC3
     Functionality implied by ID_AA64ISAR1_EL1.LRCPC == 0b0011.
 
+HWCAP2_LSE128
+    Functionality implied by ID_AA64ISAR0_EL1.Atomic == 0b0011.
+
 4. Unused AT_HWCAP bits
 -----------------------
 
diff --git a/arch/arm64/include/asm/hwcap.h b/arch/arm64/include/asm/hwcap.h
index 1c65f10619a0..cd71e09ea14d 100644
--- a/arch/arm64/include/asm/hwcap.h
+++ b/arch/arm64/include/asm/hwcap.h
@@ -141,6 +141,7 @@
 #define KERNEL_HWCAP_HBC		__khwcap2_feature(HBC)
 #define KERNEL_HWCAP_SVE_B16B16		__khwcap2_feature(SVE_B16B16)
 #define KERNEL_HWCAP_LRCPC3		__khwcap2_feature(LRCPC3)
+#define KERNEL_HWCAP_LSE128		__khwcap2_feature(LSE128)
 
 /*
  * This yields a mask that user programs can use to figure out what
diff --git a/arch/arm64/include/uapi/asm/hwcap.h b/arch/arm64/include/uapi/asm/hwcap.h
index 0f37944e98b6..5023599fa278 100644
--- a/arch/arm64/include/uapi/asm/hwcap.h
+++ b/arch/arm64/include/uapi/asm/hwcap.h
@@ -106,5 +106,6 @@
 #define HWCAP2_HBC		(1UL << 44)
 #define HWCAP2_SVE_B16B16	(1UL << 45)
 #define HWCAP2_LRCPC3		(1UL << 46)
+#define HWCAP2_LSE128		(1UL << 47)
 
 #endif /* _UAPI__ASM_HWCAP_H */
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index a83934291037..6b3ac96ef717 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -2789,6 +2789,7 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = {
 	HWCAP_CAP(ID_AA64ISAR0_EL1, SHA2, SHA512, CAP_HWCAP, KERNEL_HWCAP_SHA512),
 	HWCAP_CAP(ID_AA64ISAR0_EL1, CRC32, IMP, CAP_HWCAP, KERNEL_HWCAP_CRC32),
 	HWCAP_CAP(ID_AA64ISAR0_EL1, ATOMIC, IMP, CAP_HWCAP, KERNEL_HWCAP_ATOMICS),
+	HWCAP_CAP(ID_AA64ISAR0_EL1, ATOMIC, FEAT_LSE128, CAP_HWCAP, KERNEL_HWCAP_LSE128),
 	HWCAP_CAP(ID_AA64ISAR0_EL1, RDM, IMP, CAP_HWCAP, KERNEL_HWCAP_ASIMDRDM),
 	HWCAP_CAP(ID_AA64ISAR0_EL1, SHA3, IMP, CAP_HWCAP, KERNEL_HWCAP_SHA3),
 	HWCAP_CAP(ID_AA64ISAR0_EL1, SM3, IMP, CAP_HWCAP, KERNEL_HWCAP_SM3),
diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c
index 0ba7e6cd8fee..a257da7b56fe 100644
--- a/arch/arm64/kernel/cpuinfo.c
+++ b/arch/arm64/kernel/cpuinfo.c
@@ -129,6 +129,7 @@ static const char *const hwcap_str[] = {
 	[KERNEL_HWCAP_HBC]		= "hbc",
 	[KERNEL_HWCAP_SVE_B16B16]	= "sveb16b16",
 	[KERNEL_HWCAP_LRCPC3]		= "lrcpc3",
+	[KERNEL_HWCAP_LSE128]		= "lse128",
 };
 
 #ifdef CONFIG_COMPAT
diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg
index 4794556e67e9..96cbeeab4eec 100644
--- a/arch/arm64/tools/sysreg
+++ b/arch/arm64/tools/sysreg
@@ -1239,6 +1239,7 @@ EndEnum
 UnsignedEnum	23:20	ATOMIC
 	0b0000	NI
 	0b0010	IMP
+	0b0011	FEAT_LSE128
 EndEnum
 UnsignedEnum	19:16	CRC32
 	0b0000	NI
-- 
cgit v1.2.3


From 72e301956dbb31bd679eff66fbe3da32d2dc2af5 Mon Sep 17 00:00:00 2001
From: Joey Gouly <joey.gouly@arm.com>
Date: Tue, 3 Oct 2023 13:45:44 +0100
Subject: kselftest/arm64: add FEAT_LSE128 to hwcap test

Add test of a 128-bit atomic instruction for FEAT_LSE128.

Signed-off-by: Joey Gouly <joey.gouly@arm.com>
Cc: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20231003124544.858804-3-joey.gouly@arm.com
[catalin.marinas@arm.com: reordered lse128_sigill() alphabetically]
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/abi/hwcap.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/tools/testing/selftests/arm64/abi/hwcap.c b/tools/testing/selftests/arm64/abi/hwcap.c
index a60f23794944..1189e77c8152 100644
--- a/tools/testing/selftests/arm64/abi/hwcap.c
+++ b/tools/testing/selftests/arm64/abi/hwcap.c
@@ -81,6 +81,20 @@ static void lrcpc_sigill(void)
 	asm volatile(".inst 0xb8bfc3e0" : : : );
 }
 
+static void lse128_sigill(void)
+{
+	u64 __attribute__ ((aligned (16))) mem[2] = { 10, 20 };
+	register u64 *memp asm ("x0") = mem;
+	register u64 val0 asm ("x1") = 5;
+	register u64 val1 asm ("x2") = 4;
+
+	/* SWPP X1, X2, [X0] */
+	asm volatile(".inst 0x19228001"
+		     : "+r" (memp), "+r" (val0), "+r" (val1)
+		     :
+		     : "cc", "memory");
+}
+
 static void mops_sigill(void)
 {
 	char dst[1], src[1];
@@ -390,6 +404,13 @@ static const struct hwcap_data {
 		.sigbus_fn = uscat_sigbus,
 		.sigbus_reliable = true,
 	},
+	{
+		.name = "LSE128",
+		.at_hwcap = AT_HWCAP2,
+		.hwcap_bit = HWCAP2_LSE128,
+		.cpuinfo = "lse128",
+		.sigill_fn = lse128_sigill,
+	},
 	{
 		.name = "MOPS",
 		.at_hwcap = AT_HWCAP2,
-- 
cgit v1.2.3


From 166b76a073be8e1ffdc2c4db60f3abbbc974a3a3 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 16 Oct 2023 11:24:24 +0100
Subject: clocksource/drivers/arm_arch_timer: Initialize evtstrm after
 finalizing cpucaps

We attempt to initialize each CPU's arch_timer event stream in
arch_timer_evtstrm_enable(), which we call from the
arch_timer_starting_cpu() cpu hotplug callback which is registered early
in boot. As this is registered before we initialize the system cpucaps,
the test for ARM64_HAS_ECV will always be false for CPUs present at boot
time, and will only be taken into account for CPUs onlined late
(including those which are hotplugged out and in again).

Due to this, CPUs present and boot time may not use the intended divider
and scale factor to generate the event stream, and may differ from other
CPUs.

Correct this by only initializing the event stream after cpucaps have been
finalized, registering a separate CPU hotplug callback for the event stream
configuration. Since the caps must be finalized by this point, use
cpus_have_final_cap() to verify this.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Acked-by: Marc Zyngier <maz@kernel.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Daniel Lezcano <daniel.lezcano@linaro.org>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 drivers/clocksource/arm_arch_timer.c | 31 ++++++++++++++++++++++++++-----
 include/linux/cpuhotplug.h           |  1 +
 2 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c
index 7dd2c615bce2..d1e9e556da81 100644
--- a/drivers/clocksource/arm_arch_timer.c
+++ b/drivers/clocksource/arm_arch_timer.c
@@ -917,7 +917,7 @@ static void arch_timer_evtstrm_enable(unsigned int divider)
 
 #ifdef CONFIG_ARM64
 	/* ECV is likely to require a large divider. Use the EVNTIS flag. */
-	if (cpus_have_const_cap(ARM64_HAS_ECV) && divider > 15) {
+	if (cpus_have_final_cap(ARM64_HAS_ECV) && divider > 15) {
 		cntkctl |= ARCH_TIMER_EVT_INTERVAL_SCALE;
 		divider -= 8;
 	}
@@ -955,6 +955,30 @@ static void arch_timer_configure_evtstream(void)
 	arch_timer_evtstrm_enable(max(0, lsb));
 }
 
+static int arch_timer_evtstrm_starting_cpu(unsigned int cpu)
+{
+	arch_timer_configure_evtstream();
+	return 0;
+}
+
+static int arch_timer_evtstrm_dying_cpu(unsigned int cpu)
+{
+	cpumask_clear_cpu(smp_processor_id(), &evtstrm_available);
+	return 0;
+}
+
+static int __init arch_timer_evtstrm_register(void)
+{
+	if (!arch_timer_evt || !evtstrm_enable)
+		return 0;
+
+	return cpuhp_setup_state(CPUHP_AP_ARM_ARCH_TIMER_EVTSTRM_STARTING,
+				 "clockevents/arm/arch_timer_evtstrm:starting",
+				 arch_timer_evtstrm_starting_cpu,
+				 arch_timer_evtstrm_dying_cpu);
+}
+core_initcall(arch_timer_evtstrm_register);
+
 static void arch_counter_set_user_access(void)
 {
 	u32 cntkctl = arch_timer_get_cntkctl();
@@ -1016,8 +1040,6 @@ static int arch_timer_starting_cpu(unsigned int cpu)
 	}
 
 	arch_counter_set_user_access();
-	if (evtstrm_enable)
-		arch_timer_configure_evtstream();
 
 	return 0;
 }
@@ -1164,8 +1186,6 @@ static int arch_timer_dying_cpu(unsigned int cpu)
 {
 	struct clock_event_device *clk = this_cpu_ptr(arch_timer_evt);
 
-	cpumask_clear_cpu(smp_processor_id(), &evtstrm_available);
-
 	arch_timer_stop(clk);
 	return 0;
 }
@@ -1279,6 +1299,7 @@ out_unreg_notify:
 
 out_free:
 	free_percpu(arch_timer_evt);
+	arch_timer_evt = NULL;
 out:
 	return err;
 }
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 068f7738be22..bb4d0bcac81b 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -172,6 +172,7 @@ enum cpuhp_state {
 	CPUHP_AP_ARM_L2X0_STARTING,
 	CPUHP_AP_EXYNOS4_MCT_TIMER_STARTING,
 	CPUHP_AP_ARM_ARCH_TIMER_STARTING,
+	CPUHP_AP_ARM_ARCH_TIMER_EVTSTRM_STARTING,
 	CPUHP_AP_ARM_GLOBAL_TIMER_STARTING,
 	CPUHP_AP_JCORE_TIMER_STARTING,
 	CPUHP_AP_ARM_TWD_STARTING,
-- 
cgit v1.2.3


From 20f3b8eafe0ba5d3c69d5011a9b07739e9645132 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 16 Oct 2023 11:24:25 +0100
Subject: arm64/arm: xen: enlighten: Fix KPTI checks

When KPTI is in use, we cannot register a runstate region as XEN
requires that this is always a valid VA, which we cannot guarantee. Due
to this, xen_starting_cpu() must avoid registering each CPU's runstate
region, and xen_guest_init() must avoid setting up features that depend
upon it.

We tried to ensure that in commit:

  f88af7229f6f22ce (" xen/arm: do not setup the runstate info page if kpti is enabled")

... where we added checks for xen_kernel_unmapped_at_usr(), which wraps
arm64_kernel_unmapped_at_el0() on arm64 and is always false on 32-bit
arm.

Unfortunately, as xen_guest_init() is an early_initcall, this happens
before secondary CPUs are booted and arm64 has finalized the
ARM64_UNMAP_KERNEL_AT_EL0 cpucap which backs
arm64_kernel_unmapped_at_el0(), and so this can subsequently be set as
secondary CPUs are onlined. On a big.LITTLE system where the boot CPU
does not require KPTI but some secondary CPUs do, this will result in
xen_guest_init() intializing features that depend on the runstate
region, and xen_starting_cpu() registering the runstate region on some
CPUs before KPTI is subsequent enabled, resulting the the problems the
aforementioned commit tried to avoid.

Handle this more robsutly by deferring the initialization of the
runstate region until secondary CPUs have been initialized and the
ARM64_UNMAP_KERNEL_AT_EL0 cpucap has been finalized. The per-cpu work is
moved into a new hotplug starting function which is registered later
when we're certain that KPTI will not be used.

Fixes: f88af7229f6f ("xen/arm: do not setup the runstate info page if kpti is enabled")
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Bertrand Marquis <bertrand.marquis@arm.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Stefano Stabellini <sstabellini@kernel.org>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm/xen/enlighten.c   | 25 ++++++++++++++++---------
 include/linux/cpuhotplug.h |  1 +
 2 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/arch/arm/xen/enlighten.c b/arch/arm/xen/enlighten.c
index c392e18f1e43..9afdc4c4a5dc 100644
--- a/arch/arm/xen/enlighten.c
+++ b/arch/arm/xen/enlighten.c
@@ -164,9 +164,6 @@ static int xen_starting_cpu(unsigned int cpu)
 	BUG_ON(err);
 	per_cpu(xen_vcpu, cpu) = vcpup;
 
-	if (!xen_kernel_unmapped_at_usr())
-		xen_setup_runstate_info(cpu);
-
 after_register_vcpu_info:
 	enable_percpu_irq(xen_events_irq, 0);
 	return 0;
@@ -523,9 +520,6 @@ static int __init xen_guest_init(void)
 		return -EINVAL;
 	}
 
-	if (!xen_kernel_unmapped_at_usr())
-		xen_time_setup_guest();
-
 	if (xen_initial_domain())
 		pvclock_gtod_register_notifier(&xen_pvclock_gtod_notifier);
 
@@ -535,7 +529,13 @@ static int __init xen_guest_init(void)
 }
 early_initcall(xen_guest_init);
 
-static int __init xen_pm_init(void)
+static int xen_starting_runstate_cpu(unsigned int cpu)
+{
+	xen_setup_runstate_info(cpu);
+	return 0;
+}
+
+static int __init xen_late_init(void)
 {
 	if (!xen_domain())
 		return -ENODEV;
@@ -548,9 +548,16 @@ static int __init xen_pm_init(void)
 		do_settimeofday64(&ts);
 	}
 
-	return 0;
+	if (xen_kernel_unmapped_at_usr())
+		return 0;
+
+	xen_time_setup_guest();
+
+	return cpuhp_setup_state(CPUHP_AP_ARM_XEN_RUNSTATE_STARTING,
+				 "arm/xen_runstate:starting",
+				 xen_starting_runstate_cpu, NULL);
 }
-late_initcall(xen_pm_init);
+late_initcall(xen_late_init);
 
 
 /* empty stubs */
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index bb4d0bcac81b..d06e31e03a5e 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -190,6 +190,7 @@ enum cpuhp_state {
 	/* Must be the last timer callback */
 	CPUHP_AP_DUMMY_TIMER_STARTING,
 	CPUHP_AP_ARM_XEN_STARTING,
+	CPUHP_AP_ARM_XEN_RUNSTATE_STARTING,
 	CPUHP_AP_ARM_CORESIGHT_STARTING,
 	CPUHP_AP_ARM_CORESIGHT_CTI_STARTING,
 	CPUHP_AP_ARM64_ISNDEP_STARTING,
-- 
cgit v1.2.3


From 484de08518e43aa14f0d5719a99b19625ac12228 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 16 Oct 2023 11:24:26 +0100
Subject: arm64: Factor out cpucap definitions

For clarity it would be nice to factor cpucap manipulation out of
<asm/cpufeature.h>, and the obvious place would be <asm/cpucap.h>, but
this will clash somewhat with <generated/asm/cpucaps.h>.

Rename <generated/asm/cpucaps.h> to <generated/asm/cpucap-defs.h>,
matching what we do for <generated/asm/sysreg-defs.h>, and introduce a
new <asm/cpucaps.h> which includes the generated header.

Subsequent patches will fill out <asm/cpucaps.h>.

There should be no functional change as a result of this patch.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Mark Brown <broonie@kernel.org>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/Kbuild    | 2 +-
 arch/arm64/include/asm/cpucaps.h | 8 ++++++++
 arch/arm64/tools/Makefile        | 4 ++--
 arch/arm64/tools/gen-cpucaps.awk | 6 +++---
 4 files changed, 14 insertions(+), 6 deletions(-)
 create mode 100644 arch/arm64/include/asm/cpucaps.h

diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild
index 5c8ee5a541d2..4b6d2d52053e 100644
--- a/arch/arm64/include/asm/Kbuild
+++ b/arch/arm64/include/asm/Kbuild
@@ -6,5 +6,5 @@ generic-y += qspinlock.h
 generic-y += parport.h
 generic-y += user.h
 
-generated-y += cpucaps.h
+generated-y += cpucap-defs.h
 generated-y += sysreg-defs.h
diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h
new file mode 100644
index 000000000000..7333b5bbf448
--- /dev/null
+++ b/arch/arm64/include/asm/cpucaps.h
@@ -0,0 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef __ASM_CPUCAPS_H
+#define __ASM_CPUCAPS_H
+
+#include <asm/cpucap-defs.h>
+
+#endif /* __ASM_CPUCAPS_H */
diff --git a/arch/arm64/tools/Makefile b/arch/arm64/tools/Makefile
index 07a93ab21a62..fa2251d9762d 100644
--- a/arch/arm64/tools/Makefile
+++ b/arch/arm64/tools/Makefile
@@ -3,7 +3,7 @@
 gen := arch/$(ARCH)/include/generated
 kapi := $(gen)/asm
 
-kapi-hdrs-y := $(kapi)/cpucaps.h $(kapi)/sysreg-defs.h
+kapi-hdrs-y := $(kapi)/cpucap-defs.h $(kapi)/sysreg-defs.h
 
 targets += $(addprefix ../../../, $(kapi-hdrs-y))
 
@@ -17,7 +17,7 @@ quiet_cmd_gen_cpucaps = GEN     $@
 quiet_cmd_gen_sysreg = GEN     $@
       cmd_gen_sysreg = mkdir -p $(dir $@); $(AWK) -f $(real-prereqs) > $@
 
-$(kapi)/cpucaps.h: $(src)/gen-cpucaps.awk $(src)/cpucaps FORCE
+$(kapi)/cpucap-defs.h: $(src)/gen-cpucaps.awk $(src)/cpucaps FORCE
 	$(call if_changed,gen_cpucaps)
 
 $(kapi)/sysreg-defs.h: $(src)/gen-sysreg.awk $(src)/sysreg FORCE
diff --git a/arch/arm64/tools/gen-cpucaps.awk b/arch/arm64/tools/gen-cpucaps.awk
index 8525980379d7..2f4f61a0af17 100755
--- a/arch/arm64/tools/gen-cpucaps.awk
+++ b/arch/arm64/tools/gen-cpucaps.awk
@@ -15,8 +15,8 @@ function fatal(msg) {
 /^#/ { next }
 
 BEGIN {
-	print "#ifndef __ASM_CPUCAPS_H"
-	print "#define __ASM_CPUCAPS_H"
+	print "#ifndef __ASM_CPUCAP_DEFS_H"
+	print "#define __ASM_CPUCAP_DEFS_H"
 	print ""
 	print "/* Generated file - do not edit */"
 	cap_num = 0
@@ -31,7 +31,7 @@ BEGIN {
 END {
 	printf("#define ARM64_NCAPS\t\t\t\t\t%d\n", cap_num)
 	print ""
-	print "#endif /* __ASM_CPUCAPS_H */"
+	print "#endif /* __ASM_CPUCAP_DEFS_H */"
 }
 
 # Any lines not handled by previous rules are unexpected
-- 
cgit v1.2.3


From de66cb37ab6488d78d4b415504ad1fcd0910bae5 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 16 Oct 2023 11:24:27 +0100
Subject: arm64: Add cpucap_is_possible()

Many cpucaps can only be set when certain CONFIG_* options are selected,
and we need to check the CONFIG_* option before the cap in order to
avoid generating redundant code. Due to this, we have a growing number
of helpers in <asm/cpufeature.h> of the form:

| static __always_inline bool system_supports_foo(void)
| {
|         return IS_ENABLED(CONFIG_ARM64_FOO) &&
|                 cpus_have_const_cap(ARM64_HAS_FOO);
| }

This is unfortunate as it forces us to use cpus_have_const_cap()
unnecessarily, resulting in redundant code being generated by the
compiler. In the vast majority of cases, we only require that feature
checks indicate the presence of a feature after cpucaps have been
finalized, and so it would be sufficient to use alternative_has_cap_*().
However some code needs to handle a feature before alternatives have
been patched, and must test the system_cpucaps bitmap via
cpus_have_const_cap(). In other cases we'd like to check for
unintentional usage of a cpucap before alternatives are patched, and so
it would be preferable to use cpus_have_final_cap().

Placing the IS_ENABLED() checks in each callsite is tedious and
error-prone, and the same applies for writing wrappers for each
comination of cpucap and alternative_has_cap_*() / cpus_have_cap() /
cpus_have_final_cap(). It would be nicer if we could centralize the
knowledge of which cpucaps are possible, and have
alternative_has_cap_*(), cpus_have_cap(), and cpus_have_final_cap()
handle this automatically.

This patch adds a new cpucap_is_possible() function which will be
responsible for checking the CONFIG_* option, and updates the low-level
cpucap checks to use this. The existing CONFIG_* checks in
<asm/cpufeature.h> are moved over to cpucap_is_possible(), but the (now
trival) wrapper functions are retained for now.

There should be no functional change as a result of this patch alone.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Mark Brown <broonie@kernel.org>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/alternative-macros.h |  8 +++---
 arch/arm64/include/asm/cpucaps.h            | 41 +++++++++++++++++++++++++++++
 arch/arm64/include/asm/cpufeature.h         | 39 ++++++++++-----------------
 3 files changed, 59 insertions(+), 29 deletions(-)

diff --git a/arch/arm64/include/asm/alternative-macros.h b/arch/arm64/include/asm/alternative-macros.h
index 94b486192e1f..210bb43cff2c 100644
--- a/arch/arm64/include/asm/alternative-macros.h
+++ b/arch/arm64/include/asm/alternative-macros.h
@@ -226,8 +226,8 @@ alternative_endif
 static __always_inline bool
 alternative_has_cap_likely(const unsigned long cpucap)
 {
-	compiletime_assert(cpucap < ARM64_NCAPS,
-			   "cpucap must be < ARM64_NCAPS");
+	if (!cpucap_is_possible(cpucap))
+		return false;
 
 	asm_volatile_goto(
 	ALTERNATIVE_CB("b	%l[l_no]", %[cpucap], alt_cb_patch_nops)
@@ -244,8 +244,8 @@ l_no:
 static __always_inline bool
 alternative_has_cap_unlikely(const unsigned long cpucap)
 {
-	compiletime_assert(cpucap < ARM64_NCAPS,
-			   "cpucap must be < ARM64_NCAPS");
+	if (!cpucap_is_possible(cpucap))
+		return false;
 
 	asm_volatile_goto(
 	ALTERNATIVE("nop", "b	%l[l_yes]", %[cpucap])
diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h
index 7333b5bbf448..764ad4eef859 100644
--- a/arch/arm64/include/asm/cpucaps.h
+++ b/arch/arm64/include/asm/cpucaps.h
@@ -5,4 +5,45 @@
 
 #include <asm/cpucap-defs.h>
 
+#ifndef __ASSEMBLY__
+#include <linux/types.h>
+/*
+ * Check whether a cpucap is possible at compiletime.
+ */
+static __always_inline bool
+cpucap_is_possible(const unsigned int cap)
+{
+	compiletime_assert(__builtin_constant_p(cap),
+			   "cap must be a constant");
+	compiletime_assert(cap < ARM64_NCAPS,
+			   "cap must be < ARM64_NCAPS");
+
+	switch (cap) {
+	case ARM64_HAS_PAN:
+		return IS_ENABLED(CONFIG_ARM64_PAN);
+	case ARM64_SVE:
+		return IS_ENABLED(CONFIG_ARM64_SVE);
+	case ARM64_SME:
+	case ARM64_SME2:
+	case ARM64_SME_FA64:
+		return IS_ENABLED(CONFIG_ARM64_SME);
+	case ARM64_HAS_CNP:
+		return IS_ENABLED(CONFIG_ARM64_CNP);
+	case ARM64_HAS_ADDRESS_AUTH:
+	case ARM64_HAS_GENERIC_AUTH:
+		return IS_ENABLED(CONFIG_ARM64_PTR_AUTH);
+	case ARM64_HAS_GIC_PRIO_MASKING:
+		return IS_ENABLED(CONFIG_ARM64_PSEUDO_NMI);
+	case ARM64_MTE:
+		return IS_ENABLED(CONFIG_ARM64_MTE);
+	case ARM64_BTI:
+		return IS_ENABLED(CONFIG_ARM64_BTI);
+	case ARM64_HAS_TLB_RANGE:
+		return IS_ENABLED(CONFIG_ARM64_TLB_RANGE);
+	}
+
+	return true;
+}
+#endif /* __ASSEMBLY__ */
+
 #endif /* __ASM_CPUCAPS_H */
diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index 5bba39376055..577d9766e5ab 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -450,6 +450,8 @@ static __always_inline bool system_capabilities_finalized(void)
  */
 static __always_inline bool cpus_have_cap(unsigned int num)
 {
+	if (__builtin_constant_p(num) && !cpucap_is_possible(num))
+		return false;
 	if (num >= ARM64_NCAPS)
 		return false;
 	return arch_test_bit(num, system_cpucaps);
@@ -465,8 +467,6 @@ static __always_inline bool cpus_have_cap(unsigned int num)
  */
 static __always_inline bool __cpus_have_const_cap(int num)
 {
-	if (num >= ARM64_NCAPS)
-		return false;
 	return alternative_has_cap_unlikely(num);
 }
 
@@ -740,8 +740,7 @@ static __always_inline bool system_supports_fpsimd(void)
 
 static inline bool system_uses_hw_pan(void)
 {
-	return IS_ENABLED(CONFIG_ARM64_PAN) &&
-		cpus_have_const_cap(ARM64_HAS_PAN);
+	return cpus_have_const_cap(ARM64_HAS_PAN);
 }
 
 static inline bool system_uses_ttbr0_pan(void)
@@ -752,26 +751,22 @@ static inline bool system_uses_ttbr0_pan(void)
 
 static __always_inline bool system_supports_sve(void)
 {
-	return IS_ENABLED(CONFIG_ARM64_SVE) &&
-		cpus_have_const_cap(ARM64_SVE);
+	return cpus_have_const_cap(ARM64_SVE);
 }
 
 static __always_inline bool system_supports_sme(void)
 {
-	return IS_ENABLED(CONFIG_ARM64_SME) &&
-		cpus_have_const_cap(ARM64_SME);
+	return cpus_have_const_cap(ARM64_SME);
 }
 
 static __always_inline bool system_supports_sme2(void)
 {
-	return IS_ENABLED(CONFIG_ARM64_SME) &&
-		cpus_have_const_cap(ARM64_SME2);
+	return cpus_have_const_cap(ARM64_SME2);
 }
 
 static __always_inline bool system_supports_fa64(void)
 {
-	return IS_ENABLED(CONFIG_ARM64_SME) &&
-		cpus_have_const_cap(ARM64_SME_FA64);
+	return cpus_have_const_cap(ARM64_SME_FA64);
 }
 
 static __always_inline bool system_supports_tpidr2(void)
@@ -781,20 +776,17 @@ static __always_inline bool system_supports_tpidr2(void)
 
 static __always_inline bool system_supports_cnp(void)
 {
-	return IS_ENABLED(CONFIG_ARM64_CNP) &&
-		cpus_have_const_cap(ARM64_HAS_CNP);
+	return cpus_have_const_cap(ARM64_HAS_CNP);
 }
 
 static inline bool system_supports_address_auth(void)
 {
-	return IS_ENABLED(CONFIG_ARM64_PTR_AUTH) &&
-		cpus_have_const_cap(ARM64_HAS_ADDRESS_AUTH);
+	return cpus_have_const_cap(ARM64_HAS_ADDRESS_AUTH);
 }
 
 static inline bool system_supports_generic_auth(void)
 {
-	return IS_ENABLED(CONFIG_ARM64_PTR_AUTH) &&
-		cpus_have_const_cap(ARM64_HAS_GENERIC_AUTH);
+	return cpus_have_const_cap(ARM64_HAS_GENERIC_AUTH);
 }
 
 static inline bool system_has_full_ptr_auth(void)
@@ -804,14 +796,12 @@ static inline bool system_has_full_ptr_auth(void)
 
 static __always_inline bool system_uses_irq_prio_masking(void)
 {
-	return IS_ENABLED(CONFIG_ARM64_PSEUDO_NMI) &&
-	       cpus_have_const_cap(ARM64_HAS_GIC_PRIO_MASKING);
+	return cpus_have_const_cap(ARM64_HAS_GIC_PRIO_MASKING);
 }
 
 static inline bool system_supports_mte(void)
 {
-	return IS_ENABLED(CONFIG_ARM64_MTE) &&
-		cpus_have_const_cap(ARM64_MTE);
+	return cpus_have_const_cap(ARM64_MTE);
 }
 
 static inline bool system_has_prio_mask_debugging(void)
@@ -822,13 +812,12 @@ static inline bool system_has_prio_mask_debugging(void)
 
 static inline bool system_supports_bti(void)
 {
-	return IS_ENABLED(CONFIG_ARM64_BTI) && cpus_have_const_cap(ARM64_BTI);
+	return cpus_have_const_cap(ARM64_BTI);
 }
 
 static inline bool system_supports_tlb_range(void)
 {
-	return IS_ENABLED(CONFIG_ARM64_TLB_RANGE) &&
-		cpus_have_const_cap(ARM64_HAS_TLB_RANGE);
+	return cpus_have_const_cap(ARM64_HAS_TLB_RANGE);
 }
 
 int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt);
-- 
cgit v1.2.3


From 7bf46aa1c9cbd85e173a9ab8cbf6d7092c388947 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 16 Oct 2023 11:24:28 +0100
Subject: arm64: Add cpus_have_final_boot_cap()

The cpus_have_final_cap() function can be used to test a cpucap while
also verifying that we do not consume the cpucap until system
capabilities have been finalized. It would be helpful if we could do
likewise for boot cpucaps.

This patch adds a new cpus_have_final_boot_cap() helper which can be
used to test a cpucap while also verifying that boot capabilities have
been finalized. Users will be added in subsequent patches.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/cpufeature.h | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index 577d9766e5ab..ba7c60c9ac69 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -438,6 +438,11 @@ unsigned long cpu_get_elf_hwcap2(void);
 #define cpu_set_named_feature(name) cpu_set_feature(cpu_feature(name))
 #define cpu_have_named_feature(name) cpu_have_feature(cpu_feature(name))
 
+static __always_inline bool boot_capabilities_finalized(void)
+{
+	return alternative_has_cap_likely(ARM64_ALWAYS_BOOT);
+}
+
 static __always_inline bool system_capabilities_finalized(void)
 {
 	return alternative_has_cap_likely(ARM64_ALWAYS_SYSTEM);
@@ -473,8 +478,26 @@ static __always_inline bool __cpus_have_const_cap(int num)
 /*
  * Test for a capability without a runtime check.
  *
- * Before capabilities are finalized, this will BUG().
- * After capabilities are finalized, this is patched to avoid a runtime check.
+ * Before boot capabilities are finalized, this will BUG().
+ * After boot capabilities are finalized, this is patched to avoid a runtime
+ * check.
+ *
+ * @num must be a compile-time constant.
+ */
+static __always_inline bool cpus_have_final_boot_cap(int num)
+{
+	if (boot_capabilities_finalized())
+		return __cpus_have_const_cap(num);
+	else
+		BUG();
+}
+
+/*
+ * Test for a capability without a runtime check.
+ *
+ * Before system capabilities are finalized, this will BUG().
+ * After system capabilities are finalized, this is patched to avoid a runtime
+ * check.
  *
  * @num must be a compile-time constant.
  */
-- 
cgit v1.2.3


From 075f48c924be9e34816bb808eb66ba9105e75e5b Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 16 Oct 2023 11:24:29 +0100
Subject: arm64: Rework setup_cpu_features()

Currently setup_cpu_features() handles a mixture of one-time kernel
feature setup (e.g. cpucaps) and one-time user feature setup (e.g. ELF
hwcaps). Subsequent patches will rework other one-time setup and expand
the logic currently in setup_cpu_features(), and in preparation for this
it would be helpful to split the kernel and user setup into separate
functions.

This patch splits setup_user_features() out of setup_cpu_features(),
with a few additional cleanups of note:

* setup_cpu_features() is renamed to setup_system_features() to make it
  clear that it handles system-wide feature setup rather than cpu-local
  feature setup.

* setup_system_capabilities() is folded into setup_system_features().

* Presence of TTBR0 pan is logged immediately after
  update_cpu_capabilities(), so that this is guaranteed to appear
  alongside all the other detected system cpucaps.

* The 'cwg' variable is removed as its value is only consumed once and
  it's simpler to use cache_type_cwg() directly without assigning its
  return value to a variable.

* The call to setup_user_features() is moved after alternatives are
  patched, which will allow user feature setup code to depend on
  alternative branches and allow for simplifications in subsequent
  patches.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Mark Brown <broonie@kernel.org>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/cpufeature.h |  4 +++-
 arch/arm64/kernel/cpufeature.c      | 43 ++++++++++++++++++-------------------
 arch/arm64/kernel/smp.c             |  3 ++-
 3 files changed, 26 insertions(+), 24 deletions(-)

diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index ba7c60c9ac69..1233a8ff96a8 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -649,7 +649,9 @@ static inline bool id_aa64pfr1_mte(u64 pfr1)
 	return val >= ID_AA64PFR1_EL1_MTE_MTE2;
 }
 
-void __init setup_cpu_features(void);
+void __init setup_system_features(void);
+void __init setup_user_features(void);
+
 void check_local_cpu_capabilities(void);
 
 u64 read_sanitised_ftr_reg(u32 id);
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 444a73c2e638..234cf3189bee 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -3328,23 +3328,35 @@ unsigned long cpu_get_elf_hwcap2(void)
 	return elf_hwcap[1];
 }
 
-static void __init setup_system_capabilities(void)
+void __init setup_system_features(void)
 {
 	/*
-	 * We have finalised the system-wide safe feature
-	 * registers, finalise the capabilities that depend
-	 * on it. Also enable all the available capabilities,
-	 * that are not enabled already.
+	 * The system-wide safe feature feature register values have been
+	 * finalized. Finalize and log the available system capabilities.
 	 */
 	update_cpu_capabilities(SCOPE_SYSTEM);
+	if (system_uses_ttbr0_pan())
+		pr_info("emulated: Privileged Access Never (PAN) using TTBR0_EL1 switching\n");
+
+	/*
+	 * Enable all the available capabilities which have not been enabled
+	 * already.
+	 */
 	enable_cpu_capabilities(SCOPE_ALL & ~SCOPE_BOOT_CPU);
+
+	sve_setup();
+	sme_setup();
+
+	/*
+	 * Check for sane CTR_EL0.CWG value.
+	 */
+	if (!cache_type_cwg())
+		pr_warn("No Cache Writeback Granule information, assuming %d\n",
+			ARCH_DMA_MINALIGN);
 }
 
-void __init setup_cpu_features(void)
+void __init setup_user_features(void)
 {
-	u32 cwg;
-
-	setup_system_capabilities();
 	setup_elf_hwcaps(arm64_elf_hwcaps);
 
 	if (system_supports_32bit_el0()) {
@@ -3352,20 +3364,7 @@ void __init setup_cpu_features(void)
 		elf_hwcap_fixup();
 	}
 
-	if (system_uses_ttbr0_pan())
-		pr_info("emulated: Privileged Access Never (PAN) using TTBR0_EL1 switching\n");
-
-	sve_setup();
-	sme_setup();
 	minsigstksz_setup();
-
-	/*
-	 * Check for sane CTR_EL0.CWG value.
-	 */
-	cwg = cache_type_cwg();
-	if (!cwg)
-		pr_warn("No Cache Writeback Granule information, assuming %d\n",
-			ARCH_DMA_MINALIGN);
 }
 
 static int enable_mismatched_32bit_el0(unsigned int cpu)
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 960b98b43506..e0cdf6820f9e 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -431,9 +431,10 @@ static void __init hyp_mode_check(void)
 void __init smp_cpus_done(unsigned int max_cpus)
 {
 	pr_info("SMP: Total of %d processors activated.\n", num_online_cpus());
-	setup_cpu_features();
+	setup_system_features();
 	hyp_mode_check();
 	apply_alternatives_all();
+	setup_user_features();
 	mark_linear_text_alias_ro();
 }
 
-- 
cgit v1.2.3


From 7f632d331d4706cdbfcaa217b98e1df9b5b5719b Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 16 Oct 2023 11:24:30 +0100
Subject: arm64: Fixup user features at boot time

For ARM64_WORKAROUND_2658417, we use a cpu_enable() callback to hide the
ID_AA64ISAR1_EL1.BF16 ID register field. This is a little awkward as
CPUs may attempt to apply the workaround concurrently, requiring that we
protect the bulk of the callback with a raw_spinlock, and requiring some
pointless work every time a CPU is subsequently hotplugged in.

This patch makes this a little simpler by handling the masking once at
boot time. A new user_feature_fixup() function is called at the start of
setup_user_features() to mask the feature, matching the style of
elf_hwcap_fixup(). The ARM64_WORKAROUND_2658417 cpucap is added to
cpucap_is_possible() so that code can be elided entirely when this is
not possible.

Note that the ARM64_WORKAROUND_2658417 capability is matched with
ERRATA_MIDR_RANGE(), which implicitly gives the capability a
ARM64_CPUCAP_LOCAL_CPU_ERRATUM type, which forbids the late onlining of
a CPU with the erratum if the erratum was not present at boot time.
Therefore this patch doesn't change the behaviour for late onlining.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: James Morse <james.morse@arm.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/cpucaps.h |  2 ++
 arch/arm64/kernel/cpu_errata.c   | 17 -----------------
 arch/arm64/kernel/cpufeature.c   | 13 +++++++++++++
 3 files changed, 15 insertions(+), 17 deletions(-)

diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h
index 764ad4eef859..07c9271b534d 100644
--- a/arch/arm64/include/asm/cpucaps.h
+++ b/arch/arm64/include/asm/cpucaps.h
@@ -40,6 +40,8 @@ cpucap_is_possible(const unsigned int cap)
 		return IS_ENABLED(CONFIG_ARM64_BTI);
 	case ARM64_HAS_TLB_RANGE:
 		return IS_ENABLED(CONFIG_ARM64_TLB_RANGE);
+	case ARM64_WORKAROUND_2658417:
+		return IS_ENABLED(CONFIG_ARM64_ERRATUM_2658417);
 	}
 
 	return true;
diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index be66e94a21bd..86aed329d42c 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -121,22 +121,6 @@ cpu_enable_cache_maint_trap(const struct arm64_cpu_capabilities *__unused)
 	sysreg_clear_set(sctlr_el1, SCTLR_EL1_UCI, 0);
 }
 
-static DEFINE_RAW_SPINLOCK(reg_user_mask_modification);
-static void __maybe_unused
-cpu_clear_bf16_from_user_emulation(const struct arm64_cpu_capabilities *__unused)
-{
-	struct arm64_ftr_reg *regp;
-
-	regp = get_arm64_ftr_reg(SYS_ID_AA64ISAR1_EL1);
-	if (!regp)
-		return;
-
-	raw_spin_lock(&reg_user_mask_modification);
-	if (regp->user_mask & ID_AA64ISAR1_EL1_BF16_MASK)
-		regp->user_mask &= ~ID_AA64ISAR1_EL1_BF16_MASK;
-	raw_spin_unlock(&reg_user_mask_modification);
-}
-
 #define CAP_MIDR_RANGE(model, v_min, r_min, v_max, r_max)	\
 	.matches = is_affected_midr_range,			\
 	.midr_range = MIDR_RANGE(model, v_min, r_min, v_max, r_max)
@@ -727,7 +711,6 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
 		/* Cortex-A510 r0p0 - r1p1 */
 		ERRATA_MIDR_RANGE(MIDR_CORTEX_A510, 0, 0, 1, 1),
 		MIDR_FIXED(MIDR_CPU_VAR_REV(1,1), BIT(25)),
-		.cpu_enable = cpu_clear_bf16_from_user_emulation,
 	},
 #endif
 #ifdef CONFIG_AMPERE_ERRATUM_AC03_CPU_38
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 234cf3189bee..46508399796f 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -2190,6 +2190,17 @@ static void cpu_enable_mte(struct arm64_cpu_capabilities const *cap)
 }
 #endif /* CONFIG_ARM64_MTE */
 
+static void user_feature_fixup(void)
+{
+	if (cpus_have_cap(ARM64_WORKAROUND_2658417)) {
+		struct arm64_ftr_reg *regp;
+
+		regp = get_arm64_ftr_reg(SYS_ID_AA64ISAR1_EL1);
+		if (regp)
+			regp->user_mask &= ~ID_AA64ISAR1_EL1_BF16_MASK;
+	}
+}
+
 static void elf_hwcap_fixup(void)
 {
 #ifdef CONFIG_ARM64_ERRATUM_1742098
@@ -3357,6 +3368,8 @@ void __init setup_system_features(void)
 
 void __init setup_user_features(void)
 {
+	user_feature_fixup();
+
 	setup_elf_hwcaps(arm64_elf_hwcaps);
 
 	if (system_supports_32bit_el0()) {
-- 
cgit v1.2.3


From 42c5a3b04bf672292b2da7b51745728609a29691 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 16 Oct 2023 11:24:31 +0100
Subject: arm64: Split kpti_install_ng_mappings()

The arm64_cpu_capabilities::cpu_enable callbacks are intended for
cpu-local feature enablement (e.g. poking system registers). These get
called for each online CPU when boot/system cpucaps get finalized and
enabled, and get called whenever a CPU is subsequently onlined.

For KPTI with the ARM64_UNMAP_KERNEL_AT_EL0 cpucap, we use the
kpti_install_ng_mappings() function as the cpu_enable callback. This
does a mixture of cpu-local configuration (setting VBAR_EL1 to the
appropriate trampoline vectors) and some global configuration (rewriting
the swapper page tables to sue non-glboal mappings) that must happen at
most once.

This patch splits kpti_install_ng_mappings() into a cpu-local
cpu_enable_kpti() initialization function and a system-wide
kpti_install_ng_mappings() function. The cpu_enable_kpti() function is
responsible for selecting the necessary cpu-local vectors each time a
CPU is onlined, and the kpti_install_ng_mappings() function performs the
one-time rewrite of the translation tables too use non-global mappings.
Splitting the two makes the code a bit easier to follow and also allows
the page table rewriting code to be marked as __init such that it can be
freed after use.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/cpufeature.c | 54 ++++++++++++++++++++++++++----------------
 1 file changed, 33 insertions(+), 21 deletions(-)

diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 46508399796f..5add7d06469d 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -1754,16 +1754,15 @@ void create_kpti_ng_temp_pgd(pgd_t *pgdir, phys_addr_t phys, unsigned long virt,
 			     phys_addr_t size, pgprot_t prot,
 			     phys_addr_t (*pgtable_alloc)(int), int flags);
 
-static phys_addr_t kpti_ng_temp_alloc;
+static phys_addr_t __initdata kpti_ng_temp_alloc;
 
-static phys_addr_t kpti_ng_pgd_alloc(int shift)
+static phys_addr_t __init kpti_ng_pgd_alloc(int shift)
 {
 	kpti_ng_temp_alloc -= PAGE_SIZE;
 	return kpti_ng_temp_alloc;
 }
 
-static void
-kpti_install_ng_mappings(const struct arm64_cpu_capabilities *__unused)
+static int __init __kpti_install_ng_mappings(void *__unused)
 {
 	typedef void (kpti_remap_fn)(int, int, phys_addr_t, unsigned long);
 	extern kpti_remap_fn idmap_kpti_install_ng_mappings;
@@ -1776,20 +1775,6 @@ kpti_install_ng_mappings(const struct arm64_cpu_capabilities *__unused)
 	pgd_t *kpti_ng_temp_pgd;
 	u64 alloc = 0;
 
-	if (__this_cpu_read(this_cpu_vector) == vectors) {
-		const char *v = arm64_get_bp_hardening_vector(EL1_VECTOR_KPTI);
-
-		__this_cpu_write(this_cpu_vector, v);
-	}
-
-	/*
-	 * We don't need to rewrite the page-tables if either we've done
-	 * it already or we have KASLR enabled and therefore have not
-	 * created any global mappings at all.
-	 */
-	if (arm64_use_ng_mappings)
-		return;
-
 	remap_fn = (void *)__pa_symbol(idmap_kpti_install_ng_mappings);
 
 	if (!cpu) {
@@ -1826,14 +1811,39 @@ kpti_install_ng_mappings(const struct arm64_cpu_capabilities *__unused)
 		free_pages(alloc, order);
 		arm64_use_ng_mappings = true;
 	}
+
+	return 0;
+}
+
+static void __init kpti_install_ng_mappings(void)
+{
+	/*
+	 * We don't need to rewrite the page-tables if either we've done
+	 * it already or we have KASLR enabled and therefore have not
+	 * created any global mappings at all.
+	 */
+	if (arm64_use_ng_mappings)
+		return;
+
+	stop_machine(__kpti_install_ng_mappings, NULL, cpu_online_mask);
 }
+
 #else
-static void
-kpti_install_ng_mappings(const struct arm64_cpu_capabilities *__unused)
+static inline void kpti_install_ng_mappings(void)
 {
 }
 #endif	/* CONFIG_UNMAP_KERNEL_AT_EL0 */
 
+static void cpu_enable_kpti(struct arm64_cpu_capabilities const *cap)
+{
+	if (__this_cpu_read(this_cpu_vector) == vectors) {
+		const char *v = arm64_get_bp_hardening_vector(EL1_VECTOR_KPTI);
+
+		__this_cpu_write(this_cpu_vector, v);
+	}
+
+}
+
 static int __init parse_kpti(char *str)
 {
 	bool enabled;
@@ -2362,7 +2372,7 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
 		.desc = "Kernel page table isolation (KPTI)",
 		.capability = ARM64_UNMAP_KERNEL_AT_EL0,
 		.type = ARM64_CPUCAP_BOOT_RESTRICTED_CPU_LOCAL_FEATURE,
-		.cpu_enable = kpti_install_ng_mappings,
+		.cpu_enable = cpu_enable_kpti,
 		.matches = unmap_kernel_at_el0,
 		/*
 		 * The ID feature fields below are used to indicate that
@@ -3355,6 +3365,8 @@ void __init setup_system_features(void)
 	 */
 	enable_cpu_capabilities(SCOPE_ALL & ~SCOPE_BOOT_CPU);
 
+	kpti_install_ng_mappings();
+
 	sve_setup();
 	sme_setup();
 
-- 
cgit v1.2.3


From d8569fba13850bbf46a172fcee63df2130cf4ecc Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 16 Oct 2023 11:24:32 +0100
Subject: arm64: kvm: Use cpus_have_final_cap() explicitly

Much of the arm64 KVM code uses cpus_have_const_cap() to check for
cpucaps, but this is unnecessary and it would be preferable to use
cpus_have_final_cap().

For historical reasons, cpus_have_const_cap() is more complicated than
it needs to be. Before cpucaps are finalized, it will perform a bitmap
test of the system_cpucaps bitmap, and once cpucaps are finalized it
will use an alternative branch. This used to be necessary to handle some
race conditions in the window between cpucap detection and the
subsequent patching of alternatives and static branches, where different
branches could be out-of-sync with one another (or w.r.t. alternative
sequences). Now that we use alternative branches instead of static
branches, these are all patched atomically w.r.t. one another, and there
are only a handful of cases that need special care in the window between
cpucap detection and alternative patching.

Due to the above, it would be nice to remove cpus_have_const_cap(), and
migrate callers over to alternative_has_cap_*(), cpus_have_final_cap(),
or cpus_have_cap() depending on when their requirements. This will
remove redundant instructions and improve code generation, and will make
it easier to determine how each callsite will behave before, during, and
after alternative patching.

KVM is initialized after cpucaps have been finalized and alternatives
have been patched. Since commit:

  d86de40decaa14e6 ("arm64: cpufeature: upgrade hyp caps to final")

... use of cpus_have_const_cap() in hyp code is automatically converted
to use cpus_have_final_cap():

| static __always_inline bool cpus_have_const_cap(int num)
| {
| 	if (is_hyp_code())
| 		return cpus_have_final_cap(num);
| 	else if (system_capabilities_finalized())
| 		return __cpus_have_const_cap(num);
| 	else
| 		return cpus_have_cap(num);
| }

Thus, converting hyp code to use cpus_have_final_cap() directly will not
result in any functional change.

Non-hyp KVM code is also not executed until cpucaps have been finalized,
and it would be preferable to extent the same treatment to this code and
use cpus_have_final_cap() directly.

This patch converts instances of cpus_have_const_cap() in KVM-only code
over to cpus_have_final_cap(). As all of this code runs after cpucaps
have been finalized, there should be no functional change as a result of
this patch, but the redundant instructions generated by
cpus_have_const_cap() will be removed from the non-hyp KVM code.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Marc Zyngier <maz@kernel.org>
Cc: Oliver Upton <oliver.upton@linux.dev>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/kvm_emulate.h |  4 ++--
 arch/arm64/include/asm/kvm_host.h    |  2 +-
 arch/arm64/include/asm/kvm_mmu.h     |  2 +-
 arch/arm64/kvm/arm.c                 | 10 +++++-----
 arch/arm64/kvm/guest.c               |  4 ++--
 arch/arm64/kvm/hyp/pgtable.c         |  2 +-
 arch/arm64/kvm/mmu.c                 |  2 +-
 arch/arm64/kvm/sys_regs.c            |  2 +-
 arch/arm64/kvm/vgic/vgic-v3.c        |  2 +-
 9 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index 3d6725ff0bf6..cbd2f163a67d 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -71,14 +71,14 @@ static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu)
 	vcpu->arch.hcr_el2 = HCR_GUEST_FLAGS;
 	if (has_vhe() || has_hvhe())
 		vcpu->arch.hcr_el2 |= HCR_E2H;
-	if (cpus_have_const_cap(ARM64_HAS_RAS_EXTN)) {
+	if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN)) {
 		/* route synchronous external abort exceptions to EL2 */
 		vcpu->arch.hcr_el2 |= HCR_TEA;
 		/* trap error record accesses */
 		vcpu->arch.hcr_el2 |= HCR_TERR;
 	}
 
-	if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) {
+	if (cpus_have_final_cap(ARM64_HAS_STAGE2_FWB)) {
 		vcpu->arch.hcr_el2 |= HCR_FWB;
 	} else {
 		/*
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index af06ccb7ee34..e64d64e6ad44 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -1052,7 +1052,7 @@ static inline void kvm_init_host_cpu_context(struct kvm_cpu_context *cpu_ctxt)
 
 static inline bool kvm_system_needs_idmapped_vectors(void)
 {
-	return cpus_have_const_cap(ARM64_SPECTRE_V3A);
+	return cpus_have_final_cap(ARM64_SPECTRE_V3A);
 }
 
 static inline void kvm_arch_sync_events(struct kvm *kvm) {}
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 96a80e8f6226..27810667dec7 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -218,7 +218,7 @@ static inline void __clean_dcache_guest_page(void *va, size_t size)
 	 * faulting in pages. Furthermore, FWB implies IDC, so cleaning to
 	 * PoU is not required either in this case.
 	 */
-	if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
+	if (cpus_have_final_cap(ARM64_HAS_STAGE2_FWB))
 		return;
 
 	kvm_flush_dcache_to_poc(va, size);
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 4866b3f7b4ea..4ea6c22250a5 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -284,7 +284,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 		r = kvm_arm_pvtime_supported();
 		break;
 	case KVM_CAP_ARM_EL1_32BIT:
-		r = cpus_have_const_cap(ARM64_HAS_32BIT_EL1);
+		r = cpus_have_final_cap(ARM64_HAS_32BIT_EL1);
 		break;
 	case KVM_CAP_GUEST_DEBUG_HW_BPS:
 		r = get_num_brps();
@@ -296,7 +296,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 		r = kvm_arm_support_pmu_v3();
 		break;
 	case KVM_CAP_ARM_INJECT_SERROR_ESR:
-		r = cpus_have_const_cap(ARM64_HAS_RAS_EXTN);
+		r = cpus_have_final_cap(ARM64_HAS_RAS_EXTN);
 		break;
 	case KVM_CAP_ARM_VM_IPA_SIZE:
 		r = get_kvm_ipa_limit();
@@ -1207,7 +1207,7 @@ static int kvm_vcpu_init_check_features(struct kvm_vcpu *vcpu,
 	if (!test_bit(KVM_ARM_VCPU_EL1_32BIT, &features))
 		return 0;
 
-	if (!cpus_have_const_cap(ARM64_HAS_32BIT_EL1))
+	if (!cpus_have_final_cap(ARM64_HAS_32BIT_EL1))
 		return -EINVAL;
 
 	/* MTE is incompatible with AArch32 */
@@ -1777,7 +1777,7 @@ static void hyp_install_host_vector(void)
 	 * Call initialization code, and switch to the full blown HYP code.
 	 * If the cpucaps haven't been finalized yet, something has gone very
 	 * wrong, and hyp will crash and burn when it uses any
-	 * cpus_have_const_cap() wrapper.
+	 * cpus_have_*_cap() wrapper.
 	 */
 	BUG_ON(!system_capabilities_finalized());
 	params = this_cpu_ptr_nvhe_sym(kvm_init_params);
@@ -2310,7 +2310,7 @@ static int __init init_hyp_mode(void)
 
 	if (is_protected_kvm_enabled()) {
 		if (IS_ENABLED(CONFIG_ARM64_PTR_AUTH_KERNEL) &&
-		    cpus_have_const_cap(ARM64_HAS_ADDRESS_AUTH))
+		    cpus_have_final_cap(ARM64_HAS_ADDRESS_AUTH))
 			pkvm_hyp_init_ptrauth();
 
 		init_cpu_logical_map();
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index 95f6945c4432..a83c1d56a294 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -815,7 +815,7 @@ int __kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu,
 			      struct kvm_vcpu_events *events)
 {
 	events->exception.serror_pending = !!(vcpu->arch.hcr_el2 & HCR_VSE);
-	events->exception.serror_has_esr = cpus_have_const_cap(ARM64_HAS_RAS_EXTN);
+	events->exception.serror_has_esr = cpus_have_final_cap(ARM64_HAS_RAS_EXTN);
 
 	if (events->exception.serror_pending && events->exception.serror_has_esr)
 		events->exception.serror_esr = vcpu_get_vsesr(vcpu);
@@ -837,7 +837,7 @@ int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
 	bool ext_dabt_pending = events->exception.ext_dabt_pending;
 
 	if (serror_pending && has_esr) {
-		if (!cpus_have_const_cap(ARM64_HAS_RAS_EXTN))
+		if (!cpus_have_final_cap(ARM64_HAS_RAS_EXTN))
 			return -EINVAL;
 
 		if (!((events->exception.serror_esr) & ~ESR_ELx_ISS_MASK))
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index f155b8c9e98c..799d2c204bb8 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -664,7 +664,7 @@ u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift)
 
 static bool stage2_has_fwb(struct kvm_pgtable *pgt)
 {
-	if (!cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
+	if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB))
 		return false;
 
 	return !(pgt->flags & KVM_PGTABLE_S2_NOFWB);
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 482280fe22d7..e6061fd174b0 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1578,7 +1578,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 
 	if (device)
 		prot |= KVM_PGTABLE_PROT_DEVICE;
-	else if (cpus_have_const_cap(ARM64_HAS_CACHE_DIC))
+	else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC))
 		prot |= KVM_PGTABLE_PROT_X;
 
 	/*
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index e92ec810d449..9318b6939b78 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -207,7 +207,7 @@ static bool access_dcsw(struct kvm_vcpu *vcpu,
 	 * CPU left in the system, and certainly not from non-secure
 	 * software).
 	 */
-	if (!cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
+	if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB))
 		kvm_set_way_flush(vcpu);
 
 	return true;
diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
index 3dfc8b84e03e..9465d3706ab9 100644
--- a/arch/arm64/kvm/vgic/vgic-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-v3.c
@@ -684,7 +684,7 @@ int vgic_v3_probe(const struct gic_kvm_info *info)
 	if (kvm_vgic_global_state.vcpu_base == 0)
 		kvm_info("disabling GICv2 emulation\n");
 
-	if (cpus_have_const_cap(ARM64_WORKAROUND_CAVIUM_30115)) {
+	if (cpus_have_final_cap(ARM64_WORKAROUND_CAVIUM_30115)) {
 		group0_trap = true;
 		group1_trap = true;
 	}
-- 
cgit v1.2.3


From bc9bbb78801af1bd155cda6a30ecaa720ef2bab7 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 16 Oct 2023 11:24:33 +0100
Subject: arm64: Explicitly save/restore CPACR when probing SVE and SME

When a CPUs onlined we first probe for supported features and
propetites, and then we subsequently enable features that have been
detected. This is a little problematic for SVE and SME, as some
properties (e.g. vector lengths) cannot be probed while they are
disabled. Due to this, the code probing for SVE properties has to enable
SVE for EL1 prior to proving, and the code probing for SME properties
has to enable SME for EL1 prior to probing. We never disable SVE or SME
for EL1 after probing.

It would be a little nicer to transiently enable SVE and SME during
probing, leaving them both disabled unless explicitly enabled, as this
would make it much easier to catch unintentional usage (e.g. when they
are not present system-wide).

This patch reworks the SVE and SME feature probing code to only
transiently enable support at EL1, disabling after probing is complete.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Reviewed-by: Mark Brown <broonie@kernel.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/fpsimd.h | 26 ++++++++++++++++++++++++++
 arch/arm64/kernel/cpufeature.c  | 24 ++++++++++++++++++++++--
 arch/arm64/kernel/fpsimd.c      |  7 ++-----
 3 files changed, 50 insertions(+), 7 deletions(-)

diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h
index 8df46f186c64..bd7bea92dae0 100644
--- a/arch/arm64/include/asm/fpsimd.h
+++ b/arch/arm64/include/asm/fpsimd.h
@@ -32,6 +32,32 @@
 #define VFP_STATE_SIZE		((32 * 8) + 4)
 #endif
 
+static inline unsigned long cpacr_save_enable_kernel_sve(void)
+{
+	unsigned long old = read_sysreg(cpacr_el1);
+	unsigned long set = CPACR_EL1_FPEN_EL1EN | CPACR_EL1_ZEN_EL1EN;
+
+	write_sysreg(old | set, cpacr_el1);
+	isb();
+	return old;
+}
+
+static inline unsigned long cpacr_save_enable_kernel_sme(void)
+{
+	unsigned long old = read_sysreg(cpacr_el1);
+	unsigned long set = CPACR_EL1_FPEN_EL1EN | CPACR_EL1_SMEN_EL1EN;
+
+	write_sysreg(old | set, cpacr_el1);
+	isb();
+	return old;
+}
+
+static inline void cpacr_restore(unsigned long cpacr)
+{
+	write_sysreg(cpacr, cpacr_el1);
+	isb();
+}
+
 /*
  * When we defined the maximum SVE vector length we defined the ABI so
  * that the maximum vector length included all the reserved for future
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 5add7d06469d..e3e0d3d3bd6b 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -1040,13 +1040,19 @@ void __init init_cpu_features(struct cpuinfo_arm64 *info)
 
 	if (IS_ENABLED(CONFIG_ARM64_SVE) &&
 	    id_aa64pfr0_sve(read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1))) {
+		unsigned long cpacr = cpacr_save_enable_kernel_sve();
+
 		info->reg_zcr = read_zcr_features();
 		init_cpu_ftr_reg(SYS_ZCR_EL1, info->reg_zcr);
 		vec_init_vq_map(ARM64_VEC_SVE);
+
+		cpacr_restore(cpacr);
 	}
 
 	if (IS_ENABLED(CONFIG_ARM64_SME) &&
 	    id_aa64pfr1_sme(read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1))) {
+		unsigned long cpacr = cpacr_save_enable_kernel_sme();
+
 		info->reg_smcr = read_smcr_features();
 		/*
 		 * We mask out SMPS since even if the hardware
@@ -1056,6 +1062,8 @@ void __init init_cpu_features(struct cpuinfo_arm64 *info)
 		info->reg_smidr = read_cpuid(SMIDR_EL1) & ~SMIDR_EL1_SMPS;
 		init_cpu_ftr_reg(SYS_SMCR_EL1, info->reg_smcr);
 		vec_init_vq_map(ARM64_VEC_SME);
+
+		cpacr_restore(cpacr);
 	}
 
 	if (id_aa64pfr1_mte(info->reg_id_aa64pfr1))
@@ -1291,6 +1299,8 @@ void update_cpu_features(int cpu,
 
 	if (IS_ENABLED(CONFIG_ARM64_SVE) &&
 	    id_aa64pfr0_sve(read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1))) {
+		unsigned long cpacr = cpacr_save_enable_kernel_sve();
+
 		info->reg_zcr = read_zcr_features();
 		taint |= check_update_ftr_reg(SYS_ZCR_EL1, cpu,
 					info->reg_zcr, boot->reg_zcr);
@@ -1298,10 +1308,14 @@ void update_cpu_features(int cpu,
 		/* Probe vector lengths */
 		if (!system_capabilities_finalized())
 			vec_update_vq_map(ARM64_VEC_SVE);
+
+		cpacr_restore(cpacr);
 	}
 
 	if (IS_ENABLED(CONFIG_ARM64_SME) &&
 	    id_aa64pfr1_sme(read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1))) {
+		unsigned long cpacr = cpacr_save_enable_kernel_sme();
+
 		info->reg_smcr = read_smcr_features();
 		/*
 		 * We mask out SMPS since even if the hardware
@@ -1315,6 +1329,8 @@ void update_cpu_features(int cpu,
 		/* Probe vector lengths */
 		if (!system_capabilities_finalized())
 			vec_update_vq_map(ARM64_VEC_SME);
+
+		cpacr_restore(cpacr);
 	}
 
 	/*
@@ -3174,6 +3190,8 @@ static void verify_local_elf_hwcaps(void)
 
 static void verify_sve_features(void)
 {
+	unsigned long cpacr = cpacr_save_enable_kernel_sve();
+
 	u64 safe_zcr = read_sanitised_ftr_reg(SYS_ZCR_EL1);
 	u64 zcr = read_zcr_features();
 
@@ -3186,11 +3204,13 @@ static void verify_sve_features(void)
 		cpu_die_early();
 	}
 
-	/* Add checks on other ZCR bits here if necessary */
+	cpacr_restore(cpacr);
 }
 
 static void verify_sme_features(void)
 {
+	unsigned long cpacr = cpacr_save_enable_kernel_sme();
+
 	u64 safe_smcr = read_sanitised_ftr_reg(SYS_SMCR_EL1);
 	u64 smcr = read_smcr_features();
 
@@ -3203,7 +3223,7 @@ static void verify_sme_features(void)
 		cpu_die_early();
 	}
 
-	/* Add checks on other SMCR bits here if necessary */
+	cpacr_restore(cpacr);
 }
 
 static void verify_hyp_capabilities(void)
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index 91e44ac7150f..601b973f90ad 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -1174,7 +1174,7 @@ void sve_kernel_enable(const struct arm64_cpu_capabilities *__always_unused p)
  * Read the pseudo-ZCR used by cpufeatures to identify the supported SVE
  * vector length.
  *
- * Use only if SVE is present.
+ * Use only if SVE is present and enabled.
  * This function clobbers the SVE vector length.
  */
 u64 read_zcr_features(void)
@@ -1183,7 +1183,6 @@ u64 read_zcr_features(void)
 	 * Set the maximum possible VL, and write zeroes to all other
 	 * bits to see if they stick.
 	 */
-	sve_kernel_enable(NULL);
 	write_sysreg_s(ZCR_ELx_LEN_MASK, SYS_ZCR_EL1);
 
 	/* Return LEN value that would be written to get the maximum VL */
@@ -1337,13 +1336,11 @@ void fa64_kernel_enable(const struct arm64_cpu_capabilities *__always_unused p)
  * Read the pseudo-SMCR used by cpufeatures to identify the supported
  * vector length.
  *
- * Use only if SME is present.
+ * Use only if SME is present and enabled.
  * This function clobbers the SME vector length.
  */
 u64 read_smcr_features(void)
 {
-	sme_kernel_enable(NULL);
-
 	/*
 	 * Set the maximum possible VL.
 	 */
-- 
cgit v1.2.3


From 907722917002ed06b8f75e119b83842d5f63b15e Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 16 Oct 2023 11:24:34 +0100
Subject: arm64: Use build-time assertions for cpucap ordering

Both sme2_kernel_enable() and fa64_kernel_enable() need to run after
sme_kernel_enable(). This happens to be true today as ARM64_SME has a
lower index than either ARM64_SME2 or ARM64_SME_FA64, and both functions
have a comment to this effect.

It would be nicer to have a build-time assertion like we for for
can_use_gic_priorities() and has_gic_prio_relaxed_sync(), as that way
it will be harder to miss any potential breakage.

This patch replaces the comments with build-time assertions.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/fpsimd.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index 601b973f90ad..48338cf8f90b 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -1310,23 +1310,21 @@ void sme_kernel_enable(const struct arm64_cpu_capabilities *__always_unused p)
 	isb();
 }
 
-/*
- * This must be called after sme_kernel_enable(), we rely on the
- * feature table being sorted to ensure this.
- */
 void sme2_kernel_enable(const struct arm64_cpu_capabilities *__always_unused p)
 {
+	/* This must be enabled after SME */
+	BUILD_BUG_ON(ARM64_SME2 <= ARM64_SME);
+
 	/* Allow use of ZT0 */
 	write_sysreg_s(read_sysreg_s(SYS_SMCR_EL1) | SMCR_ELx_EZT0_MASK,
 		       SYS_SMCR_EL1);
 }
 
-/*
- * This must be called after sme_kernel_enable(), we rely on the
- * feature table being sorted to ensure this.
- */
 void fa64_kernel_enable(const struct arm64_cpu_capabilities *__always_unused p)
 {
+	/* This must be enabled after SME */
+	BUILD_BUG_ON(ARM64_SME_FA64 <= ARM64_SME);
+
 	/* Allow use of FA64 */
 	write_sysreg_s(read_sysreg_s(SYS_SMCR_EL1) | SMCR_ELx_FA64_MASK,
 		       SYS_SMCR_EL1);
-- 
cgit v1.2.3


From 14567ba42c5747f50584eb463ac8029f2a30e0d1 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 16 Oct 2023 11:24:35 +0100
Subject: arm64: Rename SVE/SME cpu_enable functions

The arm64_cpu_capabilities::cpu_enable() callbacks for SVE, SME, SME2,
and FA64 are named with an unusual "${feature}_kernel_enable" pattern
rather than the much more common "cpu_enable_${feature}". Now that we
only use these as cpu_enable() callbacks, it would be nice to have them
match the usual scheme.

This patch renames the cpu_enable() callbacks to match this scheme. At
the same time, the comment above cpu_enable_sve() is removed for
consistency with the other cpu_enable() callbacks.

There should be no functional change as a result of this patch.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Mark Brown <broonie@kernel.org>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/fpsimd.h |  8 ++++----
 arch/arm64/kernel/cpufeature.c  |  8 ++++----
 arch/arm64/kernel/fpsimd.c      | 12 ++++--------
 3 files changed, 12 insertions(+), 16 deletions(-)

diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h
index bd7bea92dae0..0cbaa06b394a 100644
--- a/arch/arm64/include/asm/fpsimd.h
+++ b/arch/arm64/include/asm/fpsimd.h
@@ -149,10 +149,10 @@ extern void sme_save_state(void *state, int zt);
 extern void sme_load_state(void const *state, int zt);
 
 struct arm64_cpu_capabilities;
-extern void sve_kernel_enable(const struct arm64_cpu_capabilities *__unused);
-extern void sme_kernel_enable(const struct arm64_cpu_capabilities *__unused);
-extern void sme2_kernel_enable(const struct arm64_cpu_capabilities *__unused);
-extern void fa64_kernel_enable(const struct arm64_cpu_capabilities *__unused);
+extern void cpu_enable_sve(const struct arm64_cpu_capabilities *__unused);
+extern void cpu_enable_sme(const struct arm64_cpu_capabilities *__unused);
+extern void cpu_enable_sme2(const struct arm64_cpu_capabilities *__unused);
+extern void cpu_enable_fa64(const struct arm64_cpu_capabilities *__unused);
 
 extern u64 read_zcr_features(void);
 extern u64 read_smcr_features(void);
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index e3e0d3d3bd6b..fb828f8c49e3 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -2425,7 +2425,7 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
 		.desc = "Scalable Vector Extension",
 		.type = ARM64_CPUCAP_SYSTEM_FEATURE,
 		.capability = ARM64_SVE,
-		.cpu_enable = sve_kernel_enable,
+		.cpu_enable = cpu_enable_sve,
 		.matches = has_cpuid_feature,
 		ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, SVE, IMP)
 	},
@@ -2678,7 +2678,7 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
 		.type = ARM64_CPUCAP_SYSTEM_FEATURE,
 		.capability = ARM64_SME,
 		.matches = has_cpuid_feature,
-		.cpu_enable = sme_kernel_enable,
+		.cpu_enable = cpu_enable_sme,
 		ARM64_CPUID_FIELDS(ID_AA64PFR1_EL1, SME, IMP)
 	},
 	/* FA64 should be sorted after the base SME capability */
@@ -2687,7 +2687,7 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
 		.type = ARM64_CPUCAP_SYSTEM_FEATURE,
 		.capability = ARM64_SME_FA64,
 		.matches = has_cpuid_feature,
-		.cpu_enable = fa64_kernel_enable,
+		.cpu_enable = cpu_enable_fa64,
 		ARM64_CPUID_FIELDS(ID_AA64SMFR0_EL1, FA64, IMP)
 	},
 	{
@@ -2695,7 +2695,7 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
 		.type = ARM64_CPUCAP_SYSTEM_FEATURE,
 		.capability = ARM64_SME2,
 		.matches = has_cpuid_feature,
-		.cpu_enable = sme2_kernel_enable,
+		.cpu_enable = cpu_enable_sme2,
 		ARM64_CPUID_FIELDS(ID_AA64PFR1_EL1, SME, SME2)
 	},
 #endif /* CONFIG_ARM64_SME */
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index 48338cf8f90b..45ea9cabbaa4 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -1160,11 +1160,7 @@ fail:
 	panic("Cannot allocate percpu memory for EFI SVE save/restore");
 }
 
-/*
- * Enable SVE for EL1.
- * Intended for use by the cpufeatures code during CPU boot.
- */
-void sve_kernel_enable(const struct arm64_cpu_capabilities *__always_unused p)
+void cpu_enable_sve(const struct arm64_cpu_capabilities *__always_unused p)
 {
 	write_sysreg(read_sysreg(CPACR_EL1) | CPACR_EL1_ZEN_EL1EN, CPACR_EL1);
 	isb();
@@ -1295,7 +1291,7 @@ static void sme_free(struct task_struct *task)
 	task->thread.sme_state = NULL;
 }
 
-void sme_kernel_enable(const struct arm64_cpu_capabilities *__always_unused p)
+void cpu_enable_sme(const struct arm64_cpu_capabilities *__always_unused p)
 {
 	/* Set priority for all PEs to architecturally defined minimum */
 	write_sysreg_s(read_sysreg_s(SYS_SMPRI_EL1) & ~SMPRI_EL1_PRIORITY_MASK,
@@ -1310,7 +1306,7 @@ void sme_kernel_enable(const struct arm64_cpu_capabilities *__always_unused p)
 	isb();
 }
 
-void sme2_kernel_enable(const struct arm64_cpu_capabilities *__always_unused p)
+void cpu_enable_sme2(const struct arm64_cpu_capabilities *__always_unused p)
 {
 	/* This must be enabled after SME */
 	BUILD_BUG_ON(ARM64_SME2 <= ARM64_SME);
@@ -1320,7 +1316,7 @@ void sme2_kernel_enable(const struct arm64_cpu_capabilities *__always_unused p)
 		       SYS_SMCR_EL1);
 }
 
-void fa64_kernel_enable(const struct arm64_cpu_capabilities *__always_unused p)
+void cpu_enable_fa64(const struct arm64_cpu_capabilities *__always_unused p)
 {
 	/* This must be enabled after SME */
 	BUILD_BUG_ON(ARM64_SME_FA64 <= ARM64_SME);
-- 
cgit v1.2.3


From 34f66c4c4d5518c11bfb7d10defff8f814c9f28a Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 16 Oct 2023 11:24:36 +0100
Subject: arm64: Use a positive cpucap for FP/SIMD

Currently we have a negative cpucap which describes the *absence* of
FP/SIMD rather than *presence* of FP/SIMD. This largely works, but is
somewhat awkward relative to other cpucaps that describe the presence of
a feature, and it would be nicer to have a cpucap which describes the
presence of FP/SIMD:

* This will allow the cpucap to be treated as a standard
  ARM64_CPUCAP_SYSTEM_FEATURE, which can be detected with the standard
  has_cpuid_feature() function and ARM64_CPUID_FIELDS() description.

* This ensures that the cpucap will only transition from not-present to
  present, reducing the risk of unintentional and/or unsafe usage of
  FP/SIMD before cpucaps are finalized.

* This will allow using arm64_cpu_capabilities::cpu_enable() to enable
  the use of FP/SIMD later, with FP/SIMD being disabled at boot time
  otherwise. This will ensure that any unintentional and/or unsafe usage
  of FP/SIMD prior to this is trapped, and will ensure that FP/SIMD is
  never unintentionally enabled for userspace in mismatched big.LITTLE
  systems.

This patch replaces the negative ARM64_HAS_NO_FPSIMD cpucap with a
positive ARM64_HAS_FPSIMD cpucap, making changes as described above.
Note that as FP/SIMD will now be trapped when not supported system-wide,
do_fpsimd_acc() must handle these traps in the same way as for SVE and
SME. The commentary in fpsimd_restore_current_state() is updated to
describe the new scheme.

No users of system_supports_fpsimd() need to know that FP/SIMD is
available prior to alternatives being patched, so this is updated to
use alternative_has_cap_likely() to check for the ARM64_HAS_FPSIMD
cpucap, without generating code to test the system_cpucaps bitmap.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Mark Brown <broonie@kernel.org>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/cpufeature.h |  2 +-
 arch/arm64/include/asm/fpsimd.h     |  1 +
 arch/arm64/kernel/cpufeature.c      | 18 +++++----------
 arch/arm64/kernel/fpsimd.c          | 44 +++++++++++++++++++++++++++++--------
 arch/arm64/mm/proc.S                |  3 +--
 arch/arm64/tools/cpucaps            |  2 +-
 6 files changed, 44 insertions(+), 26 deletions(-)

diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index 1233a8ff96a8..a1698945916e 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -760,7 +760,7 @@ static inline bool system_supports_mixed_endian(void)
 
 static __always_inline bool system_supports_fpsimd(void)
 {
-	return !cpus_have_const_cap(ARM64_HAS_NO_FPSIMD);
+	return alternative_has_cap_likely(ARM64_HAS_FPSIMD);
 }
 
 static inline bool system_uses_hw_pan(void)
diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h
index 0cbaa06b394a..c43ae9c013ec 100644
--- a/arch/arm64/include/asm/fpsimd.h
+++ b/arch/arm64/include/asm/fpsimd.h
@@ -149,6 +149,7 @@ extern void sme_save_state(void *state, int zt);
 extern void sme_load_state(void const *state, int zt);
 
 struct arm64_cpu_capabilities;
+extern void cpu_enable_fpsimd(const struct arm64_cpu_capabilities *__unused);
 extern void cpu_enable_sve(const struct arm64_cpu_capabilities *__unused);
 extern void cpu_enable_sme(const struct arm64_cpu_capabilities *__unused);
 extern void cpu_enable_sme2(const struct arm64_cpu_capabilities *__unused);
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index fb828f8c49e3..c493fa582a19 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -1580,14 +1580,6 @@ static bool has_no_hw_prefetch(const struct arm64_cpu_capabilities *entry, int _
 		MIDR_CPU_VAR_REV(1, MIDR_REVISION_MASK));
 }
 
-static bool has_no_fpsimd(const struct arm64_cpu_capabilities *entry, int __unused)
-{
-	u64 pfr0 = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1);
-
-	return cpuid_feature_extract_signed_field(pfr0,
-					ID_AA64PFR0_EL1_FP_SHIFT) < 0;
-}
-
 static bool has_cache_idc(const struct arm64_cpu_capabilities *entry,
 			  int scope)
 {
@@ -2398,11 +2390,11 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
 		ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, CSV3, IMP)
 	},
 	{
-		/* FP/SIMD is not implemented */
-		.capability = ARM64_HAS_NO_FPSIMD,
-		.type = ARM64_CPUCAP_BOOT_RESTRICTED_CPU_LOCAL_FEATURE,
-		.min_field_value = 0,
-		.matches = has_no_fpsimd,
+		.capability = ARM64_HAS_FPSIMD,
+		.type = ARM64_CPUCAP_SYSTEM_FEATURE,
+		.matches = has_cpuid_feature,
+		.cpu_enable = cpu_enable_fpsimd,
+		ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, FP, IMP)
 	},
 #ifdef CONFIG_ARM64_PMEM
 	{
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index 45ea9cabbaa4..d286bcfc3f41 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -1520,8 +1520,17 @@ void do_sme_acc(unsigned long esr, struct pt_regs *regs)
  */
 void do_fpsimd_acc(unsigned long esr, struct pt_regs *regs)
 {
-	/* TODO: implement lazy context saving/restoring */
-	WARN_ON(1);
+	/* Even if we chose not to use FPSIMD, the hardware could still trap: */
+	if (!system_supports_fpsimd()) {
+		force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0);
+		return;
+	}
+
+	/*
+	 * When FPSIMD is enabled, we should never take a trap unless something
+	 * has gone very wrong.
+	 */
+	BUG();
 }
 
 /*
@@ -1762,13 +1771,23 @@ void fpsimd_bind_state_to_cpu(struct cpu_fp_state *state)
 void fpsimd_restore_current_state(void)
 {
 	/*
-	 * For the tasks that were created before we detected the absence of
-	 * FP/SIMD, the TIF_FOREIGN_FPSTATE could be set via fpsimd_thread_switch(),
-	 * e.g, init. This could be then inherited by the children processes.
-	 * If we later detect that the system doesn't support FP/SIMD,
-	 * we must clear the flag for  all the tasks to indicate that the
-	 * FPSTATE is clean (as we can't have one) to avoid looping for ever in
-	 * do_notify_resume().
+	 * TIF_FOREIGN_FPSTATE is set on the init task and copied by
+	 * arch_dup_task_struct() regardless of whether FP/SIMD is detected.
+	 * Thus user threads can have this set even when FP/SIMD hasn't been
+	 * detected.
+	 *
+	 * When FP/SIMD is detected, begin_new_exec() will set
+	 * TIF_FOREIGN_FPSTATE via flush_thread() -> fpsimd_flush_thread(),
+	 * and fpsimd_thread_switch() will set TIF_FOREIGN_FPSTATE when
+	 * switching tasks. We detect FP/SIMD before we exec the first user
+	 * process, ensuring this has TIF_FOREIGN_FPSTATE set and
+	 * do_notify_resume() will call fpsimd_restore_current_state() to
+	 * install the user FP/SIMD context.
+	 *
+	 * When FP/SIMD is not detected, nothing else will clear or set
+	 * TIF_FOREIGN_FPSTATE prior to the first return to userspace, and
+	 * we must clear TIF_FOREIGN_FPSTATE to avoid do_notify_resume()
+	 * looping forever calling fpsimd_restore_current_state().
 	 */
 	if (!system_supports_fpsimd()) {
 		clear_thread_flag(TIF_FOREIGN_FPSTATE);
@@ -2101,6 +2120,13 @@ static inline void fpsimd_hotplug_init(void)
 static inline void fpsimd_hotplug_init(void) { }
 #endif
 
+void cpu_enable_fpsimd(const struct arm64_cpu_capabilities *__always_unused p)
+{
+	unsigned long enable = CPACR_EL1_FPEN_EL1EN | CPACR_EL1_FPEN_EL0EN;
+	write_sysreg(read_sysreg(CPACR_EL1) | enable, CPACR_EL1);
+	isb();
+}
+
 /*
  * FP/SIMD support code initialisation.
  */
diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
index 14fdf645edc8..f66c37a1610e 100644
--- a/arch/arm64/mm/proc.S
+++ b/arch/arm64/mm/proc.S
@@ -405,8 +405,7 @@ SYM_FUNC_START(__cpu_setup)
 	tlbi	vmalle1				// Invalidate local TLB
 	dsb	nsh
 
-	mov	x1, #3 << 20
-	msr	cpacr_el1, x1			// Enable FP/ASIMD
+	msr	cpacr_el1, xzr			// Reset cpacr_el1
 	mov	x1, #1 << 12			// Reset mdscr_el1 and disable
 	msr	mdscr_el1, x1			// access to the DCC from EL0
 	isb					// Unmask debug exceptions now,
diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps
index c3f06fdef609..1adf562c914d 100644
--- a/arch/arm64/tools/cpucaps
+++ b/arch/arm64/tools/cpucaps
@@ -27,6 +27,7 @@ HAS_ECV_CNTPOFF
 HAS_EPAN
 HAS_EVT
 HAS_FGT
+HAS_FPSIMD
 HAS_GENERIC_AUTH
 HAS_GENERIC_AUTH_ARCH_QARMA3
 HAS_GENERIC_AUTH_ARCH_QARMA5
@@ -39,7 +40,6 @@ HAS_LDAPR
 HAS_LSE_ATOMICS
 HAS_MOPS
 HAS_NESTED_VIRT
-HAS_NO_FPSIMD
 HAS_NO_HW_PREFETCH
 HAS_PAN
 HAS_S1PIE
-- 
cgit v1.2.3


From 7f0387cf76b1d026a5f0e0386a05c4cdac3a99af Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 16 Oct 2023 11:24:37 +0100
Subject: arm64: Avoid cpus_have_const_cap() for
 ARM64_HAS_{ADDRESS,GENERIC}_AUTH

In system_supports_address_auth() and system_supports_generic_auth() we
use cpus_have_const_cap to check for ARM64_HAS_ADDRESS_AUTH and
ARM64_HAS_GENERIC_AUTH respectively, but this is not necessary and
alternative_has_cap_*() would bre preferable.

For historical reasons, cpus_have_const_cap() is more complicated than
it needs to be. Before cpucaps are finalized, it will perform a bitmap
test of the system_cpucaps bitmap, and once cpucaps are finalized it
will use an alternative branch. This used to be necessary to handle some
race conditions in the window between cpucap detection and the
subsequent patching of alternatives and static branches, where different
branches could be out-of-sync with one another (or w.r.t. alternative
sequences). Now that we use alternative branches instead of static
branches, these are all patched atomically w.r.t. one another, and there
are only a handful of cases that need special care in the window between
cpucap detection and alternative patching.

Due to the above, it would be nice to remove cpus_have_const_cap(), and
migrate callers over to alternative_has_cap_*(), cpus_have_final_cap(),
or cpus_have_cap() depending on when their requirements. This will
remove redundant instructions and improve code generation, and will make
it easier to determine how each callsite will behave before, during, and
after alternative patching.

The ARM64_HAS_ADDRESS_AUTH cpucap is a boot cpu feature which is
detected and patched early on the boot CPU before any pointer
authentication keys are enabled via their respective SCTLR_ELx.EN* bits.
Nothing which uses system_supports_address_auth() is called before the
boot alternatives are patched. Thus it is safe for
system_supports_address_auth() to use cpus_have_final_boot_cap() to
check for ARM64_HAS_ADDRESS_AUTH.

The ARM64_HAS_GENERIC_AUTH cpucap is a system feature which is detected
on all CPUs, then finalized and patched under
setup_system_capabilities(). We use system_supports_generic_auth() in a
few places:

* The pac_generic_keys_get() and pac_generic_keys_set() functions are
  only reachable from system calls once userspace is up and running. As
  cpucaps are finalzied long before userspace runs, these can safely use
  alternative_has_cap_*() or cpus_have_final_cap().

* The ptrauth_prctl_reset_keys() function is only reachable from system
  calls once userspace is up and running. As cpucaps are finalized long
  before userspace runs, this can safely use alternative_has_cap_*() or
  cpus_have_final_cap().

* The ptrauth_keys_install_user() function is used during
  context-switch. This is called prior to alternatives being applied,
  and so cannot use cpus_have_final_cap(), but as this only needs to
  switch the APGA key for userspace tasks, it's safe to use
  alternative_has_cap_*().

* The ptrauth_keys_init_user() function is used to initialize userspace
  keys, and is only reachable after system cpucaps have been finalized
  and patched. Thus this can safely use alternative_has_cap_*() or
  cpus_have_final_cap().

* The system_has_full_ptr_auth() helper function is only used by KVM
  code, which is only reachable after system cpucaps have been finalized
  and patched. Thus this can safely use alternative_has_cap_*() or
  cpus_have_final_cap().

This patch modifies system_supports_address_auth() to use
cpus_have_final_boot_cap() to check ARM64_HAS_ADDRESS_AUTH, and modifies
system_supports_generic_auth() to use alternative_has_cap_unlikely() to
check ARM64_HAS_GENERIC_AUTH. In either case this will avoid generating
code to test the system_cpucaps bitmap and should be better for all
subsequent calls at runtime. The use of cpus_have_final_boot_cap() will
make it easier to spot if code is chaanged such that these run before
the relevant cpucap is guaranteed to have been finalized.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/cpufeature.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index a1698945916e..dfdedbdcc115 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -806,12 +806,12 @@ static __always_inline bool system_supports_cnp(void)
 
 static inline bool system_supports_address_auth(void)
 {
-	return cpus_have_const_cap(ARM64_HAS_ADDRESS_AUTH);
+	return cpus_have_final_boot_cap(ARM64_HAS_ADDRESS_AUTH);
 }
 
 static inline bool system_supports_generic_auth(void)
 {
-	return cpus_have_const_cap(ARM64_HAS_GENERIC_AUTH);
+	return alternative_has_cap_unlikely(ARM64_HAS_GENERIC_AUTH);
 }
 
 static inline bool system_has_full_ptr_auth(void)
-- 
cgit v1.2.3


From d70bac1d22f0a215047a1f6b9c8c7af858d975fc Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 16 Oct 2023 11:24:38 +0100
Subject: arm64: Avoid cpus_have_const_cap() for ARM64_HAS_ARMv8_4_TTL

In __tlbi_level() we use cpus_have_const_cap() to check for
ARM64_HAS_ARMv8_4_TTL, but this is not necessary and
alternative_has_cap_*() would be preferable.

For historical reasons, cpus_have_const_cap() is more complicated than
it needs to be. Before cpucaps are finalized, it will perform a bitmap
test of the system_cpucaps bitmap, and once cpucaps are finalized it
will use an alternative branch. This used to be necessary to handle some
race conditions in the window between cpucap detection and the
subsequent patching of alternatives and static branches, where different
branches could be out-of-sync with one another (or w.r.t. alternative
sequences). Now that we use alternative branches instead of static
branches, these are all patched atomically w.r.t. one another, and there
are only a handful of cases that need special care in the window between
cpucap detection and alternative patching.

Due to the above, it would be nice to remove cpus_have_const_cap(), and
migrate callers over to alternative_has_cap_*(), cpus_have_final_cap(),
or cpus_have_cap() depending on when their requirements. This will
remove redundant instructions and improve code generation, and will make
it easier to determine how each callsite will behave before, during, and
after alternative patching.

In the window between detecting the ARM64_HAS_ARMv8_4_TTL cpucap and
patching alternative branches, we do not perform any TLB invalidation,
and even if we were to perform TLB invalidation here it would not be
functionally necessary to optimize this by using the TTL hint. Hence
there's no need to use cpus_have_const_cap(), and
alternative_has_cap_unlikely() is sufficient.

This patch replaces the use of cpus_have_const_cap() with
alternative_has_cap_unlikely(), which will avoid generating code to test
the system_cpucaps bitmap and should be better for all subsequent calls
at runtime.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/tlbflush.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
index b149cf9f91bc..53ed194626e1 100644
--- a/arch/arm64/include/asm/tlbflush.h
+++ b/arch/arm64/include/asm/tlbflush.h
@@ -105,7 +105,7 @@ static inline unsigned long get_trans_granule(void)
 #define __tlbi_level(op, addr, level) do {				\
 	u64 arg = addr;							\
 									\
-	if (cpus_have_const_cap(ARM64_HAS_ARMv8_4_TTL) &&		\
+	if (alternative_has_cap_unlikely(ARM64_HAS_ARMv8_4_TTL) &&	\
 	    level) {							\
 		u64 ttl = level & 3;					\
 		ttl |= get_trans_granule() << 2;			\
-- 
cgit v1.2.3


From bbbb65770bf44d3eaec5b7784028756b58ea08db Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 16 Oct 2023 11:24:39 +0100
Subject: arm64: Avoid cpus_have_const_cap() for ARM64_HAS_BTI

In system_supports_bti() we use cpus_have_const_cap() to check for
ARM64_HAS_BTI, but this is not necessary and alternative_has_cap_*() or
cpus_have_final_*cap() would be preferable.

For historical reasons, cpus_have_const_cap() is more complicated than
it needs to be. Before cpucaps are finalized, it will perform a bitmap
test of the system_cpucaps bitmap, and once cpucaps are finalized it
will use an alternative branch. This used to be necessary to handle some
race conditions in the window between cpucap detection and the
subsequent patching of alternatives and static branches, where different
branches could be out-of-sync with one another (or w.r.t. alternative
sequences). Now that we use alternative branches instead of static
branches, these are all patched atomically w.r.t. one another, and there
are only a handful of cases that need special care in the window between
cpucap detection and alternative patching.

Due to the above, it would be nice to remove cpus_have_const_cap(), and
migrate callers over to alternative_has_cap_*(), cpus_have_final_cap(),
or cpus_have_cap() depending on when their requirements. This will
remove redundant instructions and improve code generation, and will make
it easier to determine how each callsite will behave before, during, and
after alternative patching.

When CONFIG_ARM64_BTI_KERNEL=y, the ARM64_HAS_BTI cpucap is a strict
boot cpu feature which is detected and patched early on the boot cpu.
All uses guarded by CONFIG_ARM64_BTI_KERNEL happen after the boot CPU
has detected ARM64_HAS_BTI and patched boot alternatives, and hence can
safely use alternative_has_cap_*() or cpus_have_final_boot_cap().

Regardless of CONFIG_ARM64_BTI_KERNEL, all other uses of ARM64_HAS_BTI
happen after system capabilities have been finalized and alternatives
have been patched. Hence these can safely use alternative_has_cap_*) or
cpus_have_final_cap().

This patch splits system_supports_bti() into system_supports_bti() and
system_supports_bti_kernel(), with the former handling where the cpucap
affects userspace functionality, and ther latter handling where the
cpucap affects kernel functionality. The use of cpus_have_const_cap() is
replaced by cpus_have_final_cap() in cpus_have_const_cap, and
cpus_have_final_boot_cap() in system_supports_bti_kernel(). This will
avoid generating code to test the system_cpucaps bitmap and should be
better for all subsequent calls at runtime. The use of
cpus_have_final_cap() and cpus_have_final_boot_cap() will make it easier
to spot if code is chaanged such that these run before the ARM64_HAS_BTI
cpucap is guaranteed to have been finalized.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Mark Brown <broonie@kernel.org>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/cpufeature.h   | 8 +++++++-
 arch/arm64/include/asm/pgtable-prot.h | 6 +-----
 arch/arm64/kernel/efi.c               | 3 +--
 arch/arm64/kernel/vdso.c              | 2 +-
 arch/arm64/kvm/hyp/pgtable.c          | 2 +-
 5 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index dfdedbdcc115..caa54ddf0bcd 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -837,7 +837,13 @@ static inline bool system_has_prio_mask_debugging(void)
 
 static inline bool system_supports_bti(void)
 {
-	return cpus_have_const_cap(ARM64_BTI);
+	return cpus_have_final_cap(ARM64_BTI);
+}
+
+static inline bool system_supports_bti_kernel(void)
+{
+	return IS_ENABLED(CONFIG_ARM64_BTI_KERNEL) &&
+		cpus_have_final_boot_cap(ARM64_BTI);
 }
 
 static inline bool system_supports_tlb_range(void)
diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h
index eed814b00a38..e9624f6326dd 100644
--- a/arch/arm64/include/asm/pgtable-prot.h
+++ b/arch/arm64/include/asm/pgtable-prot.h
@@ -75,11 +75,7 @@ extern bool arm64_use_ng_mappings;
  * If we have userspace only BTI we don't want to mark kernel pages
  * guarded even if the system does support BTI.
  */
-#ifdef CONFIG_ARM64_BTI_KERNEL
-#define PTE_MAYBE_GP		(system_supports_bti() ? PTE_GP : 0)
-#else
-#define PTE_MAYBE_GP		0
-#endif
+#define PTE_MAYBE_GP		(system_supports_bti_kernel() ? PTE_GP : 0)
 
 #define PAGE_KERNEL		__pgprot(_PAGE_KERNEL)
 #define PAGE_KERNEL_RO		__pgprot(_PAGE_KERNEL_RO)
diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c
index 2b478ca356b0..3f8c9c143552 100644
--- a/arch/arm64/kernel/efi.c
+++ b/arch/arm64/kernel/efi.c
@@ -113,8 +113,7 @@ static int __init set_permissions(pte_t *ptep, unsigned long addr, void *data)
 		pte = set_pte_bit(pte, __pgprot(PTE_RDONLY));
 	if (md->attribute & EFI_MEMORY_XP)
 		pte = set_pte_bit(pte, __pgprot(PTE_PXN));
-	else if (IS_ENABLED(CONFIG_ARM64_BTI_KERNEL) &&
-		 system_supports_bti() && spd->has_bti)
+	else if (system_supports_bti_kernel() && spd->has_bti)
 		pte = set_pte_bit(pte, __pgprot(PTE_GP));
 	set_pte(ptep, pte);
 	return 0;
diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c
index d9e1355730ef..5562daf38a22 100644
--- a/arch/arm64/kernel/vdso.c
+++ b/arch/arm64/kernel/vdso.c
@@ -212,7 +212,7 @@ static int __setup_additional_pages(enum vdso_abi abi,
 	if (IS_ERR(ret))
 		goto up_fail;
 
-	if (IS_ENABLED(CONFIG_ARM64_BTI_KERNEL) && system_supports_bti())
+	if (system_supports_bti_kernel())
 		gp_flags = VM_ARM64_BTI;
 
 	vdso_base += VVAR_NR_PAGES * PAGE_SIZE;
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 799d2c204bb8..77fb330c7bf4 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -401,7 +401,7 @@ static int hyp_set_prot_attr(enum kvm_pgtable_prot prot, kvm_pte_t *ptep)
 		if (device)
 			return -EINVAL;
 
-		if (IS_ENABLED(CONFIG_ARM64_BTI_KERNEL) && system_supports_bti())
+		if (system_supports_bti_kernel())
 			attr |= KVM_PTE_LEAF_ATTR_HI_S1_GP;
 	} else {
 		attr |= KVM_PTE_LEAF_ATTR_HI_S1_XN;
-- 
cgit v1.2.3


From 6766a8ef18a7796e728774e390c0e8526abb6d6d Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 16 Oct 2023 11:24:40 +0100
Subject: arm64: Avoid cpus_have_const_cap() for ARM64_HAS_CACHE_DIC

In icache_inval_all_pou() we use cpus_have_const_cap() to check for
ARM64_HAS_CACHE_DIC, but this is not necessary and
alternative_has_cap_*() would be preferable.

For historical reasons, cpus_have_const_cap() is more complicated than
it needs to be. Before cpucaps are finalized, it will perform a bitmap
test of the system_cpucaps bitmap, and once cpucaps are finalized it
will use an alternative branch. This used to be necessary to handle some
race conditions in the window between cpucap detection and the
subsequent patching of alternatives and static branches, where different
branches could be out-of-sync with one another (or w.r.t. alternative
sequences). Now that we use alternative branches instead of static
branches, these are all patched atomically w.r.t. one another, and there
are only a handful of cases that need special care in the window between
cpucap detection and alternative patching.

Due to the above, it would be nice to remove cpus_have_const_cap(), and
migrate callers over to alternative_has_cap_*(), cpus_have_final_cap(),
or cpus_have_cap() depending on when their requirements. This will
remove redundant instructions and improve code generation, and will make
it easier to determine how each callsite will behave before, during, and
after alternative patching.

The cpus_have_const_cap() check in icache_inval_all_pou() is an
optimization to skip a redundant (but benign) IC IALLUIS + DSB ISH
sequence when all CPUs in the system have DIC. In the window between
detecting the ARM64_HAS_CACHE_DIC cpucap and patching alternative
branches there is only a single potential call to icache_inval_all_pou()
(in the alternatives patching itself), which there's no need to optimize
for at the expense of other callers.

This patch replaces the use of cpus_have_const_cap() with
alternative_has_cap_unlikely(), which will avoid generating code to test
the system_cpucaps bitmap and should be better for all subsequent calls
at runtime. This also aligns better with the way we patch the assembly
cache maintenance sequences in arch/arm64/mm/cache.S.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/cacheflush.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h
index d115451ed263..fefac75fa009 100644
--- a/arch/arm64/include/asm/cacheflush.h
+++ b/arch/arm64/include/asm/cacheflush.h
@@ -132,7 +132,7 @@ void flush_dcache_folio(struct folio *);
 
 static __always_inline void icache_inval_all_pou(void)
 {
-	if (cpus_have_const_cap(ARM64_HAS_CACHE_DIC))
+	if (alternative_has_cap_unlikely(ARM64_HAS_CACHE_DIC))
 		return;
 
 	asm("ic	ialluis");
-- 
cgit v1.2.3


From 54c8818aa283040956a3e85368bb9d9abad1bf7a Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 16 Oct 2023 11:24:41 +0100
Subject: arm64: Avoid cpus_have_const_cap() for ARM64_HAS_CNP

In system_supports_cnp() we use cpus_have_const_cap() to check for
ARM64_HAS_CNP, but this is only necessary so that the cpu_enable_cnp()
callback can run prior to alternatives being patched, and otherwise this
is not necessary and alternative_has_cap_*() would be preferable.

For historical reasons, cpus_have_const_cap() is more complicated than
it needs to be. Before cpucaps are finalized, it will perform a bitmap
test of the system_cpucaps bitmap, and once cpucaps are finalized it
will use an alternative branch. This used to be necessary to handle some
race conditions in the window between cpucap detection and the
subsequent patching of alternatives and static branches, where different
branches could be out-of-sync with one another (or w.r.t. alternative
sequences). Now that we use alternative branches instead of static
branches, these are all patched atomically w.r.t. one another, and there
are only a handful of cases that need special care in the window between
cpucap detection and alternative patching.

Due to the above, it would be nice to remove cpus_have_const_cap(), and
migrate callers over to alternative_has_cap_*(), cpus_have_final_cap(),
or cpus_have_cap() depending on when their requirements. This will
remove redundant instructions and improve code generation, and will make
it easier to determine how each callsite will behave before, during, and
after alternative patching.

The cpu_enable_cnp() callback is run immediately after the ARM64_HAS_CNP
cpucap is detected system-wide under setup_system_capabilities(), prior
to alternatives being patched. During this window cpu_enable_cnp() uses
cpu_replace_ttbr1() to set the CNP bit for the swapper_pg_dir in TTBR1.
No other users of the ARM64_HAS_CNP cpucap need the up-to-date value
during this window:

* As KVM isn't initialized yet, kvm_get_vttbr() isn't reachable.

* As cpuidle isn't initialized yet, __cpu_suspend_exit() isn't
  reachable.

* At this point all CPUs are using the swapper_pg_dir with a reserved
  ASID in TTBR1, and the idmap_pg_dir in TTBR0, so neither
  check_and_switch_context() nor cpu_do_switch_mm() need to do anything
  special.

This patch replaces the use of cpus_have_const_cap() with
alternative_has_cap_unlikely(), which will avoid generating code to test
the system_cpucaps bitmap and should be better for all subsequent calls
at runtime. To allow cpu_enable_cnp() to function prior to alternatives
being patched, cpu_replace_ttbr1() is split into cpu_replace_ttbr1() and
cpu_enable_swapper_cnp(), with the former only used for early TTBR1
replacement, and the latter used by both cpu_enable_cnp() and
__cpu_suspend_exit().

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Vladimir Murzin <vladimir.murzin@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/cpufeature.h  |  2 +-
 arch/arm64/include/asm/mmu_context.h | 28 +++++++++++++++++-----------
 arch/arm64/kernel/cpufeature.c       |  2 +-
 arch/arm64/kernel/suspend.c          |  2 +-
 4 files changed, 20 insertions(+), 14 deletions(-)

diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index caa54ddf0bcd..a91f940351fc 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -801,7 +801,7 @@ static __always_inline bool system_supports_tpidr2(void)
 
 static __always_inline bool system_supports_cnp(void)
 {
-	return cpus_have_const_cap(ARM64_HAS_CNP);
+	return alternative_has_cap_unlikely(ARM64_HAS_CNP);
 }
 
 static inline bool system_supports_address_auth(void)
diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h
index a6fb325424e7..9ce4200508b1 100644
--- a/arch/arm64/include/asm/mmu_context.h
+++ b/arch/arm64/include/asm/mmu_context.h
@@ -152,7 +152,7 @@ static inline void cpu_install_ttbr0(phys_addr_t ttbr0, unsigned long t0sz)
  * Atomically replaces the active TTBR1_EL1 PGD with a new VA-compatible PGD,
  * avoiding the possibility of conflicting TLB entries being allocated.
  */
-static inline void cpu_replace_ttbr1(pgd_t *pgdp, pgd_t *idmap)
+static inline void __cpu_replace_ttbr1(pgd_t *pgdp, pgd_t *idmap, bool cnp)
 {
 	typedef void (ttbr_replace_func)(phys_addr_t);
 	extern ttbr_replace_func idmap_cpu_replace_ttbr1;
@@ -162,17 +162,8 @@ static inline void cpu_replace_ttbr1(pgd_t *pgdp, pgd_t *idmap)
 	/* phys_to_ttbr() zeros lower 2 bits of ttbr with 52-bit PA */
 	phys_addr_t ttbr1 = phys_to_ttbr(virt_to_phys(pgdp));
 
-	if (system_supports_cnp() && !WARN_ON(pgdp != lm_alias(swapper_pg_dir))) {
-		/*
-		 * cpu_replace_ttbr1() is used when there's a boot CPU
-		 * up (i.e. cpufeature framework is not up yet) and
-		 * latter only when we enable CNP via cpufeature's
-		 * enable() callback.
-		 * Also we rely on the system_cpucaps bit being set before
-		 * calling the enable() function.
-		 */
+	if (cnp)
 		ttbr1 |= TTBR_CNP_BIT;
-	}
 
 	replace_phys = (void *)__pa_symbol(idmap_cpu_replace_ttbr1);
 
@@ -189,6 +180,21 @@ static inline void cpu_replace_ttbr1(pgd_t *pgdp, pgd_t *idmap)
 	cpu_uninstall_idmap();
 }
 
+static inline void cpu_enable_swapper_cnp(void)
+{
+	__cpu_replace_ttbr1(lm_alias(swapper_pg_dir), idmap_pg_dir, true);
+}
+
+static inline void cpu_replace_ttbr1(pgd_t *pgdp, pgd_t *idmap)
+{
+	/*
+	 * Only for early TTBR1 replacement before cpucaps are finalized and
+	 * before we've decided whether to use CNP.
+	 */
+	WARN_ON(system_capabilities_finalized());
+	__cpu_replace_ttbr1(pgdp, idmap, false);
+}
+
 /*
  * It would be nice to return ASIDs back to the allocator, but unfortunately
  * that introduces a race with a generation rollover where we could erroneously
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index c493fa582a19..e1582e50f529 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -3458,7 +3458,7 @@ subsys_initcall_sync(init_32bit_el0_mask);
 
 static void __maybe_unused cpu_enable_cnp(struct arm64_cpu_capabilities const *cap)
 {
-	cpu_replace_ttbr1(lm_alias(swapper_pg_dir), idmap_pg_dir);
+	cpu_enable_swapper_cnp();
 }
 
 /*
diff --git a/arch/arm64/kernel/suspend.c b/arch/arm64/kernel/suspend.c
index 0fbdf5fe64d8..c09fa99e0a44 100644
--- a/arch/arm64/kernel/suspend.c
+++ b/arch/arm64/kernel/suspend.c
@@ -55,7 +55,7 @@ void notrace __cpu_suspend_exit(void)
 
 	/* Restore CnP bit in TTBR1_EL1 */
 	if (system_supports_cnp())
-		cpu_replace_ttbr1(lm_alias(swapper_pg_dir), idmap_pg_dir);
+		cpu_enable_swapper_cnp();
 
 	/*
 	 * PSTATE was not saved over suspend/resume, re-enable any detected
-- 
cgit v1.2.3


From 25693f1771552ad6a2f38c7749a5b7516bbddd1d Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 16 Oct 2023 11:24:42 +0100
Subject: arm64: Avoid cpus_have_const_cap() for ARM64_HAS_DIT

In __cpu_suspend_exit() we use cpus_have_const_cap() to check for
ARM64_HAS_DIT but this is not necessary and cpus_have_final_cap() of
alternative_has_cap_*() would be preferable.

For historical reasons, cpus_have_const_cap() is more complicated than
it needs to be. Before cpucaps are finalized, it will perform a bitmap
test of the system_cpucaps bitmap, and once cpucaps are finalized it
will use an alternative branch. This used to be necessary to handle some
race conditions in the window between cpucap detection and the
subsequent patching of alternatives and static branches, where different
branches could be out-of-sync with one another (or w.r.t. alternative
sequences). Now that we use alternative branches instead of static
branches, these are all patched atomically w.r.t. one another, and there
are only a handful of cases that need special care in the window between
cpucap detection and alternative patching.

Due to the above, it would be nice to remove cpus_have_const_cap(), and
migrate callers over to alternative_has_cap_*(), cpus_have_final_cap(),
or cpus_have_cap() depending on when their requirements. This will
remove redundant instructions and improve code generation, and will make
it easier to determine how each callsite will behave before, during, and
after alternative patching.

The ARM64_HAS_DIT cpucap is detected and patched (along with all other
cpucaps) before __cpu_suspend_exit() can run. We'll only use
__cpu_suspend_exit() as part of PSCI cpuidle or hibernation, and both of
these are intialized after system cpucaps are detected and patched: the
PSCI cpuidle driver is registered with a device_initcall, hibernation
restoration occurs in a late_initcall, and hibarnation saving is driven
by usrspace. Therefore it is not necessary to use cpus_have_const_cap(),
and using alternative_has_cap_*() or cpus_have_final_cap() is
sufficient.

This patch replaces the use of cpus_have_const_cap() with
alternative_has_cap_unlikely(), which will avoid generating code to test
the system_cpucaps bitmap and should be better for all subsequent calls
at runtime. To clearly document the ordering relationship between
suspend/resume and alternatives patching, an explicit check for
system_capabilities_finalized() is added to cpu_suspend() along with a
comment block, which will make it easier to spot issues if code is
changed in future to allow these functions to be reached earlier.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: James Morse <james.morse@arm.com>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/suspend.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/kernel/suspend.c b/arch/arm64/kernel/suspend.c
index c09fa99e0a44..eca4d0435211 100644
--- a/arch/arm64/kernel/suspend.c
+++ b/arch/arm64/kernel/suspend.c
@@ -61,7 +61,7 @@ void notrace __cpu_suspend_exit(void)
 	 * PSTATE was not saved over suspend/resume, re-enable any detected
 	 * features that might not have been set correctly.
 	 */
-	if (cpus_have_const_cap(ARM64_HAS_DIT))
+	if (alternative_has_cap_unlikely(ARM64_HAS_DIT))
 		set_pstate_dit(1);
 	__uaccess_enable_hw_pan();
 
@@ -98,6 +98,15 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long))
 	struct sleep_stack_data state;
 	struct arm_cpuidle_irq_context context;
 
+	/*
+	 * Some portions of CPU state (e.g. PSTATE.{PAN,DIT}) are initialized
+	 * before alternatives are patched, but are only restored by
+	 * __cpu_suspend_exit() after alternatives are patched. To avoid
+	 * accidentally losing these bits we must not attempt to suspend until
+	 * after alternatives have been patched.
+	 */
+	WARN_ON(!system_capabilities_finalized());
+
 	/* Report any MTE async fault before going to suspend */
 	mte_suspend_enter();
 
-- 
cgit v1.2.3


From 20af807d806d13f4ab7795943587cf41257c649a Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 16 Oct 2023 11:24:43 +0100
Subject: arm64: Avoid cpus_have_const_cap() for ARM64_HAS_GIC_PRIO_MASKING

In system_uses_irq_prio_masking() we use cpus_have_const_cap() to check
for ARM64_HAS_GIC_PRIO_MASKING, but this is not necessary and
alternative_has_cap_*() would be preferable.

For historical reasons, cpus_have_const_cap() is more complicated than
it needs to be. Before cpucaps are finalized, it will perform a bitmap
test of the system_cpucaps bitmap, and once cpucaps are finalized it
will use an alternative branch. This used to be necessary to handle some
race conditions in the window between cpucap detection and the
subsequent patching of alternatives and static branches, where different
branches could be out-of-sync with one another (or w.r.t. alternative
sequences). Now that we use alternative branches instead of static
branches, these are all patched atomically w.r.t. one another, and there
are only a handful of cases that need special care in the window between
cpucap detection and alternative patching.

Due to the above, it would be nice to remove cpus_have_const_cap(), and
migrate callers over to alternative_has_cap_*(), cpus_have_final_cap(),
or cpus_have_cap() depending on when their requirements. This will
remove redundant instructions and improve code generation, and will make
it easier to determine how each callsite will behave before, during, and
after alternative patching.

When CONFIG_ARM64_PSEUDO_NMI=y the ARM64_HAS_GIC_PRIO_MASKING cpucap is
a strict boot cpu feature which is detected and patched early on the
boot cpu, which both happen in smp_prepare_boot_cpu(). In the window
between the ARM64_HAS_GIC_PRIO_MASKING cpucap is detected and
alternatives are patched we don't run any code that depends upon the
ARM64_HAS_GIC_PRIO_MASKING cpucap:

* We leave DAIF.IF set until after boot alternatives are patched, and
  interrupts are unmasked later in init_IRQ(), so we cannot reach
  IRQ/FIQ entry code and will not use irqs_priority_unmasked().

* We don't call any code which uses arm_cpuidle_save_irq_context() and
  arm_cpuidle_restore_irq_context() during this window.

* We don't call start_thread_common() during this window.

* The local_irq_*() code in <asm/irqflags.h> depends solely on an
  alternative branch since commit:

  a5f61cc636f48bdf ("arm64: irqflags: use alternative branches for pseudo-NMI logic")

  ... and hence will use the default (DAIF-only) masking behaviour until
  alternatives are patched.

* Secondary CPUs are brought up later after alternatives are patched,
  and alternatives are patched on the boot CPU immediately prior to
  calling init_gic_priority_masking(), so we'll correctly initialize
  interrupt masking regardless.

This patch replaces the use of cpus_have_const_cap() with
alternative_has_cap_unlikely(), which avoid generating code to test the
system_cpucaps bitmap and should be better for all subsequent calls at
runtime. As this makes system_uses_irq_prio_masking() equivalent to
__irqflags_uses_pmr(), the latter is removed and replaced with the
former for consistency.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Mark Brown <broonie@kernel.org>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/cpufeature.h |  2 +-
 arch/arm64/include/asm/irqflags.h   | 20 +++++++-------------
 2 files changed, 8 insertions(+), 14 deletions(-)

diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index a91f940351fc..c387ee4ee194 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -821,7 +821,7 @@ static inline bool system_has_full_ptr_auth(void)
 
 static __always_inline bool system_uses_irq_prio_masking(void)
 {
-	return cpus_have_const_cap(ARM64_HAS_GIC_PRIO_MASKING);
+	return alternative_has_cap_unlikely(ARM64_HAS_GIC_PRIO_MASKING);
 }
 
 static inline bool system_supports_mte(void)
diff --git a/arch/arm64/include/asm/irqflags.h b/arch/arm64/include/asm/irqflags.h
index 1f31ec146d16..0a7186a93882 100644
--- a/arch/arm64/include/asm/irqflags.h
+++ b/arch/arm64/include/asm/irqflags.h
@@ -21,12 +21,6 @@
  * exceptions should be unmasked.
  */
 
-static __always_inline bool __irqflags_uses_pmr(void)
-{
-	return IS_ENABLED(CONFIG_ARM64_PSEUDO_NMI) &&
-	       alternative_has_cap_unlikely(ARM64_HAS_GIC_PRIO_MASKING);
-}
-
 static __always_inline void __daif_local_irq_enable(void)
 {
 	barrier();
@@ -49,7 +43,7 @@ static __always_inline void __pmr_local_irq_enable(void)
 
 static inline void arch_local_irq_enable(void)
 {
-	if (__irqflags_uses_pmr()) {
+	if (system_uses_irq_prio_masking()) {
 		__pmr_local_irq_enable();
 	} else {
 		__daif_local_irq_enable();
@@ -77,7 +71,7 @@ static __always_inline void __pmr_local_irq_disable(void)
 
 static inline void arch_local_irq_disable(void)
 {
-	if (__irqflags_uses_pmr()) {
+	if (system_uses_irq_prio_masking()) {
 		__pmr_local_irq_disable();
 	} else {
 		__daif_local_irq_disable();
@@ -99,7 +93,7 @@ static __always_inline unsigned long __pmr_local_save_flags(void)
  */
 static inline unsigned long arch_local_save_flags(void)
 {
-	if (__irqflags_uses_pmr()) {
+	if (system_uses_irq_prio_masking()) {
 		return __pmr_local_save_flags();
 	} else {
 		return __daif_local_save_flags();
@@ -118,7 +112,7 @@ static __always_inline bool __pmr_irqs_disabled_flags(unsigned long flags)
 
 static inline bool arch_irqs_disabled_flags(unsigned long flags)
 {
-	if (__irqflags_uses_pmr()) {
+	if (system_uses_irq_prio_masking()) {
 		return __pmr_irqs_disabled_flags(flags);
 	} else {
 		return __daif_irqs_disabled_flags(flags);
@@ -137,7 +131,7 @@ static __always_inline bool __pmr_irqs_disabled(void)
 
 static inline bool arch_irqs_disabled(void)
 {
-	if (__irqflags_uses_pmr()) {
+	if (system_uses_irq_prio_masking()) {
 		return __pmr_irqs_disabled();
 	} else {
 		return __daif_irqs_disabled();
@@ -169,7 +163,7 @@ static __always_inline unsigned long __pmr_local_irq_save(void)
 
 static inline unsigned long arch_local_irq_save(void)
 {
-	if (__irqflags_uses_pmr()) {
+	if (system_uses_irq_prio_masking()) {
 		return __pmr_local_irq_save();
 	} else {
 		return __daif_local_irq_save();
@@ -196,7 +190,7 @@ static __always_inline void __pmr_local_irq_restore(unsigned long flags)
  */
 static inline void arch_local_irq_restore(unsigned long flags)
 {
-	if (__irqflags_uses_pmr()) {
+	if (system_uses_irq_prio_masking()) {
 		__pmr_local_irq_restore(flags);
 	} else {
 		__daif_local_irq_restore(flags);
-- 
cgit v1.2.3


From 53d62e995d9eaed16390d38b40eaed4286a1282c Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 16 Oct 2023 11:24:44 +0100
Subject: arm64: Avoid cpus_have_const_cap() for ARM64_HAS_PAN

In system_uses_hw_pan() we use cpus_have_const_cap() to check for
ARM64_HAS_PAN, but this is only necessary so that the
system_uses_ttbr0_pan() check in setup_cpu_features() can run prior to
alternatives being patched, and otherwise this is not necessary and
alternative_has_cap_*() would be preferable.

For historical reasons, cpus_have_const_cap() is more complicated than
it needs to be. Before cpucaps are finalized, it will perform a bitmap
test of the system_cpucaps bitmap, and once cpucaps are finalized it
will use an alternative branch. This used to be necessary to handle some
race conditions in the window between cpucap detection and the
subsequent patching of alternatives and static branches, where different
branches could be out-of-sync with one another (or w.r.t. alternative
sequences). Now that we use alternative branches instead of static
branches, these are all patched atomically w.r.t. one another, and there
are only a handful of cases that need special care in the window between
cpucap detection and alternative patching.

Due to the above, it would be nice to remove cpus_have_const_cap(), and
migrate callers over to alternative_has_cap_*(), cpus_have_final_cap(),
or cpus_have_cap() depending on when their requirements. This will
remove redundant instructions and improve code generation, and will make
it easier to determine how each callsite will behave before, during, and
after alternative patching.

The ARM64_HAS_PAN cpucap is used by system_uses_hw_pan() and
system_uses_ttbr0_pan() depending on whether CONFIG_ARM64_SW_TTBR0_PAN
is selected, and:

* We only use system_uses_hw_pan() directly in __sdei_handler(), which
  isn't reachable until after alternatives have been patched, and for
  this it is safe to use alternative_has_cap_*().

* We use system_uses_ttbr0_pan() in a few places:

  - In check_and_switch_context() and cpu_uninstall_idmap(), which will
    defer installing a translation table into TTBR0 when the
    ARM64_HAS_PAN cpucap is not detected.

    Prior to patching alternatives, all CPUs will be using init_mm with
    the reserved ttbr0 translation tables install in TTBR0, so these can
    safely use alternative_has_cap_*().

  - In update_saved_ttbr0(), which will only save the active TTBR0 into
    a per-thread variable when the ARM64_HAS_PAN cpucap is not detected.

    Prior to patching alternatives, all CPUs will be using init_mm with
    the reserved ttbr0 translation tables install in TTBR0, so these can
    safely use alternative_has_cap_*().

  - In efi_set_pgd(), which will handle check_and_switch_context()
    deferring the installation of TTBR0 when TTBR0 PAN is detected.

    The EFI runtime services are not initialized until after
    alternatives have been patched, and so this can safely use
    alternative_has_cap_*() or cpus_have_final_cap().

  - In uaccess_ttbr0_disable() and uaccess_ttbr0_enable(), where we'll
    avoid installing/uninstalling a translation table in TTBR0 when
    ARM64_HAS_PAN is detected.

    Prior to patching alternatives we will not perform any uaccess and
    will not call uaccess_ttbr0_disable() or uaccess_ttbr0_enable(), and
    so these can safely use alternative_has_cap_*() or
    cpus_have_final_cap().

  - In is_el1_permission_fault() where we will consider a translation
    fault on a TTBR0 address to be a permission fault when ARM64_HAS_PAN
    is not detected *and* we have set the PAN bit in the SPSR (which
    tells us that in the interrupted context, TTBR0 pointed at the
    reserved zero ttbr).

    In the window between detecting system cpucaps and patching
    alternatives we should not perform any accesses to TTBR0 addresses,
    and no userspace translation tables exist until after patching
    alternatives. Thus it is safe for this to use alternative_has_cap*().

This patch replaces the use of cpus_have_const_cap() with
alternative_has_cap_unlikely(), which will avoid generating code to test
the system_cpucaps bitmap and should be better for all subsequent calls
at runtime.

So that the check for TTBR0 PAN in setup_cpu_features() can run prior to
alternatives being patched, the call to system_uses_ttbr0_pan() is
replaced with an explicit check of the ARM64_HAS_PAN bit in the
system_cpucaps bitmap.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: James Morse <james.morse@arm.com>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/cpufeature.h | 2 +-
 arch/arm64/kernel/cpufeature.c      | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index c387ee4ee194..c581687b23a2 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -765,7 +765,7 @@ static __always_inline bool system_supports_fpsimd(void)
 
 static inline bool system_uses_hw_pan(void)
 {
-	return cpus_have_const_cap(ARM64_HAS_PAN);
+	return alternative_has_cap_unlikely(ARM64_HAS_PAN);
 }
 
 static inline bool system_uses_ttbr0_pan(void)
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index e1582e50f529..9ab7e19b7176 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -3368,7 +3368,8 @@ void __init setup_system_features(void)
 	 * finalized. Finalize and log the available system capabilities.
 	 */
 	update_cpu_capabilities(SCOPE_SYSTEM);
-	if (system_uses_ttbr0_pan())
+	if (IS_ENABLED(CONFIG_ARM64_SW_TTBR0_PAN) &&
+	    !cpus_have_cap(ARM64_HAS_PAN))
 		pr_info("emulated: Privileged Access Never (PAN) using TTBR0_EL1 switching\n");
 
 	/*
-- 
cgit v1.2.3


From 4e00f1d9b7ff6b681fbd048fe5389089ae65bc11 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 16 Oct 2023 11:24:45 +0100
Subject: arm64: Avoid cpus_have_const_cap() for ARM64_HAS_EPAN

We use cpus_have_const_cap() to check for ARM64_HAS_EPAN but this is not
necessary and alternative_has_cap() or cpus_have_cap() would be
preferable.

For historical reasons, cpus_have_const_cap() is more complicated than
it needs to be. Before cpucaps are finalized, it will perform a bitmap
test of the system_cpucaps bitmap, and once cpucaps are finalized it
will use an alternative branch. This used to be necessary to handle some
race conditions in the window between cpucap detection and the
subsequent patching of alternatives and static branches, where different
branches could be out-of-sync with one another (or w.r.t. alternative
sequences). Now that we use alternative branches instead of static
branches, these are all patched atomically w.r.t. one another, and there
are only a handful of cases that need special care in the window between
cpucap detection and alternative patching.

Due to the above, it would be nice to remove cpus_have_const_cap(), and
migrate callers over to alternative_has_cap_*(), cpus_have_final_cap(),
or cpus_have_cap() depending on when their requirements. This will
remove redundant instructions and improve code generation, and will make
it easier to determine how each callsite will behave before, during, and
after alternative patching.

The ARM64_HAS_EPAN cpucap is used to affect two things:

1) The permision bits used for userspace executable mappings, which are
   chosen by adjust_protection_map(), which is an arch_initcall. This is
   called after the ARM64_HAS_EPAN cpucap has been detected and
   alternatives have been patched, and before any userspace translation
   tables exist.

2) The handling of faults taken from (user or kernel) accesses to
   userspace executable mappings in do_page_fault(). Userspace
   translation tables are created after adjust_protection_map() is
   called, and hence after the ARM64_HAS_EPAN cpucap has been detected
   and alternatives have been patched.

Neither of these run until after ARM64_HAS_EPAN cpucap has been detected
and alternatives have been patched, and hence there's no need to use
cpus_have_const_cap(). Since adjust_protection_map() is only executed
once at boot time it would be best for it to use cpus_have_cap(), and
since do_page_fault() is executed frequently it would be best for it to
use alternatives_have_cap_unlikely().

This patch replaces the uses of cpus_have_const_cap() with
cpus_have_cap() and alternative_has_cap_unlikely(), which will avoid
generating redundant code, and should be better for all subsequent calls
at runtime. The ARM64_HAS_EPAN cpucap is added to cpucap_is_possible()
so that code can be elided entirely when this is not possible.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: James Morse <james.morse@arm.com>
Cc: Vladimir Murzin <vladimir.murzin@arm.com>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/cpucaps.h | 2 ++
 arch/arm64/mm/fault.c            | 2 +-
 arch/arm64/mm/mmap.c             | 2 +-
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h
index 07c9271b534d..af9550147dd0 100644
--- a/arch/arm64/include/asm/cpucaps.h
+++ b/arch/arm64/include/asm/cpucaps.h
@@ -21,6 +21,8 @@ cpucap_is_possible(const unsigned int cap)
 	switch (cap) {
 	case ARM64_HAS_PAN:
 		return IS_ENABLED(CONFIG_ARM64_PAN);
+	case ARM64_HAS_EPAN:
+		return IS_ENABLED(CONFIG_ARM64_EPAN);
 	case ARM64_SVE:
 		return IS_ENABLED(CONFIG_ARM64_SVE);
 	case ARM64_SME:
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 2e5d1e238af9..460d799e1296 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -571,7 +571,7 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
 		/* Write implies read */
 		vm_flags |= VM_WRITE;
 		/* If EPAN is absent then exec implies read */
-		if (!cpus_have_const_cap(ARM64_HAS_EPAN))
+		if (!alternative_has_cap_unlikely(ARM64_HAS_EPAN))
 			vm_flags |= VM_EXEC;
 	}
 
diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c
index 8f5b7ce857ed..645fe60d000f 100644
--- a/arch/arm64/mm/mmap.c
+++ b/arch/arm64/mm/mmap.c
@@ -68,7 +68,7 @@ static int __init adjust_protection_map(void)
 	 * With Enhanced PAN we can honour the execute-only permissions as
 	 * there is no PAN override with such mappings.
 	 */
-	if (cpus_have_const_cap(ARM64_HAS_EPAN)) {
+	if (cpus_have_cap(ARM64_HAS_EPAN)) {
 		protection_map[VM_EXEC] = PAGE_EXECONLY;
 		protection_map[VM_EXEC | VM_SHARED] = PAGE_EXECONLY;
 	}
-- 
cgit v1.2.3


From 1963d9660d99bdd1bd3666afc6c670d84bff0efb Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 16 Oct 2023 11:24:46 +0100
Subject: arm64: Avoid cpus_have_const_cap() for ARM64_HAS_RNG

In __cpu_has_rng() we use cpus_have_const_cap() to check for
ARM64_HAS_RNG, but this is not necessary and alternative_has_cap_*()
would be preferable.

For historical reasons, cpus_have_const_cap() is more complicated than
it needs to be. Before cpucaps are finalized, it will perform a bitmap
test of the system_cpucaps bitmap, and once cpucaps are finalized it
will use an alternative branch. This used to be necessary to handle some
race conditions in the window between cpucap detection and the
subsequent patching of alternatives and static branches, where different
branches could be out-of-sync with one another (or w.r.t. alternative
sequences). Now that we use alternative branches instead of static
branches, these are all patched atomically w.r.t. one another, and there
are only a handful of cases that need special care in the window between
cpucap detection and alternative patching.

Due to the above, it would be nice to remove cpus_have_const_cap(), and
migrate callers over to alternative_has_cap_*(), cpus_have_final_cap(),
or cpus_have_cap() depending on when their requirements. This will
remove redundant instructions and improve code generation, and will make
it easier to determine how each callsite will behave before, during, and
after alternative patching.

In the window between detecting the ARM64_HAS_RNG cpucap and patching
alternative branches, nothing which calls __cpu_has_rng() can run, and
hence it's not necessary to use cpus_have_const_cap().

This patch replaces the use of cpus_have_const_cap() with
alternative_has_cap_unlikely(), which will avoid generating code to test
the system_cpucaps bitmap and should be better for all subsequent calls
at runtime.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Mark Brown <broonie@kernel.org>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/archrandom.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/archrandom.h b/arch/arm64/include/asm/archrandom.h
index b0abc64f86b0..ecdb3cfcd0f8 100644
--- a/arch/arm64/include/asm/archrandom.h
+++ b/arch/arm64/include/asm/archrandom.h
@@ -63,7 +63,7 @@ static __always_inline bool __cpu_has_rng(void)
 {
 	if (unlikely(!system_capabilities_finalized() && !preemptible()))
 		return this_cpu_has_cap(ARM64_HAS_RNG);
-	return cpus_have_const_cap(ARM64_HAS_RNG);
+	return alternative_has_cap_unlikely(ARM64_HAS_RNG);
 }
 
 static inline size_t __must_check arch_get_random_longs(unsigned long *v, size_t max_longs)
-- 
cgit v1.2.3


From 4c73056e3277f803ccdb7769e55fe7153edb07cc Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 16 Oct 2023 11:24:47 +0100
Subject: arm64: Avoid cpus_have_const_cap() for ARM64_HAS_WFXT

In __delay() we use cpus_have_const_cap() to check for ARM64_HAS_WFXT,
but this is not necessary and alternative_has_cap() would be preferable.

For historical reasons, cpus_have_const_cap() is more complicated than
it needs to be. Before cpucaps are finalized, it will perform a bitmap
test of the system_cpucaps bitmap, and once cpucaps are finalized it
will use an alternative branch. This used to be necessary to handle some
race conditions in the window between cpucap detection and the
subsequent patching of alternatives and static branches, where different
branches could be out-of-sync with one another (or w.r.t. alternative
sequences). Now that we use alternative branches instead of static
branches, these are all patched atomically w.r.t. one another, and there
are only a handful of cases that need special care in the window between
cpucap detection and alternative patching.

Due to the above, it would be nice to remove cpus_have_const_cap(), and
migrate callers over to alternative_has_cap_*(), cpus_have_final_cap(),
or cpus_have_cap() depending on when their requirements. This will
remove redundant instructions and improve code generation, and will make
it easier to determine how each callsite will behave before, during, and
after alternative patching.

The cpus_have_const_cap() check in __delay() is an optimization to use
WFIT and WFET in preference to busy-polling the counter and/or using
regular WFE and relying upon the architected timer event stream. It is
not necessary to apply this optimization in the window between detecting
the ARM64_HAS_WFXT cpucap and patching alternatives.

This patch replaces the use of cpus_have_const_cap() with
alternative_has_cap_unlikely(), which will avoid generating code to test
the system_cpucaps bitmap and should be better for all subsequent calls
at runtime.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/lib/delay.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm64/lib/delay.c b/arch/arm64/lib/delay.c
index 5b7890139bc2..cb2062e7e234 100644
--- a/arch/arm64/lib/delay.c
+++ b/arch/arm64/lib/delay.c
@@ -27,7 +27,7 @@ void __delay(unsigned long cycles)
 {
 	cycles_t start = get_cycles();
 
-	if (cpus_have_const_cap(ARM64_HAS_WFXT)) {
+	if (alternative_has_cap_unlikely(ARM64_HAS_WFXT)) {
 		u64 end = start + cycles;
 
 		/*
-- 
cgit v1.2.3


From b54b52576443eece09d0e6cb80198dc17f6b96a5 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 16 Oct 2023 11:24:48 +0100
Subject: arm64: Avoid cpus_have_const_cap() for ARM64_HAS_TLB_RANGE

We use cpus_have_const_cap() to check for ARM64_HAS_TLB_RANGE, but this
is not necessary and alternative_has_cap_*() would be preferable.

For historical reasons, cpus_have_const_cap() is more complicated than
it needs to be. Before cpucaps are finalized, it will perform a bitmap
test of the system_cpucaps bitmap, and once cpucaps are finalized it
will use an alternative branch. This used to be necessary to handle some
race conditions in the window between cpucap detection and the
subsequent patching of alternatives and static branches, where different
branches could be out-of-sync with one another (or w.r.t. alternative
sequences). Now that we use alternative branches instead of static
branches, these are all patched atomically w.r.t. one another, and there
are only a handful of cases that need special care in the window between
cpucap detection and alternative patching.

Due to the above, it would be nice to remove cpus_have_const_cap(), and
migrate callers over to alternative_has_cap_*(), cpus_have_final_cap(),
or cpus_have_cap() depending on when their requirements. This will
remove redundant instructions and improve code generation, and will make
it easier to determine how each callsite will behave before, during, and
after alternative patching.

In the window between detecting the ARM64_HAS_TLB_RANGE cpucap and
patching alternative branches, we do not perform any TLB invalidation,
and even if we were to perform TLB invalidation here it would not be
functionally necessary to optimize this by using range invalidation.
Hence there's no need to use cpus_have_const_cap(), and
alternative_has_cap_unlikely() is sufficient.

This patch replaces the use of cpus_have_const_cap() with
alternative_has_cap_unlikely(), which will avoid generating code to test
the system_cpucaps bitmap and should be better for all subsequent calls
at runtime.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/cpufeature.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index c581687b23a2..bceb007f9ea3 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -848,7 +848,7 @@ static inline bool system_supports_bti_kernel(void)
 
 static inline bool system_supports_tlb_range(void)
 {
-	return cpus_have_const_cap(ARM64_HAS_TLB_RANGE);
+	return alternative_has_cap_unlikely(ARM64_HAS_TLB_RANGE);
 }
 
 int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt);
-- 
cgit v1.2.3


From 94324bcbc9d3b77fa29519f27852463a6385828a Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 16 Oct 2023 11:24:49 +0100
Subject: arm64: Avoid cpus_have_const_cap() for ARM64_MTE

In system_supports_mte() we use cpus_have_const_cap() to check for
ARM64_MTE, but this is not necessary and cpus_have_final_boot_cap()
would be preferable.

For historical reasons, cpus_have_const_cap() is more complicated than
it needs to be. Before cpucaps are finalized, it will perform a bitmap
test of the system_cpucaps bitmap, and once cpucaps are finalized it
will use an alternative branch. This used to be necessary to handle some
race conditions in the window between cpucap detection and the
subsequent patching of alternatives and static branches, where different
branches could be out-of-sync with one another (or w.r.t. alternative
sequences). Now that we use alternative branches instead of static
branches, these are all patched atomically w.r.t. one another, and there
are only a handful of cases that need special care in the window between
cpucap detection and alternative patching.

Due to the above, it would be nice to remove cpus_have_const_cap(), and
migrate callers over to alternative_has_cap_*(), cpus_have_final_cap(),
or cpus_have_cap() depending on when their requirements. This will
remove redundant instructions and improve code generation, and will make
it easier to determine how each callsite will behave before, during, and
after alternative patching.

The ARM64_MTE cpucap is a boot cpu feature which is detected and patched
early on the boot CPU under smp_prepare_boot_cpu(). In the window
between detecting the ARM64_MTE cpucap and patching alternatives,
nothing depends on the ARM64_MTE cpucap:

* The kasan_hw_tags_enabled() helper depends upon the kasan_flag_enabled
  static key, which is initialized later in kasan_init_hw_tags() after
  alternatives have been applied.

* No KVM code is called during this window, and KVM is not initialized
  until after system cpucaps have been detected and patched. KVM code
  can safely use cpus_have_final_cap() or alternative_has_cap_*().

* We don't context-switch prior to patching boot alternatives, and thus
  mte_thread_switch() is not reachable during this window. Thus, we can
  safely use cpus_have_final_boot_cap() or alternative_has_cap_*() in
  the context-switch code.

* IRQ and FIQ are masked during this window, and we can only take SError
  and Debug exceptions. SError exceptions are fatal at this point in
  time, and we do not expect to take Debug exceptions, thus:

  - It's fine to lave TCO set for exceptions taken during this window,
    and mte_disable_tco_entry() doesn't need to do anything.

  - We don't need to detect and report asynchronous tag cehck faults
    during this window, and neither mte_check_tfsr_entry() nor
    mte_check_tfsr_exit() need to do anything.

  Since we want to report any SErrors taken during thiw window, these
  cannot safely use cpus_have_final_boot_cap() or cpus_have_final_cap(),
  but these can safely use alternative_has_cap_*().

* The __set_pte_at() function is not used during this window. It is
  possible for this to be used on kernel mappings prior to boot cpucaps
  being finalized, so this cannot safely use cpus_have_final_boot_cap()
  or cpus_have_final_cap(), but this can safely use
  alternative_has_cap_*().

* No userspace translation tables have been created yet, and swap has
  not been initialized yet. Thus swapping is not possible and none of
  the following are called:

  - arch_thp_swp_supported()
  - arch_prepare_to_swap()
  - arch_swap_invalidate_page()
  - arch_swap_invalidate_area()
  - arch_swap_restore()

  These can safely use system_has_final_cap() or
  alternative_has_cap_*().

* The elfcore functions are only reachable after userspace is brought
  up, which happens after system cpucaps have been detected and patched.
  Thus the elfcore code can safely use cpus_have_final_cap() or
  alternative_has_cap_*().

* Hibernation is only possible after userspace is brought up, which
  happens after system cpucaps have been detected and patched. Thus the
  hibernate code can safely use cpus_have_final_cap() or
  alternative_has_cap_*().

* The set_tagged_addr_ctrl() function is only reachable after userspace
  is brought up, which happens after system cpucaps have been detected
  and patched. Thus this can safely use cpus_have_final_cap() or
  alternative_has_cap_*().

* The copy_user_highpage() and copy_highpage() functions are not used
  during this window, and can safely use alternative_has_cap_*().

This patch replaces the use of cpus_have_const_cap() with
alternative_has_cap_unlikely(), which avoid generating code to test the
system_cpucaps bitmap and should be better for all subsequent calls at
runtime.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Peter Collingbourne <pcc@google.com>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/cpufeature.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index bceb007f9ea3..386b339d5367 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -826,7 +826,7 @@ static __always_inline bool system_uses_irq_prio_masking(void)
 
 static inline bool system_supports_mte(void)
 {
-	return cpus_have_const_cap(ARM64_MTE);
+	return alternative_has_cap_unlikely(ARM64_MTE);
 }
 
 static inline bool system_has_prio_mask_debugging(void)
-- 
cgit v1.2.3


From bc75d0c0f376fef3548691a73f1311c6cc1041e6 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 16 Oct 2023 11:24:50 +0100
Subject: arm64: Avoid cpus_have_const_cap() for ARM64_SSBS

In ssbs_thread_switch() we use cpus_have_const_cap() to check for
ARM64_SSBS, but this is not necessary and alternative_has_cap_*() would
be preferable.

For historical reasons, cpus_have_const_cap() is more complicated than
it needs to be. Before cpucaps are finalized, it will perform a bitmap
test of the system_cpucaps bitmap, and once cpucaps are finalized it
will use an alternative branch. This used to be necessary to handle some
race conditions in the window between cpucap detection and the
subsequent patching of alternatives and static branches, where different
branches could be out-of-sync with one another (or w.r.t. alternative
sequences). Now that we use alternative branches instead of static
branches, these are all patched atomically w.r.t. one another, and there
are only a handful of cases that need special care in the window between
cpucap detection and alternative patching.

Due to the above, it would be nice to remove cpus_have_const_cap(), and
migrate callers over to alternative_has_cap_*(), cpus_have_final_cap(),
or cpus_have_cap() depending on when their requirements. This will
remove redundant instructions and improve code generation, and will make
it easier to determine how each callsite will behave before, during, and
after alternative patching.

The cpus_have_const_cap() check in ssbs_thread_switch() is an
optimization to avoid the overhead of
spectre_v4_enable_task_mitigation() where all CPUs implement SSBS and
naturally preserve the SSBS bit in SPSR_ELx. In the window between
detecting the ARM64_SSBS system-wide and patching alternative branches
it is benign to continue to call spectre_v4_enable_task_mitigation().

This patch replaces the use of cpus_have_const_cap() with
alternative_has_cap_unlikely(), which will avoid generating code to test
the system_cpucaps bitmap and should be better for all subsequent calls
at runtime.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/process.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index 0fcc4eb1a7ab..657ea273c0f9 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -454,7 +454,7 @@ static void ssbs_thread_switch(struct task_struct *next)
 	 * If all CPUs implement the SSBS extension, then we just need to
 	 * context-switch the PSTATE field.
 	 */
-	if (cpus_have_const_cap(ARM64_SSBS))
+	if (alternative_has_cap_unlikely(ARM64_SSBS))
 		return;
 
 	spectre_v4_enable_task_mitigation(next);
-- 
cgit v1.2.3


From af64543977d66e13e487870780c18ef48187c493 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 16 Oct 2023 11:24:51 +0100
Subject: arm64: Avoid cpus_have_const_cap() for ARM64_SPECTRE_V2

In arm64_apply_bp_hardening() we use cpus_have_const_cap() to check for
ARM64_SPECTRE_V2 , but this is not necessary and alternative_has_cap_*()
would be preferable.

For historical reasons, cpus_have_const_cap() is more complicated than
it needs to be. Before cpucaps are finalized, it will perform a bitmap
test of the system_cpucaps bitmap, and once cpucaps are finalized it
will use an alternative branch. This used to be necessary to handle some
race conditions in the window between cpucap detection and the
subsequent patching of alternatives and static branches, where different
branches could be out-of-sync with one another (or w.r.t. alternative
sequences). Now that we use alternative branches instead of static
branches, these are all patched atomically w.r.t. one another, and there
are only a handful of cases that need special care in the window between
cpucap detection and alternative patching.

Due to the above, it would be nice to remove cpus_have_const_cap(), and
migrate callers over to alternative_has_cap_*(), cpus_have_final_cap(),
or cpus_have_cap() depending on when their requirements. This will
remove redundant instructions and improve code generation, and will make
it easier to determine how each callsite will behave before, during, and
after alternative patching.

The cpus_have_const_cap() check in arm64_apply_bp_hardening() is
intended to avoid the overhead of looking up and invoking a per-cpu
function pointer when no branch predictor hardening is required. The
arm64_apply_bp_hardening() function itself is called in two distinct
flows:

1) When handling certain exceptions taken from EL0, where the PC could
   be a TTBR1 address and hence might have trained a branch predictor.

   As cpucaps are detected and alternatives are patched long before it
   is possible to execute userspace, it is not necessary to use
   cpus_have_const_cap() for these cases, and cpus_have_final_cap() or
   alternative_has_cap() would be preferable.

2) When switching between tasks in check_and_switch_context().

   This can be called before cpucaps are detected and alternatives are
   patched, but this is long before the kernel mounts filesystems or
   accepts any input. At this stage the kernel hasn't loaded any secrets
   and there is no potential for hostile branch predictor training. Once
   cpucaps have been finalized and alternatives have been patched,
   switching tasks will invalidate any prior predictions. Hence it is
   not necessary to use cpus_have_const_cap() for this case.

This patch replaces the use of cpus_have_const_cap() with
alternative_has_cap_unlikely(), which will avoid generating code to test
the system_cpucaps bitmap and should be better for all subsequent calls
at runtime.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/spectre.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/spectre.h b/arch/arm64/include/asm/spectre.h
index 9cc501450486..06c357d83b13 100644
--- a/arch/arm64/include/asm/spectre.h
+++ b/arch/arm64/include/asm/spectre.h
@@ -73,7 +73,7 @@ static __always_inline void arm64_apply_bp_hardening(void)
 {
 	struct bp_hardening_data *d;
 
-	if (!cpus_have_const_cap(ARM64_SPECTRE_V2))
+	if (!alternative_has_cap_unlikely(ARM64_SPECTRE_V2))
 		return;
 
 	d = this_cpu_ptr(&bp_hardening_data);
-- 
cgit v1.2.3


From a76521d160284a1edcb866e6fc6727d9f10ccb68 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 16 Oct 2023 11:24:52 +0100
Subject: arm64: Avoid cpus_have_const_cap() for ARM64_{SVE,SME,SME2,FA64}

In system_supports_{sve,sme,sme2,fa64}() we use cpus_have_const_cap() to
check for the relevant cpucaps, but this is only necessary so that
sve_setup() and sme_setup() can run prior to alternatives being patched,
and otherwise alternative_has_cap_*() would be preferable.

For historical reasons, cpus_have_const_cap() is more complicated than
it needs to be. Before cpucaps are finalized, it will perform a bitmap
test of the system_cpucaps bitmap, and once cpucaps are finalized it
will use an alternative branch. This used to be necessary to handle some
race conditions in the window between cpucap detection and the
subsequent patching of alternatives and static branches, where different
branches could be out-of-sync with one another (or w.r.t. alternative
sequences). Now that we use alternative branches instead of static
branches, these are all patched atomically w.r.t. one another, and there
are only a handful of cases that need special care in the window between
cpucap detection and alternative patching.

Due to the above, it would be nice to remove cpus_have_const_cap(), and
migrate callers over to alternative_has_cap_*(), cpus_have_final_cap(),
or cpus_have_cap() depending on when their requirements. This will
remove redundant instructions and improve code generation, and will make
it easier to determine how each callsite will behave before, during, and
after alternative patching.

All of system_supports_{sve,sme,sme2,fa64}() will return false prior to
system cpucaps being detected. In the window between system cpucaps being
detected and patching alternatives, we need system_supports_sve() and
system_supports_sme() to run to initialize SVE and SME properties, but
all other users of system_supports_{sve,sme,sme2,fa64}() don't depend on
the relevant cpucap becoming true until alternatives are patched:

* No KVM code runs until after alternatives are patched, and so this can
  safely use cpus_have_final_cap() or alternative_has_cap_*().

* The cpuid_cpu_online() callback in arch/arm64/kernel/cpuinfo.c is
  registered later from cpuinfo_regs_init() as a device_initcall, and so
  this can safely use cpus_have_final_cap() or alternative_has_cap_*().

* The entry, signal, and ptrace code isn't reachable until userspace has
  run, and so this can safely use cpus_have_final_cap() or
  alternative_has_cap_*().

* Currently perf_reg_validate() will un-reserve the PERF_REG_ARM64_VG
  pseudo-register before alternatives are patched, and before
  sve_setup() has run. If a sampling event is created early enough, this
  would allow perf_ext_reg_value() to sample (the as-yet uninitialized)
  thread_struct::vl[] prior to alternatives being patched.

  It would be preferable to defer this until alternatives are patched,
  and this can safely use alternative_has_cap_*().

* The context-switch code will run during this window as part of
  stop_machine() used during alternatives_patch_all(), and potentially
  for other work if other kernel threads are created early. No threads
  require the use of SVE/SME/SME2/FA64 prior to alternatives being
  patched, and it would be preferable for the related context-switch
  logic to take effect after alternatives are patched so that ths is
  guaranteed to see a consistent system-wide state (e.g. anything
  initialized by sve_setup() and sme_setup().

  This can safely ues alternative_has_cap_*().

This patch replaces the use of cpus_have_const_cap() with
alternative_has_cap_unlikely(), which will avoid generating code to test
the system_cpucaps bitmap and should be better for all subsequent calls
at runtime. The sve_setup() and sme_setup() functions are modified to
use cpus_have_cap() directly so that they can observe the cpucaps being
set prior to alternatives being patched.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Mark Brown <broonie@kernel.org>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/cpufeature.h | 8 ++++----
 arch/arm64/kernel/fpsimd.c          | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index 386b339d5367..e1f011f9ea20 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -776,22 +776,22 @@ static inline bool system_uses_ttbr0_pan(void)
 
 static __always_inline bool system_supports_sve(void)
 {
-	return cpus_have_const_cap(ARM64_SVE);
+	return alternative_has_cap_unlikely(ARM64_SVE);
 }
 
 static __always_inline bool system_supports_sme(void)
 {
-	return cpus_have_const_cap(ARM64_SME);
+	return alternative_has_cap_unlikely(ARM64_SME);
 }
 
 static __always_inline bool system_supports_sme2(void)
 {
-	return cpus_have_const_cap(ARM64_SME2);
+	return alternative_has_cap_unlikely(ARM64_SME2);
 }
 
 static __always_inline bool system_supports_fa64(void)
 {
-	return cpus_have_const_cap(ARM64_SME_FA64);
+	return alternative_has_cap_unlikely(ARM64_SME_FA64);
 }
 
 static __always_inline bool system_supports_tpidr2(void)
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index d286bcfc3f41..d0d28bc069d2 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -1192,7 +1192,7 @@ void __init sve_setup(void)
 	DECLARE_BITMAP(tmp_map, SVE_VQ_MAX);
 	unsigned long b;
 
-	if (!system_supports_sve())
+	if (!cpus_have_cap(ARM64_SVE))
 		return;
 
 	/*
@@ -1351,7 +1351,7 @@ void __init sme_setup(void)
 	u64 smcr;
 	int min_bit;
 
-	if (!system_supports_sme())
+	if (!cpus_have_cap(ARM64_SME))
 		return;
 
 	/*
-- 
cgit v1.2.3


From c2ef5f1e159285a0f7944e8838698f81cb709f15 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 16 Oct 2023 11:24:53 +0100
Subject: arm64: Avoid cpus_have_const_cap() for ARM64_UNMAP_KERNEL_AT_EL0

In arm64_kernel_unmapped_at_el0() we use cpus_have_const_cap() to check
for ARM64_UNMAP_KERNEL_AT_EL0, but this is only necessary so that
arm64_get_bp_hardening_vector() and this_cpu_set_vectors() can run prior
to alternatives being patched. Otherwise this is not necessary and
alternative_has_cap_*() would be preferable.

For historical reasons, cpus_have_const_cap() is more complicated than
it needs to be. Before cpucaps are finalized, it will perform a bitmap
test of the system_cpucaps bitmap, and once cpucaps are finalized it
will use an alternative branch. This used to be necessary to handle some
race conditions in the window between cpucap detection and the
subsequent patching of alternatives and static branches, where different
branches could be out-of-sync with one another (or w.r.t. alternative
sequences). Now that we use alternative branches instead of static
branches, these are all patched atomically w.r.t. one another, and there
are only a handful of cases that need special care in the window between
cpucap detection and alternative patching.

Due to the above, it would be nice to remove cpus_have_const_cap(), and
migrate callers over to alternative_has_cap_*(), cpus_have_final_cap(),
or cpus_have_cap() depending on when their requirements. This will
remove redundant instructions and improve code generation, and will make
it easier to determine how each callsite will behave before, during, and
after alternative patching.

The ARM64_UNMAP_KERNEL_AT_EL0 cpucap is a system-wide feature that is
detected and patched before any translation tables are created for
userspace. In the window between detecting the ARM64_UNMAP_KERNEL_AT_EL0
cpucap and patching alternatives, most users of
arm64_kernel_unmapped_at_el0() do not need to know that the cpucap has
been detected:

* As KVM is initialized after cpucaps are finalized, no usaef of
  arm64_kernel_unmapped_at_el0() in the KVM code is reachable during
  this window.

* The arm64_mm_context_get() function in arch/arm64/mm/context.c is only
  called after the SMMU driver is brought up after alternatives have
  been patched. Thus this can safely use cpus_have_final_cap() or
  alternative_has_cap_*().

  Similarly the asids_update_limit() function is called after
  alternatives have been patched as an arch_initcall, and this can
  safely use cpus_have_final_cap() or alternative_has_cap_*().

  Similarly we do not expect an ASID rollover to occur between cpucaps
  being detected and patching alternatives. Thus
  set_reserved_asid_bits() can safely use cpus_have_final_cap() or
  alternative_has_cap_*().

* The __tlbi_user() and __tlbi_user_level() macros are not used during
  this window, and only need to invalidate additional entries once
  userspace translation tables have been active on a CPU. Thus these can
  safely use alternative_has_cap_*().

* The xen_kernel_unmapped_at_usr() function is not used during this
  window as it is only used in a late_initcall. Thus this can safely use
  cpus_have_final_cap() or alternative_has_cap_*().

* The arm64_get_meltdown_state() function is not used during this
  window. It only used by arm64_get_meltdown_state() and KVM code, both
  of which are only used after cpucaps have been finalized. Thus this
  can safely use cpus_have_final_cap() or alternative_has_cap_*().

* The tls_thread_switch() uses arm64_kernel_unmapped_at_el0() as an
  optimization to avoid zeroing tpidrro_el0 when KPTI is enabled
  and this will be trampled by the KPTI trampoline. It doesn't matter if
  this continues to zero the register during the window between
  detecting the cpucap and patching alternatives, so this can safely use
  alternative_has_cap_*().

* The sdei_arch_get_entry_point() and do_sdei_event() functions aren't
  reachable at this time as the SDEI driver is registered later by
  acpi_init() -> acpi_ghes_init() -> sdei_init(), where acpi_init is a
  subsys_initcall. Thus these can safely use cpus_have_final_cap() or
  alternative_has_cap_*().

* The uses under drivers/ aren't reachable at this time as the drivers
  are registered later:

  - TRBE is registered via module_init()
  - SMMUv3 is registred via module_driver()
  - SPE is registred via module_init()

* The arm64_get_bp_hardening_vector() and this_cpu_set_vectors()
  functions need to run on boot CPUs prior to patching alternatives.
  As these are only called during the onlining of a CPU, it's fine to
  perform a system_cpucaps bitmap test using cpus_have_cap().

This patch modifies this_cpu_set_vectors() to use cpus_have_cap(), and
replaced all other use of cpus_have_const_cap() with
alternative_has_cap_unlikely(), which will avoid generating code to test
the system_cpucaps bitmap and should be better for all subsequent calls
at runtime. The ARM64_UNMAP_KERNEL_AT_EL0 cpucap is added to
cpucap_is_possible() so that code can be elided entirely when this is
not possible.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: James Morse <james.morse@arm.com>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/cpucaps.h | 2 ++
 arch/arm64/include/asm/mmu.h     | 2 +-
 arch/arm64/include/asm/vectors.h | 2 +-
 arch/arm64/kernel/proton-pack.c  | 2 +-
 4 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h
index af9550147dd0..5e3ec59fb48b 100644
--- a/arch/arm64/include/asm/cpucaps.h
+++ b/arch/arm64/include/asm/cpucaps.h
@@ -42,6 +42,8 @@ cpucap_is_possible(const unsigned int cap)
 		return IS_ENABLED(CONFIG_ARM64_BTI);
 	case ARM64_HAS_TLB_RANGE:
 		return IS_ENABLED(CONFIG_ARM64_TLB_RANGE);
+	case ARM64_UNMAP_KERNEL_AT_EL0:
+		return IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0);
 	case ARM64_WORKAROUND_2658417:
 		return IS_ENABLED(CONFIG_ARM64_ERRATUM_2658417);
 	}
diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
index 94b68850cb9f..2fcf51231d6e 100644
--- a/arch/arm64/include/asm/mmu.h
+++ b/arch/arm64/include/asm/mmu.h
@@ -57,7 +57,7 @@ typedef struct {
 
 static inline bool arm64_kernel_unmapped_at_el0(void)
 {
-	return cpus_have_const_cap(ARM64_UNMAP_KERNEL_AT_EL0);
+	return alternative_has_cap_unlikely(ARM64_UNMAP_KERNEL_AT_EL0);
 }
 
 extern void arm64_memblock_init(void);
diff --git a/arch/arm64/include/asm/vectors.h b/arch/arm64/include/asm/vectors.h
index bc9a2145f419..b815d8f2c0dc 100644
--- a/arch/arm64/include/asm/vectors.h
+++ b/arch/arm64/include/asm/vectors.h
@@ -62,7 +62,7 @@ DECLARE_PER_CPU_READ_MOSTLY(const char *, this_cpu_vector);
 static inline const char *
 arm64_get_bp_hardening_vector(enum arm64_bp_harden_el1_vectors slot)
 {
-	if (arm64_kernel_unmapped_at_el0())
+	if (cpus_have_cap(ARM64_UNMAP_KERNEL_AT_EL0))
 		return (char *)(TRAMP_VALIAS + SZ_2K * slot);
 
 	WARN_ON_ONCE(slot == EL1_VECTOR_KPTI);
diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c
index 05f40c4e18fd..6268a13a1d58 100644
--- a/arch/arm64/kernel/proton-pack.c
+++ b/arch/arm64/kernel/proton-pack.c
@@ -972,7 +972,7 @@ static void this_cpu_set_vectors(enum arm64_bp_harden_el1_vectors slot)
 	 * When KPTI is in use, the vectors are switched when exiting to
 	 * user-space.
 	 */
-	if (arm64_kernel_unmapped_at_el0())
+	if (cpus_have_cap(ARM64_UNMAP_KERNEL_AT_EL0))
 		return;
 
 	write_sysreg(v, vbar_el1);
-- 
cgit v1.2.3


From 0a285dfe875d5206da10c3d00564727f9482a9dd Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 16 Oct 2023 11:24:54 +0100
Subject: arm64: Avoid cpus_have_const_cap() for ARM64_WORKAROUND_843419

In count_plts() and is_forbidden_offset_for_adrp() we use
cpus_have_const_cap() to check for ARM64_WORKAROUND_843419, but this is
not necessary and cpus_have_final_cap() would be preferable.

For historical reasons, cpus_have_const_cap() is more complicated than
it needs to be. Before cpucaps are finalized, it will perform a bitmap
test of the system_cpucaps bitmap, and once cpucaps are finalized it
will use an alternative branch. This used to be necessary to handle some
race conditions in the window between cpucap detection and the
subsequent patching of alternatives and static branches, where different
branches could be out-of-sync with one another (or w.r.t. alternative
sequences). Now that we use alternative branches instead of static
branches, these are all patched atomically w.r.t. one another, and there
are only a handful of cases that need special care in the window between
cpucap detection and alternative patching.

Due to the above, it would be nice to remove cpus_have_const_cap(), and
migrate callers over to alternative_has_cap_*(), cpus_have_final_cap(),
or cpus_have_cap() depending on when their requirements. This will
remove redundant instructions and improve code generation, and will make
it easier to determine how each callsite will behave before, during, and
after alternative patching.

It's not possible to load a module in the window between detecting the
ARM64_WORKAROUND_843419 cpucap and patching alternatives. The module VA
range limits are initialized much later in module_init_limits() which is
a subsys_initcall, and module loading cannot happen before this. Hence
it's not necessary for count_plts() or is_forbidden_offset_for_adrp() to
use cpus_have_const_cap().

This patch replaces the use of cpus_have_const_cap() with
cpus_have_final_cap() which will avoid generating code to test the
system_cpucaps bitmap and should be better for all subsequent calls at
runtime. Using cpus_have_final_cap() clearly documents that we do not
expect this code to run before cpucaps are finalized, and will make it
easier to spot issues if code is changed in future to allow modules to
be loaded earlier. The ARM64_WORKAROUND_843419 cpucap is added to
cpucap_is_possible() so that code can be elided entirely when this is not
possible, and redundant IS_ENABLED() checks are removed.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/cpucaps.h | 2 ++
 arch/arm64/include/asm/module.h  | 3 +--
 arch/arm64/kernel/module-plts.c  | 7 +++----
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h
index 5e3ec59fb48b..68b0a658e082 100644
--- a/arch/arm64/include/asm/cpucaps.h
+++ b/arch/arm64/include/asm/cpucaps.h
@@ -44,6 +44,8 @@ cpucap_is_possible(const unsigned int cap)
 		return IS_ENABLED(CONFIG_ARM64_TLB_RANGE);
 	case ARM64_UNMAP_KERNEL_AT_EL0:
 		return IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0);
+	case ARM64_WORKAROUND_843419:
+		return IS_ENABLED(CONFIG_ARM64_ERRATUM_843419);
 	case ARM64_WORKAROUND_2658417:
 		return IS_ENABLED(CONFIG_ARM64_ERRATUM_2658417);
 	}
diff --git a/arch/arm64/include/asm/module.h b/arch/arm64/include/asm/module.h
index bfa6638b4c93..79550b22ba19 100644
--- a/arch/arm64/include/asm/module.h
+++ b/arch/arm64/include/asm/module.h
@@ -44,8 +44,7 @@ struct plt_entry {
 
 static inline bool is_forbidden_offset_for_adrp(void *place)
 {
-	return IS_ENABLED(CONFIG_ARM64_ERRATUM_843419) &&
-	       cpus_have_const_cap(ARM64_WORKAROUND_843419) &&
+	return cpus_have_final_cap(ARM64_WORKAROUND_843419) &&
 	       ((u64)place & 0xfff) >= 0xff8;
 }
 
diff --git a/arch/arm64/kernel/module-plts.c b/arch/arm64/kernel/module-plts.c
index bd69a4e7cd60..399692c414f0 100644
--- a/arch/arm64/kernel/module-plts.c
+++ b/arch/arm64/kernel/module-plts.c
@@ -203,8 +203,7 @@ static unsigned int count_plts(Elf64_Sym *syms, Elf64_Rela *rela, int num,
 			break;
 		case R_AARCH64_ADR_PREL_PG_HI21_NC:
 		case R_AARCH64_ADR_PREL_PG_HI21:
-			if (!IS_ENABLED(CONFIG_ARM64_ERRATUM_843419) ||
-			    !cpus_have_const_cap(ARM64_WORKAROUND_843419))
+			if (!cpus_have_final_cap(ARM64_WORKAROUND_843419))
 				break;
 
 			/*
@@ -239,13 +238,13 @@ static unsigned int count_plts(Elf64_Sym *syms, Elf64_Rela *rela, int num,
 		}
 	}
 
-	if (IS_ENABLED(CONFIG_ARM64_ERRATUM_843419) &&
-	    cpus_have_const_cap(ARM64_WORKAROUND_843419))
+	if (cpus_have_final_cap(ARM64_WORKAROUND_843419)) {
 		/*
 		 * Add some slack so we can skip PLT slots that may trigger
 		 * the erratum due to the placement of the ADRP instruction.
 		 */
 		ret += DIV_ROUND_UP(ret, (SZ_4K / sizeof(struct plt_entry)));
+	}
 
 	return ret;
 }
-- 
cgit v1.2.3


From d1e40f822290d3116f83acc06606d181cccf5bb2 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 16 Oct 2023 11:24:55 +0100
Subject: arm64: Avoid cpus_have_const_cap() for ARM64_WORKAROUND_1542419

We use cpus_have_const_cap() to check for ARM64_WORKAROUND_1542419 but
this is not necessary and cpus_have_final_cap() would be preferable.

For historical reasons, cpus_have_const_cap() is more complicated than
it needs to be. Before cpucaps are finalized, it will perform a bitmap
test of the system_cpucaps bitmap, and once cpucaps are finalized it
will use an alternative branch. This used to be necessary to handle some
race conditions in the window between cpucap detection and the
subsequent patching of alternatives and static branches, where different
branches could be out-of-sync with one another (or w.r.t. alternative
sequences). Now that we use alternative branches instead of static
branches, these are all patched atomically w.r.t. one another, and there
are only a handful of cases that need special care in the window between
cpucap detection and alternative patching.

Due to the above, it would be nice to remove cpus_have_const_cap(), and
migrate callers over to alternative_has_cap_*(), cpus_have_final_cap(),
or cpus_have_cap() depending on when their requirements. This will
remove redundant instructions and improve code generation, and will make
it easier to determine how each callsite will behave before, during, and
after alternative patching.

The ARM64_WORKAROUND_1542419 cpucap is detected and patched before any
userspace code can run, and the both __do_compat_cache_op() and
ctr_read_handler() are only reachable from exceptions taken from
userspace. Thus it is not necessary for either to use
cpus_have_const_cap(), and cpus_have_final_cap() is equivalent.

This patch replaces the use of cpus_have_const_cap() with
cpus_have_final_cap(), which will avoid generating code to test the
system_cpucaps bitmap and should be better for all subsequent calls at
runtime. Using cpus_have_final_cap() clearly documents that we do not
expect this code to run before cpucaps are finalized, and will make it
easier to spot issues if code is changed in future to allow these
functions to be reached earlier.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/sys_compat.c | 2 +-
 arch/arm64/kernel/traps.c      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/kernel/sys_compat.c b/arch/arm64/kernel/sys_compat.c
index df14336c3a29..4a609e9b65de 100644
--- a/arch/arm64/kernel/sys_compat.c
+++ b/arch/arm64/kernel/sys_compat.c
@@ -31,7 +31,7 @@ __do_compat_cache_op(unsigned long start, unsigned long end)
 		if (fatal_signal_pending(current))
 			return 0;
 
-		if (cpus_have_const_cap(ARM64_WORKAROUND_1542419)) {
+		if (cpus_have_final_cap(ARM64_WORKAROUND_1542419)) {
 			/*
 			 * The workaround requires an inner-shareable tlbi.
 			 * We pick the reserved-ASID to minimise the impact.
diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
index 8b70759cdbb9..9eba6cdd7038 100644
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -631,7 +631,7 @@ static void ctr_read_handler(unsigned long esr, struct pt_regs *regs)
 	int rt = ESR_ELx_SYS64_ISS_RT(esr);
 	unsigned long val = arm64_ftr_reg_user_value(&arm64_ftr_reg_ctrel0);
 
-	if (cpus_have_const_cap(ARM64_WORKAROUND_1542419)) {
+	if (cpus_have_final_cap(ARM64_WORKAROUND_1542419)) {
 		/* Hide DIC so that we can trap the unnecessary maintenance...*/
 		val &= ~BIT(CTR_EL0_DIC_SHIFT);
 
-- 
cgit v1.2.3


From 48b57d9199f471b4366bbc834890be8249525534 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 16 Oct 2023 11:24:56 +0100
Subject: arm64: Avoid cpus_have_const_cap() for ARM64_WORKAROUND_1742098

In elf_hwcap_fixup() we use cpus_have_const_cap() to check for
ARM64_WORKAROUND_1742098, but this is not necessary and cpus_have_cap()
would be preferable.

For historical reasons, cpus_have_const_cap() is more complicated than
it needs to be. Before cpucaps are finalized, it will perform a bitmap
test of the system_cpucaps bitmap, and once cpucaps are finalized it
will use an alternative branch. This used to be necessary to handle some
race conditions in the window between cpucap detection and the
subsequent patching of alternatives and static branches, where different
branches could be out-of-sync with one another (or w.r.t. alternative
sequences). Now that we use alternative branches instead of static
branches, these are all patched atomically w.r.t. one another, and there
are only a handful of cases that need special care in the window between
cpucap detection and alternative patching.

Due to the above, it would be nice to remove cpus_have_const_cap(), and
migrate callers over to alternative_has_cap_*(), cpus_have_final_cap(),
or cpus_have_cap() depending on when their requirements. This will
remove redundant instructions and improve code generation, and will make
it easier to determine how each callsite will behave before, during, and
after alternative patching.

The ARM64_WORKAROUND_1742098 cpucap is detected and patched before
elf_hwcap_fixup() can run, and hence it is not necessary to use
cpus_have_const_cap(). We run cpus_have_const_cap() at most twice: once
after finalizing system cpucaps, and potentially once more after
detecting mismatched CPUs which support AArch32 at EL0. Due to this,
it's not necessary to optimize for many calls to elf_hwcap_fixup(), and
it's fine to use cpus_have_cap().

This patch replaces the use of cpus_have_const_cap() with
cpus_have_cap(), which will only generate the bitmap test and avoid
generating an alternative sequence, resulting in slightly simpler annd
smaller code being generated. For consistenct with other cpucaps, the
ARM64_WORKAROUND_1742098 cpucap is added to cpucap_is_possible() so that
code can be elided when this is not possible. However, as we only define
compat_elf_hwcap2 when CONFIG_COMPAT=y, some ifdeffery is still required
within user_feature_fixup() to avoid build errors when CONFIG_COMPAT=n.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: James Morse <james.morse@arm.com>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/cpucaps.h | 2 ++
 arch/arm64/kernel/cpufeature.c   | 6 +++---
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h
index 68b0a658e082..5613d9e813b2 100644
--- a/arch/arm64/include/asm/cpucaps.h
+++ b/arch/arm64/include/asm/cpucaps.h
@@ -46,6 +46,8 @@ cpucap_is_possible(const unsigned int cap)
 		return IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0);
 	case ARM64_WORKAROUND_843419:
 		return IS_ENABLED(CONFIG_ARM64_ERRATUM_843419);
+	case ARM64_WORKAROUND_1742098:
+		return IS_ENABLED(CONFIG_ARM64_ERRATUM_1742098);
 	case ARM64_WORKAROUND_2658417:
 		return IS_ENABLED(CONFIG_ARM64_ERRATUM_2658417);
 	}
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 9ab7e19b7176..027783b7a2aa 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -2221,10 +2221,10 @@ static void user_feature_fixup(void)
 
 static void elf_hwcap_fixup(void)
 {
-#ifdef CONFIG_ARM64_ERRATUM_1742098
-	if (cpus_have_const_cap(ARM64_WORKAROUND_1742098))
+#ifdef CONFIG_COMPAT
+	if (cpus_have_cap(ARM64_WORKAROUND_1742098))
 		compat_elf_hwcap2 &= ~COMPAT_HWCAP2_AES;
-#endif /* ARM64_ERRATUM_1742098 */
+#endif /* CONFIG_COMPAT */
 }
 
 #ifdef CONFIG_KVM
-- 
cgit v1.2.3


From 412cb3801dfac4703032f438fa538e850cc14b6d Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 16 Oct 2023 11:24:57 +0100
Subject: arm64: Avoid cpus_have_const_cap() for ARM64_WORKAROUND_2645198

We use cpus_have_const_cap() to check for ARM64_WORKAROUND_2645198 but
this is not necessary and alternative_has_cap() would be preferable.

For historical reasons, cpus_have_const_cap() is more complicated than
it needs to be. Before cpucaps are finalized, it will perform a bitmap
test of the system_cpucaps bitmap, and once cpucaps are finalized it
will use an alternative branch. This used to be necessary to handle some
race conditions in the window between cpucap detection and the
subsequent patching of alternatives and static branches, where different
branches could be out-of-sync with one another (or w.r.t. alternative
sequences). Now that we use alternative branches instead of static
branches, these are all patched atomically w.r.t. one another, and there
are only a handful of cases that need special care in the window between
cpucap detection and alternative patching.

Due to the above, it would be nice to remove cpus_have_const_cap(), and
migrate callers over to alternative_has_cap_*(), cpus_have_final_cap(),
or cpus_have_cap() depending on when their requirements. This will
remove redundant instructions and improve code generation, and will make
it easier to determine how each callsite will behave before, during, and
after alternative patching.

The ARM64_WORKAROUND_2645198 cpucap is detected and patched before any
userspace translation table exist, and the workaround is only necessary
when manipulating usrspace translation tables which are in use. Thus it
is not necessary to use cpus_have_const_cap(), and alternative_has_cap()
is equivalent.

This patch replaces the use of cpus_have_const_cap() with
alternative_has_cap_unlikely(), which will avoid generating code to test
the system_cpucaps bitmap and should be better for all subsequent calls
at runtime.  The ARM64_WORKAROUND_2645198 cpucap is added to
cpucap_is_possible() so that code can be elided entirely when this is
not possible, and redundant IS_ENABLED() checks are removed.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/cpucaps.h | 2 ++
 arch/arm64/mm/hugetlbpage.c      | 3 +--
 arch/arm64/mm/mmu.c              | 3 +--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h
index 5613d9e813b2..c5b67a64613e 100644
--- a/arch/arm64/include/asm/cpucaps.h
+++ b/arch/arm64/include/asm/cpucaps.h
@@ -48,6 +48,8 @@ cpucap_is_possible(const unsigned int cap)
 		return IS_ENABLED(CONFIG_ARM64_ERRATUM_843419);
 	case ARM64_WORKAROUND_1742098:
 		return IS_ENABLED(CONFIG_ARM64_ERRATUM_1742098);
+	case ARM64_WORKAROUND_2645198:
+		return IS_ENABLED(CONFIG_ARM64_ERRATUM_2645198);
 	case ARM64_WORKAROUND_2658417:
 		return IS_ENABLED(CONFIG_ARM64_ERRATUM_2658417);
 	}
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 9c52718ea750..e9fc56e4f98c 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -555,8 +555,7 @@ bool __init arch_hugetlb_valid_size(unsigned long size)
 
 pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
 {
-	if (IS_ENABLED(CONFIG_ARM64_ERRATUM_2645198) &&
-	    cpus_have_const_cap(ARM64_WORKAROUND_2645198)) {
+	if (alternative_has_cap_unlikely(ARM64_WORKAROUND_2645198)) {
 		/*
 		 * Break-before-make (BBM) is required for all user space mappings
 		 * when the permission changes from executable to non-executable
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 47781bec6171..15f6347d23b6 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -1469,8 +1469,7 @@ early_initcall(prevent_bootmem_remove_init);
 
 pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
 {
-	if (IS_ENABLED(CONFIG_ARM64_ERRATUM_2645198) &&
-	    cpus_have_const_cap(ARM64_WORKAROUND_2645198)) {
+	if (alternative_has_cap_unlikely(ARM64_WORKAROUND_2645198)) {
 		/*
 		 * Break-before-make (BBM) is required for all user space mappings
 		 * when the permission changes from executable to non-executable
-- 
cgit v1.2.3


From a98a5eac4d69e639e2a3edbac9fce89de4ec39ee Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 16 Oct 2023 11:24:58 +0100
Subject: arm64: Avoid cpus_have_const_cap() for ARM64_WORKAROUND_CAVIUM_23154

In gic_read_iar() we use cpus_have_const_cap() to check for
ARM64_WORKAROUND_CAVIUM_23154 but this is not necessary and
alternative_has_cap_*() would be preferable.

For historical reasons, cpus_have_const_cap() is more complicated than
it needs to be. Before cpucaps are finalized, it will perform a bitmap
test of the system_cpucaps bitmap, and once cpucaps are finalized it
will use an alternative branch. This used to be necessary to handle some
race conditions in the window between cpucap detection and the
subsequent patching of alternatives and static branches, where different
branches could be out-of-sync with one another (or w.r.t. alternative
sequences). Now that we use alternative branches instead of static
branches, these are all patched atomically w.r.t. one another, and there
are only a handful of cases that need special care in the window between
cpucap detection and alternative patching.

Due to the above, it would be nice to remove cpus_have_const_cap(), and
migrate callers over to alternative_has_cap_*(), cpus_have_final_cap(),
or cpus_have_cap() depending on when their requirements. This will
remove redundant instructions and improve code generation, and will make
it easier to determine how each callsite will behave before, during, and
after alternative patching.

The ARM64_WORKAROUND_CAVIUM_23154 cpucap is detected and patched early
on the boot CPU before the GICv3 driver is initialized and hence before
gic_read_iar() is ever called. Thus it is not necessary to use
cpus_have_const_cap(), and alternative_has_cap() is equivalent.

In addition, arm64's gic_read_iar() lives in irq-gic-v3.c purely for
historical reasons. It was originally added prior to 32-bit arm support
in commit:

  6d4e11c5e2e8cd54 ("irqchip/gicv3: Workaround for Cavium ThunderX erratum 23154")

When support for 32-bit arm was added, 32-bit arm's gic_read_iar()
implementation was placed in <asm/arch_gicv3.h>, but the arm64 version
was kept within irq-gic-v3.c as it depended on a static key local to
irq-gic-v3.c and it was easier to add ifdeffery, which is what we did in
commit:

  7936e914f7b0827c ("irqchip/gic-v3: Refactor the arm64 specific parts")

Subsequently the static key was replaced with a cpucap in commit:

  a4023f682739439b ("arm64: Add hypervisor safe helper for checking constant capabilities")

Since that commit there has been no need to keep arm64's gic_read_iar()
in irq-gic-v3.c.

This patch replaces the use of cpus_have_const_cap() with
alternative_has_cap_unlikely(), which will avoid generating code to test
the system_cpucaps bitmap and should be better for all subsequent calls
at runtime. For consistency, move the arm64-specific gic_read_iar()
implementation over to arm64's <asm/arch_gicv3.h>. The
ARM64_WORKAROUND_CAVIUM_23154 cpucap is added to cpucap_is_possible() so
that code can be elided entirely when this is not possible.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/arch_gicv3.h |  8 ++++++++
 arch/arm64/include/asm/cpucaps.h    |  2 ++
 drivers/irqchip/irq-gic-v3.c        | 11 -----------
 3 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/arch/arm64/include/asm/arch_gicv3.h b/arch/arm64/include/asm/arch_gicv3.h
index 01281a5336cf..5f172611654b 100644
--- a/arch/arm64/include/asm/arch_gicv3.h
+++ b/arch/arm64/include/asm/arch_gicv3.h
@@ -79,6 +79,14 @@ static inline u64 gic_read_iar_cavium_thunderx(void)
 	return 0x3ff;
 }
 
+static u64 __maybe_unused gic_read_iar(void)
+{
+	if (alternative_has_cap_unlikely(ARM64_WORKAROUND_CAVIUM_23154))
+		return gic_read_iar_cavium_thunderx();
+	else
+		return gic_read_iar_common();
+}
+
 static inline void gic_write_ctlr(u32 val)
 {
 	write_sysreg_s(val, SYS_ICC_CTLR_EL1);
diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h
index c5b67a64613e..34b6428f08ba 100644
--- a/arch/arm64/include/asm/cpucaps.h
+++ b/arch/arm64/include/asm/cpucaps.h
@@ -52,6 +52,8 @@ cpucap_is_possible(const unsigned int cap)
 		return IS_ENABLED(CONFIG_ARM64_ERRATUM_2645198);
 	case ARM64_WORKAROUND_2658417:
 		return IS_ENABLED(CONFIG_ARM64_ERRATUM_2658417);
+	case ARM64_WORKAROUND_CAVIUM_23154:
+		return IS_ENABLED(CONFIG_CAVIUM_ERRATUM_23154);
 	}
 
 	return true;
diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c
index eedfa8e9f077..6d6c01f8f7b4 100644
--- a/drivers/irqchip/irq-gic-v3.c
+++ b/drivers/irqchip/irq-gic-v3.c
@@ -270,17 +270,6 @@ static void gic_redist_wait_for_rwp(void)
 	gic_do_wait_for_rwp(gic_data_rdist_rd_base(), GICR_CTLR_RWP);
 }
 
-#ifdef CONFIG_ARM64
-
-static u64 __maybe_unused gic_read_iar(void)
-{
-	if (cpus_have_const_cap(ARM64_WORKAROUND_CAVIUM_23154))
-		return gic_read_iar_cavium_thunderx();
-	else
-		return gic_read_iar_common();
-}
-#endif
-
 static void gic_enable_redist(bool enable)
 {
 	void __iomem *rbase;
-- 
cgit v1.2.3


From 0d48058ef8294a95907e647db018cefb522af98a Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 16 Oct 2023 11:24:59 +0100
Subject: arm64: Avoid cpus_have_const_cap() for
 ARM64_WORKAROUND_NVIDIA_CARMEL_CNP

In has_useable_cnp() we use cpus_have_const_cap() to check for
ARM64_WORKAROUND_NVIDIA_CARMEL_CNP, but this is not necessary and
cpus_have_cap() would be preferable.

For historical reasons, cpus_have_const_cap() is more complicated than
it needs to be. Before cpucaps are finalized, it will perform a bitmap
test of the system_cpucaps bitmap, and once cpucaps are finalized it
will use an alternative branch. This used to be necessary to handle some
race conditions in the window between cpucap detection and the
subsequent patching of alternatives and static branches, where different
branches could be out-of-sync with one another (or w.r.t. alternative
sequences). Now that we use alternative branches instead of static
branches, these are all patched atomically w.r.t. one another, and there
are only a handful of cases that need special care in the window between
cpucap detection and alternative patching.

Due to the above, it would be nice to remove cpus_have_const_cap(), and
migrate callers over to alternative_has_cap_*(), cpus_have_final_cap(),
or cpus_have_cap() depending on when their requirements. This will
remove redundant instructions and improve code generation, and will make
it easier to determine how each callsite will behave before, during, and
after alternative patching.

We use has_useable_cnp() to determine whether we have the system-wide
ARM64_HAS_CNP cpucap. Due to the structure of the cpufeature code, we
call has_useable_cnp() in two distinct cases:

1) When finalizing system capabilities, setup_system_capabilities() will
   call has_useable_cnp() with SCOPE_SYSTEM to determine whether all
   CPUs have the feature. This is called after we've detected any local
   cpucaps including ARM64_WORKAROUND_NVIDIA_CARMEL_CNP, but prior to
   patching alternatives.

   If the ARM64_WORKAROUND_NVIDIA_CARMEL_CNP was detected, we will not
   detect ARM64_HAS_CNP.

2) After finalizing system capabilties, verify_local_cpu_capabilities()
   will call has_useable_cnp() with SCOPE_LOCAL_CPU to verify that CPUs
   have CNP if we previously detected it.

   Note that if ARM64_WORKAROUND_NVIDIA_CARMEL_CNP was detected, we will
   not have detected ARM64_HAS_CNP.

For case 1 we must check the system_cpucaps bitmap as this occurs prior
to patching the alternatives. For case 2 we'll only call
has_useable_cnp() once per subsequent onlining of a CPU, and as this
isn't a fast path it's not necessary to optimize for this case.

This patch replaces the use of cpus_have_const_cap() with
cpus_have_cap(), which will only generate the bitmap test and avoid
generating an alternative sequence, resulting in slightly simpler annd
smaller code being generated. The ARM64_WORKAROUND_NVIDIA_CARMEL_CNP
cpucap is added to cpucap_is_possible() so that code can be elided
entirely when this is not possible.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/cpucaps.h | 2 ++
 arch/arm64/kernel/cpufeature.c   | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h
index 34b6428f08ba..7ddb79c235c2 100644
--- a/arch/arm64/include/asm/cpucaps.h
+++ b/arch/arm64/include/asm/cpucaps.h
@@ -54,6 +54,8 @@ cpucap_is_possible(const unsigned int cap)
 		return IS_ENABLED(CONFIG_ARM64_ERRATUM_2658417);
 	case ARM64_WORKAROUND_CAVIUM_23154:
 		return IS_ENABLED(CONFIG_CAVIUM_ERRATUM_23154);
+	case ARM64_WORKAROUND_NVIDIA_CARMEL_CNP:
+		return IS_ENABLED(CONFIG_NVIDIA_CARMEL_CNP_ERRATUM);
 	}
 
 	return true;
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 027783b7a2aa..b131ce5698b6 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -1629,7 +1629,7 @@ has_useable_cnp(const struct arm64_cpu_capabilities *entry, int scope)
 	if (is_kdump_kernel())
 		return false;
 
-	if (cpus_have_const_cap(ARM64_WORKAROUND_NVIDIA_CARMEL_CNP))
+	if (cpus_have_cap(ARM64_WORKAROUND_NVIDIA_CARMEL_CNP))
 		return false;
 
 	return has_cpuid_feature(entry, scope);
-- 
cgit v1.2.3


From 47759eca76d1807923fffec6421ba8014432c191 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 16 Oct 2023 11:25:00 +0100
Subject: arm64: Avoid cpus_have_const_cap() for ARM64_WORKAROUND_REPEAT_TLBI

In arch_tlbbatch_should_defer() we use cpus_have_const_cap() to check
for ARM64_WORKAROUND_REPEAT_TLBI, but this is not necessary and
alternative_has_cap_*() would be preferable.

For historical reasons, cpus_have_const_cap() is more complicated than
it needs to be. Before cpucaps are finalized, it will perform a bitmap
test of the system_cpucaps bitmap, and once cpucaps are finalized it
will use an alternative branch. This used to be necessary to handle some
race conditions in the window between cpucap detection and the
subsequent patching of alternatives and static branches, where different
branches could be out-of-sync with one another (or w.r.t. alternative
sequences). Now that we use alternative branches instead of static
branches, these are all patched atomically w.r.t. one another, and there
are only a handful of cases that need special care in the window between
cpucap detection and alternative patching.

Due to the above, it would be nice to remove cpus_have_const_cap(), and
migrate callers over to alternative_has_cap_*(), cpus_have_final_cap(),
or cpus_have_cap() depending on when their requirements. This will
remove redundant instructions and improve code generation, and will make
it easier to determine how each callsite will behave before, during, and
after alternative patching.

The cpus_have_const_cap() check in arch_tlbbatch_should_defer() is an
optimization to avoid some redundant work when the
ARM64_WORKAROUND_REPEAT_TLBI cpucap is detected and forces the immediate
use of TLBI + DSB ISH. In the window between detecting the
ARM64_WORKAROUND_REPEAT_TLBI cpucap and patching alternatives this is
not a big concern and there's no need to optimize this window at the
expsense of subsequent usage at runtime.

This patch replaces the use of cpus_have_const_cap() with
alternative_has_cap_unlikely(), which will avoid generating code to test
the system_cpucaps bitmap and should be better for all subsequent calls
at runtime. The ARM64_WORKAROUND_REPEAT_TLBI cpucap is added to
cpucap_is_possible() so that code can be elided entirely when this is
not possible without requiring ifdeffery or IS_ENABLED() checks at each
usage.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/cpucaps.h  | 2 ++
 arch/arm64/include/asm/tlbflush.h | 5 ++---
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h
index 7ddb79c235c2..270680e2b5c4 100644
--- a/arch/arm64/include/asm/cpucaps.h
+++ b/arch/arm64/include/asm/cpucaps.h
@@ -56,6 +56,8 @@ cpucap_is_possible(const unsigned int cap)
 		return IS_ENABLED(CONFIG_CAVIUM_ERRATUM_23154);
 	case ARM64_WORKAROUND_NVIDIA_CARMEL_CNP:
 		return IS_ENABLED(CONFIG_NVIDIA_CARMEL_CNP_ERRATUM);
+	case ARM64_WORKAROUND_REPEAT_TLBI:
+		return IS_ENABLED(CONFIG_ARM64_WORKAROUND_REPEAT_TLBI);
 	}
 
 	return true;
diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
index 53ed194626e1..7aa476a52180 100644
--- a/arch/arm64/include/asm/tlbflush.h
+++ b/arch/arm64/include/asm/tlbflush.h
@@ -284,16 +284,15 @@ static inline void flush_tlb_page(struct vm_area_struct *vma,
 
 static inline bool arch_tlbbatch_should_defer(struct mm_struct *mm)
 {
-#ifdef CONFIG_ARM64_WORKAROUND_REPEAT_TLBI
 	/*
 	 * TLB flush deferral is not required on systems which are affected by
 	 * ARM64_WORKAROUND_REPEAT_TLBI, as __tlbi()/__tlbi_user() implementation
 	 * will have two consecutive TLBI instructions with a dsb(ish) in between
 	 * defeating the purpose (i.e save overall 'dsb ish' cost).
 	 */
-	if (unlikely(cpus_have_const_cap(ARM64_WORKAROUND_REPEAT_TLBI)))
+	if (alternative_has_cap_unlikely(ARM64_WORKAROUND_REPEAT_TLBI))
 		return false;
-#endif
+
 	return true;
 }
 
-- 
cgit v1.2.3


From e8d4006dc24e66105444715d6d5daa244aa2bdec Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 16 Oct 2023 11:25:01 +0100
Subject: arm64: Remove cpus_have_const_cap()

There are no longer any users of cpus_have_const_cap(), and therefore it
can be removed.

Remove cpus_have_const_cap(). At the same time, remove
__cpus_have_const_cap(), as this is a trivial wrapper of
alternative_has_cap_unlikely(), which can be used directly instead.

The comment for __system_matches_cap() is updated to no longer refer to
cpus_have_const_cap(). As we have a number of ways to check the cpucaps,
the specific suggestions are removed.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Kristina Martsenko <kristina.martsenko@arm.com>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/cpufeature.h | 38 ++-----------------------------------
 arch/arm64/kernel/cpufeature.c      |  1 -
 2 files changed, 2 insertions(+), 37 deletions(-)

diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index e1f011f9ea20..396af9b9c857 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -462,19 +462,6 @@ static __always_inline bool cpus_have_cap(unsigned int num)
 	return arch_test_bit(num, system_cpucaps);
 }
 
-/*
- * Test for a capability without a runtime check.
- *
- * Before capabilities are finalized, this returns false.
- * After capabilities are finalized, this is patched to avoid a runtime check.
- *
- * @num must be a compile-time constant.
- */
-static __always_inline bool __cpus_have_const_cap(int num)
-{
-	return alternative_has_cap_unlikely(num);
-}
-
 /*
  * Test for a capability without a runtime check.
  *
@@ -487,7 +474,7 @@ static __always_inline bool __cpus_have_const_cap(int num)
 static __always_inline bool cpus_have_final_boot_cap(int num)
 {
 	if (boot_capabilities_finalized())
-		return __cpus_have_const_cap(num);
+		return alternative_has_cap_unlikely(num);
 	else
 		BUG();
 }
@@ -504,32 +491,11 @@ static __always_inline bool cpus_have_final_boot_cap(int num)
 static __always_inline bool cpus_have_final_cap(int num)
 {
 	if (system_capabilities_finalized())
-		return __cpus_have_const_cap(num);
+		return alternative_has_cap_unlikely(num);
 	else
 		BUG();
 }
 
-/*
- * Test for a capability, possibly with a runtime check for non-hyp code.
- *
- * For hyp code, this behaves the same as cpus_have_final_cap().
- *
- * For non-hyp code:
- * Before capabilities are finalized, this behaves as cpus_have_cap().
- * After capabilities are finalized, this is patched to avoid a runtime check.
- *
- * @num must be a compile-time constant.
- */
-static __always_inline bool cpus_have_const_cap(int num)
-{
-	if (is_hyp_code())
-		return cpus_have_final_cap(num);
-	else if (system_capabilities_finalized())
-		return __cpus_have_const_cap(num);
-	else
-		return cpus_have_cap(num);
-}
-
 static inline int __attribute_const__
 cpuid_feature_extract_signed_field_width(u64 features, int field, int width)
 {
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index b131ce5698b6..ad7ec30d3bd3 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -3322,7 +3322,6 @@ EXPORT_SYMBOL_GPL(this_cpu_has_cap);
  * This helper function is used in a narrow window when,
  * - The system wide safe registers are set with all the SMP CPUs and,
  * - The SYSTEM_FEATURE system_cpucaps may not have been set.
- * In all other cases cpus_have_{const_}cap() should be used.
  */
 static bool __maybe_unused __system_matches_cap(unsigned int n)
 {
-- 
cgit v1.2.3


From 3425cec42c3ce0f65fe74e412756b567b152e61d Mon Sep 17 00:00:00 2001
From: Ryan Roberts <ryan.roberts@arm.com>
Date: Thu, 5 Oct 2023 15:07:30 +0100
Subject: arm64/mm: Hoist synchronization out of set_ptes() loop

set_ptes() sets a physically contiguous block of memory (which all
belongs to the same folio) to a contiguous block of ptes. The arm64
implementation of this previously just looped, operating on each
individual pte. But the __sync_icache_dcache() and mte_sync_tags()
operations can both be hoisted out of the loop so that they are
performed once for the contiguous set of pages (which may be less than
the whole folio). This should result in minor performance gains.

__sync_icache_dcache() already acts on the whole folio, and sets a flag
in the folio so that it skips duplicate calls. But by hoisting the call,
all the pte testing is done only once.

mte_sync_tags() operates on each individual page with its own loop. But
by passing the number of pages explicitly, we can rely solely on its
loop and do the checks only once. This approach also makes it robust for
the future, rather than assuming if a head page of a compound page is
being mapped, then the whole compound page is being mapped, instead we
explicitly know how many pages are being mapped. The old assumption may
not continue to hold once the "anonymous large folios" feature is
merged.

Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
Reviewed-by: Steven Price <steven.price@arm.com>
Link: https://lore.kernel.org/r/20231005140730.2191134-1-ryan.roberts@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/mte.h     |  4 ++--
 arch/arm64/include/asm/pgtable.h | 27 +++++++++++++++++----------
 arch/arm64/kernel/mte.c          |  4 ++--
 3 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h
index 4cedbaa16f41..91fbd5c8a391 100644
--- a/arch/arm64/include/asm/mte.h
+++ b/arch/arm64/include/asm/mte.h
@@ -90,7 +90,7 @@ static inline bool try_page_mte_tagging(struct page *page)
 }
 
 void mte_zero_clear_page_tags(void *addr);
-void mte_sync_tags(pte_t pte);
+void mte_sync_tags(pte_t pte, unsigned int nr_pages);
 void mte_copy_page_tags(void *kto, const void *kfrom);
 void mte_thread_init_user(void);
 void mte_thread_switch(struct task_struct *next);
@@ -122,7 +122,7 @@ static inline bool try_page_mte_tagging(struct page *page)
 static inline void mte_zero_clear_page_tags(void *addr)
 {
 }
-static inline void mte_sync_tags(pte_t pte)
+static inline void mte_sync_tags(pte_t pte, unsigned int nr_pages)
 {
 }
 static inline void mte_copy_page_tags(void *kto, const void *kfrom)
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 7f7d9b1df4e5..68984ba9ce2a 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -325,8 +325,7 @@ static inline void __check_safe_pte_update(struct mm_struct *mm, pte_t *ptep,
 		     __func__, pte_val(old_pte), pte_val(pte));
 }
 
-static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
-				pte_t *ptep, pte_t pte)
+static inline void __sync_cache_and_tags(pte_t pte, unsigned int nr_pages)
 {
 	if (pte_present(pte) && pte_user_exec(pte) && !pte_special(pte))
 		__sync_icache_dcache(pte);
@@ -339,20 +338,18 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
 	 */
 	if (system_supports_mte() && pte_access_permitted(pte, false) &&
 	    !pte_special(pte) && pte_tagged(pte))
-		mte_sync_tags(pte);
-
-	__check_safe_pte_update(mm, ptep, pte);
-
-	set_pte(ptep, pte);
+		mte_sync_tags(pte, nr_pages);
 }
 
 static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
 			      pte_t *ptep, pte_t pte, unsigned int nr)
 {
 	page_table_check_ptes_set(mm, ptep, pte, nr);
+	__sync_cache_and_tags(pte, nr);
 
 	for (;;) {
-		__set_pte_at(mm, addr, ptep, pte);
+		__check_safe_pte_update(mm, ptep, pte);
+		set_pte(ptep, pte);
 		if (--nr == 0)
 			break;
 		ptep++;
@@ -531,18 +528,28 @@ static inline pmd_t pmd_mkdevmap(pmd_t pmd)
 #define pud_pfn(pud)		((__pud_to_phys(pud) & PUD_MASK) >> PAGE_SHIFT)
 #define pfn_pud(pfn,prot)	__pud(__phys_to_pud_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot))
 
+static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
+				pte_t *ptep, pte_t pte, unsigned int nr)
+{
+	__sync_cache_and_tags(pte, nr);
+	__check_safe_pte_update(mm, ptep, pte);
+	set_pte(ptep, pte);
+}
+
 static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
 			      pmd_t *pmdp, pmd_t pmd)
 {
 	page_table_check_pmd_set(mm, pmdp, pmd);
-	return __set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd));
+	return __set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd),
+						PMD_SIZE >> PAGE_SHIFT);
 }
 
 static inline void set_pud_at(struct mm_struct *mm, unsigned long addr,
 			      pud_t *pudp, pud_t pud)
 {
 	page_table_check_pud_set(mm, pudp, pud);
-	return __set_pte_at(mm, addr, (pte_t *)pudp, pud_pte(pud));
+	return __set_pte_at(mm, addr, (pte_t *)pudp, pud_pte(pud),
+						PUD_SIZE >> PAGE_SHIFT);
 }
 
 #define __p4d_to_phys(p4d)	__pte_to_phys(p4d_pte(p4d))
diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c
index 4edecaac8f91..2fb5e7a7a4d5 100644
--- a/arch/arm64/kernel/mte.c
+++ b/arch/arm64/kernel/mte.c
@@ -35,10 +35,10 @@ DEFINE_STATIC_KEY_FALSE(mte_async_or_asymm_mode);
 EXPORT_SYMBOL_GPL(mte_async_or_asymm_mode);
 #endif
 
-void mte_sync_tags(pte_t pte)
+void mte_sync_tags(pte_t pte, unsigned int nr_pages)
 {
 	struct page *page = pte_page(pte);
-	long i, nr_pages = compound_nr(page);
+	unsigned int i;
 
 	/* if PG_mte_tagged is set, tags have already been initialised */
 	for (i = 0; i < nr_pages; i++, page++) {
-- 
cgit v1.2.3


From 32269e09b137ade1e5cb5c8a7d67db1ba9b1e4fe Mon Sep 17 00:00:00 2001
From: Marek Szyprowski <m.szyprowski@samsung.com>
Date: Thu, 12 Oct 2023 12:35:43 +0200
Subject: perf/amlogic: add missing MODULE_DEVICE_TABLE

Add missing MODULE_DEVICE_TABLE macro to let this driver to be
automatically loaded as module.

Signed-off-by: Marek Szyprowski <m.szyprowski@samsung.com>
Reviewed-by: Neil Armstrong <neil.armstrong@linaro.org>
Link: https://lore.kernel.org/r/20231012103543.3381326-1-m.szyprowski@samsung.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/perf/amlogic/meson_g12_ddr_pmu.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/perf/amlogic/meson_g12_ddr_pmu.c b/drivers/perf/amlogic/meson_g12_ddr_pmu.c
index 8b643888d503..15d52ab3276a 100644
--- a/drivers/perf/amlogic/meson_g12_ddr_pmu.c
+++ b/drivers/perf/amlogic/meson_g12_ddr_pmu.c
@@ -377,6 +377,7 @@ static const struct of_device_id meson_ddr_pmu_dt_match[] = {
 	},
 	{}
 };
+MODULE_DEVICE_TABLE(of, meson_ddr_pmu_dt_match);
 
 static struct platform_driver g12_ddr_pmu_driver = {
 	.probe = g12_ddr_pmu_probe,
-- 
cgit v1.2.3


From 1f33cdef8ca174661fb0d82d0f74fd0589dc9e46 Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Mon, 9 Oct 2023 12:29:09 -0500
Subject: drivers/perf: xgene: Use device_get_match_data()

Use preferred device_get_match_data() instead of of_match_device() and
acpi_match_device() to get the driver match data. With this, adjust the
includes to explicitly include the correct headers.

Signed-off-by: Rob Herring <robh@kernel.org>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Link: https://lore.kernel.org/r/20231009172923.2457844-14-robh@kernel.org
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/perf/xgene_pmu.c | 37 +++++++++++++------------------------
 1 file changed, 13 insertions(+), 24 deletions(-)

diff --git a/drivers/perf/xgene_pmu.c b/drivers/perf/xgene_pmu.c
index 9972bfc11a5c..7ce344248dda 100644
--- a/drivers/perf/xgene_pmu.c
+++ b/drivers/perf/xgene_pmu.c
@@ -16,11 +16,9 @@
 #include <linux/mfd/syscon.h>
 #include <linux/module.h>
 #include <linux/of_address.h>
-#include <linux/of_fdt.h>
-#include <linux/of_irq.h>
-#include <linux/of_platform.h>
 #include <linux/perf_event.h>
 #include <linux/platform_device.h>
+#include <linux/property.h>
 #include <linux/regmap.h>
 #include <linux/slab.h>
 
@@ -1731,6 +1729,12 @@ static const struct xgene_pmu_data xgene_pmu_v2_data = {
 	.id   = PCP_PMU_V2,
 };
 
+#ifdef CONFIG_ACPI
+static const struct xgene_pmu_data xgene_pmu_v3_data = {
+	.id   = PCP_PMU_V3,
+};
+#endif
+
 static const struct xgene_pmu_ops xgene_pmu_ops = {
 	.mask_int = xgene_pmu_mask_int,
 	.unmask_int = xgene_pmu_unmask_int,
@@ -1773,9 +1777,9 @@ static const struct of_device_id xgene_pmu_of_match[] = {
 MODULE_DEVICE_TABLE(of, xgene_pmu_of_match);
 #ifdef CONFIG_ACPI
 static const struct acpi_device_id xgene_pmu_acpi_match[] = {
-	{"APMC0D5B", PCP_PMU_V1},
-	{"APMC0D5C", PCP_PMU_V2},
-	{"APMC0D83", PCP_PMU_V3},
+	{"APMC0D5B", (kernel_ulong_t)&xgene_pmu_data},
+	{"APMC0D5C", (kernel_ulong_t)&xgene_pmu_v2_data},
+	{"APMC0D83", (kernel_ulong_t)&xgene_pmu_v3_data},
 	{},
 };
 MODULE_DEVICE_TABLE(acpi, xgene_pmu_acpi_match);
@@ -1831,7 +1835,6 @@ static int xgene_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node)
 static int xgene_pmu_probe(struct platform_device *pdev)
 {
 	const struct xgene_pmu_data *dev_data;
-	const struct of_device_id *of_id;
 	struct xgene_pmu *xgene_pmu;
 	int irq, rc;
 	int version;
@@ -1850,24 +1853,10 @@ static int xgene_pmu_probe(struct platform_device *pdev)
 	xgene_pmu->dev = &pdev->dev;
 	platform_set_drvdata(pdev, xgene_pmu);
 
-	version = -EINVAL;
-	of_id = of_match_device(xgene_pmu_of_match, &pdev->dev);
-	if (of_id) {
-		dev_data = (const struct xgene_pmu_data *) of_id->data;
-		version = dev_data->id;
-	}
-
-#ifdef CONFIG_ACPI
-	if (ACPI_COMPANION(&pdev->dev)) {
-		const struct acpi_device_id *acpi_id;
-
-		acpi_id = acpi_match_device(xgene_pmu_acpi_match, &pdev->dev);
-		if (acpi_id)
-			version = (int) acpi_id->driver_data;
-	}
-#endif
-	if (version < 0)
+	dev_data = device_get_match_data(&pdev->dev);
+	if (!dev_data)
 		return -ENODEV;
+	version = dev_data->id;
 
 	if (version == PCP_PMU_V3)
 		xgene_pmu->ops = &xgene_pmu_v3_ops;
-- 
cgit v1.2.3


From dba2ff4922b3cf573c25c3886e869258a6076030 Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Tue, 17 Oct 2023 11:57:55 +0100
Subject: arm64: Mark the 'addr' argument to set_ptes() and __set_pte_at() as
 unused

This argument is not used by the arm64 implementation. Mark it as
__always_unused and also remove the unnecessary 'addr' increment in
set_ptes().

Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202310140531.BQQwt3NQ-lkp@intel.com/
Cc: Will Deacon <will@kernel.org>
Tested-by: Ryan Roberts <ryan.roberts@arm.com>
Link: https://lore.kernel.org/r/ZS6EvMiJ0QF5INkv@arm.com
---
 arch/arm64/include/asm/pgtable.h | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 68984ba9ce2a..b19a8aee684c 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -341,8 +341,9 @@ static inline void __sync_cache_and_tags(pte_t pte, unsigned int nr_pages)
 		mte_sync_tags(pte, nr_pages);
 }
 
-static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
-			      pte_t *ptep, pte_t pte, unsigned int nr)
+static inline void set_ptes(struct mm_struct *mm,
+			    unsigned long __always_unused addr,
+			    pte_t *ptep, pte_t pte, unsigned int nr)
 {
 	page_table_check_ptes_set(mm, ptep, pte, nr);
 	__sync_cache_and_tags(pte, nr);
@@ -353,7 +354,6 @@ static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
 		if (--nr == 0)
 			break;
 		ptep++;
-		addr += PAGE_SIZE;
 		pte_val(pte) += PAGE_SIZE;
 	}
 }
@@ -528,7 +528,8 @@ static inline pmd_t pmd_mkdevmap(pmd_t pmd)
 #define pud_pfn(pud)		((__pud_to_phys(pud) & PUD_MASK) >> PAGE_SHIFT)
 #define pfn_pud(pfn,prot)	__pud(__phys_to_pud_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot))
 
-static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
+static inline void __set_pte_at(struct mm_struct *mm,
+				unsigned long __always_unused addr,
 				pte_t *ptep, pte_t pte, unsigned int nr)
 {
 	__sync_cache_and_tags(pte, nr);
-- 
cgit v1.2.3


From 0899a6278a86b32e0b9d55f68f265519306a5be0 Mon Sep 17 00:00:00 2001
From: Gavin Shan <gshan@redhat.com>
Date: Tue, 17 Oct 2023 10:50:36 +1000
Subject: arm64: Remove system_uses_lse_atomics()

There are two variants of system_uses_lse_atomics(), depending on
CONFIG_ARM64_LSE_ATOMICS. The function isn't called anywhere when
CONFIG_ARM64_LSE_ATOMICS is disabled. It can be directly replaced
by alternative_has_cap_likely(ARM64_HAS_LSE_ATOMICS) when the kernel
option is enabled.

No need to keep system_uses_lse_atomics() and just remove it.

Signed-off-by: Gavin Shan <gshan@redhat.com>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Link: https://lore.kernel.org/r/20231017005036.334067-1-gshan@redhat.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/lse.h | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/arch/arm64/include/asm/lse.h b/arch/arm64/include/asm/lse.h
index cbbcdc35c4cd..3129a5819d0e 100644
--- a/arch/arm64/include/asm/lse.h
+++ b/arch/arm64/include/asm/lse.h
@@ -16,14 +16,9 @@
 #include <asm/atomic_lse.h>
 #include <asm/cpucaps.h>
 
-static __always_inline bool system_uses_lse_atomics(void)
-{
-	return alternative_has_cap_likely(ARM64_HAS_LSE_ATOMICS);
-}
-
 #define __lse_ll_sc_body(op, ...)					\
 ({									\
-	system_uses_lse_atomics() ?					\
+	alternative_has_cap_likely(ARM64_HAS_LSE_ATOMICS) ?		\
 		__lse_##op(__VA_ARGS__) :				\
 		__ll_sc_##op(__VA_ARGS__);				\
 })
@@ -34,8 +29,6 @@ static __always_inline bool system_uses_lse_atomics(void)
 
 #else	/* CONFIG_ARM64_LSE_ATOMICS */
 
-static inline bool system_uses_lse_atomics(void) { return false; }
-
 #define __lse_ll_sc_body(op, ...)		__ll_sc_##op(__VA_ARGS__)
 
 #define ARM64_LSE_ATOMIC_INSN(llsc, lse)	llsc
-- 
cgit v1.2.3


From 851354cbd12bb9500909733c3d4054306f61df87 Mon Sep 17 00:00:00 2001
From: Andre Przywara <andre.przywara@arm.com>
Date: Mon, 16 Oct 2023 16:31:27 +0100
Subject: clocksource/drivers/arm_arch_timer: limit XGene-1 workaround

The AppliedMicro XGene-1 CPU has an erratum where the timer condition
would only consider TVAL, not CVAL. We currently apply a workaround when
seeing the PartNum field of MIDR_EL1 being 0x000, under the assumption
that this would match only the XGene-1 CPU model.
However even the Ampere eMAG (aka XGene-3) uses that same part number, and
only differs in the "Variant" and "Revision" fields: XGene-1's MIDR is
0x500f0000, our eMAG reports 0x503f0002. Experiments show the latter
doesn't show the faulty behaviour.

Increase the specificity of the check to only consider partnum 0x000 and
variant 0x00, to exclude the Ampere eMAG.

Fixes: 012f18850452 ("clocksource/drivers/arm_arch_timer: Work around broken CVAL implementations")
Reported-by: Ross Burton <ross.burton@arm.com>
Signed-off-by: Andre Przywara <andre.przywara@arm.com>
Acked-by: Marc Zyngier <maz@kernel.org>
Reviewed-by: Oliver Upton <oliver.upton@linux.dev>
Link: https://lore.kernel.org/r/20231016153127.116101-1-andre.przywara@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/cputype.h     | 3 ++-
 arch/arm64/kvm/guest.c               | 2 +-
 drivers/clocksource/arm_arch_timer.c | 5 +++--
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h
index 5f6f84837a49..818ea1c83b50 100644
--- a/arch/arm64/include/asm/cputype.h
+++ b/arch/arm64/include/asm/cputype.h
@@ -85,7 +85,8 @@
 #define ARM_CPU_PART_NEOVERSE_N2	0xD49
 #define ARM_CPU_PART_CORTEX_A78C	0xD4B
 
-#define APM_CPU_PART_POTENZA		0x000
+#define APM_CPU_PART_XGENE		0x000
+#define APM_CPU_VAR_POTENZA		0x00
 
 #define CAVIUM_CPU_PART_THUNDERX	0x0A1
 #define CAVIUM_CPU_PART_THUNDERX_81XX	0x0A2
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index 95f6945c4432..a1710e5fa72b 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -874,7 +874,7 @@ u32 __attribute_const__ kvm_target_cpu(void)
 		break;
 	case ARM_CPU_IMP_APM:
 		switch (part_number) {
-		case APM_CPU_PART_POTENZA:
+		case APM_CPU_PART_XGENE:
 			return KVM_ARM_TARGET_XGENE_POTENZA;
 		}
 		break;
diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c
index 7dd2c615bce2..071b04f1ee73 100644
--- a/drivers/clocksource/arm_arch_timer.c
+++ b/drivers/clocksource/arm_arch_timer.c
@@ -836,8 +836,9 @@ static u64 __arch_timer_check_delta(void)
 		 * Note that TVAL is signed, thus has only 31 of its
 		 * 32 bits to express magnitude.
 		 */
-		MIDR_ALL_VERSIONS(MIDR_CPU_MODEL(ARM_CPU_IMP_APM,
-						 APM_CPU_PART_POTENZA)),
+		MIDR_REV_RANGE(MIDR_CPU_MODEL(ARM_CPU_IMP_APM,
+					      APM_CPU_PART_XGENE),
+			       APM_CPU_VAR_POTENZA, 0x0, 0xf),
 		{},
 	};
 
-- 
cgit v1.2.3


From 50b560783f7f71790bcf70e9e9855155fb0af8c1 Mon Sep 17 00:00:00 2001
From: Hao Chen <chenhao418@huawei.com>
Date: Thu, 19 Oct 2023 17:13:52 +0800
Subject: drivers/perf: hisi: use cpuhp_state_remove_instance_nocalls() for
 hisi_hns3_pmu uninit process

When tearing down a 'hisi_hns3' PMU, we mistakenly run the CPU hotplug
callbacks after the device has been unregistered, leading to fireworks
when we try to execute empty function callbacks within the driver:

  | Unable to handle kernel NULL pointer dereference at virtual address 0000000000000000
  | CPU: 0 PID: 15 Comm: cpuhp/0 Tainted: G        W  O      5.12.0-rc4+ #1
  | Hardware name:  , BIOS KpxxxFPGA 1P B600 V143 04/22/2021
  | pstate: 80400009 (Nzcv daif +PAN -UAO -TCO BTYPE=--)
  | pc : perf_pmu_migrate_context+0x98/0x38c
  | lr : perf_pmu_migrate_context+0x94/0x38c
  |
  | Call trace:
  |  perf_pmu_migrate_context+0x98/0x38c
  |  hisi_hns3_pmu_offline_cpu+0x104/0x12c [hisi_hns3_pmu]

Use cpuhp_state_remove_instance_nocalls() instead of
cpuhp_state_remove_instance() so that the notifiers don't execute after
the PMU device has been unregistered.

Fixes: 66637ab137b4 ("drivers/perf: hisi: add driver for HNS3 PMU")
Signed-off-by: Hao Chen <chenhao418@huawei.com>
Signed-off-by: Jijie Shao <shaojijie@huawei.com>
Reviewed-by: Yicong Yang <yangyicong@hisilicon.com>
Link: https://lore.kernel.org/r/20231019091352.998964-1-shaojijie@huawei.com
[will: Rewrote commit message]
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/perf/hisilicon/hns3_pmu.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/perf/hisilicon/hns3_pmu.c b/drivers/perf/hisilicon/hns3_pmu.c
index e0457d84af6b..16869bf5bf4c 100644
--- a/drivers/perf/hisilicon/hns3_pmu.c
+++ b/drivers/perf/hisilicon/hns3_pmu.c
@@ -1556,8 +1556,8 @@ static int hns3_pmu_init_pmu(struct pci_dev *pdev, struct hns3_pmu *hns3_pmu)
 	ret = perf_pmu_register(&hns3_pmu->pmu, hns3_pmu->pmu.name, -1);
 	if (ret) {
 		pci_err(pdev, "failed to register perf PMU, ret = %d.\n", ret);
-		cpuhp_state_remove_instance(CPUHP_AP_PERF_ARM_HNS3_PMU_ONLINE,
-					    &hns3_pmu->node);
+		cpuhp_state_remove_instance_nocalls(CPUHP_AP_PERF_ARM_HNS3_PMU_ONLINE,
+						    &hns3_pmu->node);
 	}
 
 	return ret;
@@ -1568,8 +1568,8 @@ static void hns3_pmu_uninit_pmu(struct pci_dev *pdev)
 	struct hns3_pmu *hns3_pmu = pci_get_drvdata(pdev);
 
 	perf_pmu_unregister(&hns3_pmu->pmu);
-	cpuhp_state_remove_instance(CPUHP_AP_PERF_ARM_HNS3_PMU_ONLINE,
-				    &hns3_pmu->node);
+	cpuhp_state_remove_instance_nocalls(CPUHP_AP_PERF_ARM_HNS3_PMU_ONLINE,
+					    &hns3_pmu->node);
 }
 
 static int hns3_pmu_init_dev(struct pci_dev *pdev)
-- 
cgit v1.2.3


From 58f8fc57b1d314b5402c374a8c454ac7c870574c Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <anshuman.khandual@arm.com>
Date: Fri, 13 Oct 2023 08:13:54 +0530
Subject: drivers: perf: arm_pmuv3: Read PMMIR_EL1 unconditionally

Currently the PMUv3 driver only reads PMMIR_EL1 if the PMU implements
FEAT_PMUv3p4 and the STALL_SLOT event, but the check for STALL_SLOT event
isn't necessary and can be removed.

The check for STALL_SLOT event was introduced with the read of PMMIR_EL1 in
commit f5be3a61fdb5dd11 ("arm64: perf: Add support caps under sysfs")

When this logic was written, the ARM ARM said:

| If STALL_SLOT is not implemented, it is IMPLEMENTATION DEFINED whether
| the PMMIR System registers are implemented.

... and thus the driver had to check for STALL_SLOT event to verify that
PMMIR_EL1 was implemented and accesses to PMMIR_EL1 would not be UNDEFINED.

Subsequently, the architecture was retrospectively tightened to require
that any FEAT_PMUv3p4 implementation implements PMMIR_EL1. Since the G.b
release of the ARM ARM, the wording regarding STALL_SLOT event has been
removed, and the description of PMMIR_EL1 says:

| This register is present only when FEAT_PMUv3p4 is implemented.

Drop the unnecessary check for STALL_SLOT event when reading PMMIR_EL1.

Cc: Will Deacon <will@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-kernel@vger.kernel.org
Reviewed-by: James Clark <james.clark@arm.com>
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
Link: https://lore.kernel.org/r/20231013024354.1289070-1-anshuman.khandual@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/perf/arm_pmuv3.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c
index 8fcaa26f0f8a..4ee0afe00527 100644
--- a/drivers/perf/arm_pmuv3.c
+++ b/drivers/perf/arm_pmuv3.c
@@ -1126,7 +1126,7 @@ static void __armv8pmu_probe_pmu(void *info)
 			     pmceid, ARMV8_PMUV3_MAX_COMMON_EVENTS);
 
 	/* store PMMIR register for sysfs */
-	if (is_pmuv3p4(pmuver) && (pmceid_raw[1] & BIT(31)))
+	if (is_pmuv3p4(pmuver))
 		cpu_pmu->reg_pmmir = read_pmmir();
 	else
 		cpu_pmu->reg_pmmir = 0;
-- 
cgit v1.2.3


From 3b9a22d345ff89232227b2449a311bf3f910f5f2 Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <anshuman.khandual@arm.com>
Date: Mon, 16 Oct 2023 08:24:36 +0530
Subject: drivers: perf: arm_pmuv3: Drop some unused arguments from
 armv8_pmu_init()

All the PMU init functions want the default sysfs attribute groups, and so
these all call armv8_pmu_init_nogroups() helper, with none of them calling
armv8_pmu_init() directly. When we introduced armv8_pmu_init_nogroups() in
the commit e424b1798526 ("arm64: perf: Refactor PMU init callbacks")

 ... we thought that we might need custom attribute groups in future, but
as we evidently haven't, we can remove the option.

This patch folds armv8_pmu_init_nogroups() into armv8_pmu_init(), removing
the ability to use custom attribute groups and simplifying the code.

CC: James Clark <james.clark@arm.com>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-kernel@vger.kernel.org
Acked-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
Link: https://lore.kernel.org/r/20231016025436.1368945-1-anshuman.khandual@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/perf/arm_pmuv3.c | 44 ++++++++++++--------------------------------
 1 file changed, 12 insertions(+), 32 deletions(-)

diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c
index 4ee0afe00527..4f6923ad4589 100644
--- a/drivers/perf/arm_pmuv3.c
+++ b/drivers/perf/arm_pmuv3.c
@@ -1187,10 +1187,7 @@ static void armv8_pmu_register_sysctl_table(void)
 }
 
 static int armv8_pmu_init(struct arm_pmu *cpu_pmu, char *name,
-			  int (*map_event)(struct perf_event *event),
-			  const struct attribute_group *events,
-			  const struct attribute_group *format,
-			  const struct attribute_group *caps)
+			  int (*map_event)(struct perf_event *event))
 {
 	int ret = armv8pmu_probe_pmu(cpu_pmu);
 	if (ret)
@@ -1212,27 +1209,17 @@ static int armv8_pmu_init(struct arm_pmu *cpu_pmu, char *name,
 
 	cpu_pmu->name			= name;
 	cpu_pmu->map_event		= map_event;
-	cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_EVENTS] = events ?
-			events : &armv8_pmuv3_events_attr_group;
-	cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_FORMATS] = format ?
-			format : &armv8_pmuv3_format_attr_group;
-	cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_CAPS] = caps ?
-			caps : &armv8_pmuv3_caps_attr_group;
-
+	cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_EVENTS] = &armv8_pmuv3_events_attr_group;
+	cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_FORMATS] = &armv8_pmuv3_format_attr_group;
+	cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_CAPS] = &armv8_pmuv3_caps_attr_group;
 	armv8_pmu_register_sysctl_table();
 	return 0;
 }
 
-static int armv8_pmu_init_nogroups(struct arm_pmu *cpu_pmu, char *name,
-				   int (*map_event)(struct perf_event *event))
-{
-	return armv8_pmu_init(cpu_pmu, name, map_event, NULL, NULL, NULL);
-}
-
 #define PMUV3_INIT_SIMPLE(name)						\
 static int name##_pmu_init(struct arm_pmu *cpu_pmu)			\
 {									\
-	return armv8_pmu_init_nogroups(cpu_pmu, #name, armv8_pmuv3_map_event);\
+	return armv8_pmu_init(cpu_pmu, #name, armv8_pmuv3_map_event);	\
 }
 
 PMUV3_INIT_SIMPLE(armv8_pmuv3)
@@ -1263,44 +1250,37 @@ PMUV3_INIT_SIMPLE(armv8_nvidia_denver)
 
 static int armv8_a35_pmu_init(struct arm_pmu *cpu_pmu)
 {
-	return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a35",
-				       armv8_a53_map_event);
+	return armv8_pmu_init(cpu_pmu, "armv8_cortex_a35", armv8_a53_map_event);
 }
 
 static int armv8_a53_pmu_init(struct arm_pmu *cpu_pmu)
 {
-	return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a53",
-				       armv8_a53_map_event);
+	return armv8_pmu_init(cpu_pmu, "armv8_cortex_a53", armv8_a53_map_event);
 }
 
 static int armv8_a57_pmu_init(struct arm_pmu *cpu_pmu)
 {
-	return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a57",
-				       armv8_a57_map_event);
+	return armv8_pmu_init(cpu_pmu, "armv8_cortex_a57", armv8_a57_map_event);
 }
 
 static int armv8_a72_pmu_init(struct arm_pmu *cpu_pmu)
 {
-	return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a72",
-				       armv8_a57_map_event);
+	return armv8_pmu_init(cpu_pmu, "armv8_cortex_a72", armv8_a57_map_event);
 }
 
 static int armv8_a73_pmu_init(struct arm_pmu *cpu_pmu)
 {
-	return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a73",
-				       armv8_a73_map_event);
+	return armv8_pmu_init(cpu_pmu, "armv8_cortex_a73", armv8_a73_map_event);
 }
 
 static int armv8_thunder_pmu_init(struct arm_pmu *cpu_pmu)
 {
-	return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cavium_thunder",
-				       armv8_thunder_map_event);
+	return armv8_pmu_init(cpu_pmu, "armv8_cavium_thunder", armv8_thunder_map_event);
 }
 
 static int armv8_vulcan_pmu_init(struct arm_pmu *cpu_pmu)
 {
-	return armv8_pmu_init_nogroups(cpu_pmu, "armv8_brcm_vulcan",
-				       armv8_vulcan_map_event);
+	return armv8_pmu_init(cpu_pmu, "armv8_brcm_vulcan", armv8_vulcan_map_event);
 }
 
 static const struct of_device_id armv8_pmu_of_device_ids[] = {
-- 
cgit v1.2.3


From e3e73f511c49c741f6309862c2248958ad77bbaa Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Fri, 20 Oct 2023 18:51:25 +0100
Subject: perf/arm-cmn: Fix DTC domain detection

It transpires that dtm_unit_info is another register which got shuffled
in CMN-700 without me noticing. Fix that in a way which also proactively
fixes the fragile laziness of its consumer, just in case any further
fields ever get added alongside dtc_domain.

Fixes: 23760a014417 ("perf/arm-cmn: Add CMN-700 support")
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Reviewed-by: Ilkka Koskinen <ilkka@os.amperecomputing.com>
Link: https://lore.kernel.org/r/3076ee83d0554f6939fbb6ee49ab2bdb28d8c7ee.1697824215.git.robin.murphy@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/perf/arm-cmn.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c
index 913dc04b3a40..f1ac8d0cdb3b 100644
--- a/drivers/perf/arm-cmn.c
+++ b/drivers/perf/arm-cmn.c
@@ -112,7 +112,9 @@
 
 #define CMN_DTM_PMEVCNTSR		0x240
 
-#define CMN_DTM_UNIT_INFO		0x0910
+#define CMN650_DTM_UNIT_INFO		0x0910
+#define CMN_DTM_UNIT_INFO		0x0960
+#define CMN_DTM_UNIT_INFO_DTC_DOMAIN	GENMASK_ULL(1, 0)
 
 #define CMN_DTM_NUM_COUNTERS		4
 /* Want more local counters? Why not replicate the whole DTM! Ugh... */
@@ -2117,6 +2119,16 @@ static int arm_cmn_init_dtcs(struct arm_cmn *cmn)
 	return 0;
 }
 
+static unsigned int arm_cmn_dtc_domain(struct arm_cmn *cmn, void __iomem *xp_region)
+{
+	int offset = CMN_DTM_UNIT_INFO;
+
+	if (cmn->part == PART_CMN650 || cmn->part == PART_CI700)
+		offset = CMN650_DTM_UNIT_INFO;
+
+	return FIELD_GET(CMN_DTM_UNIT_INFO_DTC_DOMAIN, readl_relaxed(xp_region + offset));
+}
+
 static void arm_cmn_init_node_info(struct arm_cmn *cmn, u32 offset, struct arm_cmn_node *node)
 {
 	int level;
@@ -2248,7 +2260,7 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset)
 		if (cmn->part == PART_CMN600)
 			xp->dtc = 0xf;
 		else
-			xp->dtc = 1 << readl_relaxed(xp_region + CMN_DTM_UNIT_INFO);
+			xp->dtc = 1 << arm_cmn_dtc_domain(cmn, xp_region);
 
 		xp->dtm = dtm - cmn->dtms;
 		arm_cmn_init_dtm(dtm++, xp, 0);
-- 
cgit v1.2.3


From 7633ec2c262fab3e7c5bf3cd3876b5748f584a57 Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Fri, 20 Oct 2023 18:51:26 +0100
Subject: perf/arm-cmn: Rework DTC counters (again)

The bitmap-based scheme for tracking DTC counter usage turns out to be a
complete dead-end for its imagined purpose, since by the time we have to
keep track of a per-DTC counter index anyway, we already have enough
information to make the bitmap itself redundant. Revert the remains of
it back to almost the original scheme, but now expanded to track per-DTC
indices, in preparation for making use of them in anger.

Note that since cycle count events always use a dedicated counter on a
single DTC, we reuse the field to encode their DTC index directly.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Reviewed-by: Ilkka Koskinen <ilkka@os.amperecomputing.com>
Link: https://lore.kernel.org/r/5f6ade76b47f033836d7a36c03555da896dfb4a3.1697824215.git.robin.murphy@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/perf/arm-cmn.c | 126 +++++++++++++++++++++++++------------------------
 1 file changed, 64 insertions(+), 62 deletions(-)

diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c
index f1ac8d0cdb3b..675f1638013e 100644
--- a/drivers/perf/arm-cmn.c
+++ b/drivers/perf/arm-cmn.c
@@ -281,16 +281,13 @@ struct arm_cmn_node {
 	u16 id, logid;
 	enum cmn_node_type type;
 
-	int dtm;
-	union {
-		/* DN/HN-F/CXHA */
-		struct {
-			u8 val : 4;
-			u8 count : 4;
-		} occupid[SEL_MAX];
-		/* XP */
-		u8 dtc;
-	};
+	u8 dtm;
+	s8 dtc;
+	/* DN/HN-F/CXHA */
+	struct {
+		u8 val : 4;
+		u8 count : 4;
+	} occupid[SEL_MAX];
 	union {
 		u8 event[4];
 		__le32 event_sel;
@@ -540,12 +537,12 @@ static int arm_cmn_map_show(struct seq_file *s, void *data)
 
 		seq_puts(s, "\n     |");
 		for (x = 0; x < cmn->mesh_x; x++) {
-			u8 dtc = cmn->xps[xp_base + x].dtc;
+			s8 dtc = cmn->xps[xp_base + x].dtc;
 
-			if (dtc & (dtc - 1))
+			if (dtc < 0)
 				seq_puts(s, " DTC ?? |");
 			else
-				seq_printf(s, " DTC %ld  |", __ffs(dtc));
+				seq_printf(s, " DTC %d  |", dtc);
 		}
 		seq_puts(s, "\n     |");
 		for (x = 0; x < cmn->mesh_x; x++)
@@ -589,8 +586,7 @@ static void arm_cmn_debugfs_init(struct arm_cmn *cmn, int id) {}
 struct arm_cmn_hw_event {
 	struct arm_cmn_node *dn;
 	u64 dtm_idx[4];
-	unsigned int dtc_idx;
-	u8 dtcs_used;
+	s8 dtc_idx[CMN_MAX_DTCS];
 	u8 num_dns;
 	u8 dtm_offset;
 	bool wide_sel;
@@ -600,6 +596,10 @@ struct arm_cmn_hw_event {
 #define for_each_hw_dn(hw, dn, i) \
 	for (i = 0, dn = hw->dn; i < hw->num_dns; i++, dn++)
 
+/* @i is the DTC number, @idx is the counter index on that DTC */
+#define for_each_hw_dtc_idx(hw, i, idx) \
+	for (int i = 0, idx; i < CMN_MAX_DTCS; i++) if ((idx = hw->dtc_idx[i]) >= 0)
+
 static struct arm_cmn_hw_event *to_cmn_hw(struct perf_event *event)
 {
 	BUILD_BUG_ON(sizeof(struct arm_cmn_hw_event) > offsetof(struct hw_perf_event, target));
@@ -1429,12 +1429,11 @@ static void arm_cmn_init_counter(struct perf_event *event)
 {
 	struct arm_cmn *cmn = to_cmn(event->pmu);
 	struct arm_cmn_hw_event *hw = to_cmn_hw(event);
-	unsigned int i, pmevcnt = CMN_DT_PMEVCNT(hw->dtc_idx);
 	u64 count;
 
-	for (i = 0; hw->dtcs_used & (1U << i); i++) {
-		writel_relaxed(CMN_COUNTER_INIT, cmn->dtc[i].base + pmevcnt);
-		cmn->dtc[i].counters[hw->dtc_idx] = event;
+	for_each_hw_dtc_idx(hw, i, idx) {
+		writel_relaxed(CMN_COUNTER_INIT, cmn->dtc[i].base + CMN_DT_PMEVCNT(idx));
+		cmn->dtc[i].counters[idx] = event;
 	}
 
 	count = arm_cmn_read_dtm(cmn, hw, false);
@@ -1447,11 +1446,9 @@ static void arm_cmn_event_read(struct perf_event *event)
 	struct arm_cmn_hw_event *hw = to_cmn_hw(event);
 	u64 delta, new, prev;
 	unsigned long flags;
-	unsigned int i;
 
-	if (hw->dtc_idx == CMN_DT_NUM_COUNTERS) {
-		i = __ffs(hw->dtcs_used);
-		delta = arm_cmn_read_cc(cmn->dtc + i);
+	if (CMN_EVENT_TYPE(event) == CMN_TYPE_DTC) {
+		delta = arm_cmn_read_cc(cmn->dtc + hw->dtc_idx[0]);
 		local64_add(delta, &event->count);
 		return;
 	}
@@ -1461,8 +1458,8 @@ static void arm_cmn_event_read(struct perf_event *event)
 	delta = new - prev;
 
 	local_irq_save(flags);
-	for (i = 0; hw->dtcs_used & (1U << i); i++) {
-		new = arm_cmn_read_counter(cmn->dtc + i, hw->dtc_idx);
+	for_each_hw_dtc_idx(hw, i, idx) {
+		new = arm_cmn_read_counter(cmn->dtc + i, idx);
 		delta += new << 16;
 	}
 	local_irq_restore(flags);
@@ -1518,7 +1515,7 @@ static void arm_cmn_event_start(struct perf_event *event, int flags)
 	int i;
 
 	if (type == CMN_TYPE_DTC) {
-		i = __ffs(hw->dtcs_used);
+		i = hw->dtc_idx[0];
 		writeq_relaxed(CMN_CC_INIT, cmn->dtc[i].base + CMN_DT_PMCCNTR);
 		cmn->dtc[i].cc_active = true;
 	} else if (type == CMN_TYPE_WP) {
@@ -1549,7 +1546,7 @@ static void arm_cmn_event_stop(struct perf_event *event, int flags)
 	int i;
 
 	if (type == CMN_TYPE_DTC) {
-		i = __ffs(hw->dtcs_used);
+		i = hw->dtc_idx[0];
 		cmn->dtc[i].cc_active = false;
 	} else if (type == CMN_TYPE_WP) {
 		int wp_idx = arm_cmn_wp_idx(event);
@@ -1735,12 +1732,19 @@ static int arm_cmn_event_init(struct perf_event *event)
 	hw->dn = arm_cmn_node(cmn, type);
 	if (!hw->dn)
 		return -EINVAL;
+
+	memset(hw->dtc_idx, -1, sizeof(hw->dtc_idx));
 	for (dn = hw->dn; dn->type == type; dn++) {
 		if (bynodeid && dn->id != nodeid) {
 			hw->dn++;
 			continue;
 		}
 		hw->num_dns++;
+		if (dn->dtc < 0)
+			memset(hw->dtc_idx, 0, cmn->num_dtcs);
+		else
+			hw->dtc_idx[dn->dtc] = 0;
+
 		if (bynodeid)
 			break;
 	}
@@ -1752,12 +1756,6 @@ static int arm_cmn_event_init(struct perf_event *event)
 			nodeid, nid.x, nid.y, nid.port, nid.dev, type);
 		return -EINVAL;
 	}
-	/*
-	 * Keep assuming non-cycles events count in all DTC domains; turns out
-	 * it's hard to make a worthwhile optimisation around this, short of
-	 * going all-in with domain-local counter allocation as well.
-	 */
-	hw->dtcs_used = (1U << cmn->num_dtcs) - 1;
 
 	return arm_cmn_validate_group(cmn, event);
 }
@@ -1783,28 +1781,25 @@ static void arm_cmn_event_clear(struct arm_cmn *cmn, struct perf_event *event,
 	}
 	memset(hw->dtm_idx, 0, sizeof(hw->dtm_idx));
 
-	for (i = 0; hw->dtcs_used & (1U << i); i++)
-		cmn->dtc[i].counters[hw->dtc_idx] = NULL;
+	for_each_hw_dtc_idx(hw, j, idx)
+		cmn->dtc[j].counters[idx] = NULL;
 }
 
 static int arm_cmn_event_add(struct perf_event *event, int flags)
 {
 	struct arm_cmn *cmn = to_cmn(event->pmu);
 	struct arm_cmn_hw_event *hw = to_cmn_hw(event);
-	struct arm_cmn_dtc *dtc = &cmn->dtc[0];
 	struct arm_cmn_node *dn;
 	enum cmn_node_type type = CMN_EVENT_TYPE(event);
-	unsigned int i, dtc_idx, input_sel;
+	unsigned int input_sel, i = 0;
 
 	if (type == CMN_TYPE_DTC) {
-		i = 0;
 		while (cmn->dtc[i].cycles)
 			if (++i == cmn->num_dtcs)
 				return -ENOSPC;
 
 		cmn->dtc[i].cycles = event;
-		hw->dtc_idx = CMN_DT_NUM_COUNTERS;
-		hw->dtcs_used = 1U << i;
+		hw->dtc_idx[0] = i;
 
 		if (flags & PERF_EF_START)
 			arm_cmn_event_start(event, 0);
@@ -1812,17 +1807,22 @@ static int arm_cmn_event_add(struct perf_event *event, int flags)
 	}
 
 	/* Grab a free global counter first... */
-	dtc_idx = 0;
-	while (dtc->counters[dtc_idx])
-		if (++dtc_idx == CMN_DT_NUM_COUNTERS)
-			return -ENOSPC;
-
-	hw->dtc_idx = dtc_idx;
+	for_each_hw_dtc_idx(hw, j, idx) {
+		if (j > 0) {
+			idx = hw->dtc_idx[0];
+		} else {
+			idx = 0;
+			while (cmn->dtc[j].counters[idx])
+				if (++idx == CMN_DT_NUM_COUNTERS)
+					goto free_dtms;
+		}
+		hw->dtc_idx[j] = idx;
+	}
 
 	/* ...then the local counters to feed it. */
 	for_each_hw_dn(hw, dn, i) {
 		struct arm_cmn_dtm *dtm = &cmn->dtms[dn->dtm] + hw->dtm_offset;
-		unsigned int dtm_idx, shift;
+		unsigned int dtm_idx, shift, d = 0;
 		u64 reg;
 
 		dtm_idx = 0;
@@ -1841,11 +1841,11 @@ static int arm_cmn_event_add(struct perf_event *event, int flags)
 
 			tmp = dtm->wp_event[wp_idx ^ 1];
 			if (tmp >= 0 && CMN_EVENT_WP_COMBINE(event) !=
-					CMN_EVENT_WP_COMBINE(dtc->counters[tmp]))
+					CMN_EVENT_WP_COMBINE(cmn->dtc[d].counters[tmp]))
 				goto free_dtms;
 
 			input_sel = CMN__PMEVCNT0_INPUT_SEL_WP + wp_idx;
-			dtm->wp_event[wp_idx] = dtc_idx;
+			dtm->wp_event[wp_idx] = hw->dtc_idx[d];
 			writel_relaxed(cfg, dtm->base + CMN_DTM_WPn_CONFIG(wp_idx));
 		} else {
 			struct arm_cmn_nodeid nid = arm_cmn_nid(cmn, dn->id);
@@ -1865,7 +1865,7 @@ static int arm_cmn_event_add(struct perf_event *event, int flags)
 		dtm->input_sel[dtm_idx] = input_sel;
 		shift = CMN__PMEVCNTn_GLOBAL_NUM_SHIFT(dtm_idx);
 		dtm->pmu_config_low &= ~(CMN__PMEVCNT0_GLOBAL_NUM << shift);
-		dtm->pmu_config_low |= FIELD_PREP(CMN__PMEVCNT0_GLOBAL_NUM, dtc_idx) << shift;
+		dtm->pmu_config_low |= FIELD_PREP(CMN__PMEVCNT0_GLOBAL_NUM, hw->dtc_idx[d]) << shift;
 		dtm->pmu_config_low |= CMN__PMEVCNT_PAIRED(dtm_idx);
 		reg = (u64)le32_to_cpu(dtm->pmu_config_high) << 32 | dtm->pmu_config_low;
 		writeq_relaxed(reg, dtm->base + CMN_DTM_PMU_CONFIG);
@@ -1893,7 +1893,7 @@ static void arm_cmn_event_del(struct perf_event *event, int flags)
 	arm_cmn_event_stop(event, PERF_EF_UPDATE);
 
 	if (type == CMN_TYPE_DTC)
-		cmn->dtc[__ffs(hw->dtcs_used)].cycles = NULL;
+		cmn->dtc[hw->dtc_idx[0]].cycles = NULL;
 	else
 		arm_cmn_event_clear(cmn, event, hw->num_dns);
 }
@@ -2074,7 +2074,6 @@ static int arm_cmn_init_dtcs(struct arm_cmn *cmn)
 {
 	struct arm_cmn_node *dn, *xp;
 	int dtc_idx = 0;
-	u8 dtcs_present = (1 << cmn->num_dtcs) - 1;
 
 	cmn->dtc = devm_kcalloc(cmn->dev, cmn->num_dtcs, sizeof(cmn->dtc[0]), GFP_KERNEL);
 	if (!cmn->dtc)
@@ -2084,23 +2083,26 @@ static int arm_cmn_init_dtcs(struct arm_cmn *cmn)
 
 	cmn->xps = arm_cmn_node(cmn, CMN_TYPE_XP);
 
+	if (cmn->part == PART_CMN600 && cmn->num_dtcs > 1) {
+		/* We do at least know that a DTC's XP must be in that DTC's domain */
+		dn = arm_cmn_node(cmn, CMN_TYPE_DTC);
+		for (int i = 0; i < cmn->num_dtcs; i++)
+			arm_cmn_node_to_xp(cmn, dn + i)->dtc = i;
+	}
+
 	for (dn = cmn->dns; dn->type; dn++) {
-		if (dn->type == CMN_TYPE_XP) {
-			dn->dtc &= dtcs_present;
+		if (dn->type == CMN_TYPE_XP)
 			continue;
-		}
 
 		xp = arm_cmn_node_to_xp(cmn, dn);
+		dn->dtc = xp->dtc;
 		dn->dtm = xp->dtm;
 		if (cmn->multi_dtm)
 			dn->dtm += arm_cmn_nid(cmn, dn->id).port / 2;
 
 		if (dn->type == CMN_TYPE_DTC) {
-			int err;
-			/* We do at least know that a DTC's XP must be in that DTC's domain */
-			if (xp->dtc == 0xf)
-				xp->dtc = 1 << dtc_idx;
-			err = arm_cmn_init_dtc(cmn, dn, dtc_idx++);
+			int err = arm_cmn_init_dtc(cmn, dn, dtc_idx++);
+
 			if (err)
 				return err;
 		}
@@ -2258,9 +2260,9 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset)
 			cmn->mesh_x = xp->logid;
 
 		if (cmn->part == PART_CMN600)
-			xp->dtc = 0xf;
+			xp->dtc = -1;
 		else
-			xp->dtc = 1 << arm_cmn_dtc_domain(cmn, xp_region);
+			xp->dtc = arm_cmn_dtc_domain(cmn, xp_region);
 
 		xp->dtm = dtm - cmn->dtms;
 		arm_cmn_init_dtm(dtm++, xp, 0);
-- 
cgit v1.2.3


From ab33c66fd8f124f7c4a35c96fcfbcd262764b0f6 Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Fri, 20 Oct 2023 18:51:27 +0100
Subject: perf/arm-cmn: Enable per-DTC counter allocation

Finally enable independent per-DTC-domain counter allocation, except on
CMN-600 where we still need to cope with not knowing the domain topology
and thus keep counter indices sychronised across domains. This allows
users to simultaneously count up to 8 targeted events per domain, rather
than 8 globally, for up to 4x wider coverage on maximum configurations.

Even though this now looks deceptively simple, I stand by my previous
assertion that it was a flippin' nightmare to implement; all the real
head-scratchers are hidden in the foundations in the previous patch...

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/849f65566582cb102c6d0843d0f26e231180f8ac.1697824215.git.robin.murphy@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/perf/arm-cmn.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c
index 675f1638013e..9479e919c063 100644
--- a/drivers/perf/arm-cmn.c
+++ b/drivers/perf/arm-cmn.c
@@ -1570,7 +1570,7 @@ struct arm_cmn_val {
 	u8 dtm_count[CMN_MAX_DTMS];
 	u8 occupid[CMN_MAX_DTMS][SEL_MAX];
 	u8 wp[CMN_MAX_DTMS][4];
-	int dtc_count;
+	int dtc_count[CMN_MAX_DTCS];
 	bool cycles;
 };
 
@@ -1591,7 +1591,8 @@ static void arm_cmn_val_add_event(struct arm_cmn *cmn, struct arm_cmn_val *val,
 		return;
 	}
 
-	val->dtc_count++;
+	for_each_hw_dtc_idx(hw, dtc, idx)
+		val->dtc_count[dtc]++;
 
 	for_each_hw_dn(hw, dn, i) {
 		int wp_idx, dtm = dn->dtm, sel = hw->filter_sel;
@@ -1638,8 +1639,9 @@ static int arm_cmn_validate_group(struct arm_cmn *cmn, struct perf_event *event)
 		goto done;
 	}
 
-	if (val->dtc_count == CMN_DT_NUM_COUNTERS)
-		goto done;
+	for (i = 0; i < CMN_MAX_DTCS; i++)
+		if (val->dtc_count[i] == CMN_DT_NUM_COUNTERS)
+			goto done;
 
 	for_each_hw_dn(hw, dn, i) {
 		int wp_idx, wp_cmb, dtm = dn->dtm, sel = hw->filter_sel;
@@ -1806,9 +1808,9 @@ static int arm_cmn_event_add(struct perf_event *event, int flags)
 		return 0;
 	}
 
-	/* Grab a free global counter first... */
+	/* Grab the global counters first... */
 	for_each_hw_dtc_idx(hw, j, idx) {
-		if (j > 0) {
+		if (cmn->part == PART_CMN600 && j > 0) {
 			idx = hw->dtc_idx[0];
 		} else {
 			idx = 0;
@@ -1819,10 +1821,10 @@ static int arm_cmn_event_add(struct perf_event *event, int flags)
 		hw->dtc_idx[j] = idx;
 	}
 
-	/* ...then the local counters to feed it. */
+	/* ...then the local counters to feed them */
 	for_each_hw_dn(hw, dn, i) {
 		struct arm_cmn_dtm *dtm = &cmn->dtms[dn->dtm] + hw->dtm_offset;
-		unsigned int dtm_idx, shift, d = 0;
+		unsigned int dtm_idx, shift, d = max_t(int, dn->dtc, 0);
 		u64 reg;
 
 		dtm_idx = 0;
-- 
cgit v1.2.3


From 23b727dc2092d4b0604347ff3f0a75705078058a Mon Sep 17 00:00:00 2001
From: Jeremy Linton <jeremy.linton@arm.com>
Date: Tue, 17 Oct 2023 00:23:20 -0500
Subject: arm64: cpufeature: Display the set of cores with a feature

The AMU feature can be enabled on a subset of the cores in a system.
Because of that, it prints a message for each core as it is detected.
This becomes tedious when there are hundreds of cores. Instead, for
CPU features which can be enabled on a subset of the present cores,
lets wait until update_cpu_capabilities() and print the subset of cores
the feature was enabled on.

Signed-off-by: Jeremy Linton <jeremy.linton@arm.com>
Reviewed-by: Ionela Voinescu <ionela.voinescu@arm.com>
Tested-by: Ionela Voinescu <ionela.voinescu@arm.com>
Reviewed-by: Punit Agrawal <punit.agrawal@bytedance.com>
Tested-by: Punit Agrawal <punit.agrawal@bytedance.com>
Link: https://lore.kernel.org/r/20231017052322.1211099-2-jeremy.linton@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/cpufeature.h |  2 ++
 arch/arm64/kernel/cpufeature.c      | 22 +++++++++++++---------
 2 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index 5bba39376055..19b4d001d845 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -23,6 +23,7 @@
 #include <linux/bug.h>
 #include <linux/jump_label.h>
 #include <linux/kernel.h>
+#include <linux/cpumask.h>
 
 /*
  * CPU feature register tracking
@@ -380,6 +381,7 @@ struct arm64_cpu_capabilities {
 	 * method is robust against being called multiple times.
 	 */
 	const struct arm64_cpu_capabilities *match_list;
+	const struct cpumask *cpus;
 };
 
 static inline int cpucap_default_scope(const struct arm64_cpu_capabilities *cap)
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 444a73c2e638..2dd695fc3472 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -1944,8 +1944,6 @@ int get_cpu_with_amu_feat(void)
 static void cpu_amu_enable(struct arm64_cpu_capabilities const *cap)
 {
 	if (has_cpuid_feature(cap, SCOPE_LOCAL_CPU)) {
-		pr_info("detected CPU%d: Activity Monitors Unit (AMU)\n",
-			smp_processor_id());
 		cpumask_set_cpu(smp_processor_id(), &amu_cpus);
 
 		/* 0 reference values signal broken/disabled counters */
@@ -2405,16 +2403,12 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
 #endif /* CONFIG_ARM64_RAS_EXTN */
 #ifdef CONFIG_ARM64_AMU_EXTN
 	{
-		/*
-		 * The feature is enabled by default if CONFIG_ARM64_AMU_EXTN=y.
-		 * Therefore, don't provide .desc as we don't want the detection
-		 * message to be shown until at least one CPU is detected to
-		 * support the feature.
-		 */
+		.desc = "Activity Monitors Unit (AMU)",
 		.capability = ARM64_HAS_AMU_EXTN,
 		.type = ARM64_CPUCAP_WEAK_LOCAL_CPU_FEATURE,
 		.matches = has_amu,
 		.cpu_enable = cpu_amu_enable,
+		.cpus = &amu_cpus,
 		ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, AMU, IMP)
 	},
 #endif /* CONFIG_ARM64_AMU_EXTN */
@@ -2981,7 +2975,7 @@ static void update_cpu_capabilities(u16 scope_mask)
 		    !caps->matches(caps, cpucap_default_scope(caps)))
 			continue;
 
-		if (caps->desc)
+		if (caps->desc && !caps->cpus)
 			pr_info("detected: %s\n", caps->desc);
 
 		__set_bit(caps->capability, system_cpucaps);
@@ -3330,6 +3324,7 @@ unsigned long cpu_get_elf_hwcap2(void)
 
 static void __init setup_system_capabilities(void)
 {
+	int i;
 	/*
 	 * We have finalised the system-wide safe feature
 	 * registers, finalise the capabilities that depend
@@ -3338,6 +3333,15 @@ static void __init setup_system_capabilities(void)
 	 */
 	update_cpu_capabilities(SCOPE_SYSTEM);
 	enable_cpu_capabilities(SCOPE_ALL & ~SCOPE_BOOT_CPU);
+
+	for (i = 0; i < ARM64_NCAPS; i++) {
+		const struct arm64_cpu_capabilities *caps = cpucap_ptrs[i];
+
+		if (caps && caps->cpus && caps->desc &&
+			cpumask_any(caps->cpus) < nr_cpu_ids)
+			pr_info("detected: %s on CPU%*pbl\n",
+				caps->desc, cpumask_pr_args(caps->cpus));
+	}
 }
 
 void __init setup_cpu_features(void)
-- 
cgit v1.2.3


From 04d402a453c37f74487d690d11caeccd59fdeb9f Mon Sep 17 00:00:00 2001
From: Jeremy Linton <jeremy.linton@arm.com>
Date: Tue, 17 Oct 2023 00:23:21 -0500
Subject: arm64: cpufeature: Change DBM to display enabled cores

Now that we have the ability to display the list of cores
with a feature when its selectivly enabled, lets convert
DBM to use that as well.

Signed-off-by: Jeremy Linton <jeremy.linton@arm.com>
Link: https://lore.kernel.org/r/20231017052322.1211099-3-jeremy.linton@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/cpufeature.c | 33 ++++++++-------------------------
 1 file changed, 8 insertions(+), 25 deletions(-)

diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 2dd695fc3472..b7b67bac0e60 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -1848,6 +1848,8 @@ static int __init parse_kpti(char *str)
 early_param("kpti", parse_kpti);
 
 #ifdef CONFIG_ARM64_HW_AFDBM
+static struct cpumask dbm_cpus __read_mostly;
+
 static inline void __cpu_enable_hw_dbm(void)
 {
 	u64 tcr = read_sysreg(tcr_el1) | TCR_HD;
@@ -1883,35 +1885,22 @@ static bool cpu_can_use_dbm(const struct arm64_cpu_capabilities *cap)
 
 static void cpu_enable_hw_dbm(struct arm64_cpu_capabilities const *cap)
 {
-	if (cpu_can_use_dbm(cap))
+	if (cpu_can_use_dbm(cap)) {
 		__cpu_enable_hw_dbm();
+		cpumask_set_cpu(smp_processor_id(), &dbm_cpus);
+	}
 }
 
 static bool has_hw_dbm(const struct arm64_cpu_capabilities *cap,
 		       int __unused)
 {
-	static bool detected = false;
 	/*
 	 * DBM is a non-conflicting feature. i.e, the kernel can safely
 	 * run a mix of CPUs with and without the feature. So, we
 	 * unconditionally enable the capability to allow any late CPU
 	 * to use the feature. We only enable the control bits on the
-	 * CPU, if it actually supports.
-	 *
-	 * We have to make sure we print the "feature" detection only
-	 * when at least one CPU actually uses it. So check if this CPU
-	 * can actually use it and print the message exactly once.
-	 *
-	 * This is safe as all CPUs (including secondary CPUs - due to the
-	 * LOCAL_CPU scope - and the hotplugged CPUs - via verification)
-	 * goes through the "matches" check exactly once. Also if a CPU
-	 * matches the criteria, it is guaranteed that the CPU will turn
-	 * the DBM on, as the capability is unconditionally enabled.
+	 * CPU, if it is supported.
 	 */
-	if (!detected && cpu_can_use_dbm(cap)) {
-		detected = true;
-		pr_info("detected: Hardware dirty bit management\n");
-	}
 
 	return true;
 }
@@ -2448,18 +2437,12 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
 	},
 #ifdef CONFIG_ARM64_HW_AFDBM
 	{
-		/*
-		 * Since we turn this on always, we don't want the user to
-		 * think that the feature is available when it may not be.
-		 * So hide the description.
-		 *
-		 * .desc = "Hardware pagetable Dirty Bit Management",
-		 *
-		 */
+		.desc = "Hardware dirty bit management",
 		.type = ARM64_CPUCAP_WEAK_LOCAL_CPU_FEATURE,
 		.capability = ARM64_HW_DBM,
 		.matches = has_hw_dbm,
 		.cpu_enable = cpu_enable_hw_dbm,
+		.cpus = &dbm_cpus,
 		ARM64_CPUID_FIELDS(ID_AA64MMFR1_EL1, HAFDBS, DBM)
 	},
 #endif
-- 
cgit v1.2.3


From 6d7d51e88e21c0af1ca96a3617afef334bfeffcf Mon Sep 17 00:00:00 2001
From: Yicong Yang <yangyicong@hisilicon.com>
Date: Tue, 24 Oct 2023 17:29:53 +0800
Subject: drivers/perf: hisi_pcie: Check the type first in pmu::event_init()

Check whether the event type matches the PMU type firstly in
pmu::event_init() before touching the event. Otherwise we'll
change the events of others and lead to incorrect results.
Since in perf_init_event() we may call every pmu's event_init()
in a certain case, we should not modify the event if it's not
ours.

Fixes: 8404b0fbc7fb ("drivers/perf: hisi: Add driver for HiSilicon PCIe PMU")
Signed-off-by: Yicong Yang <yangyicong@hisilicon.com>
Link: https://lore.kernel.org/r/20231024092954.42297-2-yangyicong@huawei.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/perf/hisilicon/hisi_pcie_pmu.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/perf/hisilicon/hisi_pcie_pmu.c b/drivers/perf/hisilicon/hisi_pcie_pmu.c
index 5a00adb2de8c..051efffc44c8 100644
--- a/drivers/perf/hisilicon/hisi_pcie_pmu.c
+++ b/drivers/perf/hisilicon/hisi_pcie_pmu.c
@@ -353,6 +353,10 @@ static int hisi_pcie_pmu_event_init(struct perf_event *event)
 	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu);
 	struct hw_perf_event *hwc = &event->hw;
 
+	/* Check the type first before going on, otherwise it's not our event */
+	if (event->attr.type != event->pmu->type)
+		return -ENOENT;
+
 	event->cpu = pcie_pmu->on_cpu;
 
 	if (EXT_COUNTER_IS_USED(hisi_pcie_get_event(event)))
@@ -360,9 +364,6 @@ static int hisi_pcie_pmu_event_init(struct perf_event *event)
 	else
 		hwc->event_base = HISI_PCIE_CNT;
 
-	if (event->attr.type != event->pmu->type)
-		return -ENOENT;
-
 	/* Sampling is not supported. */
 	if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK)
 		return -EOPNOTSUPP;
-- 
cgit v1.2.3


From 868f8a709874729998bcad3422124b678b18b755 Mon Sep 17 00:00:00 2001
From: Yicong Yang <yangyicong@hisilicon.com>
Date: Tue, 24 Oct 2023 17:29:54 +0800
Subject: drivers/perf: hisi_pcie: Initialize event->cpu only on success

Initialize the event->cpu only on success. To be more reasonable
and keep consistent with other PMUs.

Signed-off-by: Yicong Yang <yangyicong@hisilicon.com>
Link: https://lore.kernel.org/r/20231024092954.42297-3-yangyicong@huawei.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/perf/hisilicon/hisi_pcie_pmu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/perf/hisilicon/hisi_pcie_pmu.c b/drivers/perf/hisilicon/hisi_pcie_pmu.c
index 051efffc44c8..b90ba8aca3fa 100644
--- a/drivers/perf/hisilicon/hisi_pcie_pmu.c
+++ b/drivers/perf/hisilicon/hisi_pcie_pmu.c
@@ -357,8 +357,6 @@ static int hisi_pcie_pmu_event_init(struct perf_event *event)
 	if (event->attr.type != event->pmu->type)
 		return -ENOENT;
 
-	event->cpu = pcie_pmu->on_cpu;
-
 	if (EXT_COUNTER_IS_USED(hisi_pcie_get_event(event)))
 		hwc->event_base = HISI_PCIE_EXT_CNT;
 	else
@@ -374,6 +372,8 @@ static int hisi_pcie_pmu_event_init(struct perf_event *event)
 	if (!hisi_pcie_pmu_validate_event_group(event))
 		return -EINVAL;
 
+	event->cpu = pcie_pmu->on_cpu;
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From b805cafc604bfdb671fae7347a57f51154afa735 Mon Sep 17 00:00:00 2001
From: Junhao He <hejunhao3@huawei.com>
Date: Tue, 24 Oct 2023 19:36:30 +0800
Subject: perf: hisi: Fix use-after-free when register pmu fails

When we fail to register the uncore pmu, the pmu context may not been
allocated. The error handing will call cpuhp_state_remove_instance()
to call uncore pmu offline callback, which migrate the pmu context.
Since that's liable to lead to some kind of use-after-free.

Use cpuhp_state_remove_instance_nocalls() instead of
cpuhp_state_remove_instance() so that the notifiers don't execute after
the PMU device has been failed to register.

Fixes: a0ab25cd82ee ("drivers/perf: hisi: Add support for HiSilicon PA PMU driver")
FIxes: 3bf30882c3c7 ("drivers/perf: hisi: Add support for HiSilicon SLLC PMU driver")
Signed-off-by: Junhao He <hejunhao3@huawei.com>
Link: https://lore.kernel.org/r/20231024113630.13472-1-hejunhao3@huawei.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/perf/hisilicon/hisi_uncore_pa_pmu.c   | 4 ++--
 drivers/perf/hisilicon/hisi_uncore_sllc_pmu.c | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/perf/hisilicon/hisi_uncore_pa_pmu.c b/drivers/perf/hisilicon/hisi_uncore_pa_pmu.c
index d941e746b424..797cf201996a 100644
--- a/drivers/perf/hisilicon/hisi_uncore_pa_pmu.c
+++ b/drivers/perf/hisilicon/hisi_uncore_pa_pmu.c
@@ -505,8 +505,8 @@ static int hisi_pa_pmu_probe(struct platform_device *pdev)
 	ret = perf_pmu_register(&pa_pmu->pmu, name, -1);
 	if (ret) {
 		dev_err(pa_pmu->dev, "PMU register failed, ret = %d\n", ret);
-		cpuhp_state_remove_instance(CPUHP_AP_PERF_ARM_HISI_PA_ONLINE,
-					    &pa_pmu->node);
+		cpuhp_state_remove_instance_nocalls(CPUHP_AP_PERF_ARM_HISI_PA_ONLINE,
+						    &pa_pmu->node);
 		return ret;
 	}
 
diff --git a/drivers/perf/hisilicon/hisi_uncore_sllc_pmu.c b/drivers/perf/hisilicon/hisi_uncore_sllc_pmu.c
index 6fe534a665ed..e706ca567676 100644
--- a/drivers/perf/hisilicon/hisi_uncore_sllc_pmu.c
+++ b/drivers/perf/hisilicon/hisi_uncore_sllc_pmu.c
@@ -450,8 +450,8 @@ static int hisi_sllc_pmu_probe(struct platform_device *pdev)
 	ret = perf_pmu_register(&sllc_pmu->pmu, name, -1);
 	if (ret) {
 		dev_err(sllc_pmu->dev, "PMU register failed, ret = %d\n", ret);
-		cpuhp_state_remove_instance(CPUHP_AP_PERF_ARM_HISI_SLLC_ONLINE,
-					    &sllc_pmu->node);
+		cpuhp_state_remove_instance_nocalls(CPUHP_AP_PERF_ARM_HISI_SLLC_ONLINE,
+						    &sllc_pmu->node);
 		return ret;
 	}
 
-- 
cgit v1.2.3


From c54e52f84d7aa590e90e1f73f462517ac40051e1 Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Mon, 23 Oct 2023 14:35:03 +0100
Subject: arm64, irqchip/gic-v3, ACPI: Move MADT GICC enabled check into a
 helper

ACPI, irqchip and the architecture code all inspect the MADT
enabled bit for a GICC entry in the MADT.

The addition of an 'online capable' bit means all these sites need
updating.

Move the current checks behind a helper to make future updates easier.

Signed-off-by: James Morse <james.morse@arm.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Signed-off-by: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Acked-by: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Reviewed-by: Sudeep Holla <sudeep.holla@arm.com>
Link: https://lore.kernel.org/r/E1quv5D-00AeNJ-U8@rmk-PC.armlinux.org.uk
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/smp.c       |  2 +-
 drivers/acpi/processor_core.c |  2 +-
 drivers/irqchip/irq-gic-v3.c  | 10 ++++------
 include/linux/acpi.h          |  5 +++++
 4 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 960b98b43506..8c8f55721786 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -520,7 +520,7 @@ acpi_map_gic_cpu_interface(struct acpi_madt_generic_interrupt *processor)
 {
 	u64 hwid = processor->arm_mpidr;
 
-	if (!(processor->flags & ACPI_MADT_ENABLED)) {
+	if (!acpi_gicc_is_usable(processor)) {
 		pr_debug("skipping disabled CPU entry with 0x%llx MPIDR\n", hwid);
 		return;
 	}
diff --git a/drivers/acpi/processor_core.c b/drivers/acpi/processor_core.c
index 7dd6dbaa98c3..b203cfe28550 100644
--- a/drivers/acpi/processor_core.c
+++ b/drivers/acpi/processor_core.c
@@ -90,7 +90,7 @@ static int map_gicc_mpidr(struct acpi_subtable_header *entry,
 	struct acpi_madt_generic_interrupt *gicc =
 	    container_of(entry, struct acpi_madt_generic_interrupt, header);
 
-	if (!(gicc->flags & ACPI_MADT_ENABLED))
+	if (!acpi_gicc_is_usable(gicc))
 		return -ENODEV;
 
 	/* device_declaration means Device object in DSDT, in the
diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c
index eedfa8e9f077..72d3cdebdad1 100644
--- a/drivers/irqchip/irq-gic-v3.c
+++ b/drivers/irqchip/irq-gic-v3.c
@@ -2367,8 +2367,7 @@ gic_acpi_parse_madt_gicc(union acpi_subtable_headers *header,
 	u32 size = reg == GIC_PIDR2_ARCH_GICv4 ? SZ_64K * 4 : SZ_64K * 2;
 	void __iomem *redist_base;
 
-	/* GICC entry which has !ACPI_MADT_ENABLED is not unusable so skip */
-	if (!(gicc->flags & ACPI_MADT_ENABLED))
+	if (!acpi_gicc_is_usable(gicc))
 		return 0;
 
 	redist_base = ioremap(gicc->gicr_base_address, size);
@@ -2418,7 +2417,7 @@ static int __init gic_acpi_match_gicc(union acpi_subtable_headers *header,
 	 * If GICC is enabled and has valid gicr base address, then it means
 	 * GICR base is presented via GICC
 	 */
-	if ((gicc->flags & ACPI_MADT_ENABLED) && gicc->gicr_base_address) {
+	if (acpi_gicc_is_usable(gicc) && gicc->gicr_base_address) {
 		acpi_data.enabled_rdists++;
 		return 0;
 	}
@@ -2427,7 +2426,7 @@ static int __init gic_acpi_match_gicc(union acpi_subtable_headers *header,
 	 * It's perfectly valid firmware can pass disabled GICC entry, driver
 	 * should not treat as errors, skip the entry instead of probe fail.
 	 */
-	if (!(gicc->flags & ACPI_MADT_ENABLED))
+	if (!acpi_gicc_is_usable(gicc))
 		return 0;
 
 	return -ENODEV;
@@ -2486,8 +2485,7 @@ static int __init gic_acpi_parse_virt_madt_gicc(union acpi_subtable_headers *hea
 	int maint_irq_mode;
 	static int first_madt = true;
 
-	/* Skip unusable CPUs */
-	if (!(gicc->flags & ACPI_MADT_ENABLED))
+	if (!acpi_gicc_is_usable(gicc))
 		return 0;
 
 	maint_irq_mode = (gicc->flags & ACPI_MADT_VGIC_IRQ_MODE) ?
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index a73246c3c35e..fae2aa0b5e8b 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -256,6 +256,11 @@ acpi_table_parse_cedt(enum acpi_cedt_type id,
 int acpi_parse_mcfg (struct acpi_table_header *header);
 void acpi_table_print_madt_entry (struct acpi_subtable_header *madt);
 
+static inline bool acpi_gicc_is_usable(struct acpi_madt_generic_interrupt *gicc)
+{
+	return gicc->flags & ACPI_MADT_ENABLED;
+}
+
 /* the following numa functions are architecture-dependent */
 void acpi_numa_slit_init (struct acpi_table_slit *slit);
 
-- 
cgit v1.2.3


From d35686444fc80950c731e33a2f6ad4a55822be9b Mon Sep 17 00:00:00 2001
From: Maria Yu <quic_aiquny@quicinc.com>
Date: Tue, 24 Oct 2023 09:09:54 +0800
Subject: arm64: module: Fix PLT counting when CONFIG_RANDOMIZE_BASE=n

The counting of module PLTs has been broken when CONFIG_RANDOMIZE_BASE=n
since commit:

  3e35d303ab7d22c4 ("arm64: module: rework module VA range selection")

Prior to that commit, when CONFIG_RANDOMIZE_BASE=n, the kernel image and
all modules were placed within a 128M region, and no PLTs were necessary
for B or BL. Hence count_plts() and partition_branch_plt_relas() skipped
handling B and BL when CONFIG_RANDOMIZE_BASE=n.

After that commit, modules can be placed anywhere within a 2G window
regardless of CONFIG_RANDOMIZE_BASE, and hence PLTs may be necessary for
B and BL even when CONFIG_RANDOMIZE_BASE=n. Unfortunately that commit
failed to update count_plts() and partition_branch_plt_relas()
accordingly.

Due to this, module_emit_plt_entry() may fail if an insufficient number
of PLT entries have been reserved, resulting in modules failing to load
with -ENOEXEC.

Fix this by counting PLTs regardless of CONFIG_RANDOMIZE_BASE in
count_plts() and partition_branch_plt_relas().

Fixes: 3e35d303ab7d ("arm64: module: rework module VA range selection")
Signed-off-by: Maria Yu <quic_aiquny@quicinc.com>
Cc: <stable@vger.kernel.org> # 6.5.x
Acked-by: Ard Biesheuvel <ardb@kernel.org>
Fixes: 3e35d303ab7d ("arm64: module: rework module VA range selection")
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Link: https://lore.kernel.org/r/20231024010954.6768-1-quic_aiquny@quicinc.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/module-plts.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/arch/arm64/kernel/module-plts.c b/arch/arm64/kernel/module-plts.c
index bd69a4e7cd60..79200f21e123 100644
--- a/arch/arm64/kernel/module-plts.c
+++ b/arch/arm64/kernel/module-plts.c
@@ -167,9 +167,6 @@ static unsigned int count_plts(Elf64_Sym *syms, Elf64_Rela *rela, int num,
 		switch (ELF64_R_TYPE(rela[i].r_info)) {
 		case R_AARCH64_JUMP26:
 		case R_AARCH64_CALL26:
-			if (!IS_ENABLED(CONFIG_RANDOMIZE_BASE))
-				break;
-
 			/*
 			 * We only have to consider branch targets that resolve
 			 * to symbols that are defined in a different section.
@@ -269,9 +266,6 @@ static int partition_branch_plt_relas(Elf64_Sym *syms, Elf64_Rela *rela,
 {
 	int i = 0, j = numrels - 1;
 
-	if (!IS_ENABLED(CONFIG_RANDOMIZE_BASE))
-		return 0;
-
 	while (i < j) {
 		if (branch_rela_needs_plt(syms, &rela[i], dstidx))
 			i++;
-- 
cgit v1.2.3


From 146a15b873353f8ac28dc281c139ff611a3c4848 Mon Sep 17 00:00:00 2001
From: Nathan Chancellor <nathan@kernel.org>
Date: Wed, 25 Oct 2023 10:21:28 -0700
Subject: arm64: Restrict CPU_BIG_ENDIAN to GNU as or LLVM IAS 15.x or newer

Prior to LLVM 15.0.0, LLVM's integrated assembler would incorrectly
byte-swap NOP when compiling for big-endian, and the resulting series of
bytes happened to match the encoding of FNMADD S21, S30, S0, S0.

This went unnoticed until commit:

  34f66c4c4d5518c1 ("arm64: Use a positive cpucap for FP/SIMD")

Prior to that commit, the kernel would always enable the use of FPSIMD
early in boot when __cpu_setup() initialized CPACR_EL1, and so usage of
FNMADD within the kernel was not detected, but could result in the
corruption of user or kernel FPSIMD state.

After that commit, the instructions happen to trap during boot prior to
FPSIMD being detected and enabled, e.g.

| Unhandled 64-bit el1h sync exception on CPU0, ESR 0x000000001fe00000 -- ASIMD
| CPU: 0 PID: 0 Comm: swapper Not tainted 6.6.0-rc3-00013-g34f66c4c4d55 #1
| Hardware name: linux,dummy-virt (DT)
| pstate: 400000c9 (nZcv daIF -PAN -UAO -TCO -DIT -SSBS BTYPE=--)
| pc : __pi_strcmp+0x1c/0x150
| lr : populate_properties+0xe4/0x254
| sp : ffffd014173d3ad0
| x29: ffffd014173d3af0 x28: fffffbfffddffcb8 x27: 0000000000000000
| x26: 0000000000000058 x25: fffffbfffddfe054 x24: 0000000000000008
| x23: fffffbfffddfe000 x22: fffffbfffddfe000 x21: fffffbfffddfe044
| x20: ffffd014173d3b70 x19: 0000000000000001 x18: 0000000000000005
| x17: 0000000000000010 x16: 0000000000000000 x15: 00000000413e7000
| x14: 0000000000000000 x13: 0000000000001bcc x12: 0000000000000000
| x11: 00000000d00dfeed x10: ffffd414193f2cd0 x9 : 0000000000000000
| x8 : 0101010101010101 x7 : ffffffffffffffc0 x6 : 0000000000000000
| x5 : 0000000000000000 x4 : 0101010101010101 x3 : 000000000000002a
| x2 : 0000000000000001 x1 : ffffd014171f2988 x0 : fffffbfffddffcb8
| Kernel panic - not syncing: Unhandled exception
| CPU: 0 PID: 0 Comm: swapper Not tainted 6.6.0-rc3-00013-g34f66c4c4d55 #1
| Hardware name: linux,dummy-virt (DT)
| Call trace:
|  dump_backtrace+0xec/0x108
|  show_stack+0x18/0x2c
|  dump_stack_lvl+0x50/0x68
|  dump_stack+0x18/0x24
|  panic+0x13c/0x340
|  el1t_64_irq_handler+0x0/0x1c
|  el1_abort+0x0/0x5c
|  el1h_64_sync+0x64/0x68
|  __pi_strcmp+0x1c/0x150
|  unflatten_dt_nodes+0x1e8/0x2d8
|  __unflatten_device_tree+0x5c/0x15c
|  unflatten_device_tree+0x38/0x50
|  setup_arch+0x164/0x1e0
|  start_kernel+0x64/0x38c
|  __primary_switched+0xbc/0xc4

Restrict CONFIG_CPU_BIG_ENDIAN to a known good assembler, which is
either GNU as or LLVM's IAS 15.0.0 and newer, which contains the linked
commit.

Closes: https://github.com/ClangBuiltLinux/linux/issues/1948
Link: https://github.com/llvm/llvm-project/commit/1379b150991f70a5782e9a143c2ba5308da1161c
Signed-off-by: Nathan Chancellor <nathan@kernel.org>
Cc: stable@vger.kernel.org
Acked-by: Mark Rutland <mark.rutland@arm.com>
Link: https://lore.kernel.org/r/20231025-disable-arm64-be-ias-b4-llvm-15-v1-1-b25263ed8b23@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/Kconfig | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index b10515c0200b..cacc67121adb 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1355,6 +1355,8 @@ choice
 config CPU_BIG_ENDIAN
 	bool "Build big-endian kernel"
 	depends on !LD_IS_LLD || LLD_VERSION >= 130000
+	# https://github.com/llvm/llvm-project/commit/1379b150991f70a5782e9a143c2ba5308da1161c
+	depends on AS_IS_GNU || AS_VERSION >= 150000
 	help
 	  Say Y if you plan on running a kernel with a big-endian userspace.
 
-- 
cgit v1.2.3