From aa44284960d550eb4d8614afdffebc68a432a9b4 Mon Sep 17 00:00:00 2001
From: Nadav Amit <namit@vmware.com>
Date: Mon, 6 Jun 2022 11:01:23 -0700
Subject: x86/mm/tlb: Avoid reading mm_tlb_gen when possible

On extreme TLB shootdown storms, the mm's tlb_gen cacheline is highly
contended and reading it should (arguably) be avoided as much as
possible.

Currently, flush_tlb_func() reads the mm's tlb_gen unconditionally,
even when it is not necessary (e.g., the mm was already switched).
This is wasteful.

Moreover, one of the existing optimizations is to read mm's tlb_gen to
see if there are additional in-flight TLB invalidations and flush the
entire TLB in such a case. However, if the request's tlb_gen was already
flushed, the benefit of checking the mm's tlb_gen is likely to be offset
by the overhead of the check itself.

Running will-it-scale with tlb_flush1_threads show a considerable
benefit on 56-core Skylake (up to +24%):

threads		Baseline (v5.17+)	+Patch
1		159960			160202
5		310808			308378 (-0.7%)
10		479110			490728
15		526771			562528
20		534495			587316
25		547462			628296
30		579616			666313
35		594134			701814
40		612288			732967
45		617517			749727
50		637476			735497
55		614363			778913 (+24%)

Signed-off-by: Nadav Amit <namit@vmware.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Andy Lutomirski <luto@kernel.org>
Link: https://lkml.kernel.org/r/20220606180123.2485171-1-namit@vmware.com
---
 arch/x86/mm/tlb.c | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index d400b6d9d246..d9314cc8b81f 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -734,10 +734,10 @@ static void flush_tlb_func(void *info)
 	const struct flush_tlb_info *f = info;
 	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
 	u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
-	u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen);
 	u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
 	bool local = smp_processor_id() == f->initiating_cpu;
 	unsigned long nr_invalidate = 0;
+	u64 mm_tlb_gen;
 
 	/* This code cannot presently handle being reentered. */
 	VM_WARN_ON(!irqs_disabled());
@@ -771,6 +771,22 @@ static void flush_tlb_func(void *info)
 		return;
 	}
 
+	if (f->new_tlb_gen <= local_tlb_gen) {
+		/*
+		 * The TLB is already up to date in respect to f->new_tlb_gen.
+		 * While the core might be still behind mm_tlb_gen, checking
+		 * mm_tlb_gen unnecessarily would have negative caching effects
+		 * so avoid it.
+		 */
+		return;
+	}
+
+	/*
+	 * Defer mm_tlb_gen reading as long as possible to avoid cache
+	 * contention.
+	 */
+	mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen);
+
 	if (unlikely(local_tlb_gen == mm_tlb_gen)) {
 		/*
 		 * There's nothing to do: we're already up to date.  This can
-- 
cgit v1.2.3


From f8c1d4ca55177326adad1fdc6bf602423a507542 Mon Sep 17 00:00:00 2001
From: Ira Weiny <ira.weiny@intel.com>
Date: Tue, 19 Apr 2022 10:06:06 -0700
Subject: Documentation/protection-keys: Clean up documentation for User Space
 pkeys

The documentation for user space pkeys was a bit dated including things
such as Amazon and distribution testing information which is irrelevant
now.

Update the documentation.  This also streamlines adding the Supervisor
pkey documentation later on.

Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Link: https://lkml.kernel.org/r/20220419170649.1022246-2-ira.weiny@intel.com
---
 Documentation/core-api/protection-keys.rst | 44 ++++++++++++++----------------
 1 file changed, 21 insertions(+), 23 deletions(-)

diff --git a/Documentation/core-api/protection-keys.rst b/Documentation/core-api/protection-keys.rst
index ec575e72d0b2..bf28ac0401f3 100644
--- a/Documentation/core-api/protection-keys.rst
+++ b/Documentation/core-api/protection-keys.rst
@@ -4,31 +4,29 @@
 Memory Protection Keys
 ======================
 
-Memory Protection Keys for Userspace (PKU aka PKEYs) is a feature
-which is found on Intel's Skylake (and later) "Scalable Processor"
-Server CPUs. It will be available in future non-server Intel parts
-and future AMD processors.
-
-For anyone wishing to test or use this feature, it is available in
-Amazon's EC2 C5 instances and is known to work there using an Ubuntu
-17.04 image.
-
-Memory Protection Keys provides a mechanism for enforcing page-based
-protections, but without requiring modification of the page tables
-when an application changes protection domains.  It works by
-dedicating 4 previously ignored bits in each page table entry to a
-"protection key", giving 16 possible keys.
-
-There is also a new user-accessible register (PKRU) with two separate
-bits (Access Disable and Write Disable) for each key.  Being a CPU
-register, PKRU is inherently thread-local, potentially giving each
+Memory Protection Keys provide a mechanism for enforcing page-based
+protections, but without requiring modification of the page tables when an
+application changes protection domains.
+
+Pkeys Userspace (PKU) is a feature which can be found on:
+        * Intel server CPUs, Skylake and later
+        * Intel client CPUs, Tiger Lake (11th Gen Core) and later
+        * Future AMD CPUs
+
+Pkeys work by dedicating 4 previously Reserved bits in each page table entry to
+a "protection key", giving 16 possible keys.
+
+Protections for each key are defined with a per-CPU user-accessible register
+(PKRU).  Each of these is a 32-bit register storing two bits (Access Disable
+and Write Disable) for each of 16 keys.
+
+Being a CPU register, PKRU is inherently thread-local, potentially giving each
 thread a different set of protections from every other thread.
 
-There are two new instructions (RDPKRU/WRPKRU) for reading and writing
-to the new register.  The feature is only available in 64-bit mode,
-even though there is theoretically space in the PAE PTEs.  These
-permissions are enforced on data access only and have no effect on
-instruction fetches.
+There are two instructions (RDPKRU/WRPKRU) for reading and writing to the
+register.  The feature is only available in 64-bit mode, even though there is
+theoretically space in the PAE PTEs.  These permissions are enforced on data
+access only and have no effect on instruction fetches.
 
 Syscalls
 ========
-- 
cgit v1.2.3


From 54ee1844047c1df015ab2679a4f55564a3aa1fa1 Mon Sep 17 00:00:00 2001
From: Ira Weiny <ira.weiny@intel.com>
Date: Tue, 19 Apr 2022 10:06:07 -0700
Subject: x86/pkeys: Clarify PKRU_AD_KEY macro

When changing the PKRU_AD_KEY macro to be used for PKS the name came
into question.[1]

The intent of PKRU_AD_KEY is to set an initial value for the PKRU
register but that is just a mask value.

Clarify this by changing the name to PKRU_AD_MASK().

NOTE the checkpatch errors are ignored for the init_pkru_value to align
the values in the code.

[1] https://lore.kernel.org/lkml/eff862e2-bfaa-9e12-42b5-a12467d72a22@intel.com/

Suggested-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Link: https://lkml.kernel.org/r/20220419170649.1022246-3-ira.weiny@intel.com
---
 arch/x86/mm/pkeys.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c
index e44e938885b7..7418c367e328 100644
--- a/arch/x86/mm/pkeys.c
+++ b/arch/x86/mm/pkeys.c
@@ -110,7 +110,7 @@ int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey
 	return vma_pkey(vma);
 }
 
-#define PKRU_AD_KEY(pkey)	(PKRU_AD_BIT << ((pkey) * PKRU_BITS_PER_PKEY))
+#define PKRU_AD_MASK(pkey)	(PKRU_AD_BIT << ((pkey) * PKRU_BITS_PER_PKEY))
 
 /*
  * Make the default PKRU value (at execve() time) as restrictive
@@ -118,11 +118,14 @@ int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey
  * in the process's lifetime will not accidentally get access
  * to data which is pkey-protected later on.
  */
-u32 init_pkru_value = PKRU_AD_KEY( 1) | PKRU_AD_KEY( 2) | PKRU_AD_KEY( 3) |
-		      PKRU_AD_KEY( 4) | PKRU_AD_KEY( 5) | PKRU_AD_KEY( 6) |
-		      PKRU_AD_KEY( 7) | PKRU_AD_KEY( 8) | PKRU_AD_KEY( 9) |
-		      PKRU_AD_KEY(10) | PKRU_AD_KEY(11) | PKRU_AD_KEY(12) |
-		      PKRU_AD_KEY(13) | PKRU_AD_KEY(14) | PKRU_AD_KEY(15);
+u32 init_pkru_value = PKRU_AD_MASK( 1) | PKRU_AD_MASK( 2) |
+		      PKRU_AD_MASK( 3) | PKRU_AD_MASK( 4) |
+		      PKRU_AD_MASK( 5) | PKRU_AD_MASK( 6) |
+		      PKRU_AD_MASK( 7) | PKRU_AD_MASK( 8) |
+		      PKRU_AD_MASK( 9) | PKRU_AD_MASK(10) |
+		      PKRU_AD_MASK(11) | PKRU_AD_MASK(12) |
+		      PKRU_AD_MASK(13) | PKRU_AD_MASK(14) |
+		      PKRU_AD_MASK(15);
 
 static ssize_t init_pkru_read_file(struct file *file, char __user *user_buf,
 			     size_t count, loff_t *ppos)
-- 
cgit v1.2.3


From 8f1d56f64f8d6b80dea2d1978d10071132a695c5 Mon Sep 17 00:00:00 2001
From: Nadav Amit <namit@vmware.com>
Date: Sun, 10 Jul 2022 16:28:37 -0700
Subject: x86/mm/tlb: Ignore f->new_tlb_gen when zero

Commit aa44284960d5 ("x86/mm/tlb: Avoid reading mm_tlb_gen when
possible") introduced an optimization to skip superfluous TLB
flushes based on the generation provided in flush_tlb_info.

However, arch_tlbbatch_flush() does not provide any generation in
flush_tlb_info and populates the flush_tlb_info generation with
0.  This 0 is causes the flush_tlb_info to be interpreted as a
superfluous, old flush.  As a result, try_to_unmap_one() would
not perform any TLB flushes.

Fix it by checking whether f->new_tlb_gen is nonzero. Zero value
is anyhow is an invalid generation value. To avoid future
confusion, introduce TLB_GENERATION_INVALID constant and use it
properly. Add warnings to ensure no partial flushes are done with
TLB_GENERATION_INVALID or when f->mm is NULL, since this does not
make any sense.

In addition, add the missing unlikely().

[ dhansen: change VM_BUG_ON() -> VM_WARN_ON(), clarify changelog ]

Fixes: aa44284960d5 ("x86/mm/tlb: Avoid reading mm_tlb_gen when possible")
Reported-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Nadav Amit <namit@vmware.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Tested-by: Hugh Dickins <hughd@google.com>
Link: https://lkml.kernel.org/r/20220710232837.3618-1-namit@vmware.com
---
 arch/x86/include/asm/tlbflush.h |  1 +
 arch/x86/mm/tlb.c               | 15 ++++++++++++---
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 98fa0a114074..21bf105d54f2 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -16,6 +16,7 @@
 void __flush_tlb_all(void);
 
 #define TLB_FLUSH_ALL	-1UL
+#define TLB_GENERATION_INVALID	0
 
 void cr4_update_irqsoff(unsigned long set, unsigned long clear);
 unsigned long cr4_read_shadow(void);
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index d9314cc8b81f..c1e31e9a85d7 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -771,7 +771,8 @@ static void flush_tlb_func(void *info)
 		return;
 	}
 
-	if (f->new_tlb_gen <= local_tlb_gen) {
+	if (unlikely(f->new_tlb_gen != TLB_GENERATION_INVALID &&
+		     f->new_tlb_gen <= local_tlb_gen)) {
 		/*
 		 * The TLB is already up to date in respect to f->new_tlb_gen.
 		 * While the core might be still behind mm_tlb_gen, checking
@@ -843,6 +844,12 @@ static void flush_tlb_func(void *info)
 		/* Partial flush */
 		unsigned long addr = f->start;
 
+		/* Partial flush cannot have invalid generations */
+		VM_WARN_ON(f->new_tlb_gen == TLB_GENERATION_INVALID);
+
+		/* Partial flush must have valid mm */
+		VM_WARN_ON(f->mm == NULL);
+
 		nr_invalidate = (f->end - f->start) >> f->stride_shift;
 
 		while (addr < f->end) {
@@ -1045,7 +1052,8 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
 		struct flush_tlb_info *info;
 
 		preempt_disable();
-		info = get_flush_tlb_info(NULL, start, end, 0, false, 0);
+		info = get_flush_tlb_info(NULL, start, end, 0, false,
+					  TLB_GENERATION_INVALID);
 
 		on_each_cpu(do_kernel_range_flush, info, 1);
 
@@ -1214,7 +1222,8 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
 
 	int cpu = get_cpu();
 
-	info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, 0, false, 0);
+	info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, 0, false,
+				  TLB_GENERATION_INVALID);
 	/*
 	 * flush_tlb_multi() is not optimized for the common case in which only
 	 * a local TLB flush is needed. Optimize this use-case by calling
-- 
cgit v1.2.3