From e822969cab48b786b64246aad1a3ba2a774f5d23 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 3 Feb 2020 17:33:48 -0800 Subject: mm/page_alloc.c: fix uninitialized memmaps on a partially populated last section Patch series "mm: fix max_pfn not falling on section boundary", v2. Playing with different memory sizes for a x86-64 guest, I discovered that some memmaps (highest section if max_mem does not fall on the section boundary) are marked as being valid and online, but contain garbage. We have to properly initialize these memmaps. Looking at /proc/kpageflags and friends, I found some more issues, partially related to this. This patch (of 3): If max_pfn is not aligned to a section boundary, we can easily run into BUGs. This can e.g., be triggered on x86-64 under QEMU by specifying a memory size that is not a multiple of 128MB (e.g., 4097MB, but also 4160MB). I was told that on real HW, we can easily have this scenario (esp., one of the main reasons sub-section hotadd of devmem was added). The issue is, that we have a valid memmap (pfn_valid()) for the whole section, and the whole section will be marked "online". pfn_to_online_page() will succeed, but the memmap contains garbage. E.g., doing a "./page-types -r -a 0x144001" when QEMU was started with "-m 4160M" - (see tools/vm/page-types.c): [ 200.476376] BUG: unable to handle page fault for address: fffffffffffffffe [ 200.477500] #PF: supervisor read access in kernel mode [ 200.478334] #PF: error_code(0x0000) - not-present page [ 200.479076] PGD 59614067 P4D 59614067 PUD 59616067 PMD 0 [ 200.479557] Oops: 0000 [#4] SMP NOPTI [ 200.479875] CPU: 0 PID: 603 Comm: page-types Tainted: G D W 5.5.0-rc1-next-20191209 #93 [ 200.480646] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu4 [ 200.481648] RIP: 0010:stable_page_flags+0x4d/0x410 [ 200.482061] Code: f3 ff 41 89 c0 48 b8 00 00 00 00 01 00 00 00 45 84 c0 0f 85 cd 02 00 00 48 8b 53 08 48 8b 2b 48f [ 200.483644] RSP: 0018:ffffb139401cbe60 EFLAGS: 00010202 [ 200.484091] RAX: fffffffffffffffe RBX: fffffbeec5100040 RCX: 0000000000000000 [ 200.484697] RDX: 0000000000000001 RSI: ffffffff9535c7cd RDI: 0000000000000246 [ 200.485313] RBP: ffffffffffffffff R08: 0000000000000000 R09: 0000000000000000 [ 200.485917] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000144001 [ 200.486523] R13: 00007ffd6ba55f48 R14: 00007ffd6ba55f40 R15: ffffb139401cbf08 [ 200.487130] FS: 00007f68df717580(0000) GS:ffff9ec77fa00000(0000) knlGS:0000000000000000 [ 200.487804] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 200.488295] CR2: fffffffffffffffe CR3: 0000000135d48000 CR4: 00000000000006f0 [ 200.488897] Call Trace: [ 200.489115] kpageflags_read+0xe9/0x140 [ 200.489447] proc_reg_read+0x3c/0x60 [ 200.489755] vfs_read+0xc2/0x170 [ 200.490037] ksys_pread64+0x65/0xa0 [ 200.490352] do_syscall_64+0x5c/0xa0 [ 200.490665] entry_SYSCALL_64_after_hwframe+0x49/0xbe But it can be triggered much easier via "cat /proc/kpageflags > /dev/null" after cold/hot plugging a DIMM to such a system: [root@localhost ~]# cat /proc/kpageflags > /dev/null [ 111.517275] BUG: unable to handle page fault for address: fffffffffffffffe [ 111.517907] #PF: supervisor read access in kernel mode [ 111.518333] #PF: error_code(0x0000) - not-present page [ 111.518771] PGD a240e067 P4D a240e067 PUD a2410067 PMD 0 This patch fixes that by at least zero-ing out that memmap (so e.g., page_to_pfn() will not crash). 
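For orientation, the arithmetic behind the 4160M example can be written down as a tiny stand-alone program. This is only an illustration, not kernel code; it assumes the x86-64 defaults of 4 KiB pages and 32768-page (128 MiB) sections and reuses the kernel's round_up() idiom:

#include <stdio.h>

#define PAGES_PER_SECTION 0x8000UL              /* 128 MiB / 4 KiB on x86-64 */
#define round_up(x, y)    ((((x) - 1) | ((y) - 1)) + 1)

int main(void)
{
	unsigned long max_pfn = 0x144000;       /* QEMU -m 4160M: RAM ends at 0x143ffffff */
	unsigned long section_end = round_up(max_pfn, PAGES_PER_SECTION);

	/* pfns [max_pfn, section_end) have a memmap but no backing RAM;
	 * these are the struct pages the patch zeroes out. */
	printf("last section ends at pfn %#lx, %lu struct pages to initialize\n",
	       section_end, section_end - max_pfn);
	return 0;
}

It reports 16384 pages (64 MB) for this configuration, which are exactly the pages that show up in the page-types output of the follow-up patch.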
Commit 907ec5fca3dc ("mm: zero remaining unavailable struct pages") tried to fix a similar issue, but forgot to consider this special case. After this patch, there are still problems to solve. E.g., not all of these pages falling into a memory hole will actually get initialized later and set PageReserved - they are only zeroed out - but at least the immediate crashes are gone. A follow-up patch will take care of this. Link: http://lkml.kernel.org/r/20191211163201.17179-2-david@redhat.com Fixes: f7f99100d8d9 ("mm: stop zeroing memory during allocation in vmemmap") Signed-off-by: David Hildenbrand Tested-by: Daniel Jordan Cc: Naoya Horiguchi Cc: Pavel Tatashin Cc: Andrew Morton Cc: Steven Sistare Cc: Michal Hocko Cc: Daniel Jordan Cc: Bob Picco Cc: Oscar Salvador Cc: Alexey Dobriyan Cc: Dan Williams Cc: Michal Hocko Cc: Stephen Rothwell Cc: [4.15+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 15e908ad933b..10eeaaadf53a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6947,7 +6947,8 @@ static u64 zero_pfn_range(unsigned long spfn, unsigned long epfn) * This function also addresses a similar issue where struct pages are left * uninitialized because the physical address range is not covered by * memblock.memory or memblock.reserved. That could happen when memblock - * layout is manually configured via memmap=. + * layout is manually configured via memmap=, or when the highest physical + * address (max_pfn) does not end on a section boundary. */ void __init zero_resv_unavail(void) { @@ -6965,7 +6966,16 @@ void __init zero_resv_unavail(void) pgcnt += zero_pfn_range(PFN_DOWN(next), PFN_UP(start)); next = end; } - pgcnt += zero_pfn_range(PFN_DOWN(next), max_pfn); + + /* + * Early sections always have a fully populated memmap for the whole + * section - see pfn_valid(). If the last section has holes at the + * end and that section is marked "online", the memmap will be + * considered initialized. Make sure that memmap has a well defined + * state. + */ + pgcnt += zero_pfn_range(PFN_DOWN(next), + round_up(max_pfn, PAGES_PER_SECTION)); /* * Struct pages that do not have backing memory. This could be because -- cgit v1.2.3 From 4b094b7851bf4bf551ad456195d3f26e1c03bd74 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 3 Feb 2020 17:33:55 -0800 Subject: mm/page_alloc.c: initialize memmap of unavailable memory directly Let's make sure that all memory holes are actually marked PageReserved(), that page_to_pfn() produces reliable results, and that these pages are not detected as "mmap" pages due to the mapcount. E.g., booting a x86-64 QEMU guest with 4160 MB: [ 0.010585] Early memory node ranges [ 0.010586] node 0: [mem 0x0000000000001000-0x000000000009efff] [ 0.010588] node 0: [mem 0x0000000000100000-0x00000000bffdefff] [ 0.010589] node 0: [mem 0x0000000100000000-0x0000000143ffffff] max_pfn is 0x144000. 
Before this change: [root@localhost ~]# ./page-types -r -a 0x144000, flags page-count MB symbolic-flags long-symbolic-flags 0x0000000000000800 16384 64 ___________M_______________________________ mmap total 16384 64 After this change: [root@localhost ~]# ./page-types -r -a 0x144000, flags page-count MB symbolic-flags long-symbolic-flags 0x0000000100000000 16384 64 ___________________________r_______________ reserved total 16384 64 IOW, especially the unavailable physical memory ("memory hole") in the last section would not get properly marked PageReserved() and is indicated to be "mmap" memory. Drop the trace of that function from include/linux/mm.h - nobody else needs it, and rename it accordingly. Note: The fake zone/node might not be covered by the zone/node span. This is not an urgent issue (for now, we had the same node/zone due to the zeroing). We'll need a clean way to mark memory holes (e.g., using a page type PageHole() if possible or a fake ZONE_INVALID) and eventually stop marking these memory holes PageReserved(). Link: http://lkml.kernel.org/r/20191211163201.17179-4-david@redhat.com Signed-off-by: David Hildenbrand Cc: Oscar Salvador Cc: Michal Hocko Cc: Dan Williams Cc: Alexey Dobriyan Cc: Bob Picco Cc: Daniel Jordan Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Pavel Tatashin Cc: Stephen Rothwell Cc: Steven Sistare Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 10eeaaadf53a..7d5b9dbf4087 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6916,10 +6916,10 @@ void __init free_area_init_node(int nid, unsigned long *zones_size, #if !defined(CONFIG_FLAT_NODE_MEM_MAP) /* - * Zero all valid struct pages in range [spfn, epfn), return number of struct - * pages zeroed + * Initialize all valid struct pages in the range [spfn, epfn) and mark them + * PageReserved(). Return the number of struct pages that were initialized. */ -static u64 zero_pfn_range(unsigned long spfn, unsigned long epfn) +static u64 __init init_unavailable_range(unsigned long spfn, unsigned long epfn) { unsigned long pfn; u64 pgcnt = 0; @@ -6930,7 +6930,13 @@ static u64 zero_pfn_range(unsigned long spfn, unsigned long epfn) + pageblock_nr_pages - 1; continue; } - mm_zero_struct_page(pfn_to_page(pfn)); + /* + * Use a fake node/zone (0) for now. Some of these pages + * (in memblock.reserved but not in memblock.memory) will + * get re-initialized via reserve_bootmem_region() later. + */ + __init_single_page(pfn_to_page(pfn), pfn, 0, 0); + __SetPageReserved(pfn_to_page(pfn)); pgcnt++; } @@ -6942,7 +6948,7 @@ static u64 zero_pfn_range(unsigned long spfn, unsigned long epfn) * initialized by going through __init_single_page(). But, there are some * struct pages which are reserved in memblock allocator and their fields * may be accessed (for example page_to_pfn() on some configuration accesses - * flags). We must explicitly zero those struct pages. + * flags). We must explicitly initialize those struct pages. * * This function also addresses a similar issue where struct pages are left * uninitialized because the physical address range is not covered by @@ -6950,7 +6956,7 @@ static u64 zero_pfn_range(unsigned long spfn, unsigned long epfn) * layout is manually configured via memmap=, or when the highest physical * address (max_pfn) does not end on a section boundary. 
*/ -void __init zero_resv_unavail(void) +static void __init init_unavailable_mem(void) { phys_addr_t start, end; u64 i, pgcnt; @@ -6963,7 +6969,8 @@ void __init zero_resv_unavail(void) for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, NULL) { if (next < start) - pgcnt += zero_pfn_range(PFN_DOWN(next), PFN_UP(start)); + pgcnt += init_unavailable_range(PFN_DOWN(next), + PFN_UP(start)); next = end; } @@ -6974,8 +6981,8 @@ void __init zero_resv_unavail(void) * considered initialized. Make sure that memmap has a well defined * state. */ - pgcnt += zero_pfn_range(PFN_DOWN(next), - round_up(max_pfn, PAGES_PER_SECTION)); + pgcnt += init_unavailable_range(PFN_DOWN(next), + round_up(max_pfn, PAGES_PER_SECTION)); /* * Struct pages that do not have backing memory. This could be because @@ -6984,6 +6991,10 @@ void __init zero_resv_unavail(void) if (pgcnt) pr_info("Zeroed struct page in unavailable ranges: %lld pages", pgcnt); } +#else +static inline void __init init_unavailable_mem(void) +{ +} #endif /* !CONFIG_FLAT_NODE_MEM_MAP */ #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP @@ -7413,7 +7424,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) /* Initialise every node */ mminit_verify_pageflags_layout(); setup_nr_node_ids(); - zero_resv_unavail(); + init_unavailable_mem(); for_each_online_node(nid) { pg_data_t *pgdat = NODE_DATA(nid); free_area_init_node(nid, NULL, @@ -7608,7 +7619,7 @@ void __init set_dma_reserve(unsigned long new_dma_reserve) void __init free_area_init(unsigned long *zones_size) { - zero_resv_unavail(); + init_unavailable_mem(); free_area_init_node(0, zones_size, __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); } -- cgit v1.2.3 From 948c436e463d34e0d716aff0f333bb74471e5399 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 3 Feb 2020 17:33:59 -0800 Subject: mm/page_alloc: fix and rework pfn handling in memmap_init_zone() Let's update the pfn manually whenever we continue the loop. This makes the code easier to read but also less error prone (and we can directly fix one issue). When overlap_memmap_init() returns true, pfn is updated to "memblock_region_memory_end_pfn(r)". So it already points at the *next* pfn to process. Incrementing the pfn another time is wrong, we might leave one uninitialized. I spotted this by inspecting the code, so I have no idea if this is relevant in practise (with kernelcore=mirror). Link: http://lkml.kernel.org/r/20200113144035.10848-2-david@redhat.com Fixes: a9a9e77fbf27 ("mm: move mirrored memory specific code outside of memmap_init_zone") Signed-off-by: David Hildenbrand Acked-by: Kirill A. Shutemov Reviewed-by: Alexander Duyck Cc: Pavel Tatashin Cc: Michal Hocko Cc: Oscar Salvador Cc: Kirill A. Shutemov Cc: Baoquan He Cc: Dan Williams Cc: Vlastimil Babka Cc: Mel Gorman Cc: "Jin, Zhi" Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7d5b9dbf4087..461ed73bc30f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5905,18 +5905,20 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, } #endif - for (pfn = start_pfn; pfn < end_pfn; pfn++) { + for (pfn = start_pfn; pfn < end_pfn; ) { /* * There can be holes in boot-time mem_map[]s handed to this * function. They do not exist on hotplugged memory. 
*/ if (context == MEMMAP_EARLY) { if (!early_pfn_valid(pfn)) { - pfn = next_pfn(pfn) - 1; + pfn = next_pfn(pfn); continue; } - if (!early_pfn_in_nid(pfn, nid)) + if (!early_pfn_in_nid(pfn, nid)) { + pfn++; continue; + } if (overlap_memmap_init(zone, &pfn)) continue; if (defer_init(nid, pfn, end_pfn)) @@ -5944,6 +5946,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, set_pageblock_migratetype(page, MIGRATE_MOVABLE); cond_resched(); } + pfn++; } } -- cgit v1.2.3 From 4c6058814ec4460c25111e29452ef596acdcd61b Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 3 Feb 2020 17:34:02 -0800 Subject: mm: factor out next_present_section_nr() Let's move it to the header and use the shorter variant from mm/page_alloc.c (the original one will also check "__highest_present_section_nr + 1", which is not necessary). While at it, make the section_nr in next_pfn() const. In next_pfn(), we now return section_nr_to_pfn(-1) instead of -1 once we exceed __highest_present_section_nr, which doesn't make a difference in the caller as it is big enough (>= all sane end_pfn). Link: http://lkml.kernel.org/r/20200113144035.10848-3-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Kirill A. Shutemov Cc: Baoquan He Cc: Dan Williams Cc: "Jin, Zhi" Cc: "Kirill A. Shutemov" Cc: Mel Gorman Cc: Michal Hocko Cc: Michal Hocko Cc: Oscar Salvador Cc: Pavel Tatashin Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 11 ++--------- mm/sparse.c | 10 ---------- 2 files changed, 2 insertions(+), 19 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 461ed73bc30f..494f74a1725d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5852,18 +5852,11 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn) /* Skip PFNs that belong to non-present sections */ static inline __meminit unsigned long next_pfn(unsigned long pfn) { - unsigned long section_nr; + const unsigned long section_nr = pfn_to_section_nr(++pfn); - section_nr = pfn_to_section_nr(++pfn); if (present_section_nr(section_nr)) return pfn; - - while (++section_nr <= __highest_present_section_nr) { - if (present_section_nr(section_nr)) - return section_nr_to_pfn(section_nr); - } - - return -1; + return section_nr_to_pfn(next_present_section_nr(section_nr)); } #else static inline __meminit unsigned long next_pfn(unsigned long pfn) diff --git a/mm/sparse.c b/mm/sparse.c index 3918fc3eaef1..c184b69460b7 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -198,16 +198,6 @@ static void section_mark_present(struct mem_section *ms) ms->section_mem_map |= SECTION_MARKED_PRESENT; } -static inline unsigned long next_present_section_nr(unsigned long section_nr) -{ - do { - section_nr++; - if (present_section_nr(section_nr)) - return section_nr; - } while ((section_nr <= __highest_present_section_nr)); - - return -1; -} #define for_each_present_section_nr(start, section_nr) \ for (section_nr = next_present_section_nr(start-1); \ ((section_nr != -1) && \ -- cgit v1.2.3 From 1f8d75c1b7dc62f017c542ca99e7da4a0839fb1e Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 3 Feb 2020 17:34:06 -0800 Subject: mm/memmap_init: update variable name in memmap_init_zone Patch series "mm/memory_hotplug: Shrink zones before removing memory", v6. This series fixes the access of uninitialized memmaps when shrinking zones/nodes and when removing memory. 
Also, it contains all fixes for crashes that can be triggered when removing certain namespace using memunmap_pages() - ZONE_DEVICE, reported by Aneesh. We stop trying to shrink ZONE_DEVICE, as it's buggy, fixing it would be more involved (we don't have SECTION_IS_ONLINE as an indicator), and shrinking is only of limited use (set_zone_contiguous() cannot detect the ZONE_DEVICE as contiguous). We continue shrinking !ZONE_DEVICE zones, however, I reduced the amount of code to a minimum. Shrinking is especially necessary to keep zone->contiguous set where possible, especially, on memory unplug of DIMMs at zone boundaries. -------------------------------------------------------------------------- Zones are now properly shrunk when offlining memory blocks or when onlining failed. This allows to properly shrink zones on memory unplug even if the separate memory blocks of a DIMM were onlined to different zones or re-onlined to a different zone after offlining. Example: :/# cat /proc/zoneinfo Node 1, zone Movable spanned 0 present 0 managed 0 :/# echo "online_movable" > /sys/devices/system/memory/memory41/state :/# echo "online_movable" > /sys/devices/system/memory/memory43/state :/# cat /proc/zoneinfo Node 1, zone Movable spanned 98304 present 65536 managed 65536 :/# echo 0 > /sys/devices/system/memory/memory43/online :/# cat /proc/zoneinfo Node 1, zone Movable spanned 32768 present 32768 managed 32768 :/# echo 0 > /sys/devices/system/memory/memory41/online :/# cat /proc/zoneinfo Node 1, zone Movable spanned 0 present 0 managed 0 This patch (of 6): The third argument is actually number of pages. Change the variable name from size to nr_pages to indicate this better. No functional change in this patch. Link: http://lkml.kernel.org/r/20191006085646.5768-3-david@redhat.com Signed-off-by: Aneesh Kumar K.V Signed-off-by: David Hildenbrand Reviewed-by: Pankaj Gupta Reviewed-by: David Hildenbrand Cc: Michal Hocko Cc: "Matthew Wilcox (Oracle)" Cc: "Aneesh Kumar K.V" Cc: Pavel Tatashin Cc: Greg Kroah-Hartman Cc: Dan Williams Cc: Logan Gunthorpe Cc: Oscar Salvador Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 494f74a1725d..3c4eb750a199 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5946,10 +5946,10 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, #ifdef CONFIG_ZONE_DEVICE void __ref memmap_init_zone_device(struct zone *zone, unsigned long start_pfn, - unsigned long size, + unsigned long nr_pages, struct dev_pagemap *pgmap) { - unsigned long pfn, end_pfn = start_pfn + size; + unsigned long pfn, end_pfn = start_pfn + nr_pages; struct pglist_data *pgdat = zone->zone_pgdat; struct vmem_altmap *altmap = pgmap_altmap(pgmap); unsigned long zone_idx = zone_idx(zone); @@ -5966,7 +5966,7 @@ void __ref memmap_init_zone_device(struct zone *zone, */ if (altmap) { start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap); - size = end_pfn - start_pfn; + nr_pages = end_pfn - start_pfn; } for (pfn = start_pfn; pfn < end_pfn; pfn++) { @@ -6013,7 +6013,7 @@ void __ref memmap_init_zone_device(struct zone *zone, } pr_info("%s initialised %lu pages in %ums\n", __func__, - size, jiffies_to_msecs(jiffies - start)); + nr_pages, jiffies_to_msecs(jiffies - start)); } #endif -- cgit v1.2.3 From d33695b16a9f0b5f62aefb0a4e073509690ee533 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 3 Feb 2020 17:34:09 -0800 Subject: 
mm/memory_hotplug: poison memmap in remove_pfn_range_from_zone() Let's poison the pages similar to when adding new memory in sparse_add_section(). Also call remove_pfn_range_from_zone() from memunmap_pages(), so we can poison the memmap from there as well. Link: http://lkml.kernel.org/r/20191006085646.5768-7-david@redhat.com Signed-off-by: David Hildenbrand Cc: Michal Hocko Cc: "Matthew Wilcox (Oracle)" Cc: "Aneesh Kumar K.V" Cc: Pavel Tatashin Cc: Greg Kroah-Hartman Cc: Dan Williams Cc: Logan Gunthorpe Cc: Oscar Salvador Cc: Pankaj Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 3 +++ mm/memremap.c | 2 ++ 2 files changed, 5 insertions(+) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 36d80915ddc2..b2dd94fb3aa2 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -490,6 +490,9 @@ void __ref remove_pfn_range_from_zone(struct zone *zone, struct pglist_data *pgdat = zone->zone_pgdat; unsigned long flags; + /* Poison struct pages because they are now uninitialized again. */ + page_init_poison(pfn_to_page(start_pfn), sizeof(struct page) * nr_pages); + #ifdef CONFIG_ZONE_DEVICE /* * Zone shrinking code cannot properly deal with ZONE_DEVICE. So diff --git a/mm/memremap.c b/mm/memremap.c index 4c723d2049d5..09b5b7adc773 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -120,6 +120,8 @@ void memunmap_pages(struct dev_pagemap *pgmap) nid = page_to_nid(first_page); mem_hotplug_begin(); + remove_pfn_range_from_zone(page_zone(first_page), PHYS_PFN(res->start), + PHYS_PFN(resource_size(res))); if (pgmap->type == MEMORY_DEVICE_PRIVATE) { __remove_pages(PHYS_PFN(res->start), PHYS_PFN(resource_size(res)), NULL); -- cgit v1.2.3 From 9b05158f5d805e0cf373f6e5a43efb9306bcb6a2 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 3 Feb 2020 17:34:12 -0800 Subject: mm/memory_hotplug: we always have a zone in find_(smallest|biggest)_section_pfn With shrink_pgdat_span() out of the way, we now always have a valid zone. Link: http://lkml.kernel.org/r/20191006085646.5768-8-david@redhat.com Signed-off-by: David Hildenbrand Cc: Michal Hocko Cc: "Matthew Wilcox (Oracle)" Cc: "Aneesh Kumar K.V" Cc: Pavel Tatashin Cc: Greg Kroah-Hartman Cc: Dan Williams Cc: Logan Gunthorpe Cc: Oscar Salvador Cc: Pankaj Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b2dd94fb3aa2..77cb164a2d96 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -355,7 +355,7 @@ static unsigned long find_smallest_section_pfn(int nid, struct zone *zone, if (unlikely(pfn_to_nid(start_pfn) != nid)) continue; - if (zone && zone != page_zone(pfn_to_page(start_pfn))) + if (zone != page_zone(pfn_to_page(start_pfn))) continue; return start_pfn; @@ -380,7 +380,7 @@ static unsigned long find_biggest_section_pfn(int nid, struct zone *zone, if (unlikely(pfn_to_nid(pfn) != nid)) continue; - if (zone && zone != page_zone(pfn_to_page(pfn))) + if (zone != page_zone(pfn_to_page(pfn))) continue; return pfn; -- cgit v1.2.3 From 950b68d9178b6209e92461ec371eee81f0f20190 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 3 Feb 2020 17:34:16 -0800 Subject: mm/memory_hotplug: don't check for "all holes" in shrink_zone_span() If we have holes, the holes will automatically get detected and removed once we remove the next bigger/smaller section. The extra checks can go. 
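To see why the extra scan is unnecessary, here is a small user-space model of the simplified behaviour. It is a sketch only: the zone span is tracked in whole sections, whereas the real code works on PFNs and uses find_smallest_section_pfn()/find_biggest_section_pfn():

#include <stdio.h>
#include <stdbool.h>

#define NR_SECTIONS 8
static bool present[NR_SECTIONS];

struct zone_model { int start, end; };		/* [start, end) in sections */

static void shrink(struct zone_model *z, int rm_start, int rm_end)
{
	int s;

	if (z->start == rm_start) {		/* removed range at the low boundary */
		for (s = rm_end; s < z->end && !present[s]; s++)
			;
		z->start = s;
	} else if (z->end == rm_end) {		/* removed range at the high boundary */
		for (s = rm_start; s > z->start && !present[s - 1]; s--)
			;
		z->end = s;
	}
	/* A hole strictly inside the span changes nothing here; it only
	 * matters once a later removal turns it into the new boundary. */
	if (z->start >= z->end)
		z->start = z->end = 0;		/* the zone is now empty */
}

int main(void)
{
	struct zone_model z = { 2, 6 };		/* sections 2..5 have memory */
	int s;

	for (s = z.start; s < z.end; s++)
		present[s] = true;

	present[3] = false;			/* offline section 3: a hole in the middle */
	shrink(&z, 3, 4);			/* span is still [2, 6) */

	present[2] = false;			/* offline section 2: the low boundary */
	shrink(&z, 2, 3);			/* span skips the hole, becomes [4, 6) */

	printf("span now [%d, %d)\n", z.start, z.end);
	return 0;
}

Offlining the middle section leaves the span untouched; the hole is absorbed as soon as the neighbouring boundary section goes away, leaving the span [4, 6).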
Link: http://lkml.kernel.org/r/20191006085646.5768-9-david@redhat.com Signed-off-by: David Hildenbrand Cc: Michal Hocko Cc: "Matthew Wilcox (Oracle)" Cc: "Aneesh Kumar K.V" Cc: Pavel Tatashin Cc: Greg Kroah-Hartman Cc: Dan Williams Cc: Logan Gunthorpe Cc: Oscar Salvador Cc: Pankaj Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 34 +++++++--------------------------- 1 file changed, 7 insertions(+), 27 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 77cb164a2d96..61bd62d15fff 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -411,6 +411,9 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn, if (pfn) { zone->zone_start_pfn = pfn; zone->spanned_pages = zone_end_pfn - pfn; + } else { + zone->zone_start_pfn = 0; + zone->spanned_pages = 0; } } else if (zone_end_pfn == end_pfn) { /* @@ -423,34 +426,11 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn, start_pfn); if (pfn) zone->spanned_pages = pfn - zone_start_pfn + 1; + else { + zone->zone_start_pfn = 0; + zone->spanned_pages = 0; + } } - - /* - * The section is not biggest or smallest mem_section in the zone, it - * only creates a hole in the zone. So in this case, we need not - * change the zone. But perhaps, the zone has only hole data. Thus - * it check the zone has only hole or not. - */ - pfn = zone_start_pfn; - for (; pfn < zone_end_pfn; pfn += PAGES_PER_SUBSECTION) { - if (unlikely(!pfn_to_online_page(pfn))) - continue; - - if (page_zone(pfn_to_page(pfn)) != zone) - continue; - - /* Skip range to be removed */ - if (pfn >= start_pfn && pfn < end_pfn) - continue; - - /* If we find valid section, we have nothing to do */ - zone_span_writeunlock(zone); - return; - } - - /* The zone has no valid section */ - zone->zone_start_pfn = 0; - zone->spanned_pages = 0; zone_span_writeunlock(zone); } -- cgit v1.2.3 From 5d12071c5de8621b911ac77dd1a3929f3aee7335 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 3 Feb 2020 17:34:19 -0800 Subject: mm/memory_hotplug: drop local variables in shrink_zone_span() Get rid of the unnecessary local variables. Link: http://lkml.kernel.org/r/20191006085646.5768-10-david@redhat.com Signed-off-by: David Hildenbrand Cc: "Aneesh Kumar K.V" Cc: Dan Williams Cc: Greg Kroah-Hartman Cc: Logan Gunthorpe Cc: "Matthew Wilcox (Oracle)" Cc: Michal Hocko Cc: Oscar Salvador Cc: Pankaj Gupta Cc: Pavel Tatashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 61bd62d15fff..a2b6ca24c50f 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -392,14 +392,11 @@ static unsigned long find_biggest_section_pfn(int nid, struct zone *zone, static void shrink_zone_span(struct zone *zone, unsigned long start_pfn, unsigned long end_pfn) { - unsigned long zone_start_pfn = zone->zone_start_pfn; - unsigned long z = zone_end_pfn(zone); /* zone_end_pfn namespace clash */ - unsigned long zone_end_pfn = z; unsigned long pfn; int nid = zone_to_nid(zone); zone_span_writelock(zone); - if (zone_start_pfn == start_pfn) { + if (zone->zone_start_pfn == start_pfn) { /* * If the section is smallest section in the zone, it need * shrink zone->zone_start_pfn and zone->zone_spanned_pages. @@ -407,25 +404,25 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn, * for shrinking zone. 
*/ pfn = find_smallest_section_pfn(nid, zone, end_pfn, - zone_end_pfn); + zone_end_pfn(zone)); if (pfn) { + zone->spanned_pages = zone_end_pfn(zone) - pfn; zone->zone_start_pfn = pfn; - zone->spanned_pages = zone_end_pfn - pfn; } else { zone->zone_start_pfn = 0; zone->spanned_pages = 0; } - } else if (zone_end_pfn == end_pfn) { + } else if (zone_end_pfn(zone) == end_pfn) { /* * If the section is biggest section in the zone, it need * shrink zone->spanned_pages. * In this case, we find second biggest valid mem_section for * shrinking zone. */ - pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn, + pfn = find_biggest_section_pfn(nid, zone, zone->zone_start_pfn, start_pfn); if (pfn) - zone->spanned_pages = pfn - zone_start_pfn + 1; + zone->spanned_pages = pfn - zone->zone_start_pfn + 1; else { zone->zone_start_pfn = 0; zone->spanned_pages = 0; -- cgit v1.2.3 From 52fb87c81f11daa7027af25fc24ac7974eb8f45a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 3 Feb 2020 17:34:23 -0800 Subject: mm/memory_hotplug: cleanup __remove_pages() Let's drop the basically unused section stuff and simplify. Also, let's use a shorter variant to calculate the number of pages to the next section boundary. Link: http://lkml.kernel.org/r/20191006085646.5768-11-david@redhat.com Signed-off-by: David Hildenbrand Cc: Oscar Salvador Cc: Michal Hocko Cc: "Matthew Wilcox (Oracle)" Cc: "Aneesh Kumar K.V" Cc: Pavel Tatashin Cc: Greg Kroah-Hartman Cc: Dan Williams Cc: Logan Gunthorpe Cc: Pankaj Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index a2b6ca24c50f..4344e85213f2 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -516,25 +516,20 @@ static void __remove_section(unsigned long pfn, unsigned long nr_pages, void __remove_pages(unsigned long pfn, unsigned long nr_pages, struct vmem_altmap *altmap) { + const unsigned long end_pfn = pfn + nr_pages; + unsigned long cur_nr_pages; unsigned long map_offset = 0; - unsigned long nr, start_sec, end_sec; map_offset = vmem_altmap_offset(altmap); if (check_pfn_span(pfn, nr_pages, "remove")) return; - start_sec = pfn_to_section_nr(pfn); - end_sec = pfn_to_section_nr(pfn + nr_pages - 1); - for (nr = start_sec; nr <= end_sec; nr++) { - unsigned long pfns; - + for (; pfn < end_pfn; pfn += cur_nr_pages) { cond_resched(); - pfns = min(nr_pages, PAGES_PER_SECTION - - (pfn & ~PAGE_SECTION_MASK)); - __remove_section(pfn, pfns, map_offset, altmap); - pfn += pfns; - nr_pages -= pfns; + /* Select all remaining pages up to the next section boundary */ + cur_nr_pages = min(end_pfn - pfn, -(pfn | PAGE_SECTION_MASK)); + __remove_section(pfn, cur_nr_pages, map_offset, altmap); map_offset = 0; } } -- cgit v1.2.3 From 92917998849eea951707c8fea2dc3007bb2ad2cd Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 3 Feb 2020 17:34:26 -0800 Subject: mm/memory_hotplug: drop valid_start/valid_end from test_pages_in_a_zone() The callers are only interested in the actual zone, they don't care about boundaries. Return the zone instead to simplify. Link: http://lkml.kernel.org/r/20200110183308.11849-1-david@redhat.com Signed-off-by: David Hildenbrand Cc: Greg Kroah-Hartman Cc: "Rafael J. 
Wysocki" Cc: Michal Hocko Cc: Oscar Salvador Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 31 +++++++++---------------------- 1 file changed, 9 insertions(+), 22 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 4344e85213f2..0a54ffac8c68 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1172,14 +1172,13 @@ bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) } /* - * Confirm all pages in a range [start, end) belong to the same zone. - * When true, return its valid [start, end). + * Confirm all pages in a range [start, end) belong to the same zone (skipping + * memory holes). When true, return the zone. */ -int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn, - unsigned long *valid_start, unsigned long *valid_end) +struct zone *test_pages_in_a_zone(unsigned long start_pfn, + unsigned long end_pfn) { unsigned long pfn, sec_end_pfn; - unsigned long start, end; struct zone *zone = NULL; struct page *page; int i; @@ -1200,24 +1199,15 @@ int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn, continue; /* Check if we got outside of the zone */ if (zone && !zone_spans_pfn(zone, pfn + i)) - return 0; + return NULL; page = pfn_to_page(pfn + i); if (zone && page_zone(page) != zone) - return 0; - if (!zone) - start = pfn + i; + return NULL; zone = page_zone(page); - end = pfn + MAX_ORDER_NR_PAGES; } } - if (zone) { - *valid_start = start; - *valid_end = min(end, end_pfn); - return 1; - } else { - return 0; - } + return zone; } /* @@ -1462,7 +1452,6 @@ static int __ref __offline_pages(unsigned long start_pfn, unsigned long offlined_pages = 0; int ret, node, nr_isolate_pageblock; unsigned long flags; - unsigned long valid_start, valid_end; struct zone *zone; struct memory_notify arg; char *reason; @@ -1487,14 +1476,12 @@ static int __ref __offline_pages(unsigned long start_pfn, /* This makes hotplug much easier...and readable. we assume this for now. .*/ - if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start, - &valid_end)) { + zone = test_pages_in_a_zone(start_pfn, end_pfn); + if (!zone) { ret = -EINVAL; reason = "multizone range"; goto failed_removal; } - - zone = page_zone(pfn_to_page(valid_start)); node = zone_to_nid(zone); /* set above range as isolated */ -- cgit v1.2.3 From 1c948715a159d0d02c1e1c9228327ba3c408795c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 3 Feb 2020 17:34:58 -0800 Subject: mm: remove __krealloc Since 5.5-rc1 the last user of this function is gone, so remove the functionality. See commit 2ad9d7747c10 ("netfilter: conntrack: free extension area immediately") for details. Link: http://lkml.kernel.org/r/20191212223442.22141-1-fw@strlen.de Signed-off-by: Florian Westphal Acked-by: Andrew Morton Acked-by: David Rientjes Reviewed-by: David Hildenbrand Cc: Christoph Lameter Cc: Pekka Enberg Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab_common.c | 22 ---------------------- 1 file changed, 22 deletions(-) (limited to 'mm') diff --git a/mm/slab_common.c b/mm/slab_common.c index 0d95ddea13b0..0c63c0d3dd38 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1676,28 +1676,6 @@ static __always_inline void *__do_krealloc(const void *p, size_t new_size, return ret; } -/** - * __krealloc - like krealloc() but don't free @p. - * @p: object to reallocate memory for. - * @new_size: how many bytes of memory are required. - * @flags: the type of memory to allocate. 
- * - * This function is like krealloc() except it never frees the originally - * allocated buffer. Use this if you don't want to free the buffer immediately - * like, for example, with RCU. - * - * Return: pointer to the allocated memory or %NULL in case of error - */ -void *__krealloc(const void *p, size_t new_size, gfp_t flags) -{ - if (unlikely(!new_size)) - return ZERO_SIZE_PTR; - - return __do_krealloc(p, new_size, flags); - -} -EXPORT_SYMBOL(__krealloc); - /** * krealloc - reallocate memory. The contents will remain unchanged. * @p: object to reallocate memory for. -- cgit v1.2.3 From 3afc423632a194d7d6afef34e4bb98f804cd071d Mon Sep 17 00:00:00 2001 From: Steven Price Date: Mon, 3 Feb 2020 17:35:45 -0800 Subject: mm: pagewalk: add p4d_entry() and pgd_entry() pgd_entry() and pud_entry() were removed by commit 0b1fbfe50006c410 ("mm/pagewalk: remove pgd_entry() and pud_entry()") because there were no users. We're about to add users so reintroduce them, along with p4d_entry() as we now have 5 levels of tables. Note that commit a00cc7d9dd93d66a ("mm, x86: add support for PUD-sized transparent hugepages") already re-added pud_entry() but with different semantics to the other callbacks. This commit reverts the semantics back to match the other callbacks. To support hmm.c which now uses the new semantics of pud_entry() a new member ('action') of struct mm_walk is added which allows the callbacks to either descend (ACTION_SUBTREE, the default), skip (ACTION_CONTINUE) or repeat the callback (ACTION_AGAIN). hmm.c is then updated to call pud_trans_huge_lock() itself and make use of the splitting/retry logic of the core code. After this change pud_entry() is called for all entries, not just transparent huge pages. [arnd@arndb.de: fix unused variable warning] Link: http://lkml.kernel.org/r/20200107204607.1533842-1-arnd@arndb.de Link: http://lkml.kernel.org/r/20191218162402.45610-12-steven.price@arm.com Signed-off-by: Steven Price Signed-off-by: Arnd Bergmann Cc: Albert Ou Cc: Alexandre Ghiti Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Dave Hansen Cc: David S. Miller Cc: Heiko Carstens Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: James Hogan Cc: James Morse Cc: Jerome Glisse Cc: "Liang, Kan" Cc: Mark Rutland Cc: Michael Ellerman Cc: Paul Burton Cc: Paul Mackerras Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Russell King Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Zong Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hmm.c | 58 ++++++++++++++++++++++++++++++++-------------------------- mm/pagewalk.c | 50 +++++++++++++++++++++++++++++++++++--------------- 2 files changed, 67 insertions(+), 41 deletions(-) (limited to 'mm') diff --git a/mm/hmm.c b/mm/hmm.c index d379cb6496ae..c49e9dfce5b4 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -474,23 +474,32 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end, { struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; - unsigned long addr = start, next; - pmd_t *pmdp; + unsigned long addr = start; pud_t pud; - int ret; + int ret = 0; + spinlock_t *ptl = pud_trans_huge_lock(pudp, walk->vma); + + if (!ptl) + return 0; + + /* Normally we don't want to split the huge page */ + walk->action = ACTION_CONTINUE; -again: pud = READ_ONCE(*pudp); - if (pud_none(pud)) - return hmm_vma_walk_hole(start, end, walk); + if (pud_none(pud)) { + ret = hmm_vma_walk_hole(start, end, walk); + goto out_unlock; + } if (pud_huge(pud) && pud_devmap(pud)) { unsigned long i, npages, pfn; uint64_t *pfns, cpu_flags; bool fault, write_fault; - if (!pud_present(pud)) - return hmm_vma_walk_hole(start, end, walk); + if (!pud_present(pud)) { + ret = hmm_vma_walk_hole(start, end, walk); + goto out_unlock; + } i = (addr - range->start) >> PAGE_SHIFT; npages = (end - addr) >> PAGE_SHIFT; @@ -499,16 +508,20 @@ again: cpu_flags = pud_to_hmm_pfn_flags(range, pud); hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags, &fault, &write_fault); - if (fault || write_fault) - return hmm_vma_walk_hole_(addr, end, fault, - write_fault, walk); + if (fault || write_fault) { + ret = hmm_vma_walk_hole_(addr, end, fault, + write_fault, walk); + goto out_unlock; + } pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); for (i = 0; i < npages; ++i, ++pfn) { hmm_vma_walk->pgmap = get_dev_pagemap(pfn, hmm_vma_walk->pgmap); - if (unlikely(!hmm_vma_walk->pgmap)) - return -EBUSY; + if (unlikely(!hmm_vma_walk->pgmap)) { + ret = -EBUSY; + goto out_unlock; + } pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags; } @@ -517,22 +530,15 @@ again: hmm_vma_walk->pgmap = NULL; } hmm_vma_walk->last = end; - return 0; + goto out_unlock; } - split_huge_pud(walk->vma, pudp, addr); - if (pud_none(*pudp)) - goto again; + /* Ask for the PUD to be split */ + walk->action = ACTION_SUBTREE; - pmdp = pmd_offset(pudp, addr); - do { - next = pmd_addr_end(addr, end); - ret = hmm_vma_walk_pmd(pmdp, addr, next, walk); - if (ret) - return ret; - } while (pmdp++, addr = next, addr != end); - - return 0; +out_unlock: + spin_unlock(ptl); + return ret; } #else #define hmm_vma_walk_pud NULL diff --git a/mm/pagewalk.c b/mm/pagewalk.c index ea0b9e606ad1..690af44609e2 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -46,6 +46,9 @@ again: break; continue; } + + walk->action = ACTION_SUBTREE; + /* * This implies that each ->pmd_entry() handler * needs to know about pmd_trans_huge() pmds @@ -55,16 +58,21 @@ again: if (err) break; + if (walk->action == ACTION_AGAIN) + goto again; + /* * Check this here so we only break down trans_huge * pages when we _need_ to */ - if (!ops->pte_entry) + if 
(walk->action == ACTION_CONTINUE || + !(ops->pte_entry)) continue; split_huge_pmd(walk->vma, pmd, addr); if (pmd_trans_unstable(pmd)) goto again; + err = walk_pte_range(pmd, addr, next, walk); if (err) break; @@ -93,24 +101,25 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, continue; } - if (ops->pud_entry) { - spinlock_t *ptl = pud_trans_huge_lock(pud, walk->vma); + walk->action = ACTION_SUBTREE; - if (ptl) { - err = ops->pud_entry(pud, addr, next, walk); - spin_unlock(ptl); - if (err) - break; - continue; - } - } + if (ops->pud_entry) + err = ops->pud_entry(pud, addr, next, walk); + if (err) + break; + + if (walk->action == ACTION_AGAIN) + goto again; + + if (walk->action == ACTION_CONTINUE || + !(ops->pmd_entry || ops->pte_entry)) + continue; split_huge_pud(walk->vma, pud, addr); if (pud_none(*pud)) goto again; - if (ops->pmd_entry || ops->pte_entry) - err = walk_pmd_range(pud, addr, next, walk); + err = walk_pmd_range(pud, addr, next, walk); if (err) break; } while (pud++, addr = next, addr != end); @@ -136,7 +145,12 @@ static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, break; continue; } - if (ops->pmd_entry || ops->pte_entry) + if (ops->p4d_entry) { + err = ops->p4d_entry(p4d, addr, next, walk); + if (err) + break; + } + if (ops->pud_entry || ops->pmd_entry || ops->pte_entry) err = walk_pud_range(p4d, addr, next, walk); if (err) break; @@ -163,7 +177,13 @@ static int walk_pgd_range(unsigned long addr, unsigned long end, break; continue; } - if (ops->pmd_entry || ops->pte_entry) + if (ops->pgd_entry) { + err = ops->pgd_entry(pgd, addr, next, walk); + if (err) + break; + } + if (ops->p4d_entry || ops->pud_entry || ops->pmd_entry || + ops->pte_entry) err = walk_p4d_range(pgd, addr, next, walk); if (err) break; -- cgit v1.2.3 From 488ae6a2b933cb538b5d91b1c0a3420188d28771 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Mon, 3 Feb 2020 17:35:50 -0800 Subject: mm: pagewalk: allow walking without vma Since 48684a65b4e3: "mm: pagewalk: fix misbehavior of walk_page_range for vma(VM_PFNMAP)", page_table_walk() will report any kernel area as a hole, because it lacks a vma. This means each arch has re-implemented page table walking when needed, for example in the per-arch ptdump walker. Remove the requirement to have a vma in the generic code and add a new function walk_page_range_novma() which ignores the VMAs and simply walks the page tables. Link: http://lkml.kernel.org/r/20191218162402.45610-13-steven.price@arm.com Signed-off-by: Steven Price Cc: Albert Ou Cc: Alexandre Ghiti Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Dave Hansen Cc: David S. Miller Cc: Heiko Carstens Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: James Hogan Cc: James Morse Cc: Jerome Glisse Cc: "Liang, Kan" Cc: Mark Rutland Cc: Michael Ellerman Cc: Paul Burton Cc: Paul Mackerras Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Russell King Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Zong Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/pagewalk.c | 40 ++++++++++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 690af44609e2..d5773465f6da 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -39,7 +39,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, do { again: next = pmd_addr_end(addr, end); - if (pmd_none(*pmd) || !walk->vma) { + if (pmd_none(*pmd) || (!walk->vma && !walk->no_vma)) { if (ops->pte_hole) err = ops->pte_hole(addr, next, walk); if (err) @@ -65,13 +65,16 @@ again: * Check this here so we only break down trans_huge * pages when we _need_ to */ - if (walk->action == ACTION_CONTINUE || + if ((!walk->vma && (pmd_leaf(*pmd) || !pmd_present(*pmd))) || + walk->action == ACTION_CONTINUE || !(ops->pte_entry)) continue; - split_huge_pmd(walk->vma, pmd, addr); - if (pmd_trans_unstable(pmd)) - goto again; + if (walk->vma) { + split_huge_pmd(walk->vma, pmd, addr); + if (pmd_trans_unstable(pmd)) + goto again; + } err = walk_pte_range(pmd, addr, next, walk); if (err) @@ -93,7 +96,7 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, do { again: next = pud_addr_end(addr, end); - if (pud_none(*pud) || !walk->vma) { + if (pud_none(*pud) || (!walk->vma && !walk->no_vma)) { if (ops->pte_hole) err = ops->pte_hole(addr, next, walk); if (err) @@ -111,11 +114,13 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, if (walk->action == ACTION_AGAIN) goto again; - if (walk->action == ACTION_CONTINUE || + if ((!walk->vma && (pud_leaf(*pud) || !pud_present(*pud))) || + walk->action == ACTION_CONTINUE || !(ops->pmd_entry || ops->pte_entry)) continue; - split_huge_pud(walk->vma, pud, addr); + if (walk->vma) + split_huge_pud(walk->vma, pud, addr); if (pud_none(*pud)) goto again; @@ -389,6 +394,25 @@ int walk_page_range(struct mm_struct *mm, unsigned long start, return err; } +int walk_page_range_novma(struct mm_struct *mm, unsigned long start, + unsigned long end, const struct mm_walk_ops *ops, + void *private) +{ + struct mm_walk walk = { + .ops = ops, + .mm = mm, + .private = private, + .no_vma = true + }; + + if (start >= end || !walk.mm) + return -EINVAL; + + lockdep_assert_held(&walk.mm->mmap_sem); + + return __walk_page_range(start, end, &walk); +} + int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops, void *private) { -- cgit v1.2.3 From fbf56346b855872db45af7c1274180f9d91f46cd Mon Sep 17 00:00:00 2001 From: Steven Price Date: Mon, 3 Feb 2020 17:35:54 -0800 Subject: mm: pagewalk: don't lock PTEs for walk_page_range_novma() walk_page_range_novma() can be used to walk page tables or the kernel or for firmware. These page tables may contain entries that are not backed by a struct page and so it isn't (in general) possible to take the PTE lock for the pte_entry() callback. So update walk_pte_range() to only take the lock when no_vma==false by splitting out the inner loop to a separate function and add a comment explaining the difference to walk_page_range_novma(). 
Link: http://lkml.kernel.org/r/20191218162402.45610-14-steven.price@arm.com Signed-off-by: Steven Price Cc: Albert Ou Cc: Alexandre Ghiti Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Dave Hansen Cc: David S. Miller Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Hogan Cc: James Morse Cc: Jerome Glisse Cc: "Liang, Kan" Cc: Mark Rutland Cc: Michael Ellerman Cc: Paul Burton Cc: Paul Mackerras Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Russell King Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Zong Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/pagewalk.c | 35 ++++++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/pagewalk.c b/mm/pagewalk.c index d5773465f6da..4b5ee92ba079 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -4,15 +4,12 @@ #include #include -static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, - struct mm_walk *walk) +static int walk_pte_range_inner(pte_t *pte, unsigned long addr, + unsigned long end, struct mm_walk *walk) { - pte_t *pte; - int err = 0; const struct mm_walk_ops *ops = walk->ops; - spinlock_t *ptl; + int err = 0; - pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); for (;;) { err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk); if (err) @@ -22,8 +19,26 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, break; pte++; } + return err; +} + +static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + pte_t *pte; + int err = 0; + spinlock_t *ptl; + + if (walk->no_vma) { + pte = pte_offset_map(pmd, addr); + err = walk_pte_range_inner(pte, addr, end, walk); + pte_unmap(pte); + } else { + pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + err = walk_pte_range_inner(pte, addr, end, walk); + pte_unmap_unlock(pte, ptl); + } - pte_unmap_unlock(pte, ptl); return err; } @@ -394,6 +409,12 @@ int walk_page_range(struct mm_struct *mm, unsigned long start, return err; } +/* + * Similar to walk_page_range() but can walk any page tables even if they are + * not backed by VMAs. Because 'unusual' entries may be walked this function + * will also not lock the PTEs for the pte_entry() callback. This is useful for + * walking the kernel pages tables or page tables for firmware. + */ int walk_page_range_novma(struct mm_struct *mm, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, void *private) -- cgit v1.2.3 From c02a98753e0a36ba65a05818626fa6adeb4e7c97 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Mon, 3 Feb 2020 17:35:58 -0800 Subject: mm: pagewalk: fix termination condition in walk_pte_range() If walk_pte_range() is called with a 'end' argument that is beyond the last page of memory (e.g. ~0UL) then the comparison between 'addr' and 'end' will always fail and the loop will be infinite. Instead change the comparison to >= while accounting for overflow. Link: http://lkml.kernel.org/r/20191218162402.45610-15-steven.price@arm.com Signed-off-by: Steven Price Cc: Albert Ou Cc: Alexandre Ghiti Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Dave Hansen Cc: David S. Miller Cc: Heiko Carstens Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: James Hogan Cc: James Morse Cc: Jerome Glisse Cc: "Liang, Kan" Cc: Mark Rutland Cc: Michael Ellerman Cc: Paul Burton Cc: Paul Mackerras Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Russell King Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Zong Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/pagewalk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 4b5ee92ba079..6732fc7ac4c8 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -14,9 +14,9 @@ static int walk_pte_range_inner(pte_t *pte, unsigned long addr, err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk); if (err) break; - addr += PAGE_SIZE; - if (addr == end) + if (addr >= end - PAGE_SIZE) break; + addr += PAGE_SIZE; pte++; } return err; -- cgit v1.2.3 From b7a16c7ad790d0ecb44dcb08a6a75d0d0455ab5f Mon Sep 17 00:00:00 2001 From: Steven Price Date: Mon, 3 Feb 2020 17:36:03 -0800 Subject: mm: pagewalk: add 'depth' parameter to pte_hole The pte_hole() callback is called at multiple levels of the page tables. Code dumping the kernel page tables needs to know what at what depth the missing entry is. Add this is an extra parameter to pte_hole(). When the depth isn't know (e.g. processing a vma) then -1 is passed. The depth that is reported is the actual level where the entry is missing (ignoring any folding that is in place), i.e. any levels where PTRS_PER_P?D is set to 1 are ignored. Note that depth starts at 0 for a PGD so that PUD/PMD/PTE retain their natural numbers as levels 2/3/4. Link: http://lkml.kernel.org/r/20191218162402.45610-16-steven.price@arm.com Signed-off-by: Steven Price Tested-by: Zong Li Cc: Albert Ou Cc: Alexandre Ghiti Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Dave Hansen Cc: David S. Miller Cc: Heiko Carstens Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: James Hogan Cc: James Morse Cc: Jerome Glisse Cc: "Liang, Kan" Cc: Mark Rutland Cc: Michael Ellerman Cc: Paul Burton Cc: Paul Mackerras Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Russell King Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hmm.c | 8 ++++---- mm/migrate.c | 5 +++-- mm/mincore.c | 1 + mm/pagewalk.c | 31 +++++++++++++++++++++++++------ 4 files changed, 33 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/hmm.c b/mm/hmm.c index c49e9dfce5b4..72e5a6d9a417 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -186,7 +186,7 @@ static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk, } static int hmm_vma_walk_hole(unsigned long addr, unsigned long end, - struct mm_walk *walk) + __always_unused int depth, struct mm_walk *walk) { struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; @@ -380,7 +380,7 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp, again: pmd = READ_ONCE(*pmdp); if (pmd_none(pmd)) - return hmm_vma_walk_hole(start, end, walk); + return hmm_vma_walk_hole(start, end, -1, walk); if (thp_migration_supported() && is_pmd_migration_entry(pmd)) { bool fault, write_fault; @@ -487,7 +487,7 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end, pud = READ_ONCE(*pudp); if (pud_none(pud)) { - ret = hmm_vma_walk_hole(start, end, walk); + ret = hmm_vma_walk_hole(start, end, -1, walk); goto out_unlock; } @@ -497,7 +497,7 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end, bool fault, write_fault; if (!pud_present(pud)) { - ret = hmm_vma_walk_hole(start, end, walk); + ret = hmm_vma_walk_hole(start, end, -1, walk); goto out_unlock; } diff --git a/mm/migrate.c b/mm/migrate.c index edf42ed90030..b1092876e537 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2151,6 +2151,7 @@ out_unlock: #ifdef CONFIG_DEVICE_PRIVATE static int migrate_vma_collect_hole(unsigned long start, unsigned long end, + __always_unused int depth, struct mm_walk *walk) { struct migrate_vma *migrate = walk->private; @@ -2195,7 +2196,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, again: if (pmd_none(*pmdp)) - return migrate_vma_collect_hole(start, end, walk); + return migrate_vma_collect_hole(start, end, -1, walk); if (pmd_trans_huge(*pmdp)) { struct page *page; @@ -2228,7 +2229,7 @@ again: return migrate_vma_collect_skip(start, end, walk); if (pmd_none(*pmdp)) - return migrate_vma_collect_hole(start, end, + return migrate_vma_collect_hole(start, end, -1, walk); } } diff --git a/mm/mincore.c b/mm/mincore.c index 49b6fa2f6aa1..0e6dd9948f1a 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -112,6 +112,7 @@ static int __mincore_unmapped_range(unsigned long addr, unsigned long end, } static int mincore_unmapped_range(unsigned long addr, unsigned long end, + __always_unused int depth, struct mm_walk *walk) { walk->private += __mincore_unmapped_range(addr, end, diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 6732fc7ac4c8..5895ce4f1a85 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -4,6 +4,22 @@ #include #include +/* + * We want to know the real level where a entry is located ignoring any + * folding of levels which may be happening. For example if p4d is folded then + * a missing entry found at level 1 (p4d) is actually at level 0 (pgd). 
+ */ +static int real_depth(int depth) +{ + if (depth == 3 && PTRS_PER_PMD == 1) + depth = 2; + if (depth == 2 && PTRS_PER_PUD == 1) + depth = 1; + if (depth == 1 && PTRS_PER_P4D == 1) + depth = 0; + return depth; +} + static int walk_pte_range_inner(pte_t *pte, unsigned long addr, unsigned long end, struct mm_walk *walk) { @@ -49,6 +65,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, unsigned long next; const struct mm_walk_ops *ops = walk->ops; int err = 0; + int depth = real_depth(3); pmd = pmd_offset(pud, addr); do { @@ -56,7 +73,7 @@ again: next = pmd_addr_end(addr, end); if (pmd_none(*pmd) || (!walk->vma && !walk->no_vma)) { if (ops->pte_hole) - err = ops->pte_hole(addr, next, walk); + err = ops->pte_hole(addr, next, depth, walk); if (err) break; continue; @@ -106,6 +123,7 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, unsigned long next; const struct mm_walk_ops *ops = walk->ops; int err = 0; + int depth = real_depth(2); pud = pud_offset(p4d, addr); do { @@ -113,7 +131,7 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, next = pud_addr_end(addr, end); if (pud_none(*pud) || (!walk->vma && !walk->no_vma)) { if (ops->pte_hole) - err = ops->pte_hole(addr, next, walk); + err = ops->pte_hole(addr, next, depth, walk); if (err) break; continue; @@ -154,13 +172,14 @@ static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, unsigned long next; const struct mm_walk_ops *ops = walk->ops; int err = 0; + int depth = real_depth(1); p4d = p4d_offset(pgd, addr); do { next = p4d_addr_end(addr, end); if (p4d_none_or_clear_bad(p4d)) { if (ops->pte_hole) - err = ops->pte_hole(addr, next, walk); + err = ops->pte_hole(addr, next, depth, walk); if (err) break; continue; @@ -192,7 +211,7 @@ static int walk_pgd_range(unsigned long addr, unsigned long end, next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) { if (ops->pte_hole) - err = ops->pte_hole(addr, next, walk); + err = ops->pte_hole(addr, next, 0, walk); if (err) break; continue; @@ -239,7 +258,7 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end, if (pte) err = ops->hugetlb_entry(pte, hmask, addr, next, walk); else if (ops->pte_hole) - err = ops->pte_hole(addr, next, walk); + err = ops->pte_hole(addr, next, -1, walk); if (err) break; @@ -283,7 +302,7 @@ static int walk_page_test(unsigned long start, unsigned long end, if (vma->vm_flags & VM_PFNMAP) { int err = 1; if (ops->pte_hole) - err = ops->pte_hole(start, end, walk); + err = ops->pte_hole(start, end, -1, walk); return err ? err : 1; } return 0; -- cgit v1.2.3 From 30d621f6723b1c98a142861f7a52849d286bc7fa Mon Sep 17 00:00:00 2001 From: Steven Price Date: Mon, 3 Feb 2020 17:36:20 -0800 Subject: mm: add generic ptdump Add a generic version of page table dumping that architectures can opt-in to. Link: http://lkml.kernel.org/r/20191218162402.45610-20-steven.price@arm.com Signed-off-by: Steven Price Cc: Albert Ou Cc: Alexandre Ghiti Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Dave Hansen Cc: David S. Miller Cc: Heiko Carstens Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: James Hogan Cc: James Morse Cc: Jerome Glisse Cc: "Liang, Kan" Cc: Mark Rutland Cc: Michael Ellerman Cc: Paul Burton Cc: Paul Mackerras Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Russell King Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Zong Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig.debug | 21 +++++++++ mm/Makefile | 1 + mm/ptdump.c | 139 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 161 insertions(+) create mode 100644 mm/ptdump.c (limited to 'mm') diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 327b3ebf23bf..0271b22e063f 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -117,3 +117,24 @@ config DEBUG_RODATA_TEST depends on STRICT_KERNEL_RWX ---help--- This option enables a testcase for the setting rodata read-only. + +config GENERIC_PTDUMP + bool + +config PTDUMP_CORE + bool + +config PTDUMP_DEBUGFS + bool "Export kernel pagetable layout to userspace via debugfs" + depends on DEBUG_KERNEL + depends on DEBUG_FS + depends on GENERIC_PTDUMP + select PTDUMP_CORE + help + Say Y here if you want to show the kernel pagetable layout in a + debugfs file. This information is only useful for kernel developers + who are working in architecture specific areas of the kernel. + It is probably not a good idea to enable this feature in a production + kernel. + + If in doubt, say N. diff --git a/mm/Makefile b/mm/Makefile index 32f08e22e824..272e66039e70 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -109,3 +109,4 @@ obj-$(CONFIG_ZONE_DEVICE) += memremap.o obj-$(CONFIG_HMM_MIRROR) += hmm.o obj-$(CONFIG_MEMFD_CREATE) += memfd.o obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o +obj-$(CONFIG_PTDUMP_CORE) += ptdump.o diff --git a/mm/ptdump.c b/mm/ptdump.c new file mode 100644 index 000000000000..868638b8e404 --- /dev/null +++ b/mm/ptdump.c @@ -0,0 +1,139 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include + +#ifdef CONFIG_KASAN +/* + * This is an optimization for KASAN=y case. Since all kasan page tables + * eventually point to the kasan_early_shadow_page we could call note_page() + * right away without walking through lower level page tables. This saves + * us dozens of seconds (minutes for 5-level config) while checking for + * W+X mapping or reading kernel_page_tables debugfs file. 
+ */ +static inline int note_kasan_page_table(struct mm_walk *walk, + unsigned long addr) +{ + struct ptdump_state *st = walk->private; + + st->note_page(st, addr, 5, pte_val(kasan_early_shadow_pte[0])); + + walk->action = ACTION_CONTINUE; + + return 0; +} +#endif + +static int ptdump_pgd_entry(pgd_t *pgd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct ptdump_state *st = walk->private; + pgd_t val = READ_ONCE(*pgd); + +#if CONFIG_PGTABLE_LEVELS > 4 && defined(CONFIG_KASAN) + if (pgd_page(val) == virt_to_page(lm_alias(kasan_early_shadow_p4d))) + return note_kasan_page_table(walk, addr); +#endif + + if (pgd_leaf(val)) + st->note_page(st, addr, 1, pgd_val(val)); + + return 0; +} + +static int ptdump_p4d_entry(p4d_t *p4d, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct ptdump_state *st = walk->private; + p4d_t val = READ_ONCE(*p4d); + +#if CONFIG_PGTABLE_LEVELS > 3 && defined(CONFIG_KASAN) + if (p4d_page(val) == virt_to_page(lm_alias(kasan_early_shadow_pud))) + return note_kasan_page_table(walk, addr); +#endif + + if (p4d_leaf(val)) + st->note_page(st, addr, 2, p4d_val(val)); + + return 0; +} + +static int ptdump_pud_entry(pud_t *pud, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct ptdump_state *st = walk->private; + pud_t val = READ_ONCE(*pud); + +#if CONFIG_PGTABLE_LEVELS > 2 && defined(CONFIG_KASAN) + if (pud_page(val) == virt_to_page(lm_alias(kasan_early_shadow_pmd))) + return note_kasan_page_table(walk, addr); +#endif + + if (pud_leaf(val)) + st->note_page(st, addr, 3, pud_val(val)); + + return 0; +} + +static int ptdump_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct ptdump_state *st = walk->private; + pmd_t val = READ_ONCE(*pmd); + +#if defined(CONFIG_KASAN) + if (pmd_page(val) == virt_to_page(lm_alias(kasan_early_shadow_pte))) + return note_kasan_page_table(walk, addr); +#endif + + if (pmd_leaf(val)) + st->note_page(st, addr, 4, pmd_val(val)); + + return 0; +} + +static int ptdump_pte_entry(pte_t *pte, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct ptdump_state *st = walk->private; + + st->note_page(st, addr, 5, pte_val(READ_ONCE(*pte))); + + return 0; +} + +static int ptdump_hole(unsigned long addr, unsigned long next, + int depth, struct mm_walk *walk) +{ + struct ptdump_state *st = walk->private; + + st->note_page(st, addr, depth + 1, 0); + + return 0; +} + +static const struct mm_walk_ops ptdump_ops = { + .pgd_entry = ptdump_pgd_entry, + .p4d_entry = ptdump_p4d_entry, + .pud_entry = ptdump_pud_entry, + .pmd_entry = ptdump_pmd_entry, + .pte_entry = ptdump_pte_entry, + .pte_hole = ptdump_hole, +}; + +void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm) +{ + const struct ptdump_range *range = st->range; + + down_read(&mm->mmap_sem); + while (range->start != range->end) { + walk_page_range_novma(mm, range->start, range->end, + &ptdump_ops, st); + range++; + } + up_read(&mm->mmap_sem); + + /* Flush out the last page */ + st->note_page(st, 0, 0, 0); +} -- cgit v1.2.3 From f8f0d0b6fa203bfa363d30f34f6fecce9e5cc2f7 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Mon, 3 Feb 2020 17:36:38 -0800 Subject: mm: ptdump: reduce level numbers by 1 in note_page() Rather than having to increment the 'depth' number by 1 in ptdump_hole(), let's change the meaning of 'level' in note_page() since that makes the code simplier. 
Note that for x86, the level numbers were previously increased by 1 in commit 45dcd2091363 ("x86/mm/dump_pagetables: Fix printout of p4d level") and the comment "Bit 7 has a different meaning" was not updated, so this change also makes the code match the comment again. Link: http://lkml.kernel.org/r/20191218162402.45610-24-steven.price@arm.com Signed-off-by: Steven Price Reviewed-by: Catalin Marinas Cc: Albert Ou Cc: Alexandre Ghiti Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Christian Borntraeger Cc: Dave Hansen Cc: David S. Miller Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Hogan Cc: James Morse Cc: Jerome Glisse Cc: "Liang, Kan" Cc: Mark Rutland Cc: Michael Ellerman Cc: Paul Burton Cc: Paul Mackerras Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Russell King Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Zong Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ptdump.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/ptdump.c b/mm/ptdump.c index 868638b8e404..ad18a9839d6f 100644 --- a/mm/ptdump.c +++ b/mm/ptdump.c @@ -17,7 +17,7 @@ static inline int note_kasan_page_table(struct mm_walk *walk, { struct ptdump_state *st = walk->private; - st->note_page(st, addr, 5, pte_val(kasan_early_shadow_pte[0])); + st->note_page(st, addr, 4, pte_val(kasan_early_shadow_pte[0])); walk->action = ACTION_CONTINUE; @@ -37,7 +37,7 @@ static int ptdump_pgd_entry(pgd_t *pgd, unsigned long addr, #endif if (pgd_leaf(val)) - st->note_page(st, addr, 1, pgd_val(val)); + st->note_page(st, addr, 0, pgd_val(val)); return 0; } @@ -54,7 +54,7 @@ static int ptdump_p4d_entry(p4d_t *p4d, unsigned long addr, #endif if (p4d_leaf(val)) - st->note_page(st, addr, 2, p4d_val(val)); + st->note_page(st, addr, 1, p4d_val(val)); return 0; } @@ -71,7 +71,7 @@ static int ptdump_pud_entry(pud_t *pud, unsigned long addr, #endif if (pud_leaf(val)) - st->note_page(st, addr, 3, pud_val(val)); + st->note_page(st, addr, 2, pud_val(val)); return 0; } @@ -88,7 +88,7 @@ static int ptdump_pmd_entry(pmd_t *pmd, unsigned long addr, #endif if (pmd_leaf(val)) - st->note_page(st, addr, 4, pmd_val(val)); + st->note_page(st, addr, 3, pmd_val(val)); return 0; } @@ -98,7 +98,7 @@ static int ptdump_pte_entry(pte_t *pte, unsigned long addr, { struct ptdump_state *st = walk->private; - st->note_page(st, addr, 5, pte_val(READ_ONCE(*pte))); + st->note_page(st, addr, 4, pte_val(READ_ONCE(*pte))); return 0; } @@ -108,7 +108,7 @@ static int ptdump_hole(unsigned long addr, unsigned long next, { struct ptdump_state *st = walk->private; - st->note_page(st, addr, depth + 1, 0); + st->note_page(st, addr, depth, 0); return 0; } @@ -135,5 +135,5 @@ void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm) up_read(&mm->mmap_sem); /* Flush out the last page */ - st->note_page(st, 0, 0, 0); + st->note_page(st, 0, -1, 0); } -- cgit v1.2.3 From e47690d756a760579141560ded06ec1020dd85e8 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Mon, 3 Feb 2020 17:36:42 -0800 Subject: x86: mm: avoid allocating struct mm_struct on the stack struct mm_struct is quite large (~1664 bytes) and so allocating on the stack may cause problems as the kernel stack size is small. 
Since ptdump_walk_pgd_level_core() was only allocating the structure so that it could modify the pgd argument we can instead introduce a pgd override in struct mm_walk and pass this down the call stack to where it is needed. Since the correct mm_struct is now being passed down, it is now also unnecessary to take the mmap_sem semaphore because ptdump_walk_pgd() will now take the semaphore on the real mm. [steven.price@arm.com: restore missed arm64 changes] Link: http://lkml.kernel.org/r/20200108145710.34314-1-steven.price@arm.com Link: http://lkml.kernel.org/r/20200108145710.34314-1-steven.price@arm.com Signed-off-by: Steven Price Reported-by: Stephen Rothwell Cc: Catalin Marinas Cc: Albert Ou Cc: Alexandre Ghiti Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Christian Borntraeger Cc: Dave Hansen Cc: David S. Miller Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Hogan Cc: James Morse Cc: Jerome Glisse Cc: "Liang, Kan" Cc: Mark Rutland Cc: Michael Ellerman Cc: Paul Burton Cc: Paul Mackerras Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Russell King Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Zong Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/pagewalk.c | 7 ++++++- mm/ptdump.c | 4 ++-- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 5895ce4f1a85..928df1638c30 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -206,7 +206,10 @@ static int walk_pgd_range(unsigned long addr, unsigned long end, const struct mm_walk_ops *ops = walk->ops; int err = 0; - pgd = pgd_offset(walk->mm, addr); + if (walk->pgd) + pgd = walk->pgd + pgd_index(addr); + else + pgd = pgd_offset(walk->mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) { @@ -436,11 +439,13 @@ int walk_page_range(struct mm_struct *mm, unsigned long start, */ int walk_page_range_novma(struct mm_struct *mm, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, + pgd_t *pgd, void *private) { struct mm_walk walk = { .ops = ops, .mm = mm, + .pgd = pgd, .private = private, .no_vma = true }; diff --git a/mm/ptdump.c b/mm/ptdump.c index ad18a9839d6f..26208d0d03b7 100644 --- a/mm/ptdump.c +++ b/mm/ptdump.c @@ -122,14 +122,14 @@ static const struct mm_walk_ops ptdump_ops = { .pte_hole = ptdump_hole, }; -void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm) +void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd) { const struct ptdump_range *range = st->range; down_read(&mm->mmap_sem); while (range->start != range->end) { walk_page_range_novma(mm, range->start, range->end, - &ptdump_ops, st); + &ptdump_ops, pgd, st); range++; } up_read(&mm->mmap_sem); -- cgit v1.2.3 From 0ed1325967ab5f7a4549a2641c6ebe115f76e228 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 3 Feb 2020 17:36:49 -0800 Subject: mm/mmu_gather: invalidate TLB correctly on batch allocation failure and flush Architectures for which we have hardware walkers of Linux page table should flush TLB on mmu gather batch allocation failures and batch flush. Some architectures like POWER supports multiple translation modes (hash and radix) and in the case of POWER only radix translation mode needs the above TLBI. This is because for hash translation mode kernel wants to avoid this extra flush since there are no hardware walkers of linux page table. 
With radix translation, the hardware also walks linux page table and with that, kernel needs to make sure to TLB invalidate page walk cache before page table pages are freed. More details in commit d86564a2f085 ("mm/tlb, x86/mm: Support invalidating TLB caches for RCU_TABLE_FREE") The changes to sparc are to make sure we keep the old behavior since we are now removing HAVE_RCU_TABLE_NO_INVALIDATE. The default value for tlb_needs_table_invalidate is to always force an invalidate and sparc can avoid the table invalidate. Hence we define tlb_needs_table_invalidate to false for sparc architecture. Link: http://lkml.kernel.org/r/20200116064531.483522-3-aneesh.kumar@linux.ibm.com Fixes: a46cc7a90fd8 ("powerpc/mm/radix: Improve TLB/PWC flushes") Signed-off-by: Peter Zijlstra (Intel) Acked-by: Michael Ellerman [powerpc] Cc: [4.14+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmu_gather.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index 7d70e5c78f97..7c1b8f67af7b 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -102,14 +102,14 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_ */ static inline void tlb_table_invalidate(struct mmu_gather *tlb) { -#ifndef CONFIG_HAVE_RCU_TABLE_NO_INVALIDATE - /* - * Invalidate page-table caches used by hardware walkers. Then we still - * need to RCU-sched wait while freeing the pages because software - * walkers can still be in-flight. - */ - tlb_flush_mmu_tlbonly(tlb); -#endif + if (tlb_needs_table_invalidate()) { + /* + * Invalidate page-table caches used by hardware walkers. Then + * we still need to RCU-sched wait while freeing the pages + * because software walkers can still be in-flight. + */ + tlb_flush_mmu_tlbonly(tlb); + } } static void tlb_remove_table_smp_sync(void *arg) -- cgit v1.2.3 From ff2e6d7259f82ccc9a5aaa7f41194161d9262392 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 3 Feb 2020 17:37:02 -0800 Subject: asm-generic/tlb: rename HAVE_RCU_TABLE_FREE Towards a more consistent naming scheme. [akpm@linux-foundation.org: fix sparc64 Kconfig] Link: http://lkml.kernel.org/r/20200116064531.483522-7-aneesh.kumar@linux.ibm.com Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Aneesh Kumar K.V Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 2 +- mm/mmu_gather.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/gup.c b/mm/gup.c index e13f4d211475..1b521e0ac1de 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1792,7 +1792,7 @@ EXPORT_SYMBOL(get_user_pages_unlocked); * Before activating this code, please be aware that the following assumptions * are currently made: * - * *) Either HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table() is used to + * *) Either MMU_GATHER_RCU_TABLE_FREE is enabled, and tlb_remove_table() is used to * free pages containing page tables or TLB flushing requires IPI broadcast. * * *) ptes can be read atomically by the architecture. diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index 7c1b8f67af7b..86bb2176e173 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -91,7 +91,7 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_ #endif /* HAVE_MMU_GATHER_NO_GATHER */ -#ifdef CONFIG_HAVE_RCU_TABLE_FREE +#ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE /* * See the comment near struct mmu_table_batch. 
@@ -173,11 +173,11 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table) tlb_table_flush(tlb); } -#endif /* CONFIG_HAVE_RCU_TABLE_FREE */ +#endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */ static void tlb_flush_mmu_free(struct mmu_gather *tlb) { -#ifdef CONFIG_HAVE_RCU_TABLE_FREE +#ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE tlb_table_flush(tlb); #endif #ifndef CONFIG_HAVE_MMU_GATHER_NO_GATHER @@ -220,7 +220,7 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, tlb->batch_count = 0; #endif -#ifdef CONFIG_HAVE_RCU_TABLE_FREE +#ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE tlb->batch = NULL; #endif #ifdef CONFIG_HAVE_MMU_GATHER_PAGE_SIZE -- cgit v1.2.3 From 3af4bd033759c4dab4f0ff594f0aa1e8d182b9d7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 3 Feb 2020 17:37:05 -0800 Subject: asm-generic/tlb: rename HAVE_MMU_GATHER_PAGE_SIZE Towards a more consistent naming scheme. Link: http://lkml.kernel.org/r/20200116064531.483522-8-aneesh.kumar@linux.ibm.com Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Aneesh Kumar K.V Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmu_gather.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index 86bb2176e173..297c70307367 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -69,7 +69,7 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_ VM_BUG_ON(!tlb->end); -#ifdef CONFIG_HAVE_MMU_GATHER_PAGE_SIZE +#ifdef CONFIG_MMU_GATHER_PAGE_SIZE VM_WARN_ON(tlb->page_size != page_size); #endif @@ -223,7 +223,7 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, #ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE tlb->batch = NULL; #endif -#ifdef CONFIG_HAVE_MMU_GATHER_PAGE_SIZE +#ifdef CONFIG_MMU_GATHER_PAGE_SIZE tlb->page_size = 0; #endif -- cgit v1.2.3 From 580a586c409ab3040b7284a19cd9e281692c40c7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 3 Feb 2020 17:37:08 -0800 Subject: asm-generic/tlb: rename HAVE_MMU_GATHER_NO_GATHER Towards a more consistent naming scheme. Link: http://lkml.kernel.org/r/20200116064531.483522-9-aneesh.kumar@linux.ibm.com Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Aneesh Kumar K.V Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmu_gather.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index 297c70307367..a28c74328085 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -11,7 +11,7 @@ #include #include -#ifndef CONFIG_HAVE_MMU_GATHER_NO_GATHER +#ifndef CONFIG_MMU_GATHER_NO_GATHER static bool tlb_next_batch(struct mmu_gather *tlb) { @@ -89,7 +89,7 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_ return false; } -#endif /* HAVE_MMU_GATHER_NO_GATHER */ +#endif /* MMU_GATHER_NO_GATHER */ #ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE @@ -180,7 +180,7 @@ static void tlb_flush_mmu_free(struct mmu_gather *tlb) #ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE tlb_table_flush(tlb); #endif -#ifndef CONFIG_HAVE_MMU_GATHER_NO_GATHER +#ifndef CONFIG_MMU_GATHER_NO_GATHER tlb_batch_pages_flush(tlb); #endif } @@ -211,7 +211,7 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, /* Is it from 0 to ~0? 
*/ tlb->fullmm = !(start | (end+1)); -#ifndef CONFIG_HAVE_MMU_GATHER_NO_GATHER +#ifndef CONFIG_MMU_GATHER_NO_GATHER tlb->need_flush_all = 0; tlb->local.next = NULL; tlb->local.nr = 0; @@ -271,7 +271,7 @@ void tlb_finish_mmu(struct mmu_gather *tlb, tlb_flush_mmu(tlb); -#ifndef CONFIG_HAVE_MMU_GATHER_NO_GATHER +#ifndef CONFIG_MMU_GATHER_NO_GATHER tlb_batch_list_free(tlb); #endif dec_tlb_flush_pending(tlb->mm); -- cgit v1.2.3 From 0d6e24d430ef23280d8dea0ba1faeefc66c26a57 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 3 Feb 2020 17:37:11 -0800 Subject: asm-generic/tlb: provide MMU_GATHER_TABLE_FREE As described in the comment, the correct order for freeing pages is: 1) unhook page 2) TLB invalidate page 3) free page This order equally applies to page directories. Currently there are two correct options: - use tlb_remove_page(), when all page directores are full pages and there are no futher contraints placed by things like software walkers (HAVE_FAST_GUP). - use MMU_GATHER_RCU_TABLE_FREE and tlb_remove_table() when the architecture does not do IPI based TLB invalidate and has HAVE_FAST_GUP (or software TLB fill). This however leaves architectures that don't have page based directories but don't need RCU in a bind. For those, provide MMU_GATHER_TABLE_FREE, which provides the independent batching for directories without the additional RCU freeing. Link: http://lkml.kernel.org/r/20200116064531.483522-10-aneesh.kumar@linux.ibm.com Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Aneesh Kumar K.V Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmu_gather.c | 120 +++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 88 insertions(+), 32 deletions(-) (limited to 'mm') diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index a28c74328085..a3538cb2bcbe 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -91,56 +91,106 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_ #endif /* MMU_GATHER_NO_GATHER */ -#ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE +#ifdef CONFIG_MMU_GATHER_TABLE_FREE -/* - * See the comment near struct mmu_table_batch. - */ +static void __tlb_remove_table_free(struct mmu_table_batch *batch) +{ + int i; + + for (i = 0; i < batch->nr; i++) + __tlb_remove_table(batch->tables[i]); + + free_page((unsigned long)batch); +} + +#ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE /* - * If we want tlb_remove_table() to imply TLB invalidates. + * Semi RCU freeing of the page directories. + * + * This is needed by some architectures to implement software pagetable walkers. + * + * gup_fast() and other software pagetable walkers do a lockless page-table + * walk and therefore needs some synchronization with the freeing of the page + * directories. The chosen means to accomplish that is by disabling IRQs over + * the walk. + * + * Architectures that use IPIs to flush TLBs will then automagically DTRT, + * since we unlink the page, flush TLBs, free the page. Since the disabling of + * IRQs delays the completion of the TLB flush we can never observe an already + * freed page. + * + * Architectures that do not have this (PPC) need to delay the freeing by some + * other means, this is that means. + * + * What we do is batch the freed directory pages (tables) and RCU free them. + * We use the sched RCU variant, as that guarantees that IRQ/preempt disabling + * holds off grace periods. 
+ * + * However, in order to batch these pages we need to allocate storage, this + * allocation is deep inside the MM code and can thus easily fail on memory + * pressure. To guarantee progress we fall back to single table freeing, see + * the implementation of tlb_remove_table_one(). + * */ -static inline void tlb_table_invalidate(struct mmu_gather *tlb) -{ - if (tlb_needs_table_invalidate()) { - /* - * Invalidate page-table caches used by hardware walkers. Then - * we still need to RCU-sched wait while freeing the pages - * because software walkers can still be in-flight. - */ - tlb_flush_mmu_tlbonly(tlb); - } -} static void tlb_remove_table_smp_sync(void *arg) { /* Simply deliver the interrupt */ } -static void tlb_remove_table_one(void *table) +static void tlb_remove_table_sync_one(void) { /* * This isn't an RCU grace period and hence the page-tables cannot be * assumed to be actually RCU-freed. * * It is however sufficient for software page-table walkers that rely on - * IRQ disabling. See the comment near struct mmu_table_batch. + * IRQ disabling. */ smp_call_function(tlb_remove_table_smp_sync, NULL, 1); - __tlb_remove_table(table); } static void tlb_remove_table_rcu(struct rcu_head *head) { - struct mmu_table_batch *batch; - int i; + __tlb_remove_table_free(container_of(head, struct mmu_table_batch, rcu)); +} - batch = container_of(head, struct mmu_table_batch, rcu); +static void tlb_remove_table_free(struct mmu_table_batch *batch) +{ + call_rcu(&batch->rcu, tlb_remove_table_rcu); +} - for (i = 0; i < batch->nr; i++) - __tlb_remove_table(batch->tables[i]); +#else /* !CONFIG_MMU_GATHER_RCU_TABLE_FREE */ - free_page((unsigned long)batch); +static void tlb_remove_table_sync_one(void) { } + +static void tlb_remove_table_free(struct mmu_table_batch *batch) +{ + __tlb_remove_table_free(batch); +} + +#endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */ + +/* + * If we want tlb_remove_table() to imply TLB invalidates. + */ +static inline void tlb_table_invalidate(struct mmu_gather *tlb) +{ + if (tlb_needs_table_invalidate()) { + /* + * Invalidate page-table caches used by hardware walkers. Then + * we still need to RCU-sched wait while freeing the pages + * because software walkers can still be in-flight. 
+ */ + tlb_flush_mmu_tlbonly(tlb); + } +} + +static void tlb_remove_table_one(void *table) +{ + tlb_remove_table_sync_one(); + __tlb_remove_table(table); } static void tlb_table_flush(struct mmu_gather *tlb) @@ -149,7 +199,7 @@ static void tlb_table_flush(struct mmu_gather *tlb) if (*batch) { tlb_table_invalidate(tlb); - call_rcu(&(*batch)->rcu, tlb_remove_table_rcu); + tlb_remove_table_free(*batch); *batch = NULL; } } @@ -173,13 +223,21 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table) tlb_table_flush(tlb); } -#endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */ +static inline void tlb_table_init(struct mmu_gather *tlb) +{ + tlb->batch = NULL; +} + +#else /* !CONFIG_MMU_GATHER_TABLE_FREE */ + +static inline void tlb_table_flush(struct mmu_gather *tlb) { } +static inline void tlb_table_init(struct mmu_gather *tlb) { } + +#endif /* CONFIG_MMU_GATHER_TABLE_FREE */ static void tlb_flush_mmu_free(struct mmu_gather *tlb) { -#ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE tlb_table_flush(tlb); -#endif #ifndef CONFIG_MMU_GATHER_NO_GATHER tlb_batch_pages_flush(tlb); #endif @@ -220,9 +278,7 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, tlb->batch_count = 0; #endif -#ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE - tlb->batch = NULL; -#endif + tlb_table_init(tlb); #ifdef CONFIG_MMU_GATHER_PAGE_SIZE tlb->page_size = 0; #endif -- cgit v1.2.3 From 97a32539b9568bb653683349e5a76d02ff3c3e2c Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 3 Feb 2020 17:37:17 -0800 Subject: proc: convert everything to "struct proc_ops" The most notable change is DEFINE_SHOW_ATTRIBUTE macro split in seq_file.h. Conversion rule is: llseek => proc_lseek unlocked_ioctl => proc_ioctl xxx => proc_xxx delete ".owner = THIS_MODULE" line [akpm@linux-foundation.org: fix drivers/isdn/capi/kcapi_proc.c] [sfr@canb.auug.org.au: fix kernel/sched/psi.c] Link: http://lkml.kernel.org/r/20200122180545.36222f50@canb.auug.org.au Link: http://lkml.kernel.org/r/20191225172546.GB13378@avx2 Signed-off-by: Alexey Dobriyan Signed-off-by: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab_common.c | 15 +++++++-------- mm/swapfile.c | 14 +++++++------- 2 files changed, 14 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/slab_common.c b/mm/slab_common.c index 0c63c0d3dd38..1907cb2903c7 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1580,18 +1580,17 @@ static int slabinfo_open(struct inode *inode, struct file *file) return seq_open(file, &slabinfo_op); } -static const struct file_operations proc_slabinfo_operations = { - .open = slabinfo_open, - .read = seq_read, - .write = slabinfo_write, - .llseek = seq_lseek, - .release = seq_release, +static const struct proc_ops slabinfo_proc_ops = { + .proc_open = slabinfo_open, + .proc_read = seq_read, + .proc_write = slabinfo_write, + .proc_lseek = seq_lseek, + .proc_release = seq_release, }; static int __init slab_proc_init(void) { - proc_create("slabinfo", SLABINFO_RIGHTS, NULL, - &proc_slabinfo_operations); + proc_create("slabinfo", SLABINFO_RIGHTS, NULL, &slabinfo_proc_ops); return 0; } module_init(slab_proc_init); diff --git a/mm/swapfile.c b/mm/swapfile.c index 6febae9ad3cd..2c33ff456ed5 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2796,17 +2796,17 @@ static int swaps_open(struct inode *inode, struct file *file) return 0; } -static const struct file_operations proc_swaps_operations = { - .open = swaps_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, - .poll = swaps_poll, +static const struct 
proc_ops swaps_proc_ops = { + .proc_open = swaps_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = seq_release, + .proc_poll = swaps_poll, }; static int __init procswaps_init(void) { - proc_create("swaps", 0, NULL, &proc_swaps_operations); + proc_create("swaps", 0, NULL, &swaps_proc_ops); return 0; } __initcall(procswaps_init); -- cgit v1.2.3
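As a companion to the proc_ops conversion above, here is a minimal, self-contained sketch of how an out-of-tree module would register a /proc file against the new interface. It is illustrative only and not part of the series: the names demo_show, demo_open, demo_proc_ops and the "proc_ops_demo" entry are hypothetical, and the sketch assumes a kernel new enough to provide struct proc_ops (v5.6+).

/* Illustrative sketch only; not part of the patch series above. */
#include <linux/init.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

/* seq_file "show" callback: emits the file contents on each read. */
static int demo_show(struct seq_file *m, void *v)
{
	seq_puts(m, "hello from proc_ops\n");
	return 0;
}

static int demo_open(struct inode *inode, struct file *file)
{
	return single_open(file, demo_show, NULL);
}

/* Previously this would have been a struct file_operations with
 * .open/.read/.llseek/.release (and ".owner = THIS_MODULE"). */
static const struct proc_ops demo_proc_ops = {
	.proc_open	= demo_open,
	.proc_read	= seq_read,
	.proc_lseek	= seq_lseek,
	.proc_release	= single_release,
};

static int __init demo_init(void)
{
	proc_create("proc_ops_demo", 0444, NULL, &demo_proc_ops);
	return 0;
}

static void __exit demo_exit(void)
{
	remove_proc_entry("proc_ops_demo", NULL);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

The changelog's conversion rule maps directly onto this sketch: .open/.read/.llseek/.release become .proc_open/.proc_read/.proc_lseek/.proc_release, and the ".owner = THIS_MODULE" line is simply dropped because struct proc_ops has no owner field.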