aboutsummaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig1
-rw-r--r--mm/balloon_compaction.c2
-rw-r--r--mm/cma_debug.c2
-rw-r--r--mm/debug.c6
-rw-r--r--mm/early_ioremap.c28
-rw-r--r--mm/filemap.c153
-rw-r--r--mm/gup.c2
-rw-r--r--mm/huge_memory.c77
-rw-r--r--mm/hugetlb.c80
-rw-r--r--mm/internal.h17
-rw-r--r--mm/kasan/kasan.c4
-rw-r--r--mm/kasan/report.c1
-rw-r--r--mm/ksm.c5
-rw-r--r--mm/madvise.c22
-rw-r--r--mm/memblock.c38
-rw-r--r--mm/memcontrol.c85
-rw-r--r--mm/memory-failure.c2
-rw-r--r--mm/memory.c240
-rw-r--r--mm/memory_hotplug.c114
-rw-r--r--mm/mempolicy.c5
-rw-r--r--mm/migrate.c17
-rw-r--r--mm/mmap.c46
-rw-r--r--mm/mmu_notifier.c14
-rw-r--r--mm/mprotect.c5
-rw-r--r--mm/mremap.c21
-rw-r--r--mm/nobootmem.c16
-rw-r--r--mm/nommu.c4
-rw-r--r--mm/oom_kill.c24
-rw-r--r--mm/page-writeback.c19
-rw-r--r--mm/page_alloc.c529
-rw-r--r--mm/page_ext.c6
-rw-r--r--mm/page_idle.c2
-rw-r--r--mm/page_io.c28
-rw-r--r--mm/page_owner.c68
-rw-r--r--mm/percpu-internal.h82
-rw-r--r--mm/percpu-km.c2
-rw-r--r--mm/percpu-stats.c111
-rw-r--r--mm/percpu.c1522
-rw-r--r--mm/rmap.c71
-rw-r--r--mm/shmem.c222
-rw-r--r--mm/slab.h6
-rw-r--r--mm/slob.c6
-rw-r--r--mm/slub.c55
-rw-r--r--mm/sparse-vmemmap.c11
-rw-r--r--mm/sparse.c10
-rw-r--r--mm/swap.c24
-rw-r--r--mm/swap_state.c314
-rw-r--r--mm/swapfile.c362
-rw-r--r--mm/userfaultfd.c48
-rw-r--r--mm/util.c4
-rw-r--r--mm/vmalloc.c33
-rw-r--r--mm/vmscan.c126
-rw-r--r--mm/vmstat.c15
-rw-r--r--mm/z3fold.c479
-rw-r--r--mm/zsmalloc.c9
55 files changed, 3438 insertions, 1757 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 48b1af447fa7..0ded10a22639 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -678,6 +678,7 @@ config ZONE_DEVICE
depends on MEMORY_HOTREMOVE
depends on SPARSEMEM_VMEMMAP
depends on ARCH_HAS_ZONE_DEVICE
+ select RADIX_TREE_MULTIORDER
help
Device memory hotplug support allows for establishing pmem,
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
index 9075aa54e955..b06d9fe23a28 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon_compaction.c
@@ -24,7 +24,7 @@ struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info)
{
unsigned long flags;
struct page *page = alloc_page(balloon_mapping_gfp_mask() |
- __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_ZERO);
+ __GFP_NOMEMALLOC | __GFP_NORETRY);
if (!page)
return NULL;
diff --git a/mm/cma_debug.c b/mm/cma_debug.c
index 595b757bef72..c03ccbc405a0 100644
--- a/mm/cma_debug.c
+++ b/mm/cma_debug.c
@@ -167,7 +167,7 @@ static void cma_debugfs_add_one(struct cma *cma, int idx)
char name[16];
int u32s;
- sprintf(name, "cma-%s", cma->name);
+ scnprintf(name, sizeof(name), "cma-%s", cma->name);
tmp = debugfs_create_dir(name, cma_debugfs_root);
diff --git a/mm/debug.c b/mm/debug.c
index db1cd26d8752..5715448ab0b5 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -124,9 +124,7 @@ void dump_mm(const struct mm_struct *mm)
#ifdef CONFIG_NUMA_BALANCING
"numa_next_scan %lu numa_scan_offset %lu numa_scan_seq %d\n"
#endif
-#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
"tlb_flush_pending %d\n"
-#endif
"def_flags: %#lx(%pGv)\n",
mm, mm->mmap, mm->vmacache_seqnum, mm->task_size,
@@ -158,9 +156,7 @@ void dump_mm(const struct mm_struct *mm)
#ifdef CONFIG_NUMA_BALANCING
mm->numa_next_scan, mm->numa_scan_offset, mm->numa_scan_seq,
#endif
-#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
- mm->tlb_flush_pending,
-#endif
+ atomic_read(&mm->tlb_flush_pending),
mm->def_flags, &mm->def_flags
);
}
diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c
index 6d5717bd7197..b1dd4a948fc0 100644
--- a/mm/early_ioremap.c
+++ b/mm/early_ioremap.c
@@ -30,6 +30,13 @@ early_param("early_ioremap_debug", early_ioremap_debug_setup);
static int after_paging_init __initdata;
+pgprot_t __init __weak early_memremap_pgprot_adjust(resource_size_t phys_addr,
+ unsigned long size,
+ pgprot_t prot)
+{
+ return prot;
+}
+
void __init __weak early_ioremap_shutdown(void)
{
}
@@ -215,14 +222,29 @@ early_ioremap(resource_size_t phys_addr, unsigned long size)
void __init *
early_memremap(resource_size_t phys_addr, unsigned long size)
{
- return (__force void *)__early_ioremap(phys_addr, size,
- FIXMAP_PAGE_NORMAL);
+ pgprot_t prot = early_memremap_pgprot_adjust(phys_addr, size,
+ FIXMAP_PAGE_NORMAL);
+
+ return (__force void *)__early_ioremap(phys_addr, size, prot);
}
#ifdef FIXMAP_PAGE_RO
void __init *
early_memremap_ro(resource_size_t phys_addr, unsigned long size)
{
- return (__force void *)__early_ioremap(phys_addr, size, FIXMAP_PAGE_RO);
+ pgprot_t prot = early_memremap_pgprot_adjust(phys_addr, size,
+ FIXMAP_PAGE_RO);
+
+ return (__force void *)__early_ioremap(phys_addr, size, prot);
+}
+#endif
+
+#ifdef CONFIG_ARCH_USE_MEMREMAP_PROT
+void __init *
+early_memremap_prot(resource_size_t phys_addr, unsigned long size,
+ unsigned long prot_val)
+{
+ return (__force void *)__early_ioremap(phys_addr, size,
+ __pgprot(prot_val));
}
#endif
diff --git a/mm/filemap.c b/mm/filemap.c
index a49702445ce0..9d21afd692b9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -130,17 +130,8 @@ static int page_cache_tree_insert(struct address_space *mapping,
return -EEXIST;
mapping->nrexceptional--;
- if (!dax_mapping(mapping)) {
- if (shadowp)
- *shadowp = p;
- } else {
- /* DAX can replace empty locked entry with a hole */
- WARN_ON_ONCE(p !=
- dax_radix_locked_entry(0, RADIX_DAX_EMPTY));
- /* Wakeup waiters for exceptional entry lock */
- dax_wake_mapping_entry_waiter(mapping, page->index, p,
- true);
- }
+ if (shadowp)
+ *shadowp = p;
}
__radix_tree_replace(&mapping->page_tree, node, slot, page,
workingset_update_node, mapping);
@@ -402,8 +393,7 @@ bool filemap_range_has_page(struct address_space *mapping,
{
pgoff_t index = start_byte >> PAGE_SHIFT;
pgoff_t end = end_byte >> PAGE_SHIFT;
- struct pagevec pvec;
- bool ret;
+ struct page *page;
if (end_byte < start_byte)
return false;
@@ -411,12 +401,10 @@ bool filemap_range_has_page(struct address_space *mapping,
if (mapping->nrpages == 0)
return false;
- pagevec_init(&pvec, 0);
- if (!pagevec_lookup(&pvec, mapping, index, 1))
+ if (!find_get_pages_range(mapping, &index, end, 1, &page))
return false;
- ret = (pvec.pages[0]->index <= end);
- pagevec_release(&pvec);
- return ret;
+ put_page(page);
+ return true;
}
EXPORT_SYMBOL(filemap_range_has_page);
@@ -476,6 +464,29 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
EXPORT_SYMBOL(filemap_fdatawait_range);
/**
+ * file_fdatawait_range - wait for writeback to complete
+ * @file: file pointing to address space structure to wait for
+ * @start_byte: offset in bytes where the range starts
+ * @end_byte: offset in bytes where the range ends (inclusive)
+ *
+ * Walk the list of under-writeback pages of the address space that file
+ * refers to, in the given range and wait for all of them. Check error
+ * status of the address space vs. the file->f_wb_err cursor and return it.
+ *
+ * Since the error status of the file is advanced by this function,
+ * callers are responsible for checking the return value and handling and/or
+ * reporting the error.
+ */
+int file_fdatawait_range(struct file *file, loff_t start_byte, loff_t end_byte)
+{
+ struct address_space *mapping = file->f_mapping;
+
+ __filemap_fdatawait_range(mapping, start_byte, end_byte);
+ return file_check_and_advance_wb_err(file);
+}
+EXPORT_SYMBOL(file_fdatawait_range);
+
+/**
* filemap_fdatawait_keep_errors - wait for writeback without clearing errors
* @mapping: address space structure to wait for
*
@@ -489,45 +500,22 @@ EXPORT_SYMBOL(filemap_fdatawait_range);
*/
int filemap_fdatawait_keep_errors(struct address_space *mapping)
{
- loff_t i_size = i_size_read(mapping->host);
-
- if (i_size == 0)
- return 0;
-
- __filemap_fdatawait_range(mapping, 0, i_size - 1);
+ __filemap_fdatawait_range(mapping, 0, LLONG_MAX);
return filemap_check_and_keep_errors(mapping);
}
EXPORT_SYMBOL(filemap_fdatawait_keep_errors);
-/**
- * filemap_fdatawait - wait for all under-writeback pages to complete
- * @mapping: address space structure to wait for
- *
- * Walk the list of under-writeback pages of the given address space
- * and wait for all of them. Check error status of the address space
- * and return it.
- *
- * Since the error status of the address space is cleared by this function,
- * callers are responsible for checking the return value and handling and/or
- * reporting the error.
- */
-int filemap_fdatawait(struct address_space *mapping)
+static bool mapping_needs_writeback(struct address_space *mapping)
{
- loff_t i_size = i_size_read(mapping->host);
-
- if (i_size == 0)
- return 0;
-
- return filemap_fdatawait_range(mapping, 0, i_size - 1);
+ return (!dax_mapping(mapping) && mapping->nrpages) ||
+ (dax_mapping(mapping) && mapping->nrexceptional);
}
-EXPORT_SYMBOL(filemap_fdatawait);
int filemap_write_and_wait(struct address_space *mapping)
{
int err = 0;
- if ((!dax_mapping(mapping) && mapping->nrpages) ||
- (dax_mapping(mapping) && mapping->nrexceptional)) {
+ if (mapping_needs_writeback(mapping)) {
err = filemap_fdatawrite(mapping);
/*
* Even if the above returned error, the pages may be
@@ -566,8 +554,7 @@ int filemap_write_and_wait_range(struct address_space *mapping,
{
int err = 0;
- if ((!dax_mapping(mapping) && mapping->nrpages) ||
- (dax_mapping(mapping) && mapping->nrexceptional)) {
+ if (mapping_needs_writeback(mapping)) {
err = __filemap_fdatawrite_range(mapping, lstart, lend,
WB_SYNC_ALL);
/* See comment of filemap_write_and_wait() */
@@ -589,7 +576,7 @@ EXPORT_SYMBOL(filemap_write_and_wait_range);
void __filemap_set_wb_err(struct address_space *mapping, int err)
{
- errseq_t eseq = __errseq_set(&mapping->wb_err, err);
+ errseq_t eseq = errseq_set(&mapping->wb_err, err);
trace_filemap_set_wb_err(mapping, eseq);
}
@@ -656,8 +643,7 @@ int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend)
int err = 0, err2;
struct address_space *mapping = file->f_mapping;
- if ((!dax_mapping(mapping) && mapping->nrpages) ||
- (dax_mapping(mapping) && mapping->nrexceptional)) {
+ if (mapping_needs_writeback(mapping)) {
err = __filemap_fdatawrite_range(mapping, lstart, lend,
WB_SYNC_ALL);
/* See comment of filemap_write_and_wait() */
@@ -885,6 +871,7 @@ void __init pagecache_init(void)
page_writeback_init();
}
+/* This has the same layout as wait_bit_key - see fs/cachefiles/rdwr.c */
struct wait_page_key {
struct page *page;
int bit_nr;
@@ -909,8 +896,10 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync,
if (wait_page->bit_nr != key->bit_nr)
return 0;
+
+ /* Stop walking if it's locked */
if (test_bit(key->bit_nr, &key->page->flags))
- return 0;
+ return -1;
return autoremove_wake_function(wait, mode, sync, key);
}
@@ -964,6 +953,7 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
int ret = 0;
init_wait(wait);
+ wait->flags = lock ? WQ_FLAG_EXCLUSIVE : 0;
wait->func = wake_page_function;
wait_page.page = page;
wait_page.bit_nr = bit_nr;
@@ -972,10 +962,7 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
spin_lock_irq(&q->lock);
if (likely(list_empty(&wait->entry))) {
- if (lock)
- __add_wait_queue_entry_tail_exclusive(q, wait);
- else
- __add_wait_queue(q, wait);
+ __add_wait_queue_entry_tail(q, wait);
SetPageWaiters(page);
}
@@ -985,10 +972,6 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
if (likely(test_bit(bit_nr, &page->flags))) {
io_schedule();
- if (unlikely(signal_pending_state(state, current))) {
- ret = -EINTR;
- break;
- }
}
if (lock) {
@@ -998,6 +981,11 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
if (!test_bit(bit_nr, &page->flags))
break;
}
+
+ if (unlikely(signal_pending_state(state, current))) {
+ ret = -EINTR;
+ break;
+ }
}
finish_wait(q, wait);
@@ -1039,7 +1027,7 @@ void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter)
unsigned long flags;
spin_lock_irqsave(&q->lock, flags);
- __add_wait_queue(q, waiter);
+ __add_wait_queue_entry_tail(q, waiter);
SetPageWaiters(page);
spin_unlock_irqrestore(&q->lock, flags);
}
@@ -1564,23 +1552,29 @@ export:
}
/**
- * find_get_pages - gang pagecache lookup
+ * find_get_pages_range - gang pagecache lookup
* @mapping: The address_space to search
* @start: The starting page index
+ * @end: The final page index (inclusive)
* @nr_pages: The maximum number of pages
* @pages: Where the resulting pages are placed
*
- * find_get_pages() will search for and return a group of up to
- * @nr_pages pages in the mapping. The pages are placed at @pages.
- * find_get_pages() takes a reference against the returned pages.
+ * find_get_pages_range() will search for and return a group of up to @nr_pages
+ * pages in the mapping starting at index @start and up to index @end
+ * (inclusive). The pages are placed at @pages. find_get_pages_range() takes
+ * a reference against the returned pages.
*
* The search returns a group of mapping-contiguous pages with ascending
* indexes. There may be holes in the indices due to not-present pages.
+ * We also update @start to index the next page for the traversal.
*
- * find_get_pages() returns the number of pages which were found.
+ * find_get_pages_range() returns the number of pages which were found. If this
+ * number is smaller than @nr_pages, the end of specified range has been
+ * reached.
*/
-unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
- unsigned int nr_pages, struct page **pages)
+unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
+ pgoff_t end, unsigned int nr_pages,
+ struct page **pages)
{
struct radix_tree_iter iter;
void **slot;
@@ -1590,8 +1584,11 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
return 0;
rcu_read_lock();
- radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
+ radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, *start) {
struct page *head, *page;
+
+ if (iter.index > end)
+ break;
repeat:
page = radix_tree_deref_slot(slot);
if (unlikely(!page))
@@ -1627,11 +1624,25 @@ repeat:
}
pages[ret] = page;
- if (++ret == nr_pages)
- break;
+ if (++ret == nr_pages) {
+ *start = pages[ret - 1]->index + 1;
+ goto out;
+ }
}
+ /*
+ * We come here when there is no page beyond @end. We take care to not
+ * overflow the index @start as it confuses some of the callers. This
+ * breaks the iteration when there is page at index -1 but that is
+ * already broken anyway.
+ */
+ if (end == (pgoff_t)-1)
+ *start = (pgoff_t)-1;
+ else
+ *start = end + 1;
+out:
rcu_read_unlock();
+
return ret;
}
diff --git a/mm/gup.c b/mm/gup.c
index 23f01c40c88f..33d651deeae2 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1352,7 +1352,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
}
#endif /* __HAVE_ARCH_PTE_SPECIAL */
-#ifdef __HAVE_ARCH_PTE_DEVMAP
+#if defined(__HAVE_ARCH_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
static int __gup_device_huge(unsigned long pfn, unsigned long addr,
unsigned long end, struct page **pages, int *nr)
{
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 86975dec0ba1..0b51e70e0a8b 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -32,6 +32,7 @@
#include <linux/userfaultfd_k.h>
#include <linux/page_idle.h>
#include <linux/shmem_fs.h>
+#include <linux/oom.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>
@@ -327,7 +328,7 @@ static struct attribute *hugepage_attr[] = {
NULL,
};
-static struct attribute_group hugepage_attr_group = {
+static const struct attribute_group hugepage_attr_group = {
.attrs = hugepage_attr,
};
@@ -550,6 +551,7 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
struct mem_cgroup *memcg;
pgtable_t pgtable;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
+ int ret = 0;
VM_BUG_ON_PAGE(!PageCompound(page), page);
@@ -561,12 +563,11 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
pgtable = pte_alloc_one(vma->vm_mm, haddr);
if (unlikely(!pgtable)) {
- mem_cgroup_cancel_charge(page, memcg, true);
- put_page(page);
- return VM_FAULT_OOM;
+ ret = VM_FAULT_OOM;
+ goto release;
}
- clear_huge_page(page, haddr, HPAGE_PMD_NR);
+ clear_huge_page(page, vmf->address, HPAGE_PMD_NR);
/*
* The memory barrier inside __SetPageUptodate makes sure that
* clear_huge_page writes become visible before the set_pmd_at()
@@ -576,13 +577,14 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
if (unlikely(!pmd_none(*vmf->pmd))) {
- spin_unlock(vmf->ptl);
- mem_cgroup_cancel_charge(page, memcg, true);
- put_page(page);
- pte_free(vma->vm_mm, pgtable);
+ goto unlock_release;
} else {
pmd_t entry;
+ ret = check_stable_address_space(vma->vm_mm);
+ if (ret)
+ goto unlock_release;
+
/* Deliver the page fault to userland */
if (userfaultfd_missing(vma)) {
int ret;
@@ -610,6 +612,15 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
}
return 0;
+unlock_release:
+ spin_unlock(vmf->ptl);
+release:
+ if (pgtable)
+ pte_free(vma->vm_mm, pgtable);
+ mem_cgroup_cancel_charge(page, memcg, true);
+ put_page(page);
+ return ret;
+
}
/*
@@ -688,7 +699,10 @@ int do_huge_pmd_anonymous_page(struct vm_fault *vmf)
ret = 0;
set = false;
if (pmd_none(*vmf->pmd)) {
- if (userfaultfd_missing(vma)) {
+ ret = check_stable_address_space(vma->vm_mm);
+ if (ret) {
+ spin_unlock(vmf->ptl);
+ } else if (userfaultfd_missing(vma)) {
spin_unlock(vmf->ptl);
ret = handle_userfault(vmf, VM_UFFD_MISSING);
VM_BUG_ON(ret & VM_FAULT_FALLBACK);
@@ -1226,15 +1240,29 @@ int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
* We can only reuse the page if nobody else maps the huge page or it's
* part.
*/
- if (page_trans_huge_mapcount(page, NULL) == 1) {
+ if (!trylock_page(page)) {
+ get_page(page);
+ spin_unlock(vmf->ptl);
+ lock_page(page);
+ spin_lock(vmf->ptl);
+ if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
+ unlock_page(page);
+ put_page(page);
+ goto out_unlock;
+ }
+ put_page(page);
+ }
+ if (reuse_swap_page(page, NULL)) {
pmd_t entry;
entry = pmd_mkyoung(orig_pmd);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
ret |= VM_FAULT_WRITE;
+ unlock_page(page);
goto out_unlock;
}
+ unlock_page(page);
get_page(page);
spin_unlock(vmf->ptl);
alloc:
@@ -1277,7 +1305,7 @@ alloc:
count_vm_event(THP_FAULT_ALLOC);
if (!page)
- clear_huge_page(new_page, haddr, HPAGE_PMD_NR);
+ clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR);
else
copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
__SetPageUptodate(new_page);
@@ -1496,10 +1524,25 @@ int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
}
/*
+ * Since we took the NUMA fault, we must have observed the !accessible
+ * bit. Make sure all other CPUs agree with that, to avoid them
+ * modifying the page we're about to migrate.
+ *
+ * Must be done under PTL such that we'll observe the relevant
+ * inc_tlb_flush_pending().
+ *
+ * We are not sure a pending tlb flush here is for a huge page
+ * mapping or not. Hence use the tlb range variant
+ */
+ if (mm_tlb_flush_pending(vma->vm_mm))
+ flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE);
+
+ /*
* Migrate the THP to the requested node, returns with page unlocked
* and access rights restored.
*/
spin_unlock(vmf->ptl);
+
migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma,
vmf->pmd, pmd, vmf->address, page, target_nid);
if (migrated) {
@@ -2438,6 +2481,9 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(!PageCompound(page), page);
+ if (PageWriteback(page))
+ return -EBUSY;
+
if (PageAnon(head)) {
/*
* The caller does not necessarily hold an mmap_sem that would
@@ -2515,7 +2561,12 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
__dec_node_page_state(page, NR_SHMEM_THPS);
spin_unlock(&pgdata->split_queue_lock);
__split_huge_page(page, list, flags);
- ret = 0;
+ if (PageSwapCache(head)) {
+ swp_entry_t entry = { .val = page_private(head) };
+
+ ret = split_swap_cluster(entry);
+ } else
+ ret = 0;
} else {
if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
pr_alert("total_mapcount: %u, page_count(): %u\n",
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index bc48ee783dd9..424b0ef08a60 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1066,11 +1066,11 @@ static void free_gigantic_page(struct page *page, unsigned int order)
}
static int __alloc_gigantic_page(unsigned long start_pfn,
- unsigned long nr_pages)
+ unsigned long nr_pages, gfp_t gfp_mask)
{
unsigned long end_pfn = start_pfn + nr_pages;
return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
- GFP_KERNEL);
+ gfp_mask);
}
static bool pfn_range_valid_gigantic(struct zone *z,
@@ -1108,19 +1108,24 @@ static bool zone_spans_last_pfn(const struct zone *zone,
return zone_spans_pfn(zone, last_pfn);
}
-static struct page *alloc_gigantic_page(int nid, unsigned int order)
+static struct page *alloc_gigantic_page(int nid, struct hstate *h)
{
+ unsigned int order = huge_page_order(h);
unsigned long nr_pages = 1 << order;
unsigned long ret, pfn, flags;
- struct zone *z;
+ struct zonelist *zonelist;
+ struct zone *zone;
+ struct zoneref *z;
+ gfp_t gfp_mask;
- z = NODE_DATA(nid)->node_zones;
- for (; z - NODE_DATA(nid)->node_zones < MAX_NR_ZONES; z++) {
- spin_lock_irqsave(&z->lock, flags);
+ gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
+ zonelist = node_zonelist(nid, gfp_mask);
+ for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), NULL) {
+ spin_lock_irqsave(&zone->lock, flags);
- pfn = ALIGN(z->zone_start_pfn, nr_pages);
- while (zone_spans_last_pfn(z, pfn, nr_pages)) {
- if (pfn_range_valid_gigantic(z, pfn, nr_pages)) {
+ pfn = ALIGN(zone->zone_start_pfn, nr_pages);
+ while (zone_spans_last_pfn(zone, pfn, nr_pages)) {
+ if (pfn_range_valid_gigantic(zone, pfn, nr_pages)) {
/*
* We release the zone lock here because
* alloc_contig_range() will also lock the zone
@@ -1128,16 +1133,16 @@ static struct page *alloc_gigantic_page(int nid, unsigned int order)
* spinning on this lock, it may win the race
* and cause alloc_contig_range() to fail...
*/
- spin_unlock_irqrestore(&z->lock, flags);
- ret = __alloc_gigantic_page(pfn, nr_pages);
+ spin_unlock_irqrestore(&zone->lock, flags);
+ ret = __alloc_gigantic_page(pfn, nr_pages, gfp_mask);
if (!ret)
return pfn_to_page(pfn);
- spin_lock_irqsave(&z->lock, flags);
+ spin_lock_irqsave(&zone->lock, flags);
}
pfn += nr_pages;
}
- spin_unlock_irqrestore(&z->lock, flags);
+ spin_unlock_irqrestore(&zone->lock, flags);
}
return NULL;
@@ -1150,7 +1155,7 @@ static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid)
{
struct page *page;
- page = alloc_gigantic_page(nid, huge_page_order(h));
+ page = alloc_gigantic_page(nid, h);
if (page) {
prep_compound_gigantic_page(page, huge_page_order(h));
prep_new_huge_page(h, page, nid);
@@ -2083,7 +2088,9 @@ struct page *alloc_huge_page_noerr(struct vm_area_struct *vma,
return page;
}
-int __weak alloc_bootmem_huge_page(struct hstate *h)
+int alloc_bootmem_huge_page(struct hstate *h)
+ __attribute__ ((weak, alias("__alloc_bootmem_huge_page")));
+int __alloc_bootmem_huge_page(struct hstate *h)
{
struct huge_bootmem_page *m;
int nr_nodes, node;
@@ -2569,13 +2576,13 @@ static struct attribute *hstate_attrs[] = {
NULL,
};
-static struct attribute_group hstate_attr_group = {
+static const struct attribute_group hstate_attr_group = {
.attrs = hstate_attrs,
};
static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
struct kobject **hstate_kobjs,
- struct attribute_group *hstate_attr_group)
+ const struct attribute_group *hstate_attr_group)
{
int retval;
int hi = hstate_index(h);
@@ -2633,7 +2640,7 @@ static struct attribute *per_node_hstate_attrs[] = {
NULL,
};
-static struct attribute_group per_node_hstate_attr_group = {
+static const struct attribute_group per_node_hstate_attr_group = {
.attrs = per_node_hstate_attrs,
};
@@ -4062,9 +4069,9 @@ out:
return ret;
out_release_unlock:
spin_unlock(ptl);
-out_release_nounlock:
if (vm_shared)
unlock_page(page);
+out_release_nounlock:
put_page(page);
goto out;
}
@@ -4078,6 +4085,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long vaddr = *position;
unsigned long remainder = *nr_pages;
struct hstate *h = hstate_vma(vma);
+ int err = -EFAULT;
while (vaddr < vma->vm_end && remainder) {
pte_t *pte;
@@ -4154,11 +4162,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
}
ret = hugetlb_fault(mm, vma, vaddr, fault_flags);
if (ret & VM_FAULT_ERROR) {
- int err = vm_fault_to_errno(ret, flags);
-
- if (err)
- return err;
-
+ err = vm_fault_to_errno(ret, flags);
remainder = 0;
break;
}
@@ -4213,7 +4217,7 @@ same_page:
*/
*position = vaddr;
- return i ? i : -EFAULT;
+ return i ? i : err;
}
#ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE
@@ -4603,6 +4607,15 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
return pte;
}
+/*
+ * huge_pte_offset() - Walk the page table to resolve the hugepage
+ * entry at address @addr
+ *
+ * Return: Pointer to page table or swap entry (PUD or PMD) for
+ * address @addr, or NULL if a p*d_none() entry is encountered and the
+ * size @sz doesn't match the hugepage size at this level of the page
+ * table.
+ */
pte_t *huge_pte_offset(struct mm_struct *mm,
unsigned long addr, unsigned long sz)
{
@@ -4617,13 +4630,22 @@ pte_t *huge_pte_offset(struct mm_struct *mm,
p4d = p4d_offset(pgd, addr);
if (!p4d_present(*p4d))
return NULL;
+
pud = pud_offset(p4d, addr);
- if (!pud_present(*pud))
+ if (sz != PUD_SIZE && pud_none(*pud))
return NULL;
- if (pud_huge(*pud))
+ /* hugepage or swap? */
+ if (pud_huge(*pud) || !pud_present(*pud))
return (pte_t *)pud;
+
pmd = pmd_offset(pud, addr);
- return (pte_t *) pmd;
+ if (sz != PMD_SIZE && pmd_none(*pmd))
+ return NULL;
+ /* hugepage or swap? */
+ if (pmd_huge(*pmd) || !pmd_present(*pmd))
+ return (pte_t *)pmd;
+
+ return NULL;
}
#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
diff --git a/mm/internal.h b/mm/internal.h
index 24d88f084705..1df011f62480 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -480,6 +480,17 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
/* Mask to get the watermark bits */
#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1)
+/*
+ * Only MMU archs have async oom victim reclaim - aka oom_reaper so we
+ * cannot assume a reduced access to memory reserves is sufficient for
+ * !MMU
+ */
+#ifdef CONFIG_MMU
+#define ALLOC_OOM 0x08
+#else
+#define ALLOC_OOM ALLOC_NO_WATERMARKS
+#endif
+
#define ALLOC_HARDER 0x10 /* try to alloc harder */
#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
@@ -498,6 +509,7 @@ extern struct workqueue_struct *mm_percpu_wq;
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
void try_to_unmap_flush(void);
void try_to_unmap_flush_dirty(void);
+void flush_tlb_batched_pending(struct mm_struct *mm);
#else
static inline void try_to_unmap_flush(void)
{
@@ -505,7 +517,9 @@ static inline void try_to_unmap_flush(void)
static inline void try_to_unmap_flush_dirty(void)
{
}
-
+static inline void flush_tlb_batched_pending(struct mm_struct *mm)
+{
+}
#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
extern const struct trace_print_flags pageflag_names[];
@@ -522,4 +536,5 @@ static inline bool is_migrate_highatomic_page(struct page *page)
return get_pageblock_migratetype(page) == MIGRATE_HIGHATOMIC;
}
+void setup_zone_pageset(struct zone *zone);
#endif /* __MM_INTERNAL_H */
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index ca11bc4ce205..6f319fb81718 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -267,13 +267,13 @@ static void check_memory_region(unsigned long addr,
check_memory_region_inline(addr, size, write, ret_ip);
}
-void kasan_check_read(const void *p, unsigned int size)
+void kasan_check_read(const volatile void *p, unsigned int size)
{
check_memory_region((unsigned long)p, size, false, _RET_IP_);
}
EXPORT_SYMBOL(kasan_check_read);
-void kasan_check_write(const void *p, unsigned int size)
+void kasan_check_write(const volatile void *p, unsigned int size)
{
check_memory_region((unsigned long)p, size, true, _RET_IP_);
}
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 04bb1d3eb9ec..6bcfb01ba038 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -401,6 +401,7 @@ void kasan_report(unsigned long addr, size_t size,
disable_trace_on_warning();
info.access_addr = (void *)addr;
+ info.first_bad_addr = (void *)addr;
info.access_size = size;
info.is_write = is_write;
info.ip = ip;
diff --git a/mm/ksm.c b/mm/ksm.c
index 4dc92f138786..15dd7415f7b3 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1038,7 +1038,8 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
goto out_unlock;
if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) ||
- (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte))) {
+ (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte)) ||
+ mm_tlb_flush_pending(mm)) {
pte_t entry;
swapped = PageSwapCache(page);
@@ -3042,7 +3043,7 @@ static struct attribute *ksm_attrs[] = {
NULL,
};
-static struct attribute_group ksm_attr_group = {
+static const struct attribute_group ksm_attr_group = {
.attrs = ksm_attrs,
.name = "ksm",
};
diff --git a/mm/madvise.c b/mm/madvise.c
index 9976852f1e1c..eea1c733286f 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -80,6 +80,17 @@ static long madvise_behavior(struct vm_area_struct *vma,
}
new_flags &= ~VM_DONTCOPY;
break;
+ case MADV_WIPEONFORK:
+ /* MADV_WIPEONFORK is only supported on anonymous memory. */
+ if (vma->vm_file || vma->vm_flags & VM_SHARED) {
+ error = -EINVAL;
+ goto out;
+ }
+ new_flags |= VM_WIPEONFORK;
+ break;
+ case MADV_KEEPONFORK:
+ new_flags &= ~VM_WIPEONFORK;
+ break;
case MADV_DONTDUMP:
new_flags |= VM_DONTDUMP;
break;
@@ -320,6 +331,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
for (; addr != end; pte++, addr += PAGE_SIZE) {
ptent = *pte;
@@ -367,8 +379,8 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
pte_offset_map_lock(mm, pmd, addr, &ptl);
goto out;
}
- put_page(page);
unlock_page(page);
+ put_page(page);
pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
pte--;
addr -= PAGE_SIZE;
@@ -612,6 +624,7 @@ static int madvise_inject_error(int behavior,
unsigned long start, unsigned long end)
{
struct page *page;
+ struct zone *zone;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -645,6 +658,11 @@ static int madvise_inject_error(int behavior,
if (ret)
return ret;
}
+
+ /* Ensure that all poisoned pages are removed from per-cpu lists */
+ for_each_populated_zone(zone)
+ drain_all_pages(zone);
+
return 0;
}
#endif
@@ -689,6 +707,8 @@ madvise_behavior_valid(int behavior)
#endif
case MADV_DONTDUMP:
case MADV_DODUMP:
+ case MADV_WIPEONFORK:
+ case MADV_KEEPONFORK:
#ifdef CONFIG_MEMORY_FAILURE
case MADV_SOFT_OFFLINE:
case MADV_HWPOISON:
diff --git a/mm/memblock.c b/mm/memblock.c
index 2cb25fe4452c..91205780e6b1 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -285,31 +285,27 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u
}
#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
-
-phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info(
- phys_addr_t *addr)
-{
- if (memblock.reserved.regions == memblock_reserved_init_regions)
- return 0;
-
- *addr = __pa(memblock.reserved.regions);
-
- return PAGE_ALIGN(sizeof(struct memblock_region) *
- memblock.reserved.max);
-}
-
-phys_addr_t __init_memblock get_allocated_memblock_memory_regions_info(
- phys_addr_t *addr)
+/**
+ * Discard memory and reserved arrays if they were allocated
+ */
+void __init memblock_discard(void)
{
- if (memblock.memory.regions == memblock_memory_init_regions)
- return 0;
+ phys_addr_t addr, size;
- *addr = __pa(memblock.memory.regions);
+ if (memblock.reserved.regions != memblock_reserved_init_regions) {
+ addr = __pa(memblock.reserved.regions);
+ size = PAGE_ALIGN(sizeof(struct memblock_region) *
+ memblock.reserved.max);
+ __memblock_free_late(addr, size);
+ }
- return PAGE_ALIGN(sizeof(struct memblock_region) *
- memblock.memory.max);
+ if (memblock.memory.regions != memblock_memory_init_regions) {
+ addr = __pa(memblock.memory.regions);
+ size = PAGE_ALIGN(sizeof(struct memblock_region) *
+ memblock.memory.max);
+ __memblock_free_late(addr, size);
+ }
}
-
#endif
/**
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 3df3c04d73ab..6532b219b222 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -550,10 +550,12 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
* value, and reading all cpu value can be performance bottleneck in some
* common workload, threshold and synchronization as vmstat[] should be
* implemented.
+ *
+ * The parameter idx can be of type enum memcg_event_item or vm_event_item.
*/
static unsigned long memcg_sum_events(struct mem_cgroup *memcg,
- enum memcg_event_item event)
+ int event)
{
unsigned long val = 0;
int cpu;
@@ -917,7 +919,7 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
struct css_task_iter it;
struct task_struct *task;
- css_task_iter_start(&iter->css, &it);
+ css_task_iter_start(&iter->css, 0, &it);
while (!ret && (task = css_task_iter_next(&it)))
ret = fn(task, arg);
css_task_iter_end(&it);
@@ -1611,9 +1613,13 @@ cleanup:
* @page: the page
*
* This function protects unlocked LRU pages from being moved to
- * another cgroup and stabilizes their page->mem_cgroup binding.
+ * another cgroup.
+ *
+ * It ensures lifetime of the returned memcg. Caller is responsible
+ * for the lifetime of the page; __unlock_page_memcg() is available
+ * when @page might get freed inside the locked section.
*/
-void lock_page_memcg(struct page *page)
+struct mem_cgroup *lock_page_memcg(struct page *page)
{
struct mem_cgroup *memcg;
unsigned long flags;
@@ -1622,18 +1628,24 @@ void lock_page_memcg(struct page *page)
* The RCU lock is held throughout the transaction. The fast
* path can get away without acquiring the memcg->move_lock
* because page moving starts with an RCU grace period.
- */
+ *
+ * The RCU lock also protects the memcg from being freed when
+ * the page state that is going to change is the only thing
+ * preventing the page itself from being freed. E.g. writeback
+ * doesn't hold a page reference and relies on PG_writeback to
+ * keep off truncation, migration and so forth.
+ */
rcu_read_lock();
if (mem_cgroup_disabled())
- return;
+ return NULL;
again:
memcg = page->mem_cgroup;
if (unlikely(!memcg))
- return;
+ return NULL;
if (atomic_read(&memcg->moving_account) <= 0)
- return;
+ return memcg;
spin_lock_irqsave(&memcg->move_lock, flags);
if (memcg != page->mem_cgroup) {
@@ -1649,18 +1661,18 @@ again:
memcg->move_lock_task = current;
memcg->move_lock_flags = flags;
- return;
+ return memcg;
}
EXPORT_SYMBOL(lock_page_memcg);
/**
- * unlock_page_memcg - unlock a page->mem_cgroup binding
- * @page: the page
+ * __unlock_page_memcg - unlock and unpin a memcg
+ * @memcg: the memcg
+ *
+ * Unlock and unpin a memcg returned by lock_page_memcg().
*/
-void unlock_page_memcg(struct page *page)
+void __unlock_page_memcg(struct mem_cgroup *memcg)
{
- struct mem_cgroup *memcg = page->mem_cgroup;
-
if (memcg && memcg->move_lock_task == current) {
unsigned long flags = memcg->move_lock_flags;
@@ -1672,6 +1684,15 @@ void unlock_page_memcg(struct page *page)
rcu_read_unlock();
}
+
+/**
+ * unlock_page_memcg - unlock a page->mem_cgroup binding
+ * @page: the page
+ */
+void unlock_page_memcg(struct page *page)
+{
+ __unlock_page_memcg(page->mem_cgroup);
+}
EXPORT_SYMBOL(unlock_page_memcg);
/*
@@ -1896,7 +1917,7 @@ retry:
* bypass the last charges so that they can exit quickly and
* free their memory.
*/
- if (unlikely(test_thread_flag(TIF_MEMDIE) ||
+ if (unlikely(tsk_is_oom_victim(current) ||
fatal_signal_pending(current) ||
current->flags & PF_EXITING))
goto force;
@@ -4300,6 +4321,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
}
spin_unlock(&memcg->event_list_lock);
+ memcg->low = 0;
+
memcg_offline_kmem(memcg);
wb_memcg_offline(memcg);
@@ -4616,8 +4639,11 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
if (!ret || !target)
put_page(page);
}
- /* There is a swap entry and a page doesn't exist or isn't charged */
- if (ent.val && !ret &&
+ /*
+ * There is a swap entry and a page doesn't exist or isn't charged.
+ * But we cannot move a tail-page in a THP.
+ */
+ if (ent.val && !ret && (!page || !PageTransCompound(page)) &&
mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
ret = MC_TARGET_SWAP;
if (target)
@@ -4628,8 +4654,8 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
- * We don't consider swapping or file mapped pages because THP does not
- * support them for now.
+ * We don't consider PMD mapped swapping or file mapped pages because THP does
+ * not support them for now.
* Caller should make sure that pmd_trans_huge(pmd) is true.
*/
static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
@@ -5404,7 +5430,7 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
* in turn serializes uncharging.
*/
VM_BUG_ON_PAGE(!PageLocked(page), page);
- if (page->mem_cgroup)
+ if (compound_head(page)->mem_cgroup)
goto out;
if (do_swap_account) {
@@ -5887,6 +5913,7 @@ static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
{
struct mem_cgroup *memcg, *swap_memcg;
+ unsigned int nr_entries;
unsigned short oldid;
VM_BUG_ON_PAGE(PageLRU(page), page);
@@ -5907,19 +5934,24 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
* ancestor for the swap instead and transfer the memory+swap charge.
*/
swap_memcg = mem_cgroup_id_get_online(memcg);
- oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg), 1);
+ nr_entries = hpage_nr_pages(page);
+ /* Get references for the tail pages, too */
+ if (nr_entries > 1)
+ mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
+ oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg),
+ nr_entries);
VM_BUG_ON_PAGE(oldid, page);
- mem_cgroup_swap_statistics(swap_memcg, 1);
+ mem_cgroup_swap_statistics(swap_memcg, nr_entries);
page->mem_cgroup = NULL;
if (!mem_cgroup_is_root(memcg))
- page_counter_uncharge(&memcg->memory, 1);
+ page_counter_uncharge(&memcg->memory, nr_entries);
if (memcg != swap_memcg) {
if (!mem_cgroup_is_root(swap_memcg))
- page_counter_charge(&swap_memcg->memsw, 1);
- page_counter_uncharge(&memcg->memsw, 1);
+ page_counter_charge(&swap_memcg->memsw, nr_entries);
+ page_counter_uncharge(&memcg->memsw, nr_entries);
}
/*
@@ -5929,7 +5961,8 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
* only synchronisation we have for udpating the per-CPU variables.
*/
VM_BUG_ON(!irqs_disabled());
- mem_cgroup_charge_statistics(memcg, page, false, -1);
+ mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page),
+ -nr_entries);
memcg_check_events(memcg, page);
if (!mem_cgroup_is_root(memcg))
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 1cd3b3569af8..88366626c0b7 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1146,6 +1146,8 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
return 0;
}
+ arch_unmap_kpfn(pfn);
+
orig_head = hpage = compound_head(p);
num_poisoned_pages_inc();
diff --git a/mm/memory.c b/mm/memory.c
index 0e517be91a89..13ee83b43878 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -68,6 +68,7 @@
#include <linux/debugfs.h>
#include <linux/userfaultfd_k.h>
#include <linux/dax.h>
+#include <linux/oom.h>
#include <asm/io.h>
#include <asm/mmu_context.h>
@@ -215,12 +216,8 @@ static bool tlb_next_batch(struct mmu_gather *tlb)
return true;
}
-/* tlb_gather_mmu
- * Called to initialize an (on-stack) mmu_gather structure for page-table
- * tear-down from @mm. The @fullmm argument is used when @mm is without
- * users and we're going to destroy the full address space (exit/execve).
- */
-void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end)
+void arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
+ unsigned long start, unsigned long end)
{
tlb->mm = mm;
@@ -275,10 +272,14 @@ void tlb_flush_mmu(struct mmu_gather *tlb)
* Called at the end of the shootdown operation to free up any resources
* that were required.
*/
-void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
+void arch_tlb_finish_mmu(struct mmu_gather *tlb,
+ unsigned long start, unsigned long end, bool force)
{
struct mmu_gather_batch *batch, *next;
+ if (force)
+ __tlb_adjust_range(tlb, start, end - start);
+
tlb_flush_mmu(tlb);
/* keep the page table cache within bounds */
@@ -398,6 +399,34 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
+/* tlb_gather_mmu
+ * Called to initialize an (on-stack) mmu_gather structure for page-table
+ * tear-down from @mm. The @fullmm argument is used when @mm is without
+ * users and we're going to destroy the full address space (exit/execve).
+ */
+void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
+ unsigned long start, unsigned long end)
+{
+ arch_tlb_gather_mmu(tlb, mm, start, end);
+ inc_tlb_flush_pending(tlb->mm);
+}
+
+void tlb_finish_mmu(struct mmu_gather *tlb,
+ unsigned long start, unsigned long end)
+{
+ /*
+ * If there are parallel threads are doing PTE changes on same range
+ * under non-exclusive lock(e.g., mmap_sem read-side) but defer TLB
+ * flush by batching, a thread has stable TLB entry can fail to flush
+ * the TLB by observing pte_none|!pte_dirty, for example so flush TLB
+ * forcefully if we detect parallel PTE batching threads.
+ */
+ bool force = mm_tlb_flush_nested(tlb->mm);
+
+ arch_tlb_finish_mmu(tlb, start, end, force);
+ dec_tlb_flush_pending(tlb->mm);
+}
+
/*
* Note: this doesn't free the actual pages themselves. That
* has been handled earlier when unmapping all the memory regions.
@@ -1197,6 +1226,7 @@ again:
init_rss_vec(rss);
start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
pte = start_pte;
+ flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
do {
pte_t ptent = *pte;
@@ -1483,8 +1513,20 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
tlb_gather_mmu(&tlb, mm, start, end);
update_hiwater_rss(mm);
mmu_notifier_invalidate_range_start(mm, start, end);
- for ( ; vma && vma->vm_start < end; vma = vma->vm_next)
+ for ( ; vma && vma->vm_start < end; vma = vma->vm_next) {
unmap_single_vma(&tlb, vma, start, end, NULL);
+
+ /*
+ * zap_page_range does not specify whether mmap_sem should be
+ * held for read or write. That allows parallel zap_page_range
+ * operations to unmap a PTE and defer a flush meaning that
+ * this call observes pte_none and fails to flush the TLB.
+ * Rather than adding a complex API, ensure that no stale
+ * TLB entries exist when this call returns.
+ */
+ flush_tlb_range(vma, start, end);
+ }
+
mmu_notifier_invalidate_range_end(mm, start, end);
tlb_finish_mmu(&tlb, start, end);
}
@@ -1646,7 +1688,7 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
EXPORT_SYMBOL(vm_insert_page);
static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
- pfn_t pfn, pgprot_t prot)
+ pfn_t pfn, pgprot_t prot, bool mkwrite)
{
struct mm_struct *mm = vma->vm_mm;
int retval;
@@ -1658,14 +1700,35 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
if (!pte)
goto out;
retval = -EBUSY;
- if (!pte_none(*pte))
- goto out_unlock;
+ if (!pte_none(*pte)) {
+ if (mkwrite) {
+ /*
+ * For read faults on private mappings the PFN passed
+ * in may not match the PFN we have mapped if the
+ * mapped PFN is a writeable COW page. In the mkwrite
+ * case we are creating a writable PTE for a shared
+ * mapping and we expect the PFNs to match.
+ */
+ if (WARN_ON_ONCE(pte_pfn(*pte) != pfn_t_to_pfn(pfn)))
+ goto out_unlock;
+ entry = *pte;
+ goto out_mkwrite;
+ } else
+ goto out_unlock;
+ }
/* Ok, finally just insert the thing.. */
if (pfn_t_devmap(pfn))
entry = pte_mkdevmap(pfn_t_pte(pfn, prot));
else
entry = pte_mkspecial(pfn_t_pte(pfn, prot));
+
+out_mkwrite:
+ if (mkwrite) {
+ entry = pte_mkyoung(entry);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ }
+
set_pte_at(mm, addr, pte, entry);
update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */
@@ -1736,14 +1799,15 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
- ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot);
+ ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
+ false);
return ret;
}
EXPORT_SYMBOL(vm_insert_pfn_prot);
-int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
- pfn_t pfn)
+static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
+ pfn_t pfn, bool mkwrite)
{
pgprot_t pgprot = vma->vm_page_prot;
@@ -1772,10 +1836,24 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
page = pfn_to_page(pfn_t_to_pfn(pfn));
return insert_page(vma, addr, page, pgprot);
}
- return insert_pfn(vma, addr, pfn, pgprot);
+ return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
+}
+
+int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
+ pfn_t pfn)
+{
+ return __vm_insert_mixed(vma, addr, pfn, false);
+
}
EXPORT_SYMBOL(vm_insert_mixed);
+int vm_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr,
+ pfn_t pfn)
+{
+ return __vm_insert_mixed(vma, addr, pfn, true);
+}
+EXPORT_SYMBOL(vm_insert_mixed_mkwrite);
+
/*
* maps a range of physical memory into the requested pages. the old
* mappings are removed. any references to nonexistent pages results
@@ -2541,7 +2619,7 @@ static int do_wp_page(struct vm_fault *vmf)
* not dirty accountable.
*/
if (PageAnon(vmf->page) && !PageKsm(vmf->page)) {
- int total_mapcount;
+ int total_map_swapcount;
if (!trylock_page(vmf->page)) {
get_page(vmf->page);
pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -2556,8 +2634,8 @@ static int do_wp_page(struct vm_fault *vmf)
}
put_page(vmf->page);
}
- if (reuse_swap_page(vmf->page, &total_mapcount)) {
- if (total_mapcount == 1) {
+ if (reuse_swap_page(vmf->page, &total_map_swapcount)) {
+ if (total_map_swapcount == 1) {
/*
* The page is all ours. Move it to
* our anon_vma so the rmap code will
@@ -2674,16 +2752,23 @@ EXPORT_SYMBOL(unmap_mapping_range);
int do_swap_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
- struct page *page, *swapcache;
+ struct page *page = NULL, *swapcache;
struct mem_cgroup *memcg;
+ struct vma_swap_readahead swap_ra;
swp_entry_t entry;
pte_t pte;
int locked;
int exclusive = 0;
int ret = 0;
+ bool vma_readahead = swap_use_vma_readahead();
- if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
+ if (vma_readahead)
+ page = swap_readahead_detect(vmf, &swap_ra);
+ if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) {
+ if (page)
+ put_page(page);
goto out;
+ }
entry = pte_to_swp_entry(vmf->orig_pte);
if (unlikely(non_swap_entry(entry))) {
@@ -2699,10 +2784,16 @@ int do_swap_page(struct vm_fault *vmf)
goto out;
}
delayacct_set_flag(DELAYACCT_PF_SWAPIN);
- page = lookup_swap_cache(entry);
+ if (!page)
+ page = lookup_swap_cache(entry, vma_readahead ? vma : NULL,
+ vmf->address);
if (!page) {
- page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vma,
- vmf->address);
+ if (vma_readahead)
+ page = do_swap_page_readahead(entry,
+ GFP_HIGHUSER_MOVABLE, vmf, &swap_ra);
+ else
+ page = swapin_readahead(entry,
+ GFP_HIGHUSER_MOVABLE, vma, vmf->address);
if (!page) {
/*
* Back out if somebody else faulted in this pte
@@ -2864,6 +2955,7 @@ static int do_anonymous_page(struct vm_fault *vmf)
struct vm_area_struct *vma = vmf->vma;
struct mem_cgroup *memcg;
struct page *page;
+ int ret = 0;
pte_t entry;
/* File mapping without ->vm_ops ? */
@@ -2896,6 +2988,9 @@ static int do_anonymous_page(struct vm_fault *vmf)
vmf->address, &vmf->ptl);
if (!pte_none(*vmf->pte))
goto unlock;
+ ret = check_stable_address_space(vma->vm_mm);
+ if (ret)
+ goto unlock;
/* Deliver the page fault to userland, check inside PT lock */
if (userfaultfd_missing(vma)) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -2930,6 +3025,10 @@ static int do_anonymous_page(struct vm_fault *vmf)
if (!pte_none(*vmf->pte))
goto release;
+ ret = check_stable_address_space(vma->vm_mm);
+ if (ret)
+ goto release;
+
/* Deliver the page fault to userland, check inside PT lock */
if (userfaultfd_missing(vma)) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -2949,7 +3048,7 @@ setpte:
update_mmu_cache(vma, vmf->address, vmf->pte);
unlock:
pte_unmap_unlock(vmf->pte, vmf->ptl);
- return 0;
+ return ret;
release:
mem_cgroup_cancel_charge(page, memcg, false);
put_page(page);
@@ -3223,7 +3322,7 @@ int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
int finish_fault(struct vm_fault *vmf)
{
struct page *page;
- int ret;
+ int ret = 0;
/* Did we COW the page? */
if ((vmf->flags & FAULT_FLAG_WRITE) &&
@@ -3231,7 +3330,15 @@ int finish_fault(struct vm_fault *vmf)
page = vmf->cow_page;
else
page = vmf->page;
- ret = alloc_set_pte(vmf, vmf->memcg, page);
+
+ /*
+ * check even for read faults because we might have lost our CoWed
+ * page
+ */
+ if (!(vmf->vma->vm_flags & VM_SHARED))
+ ret = check_stable_address_space(vmf->vma->vm_mm);
+ if (!ret)
+ ret = alloc_set_pte(vmf, vmf->memcg, page);
if (vmf->pte)
pte_unmap_unlock(vmf->pte, vmf->ptl);
return ret;
@@ -3871,19 +3978,6 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
mem_cgroup_oom_synchronize(false);
}
- /*
- * This mm has been already reaped by the oom reaper and so the
- * refault cannot be trusted in general. Anonymous refaults would
- * lose data and give a zero page instead e.g. This is especially
- * problem for use_mm() because regular tasks will just die and
- * the corrupted data will not be visible anywhere while kthread
- * will outlive the oom victim and potentially propagate the date
- * further.
- */
- if (unlikely((current->flags & PF_KTHREAD) && !(ret & VM_FAULT_ERROR)
- && test_bit(MMF_UNSTABLE, &vma->vm_mm->flags)))
- ret = VM_FAULT_SIGBUS;
-
return ret;
}
EXPORT_SYMBOL_GPL(handle_mm_fault);
@@ -3975,7 +4069,8 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
#endif /* __PAGETABLE_PMD_FOLDED */
static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
- pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
+ unsigned long *start, unsigned long *end,
+ pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
{
pgd_t *pgd;
p4d_t *p4d;
@@ -4002,17 +4097,29 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
if (!pmdpp)
goto out;
+ if (start && end) {
+ *start = address & PMD_MASK;
+ *end = *start + PMD_SIZE;
+ mmu_notifier_invalidate_range_start(mm, *start, *end);
+ }
*ptlp = pmd_lock(mm, pmd);
if (pmd_huge(*pmd)) {
*pmdpp = pmd;
return 0;
}
spin_unlock(*ptlp);
+ if (start && end)
+ mmu_notifier_invalidate_range_end(mm, *start, *end);
}
if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
goto out;
+ if (start && end) {
+ *start = address & PAGE_MASK;
+ *end = *start + PAGE_SIZE;
+ mmu_notifier_invalidate_range_start(mm, *start, *end);
+ }
ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
if (!pte_present(*ptep))
goto unlock;
@@ -4020,6 +4127,8 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
return 0;
unlock:
pte_unmap_unlock(ptep, *ptlp);
+ if (start && end)
+ mmu_notifier_invalidate_range_end(mm, *start, *end);
out:
return -EINVAL;
}
@@ -4031,20 +4140,21 @@ static inline int follow_pte(struct mm_struct *mm, unsigned long address,
/* (void) is needed to make gcc happy */
(void) __cond_lock(*ptlp,
- !(res = __follow_pte_pmd(mm, address, ptepp, NULL,
- ptlp)));
+ !(res = __follow_pte_pmd(mm, address, NULL, NULL,
+ ptepp, NULL, ptlp)));
return res;
}
int follow_pte_pmd(struct mm_struct *mm, unsigned long address,
+ unsigned long *start, unsigned long *end,
pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
{
int res;
/* (void) is needed to make gcc happy */
(void) __cond_lock(*ptlp,
- !(res = __follow_pte_pmd(mm, address, ptepp, pmdpp,
- ptlp)));
+ !(res = __follow_pte_pmd(mm, address, start, end,
+ ptepp, pmdpp, ptlp)));
return res;
}
EXPORT_SYMBOL(follow_pte_pmd);
@@ -4307,19 +4417,53 @@ static void clear_gigantic_page(struct page *page,
}
}
void clear_huge_page(struct page *page,
- unsigned long addr, unsigned int pages_per_huge_page)
+ unsigned long addr_hint, unsigned int pages_per_huge_page)
{
- int i;
+ int i, n, base, l;
+ unsigned long addr = addr_hint &
+ ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1);
if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
clear_gigantic_page(page, addr, pages_per_huge_page);
return;
}
+ /* Clear sub-page to access last to keep its cache lines hot */
might_sleep();
- for (i = 0; i < pages_per_huge_page; i++) {
+ n = (addr_hint - addr) / PAGE_SIZE;
+ if (2 * n <= pages_per_huge_page) {
+ /* If sub-page to access in first half of huge page */
+ base = 0;
+ l = n;
+ /* Clear sub-pages at the end of huge page */
+ for (i = pages_per_huge_page - 1; i >= 2 * n; i--) {
+ cond_resched();
+ clear_user_highpage(page + i, addr + i * PAGE_SIZE);
+ }
+ } else {
+ /* If sub-page to access in second half of huge page */
+ base = pages_per_huge_page - 2 * (pages_per_huge_page - n);
+ l = pages_per_huge_page - n;
+ /* Clear sub-pages at the begin of huge page */
+ for (i = 0; i < base; i++) {
+ cond_resched();
+ clear_user_highpage(page + i, addr + i * PAGE_SIZE);
+ }
+ }
+ /*
+ * Clear remaining sub-pages in left-right-left-right pattern
+ * towards the sub-page to access
+ */
+ for (i = 0; i < l; i++) {
+ int left_idx = base + i;
+ int right_idx = base + 2 * l - 1 - i;
+
+ cond_resched();
+ clear_user_highpage(page + left_idx,
+ addr + left_idx * PAGE_SIZE);
cond_resched();
- clear_user_highpage(page + i, addr + i * PAGE_SIZE);
+ clear_user_highpage(page + right_idx,
+ addr + right_idx * PAGE_SIZE);
}
}
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 8dccc317aac2..73bf17df6899 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -773,31 +773,6 @@ static void node_states_set_node(int node, struct memory_notify *arg)
node_set_state(node, N_MEMORY);
}
-bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, int online_type)
-{
- struct pglist_data *pgdat = NODE_DATA(nid);
- struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE];
- struct zone *default_zone = default_zone_for_pfn(nid, pfn, nr_pages);
-
- /*
- * TODO there shouldn't be any inherent reason to have ZONE_NORMAL
- * physically before ZONE_MOVABLE. All we need is they do not
- * overlap. Historically we didn't allow ZONE_NORMAL after ZONE_MOVABLE
- * though so let's stick with it for simplicity for now.
- * TODO make sure we do not overlap with ZONE_DEVICE
- */
- if (online_type == MMOP_ONLINE_KERNEL) {
- if (zone_is_empty(movable_zone))
- return true;
- return movable_zone->zone_start_pfn >= pfn + nr_pages;
- } else if (online_type == MMOP_ONLINE_MOVABLE) {
- return zone_end_pfn(default_zone) <= pfn;
- }
-
- /* MMOP_ONLINE_KEEP will always succeed and inherits the current zone */
- return online_type == MMOP_ONLINE_KEEP;
-}
-
static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn,
unsigned long nr_pages)
{
@@ -856,7 +831,7 @@ void __ref move_pfn_range_to_zone(struct zone *zone,
* If no kernel zone covers this pfn range it will automatically go
* to the ZONE_NORMAL.
*/
-struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn,
+static struct zone *default_kernel_zone_for_pfn(int nid, unsigned long start_pfn,
unsigned long nr_pages)
{
struct pglist_data *pgdat = NODE_DATA(nid);
@@ -872,17 +847,40 @@ struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn,
return &pgdat->node_zones[ZONE_NORMAL];
}
-static inline bool movable_pfn_range(int nid, struct zone *default_zone,
- unsigned long start_pfn, unsigned long nr_pages)
+static inline struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn,
+ unsigned long nr_pages)
{
- if (!allow_online_pfn_range(nid, start_pfn, nr_pages,
- MMOP_ONLINE_KERNEL))
- return true;
+ struct zone *kernel_zone = default_kernel_zone_for_pfn(nid, start_pfn,
+ nr_pages);
+ struct zone *movable_zone = &NODE_DATA(nid)->node_zones[ZONE_MOVABLE];
+ bool in_kernel = zone_intersects(kernel_zone, start_pfn, nr_pages);
+ bool in_movable = zone_intersects(movable_zone, start_pfn, nr_pages);
- if (!movable_node_is_enabled())
- return false;
+ /*
+ * We inherit the existing zone in a simple case where zones do not
+ * overlap in the given range
+ */
+ if (in_kernel ^ in_movable)
+ return (in_kernel) ? kernel_zone : movable_zone;
- return !zone_intersects(default_zone, start_pfn, nr_pages);
+ /*
+ * If the range doesn't belong to any zone or two zones overlap in the
+ * given range then we use movable zone only if movable_node is
+ * enabled because we always online to a kernel zone by default.
+ */
+ return movable_node_enabled ? movable_zone : kernel_zone;
+}
+
+struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
+ unsigned long nr_pages)
+{
+ if (online_type == MMOP_ONLINE_KERNEL)
+ return default_kernel_zone_for_pfn(nid, start_pfn, nr_pages);
+
+ if (online_type == MMOP_ONLINE_MOVABLE)
+ return &NODE_DATA(nid)->node_zones[ZONE_MOVABLE];
+
+ return default_zone_for_pfn(nid, start_pfn, nr_pages);
}
/*
@@ -892,28 +890,14 @@ static inline bool movable_pfn_range(int nid, struct zone *default_zone,
static struct zone * __meminit move_pfn_range(int online_type, int nid,
unsigned long start_pfn, unsigned long nr_pages)
{
- struct pglist_data *pgdat = NODE_DATA(nid);
- struct zone *zone = default_zone_for_pfn(nid, start_pfn, nr_pages);
-
- if (online_type == MMOP_ONLINE_KEEP) {
- struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE];
- /*
- * MMOP_ONLINE_KEEP defaults to MMOP_ONLINE_KERNEL but use
- * movable zone if that is not possible (e.g. we are within
- * or past the existing movable zone). movable_node overrides
- * this default and defaults to movable zone
- */
- if (movable_pfn_range(nid, zone, start_pfn, nr_pages))
- zone = movable_zone;
- } else if (online_type == MMOP_ONLINE_MOVABLE) {
- zone = &pgdat->node_zones[ZONE_MOVABLE];
- }
+ struct zone *zone;
+ zone = zone_for_pfn_range(online_type, nid, start_pfn, nr_pages);
move_pfn_range_to_zone(zone, start_pfn, nr_pages);
return zone;
}
-/* Must be protected by mem_hotplug_begin() */
+/* Must be protected by mem_hotplug_begin() or a device_lock */
int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
{
unsigned long flags;
@@ -925,9 +909,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
struct memory_notify arg;
nid = pfn_to_nid(pfn);
- if (!allow_online_pfn_range(nid, pfn, nr_pages, online_type))
- return -EINVAL;
-
/* associate pfn range with the zone */
zone = move_pfn_range(online_type, nid, pfn, nr_pages);
@@ -945,10 +926,9 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
* This means the page allocator ignores this zone.
* So, zonelist must be updated after online.
*/
- mutex_lock(&zonelists_mutex);
if (!populated_zone(zone)) {
need_zonelists_rebuild = 1;
- build_all_zonelists(NULL, zone);
+ setup_zone_pageset(zone);
}
ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
@@ -956,7 +936,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
if (ret) {
if (need_zonelists_rebuild)
zone_pcp_reset(zone);
- mutex_unlock(&zonelists_mutex);
goto failed_addition;
}
@@ -969,13 +948,11 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
if (onlined_pages) {
node_states_set_node(nid, &arg);
if (need_zonelists_rebuild)
- build_all_zonelists(NULL, NULL);
+ build_all_zonelists(NULL);
else
zone_pcp_update(zone);
}
- mutex_unlock(&zonelists_mutex);
-
init_per_zone_wmark_min();
if (onlined_pages) {
@@ -1046,9 +1023,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
* The node we allocated has no zone fallback lists. For avoiding
* to access not-initialized zonelist, build here.
*/
- mutex_lock(&zonelists_mutex);
- build_all_zonelists(pgdat, NULL);
- mutex_unlock(&zonelists_mutex);
+ build_all_zonelists(pgdat);
/*
* zone->managed_pages is set to an approximate value in
@@ -1100,13 +1075,6 @@ int try_online_node(int nid)
node_set_online(nid);
ret = register_one_node(nid);
BUG_ON(ret);
-
- if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
- mutex_lock(&zonelists_mutex);
- build_all_zonelists(NULL, NULL);
- mutex_unlock(&zonelists_mutex);
- }
-
out:
mem_hotplug_done();
return ret;
@@ -1722,9 +1690,7 @@ repeat:
if (!populated_zone(zone)) {
zone_pcp_reset(zone);
- mutex_lock(&zonelists_mutex);
- build_all_zonelists(NULL, NULL);
- mutex_unlock(&zonelists_mutex);
+ build_all_zonelists(NULL);
} else
zone_pcp_update(zone);
@@ -1750,7 +1716,7 @@ failed_removal:
return ret;
}
-/* Must be protected by mem_hotplug_begin() */
+/* Must be protected by mem_hotplug_begin() or a device_lock */
int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
{
return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d911fa5cb2a7..618ab125228b 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -861,11 +861,6 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
*policy |= (pol->flags & MPOL_MODE_FLAGS);
}
- if (vma) {
- up_read(&current->mm->mmap_sem);
- vma = NULL;
- }
-
err = 0;
if (nmask) {
if (mpol_store_user_nodemask(pol)) {
diff --git a/mm/migrate.c b/mm/migrate.c
index 627671551873..e84eeb4e4356 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -41,6 +41,7 @@
#include <linux/page_idle.h>
#include <linux/page_owner.h>
#include <linux/sched/mm.h>
+#include <linux/ptrace.h>
#include <asm/tlbflush.h>
@@ -1652,7 +1653,6 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
const int __user *, nodes,
int __user *, status, int, flags)
{
- const struct cred *cred = current_cred(), *tcred;
struct task_struct *task;
struct mm_struct *mm;
int err;
@@ -1676,14 +1676,9 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
/*
* Check if this process has the right to modify the specified
- * process. The right exists if the process has administrative
- * capabilities, superuser privileges or the same
- * userid as the target process.
+ * process. Use the regular "ptrace_may_access()" checks.
*/
- tcred = __task_cred(task);
- if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
- !uid_eq(cred->uid, tcred->suid) && !uid_eq(cred->uid, tcred->uid) &&
- !capable(CAP_SYS_NICE)) {
+ if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
rcu_read_unlock();
err = -EPERM;
goto out;
@@ -1937,12 +1932,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
put_page(new_page);
goto out_fail;
}
- /*
- * We are not sure a pending tlb flush here is for a huge page
- * mapping or not. Hence use the tlb range variant
- */
- if (mm_tlb_flush_pending(mm))
- flush_tlb_range(vma, mmun_start, mmun_end);
/* Prepare a page as a migration target */
__SetPageLocked(new_page);
diff --git a/mm/mmap.c b/mm/mmap.c
index f19efcf75418..4c5981651407 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -44,6 +44,7 @@
#include <linux/userfaultfd_k.h>
#include <linux/moduleparam.h>
#include <linux/pkeys.h>
+#include <linux/oom.h>
#include <linux/uaccess.h>
#include <asm/cacheflush.h>
@@ -2639,13 +2640,6 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
if (vma->vm_start >= end)
return 0;
- if (uf) {
- int error = userfaultfd_unmap_prep(vma, start, end, uf);
-
- if (error)
- return error;
- }
-
/*
* If we need to split any vma, do it now to save pain later.
*
@@ -2679,6 +2673,21 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
}
vma = prev ? prev->vm_next : mm->mmap;
+ if (unlikely(uf)) {
+ /*
+ * If userfaultfd_unmap_prep returns an error the vmas
+ * will remain splitted, but userland will get a
+ * highly unexpected error anyway. This is no
+ * different than the case where the first of the two
+ * __split_vma fails, but we don't undo the first
+ * split, despite we could. This is unlikely enough
+ * failure that it's not worth optimizing it for.
+ */
+ int error = userfaultfd_unmap_prep(vma, start, end, uf);
+ if (error)
+ return error;
+ }
+
/*
* unlock any mlock()ed ranges before detaching vmas
*/
@@ -2993,6 +3002,23 @@ void exit_mmap(struct mm_struct *mm)
/* Use -1 here to ensure all VMAs in the mm are unmapped */
unmap_vmas(&tlb, vma, 0, -1);
+ set_bit(MMF_OOM_SKIP, &mm->flags);
+ if (unlikely(tsk_is_oom_victim(current))) {
+ /*
+ * Wait for oom_reap_task() to stop working on this
+ * mm. Because MMF_OOM_SKIP is already set before
+ * calling down_read(), oom_reap_task() will not run
+ * on this "mm" post up_write().
+ *
+ * tsk_is_oom_victim() cannot be set from under us
+ * either because current->mm is already set to NULL
+ * under task_lock before calling mmput and oom_mm is
+ * set not NULL by the OOM killer only if current->mm
+ * is found not NULL while holding the task_lock.
+ */
+ down_write(&mm->mmap_sem);
+ up_write(&mm->mmap_sem);
+ }
free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
tlb_finish_mmu(&tlb, 0, -1);
@@ -3514,7 +3540,7 @@ static int init_user_reserve(void)
{
unsigned long free_kbytes;
- free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+ free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
return 0;
@@ -3535,7 +3561,7 @@ static int init_admin_reserve(void)
{
unsigned long free_kbytes;
- free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+ free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
return 0;
@@ -3579,7 +3605,7 @@ static int reserve_mem_notifier(struct notifier_block *nb,
break;
case MEM_OFFLINE:
- free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+ free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
if (sysctl_user_reserve_kbytes > free_kbytes) {
init_user_reserve();
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 54ca54562928..314285284e6e 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -174,20 +174,6 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
srcu_read_unlock(&srcu, id);
}
-void __mmu_notifier_invalidate_page(struct mm_struct *mm,
- unsigned long address)
-{
- struct mmu_notifier *mn;
- int id;
-
- id = srcu_read_lock(&srcu);
- hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
- if (mn->ops->invalidate_page)
- mn->ops->invalidate_page(mn, mm, address);
- }
- srcu_read_unlock(&srcu, id);
-}
-
void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
unsigned long start, unsigned long end)
{
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 1a8c9ca83e48..bd0f409922cb 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -64,6 +64,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
atomic_read(&vma->vm_mm->mm_users) == 1)
target_node = numa_node_id();
+ flush_tlb_batched_pending(vma->vm_mm);
arch_enter_lazy_mmu_mode();
do {
oldpte = *pte;
@@ -243,7 +244,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma,
BUG_ON(addr >= end);
pgd = pgd_offset(mm, addr);
flush_cache_range(vma, addr, end);
- set_tlb_flush_pending(mm);
+ inc_tlb_flush_pending(mm);
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
@@ -255,7 +256,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma,
/* Only flush the TLB if we actually modified any entries: */
if (pages)
flush_tlb_range(vma, start, end);
- clear_tlb_flush_pending(mm);
+ dec_tlb_flush_pending(mm);
return pages;
}
diff --git a/mm/mremap.c b/mm/mremap.c
index cd8a1b199ef9..7395564daa6c 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -152,6 +152,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
new_ptl = pte_lockptr(mm, new_pmd);
if (new_ptl != old_ptl)
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
+ flush_tlb_batched_pending(vma->vm_mm);
arch_enter_lazy_mmu_mode();
for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
@@ -383,6 +384,19 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
if (!vma || vma->vm_start > addr)
return ERR_PTR(-EFAULT);
+ /*
+ * !old_len is a special case where an attempt is made to 'duplicate'
+ * a mapping. This makes no sense for private mappings as it will
+ * instead create a fresh/new mapping unrelated to the original. This
+ * is contrary to the basic idea of mremap which creates new mappings
+ * based on the original. There are no known use cases for this
+ * behavior. As a result, fail such attempts.
+ */
+ if (!old_len && !(vma->vm_flags & (VM_SHARED | VM_MAYSHARE))) {
+ pr_warn_once("%s (%d): attempted to duplicate a private mapping with mremap. This is not supported.\n", current->comm, current->pid);
+ return ERR_PTR(-EINVAL);
+ }
+
if (is_vm_hugetlb_page(vma))
return ERR_PTR(-EINVAL);
@@ -428,6 +442,7 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
unsigned long new_addr, unsigned long new_len, bool *locked,
struct vm_userfaultfd_ctx *uf,
+ struct list_head *uf_unmap_early,
struct list_head *uf_unmap)
{
struct mm_struct *mm = current->mm;
@@ -446,7 +461,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
if (addr + old_len > new_addr && new_addr + new_len > addr)
goto out;
- ret = do_munmap(mm, new_addr, new_len, NULL);
+ ret = do_munmap(mm, new_addr, new_len, uf_unmap_early);
if (ret)
goto out;
@@ -514,6 +529,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
unsigned long charged = 0;
bool locked = false;
struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX;
+ LIST_HEAD(uf_unmap_early);
LIST_HEAD(uf_unmap);
if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
@@ -541,7 +557,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
if (flags & MREMAP_FIXED) {
ret = mremap_to(addr, old_len, new_addr, new_len,
- &locked, &uf, &uf_unmap);
+ &locked, &uf, &uf_unmap_early, &uf_unmap);
goto out;
}
@@ -621,6 +637,7 @@ out:
up_write(&current->mm->mmap_sem);
if (locked && new_len > old_len)
mm_populate(new_addr + old_len, new_len - old_len);
+ userfaultfd_unmap_complete(mm, &uf_unmap_early);
mremap_userfaultfd_complete(&uf, addr, new_addr, old_len);
userfaultfd_unmap_complete(mm, &uf_unmap);
return ret;
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 36454d0f96ee..3637809a18d0 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -146,22 +146,6 @@ static unsigned long __init free_low_memory_core_early(void)
NULL)
count += __free_memory_core(start, end);
-#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
- {
- phys_addr_t size;
-
- /* Free memblock.reserved array if it was allocated */
- size = get_allocated_memblock_reserved_regions_info(&start);
- if (size)
- count += __free_memory_core(start, start + size);
-
- /* Free memblock.memory array if it was allocated */
- size = get_allocated_memblock_memory_regions_info(&start);
- if (size)
- count += __free_memory_core(start, start + size);
- }
-#endif
-
return count;
}
diff --git a/mm/nommu.c b/mm/nommu.c
index fc184f597d59..53d5175a5c14 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1962,7 +1962,7 @@ static int __meminit init_user_reserve(void)
{
unsigned long free_kbytes;
- free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+ free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
return 0;
@@ -1983,7 +1983,7 @@ static int __meminit init_admin_reserve(void)
{
unsigned long free_kbytes;
- free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+ free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
return 0;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 9e8b4f030c1c..99736e026712 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -495,11 +495,12 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
}
/*
- * increase mm_users only after we know we will reap something so
- * that the mmput_async is called only when we have reaped something
- * and delayed __mmput doesn't matter that much
+ * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't
+ * work on the mm anymore. The check for MMF_OOM_SKIP must run
+ * under mmap_sem for reading because it serializes against the
+ * down_write();up_write() cycle in exit_mmap().
*/
- if (!mmget_not_zero(mm)) {
+ if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
up_read(&mm->mmap_sem);
trace_skip_task_reaping(tsk->pid);
goto unlock_oom;
@@ -542,12 +543,6 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
K(get_mm_counter(mm, MM_SHMEMPAGES)));
up_read(&mm->mmap_sem);
- /*
- * Drop our reference but make sure the mmput slow path is called from a
- * different context because we shouldn't risk we get stuck there and
- * put the oom_reaper out of the way.
- */
- mmput_async(mm);
trace_finish_task_reaping(tsk->pid);
unlock_oom:
mutex_unlock(&oom_lock);
@@ -824,7 +819,8 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
/*
* If the task is already exiting, don't alarm the sysadmin or kill
- * its children or threads, just set TIF_MEMDIE so it can die quickly
+ * its children or threads, just give it access to memory reserves
+ * so it can die quickly
*/
task_lock(p);
if (task_will_free_mem(p)) {
@@ -889,9 +885,9 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
count_memcg_event_mm(mm, OOM_KILL);
/*
- * We should send SIGKILL before setting TIF_MEMDIE in order to prevent
- * the OOM victim from depleting the memory reserves from the user
- * space under its control.
+ * We should send SIGKILL before granting access to memory reserves
+ * in order to prevent the OOM victim from depleting the memory
+ * reserves from the user space under its control.
*/
do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
mark_oom_victim(victim);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 96e93b214d31..0b9c5cbe8eba 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -363,7 +363,7 @@ static unsigned long global_dirtyable_memory(void)
{
unsigned long x;
- x = global_page_state(NR_FREE_PAGES);
+ x = global_zone_page_state(NR_FREE_PAGES);
/*
* Pages reserved for the kernel should not be considered
* dirtyable, to prevent a situation where reclaim has to
@@ -1405,7 +1405,7 @@ void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time)
* will look to see if it needs to start dirty throttling.
*
* If dirty_poll_interval is too low, big NUMA machines will call the expensive
- * global_page_state() too often. So scale it near-sqrt to the safety margin
+ * global_zone_page_state() too often. So scale it near-sqrt to the safety margin
* (the number of pages we may dirty without exceeding the dirty limits).
*/
static unsigned long dirty_poll_interval(unsigned long dirty,
@@ -2724,9 +2724,12 @@ EXPORT_SYMBOL(clear_page_dirty_for_io);
int test_clear_page_writeback(struct page *page)
{
struct address_space *mapping = page_mapping(page);
+ struct mem_cgroup *memcg;
+ struct lruvec *lruvec;
int ret;
- lock_page_memcg(page);
+ memcg = lock_page_memcg(page);
+ lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
if (mapping && mapping_use_writeback_tags(mapping)) {
struct inode *inode = mapping->host;
struct backing_dev_info *bdi = inode_to_bdi(inode);
@@ -2754,12 +2757,18 @@ int test_clear_page_writeback(struct page *page)
} else {
ret = TestClearPageWriteback(page);
}
+ /*
+ * NOTE: Page might be free now! Writeback doesn't hold a page
+ * reference on its own, it relies on truncation to wait for
+ * the clearing of PG_writeback. The below can only access
+ * page state that is static across allocation cycles.
+ */
if (ret) {
- dec_lruvec_page_state(page, NR_WRITEBACK);
+ dec_lruvec_state(lruvec, NR_WRITEBACK);
dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
inc_node_page_state(page, NR_WRITTEN);
}
- unlock_page_memcg(page);
+ __unlock_page_memcg(memcg);
return ret;
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6d30e914afb6..a9add06fe768 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -66,6 +66,8 @@
#include <linux/kthread.h>
#include <linux/memcontrol.h>
#include <linux/ftrace.h>
+#include <linux/lockdep.h>
+#include <linux/nmi.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
@@ -1584,6 +1586,10 @@ void __init page_alloc_init_late(void)
/* Reinit limits that are based on free pages after the kernel is up */
files_maxfiles_init();
#endif
+#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
+ /* Discard memblock private memory */
+ memblock_discard();
+#endif
for_each_populated_zone(zone)
set_zone_contiguous(zone);
@@ -2531,9 +2537,14 @@ void drain_all_pages(struct zone *zone)
#ifdef CONFIG_HIBERNATION
+/*
+ * Touch the watchdog for every WD_PAGE_COUNT pages.
+ */
+#define WD_PAGE_COUNT (128*1024)
+
void mark_free_pages(struct zone *zone)
{
- unsigned long pfn, max_zone_pfn;
+ unsigned long pfn, max_zone_pfn, page_count = WD_PAGE_COUNT;
unsigned long flags;
unsigned int order, t;
struct page *page;
@@ -2548,6 +2559,11 @@ void mark_free_pages(struct zone *zone)
if (pfn_valid(pfn)) {
page = pfn_to_page(pfn);
+ if (!--page_count) {
+ touch_nmi_watchdog();
+ page_count = WD_PAGE_COUNT;
+ }
+
if (page_zone(page) != zone)
continue;
@@ -2561,8 +2577,13 @@ void mark_free_pages(struct zone *zone)
unsigned long i;
pfn = page_to_pfn(page);
- for (i = 0; i < (1UL << order); i++)
+ for (i = 0; i < (1UL << order); i++) {
+ if (!--page_count) {
+ touch_nmi_watchdog();
+ page_count = WD_PAGE_COUNT;
+ }
swsusp_set_page_free(pfn_to_page(pfn + i));
+ }
}
}
spin_unlock_irqrestore(&zone->lock, flags);
@@ -2930,7 +2951,7 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
{
long min = mark;
int o;
- const bool alloc_harder = (alloc_flags & ALLOC_HARDER);
+ const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
/* free_pages may go negative - that's OK */
free_pages -= (1 << order) - 1;
@@ -2943,10 +2964,21 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
* the high-atomic reserves. This will over-estimate the size of the
* atomic reserve but it avoids a search.
*/
- if (likely(!alloc_harder))
+ if (likely(!alloc_harder)) {
free_pages -= z->nr_reserved_highatomic;
- else
- min -= min / 4;
+ } else {
+ /*
+ * OOM victims can try even harder than normal ALLOC_HARDER
+ * users on the grounds that it's definitely going to be in
+ * the exit path shortly and free memory. Any allocation it
+ * makes during the free path will be small and short-lived.
+ */
+ if (alloc_flags & ALLOC_OOM)
+ min -= min / 2;
+ else
+ min -= min / 4;
+ }
+
#ifdef CONFIG_CMA
/* If allocation can't use CMA areas don't use free CMA pages */
@@ -3184,7 +3216,7 @@ static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
* of allowed nodes.
*/
if (!(gfp_mask & __GFP_NOMEMALLOC))
- if (test_thread_flag(TIF_MEMDIE) ||
+ if (tsk_is_oom_victim(current) ||
(current->flags & (PF_MEMALLOC | PF_EXITING)))
filter &= ~SHOW_MEM_FILTER_NODES;
if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
@@ -3271,10 +3303,13 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
/*
* Go through the zonelist yet one more time, keep very high watermark
* here, this is only to catch a parallel oom killing, we must fail if
- * we're still under heavy pressure.
+ * we're still under heavy pressure. But make sure that this reclaim
+ * attempt shall not depend on __GFP_DIRECT_RECLAIM && !__GFP_NORETRY
+ * allocation which will never fail due to oom_lock already held.
*/
- page = get_page_from_freelist(gfp_mask | __GFP_HARDWALL, order,
- ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac);
+ page = get_page_from_freelist((gfp_mask | __GFP_HARDWALL) &
+ ~__GFP_DIRECT_RECLAIM, order,
+ ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac);
if (page)
goto out;
@@ -3490,6 +3525,47 @@ should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_fla
}
#endif /* CONFIG_COMPACTION */
+#ifdef CONFIG_LOCKDEP
+struct lockdep_map __fs_reclaim_map =
+ STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map);
+
+static bool __need_fs_reclaim(gfp_t gfp_mask)
+{
+ gfp_mask = current_gfp_context(gfp_mask);
+
+ /* no reclaim without waiting on it */
+ if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
+ return false;
+
+ /* this guy won't enter reclaim */
+ if ((current->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC))
+ return false;
+
+ /* We're only interested __GFP_FS allocations for now */
+ if (!(gfp_mask & __GFP_FS))
+ return false;
+
+ if (gfp_mask & __GFP_NOLOCKDEP)
+ return false;
+
+ return true;
+}
+
+void fs_reclaim_acquire(gfp_t gfp_mask)
+{
+ if (__need_fs_reclaim(gfp_mask))
+ lock_map_acquire(&__fs_reclaim_map);
+}
+EXPORT_SYMBOL_GPL(fs_reclaim_acquire);
+
+void fs_reclaim_release(gfp_t gfp_mask)
+{
+ if (__need_fs_reclaim(gfp_mask))
+ lock_map_release(&__fs_reclaim_map);
+}
+EXPORT_SYMBOL_GPL(fs_reclaim_release);
+#endif
+
/* Perform direct synchronous page reclaim */
static int
__perform_reclaim(gfp_t gfp_mask, unsigned int order,
@@ -3504,7 +3580,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
/* We now go into synchronous reclaim */
cpuset_memory_pressure_bump();
noreclaim_flag = memalloc_noreclaim_save();
- lockdep_set_current_reclaim_state(gfp_mask);
+ fs_reclaim_acquire(gfp_mask);
reclaim_state.reclaimed_slab = 0;
current->reclaim_state = &reclaim_state;
@@ -3512,7 +3588,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
ac->nodemask);
current->reclaim_state = NULL;
- lockdep_clear_current_reclaim_state();
+ fs_reclaim_release(gfp_mask);
memalloc_noreclaim_restore(noreclaim_flag);
cond_resched();
@@ -3603,21 +3679,46 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
return alloc_flags;
}
-bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
+static bool oom_reserves_allowed(struct task_struct *tsk)
{
- if (unlikely(gfp_mask & __GFP_NOMEMALLOC))
+ if (!tsk_is_oom_victim(tsk))
return false;
+ /*
+ * !MMU doesn't have oom reaper so give access to memory reserves
+ * only to the thread with TIF_MEMDIE set
+ */
+ if (!IS_ENABLED(CONFIG_MMU) && !test_thread_flag(TIF_MEMDIE))
+ return false;
+
+ return true;
+}
+
+/*
+ * Distinguish requests which really need access to full memory
+ * reserves from oom victims which can live with a portion of it
+ */
+static inline int __gfp_pfmemalloc_flags(gfp_t gfp_mask)
+{
+ if (unlikely(gfp_mask & __GFP_NOMEMALLOC))
+ return 0;
if (gfp_mask & __GFP_MEMALLOC)
- return true;
+ return ALLOC_NO_WATERMARKS;
if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
- return true;
- if (!in_interrupt() &&
- ((current->flags & PF_MEMALLOC) ||
- unlikely(test_thread_flag(TIF_MEMDIE))))
- return true;
+ return ALLOC_NO_WATERMARKS;
+ if (!in_interrupt()) {
+ if (current->flags & PF_MEMALLOC)
+ return ALLOC_NO_WATERMARKS;
+ else if (oom_reserves_allowed(current))
+ return ALLOC_OOM;
+ }
- return false;
+ return 0;
+}
+
+bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
+{
+ return !!__gfp_pfmemalloc_flags(gfp_mask);
}
/*
@@ -3770,6 +3871,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
unsigned long alloc_start = jiffies;
unsigned int stall_timeout = 10 * HZ;
unsigned int cpuset_mems_cookie;
+ int reserve_flags;
/*
* In the slowpath, we sanity check order to avoid ever trying to
@@ -3875,15 +3977,16 @@ retry:
if (gfp_mask & __GFP_KSWAPD_RECLAIM)
wake_all_kswapds(order, ac);
- if (gfp_pfmemalloc_allowed(gfp_mask))
- alloc_flags = ALLOC_NO_WATERMARKS;
+ reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
+ if (reserve_flags)
+ alloc_flags = reserve_flags;
/*
* Reset the zonelist iterators if memory policies can be ignored.
* These allocations are high priority and system rather than user
* orientated.
*/
- if (!(alloc_flags & ALLOC_CPUSET) || (alloc_flags & ALLOC_NO_WATERMARKS)) {
+ if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
ac->zonelist = node_zonelist(numa_node_id(), gfp_mask);
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
ac->high_zoneidx, ac->nodemask);
@@ -3960,8 +4063,8 @@ retry:
goto got_pg;
/* Avoid allocations with no watermarks from looping endlessly */
- if (test_thread_flag(TIF_MEMDIE) &&
- (alloc_flags == ALLOC_NO_WATERMARKS ||
+ if (tsk_is_oom_victim(current) &&
+ (alloc_flags == ALLOC_OOM ||
(gfp_mask & __GFP_NOMEMALLOC)))
goto nopage;
@@ -4041,7 +4144,8 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
*alloc_flags |= ALLOC_CPUSET;
}
- lockdep_trace_alloc(gfp_mask);
+ fs_reclaim_acquire(gfp_mask);
+ fs_reclaim_release(gfp_mask);
might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
@@ -4443,7 +4547,7 @@ long si_mem_available(void)
* Estimate the amount of memory available for userspace allocations,
* without causing swapping.
*/
- available = global_page_state(NR_FREE_PAGES) - totalreserve_pages;
+ available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages;
/*
* Not all the page cache can be freed, otherwise the system will
@@ -4458,8 +4562,9 @@ long si_mem_available(void)
* Part of the reclaimable slab consists of items that are in use,
* and cannot be freed. Cap this estimate at the low watermark.
*/
- available += global_page_state(NR_SLAB_RECLAIMABLE) -
- min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low);
+ available += global_node_page_state(NR_SLAB_RECLAIMABLE) -
+ min(global_node_page_state(NR_SLAB_RECLAIMABLE) / 2,
+ wmark_low);
if (available < 0)
available = 0;
@@ -4471,7 +4576,7 @@ void si_meminfo(struct sysinfo *val)
{
val->totalram = totalram_pages;
val->sharedram = global_node_page_state(NR_SHMEM);
- val->freeram = global_page_state(NR_FREE_PAGES);
+ val->freeram = global_zone_page_state(NR_FREE_PAGES);
val->bufferram = nr_blockdev_pages();
val->totalhigh = totalhigh_pages;
val->freehigh = nr_free_highpages();
@@ -4602,15 +4707,15 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
global_node_page_state(NR_FILE_DIRTY),
global_node_page_state(NR_WRITEBACK),
global_node_page_state(NR_UNSTABLE_NFS),
- global_page_state(NR_SLAB_RECLAIMABLE),
- global_page_state(NR_SLAB_UNRECLAIMABLE),
+ global_node_page_state(NR_SLAB_RECLAIMABLE),
+ global_node_page_state(NR_SLAB_UNRECLAIMABLE),
global_node_page_state(NR_FILE_MAPPED),
global_node_page_state(NR_SHMEM),
- global_page_state(NR_PAGETABLE),
- global_page_state(NR_BOUNCE),
- global_page_state(NR_FREE_PAGES),
+ global_zone_page_state(NR_PAGETABLE),
+ global_zone_page_state(NR_BOUNCE),
+ global_zone_page_state(NR_FREE_PAGES),
free_pcp,
- global_page_state(NR_FREE_CMA_PAGES));
+ global_zone_page_state(NR_FREE_CMA_PAGES));
for_each_online_pgdat(pgdat) {
if (show_mem_node_skip(filter, pgdat->node_id, nodemask))
@@ -4772,18 +4877,17 @@ static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
*
* Add all populated zones of a node to the zonelist.
*/
-static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
- int nr_zones)
+static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs)
{
struct zone *zone;
enum zone_type zone_type = MAX_NR_ZONES;
+ int nr_zones = 0;
do {
zone_type--;
zone = pgdat->node_zones + zone_type;
if (managed_zone(zone)) {
- zoneref_set_zone(zone,
- &zonelist->_zonerefs[nr_zones++]);
+ zoneref_set_zone(zone, &zonerefs[nr_zones++]);
check_highest_zone(zone_type);
}
} while (zone_type);
@@ -4791,52 +4895,18 @@ static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
return nr_zones;
}
-
-/*
- * zonelist_order:
- * 0 = automatic detection of better ordering.
- * 1 = order by ([node] distance, -zonetype)
- * 2 = order by (-zonetype, [node] distance)
- *
- * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
- * the same zonelist. So only NUMA can configure this param.
- */
-#define ZONELIST_ORDER_DEFAULT 0
-#define ZONELIST_ORDER_NODE 1
-#define ZONELIST_ORDER_ZONE 2
-
-/* zonelist order in the kernel.
- * set_zonelist_order() will set this to NODE or ZONE.
- */
-static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;
-static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"};
-
-
#ifdef CONFIG_NUMA
-/* The value user specified ....changed by config */
-static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
-/* string for sysctl */
-#define NUMA_ZONELIST_ORDER_LEN 16
-char numa_zonelist_order[16] = "default";
-
-/*
- * interface for configure zonelist ordering.
- * command line option "numa_zonelist_order"
- * = "[dD]efault - default, automatic configuration.
- * = "[nN]ode - order by node locality, then by zone within node
- * = "[zZ]one - order by zone, then by locality within zone
- */
static int __parse_numa_zonelist_order(char *s)
{
- if (*s == 'd' || *s == 'D') {
- user_zonelist_order = ZONELIST_ORDER_DEFAULT;
- } else if (*s == 'n' || *s == 'N') {
- user_zonelist_order = ZONELIST_ORDER_NODE;
- } else if (*s == 'z' || *s == 'Z') {
- user_zonelist_order = ZONELIST_ORDER_ZONE;
- } else {
- pr_warn("Ignoring invalid numa_zonelist_order value: %s\n", s);
+ /*
+ * We used to support different zonlists modes but they turned
+ * out to be just not useful. Let's keep the warning in place
+ * if somebody still use the cmd line parameter so that we do
+ * not fail it silently
+ */
+ if (!(*s == 'd' || *s == 'D' || *s == 'n' || *s == 'N')) {
+ pr_warn("Ignoring unsupported numa_zonelist_order value: %s\n", s);
return -EINVAL;
}
return 0;
@@ -4844,19 +4914,15 @@ static int __parse_numa_zonelist_order(char *s)
static __init int setup_numa_zonelist_order(char *s)
{
- int ret;
-
if (!s)
return 0;
- ret = __parse_numa_zonelist_order(s);
- if (ret == 0)
- strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
-
- return ret;
+ return __parse_numa_zonelist_order(s);
}
early_param("numa_zonelist_order", setup_numa_zonelist_order);
+char numa_zonelist_order[] = "Node";
+
/*
* sysctl handler for numa_zonelist_order
*/
@@ -4864,40 +4930,17 @@ int numa_zonelist_order_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length,
loff_t *ppos)
{
- char saved_string[NUMA_ZONELIST_ORDER_LEN];
+ char *str;
int ret;
- static DEFINE_MUTEX(zl_order_mutex);
- mutex_lock(&zl_order_mutex);
- if (write) {
- if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) {
- ret = -EINVAL;
- goto out;
- }
- strcpy(saved_string, (char *)table->data);
- }
- ret = proc_dostring(table, write, buffer, length, ppos);
- if (ret)
- goto out;
- if (write) {
- int oldval = user_zonelist_order;
+ if (!write)
+ return proc_dostring(table, write, buffer, length, ppos);
+ str = memdup_user_nul(buffer, 16);
+ if (IS_ERR(str))
+ return PTR_ERR(str);
- ret = __parse_numa_zonelist_order((char *)table->data);
- if (ret) {
- /*
- * bogus value. restore saved string
- */
- strncpy((char *)table->data, saved_string,
- NUMA_ZONELIST_ORDER_LEN);
- user_zonelist_order = oldval;
- } else if (oldval != user_zonelist_order) {
- mutex_lock(&zonelists_mutex);
- build_all_zonelists(NULL, NULL);
- mutex_unlock(&zonelists_mutex);
- }
- }
-out:
- mutex_unlock(&zl_order_mutex);
+ ret = __parse_numa_zonelist_order(str);
+ kfree(str);
return ret;
}
@@ -4971,17 +5014,24 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
* This results in maximum locality--normal zone overflows into local
* DMA zone, if any--but risks exhausting DMA zone.
*/
-static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
+static void build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order,
+ unsigned nr_nodes)
{
- int j;
- struct zonelist *zonelist;
+ struct zoneref *zonerefs;
+ int i;
+
+ zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
+
+ for (i = 0; i < nr_nodes; i++) {
+ int nr_zones;
- zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
- for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
- ;
- j = build_zonelists_node(NODE_DATA(node), zonelist, j);
- zonelist->_zonerefs[j].zone = NULL;
- zonelist->_zonerefs[j].zone_idx = 0;
+ pg_data_t *node = NODE_DATA(node_order[i]);
+
+ nr_zones = build_zonerefs_node(node, zonerefs);
+ zonerefs += nr_zones;
+ }
+ zonerefs->zone = NULL;
+ zonerefs->zone_idx = 0;
}
/*
@@ -4989,13 +5039,14 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
*/
static void build_thisnode_zonelists(pg_data_t *pgdat)
{
- int j;
- struct zonelist *zonelist;
+ struct zoneref *zonerefs;
+ int nr_zones;
- zonelist = &pgdat->node_zonelists[ZONELIST_NOFALLBACK];
- j = build_zonelists_node(pgdat, zonelist, 0);
- zonelist->_zonerefs[j].zone = NULL;
- zonelist->_zonerefs[j].zone_idx = 0;
+ zonerefs = pgdat->node_zonelists[ZONELIST_NOFALLBACK]._zonerefs;
+ nr_zones = build_zonerefs_node(pgdat, zonerefs);
+ zonerefs += nr_zones;
+ zonerefs->zone = NULL;
+ zonerefs->zone_idx = 0;
}
/*
@@ -5004,79 +5055,13 @@ static void build_thisnode_zonelists(pg_data_t *pgdat)
* exhausted, but results in overflowing to remote node while memory
* may still exist in local DMA zone.
*/
-static int node_order[MAX_NUMNODES];
-
-static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
-{
- int pos, j, node;
- int zone_type; /* needs to be signed */
- struct zone *z;
- struct zonelist *zonelist;
-
- zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
- pos = 0;
- for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
- for (j = 0; j < nr_nodes; j++) {
- node = node_order[j];
- z = &NODE_DATA(node)->node_zones[zone_type];
- if (managed_zone(z)) {
- zoneref_set_zone(z,
- &zonelist->_zonerefs[pos++]);
- check_highest_zone(zone_type);
- }
- }
- }
- zonelist->_zonerefs[pos].zone = NULL;
- zonelist->_zonerefs[pos].zone_idx = 0;
-}
-
-#if defined(CONFIG_64BIT)
-/*
- * Devices that require DMA32/DMA are relatively rare and do not justify a
- * penalty to every machine in case the specialised case applies. Default
- * to Node-ordering on 64-bit NUMA machines
- */
-static int default_zonelist_order(void)
-{
- return ZONELIST_ORDER_NODE;
-}
-#else
-/*
- * On 32-bit, the Normal zone needs to be preserved for allocations accessible
- * by the kernel. If processes running on node 0 deplete the low memory zone
- * then reclaim will occur more frequency increasing stalls and potentially
- * be easier to OOM if a large percentage of the zone is under writeback or
- * dirty. The problem is significantly worse if CONFIG_HIGHPTE is not set.
- * Hence, default to zone ordering on 32-bit.
- */
-static int default_zonelist_order(void)
-{
- return ZONELIST_ORDER_ZONE;
-}
-#endif /* CONFIG_64BIT */
-
-static void set_zonelist_order(void)
-{
- if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
- current_zonelist_order = default_zonelist_order();
- else
- current_zonelist_order = user_zonelist_order;
-}
static void build_zonelists(pg_data_t *pgdat)
{
- int i, node, load;
+ static int node_order[MAX_NUMNODES];
+ int node, load, nr_nodes = 0;
nodemask_t used_mask;
int local_node, prev_node;
- struct zonelist *zonelist;
- unsigned int order = current_zonelist_order;
-
- /* initialize zonelists */
- for (i = 0; i < MAX_ZONELISTS; i++) {
- zonelist = pgdat->node_zonelists + i;
- zonelist->_zonerefs[0].zone = NULL;
- zonelist->_zonerefs[0].zone_idx = 0;
- }
/* NUMA-aware ordering of nodes */
local_node = pgdat->node_id;
@@ -5085,8 +5070,6 @@ static void build_zonelists(pg_data_t *pgdat)
nodes_clear(used_mask);
memset(node_order, 0, sizeof(node_order));
- i = 0;
-
while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
/*
* We don't want to pressure a particular node.
@@ -5097,19 +5080,12 @@ static void build_zonelists(pg_data_t *pgdat)
node_distance(local_node, prev_node))
node_load[node] = load;
+ node_order[nr_nodes++] = node;
prev_node = node;
load--;
- if (order == ZONELIST_ORDER_NODE)
- build_zonelists_in_node_order(pgdat, node);
- else
- node_order[i++] = node; /* remember order */
- }
-
- if (order == ZONELIST_ORDER_ZONE) {
- /* calculate node order -- i.e., DMA last! */
- build_zonelists_in_zone_order(pgdat, i);
}
+ build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
build_thisnode_zonelists(pgdat);
}
@@ -5135,21 +5111,17 @@ static void setup_min_unmapped_ratio(void);
static void setup_min_slab_ratio(void);
#else /* CONFIG_NUMA */
-static void set_zonelist_order(void)
-{
- current_zonelist_order = ZONELIST_ORDER_ZONE;
-}
-
static void build_zonelists(pg_data_t *pgdat)
{
int node, local_node;
- enum zone_type j;
- struct zonelist *zonelist;
+ struct zoneref *zonerefs;
+ int nr_zones;
local_node = pgdat->node_id;
- zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
- j = build_zonelists_node(pgdat, zonelist, 0);
+ zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
+ nr_zones = build_zonerefs_node(pgdat, zonerefs);
+ zonerefs += nr_zones;
/*
* Now we build the zonelist so that it contains the zones
@@ -5162,16 +5134,18 @@ static void build_zonelists(pg_data_t *pgdat)
for (node = local_node + 1; node < MAX_NUMNODES; node++) {
if (!node_online(node))
continue;
- j = build_zonelists_node(NODE_DATA(node), zonelist, j);
+ nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
+ zonerefs += nr_zones;
}
for (node = 0; node < local_node; node++) {
if (!node_online(node))
continue;
- j = build_zonelists_node(NODE_DATA(node), zonelist, j);
+ nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
+ zonerefs += nr_zones;
}
- zonelist->_zonerefs[j].zone = NULL;
- zonelist->_zonerefs[j].zone_idx = 0;
+ zonerefs->zone = NULL;
+ zonerefs->zone_idx = 0;
}
#endif /* CONFIG_NUMA */
@@ -5194,50 +5168,32 @@ static void build_zonelists(pg_data_t *pgdat)
static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
-static void setup_zone_pageset(struct zone *zone);
-
-/*
- * Global mutex to protect against size modification of zonelists
- * as well as to serialize pageset setup for the new populated zone.
- */
-DEFINE_MUTEX(zonelists_mutex);
-/* return values int ....just for stop_machine() */
-static int __build_all_zonelists(void *data)
+static void __build_all_zonelists(void *data)
{
int nid;
- int cpu;
+ int __maybe_unused cpu;
pg_data_t *self = data;
+ static DEFINE_SPINLOCK(lock);
+
+ spin_lock(&lock);
#ifdef CONFIG_NUMA
memset(node_load, 0, sizeof(node_load));
#endif
+ /*
+ * This node is hotadded and no memory is yet present. So just
+ * building zonelists is fine - no need to touch other nodes.
+ */
if (self && !node_online(self->node_id)) {
build_zonelists(self);
- }
-
- for_each_online_node(nid) {
- pg_data_t *pgdat = NODE_DATA(nid);
-
- build_zonelists(pgdat);
- }
+ } else {
+ for_each_online_node(nid) {
+ pg_data_t *pgdat = NODE_DATA(nid);
- /*
- * Initialize the boot_pagesets that are going to be used
- * for bootstrapping processors. The real pagesets for
- * each zone will be allocated later when the per cpu
- * allocator is available.
- *
- * boot_pagesets are used also for bootstrapping offline
- * cpus if the system is already booted because the pagesets
- * are needed to initialize allocators on a specific cpu too.
- * F.e. the percpu allocator needs the page allocator which
- * needs the percpu allocator in order to allocate its pagesets
- * (a chicken-egg dilemma).
- */
- for_each_possible_cpu(cpu) {
- setup_pageset(&per_cpu(boot_pageset, cpu), 0);
+ build_zonelists(pgdat);
+ }
#ifdef CONFIG_HAVE_MEMORYLESS_NODES
/*
@@ -5248,45 +5204,53 @@ static int __build_all_zonelists(void *data)
* secondary cpus' numa_mem as they come on-line. During
* node/memory hotplug, we'll fixup all on-line cpus.
*/
- if (cpu_online(cpu))
+ for_each_online_cpu(cpu)
set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
#endif
}
- return 0;
+ spin_unlock(&lock);
}
static noinline void __init
build_all_zonelists_init(void)
{
+ int cpu;
+
__build_all_zonelists(NULL);
+
+ /*
+ * Initialize the boot_pagesets that are going to be used
+ * for bootstrapping processors. The real pagesets for
+ * each zone will be allocated later when the per cpu
+ * allocator is available.
+ *
+ * boot_pagesets are used also for bootstrapping offline
+ * cpus if the system is already booted because the pagesets
+ * are needed to initialize allocators on a specific cpu too.
+ * F.e. the percpu allocator needs the page allocator which
+ * needs the percpu allocator in order to allocate its pagesets
+ * (a chicken-egg dilemma).
+ */
+ for_each_possible_cpu(cpu)
+ setup_pageset(&per_cpu(boot_pageset, cpu), 0);
+
mminit_verify_zonelist();
cpuset_init_current_mems_allowed();
}
/*
- * Called with zonelists_mutex held always
* unless system_state == SYSTEM_BOOTING.
*
- * __ref due to (1) call of __meminit annotated setup_zone_pageset
- * [we're only called with non-NULL zone through __meminit paths] and
- * (2) call of __init annotated helper build_all_zonelists_init
+ * __ref due to call of __init annotated helper build_all_zonelists_init
* [protected by SYSTEM_BOOTING].
*/
-void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
+void __ref build_all_zonelists(pg_data_t *pgdat)
{
- set_zonelist_order();
-
if (system_state == SYSTEM_BOOTING) {
build_all_zonelists_init();
} else {
-#ifdef CONFIG_MEMORY_HOTPLUG
- if (zone)
- setup_zone_pageset(zone);
-#endif
- /* we have to stop all cpus to guarantee there is no user
- of zonelist */
- stop_machine_cpuslocked(__build_all_zonelists, pgdat, NULL);
+ __build_all_zonelists(pgdat);
/* cpuset refresh routine should be here */
}
vm_total_pages = nr_free_pagecache_pages();
@@ -5302,9 +5266,8 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
else
page_group_by_mobility_disabled = 0;
- pr_info("Built %i zonelists in %s order, mobility grouping %s. Total pages: %ld\n",
+ pr_info("Built %i zonelists, mobility grouping %s. Total pages: %ld\n",
nr_online_nodes,
- zonelist_order_name[current_zonelist_order],
page_group_by_mobility_disabled ? "off" : "on",
vm_total_pages);
#ifdef CONFIG_NUMA
@@ -5558,7 +5521,7 @@ static void __meminit zone_pageset_init(struct zone *zone, int cpu)
pageset_set_high_and_batch(zone, pcp);
}
-static void __meminit setup_zone_pageset(struct zone *zone)
+void __meminit setup_zone_pageset(struct zone *zone)
{
int cpu;
zone->pageset = alloc_percpu(struct per_cpu_pageset);
@@ -7012,9 +6975,11 @@ static void __setup_per_zone_wmarks(void)
*/
void setup_per_zone_wmarks(void)
{
- mutex_lock(&zonelists_mutex);
+ static DEFINE_SPINLOCK(lock);
+
+ spin_lock(&lock);
__setup_per_zone_wmarks();
- mutex_unlock(&zonelists_mutex);
+ spin_unlock(&lock);
}
/*
@@ -7666,7 +7631,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
/* Make sure the range is really isolated. */
if (test_pages_isolated(outer_start, end, false)) {
- pr_info("%s: [%lx, %lx) PFNs busy\n",
+ pr_info_ratelimited("%s: [%lx, %lx) PFNs busy\n",
__func__, outer_start, end);
ret = -EBUSY;
goto done;
diff --git a/mm/page_ext.c b/mm/page_ext.c
index 88ccc044b09a..32f18911deda 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -222,10 +222,7 @@ static void *__meminit alloc_page_ext(size_t size, int nid)
return addr;
}
- if (node_state(nid, N_HIGH_MEMORY))
- addr = vzalloc_node(size, nid);
- else
- addr = vzalloc(size);
+ addr = vzalloc_node(size, nid);
return addr;
}
@@ -409,6 +406,7 @@ void __init page_ext_init(void)
continue;
if (init_section_page_ext(pfn, nid))
goto oom;
+ cond_resched();
}
}
hotplug_memory_notifier(page_ext_callback, 0);
diff --git a/mm/page_idle.c b/mm/page_idle.c
index 1b0f48c62316..4bd03a8d809e 100644
--- a/mm/page_idle.c
+++ b/mm/page_idle.c
@@ -204,7 +204,7 @@ static struct bin_attribute *page_idle_bin_attrs[] = {
NULL,
};
-static struct attribute_group page_idle_attr_group = {
+static const struct attribute_group page_idle_attr_group = {
.bin_attrs = page_idle_bin_attrs,
.name = "page_idle",
};
diff --git a/mm/page_io.c b/mm/page_io.c
index 9cf1bc751d79..21502d341a67 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -22,14 +22,16 @@
#include <linux/frontswap.h>
#include <linux/blkdev.h>
#include <linux/uio.h>
+#include <linux/sched/task.h>
#include <asm/pgtable.h>
static struct bio *get_swap_bio(gfp_t gfp_flags,
struct page *page, bio_end_io_t end_io)
{
+ int i, nr = hpage_nr_pages(page);
struct bio *bio;
- bio = bio_alloc(gfp_flags, 1);
+ bio = bio_alloc(gfp_flags, nr);
if (bio) {
struct block_device *bdev;
@@ -38,8 +40,9 @@ static struct bio *get_swap_bio(gfp_t gfp_flags,
bio->bi_iter.bi_sector <<= PAGE_SHIFT - 9;
bio->bi_end_io = end_io;
- bio_add_page(bio, page, PAGE_SIZE, 0);
- BUG_ON(bio->bi_iter.bi_size != PAGE_SIZE);
+ for (i = 0; i < nr; i++)
+ bio_add_page(bio, page + i, PAGE_SIZE, 0);
+ VM_BUG_ON(bio->bi_iter.bi_size != PAGE_SIZE * nr);
}
return bio;
}
@@ -137,6 +140,7 @@ out:
WRITE_ONCE(bio->bi_private, NULL);
bio_put(bio);
wake_up_process(waiter);
+ put_task_struct(waiter);
}
int generic_swapfile_activate(struct swap_info_struct *sis,
@@ -261,6 +265,15 @@ static sector_t swap_page_sector(struct page *page)
return (sector_t)__page_file_index(page) << (PAGE_SHIFT - 9);
}
+static inline void count_swpout_vm_event(struct page *page)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (unlikely(PageTransHuge(page)))
+ count_vm_event(THP_SWPOUT);
+#endif
+ count_vm_events(PSWPOUT, hpage_nr_pages(page));
+}
+
int __swap_writepage(struct page *page, struct writeback_control *wbc,
bio_end_io_t end_write_func)
{
@@ -312,7 +325,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
ret = bdev_write_page(sis->bdev, swap_page_sector(page), page, wbc);
if (!ret) {
- count_vm_event(PSWPOUT);
+ count_swpout_vm_event(page);
return 0;
}
@@ -325,7 +338,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
goto out;
}
bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
- count_vm_event(PSWPOUT);
+ count_swpout_vm_event(page);
set_page_writeback(page);
unlock_page(page);
submit_bio(bio);
@@ -379,6 +392,11 @@ int swap_readpage(struct page *page, bool do_poll)
goto out;
}
disk = bio->bi_disk;
+ /*
+ * Keep this task valid during swap readpage because the oom killer may
+ * attempt to access it in the page fault retry time check.
+ */
+ get_task_struct(current);
bio->bi_private = current;
bio_set_op_attrs(bio, REQ_OP_READ, 0);
count_vm_event(PSWPIN);
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 0fd9dcf2c5dc..8e2d7137510c 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -30,6 +30,7 @@ DEFINE_STATIC_KEY_FALSE(page_owner_inited);
static depot_stack_handle_t dummy_handle;
static depot_stack_handle_t failure_handle;
+static depot_stack_handle_t early_handle;
static void init_early_allocated_pages(void);
@@ -53,7 +54,7 @@ static bool need_page_owner(void)
return true;
}
-static noinline void register_dummy_stack(void)
+static __always_inline depot_stack_handle_t create_dummy_stack(void)
{
unsigned long entries[4];
struct stack_trace dummy;
@@ -64,21 +65,22 @@ static noinline void register_dummy_stack(void)
dummy.skip = 0;
save_stack_trace(&dummy);
- dummy_handle = depot_save_stack(&dummy, GFP_KERNEL);
+ return depot_save_stack(&dummy, GFP_KERNEL);
}
-static noinline void register_failure_stack(void)
+static noinline void register_dummy_stack(void)
{
- unsigned long entries[4];
- struct stack_trace failure;
+ dummy_handle = create_dummy_stack();
+}
- failure.nr_entries = 0;
- failure.max_entries = ARRAY_SIZE(entries);
- failure.entries = &entries[0];
- failure.skip = 0;
+static noinline void register_failure_stack(void)
+{
+ failure_handle = create_dummy_stack();
+}
- save_stack_trace(&failure);
- failure_handle = depot_save_stack(&failure, GFP_KERNEL);
+static noinline void register_early_stack(void)
+{
+ early_handle = create_dummy_stack();
}
static void init_page_owner(void)
@@ -88,6 +90,7 @@ static void init_page_owner(void)
register_dummy_stack();
register_failure_stack();
+ register_early_stack();
static_branch_enable(&page_owner_inited);
init_early_allocated_pages();
}
@@ -165,17 +168,13 @@ static noinline depot_stack_handle_t save_stack(gfp_t flags)
return handle;
}
-noinline void __set_page_owner(struct page *page, unsigned int order,
- gfp_t gfp_mask)
+static inline void __set_page_owner_handle(struct page_ext *page_ext,
+ depot_stack_handle_t handle, unsigned int order, gfp_t gfp_mask)
{
- struct page_ext *page_ext = lookup_page_ext(page);
struct page_owner *page_owner;
- if (unlikely(!page_ext))
- return;
-
page_owner = get_page_owner(page_ext);
- page_owner->handle = save_stack(gfp_mask);
+ page_owner->handle = handle;
page_owner->order = order;
page_owner->gfp_mask = gfp_mask;
page_owner->last_migrate_reason = -1;
@@ -183,6 +182,19 @@ noinline void __set_page_owner(struct page *page, unsigned int order,
__set_bit(PAGE_EXT_OWNER, &page_ext->flags);
}
+noinline void __set_page_owner(struct page *page, unsigned int order,
+ gfp_t gfp_mask)
+{
+ struct page_ext *page_ext = lookup_page_ext(page);
+ depot_stack_handle_t handle;
+
+ if (unlikely(!page_ext))
+ return;
+
+ handle = save_stack(gfp_mask);
+ __set_page_owner_handle(page_ext, handle, order, gfp_mask);
+}
+
void __set_page_owner_migrate_reason(struct page *page, int reason)
{
struct page_ext *page_ext = lookup_page_ext(page);
@@ -550,11 +562,17 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
continue;
/*
- * We are safe to check buddy flag and order, because
- * this is init stage and only single thread runs.
+ * To avoid having to grab zone->lock, be a little
+ * careful when reading buddy page order. The only
+ * danger is that we skip too much and potentially miss
+ * some early allocated pages, which is better than
+ * heavy lock contention.
*/
if (PageBuddy(page)) {
- pfn += (1UL << page_order(page)) - 1;
+ unsigned long order = page_order_unsafe(page);
+
+ if (order > 0 && order < MAX_ORDER)
+ pfn += (1UL << order) - 1;
continue;
}
@@ -565,14 +583,15 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
if (unlikely(!page_ext))
continue;
- /* Maybe overraping zone */
+ /* Maybe overlapping zone */
if (test_bit(PAGE_EXT_OWNER, &page_ext->flags))
continue;
/* Found early allocated page */
- set_page_owner(page, 0, 0);
+ __set_page_owner_handle(page_ext, early_handle, 0, 0);
count++;
}
+ cond_resched();
}
pr_info("Node %d, zone %8s: page owner found early allocated %lu pages\n",
@@ -583,15 +602,12 @@ static void init_zones_in_node(pg_data_t *pgdat)
{
struct zone *zone;
struct zone *node_zones = pgdat->node_zones;
- unsigned long flags;
for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
if (!populated_zone(zone))
continue;
- spin_lock_irqsave(&zone->lock, flags);
init_pages_in_zone(pgdat, zone);
- spin_unlock_irqrestore(&zone->lock, flags);
}
}
diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h
index cd2442e13d8f..7065faf74b46 100644
--- a/mm/percpu-internal.h
+++ b/mm/percpu-internal.h
@@ -4,6 +4,22 @@
#include <linux/types.h>
#include <linux/percpu.h>
+/*
+ * pcpu_block_md is the metadata block struct.
+ * Each chunk's bitmap is split into a number of full blocks.
+ * All units are in terms of bits.
+ */
+struct pcpu_block_md {
+ int contig_hint; /* contig hint for block */
+ int contig_hint_start; /* block relative starting
+ position of the contig hint */
+ int left_free; /* size of free space along
+ the left side of the block */
+ int right_free; /* size of free space along
+ the right side of the block */
+ int first_free; /* block position of first free */
+};
+
struct pcpu_chunk {
#ifdef CONFIG_PERCPU_STATS
int nr_alloc; /* # of allocations */
@@ -11,24 +27,29 @@ struct pcpu_chunk {
#endif
struct list_head list; /* linked to pcpu_slot lists */
- int free_size; /* free bytes in the chunk */
- int contig_hint; /* max contiguous size hint */
+ int free_bytes; /* free bytes in the chunk */
+ int contig_bits; /* max contiguous size hint */
+ int contig_bits_start; /* contig_bits starting
+ offset */
void *base_addr; /* base address of this chunk */
- int map_used; /* # of map entries used before the sentry */
- int map_alloc; /* # of map entries allocated */
- int *map; /* allocation map */
- struct list_head map_extend_list;/* on pcpu_map_extend_chunks */
+ unsigned long *alloc_map; /* allocation map */
+ unsigned long *bound_map; /* boundary map */
+ struct pcpu_block_md *md_blocks; /* metadata blocks */
void *data; /* chunk data */
- int first_free; /* no free below this */
+ int first_bit; /* no free below this */
bool immutable; /* no [de]population allowed */
- bool has_reserved; /* Indicates if chunk has reserved space
- at the beginning. Reserved chunk will
- contain reservation for static chunk.
- Dynamic chunk will contain reservation
- for static and reserved chunks. */
+ int start_offset; /* the overlap with the previous
+ region to have a page aligned
+ base_addr */
+ int end_offset; /* additional area required to
+ have the region end page
+ aligned */
+
+ int nr_pages; /* # of pages served by this chunk */
int nr_populated; /* # of populated pages */
+ int nr_empty_pop_pages; /* # of empty populated pages */
unsigned long populated[]; /* populated bitmap */
};
@@ -36,10 +57,47 @@ extern spinlock_t pcpu_lock;
extern struct list_head *pcpu_slot;
extern int pcpu_nr_slots;
+extern int pcpu_nr_empty_pop_pages;
extern struct pcpu_chunk *pcpu_first_chunk;
extern struct pcpu_chunk *pcpu_reserved_chunk;
+/**
+ * pcpu_chunk_nr_blocks - converts nr_pages to # of md_blocks
+ * @chunk: chunk of interest
+ *
+ * This conversion is from the number of physical pages that the chunk
+ * serves to the number of bitmap blocks used.
+ */
+static inline int pcpu_chunk_nr_blocks(struct pcpu_chunk *chunk)
+{
+ return chunk->nr_pages * PAGE_SIZE / PCPU_BITMAP_BLOCK_SIZE;
+}
+
+/**
+ * pcpu_nr_pages_to_map_bits - converts the pages to size of bitmap
+ * @pages: number of physical pages
+ *
+ * This conversion is from physical pages to the number of bits
+ * required in the bitmap.
+ */
+static inline int pcpu_nr_pages_to_map_bits(int pages)
+{
+ return pages * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE;
+}
+
+/**
+ * pcpu_chunk_map_bits - helper to convert nr_pages to size of bitmap
+ * @chunk: chunk of interest
+ *
+ * This conversion is from the number of physical pages that the chunk
+ * serves to the number of bits in the bitmap.
+ */
+static inline int pcpu_chunk_map_bits(struct pcpu_chunk *chunk)
+{
+ return pcpu_nr_pages_to_map_bits(chunk->nr_pages);
+}
+
#ifdef CONFIG_PERCPU_STATS
#include <linux/spinlock.h>
diff --git a/mm/percpu-km.c b/mm/percpu-km.c
index eb58aa4c0997..d2a76642c4ae 100644
--- a/mm/percpu-km.c
+++ b/mm/percpu-km.c
@@ -69,7 +69,7 @@ static struct pcpu_chunk *pcpu_create_chunk(void)
chunk->base_addr = page_address(pages) - pcpu_group_offsets[0];
spin_lock_irq(&pcpu_lock);
- pcpu_chunk_populated(chunk, 0, nr_pages);
+ pcpu_chunk_populated(chunk, 0, nr_pages, false);
spin_unlock_irq(&pcpu_lock);
pcpu_stats_chunk_alloc();
diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c
index 03524a56eeff..6142484e88f7 100644
--- a/mm/percpu-stats.c
+++ b/mm/percpu-stats.c
@@ -18,7 +18,7 @@
#include "percpu-internal.h"
#define P(X, Y) \
- seq_printf(m, " %-24s: %8lld\n", X, (long long int)Y)
+ seq_printf(m, " %-20s: %12lld\n", X, (long long int)Y)
struct percpu_stats pcpu_stats;
struct pcpu_alloc_info pcpu_stats_ai;
@@ -29,64 +29,85 @@ static int cmpint(const void *a, const void *b)
}
/*
- * Iterates over all chunks to find the max # of map entries used.
+ * Iterates over all chunks to find the max nr_alloc entries.
*/
-static int find_max_map_used(void)
+static int find_max_nr_alloc(void)
{
struct pcpu_chunk *chunk;
- int slot, max_map_used;
+ int slot, max_nr_alloc;
- max_map_used = 0;
+ max_nr_alloc = 0;
for (slot = 0; slot < pcpu_nr_slots; slot++)
list_for_each_entry(chunk, &pcpu_slot[slot], list)
- max_map_used = max(max_map_used, chunk->map_used);
+ max_nr_alloc = max(max_nr_alloc, chunk->nr_alloc);
- return max_map_used;
+ return max_nr_alloc;
}
/*
* Prints out chunk state. Fragmentation is considered between
* the beginning of the chunk to the last allocation.
+ *
+ * All statistics are in bytes unless stated otherwise.
*/
static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk,
- void *buffer)
+ int *buffer)
{
- int i, s_index, last_alloc, alloc_sign, as_len;
+ int i, last_alloc, as_len, start, end;
int *alloc_sizes, *p;
/* statistics */
int sum_frag = 0, max_frag = 0;
int cur_min_alloc = 0, cur_med_alloc = 0, cur_max_alloc = 0;
alloc_sizes = buffer;
- s_index = chunk->has_reserved ? 1 : 0;
-
- /* find last allocation */
- last_alloc = -1;
- for (i = chunk->map_used - 1; i >= s_index; i--) {
- if (chunk->map[i] & 1) {
- last_alloc = i;
- break;
- }
- }
- /* if the chunk is not empty - ignoring reserve */
- if (last_alloc >= s_index) {
- as_len = last_alloc + 1 - s_index;
-
- /*
- * Iterate through chunk map computing size info.
- * The first bit is overloaded to be a used flag.
- * negative = free space, positive = allocated
- */
- for (i = 0, p = chunk->map + s_index; i < as_len; i++, p++) {
- alloc_sign = (*p & 1) ? 1 : -1;
- alloc_sizes[i] = alloc_sign *
- ((p[1] & ~1) - (p[0] & ~1));
+ /*
+ * find_last_bit returns the start value if nothing found.
+ * Therefore, we must determine if it is a failure of find_last_bit
+ * and set the appropriate value.
+ */
+ last_alloc = find_last_bit(chunk->alloc_map,
+ pcpu_chunk_map_bits(chunk) -
+ chunk->end_offset / PCPU_MIN_ALLOC_SIZE - 1);
+ last_alloc = test_bit(last_alloc, chunk->alloc_map) ?
+ last_alloc + 1 : 0;
+
+ as_len = 0;
+ start = chunk->start_offset;
+
+ /*
+ * If a bit is set in the allocation map, the bound_map identifies
+ * where the allocation ends. If the allocation is not set, the
+ * bound_map does not identify free areas as it is only kept accurate
+ * on allocation, not free.
+ *
+ * Positive values are allocations and negative values are free
+ * fragments.
+ */
+ while (start < last_alloc) {
+ if (test_bit(start, chunk->alloc_map)) {
+ end = find_next_bit(chunk->bound_map, last_alloc,
+ start + 1);
+ alloc_sizes[as_len] = 1;
+ } else {
+ end = find_next_bit(chunk->alloc_map, last_alloc,
+ start + 1);
+ alloc_sizes[as_len] = -1;
}
- sort(alloc_sizes, as_len, sizeof(chunk->map[0]), cmpint, NULL);
+ alloc_sizes[as_len++] *= (end - start) * PCPU_MIN_ALLOC_SIZE;
+
+ start = end;
+ }
+
+ /*
+ * The negative values are free fragments and thus sorting gives the
+ * free fragments at the beginning in largest first order.
+ */
+ if (as_len > 0) {
+ sort(alloc_sizes, as_len, sizeof(int), cmpint, NULL);
- /* Iterate through the unallocated fragements. */
+ /* iterate through the unallocated fragments */
for (i = 0, p = alloc_sizes; *p < 0 && i < as_len; i++, p++) {
sum_frag -= *p;
max_frag = max(max_frag, -1 * (*p));
@@ -99,8 +120,10 @@ static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk,
P("nr_alloc", chunk->nr_alloc);
P("max_alloc_size", chunk->max_alloc_size);
- P("free_size", chunk->free_size);
- P("contig_hint", chunk->contig_hint);
+ P("empty_pop_pages", chunk->nr_empty_pop_pages);
+ P("first_bit", chunk->first_bit);
+ P("free_bytes", chunk->free_bytes);
+ P("contig_bytes", chunk->contig_bits * PCPU_MIN_ALLOC_SIZE);
P("sum_frag", sum_frag);
P("max_frag", max_frag);
P("cur_min_alloc", cur_min_alloc);
@@ -112,29 +135,30 @@ static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk,
static int percpu_stats_show(struct seq_file *m, void *v)
{
struct pcpu_chunk *chunk;
- int slot, max_map_used;
- void *buffer;
+ int slot, max_nr_alloc;
+ int *buffer;
alloc_buffer:
spin_lock_irq(&pcpu_lock);
- max_map_used = find_max_map_used();
+ max_nr_alloc = find_max_nr_alloc();
spin_unlock_irq(&pcpu_lock);
- buffer = vmalloc(max_map_used * sizeof(pcpu_first_chunk->map[0]));
+ /* there can be at most this many free and allocated fragments */
+ buffer = vmalloc((2 * max_nr_alloc + 1) * sizeof(int));
if (!buffer)
return -ENOMEM;
spin_lock_irq(&pcpu_lock);
/* if the buffer allocated earlier is too small */
- if (max_map_used < find_max_map_used()) {
+ if (max_nr_alloc < find_max_nr_alloc()) {
spin_unlock_irq(&pcpu_lock);
vfree(buffer);
goto alloc_buffer;
}
#define PL(X) \
- seq_printf(m, " %-24s: %8lld\n", #X, (long long int)pcpu_stats_ai.X)
+ seq_printf(m, " %-20s: %12lld\n", #X, (long long int)pcpu_stats_ai.X)
seq_printf(m,
"Percpu Memory Statistics\n"
@@ -151,7 +175,7 @@ alloc_buffer:
#undef PL
#define PU(X) \
- seq_printf(m, " %-18s: %14llu\n", #X, (unsigned long long)pcpu_stats.X)
+ seq_printf(m, " %-20s: %12llu\n", #X, (unsigned long long)pcpu_stats.X)
seq_printf(m,
"Global Stats:\n"
@@ -164,6 +188,7 @@ alloc_buffer:
PU(nr_max_chunks);
PU(min_alloc_size);
PU(max_alloc_size);
+ P("empty_pop_pages", pcpu_nr_empty_pop_pages);
seq_putc(m, '\n');
#undef PU
diff --git a/mm/percpu.c b/mm/percpu.c
index bd4130a69bbc..59d44d61f5f1 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -4,44 +4,53 @@
* Copyright (C) 2009 SUSE Linux Products GmbH
* Copyright (C) 2009 Tejun Heo <tj@kernel.org>
*
- * This file is released under the GPLv2.
+ * Copyright (C) 2017 Facebook Inc.
+ * Copyright (C) 2017 Dennis Zhou <dennisszhou@gmail.com>
*
- * This is percpu allocator which can handle both static and dynamic
- * areas. Percpu areas are allocated in chunks. Each chunk is
- * consisted of boot-time determined number of units and the first
- * chunk is used for static percpu variables in the kernel image
- * (special boot time alloc/init handling necessary as these areas
- * need to be brought up before allocation services are running).
- * Unit grows as necessary and all units grow or shrink in unison.
- * When a chunk is filled up, another chunk is allocated.
+ * This file is released under the GPLv2 license.
+ *
+ * The percpu allocator handles both static and dynamic areas. Percpu
+ * areas are allocated in chunks which are divided into units. There is
+ * a 1-to-1 mapping for units to possible cpus. These units are grouped
+ * based on NUMA properties of the machine.
*
* c0 c1 c2
* ------------------- ------------------- ------------
* | u0 | u1 | u2 | u3 | | u0 | u1 | u2 | u3 | | u0 | u1 | u
* ------------------- ...... ------------------- .... ------------
*
- * Allocation is done in offset-size areas of single unit space. Ie,
- * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0,
- * c1:u1, c1:u2 and c1:u3. On UMA, units corresponds directly to
- * cpus. On NUMA, the mapping can be non-linear and even sparse.
- * Percpu access can be done by configuring percpu base registers
- * according to cpu to unit mapping and pcpu_unit_size.
- *
- * There are usually many small percpu allocations many of them being
- * as small as 4 bytes. The allocator organizes chunks into lists
- * according to free size and tries to allocate from the fullest one.
- * Each chunk keeps the maximum contiguous area size hint which is
- * guaranteed to be equal to or larger than the maximum contiguous
- * area in the chunk. This helps the allocator not to iterate the
- * chunk maps unnecessarily.
- *
- * Allocation state in each chunk is kept using an array of integers
- * on chunk->map. A positive value in the map represents a free
- * region and negative allocated. Allocation inside a chunk is done
- * by scanning this map sequentially and serving the first matching
- * entry. This is mostly copied from the percpu_modalloc() allocator.
- * Chunks can be determined from the address using the index field
- * in the page struct. The index field contains a pointer to the chunk.
+ * Allocation is done by offsets into a unit's address space. Ie., an
+ * area of 512 bytes at 6k in c1 occupies 512 bytes at 6k in c1:u0,
+ * c1:u1, c1:u2, etc. On NUMA machines, the mapping may be non-linear
+ * and even sparse. Access is handled by configuring percpu base
+ * registers according to the cpu to unit mappings and offsetting the
+ * base address using pcpu_unit_size.
+ *
+ * There is special consideration for the first chunk which must handle
+ * the static percpu variables in the kernel image as allocation services
+ * are not online yet. In short, the first chunk is structured like so:
+ *
+ * <Static | [Reserved] | Dynamic>
+ *
+ * The static data is copied from the original section managed by the
+ * linker. The reserved section, if non-zero, primarily manages static
+ * percpu variables from kernel modules. Finally, the dynamic section
+ * takes care of normal allocations.
+ *
+ * The allocator organizes chunks into lists according to free size and
+ * tries to allocate from the fullest chunk first. Each chunk is managed
+ * by a bitmap with metadata blocks. The allocation map is updated on
+ * every allocation and free to reflect the current state while the boundary
+ * map is only updated on allocation. Each metadata block contains
+ * information to help mitigate the need to iterate over large portions
+ * of the bitmap. The reverse mapping from page to chunk is stored in
+ * the page's index. Lastly, units are lazily backed and grow in unison.
+ *
+ * There is a unique conversion that goes on here between bytes and bits.
+ * Each bit represents a fragment of size PCPU_MIN_ALLOC_SIZE. The chunk
+ * tracks the number of pages it is responsible for in nr_pages. Helper
+ * functions are used to convert from between the bytes, bits, and blocks.
+ * All hints are managed in bits unless explicitly stated.
*
* To use this allocator, arch code should do the following:
*
@@ -58,6 +67,7 @@
#include <linux/bitmap.h>
#include <linux/bootmem.h>
#include <linux/err.h>
+#include <linux/lcm.h>
#include <linux/list.h>
#include <linux/log2.h>
#include <linux/mm.h>
@@ -81,10 +91,9 @@
#include "percpu-internal.h"
-#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */
-#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */
-#define PCPU_ATOMIC_MAP_MARGIN_LOW 32
-#define PCPU_ATOMIC_MAP_MARGIN_HIGH 64
+/* the slots are sorted by free bytes left, 1-31 bytes share the same slot */
+#define PCPU_SLOT_BASE_SHIFT 5
+
#define PCPU_EMPTY_POP_PAGES_LOW 2
#define PCPU_EMPTY_POP_PAGES_HIGH 4
@@ -140,13 +149,10 @@ struct pcpu_chunk *pcpu_first_chunk __ro_after_init;
/*
* Optional reserved chunk. This chunk reserves part of the first
- * chunk and serves it for reserved allocations. The amount of
- * reserved offset is in pcpu_reserved_chunk_limit. When reserved
- * area doesn't exist, the following variables contain NULL and 0
- * respectively.
+ * chunk and serves it for reserved allocations. When the reserved
+ * region doesn't exist, the following variable is NULL.
*/
struct pcpu_chunk *pcpu_reserved_chunk __ro_after_init;
-static int pcpu_reserved_chunk_limit __ro_after_init;
DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */
static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop, map ext */
@@ -160,7 +166,7 @@ static LIST_HEAD(pcpu_map_extend_chunks);
* The number of empty populated pages, protected by pcpu_lock. The
* reserved chunk doesn't contribute to the count.
*/
-static int pcpu_nr_empty_pop_pages;
+int pcpu_nr_empty_pop_pages;
/*
* Balance work is used to populate or destroy chunks asynchronously. We
@@ -179,19 +185,26 @@ static void pcpu_schedule_balance_work(void)
schedule_work(&pcpu_balance_work);
}
-static bool pcpu_addr_in_first_chunk(void *addr)
+/**
+ * pcpu_addr_in_chunk - check if the address is served from this chunk
+ * @chunk: chunk of interest
+ * @addr: percpu address
+ *
+ * RETURNS:
+ * True if the address is served from this chunk.
+ */
+static bool pcpu_addr_in_chunk(struct pcpu_chunk *chunk, void *addr)
{
- void *first_start = pcpu_first_chunk->base_addr;
+ void *start_addr, *end_addr;
- return addr >= first_start && addr < first_start + pcpu_unit_size;
-}
+ if (!chunk)
+ return false;
-static bool pcpu_addr_in_reserved_chunk(void *addr)
-{
- void *first_start = pcpu_first_chunk->base_addr;
+ start_addr = chunk->base_addr + chunk->start_offset;
+ end_addr = chunk->base_addr + chunk->nr_pages * PAGE_SIZE -
+ chunk->end_offset;
- return addr >= first_start &&
- addr < first_start + pcpu_reserved_chunk_limit;
+ return addr >= start_addr && addr < end_addr;
}
static int __pcpu_size_to_slot(int size)
@@ -209,10 +222,10 @@ static int pcpu_size_to_slot(int size)
static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
{
- if (chunk->free_size < sizeof(int) || chunk->contig_hint < sizeof(int))
+ if (chunk->free_bytes < PCPU_MIN_ALLOC_SIZE || chunk->contig_bits == 0)
return 0;
- return pcpu_size_to_slot(chunk->free_size);
+ return pcpu_size_to_slot(chunk->free_bytes);
}
/* set the pointer to a chunk in a page struct */
@@ -232,42 +245,200 @@ static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx)
return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx;
}
+static unsigned long pcpu_unit_page_offset(unsigned int cpu, int page_idx)
+{
+ return pcpu_unit_offsets[cpu] + (page_idx << PAGE_SHIFT);
+}
+
static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
unsigned int cpu, int page_idx)
{
- return (unsigned long)chunk->base_addr + pcpu_unit_offsets[cpu] +
- (page_idx << PAGE_SHIFT);
+ return (unsigned long)chunk->base_addr +
+ pcpu_unit_page_offset(cpu, page_idx);
}
-static void __maybe_unused pcpu_next_unpop(struct pcpu_chunk *chunk,
- int *rs, int *re, int end)
+static void pcpu_next_unpop(unsigned long *bitmap, int *rs, int *re, int end)
{
- *rs = find_next_zero_bit(chunk->populated, end, *rs);
- *re = find_next_bit(chunk->populated, end, *rs + 1);
+ *rs = find_next_zero_bit(bitmap, end, *rs);
+ *re = find_next_bit(bitmap, end, *rs + 1);
}
-static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk,
- int *rs, int *re, int end)
+static void pcpu_next_pop(unsigned long *bitmap, int *rs, int *re, int end)
{
- *rs = find_next_bit(chunk->populated, end, *rs);
- *re = find_next_zero_bit(chunk->populated, end, *rs + 1);
+ *rs = find_next_bit(bitmap, end, *rs);
+ *re = find_next_zero_bit(bitmap, end, *rs + 1);
}
/*
- * (Un)populated page region iterators. Iterate over (un)populated
- * page regions between @start and @end in @chunk. @rs and @re should
- * be integer variables and will be set to start and end page index of
- * the current region.
+ * Bitmap region iterators. Iterates over the bitmap between
+ * [@start, @end) in @chunk. @rs and @re should be integer variables
+ * and will be set to start and end index of the current free region.
+ */
+#define pcpu_for_each_unpop_region(bitmap, rs, re, start, end) \
+ for ((rs) = (start), pcpu_next_unpop((bitmap), &(rs), &(re), (end)); \
+ (rs) < (re); \
+ (rs) = (re) + 1, pcpu_next_unpop((bitmap), &(rs), &(re), (end)))
+
+#define pcpu_for_each_pop_region(bitmap, rs, re, start, end) \
+ for ((rs) = (start), pcpu_next_pop((bitmap), &(rs), &(re), (end)); \
+ (rs) < (re); \
+ (rs) = (re) + 1, pcpu_next_pop((bitmap), &(rs), &(re), (end)))
+
+/*
+ * The following are helper functions to help access bitmaps and convert
+ * between bitmap offsets to address offsets.
+ */
+static unsigned long *pcpu_index_alloc_map(struct pcpu_chunk *chunk, int index)
+{
+ return chunk->alloc_map +
+ (index * PCPU_BITMAP_BLOCK_BITS / BITS_PER_LONG);
+}
+
+static unsigned long pcpu_off_to_block_index(int off)
+{
+ return off / PCPU_BITMAP_BLOCK_BITS;
+}
+
+static unsigned long pcpu_off_to_block_off(int off)
+{
+ return off & (PCPU_BITMAP_BLOCK_BITS - 1);
+}
+
+static unsigned long pcpu_block_off_to_off(int index, int off)
+{
+ return index * PCPU_BITMAP_BLOCK_BITS + off;
+}
+
+/**
+ * pcpu_next_md_free_region - finds the next hint free area
+ * @chunk: chunk of interest
+ * @bit_off: chunk offset
+ * @bits: size of free area
+ *
+ * Helper function for pcpu_for_each_md_free_region. It checks
+ * block->contig_hint and performs aggregation across blocks to find the
+ * next hint. It modifies bit_off and bits in-place to be consumed in the
+ * loop.
+ */
+static void pcpu_next_md_free_region(struct pcpu_chunk *chunk, int *bit_off,
+ int *bits)
+{
+ int i = pcpu_off_to_block_index(*bit_off);
+ int block_off = pcpu_off_to_block_off(*bit_off);
+ struct pcpu_block_md *block;
+
+ *bits = 0;
+ for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
+ block++, i++) {
+ /* handles contig area across blocks */
+ if (*bits) {
+ *bits += block->left_free;
+ if (block->left_free == PCPU_BITMAP_BLOCK_BITS)
+ continue;
+ return;
+ }
+
+ /*
+ * This checks three things. First is there a contig_hint to
+ * check. Second, have we checked this hint before by
+ * comparing the block_off. Third, is this the same as the
+ * right contig hint. In the last case, it spills over into
+ * the next block and should be handled by the contig area
+ * across blocks code.
+ */
+ *bits = block->contig_hint;
+ if (*bits && block->contig_hint_start >= block_off &&
+ *bits + block->contig_hint_start < PCPU_BITMAP_BLOCK_BITS) {
+ *bit_off = pcpu_block_off_to_off(i,
+ block->contig_hint_start);
+ return;
+ }
+
+ *bits = block->right_free;
+ *bit_off = (i + 1) * PCPU_BITMAP_BLOCK_BITS - block->right_free;
+ }
+}
+
+/**
+ * pcpu_next_fit_region - finds fit areas for a given allocation request
+ * @chunk: chunk of interest
+ * @alloc_bits: size of allocation
+ * @align: alignment of area (max PAGE_SIZE)
+ * @bit_off: chunk offset
+ * @bits: size of free area
+ *
+ * Finds the next free region that is viable for use with a given size and
+ * alignment. This only returns if there is a valid area to be used for this
+ * allocation. block->first_free is returned if the allocation request fits
+ * within the block to see if the request can be fulfilled prior to the contig
+ * hint.
*/
-#define pcpu_for_each_unpop_region(chunk, rs, re, start, end) \
- for ((rs) = (start), pcpu_next_unpop((chunk), &(rs), &(re), (end)); \
- (rs) < (re); \
- (rs) = (re) + 1, pcpu_next_unpop((chunk), &(rs), &(re), (end)))
+static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits,
+ int align, int *bit_off, int *bits)
+{
+ int i = pcpu_off_to_block_index(*bit_off);
+ int block_off = pcpu_off_to_block_off(*bit_off);
+ struct pcpu_block_md *block;
+
+ *bits = 0;
+ for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
+ block++, i++) {
+ /* handles contig area across blocks */
+ if (*bits) {
+ *bits += block->left_free;
+ if (*bits >= alloc_bits)
+ return;
+ if (block->left_free == PCPU_BITMAP_BLOCK_BITS)
+ continue;
+ }
+
+ /* check block->contig_hint */
+ *bits = ALIGN(block->contig_hint_start, align) -
+ block->contig_hint_start;
+ /*
+ * This uses the block offset to determine if this has been
+ * checked in the prior iteration.
+ */
+ if (block->contig_hint &&
+ block->contig_hint_start >= block_off &&
+ block->contig_hint >= *bits + alloc_bits) {
+ *bits += alloc_bits + block->contig_hint_start -
+ block->first_free;
+ *bit_off = pcpu_block_off_to_off(i, block->first_free);
+ return;
+ }
+
+ *bit_off = ALIGN(PCPU_BITMAP_BLOCK_BITS - block->right_free,
+ align);
+ *bits = PCPU_BITMAP_BLOCK_BITS - *bit_off;
+ *bit_off = pcpu_block_off_to_off(i, *bit_off);
+ if (*bits >= alloc_bits)
+ return;
+ }
-#define pcpu_for_each_pop_region(chunk, rs, re, start, end) \
- for ((rs) = (start), pcpu_next_pop((chunk), &(rs), &(re), (end)); \
- (rs) < (re); \
- (rs) = (re) + 1, pcpu_next_pop((chunk), &(rs), &(re), (end)))
+ /* no valid offsets were found - fail condition */
+ *bit_off = pcpu_chunk_map_bits(chunk);
+}
+
+/*
+ * Metadata free area iterators. These perform aggregation of free areas
+ * based on the metadata blocks and return the offset @bit_off and size in
+ * bits of the free area @bits. pcpu_for_each_fit_region only returns when
+ * a fit is found for the allocation request.
+ */
+#define pcpu_for_each_md_free_region(chunk, bit_off, bits) \
+ for (pcpu_next_md_free_region((chunk), &(bit_off), &(bits)); \
+ (bit_off) < pcpu_chunk_map_bits((chunk)); \
+ (bit_off) += (bits) + 1, \
+ pcpu_next_md_free_region((chunk), &(bit_off), &(bits)))
+
+#define pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) \
+ for (pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \
+ &(bits)); \
+ (bit_off) < pcpu_chunk_map_bits((chunk)); \
+ (bit_off) += (bits), \
+ pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \
+ &(bits)))
/**
* pcpu_mem_zalloc - allocate memory
@@ -306,38 +477,6 @@ static void pcpu_mem_free(void *ptr)
}
/**
- * pcpu_count_occupied_pages - count the number of pages an area occupies
- * @chunk: chunk of interest
- * @i: index of the area in question
- *
- * Count the number of pages chunk's @i'th area occupies. When the area's
- * start and/or end address isn't aligned to page boundary, the straddled
- * page is included in the count iff the rest of the page is free.
- */
-static int pcpu_count_occupied_pages(struct pcpu_chunk *chunk, int i)
-{
- int off = chunk->map[i] & ~1;
- int end = chunk->map[i + 1] & ~1;
-
- if (!PAGE_ALIGNED(off) && i > 0) {
- int prev = chunk->map[i - 1];
-
- if (!(prev & 1) && prev <= round_down(off, PAGE_SIZE))
- off = round_down(off, PAGE_SIZE);
- }
-
- if (!PAGE_ALIGNED(end) && i + 1 < chunk->map_used) {
- int next = chunk->map[i + 1];
- int nend = chunk->map[i + 2] & ~1;
-
- if (!(next & 1) && nend >= round_up(end, PAGE_SIZE))
- end = round_up(end, PAGE_SIZE);
- }
-
- return max_t(int, PFN_DOWN(end) - PFN_UP(off), 0);
-}
-
-/**
* pcpu_chunk_relocate - put chunk in the appropriate chunk slot
* @chunk: chunk of interest
* @oslot: the previous slot it was on
@@ -363,383 +502,706 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
}
/**
- * pcpu_need_to_extend - determine whether chunk area map needs to be extended
+ * pcpu_cnt_pop_pages- counts populated backing pages in range
* @chunk: chunk of interest
- * @is_atomic: the allocation context
+ * @bit_off: start offset
+ * @bits: size of area to check
*
- * Determine whether area map of @chunk needs to be extended. If
- * @is_atomic, only the amount necessary for a new allocation is
- * considered; however, async extension is scheduled if the left amount is
- * low. If !@is_atomic, it aims for more empty space. Combined, this
- * ensures that the map is likely to have enough available space to
- * accomodate atomic allocations which can't extend maps directly.
- *
- * CONTEXT:
- * pcpu_lock.
+ * Calculates the number of populated pages in the region
+ * [page_start, page_end). This keeps track of how many empty populated
+ * pages are available and decide if async work should be scheduled.
*
* RETURNS:
- * New target map allocation length if extension is necessary, 0
- * otherwise.
+ * The nr of populated pages.
*/
-static int pcpu_need_to_extend(struct pcpu_chunk *chunk, bool is_atomic)
+static inline int pcpu_cnt_pop_pages(struct pcpu_chunk *chunk, int bit_off,
+ int bits)
{
- int margin, new_alloc;
-
- lockdep_assert_held(&pcpu_lock);
-
- if (is_atomic) {
- margin = 3;
+ int page_start = PFN_UP(bit_off * PCPU_MIN_ALLOC_SIZE);
+ int page_end = PFN_DOWN((bit_off + bits) * PCPU_MIN_ALLOC_SIZE);
- if (chunk->map_alloc <
- chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW) {
- if (list_empty(&chunk->map_extend_list)) {
- list_add_tail(&chunk->map_extend_list,
- &pcpu_map_extend_chunks);
- pcpu_schedule_balance_work();
- }
- }
- } else {
- margin = PCPU_ATOMIC_MAP_MARGIN_HIGH;
- }
-
- if (chunk->map_alloc >= chunk->map_used + margin)
+ if (page_start >= page_end)
return 0;
- new_alloc = PCPU_DFL_MAP_ALLOC;
- while (new_alloc < chunk->map_used + margin)
- new_alloc *= 2;
-
- return new_alloc;
+ /*
+ * bitmap_weight counts the number of bits set in a bitmap up to
+ * the specified number of bits. This is counting the populated
+ * pages up to page_end and then subtracting the populated pages
+ * up to page_start to count the populated pages in
+ * [page_start, page_end).
+ */
+ return bitmap_weight(chunk->populated, page_end) -
+ bitmap_weight(chunk->populated, page_start);
}
/**
- * pcpu_extend_area_map - extend area map of a chunk
+ * pcpu_chunk_update - updates the chunk metadata given a free area
* @chunk: chunk of interest
- * @new_alloc: new target allocation length of the area map
+ * @bit_off: chunk offset
+ * @bits: size of free area
*
- * Extend area map of @chunk to have @new_alloc entries.
+ * This updates the chunk's contig hint and starting offset given a free area.
+ * Choose the best starting offset if the contig hint is equal.
+ */
+static void pcpu_chunk_update(struct pcpu_chunk *chunk, int bit_off, int bits)
+{
+ if (bits > chunk->contig_bits) {
+ chunk->contig_bits_start = bit_off;
+ chunk->contig_bits = bits;
+ } else if (bits == chunk->contig_bits && chunk->contig_bits_start &&
+ (!bit_off ||
+ __ffs(bit_off) > __ffs(chunk->contig_bits_start))) {
+ /* use the start with the best alignment */
+ chunk->contig_bits_start = bit_off;
+ }
+}
+
+/**
+ * pcpu_chunk_refresh_hint - updates metadata about a chunk
+ * @chunk: chunk of interest
*
- * CONTEXT:
- * Does GFP_KERNEL allocation. Grabs and releases pcpu_lock.
+ * Iterates over the metadata blocks to find the largest contig area.
+ * It also counts the populated pages and uses the delta to update the
+ * global count.
*
- * RETURNS:
- * 0 on success, -errno on failure.
+ * Updates:
+ * chunk->contig_bits
+ * chunk->contig_bits_start
+ * nr_empty_pop_pages (chunk and global)
*/
-static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc)
+static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk)
{
- int *old = NULL, *new = NULL;
- size_t old_size = 0, new_size = new_alloc * sizeof(new[0]);
- unsigned long flags;
+ int bit_off, bits, nr_empty_pop_pages;
- lockdep_assert_held(&pcpu_alloc_mutex);
+ /* clear metadata */
+ chunk->contig_bits = 0;
- new = pcpu_mem_zalloc(new_size);
- if (!new)
- return -ENOMEM;
+ bit_off = chunk->first_bit;
+ bits = nr_empty_pop_pages = 0;
+ pcpu_for_each_md_free_region(chunk, bit_off, bits) {
+ pcpu_chunk_update(chunk, bit_off, bits);
- /* acquire pcpu_lock and switch to new area map */
- spin_lock_irqsave(&pcpu_lock, flags);
+ nr_empty_pop_pages += pcpu_cnt_pop_pages(chunk, bit_off, bits);
+ }
- if (new_alloc <= chunk->map_alloc)
- goto out_unlock;
+ /*
+ * Keep track of nr_empty_pop_pages.
+ *
+ * The chunk maintains the previous number of free pages it held,
+ * so the delta is used to update the global counter. The reserved
+ * chunk is not part of the free page count as they are populated
+ * at init and are special to serving reserved allocations.
+ */
+ if (chunk != pcpu_reserved_chunk)
+ pcpu_nr_empty_pop_pages +=
+ (nr_empty_pop_pages - chunk->nr_empty_pop_pages);
- old_size = chunk->map_alloc * sizeof(chunk->map[0]);
- old = chunk->map;
+ chunk->nr_empty_pop_pages = nr_empty_pop_pages;
+}
- memcpy(new, old, old_size);
+/**
+ * pcpu_block_update - updates a block given a free area
+ * @block: block of interest
+ * @start: start offset in block
+ * @end: end offset in block
+ *
+ * Updates a block given a known free area. The region [start, end) is
+ * expected to be the entirety of the free area within a block. Chooses
+ * the best starting offset if the contig hints are equal.
+ */
+static void pcpu_block_update(struct pcpu_block_md *block, int start, int end)
+{
+ int contig = end - start;
+
+ block->first_free = min(block->first_free, start);
+ if (start == 0)
+ block->left_free = contig;
+
+ if (end == PCPU_BITMAP_BLOCK_BITS)
+ block->right_free = contig;
+
+ if (contig > block->contig_hint) {
+ block->contig_hint_start = start;
+ block->contig_hint = contig;
+ } else if (block->contig_hint_start && contig == block->contig_hint &&
+ (!start || __ffs(start) > __ffs(block->contig_hint_start))) {
+ /* use the start with the best alignment */
+ block->contig_hint_start = start;
+ }
+}
- chunk->map_alloc = new_alloc;
- chunk->map = new;
- new = NULL;
+/**
+ * pcpu_block_refresh_hint
+ * @chunk: chunk of interest
+ * @index: index of the metadata block
+ *
+ * Scans over the block beginning at first_free and updates the block
+ * metadata accordingly.
+ */
+static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index)
+{
+ struct pcpu_block_md *block = chunk->md_blocks + index;
+ unsigned long *alloc_map = pcpu_index_alloc_map(chunk, index);
+ int rs, re; /* region start, region end */
+
+ /* clear hints */
+ block->contig_hint = 0;
+ block->left_free = block->right_free = 0;
+
+ /* iterate over free areas and update the contig hints */
+ pcpu_for_each_unpop_region(alloc_map, rs, re, block->first_free,
+ PCPU_BITMAP_BLOCK_BITS) {
+ pcpu_block_update(block, rs, re);
+ }
+}
-out_unlock:
- spin_unlock_irqrestore(&pcpu_lock, flags);
+/**
+ * pcpu_block_update_hint_alloc - update hint on allocation path
+ * @chunk: chunk of interest
+ * @bit_off: chunk offset
+ * @bits: size of request
+ *
+ * Updates metadata for the allocation path. The metadata only has to be
+ * refreshed by a full scan iff the chunk's contig hint is broken. Block level
+ * scans are required if the block's contig hint is broken.
+ */
+static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off,
+ int bits)
+{
+ struct pcpu_block_md *s_block, *e_block, *block;
+ int s_index, e_index; /* block indexes of the freed allocation */
+ int s_off, e_off; /* block offsets of the freed allocation */
/*
- * pcpu_mem_free() might end up calling vfree() which uses
- * IRQ-unsafe lock and thus can't be called under pcpu_lock.
+ * Calculate per block offsets.
+ * The calculation uses an inclusive range, but the resulting offsets
+ * are [start, end). e_index always points to the last block in the
+ * range.
*/
- pcpu_mem_free(old);
- pcpu_mem_free(new);
+ s_index = pcpu_off_to_block_index(bit_off);
+ e_index = pcpu_off_to_block_index(bit_off + bits - 1);
+ s_off = pcpu_off_to_block_off(bit_off);
+ e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;
- return 0;
+ s_block = chunk->md_blocks + s_index;
+ e_block = chunk->md_blocks + e_index;
+
+ /*
+ * Update s_block.
+ * block->first_free must be updated if the allocation takes its place.
+ * If the allocation breaks the contig_hint, a scan is required to
+ * restore this hint.
+ */
+ if (s_off == s_block->first_free)
+ s_block->first_free = find_next_zero_bit(
+ pcpu_index_alloc_map(chunk, s_index),
+ PCPU_BITMAP_BLOCK_BITS,
+ s_off + bits);
+
+ if (s_off >= s_block->contig_hint_start &&
+ s_off < s_block->contig_hint_start + s_block->contig_hint) {
+ /* block contig hint is broken - scan to fix it */
+ pcpu_block_refresh_hint(chunk, s_index);
+ } else {
+ /* update left and right contig manually */
+ s_block->left_free = min(s_block->left_free, s_off);
+ if (s_index == e_index)
+ s_block->right_free = min_t(int, s_block->right_free,
+ PCPU_BITMAP_BLOCK_BITS - e_off);
+ else
+ s_block->right_free = 0;
+ }
+
+ /*
+ * Update e_block.
+ */
+ if (s_index != e_index) {
+ /*
+ * When the allocation is across blocks, the end is along
+ * the left part of the e_block.
+ */
+ e_block->first_free = find_next_zero_bit(
+ pcpu_index_alloc_map(chunk, e_index),
+ PCPU_BITMAP_BLOCK_BITS, e_off);
+
+ if (e_off == PCPU_BITMAP_BLOCK_BITS) {
+ /* reset the block */
+ e_block++;
+ } else {
+ if (e_off > e_block->contig_hint_start) {
+ /* contig hint is broken - scan to fix it */
+ pcpu_block_refresh_hint(chunk, e_index);
+ } else {
+ e_block->left_free = 0;
+ e_block->right_free =
+ min_t(int, e_block->right_free,
+ PCPU_BITMAP_BLOCK_BITS - e_off);
+ }
+ }
+
+ /* update in-between md_blocks */
+ for (block = s_block + 1; block < e_block; block++) {
+ block->contig_hint = 0;
+ block->left_free = 0;
+ block->right_free = 0;
+ }
+ }
+
+ /*
+ * The only time a full chunk scan is required is if the chunk
+ * contig hint is broken. Otherwise, it means a smaller space
+ * was used and therefore the chunk contig hint is still correct.
+ */
+ if (bit_off >= chunk->contig_bits_start &&
+ bit_off < chunk->contig_bits_start + chunk->contig_bits)
+ pcpu_chunk_refresh_hint(chunk);
}
/**
- * pcpu_fit_in_area - try to fit the requested allocation in a candidate area
- * @chunk: chunk the candidate area belongs to
- * @off: the offset to the start of the candidate area
- * @this_size: the size of the candidate area
- * @size: the size of the target allocation
- * @align: the alignment of the target allocation
- * @pop_only: only allocate from already populated region
- *
- * We're trying to allocate @size bytes aligned at @align. @chunk's area
- * at @off sized @this_size is a candidate. This function determines
- * whether the target allocation fits in the candidate area and returns the
- * number of bytes to pad after @off. If the target area doesn't fit, -1
- * is returned.
- *
- * If @pop_only is %true, this function only considers the already
- * populated part of the candidate area.
+ * pcpu_block_update_hint_free - updates the block hints on the free path
+ * @chunk: chunk of interest
+ * @bit_off: chunk offset
+ * @bits: size of request
+ *
+ * Updates metadata for the allocation path. This avoids a blind block
+ * refresh by making use of the block contig hints. If this fails, it scans
+ * forward and backward to determine the extent of the free area. This is
+ * capped at the boundary of blocks.
+ *
+ * A chunk update is triggered if a page becomes free, a block becomes free,
+ * or the free spans across blocks. This tradeoff is to minimize iterating
+ * over the block metadata to update chunk->contig_bits. chunk->contig_bits
+ * may be off by up to a page, but it will never be more than the available
+ * space. If the contig hint is contained in one block, it will be accurate.
*/
-static int pcpu_fit_in_area(struct pcpu_chunk *chunk, int off, int this_size,
- int size, int align, bool pop_only)
+static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off,
+ int bits)
{
- int cand_off = off;
-
- while (true) {
- int head = ALIGN(cand_off, align) - off;
- int page_start, page_end, rs, re;
+ struct pcpu_block_md *s_block, *e_block, *block;
+ int s_index, e_index; /* block indexes of the freed allocation */
+ int s_off, e_off; /* block offsets of the freed allocation */
+ int start, end; /* start and end of the whole free area */
- if (this_size < head + size)
- return -1;
+ /*
+ * Calculate per block offsets.
+ * The calculation uses an inclusive range, but the resulting offsets
+ * are [start, end). e_index always points to the last block in the
+ * range.
+ */
+ s_index = pcpu_off_to_block_index(bit_off);
+ e_index = pcpu_off_to_block_index(bit_off + bits - 1);
+ s_off = pcpu_off_to_block_off(bit_off);
+ e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;
- if (!pop_only)
- return head;
+ s_block = chunk->md_blocks + s_index;
+ e_block = chunk->md_blocks + e_index;
+ /*
+ * Check if the freed area aligns with the block->contig_hint.
+ * If it does, then the scan to find the beginning/end of the
+ * larger free area can be avoided.
+ *
+ * start and end refer to beginning and end of the free area
+ * within each their respective blocks. This is not necessarily
+ * the entire free area as it may span blocks past the beginning
+ * or end of the block.
+ */
+ start = s_off;
+ if (s_off == s_block->contig_hint + s_block->contig_hint_start) {
+ start = s_block->contig_hint_start;
+ } else {
/*
- * If the first unpopulated page is beyond the end of the
- * allocation, the whole allocation is populated;
- * otherwise, retry from the end of the unpopulated area.
+ * Scan backwards to find the extent of the free area.
+ * find_last_bit returns the starting bit, so if the start bit
+ * is returned, that means there was no last bit and the
+ * remainder of the chunk is free.
*/
- page_start = PFN_DOWN(head + off);
- page_end = PFN_UP(head + off + size);
-
- rs = page_start;
- pcpu_next_unpop(chunk, &rs, &re, PFN_UP(off + this_size));
- if (rs >= page_end)
- return head;
- cand_off = re * PAGE_SIZE;
+ int l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index),
+ start);
+ start = (start == l_bit) ? 0 : l_bit + 1;
+ }
+
+ end = e_off;
+ if (e_off == e_block->contig_hint_start)
+ end = e_block->contig_hint_start + e_block->contig_hint;
+ else
+ end = find_next_bit(pcpu_index_alloc_map(chunk, e_index),
+ PCPU_BITMAP_BLOCK_BITS, end);
+
+ /* update s_block */
+ e_off = (s_index == e_index) ? end : PCPU_BITMAP_BLOCK_BITS;
+ pcpu_block_update(s_block, start, e_off);
+
+ /* freeing in the same block */
+ if (s_index != e_index) {
+ /* update e_block */
+ pcpu_block_update(e_block, 0, end);
+
+ /* reset md_blocks in the middle */
+ for (block = s_block + 1; block < e_block; block++) {
+ block->first_free = 0;
+ block->contig_hint_start = 0;
+ block->contig_hint = PCPU_BITMAP_BLOCK_BITS;
+ block->left_free = PCPU_BITMAP_BLOCK_BITS;
+ block->right_free = PCPU_BITMAP_BLOCK_BITS;
+ }
}
+
+ /*
+ * Refresh chunk metadata when the free makes a page free, a block
+ * free, or spans across blocks. The contig hint may be off by up to
+ * a page, but if the hint is contained in a block, it will be accurate
+ * with the else condition below.
+ */
+ if ((ALIGN_DOWN(end, min(PCPU_BITS_PER_PAGE, PCPU_BITMAP_BLOCK_BITS)) >
+ ALIGN(start, min(PCPU_BITS_PER_PAGE, PCPU_BITMAP_BLOCK_BITS))) ||
+ s_index != e_index)
+ pcpu_chunk_refresh_hint(chunk);
+ else
+ pcpu_chunk_update(chunk, pcpu_block_off_to_off(s_index, start),
+ s_block->contig_hint);
}
/**
- * pcpu_alloc_area - allocate area from a pcpu_chunk
+ * pcpu_is_populated - determines if the region is populated
* @chunk: chunk of interest
- * @size: wanted size in bytes
- * @align: wanted align
- * @pop_only: allocate only from the populated area
- * @occ_pages_p: out param for the number of pages the area occupies
- *
- * Try to allocate @size bytes area aligned at @align from @chunk.
- * Note that this function only allocates the offset. It doesn't
- * populate or map the area.
+ * @bit_off: chunk offset
+ * @bits: size of area
+ * @next_off: return value for the next offset to start searching
*
- * @chunk->map must have at least two free slots.
+ * For atomic allocations, check if the backing pages are populated.
*
- * CONTEXT:
- * pcpu_lock.
+ * RETURNS:
+ * Bool if the backing pages are populated.
+ * next_index is to skip over unpopulated blocks in pcpu_find_block_fit.
+ */
+static bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits,
+ int *next_off)
+{
+ int page_start, page_end, rs, re;
+
+ page_start = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE);
+ page_end = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE);
+
+ rs = page_start;
+ pcpu_next_unpop(chunk->populated, &rs, &re, page_end);
+ if (rs >= page_end)
+ return true;
+
+ *next_off = re * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE;
+ return false;
+}
+
+/**
+ * pcpu_find_block_fit - finds the block index to start searching
+ * @chunk: chunk of interest
+ * @alloc_bits: size of request in allocation units
+ * @align: alignment of area (max PAGE_SIZE bytes)
+ * @pop_only: use populated regions only
+ *
+ * Given a chunk and an allocation spec, find the offset to begin searching
+ * for a free region. This iterates over the bitmap metadata blocks to
+ * find an offset that will be guaranteed to fit the requirements. It is
+ * not quite first fit as if the allocation does not fit in the contig hint
+ * of a block or chunk, it is skipped. This errs on the side of caution
+ * to prevent excess iteration. Poor alignment can cause the allocator to
+ * skip over blocks and chunks that have valid free areas.
*
* RETURNS:
- * Allocated offset in @chunk on success, -1 if no matching area is
- * found.
+ * The offset in the bitmap to begin searching.
+ * -1 if no offset is found.
*/
-static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align,
- bool pop_only, int *occ_pages_p)
+static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits,
+ size_t align, bool pop_only)
{
- int oslot = pcpu_chunk_slot(chunk);
- int max_contig = 0;
- int i, off;
- bool seen_free = false;
- int *p;
-
- for (i = chunk->first_free, p = chunk->map + i; i < chunk->map_used; i++, p++) {
- int head, tail;
- int this_size;
-
- off = *p;
- if (off & 1)
- continue;
+ int bit_off, bits, next_off;
- this_size = (p[1] & ~1) - off;
+ /*
+ * Check to see if the allocation can fit in the chunk's contig hint.
+ * This is an optimization to prevent scanning by assuming if it
+ * cannot fit in the global hint, there is memory pressure and creating
+ * a new chunk would happen soon.
+ */
+ bit_off = ALIGN(chunk->contig_bits_start, align) -
+ chunk->contig_bits_start;
+ if (bit_off + alloc_bits > chunk->contig_bits)
+ return -1;
+
+ bit_off = chunk->first_bit;
+ bits = 0;
+ pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) {
+ if (!pop_only || pcpu_is_populated(chunk, bit_off, bits,
+ &next_off))
+ break;
- head = pcpu_fit_in_area(chunk, off, this_size, size, align,
- pop_only);
- if (head < 0) {
- if (!seen_free) {
- chunk->first_free = i;
- seen_free = true;
- }
- max_contig = max(this_size, max_contig);
- continue;
- }
+ bit_off = next_off;
+ bits = 0;
+ }
- /*
- * If head is small or the previous block is free,
- * merge'em. Note that 'small' is defined as smaller
- * than sizeof(int), which is very small but isn't too
- * uncommon for percpu allocations.
- */
- if (head && (head < sizeof(int) || !(p[-1] & 1))) {
- *p = off += head;
- if (p[-1] & 1)
- chunk->free_size -= head;
- else
- max_contig = max(*p - p[-1], max_contig);
- this_size -= head;
- head = 0;
- }
+ if (bit_off == pcpu_chunk_map_bits(chunk))
+ return -1;
- /* if tail is small, just keep it around */
- tail = this_size - head - size;
- if (tail < sizeof(int)) {
- tail = 0;
- size = this_size - head;
- }
+ return bit_off;
+}
- /* split if warranted */
- if (head || tail) {
- int nr_extra = !!head + !!tail;
-
- /* insert new subblocks */
- memmove(p + nr_extra + 1, p + 1,
- sizeof(chunk->map[0]) * (chunk->map_used - i));
- chunk->map_used += nr_extra;
-
- if (head) {
- if (!seen_free) {
- chunk->first_free = i;
- seen_free = true;
- }
- *++p = off += head;
- ++i;
- max_contig = max(head, max_contig);
- }
- if (tail) {
- p[1] = off + size;
- max_contig = max(tail, max_contig);
- }
- }
+/**
+ * pcpu_alloc_area - allocates an area from a pcpu_chunk
+ * @chunk: chunk of interest
+ * @alloc_bits: size of request in allocation units
+ * @align: alignment of area (max PAGE_SIZE)
+ * @start: bit_off to start searching
+ *
+ * This function takes in a @start offset to begin searching to fit an
+ * allocation of @alloc_bits with alignment @align. It needs to scan
+ * the allocation map because if it fits within the block's contig hint,
+ * @start will be block->first_free. This is an attempt to fill the
+ * allocation prior to breaking the contig hint. The allocation and
+ * boundary maps are updated accordingly if it confirms a valid
+ * free area.
+ *
+ * RETURNS:
+ * Allocated addr offset in @chunk on success.
+ * -1 if no matching area is found.
+ */
+static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits,
+ size_t align, int start)
+{
+ size_t align_mask = (align) ? (align - 1) : 0;
+ int bit_off, end, oslot;
- if (!seen_free)
- chunk->first_free = i + 1;
+ lockdep_assert_held(&pcpu_lock);
- /* update hint and mark allocated */
- if (i + 1 == chunk->map_used)
- chunk->contig_hint = max_contig; /* fully scanned */
- else
- chunk->contig_hint = max(chunk->contig_hint,
- max_contig);
+ oslot = pcpu_chunk_slot(chunk);
- chunk->free_size -= size;
- *p |= 1;
+ /*
+ * Search to find a fit.
+ */
+ end = start + alloc_bits + PCPU_BITMAP_BLOCK_BITS;
+ bit_off = bitmap_find_next_zero_area(chunk->alloc_map, end, start,
+ alloc_bits, align_mask);
+ if (bit_off >= end)
+ return -1;
- *occ_pages_p = pcpu_count_occupied_pages(chunk, i);
- pcpu_chunk_relocate(chunk, oslot);
- return off;
- }
+ /* update alloc map */
+ bitmap_set(chunk->alloc_map, bit_off, alloc_bits);
+
+ /* update boundary map */
+ set_bit(bit_off, chunk->bound_map);
+ bitmap_clear(chunk->bound_map, bit_off + 1, alloc_bits - 1);
+ set_bit(bit_off + alloc_bits, chunk->bound_map);
+
+ chunk->free_bytes -= alloc_bits * PCPU_MIN_ALLOC_SIZE;
+
+ /* update first free bit */
+ if (bit_off == chunk->first_bit)
+ chunk->first_bit = find_next_zero_bit(
+ chunk->alloc_map,
+ pcpu_chunk_map_bits(chunk),
+ bit_off + alloc_bits);
+
+ pcpu_block_update_hint_alloc(chunk, bit_off, alloc_bits);
- chunk->contig_hint = max_contig; /* fully scanned */
pcpu_chunk_relocate(chunk, oslot);
- /* tell the upper layer that this chunk has no matching area */
- return -1;
+ return bit_off * PCPU_MIN_ALLOC_SIZE;
}
/**
- * pcpu_free_area - free area to a pcpu_chunk
+ * pcpu_free_area - frees the corresponding offset
* @chunk: chunk of interest
- * @freeme: offset of area to free
- * @occ_pages_p: out param for the number of pages the area occupies
- *
- * Free area starting from @freeme to @chunk. Note that this function
- * only modifies the allocation map. It doesn't depopulate or unmap
- * the area.
+ * @off: addr offset into chunk
*
- * CONTEXT:
- * pcpu_lock.
+ * This function determines the size of an allocation to free using
+ * the boundary bitmap and clears the allocation map.
*/
-static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme,
- int *occ_pages_p)
+static void pcpu_free_area(struct pcpu_chunk *chunk, int off)
{
- int oslot = pcpu_chunk_slot(chunk);
- int off = 0;
- unsigned i, j;
- int to_free = 0;
- int *p;
+ int bit_off, bits, end, oslot;
lockdep_assert_held(&pcpu_lock);
pcpu_stats_area_dealloc(chunk);
- freeme |= 1; /* we are searching for <given offset, in use> pair */
-
- i = 0;
- j = chunk->map_used;
- while (i != j) {
- unsigned k = (i + j) / 2;
- off = chunk->map[k];
- if (off < freeme)
- i = k + 1;
- else if (off > freeme)
- j = k;
- else
- i = j = k;
+ oslot = pcpu_chunk_slot(chunk);
+
+ bit_off = off / PCPU_MIN_ALLOC_SIZE;
+
+ /* find end index */
+ end = find_next_bit(chunk->bound_map, pcpu_chunk_map_bits(chunk),
+ bit_off + 1);
+ bits = end - bit_off;
+ bitmap_clear(chunk->alloc_map, bit_off, bits);
+
+ /* update metadata */
+ chunk->free_bytes += bits * PCPU_MIN_ALLOC_SIZE;
+
+ /* update first free bit */
+ chunk->first_bit = min(chunk->first_bit, bit_off);
+
+ pcpu_block_update_hint_free(chunk, bit_off, bits);
+
+ pcpu_chunk_relocate(chunk, oslot);
+}
+
+static void pcpu_init_md_blocks(struct pcpu_chunk *chunk)
+{
+ struct pcpu_block_md *md_block;
+
+ for (md_block = chunk->md_blocks;
+ md_block != chunk->md_blocks + pcpu_chunk_nr_blocks(chunk);
+ md_block++) {
+ md_block->contig_hint = PCPU_BITMAP_BLOCK_BITS;
+ md_block->left_free = PCPU_BITMAP_BLOCK_BITS;
+ md_block->right_free = PCPU_BITMAP_BLOCK_BITS;
}
- BUG_ON(off != freeme);
+}
- if (i < chunk->first_free)
- chunk->first_free = i;
+/**
+ * pcpu_alloc_first_chunk - creates chunks that serve the first chunk
+ * @tmp_addr: the start of the region served
+ * @map_size: size of the region served
+ *
+ * This is responsible for creating the chunks that serve the first chunk. The
+ * base_addr is page aligned down of @tmp_addr while the region end is page
+ * aligned up. Offsets are kept track of to determine the region served. All
+ * this is done to appease the bitmap allocator in avoiding partial blocks.
+ *
+ * RETURNS:
+ * Chunk serving the region at @tmp_addr of @map_size.
+ */
+static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
+ int map_size)
+{
+ struct pcpu_chunk *chunk;
+ unsigned long aligned_addr, lcm_align;
+ int start_offset, offset_bits, region_size, region_bits;
- p = chunk->map + i;
- *p = off &= ~1;
- chunk->free_size += (p[1] & ~1) - off;
+ /* region calculations */
+ aligned_addr = tmp_addr & PAGE_MASK;
- *occ_pages_p = pcpu_count_occupied_pages(chunk, i);
+ start_offset = tmp_addr - aligned_addr;
- /* merge with next? */
- if (!(p[1] & 1))
- to_free++;
- /* merge with previous? */
- if (i > 0 && !(p[-1] & 1)) {
- to_free++;
- i--;
- p--;
+ /*
+ * Align the end of the region with the LCM of PAGE_SIZE and
+ * PCPU_BITMAP_BLOCK_SIZE. One of these constants is a multiple of
+ * the other.
+ */
+ lcm_align = lcm(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE);
+ region_size = ALIGN(start_offset + map_size, lcm_align);
+
+ /* allocate chunk */
+ chunk = memblock_virt_alloc(sizeof(struct pcpu_chunk) +
+ BITS_TO_LONGS(region_size >> PAGE_SHIFT),
+ 0);
+
+ INIT_LIST_HEAD(&chunk->list);
+
+ chunk->base_addr = (void *)aligned_addr;
+ chunk->start_offset = start_offset;
+ chunk->end_offset = region_size - chunk->start_offset - map_size;
+
+ chunk->nr_pages = region_size >> PAGE_SHIFT;
+ region_bits = pcpu_chunk_map_bits(chunk);
+
+ chunk->alloc_map = memblock_virt_alloc(BITS_TO_LONGS(region_bits) *
+ sizeof(chunk->alloc_map[0]), 0);
+ chunk->bound_map = memblock_virt_alloc(BITS_TO_LONGS(region_bits + 1) *
+ sizeof(chunk->bound_map[0]), 0);
+ chunk->md_blocks = memblock_virt_alloc(pcpu_chunk_nr_blocks(chunk) *
+ sizeof(chunk->md_blocks[0]), 0);
+ pcpu_init_md_blocks(chunk);
+
+ /* manage populated page bitmap */
+ chunk->immutable = true;
+ bitmap_fill(chunk->populated, chunk->nr_pages);
+ chunk->nr_populated = chunk->nr_pages;
+ chunk->nr_empty_pop_pages =
+ pcpu_cnt_pop_pages(chunk, start_offset / PCPU_MIN_ALLOC_SIZE,
+ map_size / PCPU_MIN_ALLOC_SIZE);
+
+ chunk->contig_bits = map_size / PCPU_MIN_ALLOC_SIZE;
+ chunk->free_bytes = map_size;
+
+ if (chunk->start_offset) {
+ /* hide the beginning of the bitmap */
+ offset_bits = chunk->start_offset / PCPU_MIN_ALLOC_SIZE;
+ bitmap_set(chunk->alloc_map, 0, offset_bits);
+ set_bit(0, chunk->bound_map);
+ set_bit(offset_bits, chunk->bound_map);
+
+ chunk->first_bit = offset_bits;
+
+ pcpu_block_update_hint_alloc(chunk, 0, offset_bits);
}
- if (to_free) {
- chunk->map_used -= to_free;
- memmove(p + 1, p + 1 + to_free,
- (chunk->map_used - i) * sizeof(chunk->map[0]));
+
+ if (chunk->end_offset) {
+ /* hide the end of the bitmap */
+ offset_bits = chunk->end_offset / PCPU_MIN_ALLOC_SIZE;
+ bitmap_set(chunk->alloc_map,
+ pcpu_chunk_map_bits(chunk) - offset_bits,
+ offset_bits);
+ set_bit((start_offset + map_size) / PCPU_MIN_ALLOC_SIZE,
+ chunk->bound_map);
+ set_bit(region_bits, chunk->bound_map);
+
+ pcpu_block_update_hint_alloc(chunk, pcpu_chunk_map_bits(chunk)
+ - offset_bits, offset_bits);
}
- chunk->contig_hint = max(chunk->map[i + 1] - chunk->map[i] - 1, chunk->contig_hint);
- pcpu_chunk_relocate(chunk, oslot);
+ return chunk;
}
static struct pcpu_chunk *pcpu_alloc_chunk(void)
{
struct pcpu_chunk *chunk;
+ int region_bits;
chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size);
if (!chunk)
return NULL;
- chunk->map = pcpu_mem_zalloc(PCPU_DFL_MAP_ALLOC *
- sizeof(chunk->map[0]));
- if (!chunk->map) {
- pcpu_mem_free(chunk);
- return NULL;
- }
+ INIT_LIST_HEAD(&chunk->list);
+ chunk->nr_pages = pcpu_unit_pages;
+ region_bits = pcpu_chunk_map_bits(chunk);
- chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
- chunk->map[0] = 0;
- chunk->map[1] = pcpu_unit_size | 1;
- chunk->map_used = 1;
- chunk->has_reserved = false;
+ chunk->alloc_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits) *
+ sizeof(chunk->alloc_map[0]));
+ if (!chunk->alloc_map)
+ goto alloc_map_fail;
- INIT_LIST_HEAD(&chunk->list);
- INIT_LIST_HEAD(&chunk->map_extend_list);
- chunk->free_size = pcpu_unit_size;
- chunk->contig_hint = pcpu_unit_size;
+ chunk->bound_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits + 1) *
+ sizeof(chunk->bound_map[0]));
+ if (!chunk->bound_map)
+ goto bound_map_fail;
+
+ chunk->md_blocks = pcpu_mem_zalloc(pcpu_chunk_nr_blocks(chunk) *
+ sizeof(chunk->md_blocks[0]));
+ if (!chunk->md_blocks)
+ goto md_blocks_fail;
+
+ pcpu_init_md_blocks(chunk);
+
+ /* init metadata */
+ chunk->contig_bits = region_bits;
+ chunk->free_bytes = chunk->nr_pages * PAGE_SIZE;
return chunk;
+
+md_blocks_fail:
+ pcpu_mem_free(chunk->bound_map);
+bound_map_fail:
+ pcpu_mem_free(chunk->alloc_map);
+alloc_map_fail:
+ pcpu_mem_free(chunk);
+
+ return NULL;
}
static void pcpu_free_chunk(struct pcpu_chunk *chunk)
{
if (!chunk)
return;
- pcpu_mem_free(chunk->map);
+ pcpu_mem_free(chunk->bound_map);
+ pcpu_mem_free(chunk->alloc_map);
pcpu_mem_free(chunk);
}
@@ -748,13 +1210,17 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk)
* @chunk: pcpu_chunk which got populated
* @page_start: the start page
* @page_end: the end page
+ * @for_alloc: if this is to populate for allocation
*
* Pages in [@page_start,@page_end) have been populated to @chunk. Update
* the bookkeeping information accordingly. Must be called after each
* successful population.
+ *
+ * If this is @for_alloc, do not increment pcpu_nr_empty_pop_pages because it
+ * is to serve an allocation in that area.
*/
-static void pcpu_chunk_populated(struct pcpu_chunk *chunk,
- int page_start, int page_end)
+static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start,
+ int page_end, bool for_alloc)
{
int nr = page_end - page_start;
@@ -762,7 +1228,11 @@ static void pcpu_chunk_populated(struct pcpu_chunk *chunk,
bitmap_set(chunk->populated, page_start, nr);
chunk->nr_populated += nr;
- pcpu_nr_empty_pop_pages += nr;
+
+ if (!for_alloc) {
+ chunk->nr_empty_pop_pages += nr;
+ pcpu_nr_empty_pop_pages += nr;
+ }
}
/**
@@ -784,6 +1254,7 @@ static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
bitmap_clear(chunk->populated, page_start, nr);
chunk->nr_populated -= nr;
+ chunk->nr_empty_pop_pages -= nr;
pcpu_nr_empty_pop_pages -= nr;
}
@@ -819,18 +1290,21 @@ static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);
* pcpu_chunk_addr_search - determine chunk containing specified address
* @addr: address for which the chunk needs to be determined.
*
+ * This is an internal function that handles all but static allocations.
+ * Static percpu address values should never be passed into the allocator.
+ *
* RETURNS:
* The address of the found chunk.
*/
static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
{
- /* is it in the first chunk? */
- if (pcpu_addr_in_first_chunk(addr)) {
- /* is it in the reserved area? */
- if (pcpu_addr_in_reserved_chunk(addr))
- return pcpu_reserved_chunk;
+ /* is it in the dynamic region (first chunk)? */
+ if (pcpu_addr_in_chunk(pcpu_first_chunk, addr))
return pcpu_first_chunk;
- }
+
+ /* is it in the reserved region? */
+ if (pcpu_addr_in_chunk(pcpu_reserved_chunk, addr))
+ return pcpu_reserved_chunk;
/*
* The address is relative to unit0 which might be unused and
@@ -863,19 +1337,23 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
struct pcpu_chunk *chunk;
const char *err;
bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
- int occ_pages = 0;
- int slot, off, new_alloc, cpu, ret;
+ int slot, off, cpu, ret;
unsigned long flags;
void __percpu *ptr;
+ size_t bits, bit_align;
/*
- * We want the lowest bit of offset available for in-use/free
- * indicator, so force >= 16bit alignment and make size even.
+ * There is now a minimum allocation size of PCPU_MIN_ALLOC_SIZE,
+ * therefore alignment must be a minimum of that many bytes.
+ * An allocation may have internal fragmentation from rounding up
+ * of up to PCPU_MIN_ALLOC_SIZE - 1 bytes.
*/
- if (unlikely(align < 2))
- align = 2;
+ if (unlikely(align < PCPU_MIN_ALLOC_SIZE))
+ align = PCPU_MIN_ALLOC_SIZE;
- size = ALIGN(size, 2);
+ size = ALIGN(size, PCPU_MIN_ALLOC_SIZE);
+ bits = size >> PCPU_MIN_ALLOC_SHIFT;
+ bit_align = align >> PCPU_MIN_ALLOC_SHIFT;
if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE ||
!is_power_of_2(align))) {
@@ -893,23 +1371,13 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
if (reserved && pcpu_reserved_chunk) {
chunk = pcpu_reserved_chunk;
- if (size > chunk->contig_hint) {
+ off = pcpu_find_block_fit(chunk, bits, bit_align, is_atomic);
+ if (off < 0) {
err = "alloc from reserved chunk failed";
goto fail_unlock;
}
- while ((new_alloc = pcpu_need_to_extend(chunk, is_atomic))) {
- spin_unlock_irqrestore(&pcpu_lock, flags);
- if (is_atomic ||
- pcpu_extend_area_map(chunk, new_alloc) < 0) {
- err = "failed to extend area map of reserved chunk";
- goto fail;
- }
- spin_lock_irqsave(&pcpu_lock, flags);
- }
-
- off = pcpu_alloc_area(chunk, size, align, is_atomic,
- &occ_pages);
+ off = pcpu_alloc_area(chunk, bits, bit_align, off);
if (off >= 0)
goto area_found;
@@ -921,31 +1389,15 @@ restart:
/* search through normal chunks */
for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
list_for_each_entry(chunk, &pcpu_slot[slot], list) {
- if (size > chunk->contig_hint)
+ off = pcpu_find_block_fit(chunk, bits, bit_align,
+ is_atomic);
+ if (off < 0)
continue;
- new_alloc = pcpu_need_to_extend(chunk, is_atomic);
- if (new_alloc) {
- if (is_atomic)
- continue;
- spin_unlock_irqrestore(&pcpu_lock, flags);
- if (pcpu_extend_area_map(chunk,
- new_alloc) < 0) {
- err = "failed to extend area map";
- goto fail;
- }
- spin_lock_irqsave(&pcpu_lock, flags);
- /*
- * pcpu_lock has been dropped, need to
- * restart cpu_slot list walking.
- */
- goto restart;
- }
-
- off = pcpu_alloc_area(chunk, size, align, is_atomic,
- &occ_pages);
+ off = pcpu_alloc_area(chunk, bits, bit_align, off);
if (off >= 0)
goto area_found;
+
}
}
@@ -987,30 +1439,25 @@ area_found:
page_start = PFN_DOWN(off);
page_end = PFN_UP(off + size);
- pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
+ pcpu_for_each_unpop_region(chunk->populated, rs, re,
+ page_start, page_end) {
WARN_ON(chunk->immutable);
ret = pcpu_populate_chunk(chunk, rs, re);
spin_lock_irqsave(&pcpu_lock, flags);
if (ret) {
- pcpu_free_area(chunk, off, &occ_pages);
+ pcpu_free_area(chunk, off);
err = "failed to populate";
goto fail_unlock;
}
- pcpu_chunk_populated(chunk, rs, re);
+ pcpu_chunk_populated(chunk, rs, re, true);
spin_unlock_irqrestore(&pcpu_lock, flags);
}
mutex_unlock(&pcpu_alloc_mutex);
}
- if (chunk != pcpu_reserved_chunk) {
- spin_lock_irqsave(&pcpu_lock, flags);
- pcpu_nr_empty_pop_pages -= occ_pages;
- spin_unlock_irqrestore(&pcpu_lock, flags);
- }
-
if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
pcpu_schedule_balance_work();
@@ -1128,7 +1575,6 @@ static void pcpu_balance_workfn(struct work_struct *work)
if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
continue;
- list_del_init(&chunk->map_extend_list);
list_move(&chunk->list, &to_free);
}
@@ -1137,7 +1583,8 @@ static void pcpu_balance_workfn(struct work_struct *work)
list_for_each_entry_safe(chunk, next, &to_free, list) {
int rs, re;
- pcpu_for_each_pop_region(chunk, rs, re, 0, pcpu_unit_pages) {
+ pcpu_for_each_pop_region(chunk->populated, rs, re, 0,
+ chunk->nr_pages) {
pcpu_depopulate_chunk(chunk, rs, re);
spin_lock_irq(&pcpu_lock);
pcpu_chunk_depopulated(chunk, rs, re);
@@ -1146,25 +1593,6 @@ static void pcpu_balance_workfn(struct work_struct *work)
pcpu_destroy_chunk(chunk);
}
- /* service chunks which requested async area map extension */
- do {
- int new_alloc = 0;
-
- spin_lock_irq(&pcpu_lock);
-
- chunk = list_first_entry_or_null(&pcpu_map_extend_chunks,
- struct pcpu_chunk, map_extend_list);
- if (chunk) {
- list_del_init(&chunk->map_extend_list);
- new_alloc = pcpu_need_to_extend(chunk, false);
- }
-
- spin_unlock_irq(&pcpu_lock);
-
- if (new_alloc)
- pcpu_extend_area_map(chunk, new_alloc);
- } while (chunk);
-
/*
* Ensure there are certain number of free populated pages for
* atomic allocs. Fill up from the most packed so that atomic
@@ -1194,7 +1622,7 @@ retry_pop:
spin_lock_irq(&pcpu_lock);
list_for_each_entry(chunk, &pcpu_slot[slot], list) {
- nr_unpop = pcpu_unit_pages - chunk->nr_populated;
+ nr_unpop = chunk->nr_pages - chunk->nr_populated;
if (nr_unpop)
break;
}
@@ -1204,14 +1632,15 @@ retry_pop:
continue;
/* @chunk can't go away while pcpu_alloc_mutex is held */
- pcpu_for_each_unpop_region(chunk, rs, re, 0, pcpu_unit_pages) {
+ pcpu_for_each_unpop_region(chunk->populated, rs, re, 0,
+ chunk->nr_pages) {
int nr = min(re - rs, nr_to_pop);
ret = pcpu_populate_chunk(chunk, rs, rs + nr);
if (!ret) {
nr_to_pop -= nr;
spin_lock_irq(&pcpu_lock);
- pcpu_chunk_populated(chunk, rs, rs + nr);
+ pcpu_chunk_populated(chunk, rs, rs + nr, false);
spin_unlock_irq(&pcpu_lock);
} else {
nr_to_pop = 0;
@@ -1250,7 +1679,7 @@ void free_percpu(void __percpu *ptr)
void *addr;
struct pcpu_chunk *chunk;
unsigned long flags;
- int off, occ_pages;
+ int off;
if (!ptr)
return;
@@ -1264,13 +1693,10 @@ void free_percpu(void __percpu *ptr)
chunk = pcpu_chunk_addr_search(addr);
off = addr - chunk->base_addr;
- pcpu_free_area(chunk, off, &occ_pages);
-
- if (chunk != pcpu_reserved_chunk)
- pcpu_nr_empty_pop_pages += occ_pages;
+ pcpu_free_area(chunk, off);
/* if there are more than one fully free chunks, wake up grim reaper */
- if (chunk->free_size == pcpu_unit_size) {
+ if (chunk->free_bytes == pcpu_unit_size) {
struct pcpu_chunk *pos;
list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)
@@ -1361,10 +1787,16 @@ phys_addr_t per_cpu_ptr_to_phys(void *addr)
* The following test on unit_low/high isn't strictly
* necessary but will speed up lookups of addresses which
* aren't in the first chunk.
+ *
+ * The address check is against full chunk sizes. pcpu_base_addr
+ * points to the beginning of the first chunk including the
+ * static region. Assumes good intent as the first chunk may
+ * not be full (ie. < pcpu_unit_pages in size).
*/
- first_low = pcpu_chunk_addr(pcpu_first_chunk, pcpu_low_unit_cpu, 0);
- first_high = pcpu_chunk_addr(pcpu_first_chunk, pcpu_high_unit_cpu,
- pcpu_unit_pages);
+ first_low = (unsigned long)pcpu_base_addr +
+ pcpu_unit_page_offset(pcpu_low_unit_cpu, 0);
+ first_high = (unsigned long)pcpu_base_addr +
+ pcpu_unit_page_offset(pcpu_high_unit_cpu, pcpu_unit_pages);
if ((unsigned long)addr >= first_low &&
(unsigned long)addr < first_high) {
for_each_possible_cpu(cpu) {
@@ -1546,12 +1978,13 @@ static void pcpu_dump_alloc_info(const char *lvl,
* The caller should have mapped the first chunk at @base_addr and
* copied static data to each unit.
*
- * If the first chunk ends up with both reserved and dynamic areas, it
- * is served by two chunks - one to serve the core static and reserved
- * areas and the other for the dynamic area. They share the same vm
- * and page map but uses different area allocation map to stay away
- * from each other. The latter chunk is circulated in the chunk slots
- * and available for dynamic allocation like any other chunks.
+ * The first chunk will always contain a static and a dynamic region.
+ * However, the static region is not managed by any chunk. If the first
+ * chunk also contains a reserved region, it is served by two chunks -
+ * one for the reserved region and one for the dynamic region. They
+ * share the same vm, but use offset regions in the area allocation map.
+ * The chunk serving the dynamic region is circulated in the chunk slots
+ * and available for dynamic allocation like any other chunk.
*
* RETURNS:
* 0 on success, -errno on failure.
@@ -1559,17 +1992,17 @@ static void pcpu_dump_alloc_info(const char *lvl,
int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
void *base_addr)
{
- static int smap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata;
- static int dmap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata;
- size_t dyn_size = ai->dyn_size;
- size_t size_sum = ai->static_size + ai->reserved_size + dyn_size;
- struct pcpu_chunk *schunk, *dchunk = NULL;
+ size_t size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
+ size_t static_size, dyn_size;
+ struct pcpu_chunk *chunk;
unsigned long *group_offsets;
size_t *group_sizes;
unsigned long *unit_off;
unsigned int cpu;
int *unit_map;
int group, unit, i;
+ int map_size;
+ unsigned long tmp_addr;
#define PCPU_SETUP_BUG_ON(cond) do { \
if (unlikely(cond)) { \
@@ -1592,7 +2025,12 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
PCPU_SETUP_BUG_ON(offset_in_page(ai->unit_size));
PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
+ PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->unit_size, PCPU_BITMAP_BLOCK_SIZE));
PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE);
+ PCPU_SETUP_BUG_ON(!ai->dyn_size);
+ PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->reserved_size, PCPU_MIN_ALLOC_SIZE));
+ PCPU_SETUP_BUG_ON(!(IS_ALIGNED(PCPU_BITMAP_BLOCK_SIZE, PAGE_SIZE) ||
+ IS_ALIGNED(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE)));
PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);
/* process group information and build config tables accordingly */
@@ -1671,64 +2109,41 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
INIT_LIST_HEAD(&pcpu_slot[i]);
/*
- * Initialize static chunk. If reserved_size is zero, the
- * static chunk covers static area + dynamic allocation area
- * in the first chunk. If reserved_size is not zero, it
- * covers static area + reserved area (mostly used for module
- * static percpu allocation).
+ * The end of the static region needs to be aligned with the
+ * minimum allocation size as this offsets the reserved and
+ * dynamic region. The first chunk ends page aligned by
+ * expanding the dynamic region, therefore the dynamic region
+ * can be shrunk to compensate while still staying above the
+ * configured sizes.
*/
- schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
- INIT_LIST_HEAD(&schunk->list);
- INIT_LIST_HEAD(&schunk->map_extend_list);
- schunk->base_addr = base_addr;
- schunk->map = smap;
- schunk->map_alloc = ARRAY_SIZE(smap);
- schunk->immutable = true;
- bitmap_fill(schunk->populated, pcpu_unit_pages);
- schunk->nr_populated = pcpu_unit_pages;
+ static_size = ALIGN(ai->static_size, PCPU_MIN_ALLOC_SIZE);
+ dyn_size = ai->dyn_size - (static_size - ai->static_size);
- if (ai->reserved_size) {
- schunk->free_size = ai->reserved_size;
- pcpu_reserved_chunk = schunk;
- pcpu_reserved_chunk_limit = ai->static_size + ai->reserved_size;
- } else {
- schunk->free_size = dyn_size;
- dyn_size = 0; /* dynamic area covered */
- }
- schunk->contig_hint = schunk->free_size;
-
- schunk->map[0] = 1;
- schunk->map[1] = ai->static_size;
- schunk->map_used = 1;
- if (schunk->free_size)
- schunk->map[++schunk->map_used] = ai->static_size + schunk->free_size;
- schunk->map[schunk->map_used] |= 1;
- schunk->has_reserved = true;
+ /*
+ * Initialize first chunk.
+ * If the reserved_size is non-zero, this initializes the reserved
+ * chunk. If the reserved_size is zero, the reserved chunk is NULL
+ * and the dynamic region is initialized here. The first chunk,
+ * pcpu_first_chunk, will always point to the chunk that serves
+ * the dynamic region.
+ */
+ tmp_addr = (unsigned long)base_addr + static_size;
+ map_size = ai->reserved_size ?: dyn_size;
+ chunk = pcpu_alloc_first_chunk(tmp_addr, map_size);
/* init dynamic chunk if necessary */
- if (dyn_size) {
- dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
- INIT_LIST_HEAD(&dchunk->list);
- INIT_LIST_HEAD(&dchunk->map_extend_list);
- dchunk->base_addr = base_addr;
- dchunk->map = dmap;
- dchunk->map_alloc = ARRAY_SIZE(dmap);
- dchunk->immutable = true;
- bitmap_fill(dchunk->populated, pcpu_unit_pages);
- dchunk->nr_populated = pcpu_unit_pages;
-
- dchunk->contig_hint = dchunk->free_size = dyn_size;
- dchunk->map[0] = 1;
- dchunk->map[1] = pcpu_reserved_chunk_limit;
- dchunk->map[2] = (pcpu_reserved_chunk_limit + dchunk->free_size) | 1;
- dchunk->map_used = 2;
- dchunk->has_reserved = true;
+ if (ai->reserved_size) {
+ pcpu_reserved_chunk = chunk;
+
+ tmp_addr = (unsigned long)base_addr + static_size +
+ ai->reserved_size;
+ map_size = dyn_size;
+ chunk = pcpu_alloc_first_chunk(tmp_addr, map_size);
}
/* link the first chunk in */
- pcpu_first_chunk = dchunk ?: schunk;
- pcpu_nr_empty_pop_pages +=
- pcpu_count_occupied_pages(pcpu_first_chunk, 1);
+ pcpu_first_chunk = chunk;
+ pcpu_nr_empty_pop_pages = pcpu_first_chunk->nr_empty_pop_pages;
pcpu_chunk_relocate(pcpu_first_chunk, -1);
pcpu_stats_chunk_alloc();
@@ -1842,6 +2257,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
*/
min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
+ /* determine the maximum # of units that can fit in an allocation */
alloc_size = roundup(min_unit_size, atom_size);
upa = alloc_size / min_unit_size;
while (alloc_size % upa || (offset_in_page(alloc_size / upa)))
@@ -1868,9 +2284,9 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
}
/*
- * Expand unit size until address space usage goes over 75%
- * and then as much as possible without using more address
- * space.
+ * Wasted space is caused by a ratio imbalance of upa to group_cnt.
+ * Expand the unit_size until we use >= 75% of the units allocated.
+ * Related to atom_size, which could be much larger than the unit_size.
*/
last_allocs = INT_MAX;
for (upa = max_upa; upa; upa--) {
@@ -2299,36 +2715,6 @@ void __init setup_per_cpu_areas(void)
#endif /* CONFIG_SMP */
/*
- * First and reserved chunks are initialized with temporary allocation
- * map in initdata so that they can be used before slab is online.
- * This function is called after slab is brought up and replaces those
- * with properly allocated maps.
- */
-void __init percpu_init_late(void)
-{
- struct pcpu_chunk *target_chunks[] =
- { pcpu_first_chunk, pcpu_reserved_chunk, NULL };
- struct pcpu_chunk *chunk;
- unsigned long flags;
- int i;
-
- for (i = 0; (chunk = target_chunks[i]); i++) {
- int *map;
- const size_t size = PERCPU_DYNAMIC_EARLY_SLOTS * sizeof(map[0]);
-
- BUILD_BUG_ON(size > PAGE_SIZE);
-
- map = pcpu_mem_zalloc(size);
- BUG_ON(!map);
-
- spin_lock_irqsave(&pcpu_lock, flags);
- memcpy(map, chunk->map, size);
- chunk->map = map;
- spin_unlock_irqrestore(&pcpu_lock, flags);
- }
-}
-
-/*
* Percpu allocator is initialized early during boot when neither slab or
* workqueue is available. Plug async management until everything is up
* and running.
diff --git a/mm/rmap.c b/mm/rmap.c
index ced14f1af6dc..c570f82e6827 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -605,6 +605,13 @@ static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
tlb_ubc->flush_required = true;
/*
+ * Ensure compiler does not re-order the setting of tlb_flush_batched
+ * before the PTE is cleared.
+ */
+ barrier();
+ mm->tlb_flush_batched = true;
+
+ /*
* If the PTE was dirty then it's best to assume it's writable. The
* caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush()
* before the page is queued for IO.
@@ -631,6 +638,35 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
return should_defer;
}
+
+/*
+ * Reclaim unmaps pages under the PTL but do not flush the TLB prior to
+ * releasing the PTL if TLB flushes are batched. It's possible for a parallel
+ * operation such as mprotect or munmap to race between reclaim unmapping
+ * the page and flushing the page. If this race occurs, it potentially allows
+ * access to data via a stale TLB entry. Tracking all mm's that have TLB
+ * batching in flight would be expensive during reclaim so instead track
+ * whether TLB batching occurred in the past and if so then do a flush here
+ * if required. This will cost one additional flush per reclaim cycle paid
+ * by the first operation at risk such as mprotect and mumap.
+ *
+ * This must be called under the PTL so that an access to tlb_flush_batched
+ * that is potentially a "reclaim vs mprotect/munmap/etc" race will synchronise
+ * via the PTL.
+ */
+void flush_tlb_batched_pending(struct mm_struct *mm)
+{
+ if (mm->tlb_flush_batched) {
+ flush_tlb_mm(mm);
+
+ /*
+ * Do not allow the compiler to re-order the clearing of
+ * tlb_flush_batched before the tlb is flushed.
+ */
+ barrier();
+ mm->tlb_flush_batched = false;
+ }
+}
#else
static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
{
@@ -851,11 +887,21 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
.address = address,
.flags = PVMW_SYNC,
};
+ unsigned long start = address, end;
int *cleaned = arg;
+ /*
+ * We have to assume the worse case ie pmd for invalidation. Note that
+ * the page can not be free from this function.
+ */
+ end = min(vma->vm_end, start + (PAGE_SIZE << compound_order(page)));
+ mmu_notifier_invalidate_range_start(vma->vm_mm, start, end);
+
while (page_vma_mapped_walk(&pvmw)) {
+ unsigned long cstart, cend;
int ret = 0;
- address = pvmw.address;
+
+ cstart = address = pvmw.address;
if (pvmw.pte) {
pte_t entry;
pte_t *pte = pvmw.pte;
@@ -868,6 +914,7 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
entry = pte_wrprotect(entry);
entry = pte_mkclean(entry);
set_pte_at(vma->vm_mm, address, pte, entry);
+ cend = cstart + PAGE_SIZE;
ret = 1;
} else {
#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
@@ -882,6 +929,8 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
entry = pmd_wrprotect(entry);
entry = pmd_mkclean(entry);
set_pmd_at(vma->vm_mm, address, pmd, entry);
+ cstart &= PMD_MASK;
+ cend = cstart + PMD_SIZE;
ret = 1;
#else
/* unexpected pmd-mapped page? */
@@ -890,11 +939,13 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
}
if (ret) {
- mmu_notifier_invalidate_page(vma->vm_mm, address);
+ mmu_notifier_invalidate_range(vma->vm_mm, cstart, cend);
(*cleaned)++;
}
}
+ mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);
+
return true;
}
@@ -1288,6 +1339,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
pte_t pteval;
struct page *subpage;
bool ret = true;
+ unsigned long start = address, end;
enum ttu_flags flags = (enum ttu_flags)arg;
/* munlock has nothing to gain from examining un-locked vmas */
@@ -1299,6 +1351,14 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
flags & TTU_MIGRATION, page);
}
+ /*
+ * We have to assume the worse case ie pmd for invalidation. Note that
+ * the page can not be free in this function as call of try_to_unmap()
+ * must hold a reference on the page.
+ */
+ end = min(vma->vm_end, start + (PAGE_SIZE << compound_order(page)));
+ mmu_notifier_invalidate_range_start(vma->vm_mm, start, end);
+
while (page_vma_mapped_walk(&pvmw)) {
/*
* If the page is mlock()d, we cannot swap it out.
@@ -1409,6 +1469,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
if (unlikely(PageSwapBacked(page) != PageSwapCache(page))) {
WARN_ON_ONCE(1);
ret = false;
+ /* We have to invalidate as we cleared the pte */
page_vma_mapped_walk_done(&pvmw);
break;
}
@@ -1454,8 +1515,12 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
discard:
page_remove_rmap(subpage, PageHuge(page));
put_page(page);
- mmu_notifier_invalidate_page(mm, address);
+ mmu_notifier_invalidate_range(mm, address,
+ address + PAGE_SIZE);
}
+
+ mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);
+
return ret;
}
diff --git a/mm/shmem.c b/mm/shmem.c
index b0aa6075d164..ace53a582be5 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -34,6 +34,7 @@
#include <linux/swap.h>
#include <linux/uio.h>
#include <linux/khugepaged.h>
+#include <linux/hugetlb.h>
#include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */
@@ -188,6 +189,38 @@ static inline void shmem_unacct_blocks(unsigned long flags, long pages)
vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE));
}
+static inline bool shmem_inode_acct_block(struct inode *inode, long pages)
+{
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+
+ if (shmem_acct_block(info->flags, pages))
+ return false;
+
+ if (sbinfo->max_blocks) {
+ if (percpu_counter_compare(&sbinfo->used_blocks,
+ sbinfo->max_blocks - pages) > 0)
+ goto unacct;
+ percpu_counter_add(&sbinfo->used_blocks, pages);
+ }
+
+ return true;
+
+unacct:
+ shmem_unacct_blocks(info->flags, pages);
+ return false;
+}
+
+static inline void shmem_inode_unacct_blocks(struct inode *inode, long pages)
+{
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+
+ if (sbinfo->max_blocks)
+ percpu_counter_sub(&sbinfo->used_blocks, pages);
+ shmem_unacct_blocks(info->flags, pages);
+}
+
static const struct super_operations shmem_ops;
static const struct address_space_operations shmem_aops;
static const struct file_operations shmem_file_operations;
@@ -249,23 +282,20 @@ static void shmem_recalc_inode(struct inode *inode)
freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
if (freed > 0) {
- struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
- if (sbinfo->max_blocks)
- percpu_counter_add(&sbinfo->used_blocks, -freed);
info->alloced -= freed;
inode->i_blocks -= freed * BLOCKS_PER_PAGE;
- shmem_unacct_blocks(info->flags, freed);
+ shmem_inode_unacct_blocks(inode, freed);
}
}
bool shmem_charge(struct inode *inode, long pages)
{
struct shmem_inode_info *info = SHMEM_I(inode);
- struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
unsigned long flags;
- if (shmem_acct_block(info->flags, pages))
+ if (!shmem_inode_acct_block(inode, pages))
return false;
+
spin_lock_irqsave(&info->lock, flags);
info->alloced += pages;
inode->i_blocks += pages * BLOCKS_PER_PAGE;
@@ -273,26 +303,12 @@ bool shmem_charge(struct inode *inode, long pages)
spin_unlock_irqrestore(&info->lock, flags);
inode->i_mapping->nrpages += pages;
- if (!sbinfo->max_blocks)
- return true;
- if (percpu_counter_compare(&sbinfo->used_blocks,
- sbinfo->max_blocks - pages) > 0) {
- inode->i_mapping->nrpages -= pages;
- spin_lock_irqsave(&info->lock, flags);
- info->alloced -= pages;
- shmem_recalc_inode(inode);
- spin_unlock_irqrestore(&info->lock, flags);
- shmem_unacct_blocks(info->flags, pages);
- return false;
- }
- percpu_counter_add(&sbinfo->used_blocks, pages);
return true;
}
void shmem_uncharge(struct inode *inode, long pages)
{
struct shmem_inode_info *info = SHMEM_I(inode);
- struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
unsigned long flags;
spin_lock_irqsave(&info->lock, flags);
@@ -301,9 +317,7 @@ void shmem_uncharge(struct inode *inode, long pages)
shmem_recalc_inode(inode);
spin_unlock_irqrestore(&info->lock, flags);
- if (sbinfo->max_blocks)
- percpu_counter_sub(&sbinfo->used_blocks, pages);
- shmem_unacct_blocks(info->flags, pages);
+ shmem_inode_unacct_blocks(inode, pages);
}
/*
@@ -1022,7 +1036,11 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
*/
if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) {
spin_lock(&sbinfo->shrinklist_lock);
- if (list_empty(&info->shrinklist)) {
+ /*
+ * _careful to defend against unlocked access to
+ * ->shrink_list in shmem_unused_huge_shrink()
+ */
+ if (list_empty_careful(&info->shrinklist)) {
list_add_tail(&info->shrinklist,
&sbinfo->shrinklist);
sbinfo->shrinklist_len++;
@@ -1448,9 +1466,10 @@ static struct page *shmem_alloc_page(gfp_t gfp,
}
static struct page *shmem_alloc_and_acct_page(gfp_t gfp,
- struct shmem_inode_info *info, struct shmem_sb_info *sbinfo,
+ struct inode *inode,
pgoff_t index, bool huge)
{
+ struct shmem_inode_info *info = SHMEM_I(inode);
struct page *page;
int nr;
int err = -ENOSPC;
@@ -1459,14 +1478,8 @@ static struct page *shmem_alloc_and_acct_page(gfp_t gfp,
huge = false;
nr = huge ? HPAGE_PMD_NR : 1;
- if (shmem_acct_block(info->flags, nr))
+ if (!shmem_inode_acct_block(inode, nr))
goto failed;
- if (sbinfo->max_blocks) {
- if (percpu_counter_compare(&sbinfo->used_blocks,
- sbinfo->max_blocks - nr) > 0)
- goto unacct;
- percpu_counter_add(&sbinfo->used_blocks, nr);
- }
if (huge)
page = shmem_alloc_hugepage(gfp, info, index);
@@ -1479,10 +1492,7 @@ static struct page *shmem_alloc_and_acct_page(gfp_t gfp,
}
err = -ENOMEM;
- if (sbinfo->max_blocks)
- percpu_counter_add(&sbinfo->used_blocks, -nr);
-unacct:
- shmem_unacct_blocks(info->flags, nr);
+ shmem_inode_unacct_blocks(inode, nr);
failed:
return ERR_PTR(err);
}
@@ -1640,7 +1650,7 @@ repeat:
if (swap.val) {
/* Look it up and read it in.. */
- page = lookup_swap_cache(swap);
+ page = lookup_swap_cache(swap, NULL, 0);
if (!page) {
/* Or update major stats only when swapin succeeds?? */
if (fault_type) {
@@ -1747,10 +1757,9 @@ repeat:
}
alloc_huge:
- page = shmem_alloc_and_acct_page(gfp, info, sbinfo,
- index, true);
+ page = shmem_alloc_and_acct_page(gfp, inode, index, true);
if (IS_ERR(page)) {
-alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, info, sbinfo,
+alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, inode,
index, false);
}
if (IS_ERR(page)) {
@@ -1817,7 +1826,11 @@ alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, info, sbinfo,
* to shrink under memory pressure.
*/
spin_lock(&sbinfo->shrinklist_lock);
- if (list_empty(&info->shrinklist)) {
+ /*
+ * _careful to defend against unlocked access to
+ * ->shrink_list in shmem_unused_huge_shrink()
+ */
+ if (list_empty_careful(&info->shrinklist)) {
list_add_tail(&info->shrinklist,
&sbinfo->shrinklist);
sbinfo->shrinklist_len++;
@@ -1868,10 +1881,7 @@ clear:
* Error recovery.
*/
unacct:
- if (sbinfo->max_blocks)
- percpu_counter_sub(&sbinfo->used_blocks,
- 1 << compound_order(page));
- shmem_unacct_blocks(info->flags, 1 << compound_order(page));
+ shmem_inode_unacct_blocks(inode, 1 << compound_order(page));
if (PageTransHuge(page)) {
unlock_page(page);
@@ -2198,16 +2208,16 @@ bool shmem_mapping(struct address_space *mapping)
return mapping->a_ops == &shmem_aops;
}
-int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
- pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr,
- unsigned long src_addr,
- struct page **pagep)
+static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
+ pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr,
+ unsigned long src_addr,
+ bool zeropage,
+ struct page **pagep)
{
struct inode *inode = file_inode(dst_vma->vm_file);
struct shmem_inode_info *info = SHMEM_I(inode);
- struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
struct address_space *mapping = inode->i_mapping;
gfp_t gfp = mapping_gfp_mask(mapping);
pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
@@ -2219,33 +2229,30 @@ int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
int ret;
ret = -ENOMEM;
- if (shmem_acct_block(info->flags, 1))
+ if (!shmem_inode_acct_block(inode, 1))
goto out;
- if (sbinfo->max_blocks) {
- if (percpu_counter_compare(&sbinfo->used_blocks,
- sbinfo->max_blocks) >= 0)
- goto out_unacct_blocks;
- percpu_counter_inc(&sbinfo->used_blocks);
- }
if (!*pagep) {
page = shmem_alloc_page(gfp, info, pgoff);
if (!page)
- goto out_dec_used_blocks;
-
- page_kaddr = kmap_atomic(page);
- ret = copy_from_user(page_kaddr, (const void __user *)src_addr,
- PAGE_SIZE);
- kunmap_atomic(page_kaddr);
-
- /* fallback to copy_from_user outside mmap_sem */
- if (unlikely(ret)) {
- *pagep = page;
- if (sbinfo->max_blocks)
- percpu_counter_add(&sbinfo->used_blocks, -1);
- shmem_unacct_blocks(info->flags, 1);
- /* don't free the page */
- return -EFAULT;
+ goto out_unacct_blocks;
+
+ if (!zeropage) { /* mcopy_atomic */
+ page_kaddr = kmap_atomic(page);
+ ret = copy_from_user(page_kaddr,
+ (const void __user *)src_addr,
+ PAGE_SIZE);
+ kunmap_atomic(page_kaddr);
+
+ /* fallback to copy_from_user outside mmap_sem */
+ if (unlikely(ret)) {
+ *pagep = page;
+ shmem_inode_unacct_blocks(inode, 1);
+ /* don't free the page */
+ return -EFAULT;
+ }
+ } else { /* mfill_zeropage_atomic */
+ clear_highpage(page);
}
} else {
page = *pagep;
@@ -2306,14 +2313,33 @@ out_release_uncharge:
out_release:
unlock_page(page);
put_page(page);
-out_dec_used_blocks:
- if (sbinfo->max_blocks)
- percpu_counter_add(&sbinfo->used_blocks, -1);
out_unacct_blocks:
- shmem_unacct_blocks(info->flags, 1);
+ shmem_inode_unacct_blocks(inode, 1);
goto out;
}
+int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
+ pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr,
+ unsigned long src_addr,
+ struct page **pagep)
+{
+ return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
+ dst_addr, src_addr, false, pagep);
+}
+
+int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm,
+ pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr)
+{
+ struct page *page = NULL;
+
+ return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
+ dst_addr, 0, true, &page);
+}
+
#ifdef CONFIG_TMPFS
static const struct inode_operations shmem_symlink_inode_operations;
static const struct inode_operations shmem_short_symlink_operations;
@@ -3627,7 +3653,7 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root)
#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)
-#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING)
+#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB)
SYSCALL_DEFINE2(memfd_create,
const char __user *, uname,
@@ -3639,8 +3665,18 @@ SYSCALL_DEFINE2(memfd_create,
char *name;
long len;
- if (flags & ~(unsigned int)MFD_ALL_FLAGS)
- return -EINVAL;
+ if (!(flags & MFD_HUGETLB)) {
+ if (flags & ~(unsigned int)MFD_ALL_FLAGS)
+ return -EINVAL;
+ } else {
+ /* Sealing not supported in hugetlbfs (MFD_HUGETLB) */
+ if (flags & MFD_ALLOW_SEALING)
+ return -EINVAL;
+ /* Allow huge page size encoding in flags. */
+ if (flags & ~(unsigned int)(MFD_ALL_FLAGS |
+ (MFD_HUGE_MASK << MFD_HUGE_SHIFT)))
+ return -EINVAL;
+ }
/* length includes terminating zero */
len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1);
@@ -3671,16 +3707,30 @@ SYSCALL_DEFINE2(memfd_create,
goto err_name;
}
- file = shmem_file_setup(name, 0, VM_NORESERVE);
+ if (flags & MFD_HUGETLB) {
+ struct user_struct *user = NULL;
+
+ file = hugetlb_file_setup(name, 0, VM_NORESERVE, &user,
+ HUGETLB_ANONHUGE_INODE,
+ (flags >> MFD_HUGE_SHIFT) &
+ MFD_HUGE_MASK);
+ } else
+ file = shmem_file_setup(name, 0, VM_NORESERVE);
if (IS_ERR(file)) {
error = PTR_ERR(file);
goto err_fd;
}
- info = SHMEM_I(file_inode(file));
file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
file->f_flags |= O_RDWR | O_LARGEFILE;
- if (flags & MFD_ALLOW_SEALING)
+
+ if (flags & MFD_ALLOW_SEALING) {
+ /*
+ * flags check at beginning of function ensures
+ * this is not a hugetlbfs (MFD_HUGETLB) file.
+ */
+ info = SHMEM_I(file_inode(file));
info->seals &= ~F_SEAL_SEAL;
+ }
fd_install(fd, file);
kfree(name);
@@ -3959,7 +4009,7 @@ int __init shmem_init(void)
}
#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
- if (has_transparent_hugepage() && shmem_huge < SHMEM_HUGE_DENY)
+ if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY)
SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
else
shmem_huge = 0; /* just in case it was patched */
@@ -4020,7 +4070,7 @@ static ssize_t shmem_enabled_store(struct kobject *kobj,
return -EINVAL;
shmem_huge = huge;
- if (shmem_huge < SHMEM_HUGE_DENY)
+ if (shmem_huge > SHMEM_HUGE_DENY)
SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
return count;
}
diff --git a/mm/slab.h b/mm/slab.h
index 6885e1192ec5..073362816acc 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -43,6 +43,7 @@ struct kmem_cache {
#include <linux/kasan.h>
#include <linux/kmemleak.h>
#include <linux/random.h>
+#include <linux/sched/mm.h>
/*
* State of the slab allocator.
@@ -412,7 +413,10 @@ static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
gfp_t flags)
{
flags &= gfp_allowed_mask;
- lockdep_trace_alloc(flags);
+
+ fs_reclaim_acquire(flags);
+ fs_reclaim_release(flags);
+
might_sleep_if(gfpflags_allow_blocking(flags));
if (should_failslab(s, flags))
diff --git a/mm/slob.c b/mm/slob.c
index 1bae78d71096..a8bd6fa11a66 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -432,7 +432,8 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller)
gfp &= gfp_allowed_mask;
- lockdep_trace_alloc(gfp);
+ fs_reclaim_acquire(gfp);
+ fs_reclaim_release(gfp);
if (size < PAGE_SIZE - align) {
if (!size)
@@ -538,7 +539,8 @@ static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
flags &= gfp_allowed_mask;
- lockdep_trace_alloc(flags);
+ fs_reclaim_acquire(flags);
+ fs_reclaim_release(flags);
if (c->size < PAGE_SIZE) {
b = slob_alloc(c->size, flags, c->align, node);
diff --git a/mm/slub.c b/mm/slub.c
index 1d3f9835f4ea..ddb04576b342 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -34,6 +34,7 @@
#include <linux/stacktrace.h>
#include <linux/prefetch.h>
#include <linux/memcontrol.h>
+#include <linux/random.h>
#include <trace/events/kmem.h>
@@ -238,30 +239,62 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si)
* Core slab cache functions
*******************************************************************/
+/*
+ * Returns freelist pointer (ptr). With hardening, this is obfuscated
+ * with an XOR of the address where the pointer is held and a per-cache
+ * random number.
+ */
+static inline void *freelist_ptr(const struct kmem_cache *s, void *ptr,
+ unsigned long ptr_addr)
+{
+#ifdef CONFIG_SLAB_FREELIST_HARDENED
+ return (void *)((unsigned long)ptr ^ s->random ^ ptr_addr);
+#else
+ return ptr;
+#endif
+}
+
+/* Returns the freelist pointer recorded at location ptr_addr. */
+static inline void *freelist_dereference(const struct kmem_cache *s,
+ void *ptr_addr)
+{
+ return freelist_ptr(s, (void *)*(unsigned long *)(ptr_addr),
+ (unsigned long)ptr_addr);
+}
+
static inline void *get_freepointer(struct kmem_cache *s, void *object)
{
- return *(void **)(object + s->offset);
+ return freelist_dereference(s, object + s->offset);
}
static void prefetch_freepointer(const struct kmem_cache *s, void *object)
{
- prefetch(object + s->offset);
+ if (object)
+ prefetch(freelist_dereference(s, object + s->offset));
}
static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
{
+ unsigned long freepointer_addr;
void *p;
if (!debug_pagealloc_enabled())
return get_freepointer(s, object);
- probe_kernel_read(&p, (void **)(object + s->offset), sizeof(p));
- return p;
+ freepointer_addr = (unsigned long)object + s->offset;
+ probe_kernel_read(&p, (void **)freepointer_addr, sizeof(p));
+ return freelist_ptr(s, p, freepointer_addr);
}
static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
{
- *(void **)(object + s->offset) = fp;
+ unsigned long freeptr_addr = (unsigned long)object + s->offset;
+
+#ifdef CONFIG_SLAB_FREELIST_HARDENED
+ BUG_ON(object == fp); /* naive detection of double free or corruption */
+#endif
+
+ *(void **)freeptr_addr = freelist_ptr(s, fp, freeptr_addr);
}
/* Loop over all objects in a slab */
@@ -3358,8 +3391,8 @@ static void free_kmem_cache_nodes(struct kmem_cache *s)
struct kmem_cache_node *n;
for_each_kmem_cache_node(s, node, n) {
- kmem_cache_free(kmem_cache_node, n);
s->node[node] = NULL;
+ kmem_cache_free(kmem_cache_node, n);
}
}
@@ -3389,8 +3422,8 @@ static int init_kmem_cache_nodes(struct kmem_cache *s)
return 0;
}
- s->node[node] = n;
init_kmem_cache_node(n);
+ s->node[node] = n;
}
return 1;
}
@@ -3563,6 +3596,9 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags)
{
s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor);
s->reserved = 0;
+#ifdef CONFIG_SLAB_FREELIST_HARDENED
+ s->random = get_random_long();
+#endif
if (need_reserve_slab_rcu && (s->flags & SLAB_TYPESAFE_BY_RCU))
s->reserved = sizeof(struct rcu_head);
@@ -5423,7 +5459,7 @@ static struct attribute *slab_attrs[] = {
NULL
};
-static struct attribute_group slab_attr_group = {
+static const struct attribute_group slab_attr_group = {
.attrs = slab_attrs,
};
@@ -5642,13 +5678,14 @@ static void sysfs_slab_remove_workfn(struct work_struct *work)
* A cache is never shut down before deactivation is
* complete, so no need to worry about synchronization.
*/
- return;
+ goto out;
#ifdef CONFIG_MEMCG
kset_unregister(s->memcg_kset);
#endif
kobject_uevent(&s->kobj, KOBJ_REMOVE);
kobject_del(&s->kobj);
+out:
kobject_put(&s->kobj);
}
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index c50b1a14d55e..d1a39b8051e0 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -54,14 +54,9 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node)
if (slab_is_available()) {
struct page *page;
- if (node_state(node, N_HIGH_MEMORY))
- page = alloc_pages_node(
- node, GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL,
- get_order(size));
- else
- page = alloc_pages(
- GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL,
- get_order(size));
+ page = alloc_pages_node(node,
+ GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL,
+ get_order(size));
if (page)
return page_address(page);
return NULL;
diff --git a/mm/sparse.c b/mm/sparse.c
index 7b4be3fd5cac..a9783acf2bb9 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -65,14 +65,10 @@ static noinline struct mem_section __ref *sparse_index_alloc(int nid)
unsigned long array_size = SECTIONS_PER_ROOT *
sizeof(struct mem_section);
- if (slab_is_available()) {
- if (node_state(nid, N_HIGH_MEMORY))
- section = kzalloc_node(array_size, GFP_KERNEL, nid);
- else
- section = kzalloc(array_size, GFP_KERNEL);
- } else {
+ if (slab_is_available())
+ section = kzalloc_node(array_size, GFP_KERNEL, nid);
+ else
section = memblock_virt_alloc_node(array_size, nid);
- }
return section;
}
diff --git a/mm/swap.c b/mm/swap.c
index 60b1d2a75852..62d96b8e5eb3 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -946,28 +946,34 @@ void pagevec_remove_exceptionals(struct pagevec *pvec)
}
/**
- * pagevec_lookup - gang pagecache lookup
+ * pagevec_lookup_range - gang pagecache lookup
* @pvec: Where the resulting pages are placed
* @mapping: The address_space to search
* @start: The starting page index
+ * @end: The final page index
* @nr_pages: The maximum number of pages
*
- * pagevec_lookup() will search for and return a group of up to @nr_pages pages
- * in the mapping. The pages are placed in @pvec. pagevec_lookup() takes a
+ * pagevec_lookup_range() will search for and return a group of up to @nr_pages
+ * pages in the mapping starting from index @start and upto index @end
+ * (inclusive). The pages are placed in @pvec. pagevec_lookup() takes a
* reference against the pages in @pvec.
*
* The search returns a group of mapping-contiguous pages with ascending
- * indexes. There may be holes in the indices due to not-present pages.
+ * indexes. There may be holes in the indices due to not-present pages. We
+ * also update @start to index the next page for the traversal.
*
- * pagevec_lookup() returns the number of pages which were found.
+ * pagevec_lookup_range() returns the number of pages which were found. If this
+ * number is smaller than @nr_pages, the end of specified range has been
+ * reached.
*/
-unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
- pgoff_t start, unsigned nr_pages)
+unsigned pagevec_lookup_range(struct pagevec *pvec,
+ struct address_space *mapping, pgoff_t *start, pgoff_t end)
{
- pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages);
+ pvec->nr = find_get_pages_range(mapping, start, end, PAGEVEC_SIZE,
+ pvec->pages);
return pagevec_count(pvec);
}
-EXPORT_SYMBOL(pagevec_lookup);
+EXPORT_SYMBOL(pagevec_lookup_range);
unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
pgoff_t *index, int tag, unsigned nr_pages)
diff --git a/mm/swap_state.c b/mm/swap_state.c
index b68c93014f50..71ce2d1ccbf7 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -37,6 +37,29 @@ static const struct address_space_operations swap_aops = {
struct address_space *swapper_spaces[MAX_SWAPFILES];
static unsigned int nr_swapper_spaces[MAX_SWAPFILES];
+bool swap_vma_readahead = true;
+
+#define SWAP_RA_MAX_ORDER_DEFAULT 3
+
+static int swap_ra_max_order = SWAP_RA_MAX_ORDER_DEFAULT;
+
+#define SWAP_RA_WIN_SHIFT (PAGE_SHIFT / 2)
+#define SWAP_RA_HITS_MASK ((1UL << SWAP_RA_WIN_SHIFT) - 1)
+#define SWAP_RA_HITS_MAX SWAP_RA_HITS_MASK
+#define SWAP_RA_WIN_MASK (~PAGE_MASK & ~SWAP_RA_HITS_MASK)
+
+#define SWAP_RA_HITS(v) ((v) & SWAP_RA_HITS_MASK)
+#define SWAP_RA_WIN(v) (((v) & SWAP_RA_WIN_MASK) >> SWAP_RA_WIN_SHIFT)
+#define SWAP_RA_ADDR(v) ((v) & PAGE_MASK)
+
+#define SWAP_RA_VAL(addr, win, hits) \
+ (((addr) & PAGE_MASK) | \
+ (((win) << SWAP_RA_WIN_SHIFT) & SWAP_RA_WIN_MASK) | \
+ ((hits) & SWAP_RA_HITS_MASK))
+
+/* Initial readahead hits is 4 to start up with a small window */
+#define GET_SWAP_RA_VAL(vma) \
+ (atomic_long_read(&(vma)->swap_readahead_info) ? : 4)
#define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0)
#define ADD_CACHE_INFO(x, nr) do { swap_cache_info.x += (nr); } while (0)
@@ -297,19 +320,36 @@ void free_pages_and_swap_cache(struct page **pages, int nr)
* lock getting page table operations atomic even if we drop the page
* lock before returning.
*/
-struct page * lookup_swap_cache(swp_entry_t entry)
+struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma,
+ unsigned long addr)
{
struct page *page;
+ unsigned long ra_info;
+ int win, hits, readahead;
page = find_get_page(swap_address_space(entry), swp_offset(entry));
- if (page && likely(!PageTransCompound(page))) {
+ INC_CACHE_INFO(find_total);
+ if (page) {
INC_CACHE_INFO(find_success);
- if (TestClearPageReadahead(page))
- atomic_inc(&swapin_readahead_hits);
+ if (unlikely(PageTransCompound(page)))
+ return page;
+ readahead = TestClearPageReadahead(page);
+ if (vma) {
+ ra_info = GET_SWAP_RA_VAL(vma);
+ win = SWAP_RA_WIN(ra_info);
+ hits = SWAP_RA_HITS(ra_info);
+ if (readahead)
+ hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX);
+ atomic_long_set(&vma->swap_readahead_info,
+ SWAP_RA_VAL(addr, win, hits));
+ }
+ if (readahead) {
+ count_vm_event(SWAP_RA_HIT);
+ if (!vma)
+ atomic_inc(&swapin_readahead_hits);
+ }
}
-
- INC_CACHE_INFO(find_total);
return page;
}
@@ -424,22 +464,20 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
return retpage;
}
-static unsigned long swapin_nr_pages(unsigned long offset)
+static unsigned int __swapin_nr_pages(unsigned long prev_offset,
+ unsigned long offset,
+ int hits,
+ int max_pages,
+ int prev_win)
{
- static unsigned long prev_offset;
- unsigned int pages, max_pages, last_ra;
- static atomic_t last_readahead_pages;
-
- max_pages = 1 << READ_ONCE(page_cluster);
- if (max_pages <= 1)
- return 1;
+ unsigned int pages, last_ra;
/*
* This heuristic has been found to work well on both sequential and
* random loads, swapping to hard disk or to SSD: please don't ask
* what the "+ 2" means, it just happens to work well, that's all.
*/
- pages = atomic_xchg(&swapin_readahead_hits, 0) + 2;
+ pages = hits + 2;
if (pages == 2) {
/*
* We can have no readahead hits to judge by: but must not get
@@ -448,7 +486,6 @@ static unsigned long swapin_nr_pages(unsigned long offset)
*/
if (offset != prev_offset + 1 && offset != prev_offset - 1)
pages = 1;
- prev_offset = offset;
} else {
unsigned int roundup = 4;
while (roundup < pages)
@@ -460,9 +497,28 @@ static unsigned long swapin_nr_pages(unsigned long offset)
pages = max_pages;
/* Don't shrink readahead too fast */
- last_ra = atomic_read(&last_readahead_pages) / 2;
+ last_ra = prev_win / 2;
if (pages < last_ra)
pages = last_ra;
+
+ return pages;
+}
+
+static unsigned long swapin_nr_pages(unsigned long offset)
+{
+ static unsigned long prev_offset;
+ unsigned int hits, pages, max_pages;
+ static atomic_t last_readahead_pages;
+
+ max_pages = 1 << READ_ONCE(page_cluster);
+ if (max_pages <= 1)
+ return 1;
+
+ hits = atomic_xchg(&swapin_readahead_hits, 0);
+ pages = __swapin_nr_pages(prev_offset, offset, hits, max_pages,
+ atomic_read(&last_readahead_pages));
+ if (!hits)
+ prev_offset = offset;
atomic_set(&last_readahead_pages, pages);
return pages;
@@ -496,7 +552,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
unsigned long start_offset, end_offset;
unsigned long mask;
struct blk_plug plug;
- bool do_poll = true;
+ bool do_poll = true, page_allocated;
mask = swapin_nr_pages(offset) - 1;
if (!mask)
@@ -512,12 +568,19 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
blk_start_plug(&plug);
for (offset = start_offset; offset <= end_offset ; offset++) {
/* Ok, do the async read-ahead now */
- page = read_swap_cache_async(swp_entry(swp_type(entry), offset),
- gfp_mask, vma, addr, false);
+ page = __read_swap_cache_async(
+ swp_entry(swp_type(entry), offset),
+ gfp_mask, vma, addr, &page_allocated);
if (!page)
continue;
- if (offset != entry_offset && likely(!PageTransCompound(page)))
- SetPageReadahead(page);
+ if (page_allocated) {
+ swap_readpage(page, false);
+ if (offset != entry_offset &&
+ likely(!PageTransCompound(page))) {
+ SetPageReadahead(page);
+ count_vm_event(SWAP_RA);
+ }
+ }
put_page(page);
}
blk_finish_plug(&plug);
@@ -561,3 +624,210 @@ void exit_swap_address_space(unsigned int type)
synchronize_rcu();
kvfree(spaces);
}
+
+static inline void swap_ra_clamp_pfn(struct vm_area_struct *vma,
+ unsigned long faddr,
+ unsigned long lpfn,
+ unsigned long rpfn,
+ unsigned long *start,
+ unsigned long *end)
+{
+ *start = max3(lpfn, PFN_DOWN(vma->vm_start),
+ PFN_DOWN(faddr & PMD_MASK));
+ *end = min3(rpfn, PFN_DOWN(vma->vm_end),
+ PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE));
+}
+
+struct page *swap_readahead_detect(struct vm_fault *vmf,
+ struct vma_swap_readahead *swap_ra)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ unsigned long swap_ra_info;
+ struct page *page;
+ swp_entry_t entry;
+ unsigned long faddr, pfn, fpfn;
+ unsigned long start, end;
+ pte_t *pte;
+ unsigned int max_win, hits, prev_win, win, left;
+#ifndef CONFIG_64BIT
+ pte_t *tpte;
+#endif
+
+ faddr = vmf->address;
+ entry = pte_to_swp_entry(vmf->orig_pte);
+ if ((unlikely(non_swap_entry(entry))))
+ return NULL;
+ page = lookup_swap_cache(entry, vma, faddr);
+ if (page)
+ return page;
+
+ max_win = 1 << READ_ONCE(swap_ra_max_order);
+ if (max_win == 1) {
+ swap_ra->win = 1;
+ return NULL;
+ }
+
+ fpfn = PFN_DOWN(faddr);
+ swap_ra_info = GET_SWAP_RA_VAL(vma);
+ pfn = PFN_DOWN(SWAP_RA_ADDR(swap_ra_info));
+ prev_win = SWAP_RA_WIN(swap_ra_info);
+ hits = SWAP_RA_HITS(swap_ra_info);
+ swap_ra->win = win = __swapin_nr_pages(pfn, fpfn, hits,
+ max_win, prev_win);
+ atomic_long_set(&vma->swap_readahead_info,
+ SWAP_RA_VAL(faddr, win, 0));
+
+ if (win == 1)
+ return NULL;
+
+ /* Copy the PTEs because the page table may be unmapped */
+ if (fpfn == pfn + 1)
+ swap_ra_clamp_pfn(vma, faddr, fpfn, fpfn + win, &start, &end);
+ else if (pfn == fpfn + 1)
+ swap_ra_clamp_pfn(vma, faddr, fpfn - win + 1, fpfn + 1,
+ &start, &end);
+ else {
+ left = (win - 1) / 2;
+ swap_ra_clamp_pfn(vma, faddr, fpfn - left, fpfn + win - left,
+ &start, &end);
+ }
+ swap_ra->nr_pte = end - start;
+ swap_ra->offset = fpfn - start;
+ pte = vmf->pte - swap_ra->offset;
+#ifdef CONFIG_64BIT
+ swap_ra->ptes = pte;
+#else
+ tpte = swap_ra->ptes;
+ for (pfn = start; pfn != end; pfn++)
+ *tpte++ = *pte++;
+#endif
+
+ return NULL;
+}
+
+struct page *do_swap_page_readahead(swp_entry_t fentry, gfp_t gfp_mask,
+ struct vm_fault *vmf,
+ struct vma_swap_readahead *swap_ra)
+{
+ struct blk_plug plug;
+ struct vm_area_struct *vma = vmf->vma;
+ struct page *page;
+ pte_t *pte, pentry;
+ swp_entry_t entry;
+ unsigned int i;
+ bool page_allocated;
+
+ if (swap_ra->win == 1)
+ goto skip;
+
+ blk_start_plug(&plug);
+ for (i = 0, pte = swap_ra->ptes; i < swap_ra->nr_pte;
+ i++, pte++) {
+ pentry = *pte;
+ if (pte_none(pentry))
+ continue;
+ if (pte_present(pentry))
+ continue;
+ entry = pte_to_swp_entry(pentry);
+ if (unlikely(non_swap_entry(entry)))
+ continue;
+ page = __read_swap_cache_async(entry, gfp_mask, vma,
+ vmf->address, &page_allocated);
+ if (!page)
+ continue;
+ if (page_allocated) {
+ swap_readpage(page, false);
+ if (i != swap_ra->offset &&
+ likely(!PageTransCompound(page))) {
+ SetPageReadahead(page);
+ count_vm_event(SWAP_RA);
+ }
+ }
+ put_page(page);
+ }
+ blk_finish_plug(&plug);
+ lru_add_drain();
+skip:
+ return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address,
+ swap_ra->win == 1);
+}
+
+#ifdef CONFIG_SYSFS
+static ssize_t vma_ra_enabled_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%s\n", swap_vma_readahead ? "true" : "false");
+}
+static ssize_t vma_ra_enabled_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1))
+ swap_vma_readahead = true;
+ else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1))
+ swap_vma_readahead = false;
+ else
+ return -EINVAL;
+
+ return count;
+}
+static struct kobj_attribute vma_ra_enabled_attr =
+ __ATTR(vma_ra_enabled, 0644, vma_ra_enabled_show,
+ vma_ra_enabled_store);
+
+static ssize_t vma_ra_max_order_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%d\n", swap_ra_max_order);
+}
+static ssize_t vma_ra_max_order_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ int err, v;
+
+ err = kstrtoint(buf, 10, &v);
+ if (err || v > SWAP_RA_ORDER_CEILING || v <= 0)
+ return -EINVAL;
+
+ swap_ra_max_order = v;
+
+ return count;
+}
+static struct kobj_attribute vma_ra_max_order_attr =
+ __ATTR(vma_ra_max_order, 0644, vma_ra_max_order_show,
+ vma_ra_max_order_store);
+
+static struct attribute *swap_attrs[] = {
+ &vma_ra_enabled_attr.attr,
+ &vma_ra_max_order_attr.attr,
+ NULL,
+};
+
+static struct attribute_group swap_attr_group = {
+ .attrs = swap_attrs,
+};
+
+static int __init swap_init_sysfs(void)
+{
+ int err;
+ struct kobject *swap_kobj;
+
+ swap_kobj = kobject_create_and_add("swap", mm_kobj);
+ if (!swap_kobj) {
+ pr_err("failed to create swap kobject\n");
+ return -ENOMEM;
+ }
+ err = sysfs_create_group(swap_kobj, &swap_attr_group);
+ if (err) {
+ pr_err("failed to register swap group\n");
+ goto delete_obj;
+ }
+ return 0;
+
+delete_obj:
+ kobject_put(swap_kobj);
+ return err;
+}
+subsys_initcall(swap_init_sysfs);
+#endif
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 6ba4aab2db0b..d483278ee35b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -60,7 +60,7 @@ atomic_long_t nr_swap_pages;
EXPORT_SYMBOL_GPL(nr_swap_pages);
/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
long total_swap_pages;
-static int least_priority;
+static int least_priority = -1;
static const char Bad_file[] = "Bad swap file entry ";
static const char Unused_file[] = "Unused swap file entry ";
@@ -85,7 +85,7 @@ PLIST_HEAD(swap_active_head);
* is held and the locking order requires swap_lock to be taken
* before any swap_info_struct->lock.
*/
-static PLIST_HEAD(swap_avail_head);
+struct plist_head *swap_avail_heads;
static DEFINE_SPINLOCK(swap_avail_lock);
struct swap_info_struct *swap_info[MAX_SWAPFILES];
@@ -96,6 +96,8 @@ static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
/* Activity counter to indicate that a swapon or swapoff has occurred */
static atomic_t proc_poll_event = ATOMIC_INIT(0);
+atomic_t nr_rotate_swap = ATOMIC_INIT(0);
+
static inline unsigned char swap_count(unsigned char ent)
{
return ent & ~SWAP_HAS_CACHE; /* may include SWAP_HAS_CONT flag */
@@ -265,6 +267,16 @@ static inline void cluster_set_null(struct swap_cluster_info *info)
info->data = 0;
}
+static inline bool cluster_is_huge(struct swap_cluster_info *info)
+{
+ return info->flags & CLUSTER_FLAG_HUGE;
+}
+
+static inline void cluster_clear_huge(struct swap_cluster_info *info)
+{
+ info->flags &= ~CLUSTER_FLAG_HUGE;
+}
+
static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
unsigned long offset)
{
@@ -580,6 +592,21 @@ new_cluster:
return found_free;
}
+static void __del_from_avail_list(struct swap_info_struct *p)
+{
+ int nid;
+
+ for_each_node(nid)
+ plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]);
+}
+
+static void del_from_avail_list(struct swap_info_struct *p)
+{
+ spin_lock(&swap_avail_lock);
+ __del_from_avail_list(p);
+ spin_unlock(&swap_avail_lock);
+}
+
static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
unsigned int nr_entries)
{
@@ -593,10 +620,20 @@ static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
if (si->inuse_pages == si->pages) {
si->lowest_bit = si->max;
si->highest_bit = 0;
- spin_lock(&swap_avail_lock);
- plist_del(&si->avail_list, &swap_avail_head);
- spin_unlock(&swap_avail_lock);
+ del_from_avail_list(si);
+ }
+}
+
+static void add_to_avail_list(struct swap_info_struct *p)
+{
+ int nid;
+
+ spin_lock(&swap_avail_lock);
+ for_each_node(nid) {
+ WARN_ON(!plist_node_empty(&p->avail_lists[nid]));
+ plist_add(&p->avail_lists[nid], &swap_avail_heads[nid]);
}
+ spin_unlock(&swap_avail_lock);
}
static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
@@ -611,13 +648,8 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
bool was_full = !si->highest_bit;
si->highest_bit = end;
- if (was_full && (si->flags & SWP_WRITEOK)) {
- spin_lock(&swap_avail_lock);
- WARN_ON(!plist_node_empty(&si->avail_list));
- if (plist_node_empty(&si->avail_list))
- plist_add(&si->avail_list, &swap_avail_head);
- spin_unlock(&swap_avail_lock);
- }
+ if (was_full && (si->flags & SWP_WRITEOK))
+ add_to_avail_list(si);
}
atomic_long_add(nr_entries, &nr_swap_pages);
si->inuse_pages -= nr_entries;
@@ -846,7 +878,7 @@ static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
offset = idx * SWAPFILE_CLUSTER;
ci = lock_cluster(si, offset);
alloc_cluster(si, idx);
- cluster_set_count_flag(ci, SWAPFILE_CLUSTER, 0);
+ cluster_set_count_flag(ci, SWAPFILE_CLUSTER, CLUSTER_FLAG_HUGE);
map = si->swap_map + offset;
for (i = 0; i < SWAPFILE_CLUSTER; i++)
@@ -898,6 +930,7 @@ int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[])
struct swap_info_struct *si, *next;
long avail_pgs;
int n_ret = 0;
+ int node;
/* Only single cluster request supported */
WARN_ON_ONCE(n_goal > 1 && cluster);
@@ -917,14 +950,15 @@ int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[])
spin_lock(&swap_avail_lock);
start_over:
- plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) {
+ node = numa_node_id();
+ plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
/* requeue si to after same-priority siblings */
- plist_requeue(&si->avail_list, &swap_avail_head);
+ plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
spin_unlock(&swap_avail_lock);
spin_lock(&si->lock);
if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
spin_lock(&swap_avail_lock);
- if (plist_node_empty(&si->avail_list)) {
+ if (plist_node_empty(&si->avail_lists[node])) {
spin_unlock(&si->lock);
goto nextsi;
}
@@ -934,13 +968,14 @@ start_over:
WARN(!(si->flags & SWP_WRITEOK),
"swap_info %d in list but !SWP_WRITEOK\n",
si->type);
- plist_del(&si->avail_list, &swap_avail_head);
+ __del_from_avail_list(si);
spin_unlock(&si->lock);
goto nextsi;
}
- if (cluster)
- n_ret = swap_alloc_cluster(si, swp_entries);
- else
+ if (cluster) {
+ if (!(si->flags & SWP_FILE))
+ n_ret = swap_alloc_cluster(si, swp_entries);
+ } else
n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
n_goal, swp_entries);
spin_unlock(&si->lock);
@@ -962,7 +997,7 @@ nextsi:
* swap_avail_head list then try it, otherwise start over
* if we have not gotten any slots.
*/
- if (plist_node_empty(&next->avail_list))
+ if (plist_node_empty(&next->avail_lists[node]))
goto start_over;
}
@@ -1168,22 +1203,57 @@ static void swapcache_free_cluster(swp_entry_t entry)
struct swap_cluster_info *ci;
struct swap_info_struct *si;
unsigned char *map;
- unsigned int i;
+ unsigned int i, free_entries = 0;
+ unsigned char val;
- si = swap_info_get(entry);
+ si = _swap_info_get(entry);
if (!si)
return;
ci = lock_cluster(si, offset);
+ VM_BUG_ON(!cluster_is_huge(ci));
map = si->swap_map + offset;
for (i = 0; i < SWAPFILE_CLUSTER; i++) {
- VM_BUG_ON(map[i] != SWAP_HAS_CACHE);
- map[i] = 0;
+ val = map[i];
+ VM_BUG_ON(!(val & SWAP_HAS_CACHE));
+ if (val == SWAP_HAS_CACHE)
+ free_entries++;
+ }
+ if (!free_entries) {
+ for (i = 0; i < SWAPFILE_CLUSTER; i++)
+ map[i] &= ~SWAP_HAS_CACHE;
}
+ cluster_clear_huge(ci);
unlock_cluster(ci);
- mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
- swap_free_cluster(si, idx);
- spin_unlock(&si->lock);
+ if (free_entries == SWAPFILE_CLUSTER) {
+ spin_lock(&si->lock);
+ ci = lock_cluster(si, offset);
+ memset(map, 0, SWAPFILE_CLUSTER);
+ unlock_cluster(ci);
+ mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
+ swap_free_cluster(si, idx);
+ spin_unlock(&si->lock);
+ } else if (free_entries) {
+ for (i = 0; i < SWAPFILE_CLUSTER; i++, entry.val++) {
+ if (!__swap_entry_free(si, entry, SWAP_HAS_CACHE))
+ free_swap_slot(entry);
+ }
+ }
+}
+
+int split_swap_cluster(swp_entry_t entry)
+{
+ struct swap_info_struct *si;
+ struct swap_cluster_info *ci;
+ unsigned long offset = swp_offset(entry);
+
+ si = _swap_info_get(entry);
+ if (!si)
+ return -EBUSY;
+ ci = lock_cluster(si, offset);
+ cluster_clear_huge(ci);
+ unlock_cluster(ci);
+ return 0;
}
#else
static inline void swapcache_free_cluster(swp_entry_t entry)
@@ -1332,29 +1402,161 @@ out:
return count;
}
+#ifdef CONFIG_THP_SWAP
+static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
+ swp_entry_t entry)
+{
+ struct swap_cluster_info *ci;
+ unsigned char *map = si->swap_map;
+ unsigned long roffset = swp_offset(entry);
+ unsigned long offset = round_down(roffset, SWAPFILE_CLUSTER);
+ int i;
+ bool ret = false;
+
+ ci = lock_cluster_or_swap_info(si, offset);
+ if (!ci || !cluster_is_huge(ci)) {
+ if (map[roffset] != SWAP_HAS_CACHE)
+ ret = true;
+ goto unlock_out;
+ }
+ for (i = 0; i < SWAPFILE_CLUSTER; i++) {
+ if (map[offset + i] != SWAP_HAS_CACHE) {
+ ret = true;
+ break;
+ }
+ }
+unlock_out:
+ unlock_cluster_or_swap_info(si, ci);
+ return ret;
+}
+
+static bool page_swapped(struct page *page)
+{
+ swp_entry_t entry;
+ struct swap_info_struct *si;
+
+ if (likely(!PageTransCompound(page)))
+ return page_swapcount(page) != 0;
+
+ page = compound_head(page);
+ entry.val = page_private(page);
+ si = _swap_info_get(entry);
+ if (si)
+ return swap_page_trans_huge_swapped(si, entry);
+ return false;
+}
+
+static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
+ int *total_swapcount)
+{
+ int i, map_swapcount, _total_mapcount, _total_swapcount;
+ unsigned long offset = 0;
+ struct swap_info_struct *si;
+ struct swap_cluster_info *ci = NULL;
+ unsigned char *map = NULL;
+ int mapcount, swapcount = 0;
+
+ /* hugetlbfs shouldn't call it */
+ VM_BUG_ON_PAGE(PageHuge(page), page);
+
+ if (likely(!PageTransCompound(page))) {
+ mapcount = atomic_read(&page->_mapcount) + 1;
+ if (total_mapcount)
+ *total_mapcount = mapcount;
+ if (PageSwapCache(page))
+ swapcount = page_swapcount(page);
+ if (total_swapcount)
+ *total_swapcount = swapcount;
+ return mapcount + swapcount;
+ }
+
+ page = compound_head(page);
+
+ _total_mapcount = _total_swapcount = map_swapcount = 0;
+ if (PageSwapCache(page)) {
+ swp_entry_t entry;
+
+ entry.val = page_private(page);
+ si = _swap_info_get(entry);
+ if (si) {
+ map = si->swap_map;
+ offset = swp_offset(entry);
+ }
+ }
+ if (map)
+ ci = lock_cluster(si, offset);
+ for (i = 0; i < HPAGE_PMD_NR; i++) {
+ mapcount = atomic_read(&page[i]._mapcount) + 1;
+ _total_mapcount += mapcount;
+ if (map) {
+ swapcount = swap_count(map[offset + i]);
+ _total_swapcount += swapcount;
+ }
+ map_swapcount = max(map_swapcount, mapcount + swapcount);
+ }
+ unlock_cluster(ci);
+ if (PageDoubleMap(page)) {
+ map_swapcount -= 1;
+ _total_mapcount -= HPAGE_PMD_NR;
+ }
+ mapcount = compound_mapcount(page);
+ map_swapcount += mapcount;
+ _total_mapcount += mapcount;
+ if (total_mapcount)
+ *total_mapcount = _total_mapcount;
+ if (total_swapcount)
+ *total_swapcount = _total_swapcount;
+
+ return map_swapcount;
+}
+#else
+#define swap_page_trans_huge_swapped(si, entry) swap_swapcount(si, entry)
+#define page_swapped(page) (page_swapcount(page) != 0)
+
+static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
+ int *total_swapcount)
+{
+ int mapcount, swapcount = 0;
+
+ /* hugetlbfs shouldn't call it */
+ VM_BUG_ON_PAGE(PageHuge(page), page);
+
+ mapcount = page_trans_huge_mapcount(page, total_mapcount);
+ if (PageSwapCache(page))
+ swapcount = page_swapcount(page);
+ if (total_swapcount)
+ *total_swapcount = swapcount;
+ return mapcount + swapcount;
+}
+#endif
+
/*
* We can write to an anon page without COW if there are no other references
* to it. And as a side-effect, free up its swap: because the old content
* on disk will never be read, and seeking back there to write new content
* later would only waste time away from clustering.
*
- * NOTE: total_mapcount should not be relied upon by the caller if
+ * NOTE: total_map_swapcount should not be relied upon by the caller if
* reuse_swap_page() returns false, but it may be always overwritten
* (see the other implementation for CONFIG_SWAP=n).
*/
-bool reuse_swap_page(struct page *page, int *total_mapcount)
+bool reuse_swap_page(struct page *page, int *total_map_swapcount)
{
- int count;
+ int count, total_mapcount, total_swapcount;
VM_BUG_ON_PAGE(!PageLocked(page), page);
if (unlikely(PageKsm(page)))
return false;
- count = page_trans_huge_mapcount(page, total_mapcount);
- if (count <= 1 && PageSwapCache(page)) {
- count += page_swapcount(page);
- if (count != 1)
- goto out;
+ count = page_trans_huge_map_swapcount(page, &total_mapcount,
+ &total_swapcount);
+ if (total_map_swapcount)
+ *total_map_swapcount = total_mapcount + total_swapcount;
+ if (count == 1 && PageSwapCache(page) &&
+ (likely(!PageTransCompound(page)) ||
+ /* The remaining swap count will be freed soon */
+ total_swapcount == page_swapcount(page))) {
if (!PageWriteback(page)) {
+ page = compound_head(page);
delete_from_swap_cache(page);
SetPageDirty(page);
} else {
@@ -1370,7 +1572,7 @@ bool reuse_swap_page(struct page *page, int *total_mapcount)
spin_unlock(&p->lock);
}
}
-out:
+
return count <= 1;
}
@@ -1386,7 +1588,7 @@ int try_to_free_swap(struct page *page)
return 0;
if (PageWriteback(page))
return 0;
- if (page_swapcount(page))
+ if (page_swapped(page))
return 0;
/*
@@ -1407,6 +1609,7 @@ int try_to_free_swap(struct page *page)
if (pm_suspended_storage())
return 0;
+ page = compound_head(page);
delete_from_swap_cache(page);
SetPageDirty(page);
return 1;
@@ -1428,7 +1631,8 @@ int free_swap_and_cache(swp_entry_t entry)
p = _swap_info_get(entry);
if (p) {
count = __swap_entry_free(p, entry, 1);
- if (count == SWAP_HAS_CACHE) {
+ if (count == SWAP_HAS_CACHE &&
+ !swap_page_trans_huge_swapped(p, entry)) {
page = find_get_page(swap_address_space(entry),
swp_offset(entry));
if (page && !trylock_page(page)) {
@@ -1445,7 +1649,8 @@ int free_swap_and_cache(swp_entry_t entry)
*/
if (PageSwapCache(page) && !PageWriteback(page) &&
(!page_mapped(page) || mem_cgroup_swap_full(page)) &&
- !swap_swapcount(p, entry)) {
+ !swap_page_trans_huge_swapped(p, entry)) {
+ page = compound_head(page);
delete_from_swap_cache(page);
SetPageDirty(page);
}
@@ -1999,7 +2204,7 @@ int try_to_unuse(unsigned int type, bool frontswap,
.sync_mode = WB_SYNC_NONE,
};
- swap_writepage(page, &wbc);
+ swap_writepage(compound_head(page), &wbc);
lock_page(page);
wait_on_page_writeback(page);
}
@@ -2012,8 +2217,9 @@ int try_to_unuse(unsigned int type, bool frontswap,
* delete, since it may not have been written out to swap yet.
*/
if (PageSwapCache(page) &&
- likely(page_private(page) == entry.val))
- delete_from_swap_cache(page);
+ likely(page_private(page) == entry.val) &&
+ !page_swapped(page))
+ delete_from_swap_cache(compound_head(page));
/*
* So we could skip searching mms once swap count went
@@ -2226,10 +2432,24 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
return generic_swapfile_activate(sis, swap_file, span);
}
+static int swap_node(struct swap_info_struct *p)
+{
+ struct block_device *bdev;
+
+ if (p->bdev)
+ bdev = p->bdev;
+ else
+ bdev = p->swap_file->f_inode->i_sb->s_bdev;
+
+ return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE;
+}
+
static void _enable_swap_info(struct swap_info_struct *p, int prio,
unsigned char *swap_map,
struct swap_cluster_info *cluster_info)
{
+ int i;
+
if (prio >= 0)
p->prio = prio;
else
@@ -2239,7 +2459,16 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
* low-to-high, while swap ordering is high-to-low
*/
p->list.prio = -p->prio;
- p->avail_list.prio = -p->prio;
+ for_each_node(i) {
+ if (p->prio >= 0)
+ p->avail_lists[i].prio = -p->prio;
+ else {
+ if (swap_node(p) == i)
+ p->avail_lists[i].prio = 1;
+ else
+ p->avail_lists[i].prio = -p->prio;
+ }
+ }
p->swap_map = swap_map;
p->cluster_info = cluster_info;
p->flags |= SWP_WRITEOK;
@@ -2258,9 +2487,7 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
* swap_info_struct.
*/
plist_add(&p->list, &swap_active_head);
- spin_lock(&swap_avail_lock);
- plist_add(&p->avail_list, &swap_avail_head);
- spin_unlock(&swap_avail_lock);
+ add_to_avail_list(p);
}
static void enable_swap_info(struct swap_info_struct *p, int prio,
@@ -2345,17 +2572,19 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
spin_unlock(&swap_lock);
goto out_dput;
}
- spin_lock(&swap_avail_lock);
- plist_del(&p->avail_list, &swap_avail_head);
- spin_unlock(&swap_avail_lock);
+ del_from_avail_list(p);
spin_lock(&p->lock);
if (p->prio < 0) {
struct swap_info_struct *si = p;
+ int nid;
plist_for_each_entry_continue(si, &swap_active_head, list) {
si->prio++;
si->list.prio--;
- si->avail_list.prio--;
+ for_each_node(nid) {
+ if (si->avail_lists[nid].prio != 1)
+ si->avail_lists[nid].prio--;
+ }
}
least_priority++;
}
@@ -2387,6 +2616,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
if (p->flags & SWP_CONTINUED)
free_swap_count_continuations(p);
+ if (!p->bdev || !blk_queue_nonrot(bdev_get_queue(p->bdev)))
+ atomic_dec(&nr_rotate_swap);
+
mutex_lock(&swapon_mutex);
spin_lock(&swap_lock);
spin_lock(&p->lock);
@@ -2596,6 +2828,7 @@ static struct swap_info_struct *alloc_swap_info(void)
{
struct swap_info_struct *p;
unsigned int type;
+ int i;
p = kzalloc(sizeof(*p), GFP_KERNEL);
if (!p)
@@ -2631,7 +2864,8 @@ static struct swap_info_struct *alloc_swap_info(void)
}
INIT_LIST_HEAD(&p->first_swap_extent.list);
plist_node_init(&p->list, 0);
- plist_node_init(&p->avail_list, 0);
+ for_each_node(i)
+ plist_node_init(&p->avail_lists[i], 0);
p->flags = SWP_USED;
spin_unlock(&swap_lock);
spin_lock_init(&p->lock);
@@ -2873,6 +3107,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
+ if (!swap_avail_heads)
+ return -ENOMEM;
+
p = alloc_swap_info();
if (IS_ERR(p))
return PTR_ERR(p);
@@ -2963,7 +3200,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
cluster = per_cpu_ptr(p->percpu_cluster, cpu);
cluster_set_null(&cluster->index);
}
- }
+ } else
+ atomic_inc(&nr_rotate_swap);
error = swap_cgroup_swapon(p->type, maxpages);
if (error)
@@ -3457,3 +3695,21 @@ static void free_swap_count_continuations(struct swap_info_struct *si)
}
}
}
+
+static int __init swapfile_init(void)
+{
+ int nid;
+
+ swap_avail_heads = kmalloc_array(nr_node_ids, sizeof(struct plist_head),
+ GFP_KERNEL);
+ if (!swap_avail_heads) {
+ pr_emerg("Not enough memory for swap heads, swap is disabled\n");
+ return -ENOMEM;
+ }
+
+ for_each_node(nid)
+ plist_head_init(&swap_avail_heads[nid]);
+
+ return 0;
+}
+subsys_initcall(swapfile_init);
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 8bcb501bce60..81192701964d 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -371,6 +371,36 @@ extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
bool zeropage);
#endif /* CONFIG_HUGETLB_PAGE */
+static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
+ pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr,
+ unsigned long src_addr,
+ struct page **page,
+ bool zeropage)
+{
+ ssize_t err;
+
+ if (vma_is_anonymous(dst_vma)) {
+ if (!zeropage)
+ err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
+ dst_addr, src_addr, page);
+ else
+ err = mfill_zeropage_pte(dst_mm, dst_pmd,
+ dst_vma, dst_addr);
+ } else {
+ if (!zeropage)
+ err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd,
+ dst_vma, dst_addr,
+ src_addr, page);
+ else
+ err = shmem_mfill_zeropage_pte(dst_mm, dst_pmd,
+ dst_vma, dst_addr);
+ }
+
+ return err;
+}
+
static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
unsigned long dst_start,
unsigned long src_start,
@@ -487,22 +517,8 @@ retry:
BUG_ON(pmd_none(*dst_pmd));
BUG_ON(pmd_trans_huge(*dst_pmd));
- if (vma_is_anonymous(dst_vma)) {
- if (!zeropage)
- err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
- dst_addr, src_addr,
- &page);
- else
- err = mfill_zeropage_pte(dst_mm, dst_pmd,
- dst_vma, dst_addr);
- } else {
- err = -EINVAL; /* if zeropage is true return -EINVAL */
- if (likely(!zeropage))
- err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd,
- dst_vma, dst_addr,
- src_addr, &page);
- }
-
+ err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
+ src_addr, &page, zeropage);
cond_resched();
if (unlikely(err == -EFAULT)) {
diff --git a/mm/util.c b/mm/util.c
index 7b07ec852e01..34e57fae959d 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -614,7 +614,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
return 0;
if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
- free = global_page_state(NR_FREE_PAGES);
+ free = global_zone_page_state(NR_FREE_PAGES);
free += global_node_page_state(NR_FILE_PAGES);
/*
@@ -633,7 +633,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
* which are reclaimable, under pressure. The dentry
* cache and most inode caches should fall into this
*/
- free += global_page_state(NR_SLAB_RECLAIMABLE);
+ free += global_node_page_state(NR_SLAB_RECLAIMABLE);
/*
* Leave reserved pages. The pages are not for anonymous pages.
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 8698c1c86c4d..8a43db6284eb 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -49,12 +49,10 @@ static void __vunmap(const void *, int);
static void free_work(struct work_struct *w)
{
struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq);
- struct llist_node *llnode = llist_del_all(&p->list);
- while (llnode) {
- void *p = llnode;
- llnode = llist_next(llnode);
- __vunmap(p, 1);
- }
+ struct llist_node *t, *llnode;
+
+ llist_for_each_safe(llnode, t, llist_del_all(&p->list))
+ __vunmap((void *)llnode, 1);
}
/*** Page table manipulation functions ***/
@@ -1671,7 +1669,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
struct page **pages;
unsigned int nr_pages, array_size, i;
const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
- const gfp_t alloc_mask = gfp_mask | __GFP_HIGHMEM | __GFP_NOWARN;
+ const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN;
+ const gfp_t highmem_mask = (gfp_mask & (GFP_DMA | GFP_DMA32)) ?
+ 0 :
+ __GFP_HIGHMEM;
nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
array_size = (nr_pages * sizeof(struct page *));
@@ -1679,7 +1680,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
area->nr_pages = nr_pages;
/* Please note that the recursion is strictly bounded. */
if (array_size > PAGE_SIZE) {
- pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM,
+ pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask,
PAGE_KERNEL, node, area->caller);
} else {
pages = kmalloc_node(array_size, nested_gfp, node);
@@ -1700,9 +1701,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
}
if (node == NUMA_NO_NODE)
- page = alloc_page(alloc_mask);
+ page = alloc_page(alloc_mask|highmem_mask);
else
- page = alloc_pages_node(node, alloc_mask, 0);
+ page = alloc_pages_node(node, alloc_mask|highmem_mask, 0);
if (unlikely(!page)) {
/* Successfully allocated i pages, free them in __vunmap() */
@@ -1710,7 +1711,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
goto fail;
}
area->pages[i] = page;
- if (gfpflags_allow_blocking(gfp_mask))
+ if (gfpflags_allow_blocking(gfp_mask|highmem_mask))
cond_resched();
}
@@ -2479,7 +2480,7 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext,
* matching slot. While scanning, if any of the areas overlaps with
* existing vmap_area, the base address is pulled down to fit the
* area. Scanning is repeated till all the areas fit and then all
- * necessary data structres are inserted and the result is returned.
+ * necessary data structures are inserted and the result is returned.
*/
struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
const size_t *sizes, int nr_vms,
@@ -2507,15 +2508,11 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
if (start > offsets[last_area])
last_area = area;
- for (area2 = 0; area2 < nr_vms; area2++) {
+ for (area2 = area + 1; area2 < nr_vms; area2++) {
unsigned long start2 = offsets[area2];
unsigned long end2 = start2 + sizes[area2];
- if (area2 == area)
- continue;
-
- BUG_ON(start2 >= start && start2 < end);
- BUG_ON(end2 <= end && end2 > start);
+ BUG_ON(start2 < end && start < end2);
}
}
last_end = offsets[last_area] + sizes[last_area];
diff --git a/mm/vmscan.c b/mm/vmscan.c
index a1af041930a6..13d711dd8776 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -393,14 +393,15 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
unsigned long nr_to_scan = min(batch_size, total_scan);
shrinkctl->nr_to_scan = nr_to_scan;
+ shrinkctl->nr_scanned = nr_to_scan;
ret = shrinker->scan_objects(shrinker, shrinkctl);
if (ret == SHRINK_STOP)
break;
freed += ret;
- count_vm_events(SLABS_SCANNED, nr_to_scan);
- total_scan -= nr_to_scan;
- scanned += nr_to_scan;
+ count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned);
+ total_scan -= shrinkctl->nr_scanned;
+ scanned += shrinkctl->nr_scanned;
cond_resched();
}
@@ -535,7 +536,9 @@ static inline int is_page_cache_freeable(struct page *page)
* that isolated the page, the page cache radix tree and
* optional buffer heads at page->private.
*/
- return page_count(page) - page_has_private(page) == 2;
+ int radix_pins = PageTransHuge(page) && PageSwapCache(page) ?
+ HPAGE_PMD_NR : 1;
+ return page_count(page) - page_has_private(page) == 1 + radix_pins;
}
static int may_write_to_inode(struct inode *inode, struct scan_control *sc)
@@ -665,6 +668,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
bool reclaimed)
{
unsigned long flags;
+ int refcount;
BUG_ON(!PageLocked(page));
BUG_ON(mapping != page_mapping(page));
@@ -695,11 +699,15 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
* Note that if SetPageDirty is always performed via set_page_dirty,
* and thus under tree_lock, then this ordering is not required.
*/
- if (!page_ref_freeze(page, 2))
+ if (unlikely(PageTransHuge(page)) && PageSwapCache(page))
+ refcount = 1 + HPAGE_PMD_NR;
+ else
+ refcount = 2;
+ if (!page_ref_freeze(page, refcount))
goto cannot_free;
/* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
if (unlikely(PageDirty(page))) {
- page_ref_unfreeze(page, 2);
+ page_ref_unfreeze(page, refcount);
goto cannot_free;
}
@@ -1121,58 +1129,59 @@ static unsigned long shrink_page_list(struct list_head *page_list,
* Try to allocate it some swap space here.
* Lazyfree page could be freed directly
*/
- if (PageAnon(page) && PageSwapBacked(page) &&
- !PageSwapCache(page)) {
- if (!(sc->gfp_mask & __GFP_IO))
- goto keep_locked;
- if (PageTransHuge(page)) {
- /* cannot split THP, skip it */
- if (!can_split_huge_page(page, NULL))
- goto activate_locked;
- /*
- * Split pages without a PMD map right
- * away. Chances are some or all of the
- * tail pages can be freed without IO.
- */
- if (!compound_mapcount(page) &&
- split_huge_page_to_list(page, page_list))
- goto activate_locked;
- }
- if (!add_to_swap(page)) {
- if (!PageTransHuge(page))
- goto activate_locked;
- /* Split THP and swap individual base pages */
- if (split_huge_page_to_list(page, page_list))
- goto activate_locked;
- if (!add_to_swap(page))
- goto activate_locked;
- }
-
- /* XXX: We don't support THP writes */
- if (PageTransHuge(page) &&
- split_huge_page_to_list(page, page_list)) {
- delete_from_swap_cache(page);
- goto activate_locked;
- }
+ if (PageAnon(page) && PageSwapBacked(page)) {
+ if (!PageSwapCache(page)) {
+ if (!(sc->gfp_mask & __GFP_IO))
+ goto keep_locked;
+ if (PageTransHuge(page)) {
+ /* cannot split THP, skip it */
+ if (!can_split_huge_page(page, NULL))
+ goto activate_locked;
+ /*
+ * Split pages without a PMD map right
+ * away. Chances are some or all of the
+ * tail pages can be freed without IO.
+ */
+ if (!compound_mapcount(page) &&
+ split_huge_page_to_list(page,
+ page_list))
+ goto activate_locked;
+ }
+ if (!add_to_swap(page)) {
+ if (!PageTransHuge(page))
+ goto activate_locked;
+ /* Fallback to swap normal pages */
+ if (split_huge_page_to_list(page,
+ page_list))
+ goto activate_locked;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ count_vm_event(THP_SWPOUT_FALLBACK);
+#endif
+ if (!add_to_swap(page))
+ goto activate_locked;
+ }
- may_enter_fs = 1;
+ may_enter_fs = 1;
- /* Adding to swap updated mapping */
- mapping = page_mapping(page);
+ /* Adding to swap updated mapping */
+ mapping = page_mapping(page);
+ }
} else if (unlikely(PageTransHuge(page))) {
/* Split file THP */
if (split_huge_page_to_list(page, page_list))
goto keep_locked;
}
- VM_BUG_ON_PAGE(PageTransHuge(page), page);
-
/*
* The page is mapped into the page tables of one or more
* processes. Try to unmap it here.
*/
if (page_mapped(page)) {
- if (!try_to_unmap(page, ttu_flags | TTU_BATCH_FLUSH)) {
+ enum ttu_flags flags = ttu_flags | TTU_BATCH_FLUSH;
+
+ if (unlikely(PageTransHuge(page)))
+ flags |= TTU_SPLIT_HUGE_PMD;
+ if (!try_to_unmap(page, flags)) {
nr_unmap_fail++;
goto activate_locked;
}
@@ -1312,7 +1321,11 @@ free_it:
* Is there need to periodically free_page_list? It would
* appear not as the counts should be low
*/
- list_add(&page->lru, &free_pages);
+ if (unlikely(PageTransHuge(page))) {
+ mem_cgroup_uncharge(page);
+ (*get_compound_page_dtor(page))(page);
+ } else
+ list_add(&page->lru, &free_pages);
continue;
activate_locked:
@@ -1742,9 +1755,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
int file = is_file_lru(lru);
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
+ bool stalled = false;
while (unlikely(too_many_isolated(pgdat, file, sc))) {
- congestion_wait(BLK_RW_ASYNC, HZ/10);
+ if (stalled)
+ return 0;
+
+ /* wait a bit for the reclaimer. */
+ msleep(100);
+ stalled = true;
/* We are about to die and free our memory. Return now. */
if (fatal_signal_pending(current))
@@ -3525,8 +3544,6 @@ static int kswapd(void *p)
};
const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
- lockdep_set_current_reclaim_state(GFP_KERNEL);
-
if (!cpumask_empty(cpumask))
set_cpus_allowed_ptr(tsk, cpumask);
current->reclaim_state = &reclaim_state;
@@ -3585,14 +3602,15 @@ kswapd_try_sleep:
*/
trace_mm_vmscan_kswapd_wake(pgdat->node_id, classzone_idx,
alloc_order);
+ fs_reclaim_acquire(GFP_KERNEL);
reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx);
+ fs_reclaim_release(GFP_KERNEL);
if (reclaim_order < alloc_order)
goto kswapd_try_sleep;
}
tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
current->reclaim_state = NULL;
- lockdep_clear_current_reclaim_state();
return 0;
}
@@ -3655,14 +3673,14 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
unsigned int noreclaim_flag;
noreclaim_flag = memalloc_noreclaim_save();
- lockdep_set_current_reclaim_state(sc.gfp_mask);
+ fs_reclaim_acquire(sc.gfp_mask);
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;
nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
p->reclaim_state = NULL;
- lockdep_clear_current_reclaim_state();
+ fs_reclaim_release(sc.gfp_mask);
memalloc_noreclaim_restore(noreclaim_flag);
return nr_reclaimed;
@@ -3847,7 +3865,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
*/
noreclaim_flag = memalloc_noreclaim_save();
p->flags |= PF_SWAPWRITE;
- lockdep_set_current_reclaim_state(sc.gfp_mask);
+ fs_reclaim_acquire(sc.gfp_mask);
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;
@@ -3862,9 +3880,9 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
}
p->reclaim_state = NULL;
+ fs_reclaim_release(gfp_mask);
current->flags &= ~PF_SWAPWRITE;
memalloc_noreclaim_restore(noreclaim_flag);
- lockdep_clear_current_reclaim_state();
return sc.nr_reclaimed >= nr_pages;
}
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 9a4441bbeef2..c7e4b8458023 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -870,6 +870,9 @@ static int __fragmentation_index(unsigned int order, struct contig_page_info *in
{
unsigned long requested = 1UL << order;
+ if (WARN_ON_ONCE(order >= MAX_ORDER))
+ return 0;
+
if (!info->free_blocks_total)
return 0;
@@ -1071,6 +1074,8 @@ const char * const vmstat_text[] = {
#endif
"thp_zero_page_alloc",
"thp_zero_page_alloc_failed",
+ "thp_swpout",
+ "thp_swpout_fallback",
#endif
#ifdef CONFIG_MEMORY_BALLOON
"balloon_inflate",
@@ -1093,6 +1098,10 @@ const char * const vmstat_text[] = {
"vmacache_find_hits",
"vmacache_full_flushes",
#endif
+#ifdef CONFIG_SWAP
+ "swap_ra",
+ "swap_ra_hit",
+#endif
#endif /* CONFIG_VM_EVENTS_COUNTERS */
};
#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */
@@ -1250,7 +1259,7 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m,
seq_putc(m, '\n');
}
-/* Print out the free pages at each order for each migratetype */
+/* Print out the number of pageblocks for each migratetype */
static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
{
int mtype;
@@ -1500,7 +1509,7 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
if (!v)
return ERR_PTR(-ENOMEM);
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
- v[i] = global_page_state(i);
+ v[i] = global_zone_page_state(i);
v += NR_VM_ZONE_STAT_ITEMS;
for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
@@ -1589,7 +1598,7 @@ int vmstat_refresh(struct ctl_table *table, int write,
* which can equally be echo'ed to or cat'ted from (by root),
* can be used to update the stats just before reading them.
*
- * Oh, and since global_page_state() etc. are so careful to hide
+ * Oh, and since global_zone_page_state() etc. are so careful to hide
* transiently negative values, report an error here if any of
* the stats is negative, so we know to go looking for imbalance.
*/
diff --git a/mm/z3fold.c b/mm/z3fold.c
index 54f63c4a809a..486550df32be 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -23,10 +23,13 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/atomic.h>
+#include <linux/sched.h>
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/module.h>
+#include <linux/percpu.h>
#include <linux/preempt.h>
+#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/zpool.h>
@@ -48,11 +51,15 @@ enum buddy {
};
/*
- * struct z3fold_header - z3fold page metadata occupying the first chunk of each
+ * struct z3fold_header - z3fold page metadata occupying first chunks of each
* z3fold page, except for HEADLESS pages
- * @buddy: links the z3fold page into the relevant list in the pool
+ * @buddy: links the z3fold page into the relevant list in the
+ * pool
* @page_lock: per-page lock
- * @refcount: reference cound for the z3fold page
+ * @refcount: reference count for the z3fold page
+ * @work: work_struct for page layout optimization
+ * @pool: pointer to the pool which this page belongs to
+ * @cpu: CPU which this page "belongs" to
* @first_chunks: the size of the first buddy in chunks, 0 if free
* @middle_chunks: the size of the middle buddy in chunks, 0 if free
* @last_chunks: the size of the last buddy in chunks, 0 if free
@@ -62,6 +69,9 @@ struct z3fold_header {
struct list_head buddy;
spinlock_t page_lock;
struct kref refcount;
+ struct work_struct work;
+ struct z3fold_pool *pool;
+ short cpu;
unsigned short first_chunks;
unsigned short middle_chunks;
unsigned short last_chunks;
@@ -92,28 +102,39 @@ struct z3fold_header {
/**
* struct z3fold_pool - stores metadata for each z3fold pool
- * @lock: protects all pool fields and first|last_chunk fields of any
- * z3fold page in the pool
- * @unbuddied: array of lists tracking z3fold pages that contain 2- buddies;
- * the lists each z3fold page is added to depends on the size of
- * its free region.
+ * @name: pool name
+ * @lock: protects pool unbuddied/lru lists
+ * @stale_lock: protects pool stale page list
+ * @unbuddied: per-cpu array of lists tracking z3fold pages that contain 2-
+ * buddies; the list each z3fold page is added to depends on
+ * the size of its free region.
* @lru: list tracking the z3fold pages in LRU order by most recently
* added buddy.
+ * @stale: list of pages marked for freeing
* @pages_nr: number of z3fold pages in the pool.
* @ops: pointer to a structure of user defined operations specified at
* pool creation time.
+ * @compact_wq: workqueue for page layout background optimization
+ * @release_wq: workqueue for safe page release
+ * @work: work_struct for safe page release
*
* This structure is allocated at pool creation time and maintains metadata
* pertaining to a particular z3fold pool.
*/
struct z3fold_pool {
+ const char *name;
spinlock_t lock;
- struct list_head unbuddied[NCHUNKS];
+ spinlock_t stale_lock;
+ struct list_head *unbuddied;
struct list_head lru;
+ struct list_head stale;
atomic64_t pages_nr;
const struct z3fold_ops *ops;
struct zpool *zpool;
const struct zpool_ops *zpool_ops;
+ struct workqueue_struct *compact_wq;
+ struct workqueue_struct *release_wq;
+ struct work_struct work;
};
/*
@@ -122,9 +143,10 @@ struct z3fold_pool {
enum z3fold_page_flags {
PAGE_HEADLESS = 0,
MIDDLE_CHUNK_MAPPED,
+ NEEDS_COMPACTING,
+ PAGE_STALE
};
-
/*****************
* Helpers
*****************/
@@ -138,14 +160,19 @@ static int size_to_chunks(size_t size)
#define for_each_unbuddied_list(_iter, _begin) \
for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++)
+static void compact_page_work(struct work_struct *w);
+
/* Initializes the z3fold header of a newly allocated z3fold page */
-static struct z3fold_header *init_z3fold_page(struct page *page)
+static struct z3fold_header *init_z3fold_page(struct page *page,
+ struct z3fold_pool *pool)
{
struct z3fold_header *zhdr = page_address(page);
INIT_LIST_HEAD(&page->lru);
clear_bit(PAGE_HEADLESS, &page->private);
clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
+ clear_bit(NEEDS_COMPACTING, &page->private);
+ clear_bit(PAGE_STALE, &page->private);
spin_lock_init(&zhdr->page_lock);
kref_init(&zhdr->refcount);
@@ -154,7 +181,10 @@ static struct z3fold_header *init_z3fold_page(struct page *page)
zhdr->last_chunks = 0;
zhdr->first_num = 0;
zhdr->start_middle = 0;
+ zhdr->cpu = -1;
+ zhdr->pool = pool;
INIT_LIST_HEAD(&zhdr->buddy);
+ INIT_WORK(&zhdr->work, compact_page_work);
return zhdr;
}
@@ -164,21 +194,6 @@ static void free_z3fold_page(struct page *page)
__free_page(page);
}
-static void release_z3fold_page(struct kref *ref)
-{
- struct z3fold_header *zhdr;
- struct page *page;
-
- zhdr = container_of(ref, struct z3fold_header, refcount);
- page = virt_to_page(zhdr);
-
- if (!list_empty(&zhdr->buddy))
- list_del(&zhdr->buddy);
- if (!list_empty(&page->lru))
- list_del(&page->lru);
- free_z3fold_page(page);
-}
-
/* Lock a z3fold page */
static inline void z3fold_page_lock(struct z3fold_header *zhdr)
{
@@ -228,6 +243,76 @@ static enum buddy handle_to_buddy(unsigned long handle)
return (handle - zhdr->first_num) & BUDDY_MASK;
}
+static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
+{
+ struct page *page = virt_to_page(zhdr);
+ struct z3fold_pool *pool = zhdr->pool;
+
+ WARN_ON(!list_empty(&zhdr->buddy));
+ set_bit(PAGE_STALE, &page->private);
+ spin_lock(&pool->lock);
+ if (!list_empty(&page->lru))
+ list_del(&page->lru);
+ spin_unlock(&pool->lock);
+ if (locked)
+ z3fold_page_unlock(zhdr);
+ spin_lock(&pool->stale_lock);
+ list_add(&zhdr->buddy, &pool->stale);
+ queue_work(pool->release_wq, &pool->work);
+ spin_unlock(&pool->stale_lock);
+}
+
+static void __attribute__((__unused__))
+ release_z3fold_page(struct kref *ref)
+{
+ struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
+ refcount);
+ __release_z3fold_page(zhdr, false);
+}
+
+static void release_z3fold_page_locked(struct kref *ref)
+{
+ struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
+ refcount);
+ WARN_ON(z3fold_page_trylock(zhdr));
+ __release_z3fold_page(zhdr, true);
+}
+
+static void release_z3fold_page_locked_list(struct kref *ref)
+{
+ struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
+ refcount);
+ spin_lock(&zhdr->pool->lock);
+ list_del_init(&zhdr->buddy);
+ spin_unlock(&zhdr->pool->lock);
+
+ WARN_ON(z3fold_page_trylock(zhdr));
+ __release_z3fold_page(zhdr, true);
+}
+
+static void free_pages_work(struct work_struct *w)
+{
+ struct z3fold_pool *pool = container_of(w, struct z3fold_pool, work);
+
+ spin_lock(&pool->stale_lock);
+ while (!list_empty(&pool->stale)) {
+ struct z3fold_header *zhdr = list_first_entry(&pool->stale,
+ struct z3fold_header, buddy);
+ struct page *page = virt_to_page(zhdr);
+
+ list_del(&zhdr->buddy);
+ if (WARN_ON(!test_bit(PAGE_STALE, &page->private)))
+ continue;
+ clear_bit(NEEDS_COMPACTING, &page->private);
+ spin_unlock(&pool->stale_lock);
+ cancel_work_sync(&zhdr->work);
+ free_z3fold_page(page);
+ cond_resched();
+ spin_lock(&pool->stale_lock);
+ }
+ spin_unlock(&pool->stale_lock);
+}
+
/*
* Returns the number of free chunks in a z3fold page.
* NB: can't be used with HEADLESS pages.
@@ -252,46 +337,6 @@ static int num_free_chunks(struct z3fold_header *zhdr)
return nfree;
}
-/*****************
- * API Functions
-*****************/
-/**
- * z3fold_create_pool() - create a new z3fold pool
- * @gfp: gfp flags when allocating the z3fold pool structure
- * @ops: user-defined operations for the z3fold pool
- *
- * Return: pointer to the new z3fold pool or NULL if the metadata allocation
- * failed.
- */
-static struct z3fold_pool *z3fold_create_pool(gfp_t gfp,
- const struct z3fold_ops *ops)
-{
- struct z3fold_pool *pool;
- int i;
-
- pool = kzalloc(sizeof(struct z3fold_pool), gfp);
- if (!pool)
- return NULL;
- spin_lock_init(&pool->lock);
- for_each_unbuddied_list(i, 0)
- INIT_LIST_HEAD(&pool->unbuddied[i]);
- INIT_LIST_HEAD(&pool->lru);
- atomic64_set(&pool->pages_nr, 0);
- pool->ops = ops;
- return pool;
-}
-
-/**
- * z3fold_destroy_pool() - destroys an existing z3fold pool
- * @pool: the z3fold pool to be destroyed
- *
- * The pool should be emptied before this function is called.
- */
-static void z3fold_destroy_pool(struct z3fold_pool *pool)
-{
- kfree(pool);
-}
-
static inline void *mchunk_memmove(struct z3fold_header *zhdr,
unsigned short dst_chunk)
{
@@ -347,6 +392,117 @@ static int z3fold_compact_page(struct z3fold_header *zhdr)
return 0;
}
+static void do_compact_page(struct z3fold_header *zhdr, bool locked)
+{
+ struct z3fold_pool *pool = zhdr->pool;
+ struct page *page;
+ struct list_head *unbuddied;
+ int fchunks;
+
+ page = virt_to_page(zhdr);
+ if (locked)
+ WARN_ON(z3fold_page_trylock(zhdr));
+ else
+ z3fold_page_lock(zhdr);
+ if (test_bit(PAGE_STALE, &page->private) ||
+ !test_and_clear_bit(NEEDS_COMPACTING, &page->private)) {
+ z3fold_page_unlock(zhdr);
+ return;
+ }
+ spin_lock(&pool->lock);
+ list_del_init(&zhdr->buddy);
+ spin_unlock(&pool->lock);
+
+ z3fold_compact_page(zhdr);
+ unbuddied = get_cpu_ptr(pool->unbuddied);
+ fchunks = num_free_chunks(zhdr);
+ if (fchunks < NCHUNKS &&
+ (!zhdr->first_chunks || !zhdr->middle_chunks ||
+ !zhdr->last_chunks)) {
+ /* the page's not completely free and it's unbuddied */
+ spin_lock(&pool->lock);
+ list_add(&zhdr->buddy, &unbuddied[fchunks]);
+ spin_unlock(&pool->lock);
+ zhdr->cpu = smp_processor_id();
+ }
+ put_cpu_ptr(pool->unbuddied);
+ z3fold_page_unlock(zhdr);
+}
+
+static void compact_page_work(struct work_struct *w)
+{
+ struct z3fold_header *zhdr = container_of(w, struct z3fold_header,
+ work);
+
+ do_compact_page(zhdr, false);
+}
+
+
+/*
+ * API Functions
+ */
+
+/**
+ * z3fold_create_pool() - create a new z3fold pool
+ * @name: pool name
+ * @gfp: gfp flags when allocating the z3fold pool structure
+ * @ops: user-defined operations for the z3fold pool
+ *
+ * Return: pointer to the new z3fold pool or NULL if the metadata allocation
+ * failed.
+ */
+static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp,
+ const struct z3fold_ops *ops)
+{
+ struct z3fold_pool *pool = NULL;
+ int i, cpu;
+
+ pool = kzalloc(sizeof(struct z3fold_pool), gfp);
+ if (!pool)
+ goto out;
+ spin_lock_init(&pool->lock);
+ spin_lock_init(&pool->stale_lock);
+ pool->unbuddied = __alloc_percpu(sizeof(struct list_head)*NCHUNKS, 2);
+ for_each_possible_cpu(cpu) {
+ struct list_head *unbuddied =
+ per_cpu_ptr(pool->unbuddied, cpu);
+ for_each_unbuddied_list(i, 0)
+ INIT_LIST_HEAD(&unbuddied[i]);
+ }
+ INIT_LIST_HEAD(&pool->lru);
+ INIT_LIST_HEAD(&pool->stale);
+ atomic64_set(&pool->pages_nr, 0);
+ pool->name = name;
+ pool->compact_wq = create_singlethread_workqueue(pool->name);
+ if (!pool->compact_wq)
+ goto out;
+ pool->release_wq = create_singlethread_workqueue(pool->name);
+ if (!pool->release_wq)
+ goto out_wq;
+ INIT_WORK(&pool->work, free_pages_work);
+ pool->ops = ops;
+ return pool;
+
+out_wq:
+ destroy_workqueue(pool->compact_wq);
+out:
+ kfree(pool);
+ return NULL;
+}
+
+/**
+ * z3fold_destroy_pool() - destroys an existing z3fold pool
+ * @pool: the z3fold pool to be destroyed
+ *
+ * The pool should be emptied before this function is called.
+ */
+static void z3fold_destroy_pool(struct z3fold_pool *pool)
+{
+ destroy_workqueue(pool->release_wq);
+ destroy_workqueue(pool->compact_wq);
+ kfree(pool);
+}
+
/**
* z3fold_alloc() - allocates a region of a given size
* @pool: z3fold pool from which to allocate
@@ -371,8 +527,9 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
{
int chunks = 0, i, freechunks;
struct z3fold_header *zhdr = NULL;
+ struct page *page = NULL;
enum buddy bud;
- struct page *page;
+ bool can_sleep = (gfp & __GFP_RECLAIM) == __GFP_RECLAIM;
if (!size || (gfp & __GFP_HIGHMEM))
return -EINVAL;
@@ -383,23 +540,57 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE)
bud = HEADLESS;
else {
+ struct list_head *unbuddied;
chunks = size_to_chunks(size);
+lookup:
/* First, try to find an unbuddied z3fold page. */
- zhdr = NULL;
+ unbuddied = get_cpu_ptr(pool->unbuddied);
for_each_unbuddied_list(i, chunks) {
- spin_lock(&pool->lock);
- zhdr = list_first_entry_or_null(&pool->unbuddied[i],
+ struct list_head *l = &unbuddied[i];
+
+ zhdr = list_first_entry_or_null(READ_ONCE(l),
struct z3fold_header, buddy);
- if (!zhdr || !z3fold_page_trylock(zhdr)) {
- spin_unlock(&pool->lock);
+
+ if (!zhdr)
continue;
+
+ /* Re-check under lock. */
+ spin_lock(&pool->lock);
+ l = &unbuddied[i];
+ if (unlikely(zhdr != list_first_entry(READ_ONCE(l),
+ struct z3fold_header, buddy)) ||
+ !z3fold_page_trylock(zhdr)) {
+ spin_unlock(&pool->lock);
+ put_cpu_ptr(pool->unbuddied);
+ goto lookup;
}
- kref_get(&zhdr->refcount);
list_del_init(&zhdr->buddy);
+ zhdr->cpu = -1;
spin_unlock(&pool->lock);
page = virt_to_page(zhdr);
+ if (test_bit(NEEDS_COMPACTING, &page->private)) {
+ z3fold_page_unlock(zhdr);
+ zhdr = NULL;
+ put_cpu_ptr(pool->unbuddied);
+ if (can_sleep)
+ cond_resched();
+ goto lookup;
+ }
+
+ /*
+ * this page could not be removed from its unbuddied
+ * list while pool lock was held, and then we've taken
+ * page lock so kref_put could not be called before
+ * we got here, so it's safe to just call kref_get()
+ */
+ kref_get(&zhdr->refcount);
+ break;
+ }
+ put_cpu_ptr(pool->unbuddied);
+
+ if (zhdr) {
if (zhdr->first_chunks == 0) {
if (zhdr->middle_chunks != 0 &&
chunks >= zhdr->start_middle)
@@ -411,32 +602,49 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
else if (zhdr->middle_chunks == 0)
bud = MIDDLE;
else {
- z3fold_page_unlock(zhdr);
- spin_lock(&pool->lock);
if (kref_put(&zhdr->refcount,
- release_z3fold_page))
+ release_z3fold_page_locked))
atomic64_dec(&pool->pages_nr);
- spin_unlock(&pool->lock);
+ else
+ z3fold_page_unlock(zhdr);
pr_err("No free chunks in unbuddied\n");
WARN_ON(1);
- continue;
+ goto lookup;
}
goto found;
}
bud = FIRST;
}
- /* Couldn't find unbuddied z3fold page, create new one */
- page = alloc_page(gfp);
+ spin_lock(&pool->stale_lock);
+ zhdr = list_first_entry_or_null(&pool->stale,
+ struct z3fold_header, buddy);
+ /*
+ * Before allocating a page, let's see if we can take one from the
+ * stale pages list. cancel_work_sync() can sleep so we must make
+ * sure it won't be called in case we're in atomic context.
+ */
+ if (zhdr && (can_sleep || !work_pending(&zhdr->work) ||
+ !unlikely(work_busy(&zhdr->work)))) {
+ list_del(&zhdr->buddy);
+ clear_bit(NEEDS_COMPACTING, &page->private);
+ spin_unlock(&pool->stale_lock);
+ if (can_sleep)
+ cancel_work_sync(&zhdr->work);
+ page = virt_to_page(zhdr);
+ } else {
+ spin_unlock(&pool->stale_lock);
+ page = alloc_page(gfp);
+ }
+
if (!page)
return -ENOMEM;
atomic64_inc(&pool->pages_nr);
- zhdr = init_z3fold_page(page);
+ zhdr = init_z3fold_page(page, pool);
if (bud == HEADLESS) {
set_bit(PAGE_HEADLESS, &page->private);
- spin_lock(&pool->lock);
goto headless;
}
z3fold_page_lock(zhdr);
@@ -451,15 +659,21 @@ found:
zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS;
}
- spin_lock(&pool->lock);
if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 ||
zhdr->middle_chunks == 0) {
+ struct list_head *unbuddied = get_cpu_ptr(pool->unbuddied);
+
/* Add to unbuddied list */
freechunks = num_free_chunks(zhdr);
- list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
+ spin_lock(&pool->lock);
+ list_add(&zhdr->buddy, &unbuddied[freechunks]);
+ spin_unlock(&pool->lock);
+ zhdr->cpu = smp_processor_id();
+ put_cpu_ptr(pool->unbuddied);
}
headless:
+ spin_lock(&pool->lock);
/* Add/move z3fold page to beginning of LRU */
if (!list_empty(&page->lru))
list_del(&page->lru);
@@ -487,7 +701,6 @@ headless:
static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
{
struct z3fold_header *zhdr;
- int freechunks;
struct page *page;
enum buddy bud;
@@ -526,25 +739,27 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
spin_unlock(&pool->lock);
free_z3fold_page(page);
atomic64_dec(&pool->pages_nr);
- } else {
- if (zhdr->first_chunks != 0 || zhdr->middle_chunks != 0 ||
- zhdr->last_chunks != 0) {
- z3fold_compact_page(zhdr);
- /* Add to the unbuddied list */
- spin_lock(&pool->lock);
- if (!list_empty(&zhdr->buddy))
- list_del(&zhdr->buddy);
- freechunks = num_free_chunks(zhdr);
- list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
- spin_unlock(&pool->lock);
- }
+ return;
+ }
+
+ if (kref_put(&zhdr->refcount, release_z3fold_page_locked_list)) {
+ atomic64_dec(&pool->pages_nr);
+ return;
+ }
+ if (test_and_set_bit(NEEDS_COMPACTING, &page->private)) {
z3fold_page_unlock(zhdr);
+ return;
+ }
+ if (zhdr->cpu < 0 || !cpu_online(zhdr->cpu)) {
spin_lock(&pool->lock);
- if (kref_put(&zhdr->refcount, release_z3fold_page))
- atomic64_dec(&pool->pages_nr);
+ list_del_init(&zhdr->buddy);
spin_unlock(&pool->lock);
+ zhdr->cpu = -1;
+ do_compact_page(zhdr, true);
+ return;
}
-
+ queue_work_on(zhdr->cpu, pool->compact_wq, &zhdr->work);
+ z3fold_page_unlock(zhdr);
}
/**
@@ -585,9 +800,10 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
*/
static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
{
- int i, ret = 0, freechunks;
- struct z3fold_header *zhdr;
- struct page *page;
+ int i, ret = 0;
+ struct z3fold_header *zhdr = NULL;
+ struct page *page = NULL;
+ struct list_head *pos;
unsigned long first_handle = 0, middle_handle = 0, last_handle = 0;
spin_lock(&pool->lock);
@@ -600,16 +816,24 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
spin_unlock(&pool->lock);
return -EINVAL;
}
- page = list_last_entry(&pool->lru, struct page, lru);
+ list_for_each_prev(pos, &pool->lru) {
+ page = list_entry(pos, struct page, lru);
+ if (test_bit(PAGE_HEADLESS, &page->private))
+ /* candidate found */
+ break;
+
+ zhdr = page_address(page);
+ if (!z3fold_page_trylock(zhdr))
+ continue; /* can't evict at this point */
+ kref_get(&zhdr->refcount);
+ list_del_init(&zhdr->buddy);
+ zhdr->cpu = -1;
+ }
+
list_del_init(&page->lru);
+ spin_unlock(&pool->lock);
- zhdr = page_address(page);
if (!test_bit(PAGE_HEADLESS, &page->private)) {
- if (!list_empty(&zhdr->buddy))
- list_del_init(&zhdr->buddy);
- kref_get(&zhdr->refcount);
- spin_unlock(&pool->lock);
- z3fold_page_lock(zhdr);
/*
* We need encode the handles before unlocking, since
* we can race with free that will set
@@ -624,11 +848,14 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
middle_handle = encode_handle(zhdr, MIDDLE);
if (zhdr->last_chunks)
last_handle = encode_handle(zhdr, LAST);
+ /*
+ * it's safe to unlock here because we hold a
+ * reference to this page
+ */
z3fold_page_unlock(zhdr);
} else {
first_handle = encode_handle(zhdr, HEADLESS);
last_handle = middle_handle = 0;
- spin_unlock(&pool->lock);
}
/* Issue the eviction callback(s) */
@@ -652,31 +879,12 @@ next:
if (ret == 0) {
free_z3fold_page(page);
return 0;
- } else {
- spin_lock(&pool->lock);
- }
- } else {
- z3fold_page_lock(zhdr);
- if ((zhdr->first_chunks || zhdr->last_chunks ||
- zhdr->middle_chunks) &&
- !(zhdr->first_chunks && zhdr->last_chunks &&
- zhdr->middle_chunks)) {
- z3fold_compact_page(zhdr);
- /* add to unbuddied list */
- spin_lock(&pool->lock);
- freechunks = num_free_chunks(zhdr);
- list_add(&zhdr->buddy,
- &pool->unbuddied[freechunks]);
- spin_unlock(&pool->lock);
- }
- z3fold_page_unlock(zhdr);
- spin_lock(&pool->lock);
- if (kref_put(&zhdr->refcount, release_z3fold_page)) {
- spin_unlock(&pool->lock);
- atomic64_dec(&pool->pages_nr);
- return 0;
}
+ } else if (kref_put(&zhdr->refcount, release_z3fold_page)) {
+ atomic64_dec(&pool->pages_nr);
+ return 0;
}
+ spin_lock(&pool->lock);
/*
* Add to the beginning of LRU.
@@ -795,7 +1003,8 @@ static void *z3fold_zpool_create(const char *name, gfp_t gfp,
{
struct z3fold_pool *pool;
- pool = z3fold_create_pool(gfp, zpool_ops ? &z3fold_zpool_ops : NULL);
+ pool = z3fold_create_pool(name, gfp,
+ zpool_ops ? &z3fold_zpool_ops : NULL);
if (pool) {
pool->zpool = zpool;
pool->zpool_ops = zpool_ops;
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 013eea76685e..62457eb82330 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -1983,8 +1983,11 @@ int zs_page_migrate(struct address_space *mapping, struct page *newpage,
spin_lock(&class->lock);
if (!get_zspage_inuse(zspage)) {
- ret = -EBUSY;
- goto unlock_class;
+ /*
+ * Set "offset" to end of the page so that every loops
+ * skips unnecessary object scanning.
+ */
+ offset = PAGE_SIZE;
}
pos = offset;
@@ -2052,7 +2055,6 @@ unpin_objects:
}
}
kunmap_atomic(s_addr);
-unlock_class:
spin_unlock(&class->lock);
migrate_write_unlock(zspage);
@@ -2453,7 +2455,6 @@ void zs_destroy_pool(struct zs_pool *pool)
}
destroy_cache(pool);
- kfree(pool->size_class);
kfree(pool->name);
kfree(pool);
}