aboutsummaryrefslogtreecommitdiff
path: root/fs/dax.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/dax.c')
-rw-r--r--fs/dax.c216
1 files changed, 149 insertions, 67 deletions
diff --git a/fs/dax.c b/fs/dax.c
index aaec72ded1b6..641192808bb6 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -351,6 +351,19 @@ static void dax_disassociate_entry(void *entry, struct address_space *mapping,
}
}
+/*
+ * Inspect each pfn that for_each_mapped_pfn() yields for @entry and hand
+ * back the first page whose reference count is greater than one. NULL is
+ * returned when no such page is encountered.
+ */
+static struct page *dax_busy_page(void *entry)
+{
+ unsigned long cur_pfn;
+
+ for_each_mapped_pfn(entry, cur_pfn) {
+ struct page *candidate = pfn_to_page(cur_pfn);
+
+ /* A count above one means someone besides the mapping holds it. */
+ if (page_ref_count(candidate) > 1)
+ return candidate;
+ }
+
+ return NULL;
+}
+
/*
* Find radix tree entry at given index. If it points to an exceptional entry,
* return it with the radix tree entry locked. If the radix tree doesn't
@@ -492,6 +505,90 @@ restart:
return entry;
}
+/**
+ * dax_layout_busy_page - find first pinned page in @mapping
+ * @mapping: address space to scan for a page with ref count > 1
+ *
+ * DAX requires ZONE_DEVICE mapped pages. These pages are never
+ * 'onlined' to the page allocator so they are considered idle when
+ * page->count == 1. A filesystem uses this interface to determine if
+ * any page in the mapping is busy, i.e. for DMA, or other
+ * get_user_pages() usages.
+ *
+ * It is expected that the filesystem is holding locks to block the
+ * establishment of new mappings in this address_space. I.e. it expects
+ * to be able to run unmap_mapping_range() and subsequently not race
+ * mapping_mapped() becoming true.
+ *
+ * Return: the first page found with an elevated reference count, or
+ * NULL when no busy page is found or the scan is skipped (FS_DAX_LIMITED,
+ * non-DAX mapping, or mapping not currently mapped).
+ */
+struct page *dax_layout_busy_page(struct address_space *mapping)
+{
+ pgoff_t indices[PAGEVEC_SIZE];
+ struct page *page = NULL;
+ struct pagevec pvec;
+ pgoff_t index, end;
+ unsigned i;
+
+ /*
+ * In the 'limited' case get_user_pages() for dax is disabled.
+ */
+ if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
+ return NULL;
+
+ if (!dax_mapping(mapping) || !mapping_mapped(mapping))
+ return NULL;
+
+ pagevec_init(&pvec);
+ index = 0;
+ end = -1;
+
+ /*
+ * If we race get_user_pages_fast() here either we'll see the
+ * elevated page count in the pagevec_lookup and wait, or
+ * get_user_pages_fast() will see that the page it took a reference
+ * against is no longer mapped in the page tables and bail to the
+ * get_user_pages() slow path. The slow path is protected by
+ * pte_lock() and pmd_lock(). New references are not taken without
+ * holding those locks, and unmap_mapping_range() will not zero the
+ * pte or pmd without holding the respective lock, so we are
+ * guaranteed to either see new references or prevent new
+ * references from being established.
+ *
+ * even_cows is 0: private COW copies are ordinary anonymous pages
+ * that cannot pin DAX blocks, and zapping them would silently throw
+ * away data written through a MAP_PRIVATE mapping.
+ */
+ unmap_mapping_range(mapping, 0, 0, 0);
+
+ while (index < end && pagevec_lookup_entries(&pvec, mapping, index,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE),
+ indices)) {
+ /* How far to step past the last index seen in this batch. */
+ pgoff_t nr_pages = 1;
+
+ for (i = 0; i < pagevec_count(&pvec); i++) {
+ struct page *pvec_ent = pvec.pages[i];
+ void *entry;
+
+ index = indices[i];
+ if (index >= end)
+ break;
+
+ if (!radix_tree_exceptional_entry(pvec_ent))
+ continue;
+
+ xa_lock_irq(&mapping->i_pages);
+ entry = get_unlocked_mapping_entry(mapping, index, NULL);
+ if (entry) {
+ page = dax_busy_page(entry);
+ /*
+ * Account for a multi-order entry at the end of
+ * the pagevec: stepping by a single page would
+ * land inside the same entry and the lookup
+ * would return it again, forever.
+ */
+ if (i + 1 >= pagevec_count(&pvec))
+ nr_pages = 1UL << dax_radix_order(entry);
+ }
+ put_unlocked_mapping_entry(mapping, index, entry);
+ xa_unlock_irq(&mapping->i_pages);
+ if (page)
+ break;
+ }
+ pagevec_remove_exceptionals(&pvec);
+ pagevec_release(&pvec);
+ index += nr_pages;
+
+ if (page)
+ break;
+ }
+ return page;
+}
+EXPORT_SYMBOL_GPL(dax_layout_busy_page);
+
static int __dax_invalidate_mapping_entry(struct address_space *mapping,
pgoff_t index, bool trunc)
{
@@ -677,7 +774,7 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping,
* downgrading page table protection not changing it to point
* to a new page.
*
- * See Documentation/vm/mmu_notifier.txt
+ * See Documentation/vm/mmu_notifier.rst
*/
if (pmdp) {
#ifdef CONFIG_FS_DAX_PMD
@@ -905,14 +1002,13 @@ out:
* If this page is ever written to we will re-fault and change the mapping to
* point to real DAX storage instead.
*/
-static int dax_load_hole(struct address_space *mapping, void *entry,
+static vm_fault_t dax_load_hole(struct address_space *mapping, void *entry,
struct vm_fault *vmf)
{
struct inode *inode = mapping->host;
unsigned long vaddr = vmf->address;
- int ret = VM_FAULT_NOPAGE;
+ vm_fault_t ret = VM_FAULT_NOPAGE;
struct page *zero_page;
- void *entry2;
pfn_t pfn;
zero_page = ZERO_PAGE(0);
@@ -922,14 +1018,9 @@ static int dax_load_hole(struct address_space *mapping, void *entry,
}
pfn = page_to_pfn_t(zero_page);
- entry2 = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
- RADIX_DAX_ZERO_PAGE, false);
- if (IS_ERR(entry2)) {
- ret = VM_FAULT_SIGBUS;
- goto out;
- }
-
- vm_insert_mixed(vmf->vma, vaddr, pfn);
+ dax_insert_mapping_entry(mapping, vmf, entry, pfn, RADIX_DAX_ZERO_PAGE,
+ false);
+ ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
out:
trace_dax_load_hole(inode, vmf, ret);
return ret;
@@ -991,6 +1082,7 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
struct iov_iter *iter = data;
loff_t end = pos + length, done = 0;
ssize_t ret = 0;
+ size_t xfer;
int id;
if (iov_iter_rw(iter) == READ) {
@@ -1054,18 +1146,20 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
* vfs_write(), depending on which operation we are doing.
*/
if (iov_iter_rw(iter) == WRITE)
- map_len = dax_copy_from_iter(dax_dev, pgoff, kaddr,
+ xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr,
map_len, iter);
else
- map_len = copy_to_iter(kaddr, map_len, iter);
- if (map_len <= 0) {
- ret = map_len ? map_len : -EFAULT;
- break;
- }
+ xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr,
+ map_len, iter);
+
+ pos += xfer;
+ length -= xfer;
+ done += xfer;
- pos += map_len;
- length -= map_len;
- done += map_len;
+ if (xfer == 0)
+ ret = -EFAULT;
+ if (xfer < map_len)
+ break;
}
dax_read_unlock(id);
@@ -1112,7 +1206,7 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
}
EXPORT_SYMBOL_GPL(dax_iomap_rw);
-static int dax_fault_return(int error)
+static vm_fault_t dax_fault_return(int error)
{
if (error == 0)
return VM_FAULT_NOPAGE;
@@ -1132,7 +1226,7 @@ static bool dax_fault_is_synchronous(unsigned long flags,
&& (iomap->flags & IOMAP_F_DIRTY);
}
-static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
+static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
int *iomap_errp, const struct iomap_ops *ops)
{
struct vm_area_struct *vma = vmf->vma;
@@ -1145,18 +1239,18 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
int error, major = 0;
bool write = vmf->flags & FAULT_FLAG_WRITE;
bool sync;
- int vmf_ret = 0;
+ vm_fault_t ret = 0;
void *entry;
pfn_t pfn;
- trace_dax_pte_fault(inode, vmf, vmf_ret);
+ trace_dax_pte_fault(inode, vmf, ret);
/*
* Check whether offset isn't beyond end of file now. Caller is supposed
* to hold locks serializing us with truncate / punch hole so this is
* a reliable test.
*/
if (pos >= i_size_read(inode)) {
- vmf_ret = VM_FAULT_SIGBUS;
+ ret = VM_FAULT_SIGBUS;
goto out;
}
@@ -1165,7 +1259,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
if (IS_ERR(entry)) {
- vmf_ret = dax_fault_return(PTR_ERR(entry));
+ ret = dax_fault_return(PTR_ERR(entry));
goto out;
}
@@ -1176,7 +1270,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
* retried.
*/
if (pmd_trans_huge(*vmf->pmd) || pmd_devmap(*vmf->pmd)) {
- vmf_ret = VM_FAULT_NOPAGE;
+ ret = VM_FAULT_NOPAGE;
goto unlock_entry;
}
@@ -1189,7 +1283,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
if (iomap_errp)
*iomap_errp = error;
if (error) {
- vmf_ret = dax_fault_return(error);
+ ret = dax_fault_return(error);
goto unlock_entry;
}
if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
@@ -1219,9 +1313,9 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
goto error_finish_iomap;
__SetPageUptodate(vmf->cow_page);
- vmf_ret = finish_fault(vmf);
- if (!vmf_ret)
- vmf_ret = VM_FAULT_DONE_COW;
+ ret = finish_fault(vmf);
+ if (!ret)
+ ret = VM_FAULT_DONE_COW;
goto finish_iomap;
}
@@ -1240,10 +1334,6 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
0, write && !sync);
- if (IS_ERR(entry)) {
- error = PTR_ERR(entry);
- goto error_finish_iomap;
- }
/*
* If we are doing synchronous page fault and inode needs fsync,
@@ -1257,23 +1347,20 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
goto error_finish_iomap;
}
*pfnp = pfn;
- vmf_ret = VM_FAULT_NEEDDSYNC | major;
+ ret = VM_FAULT_NEEDDSYNC | major;
goto finish_iomap;
}
trace_dax_insert_mapping(inode, vmf, entry);
if (write)
- error = vm_insert_mixed_mkwrite(vma, vaddr, pfn);
+ ret = vmf_insert_mixed_mkwrite(vma, vaddr, pfn);
else
- error = vm_insert_mixed(vma, vaddr, pfn);
+ ret = vmf_insert_mixed(vma, vaddr, pfn);
- /* -EBUSY is fine, somebody else faulted on the same PTE */
- if (error == -EBUSY)
- error = 0;
- break;
+ goto finish_iomap;
case IOMAP_UNWRITTEN:
case IOMAP_HOLE:
if (!write) {
- vmf_ret = dax_load_hole(mapping, entry, vmf);
+ ret = dax_load_hole(mapping, entry, vmf);
goto finish_iomap;
}
/*FALLTHRU*/
@@ -1284,12 +1371,12 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
}
error_finish_iomap:
- vmf_ret = dax_fault_return(error) | major;
+ ret = dax_fault_return(error);
finish_iomap:
if (ops->iomap_end) {
int copied = PAGE_SIZE;
- if (vmf_ret & VM_FAULT_ERROR)
+ if (ret & VM_FAULT_ERROR)
copied = 0;
/*
* The fault is done by now and there's no way back (other
@@ -1302,12 +1389,12 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
unlock_entry:
put_locked_mapping_entry(mapping, vmf->pgoff);
out:
- trace_dax_pte_fault_done(inode, vmf, vmf_ret);
- return vmf_ret;
+ trace_dax_pte_fault_done(inode, vmf, ret);
+ return ret | major;
}
#ifdef CONFIG_FS_DAX_PMD
-static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
+static vm_fault_t dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
void *entry)
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
@@ -1327,8 +1414,6 @@ static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
pfn = page_to_pfn_t(zero_page);
ret = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE, false);
- if (IS_ERR(ret))
- goto fallback;
ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
if (!pmd_none(*(vmf->pmd))) {
@@ -1348,7 +1433,7 @@ fallback:
return VM_FAULT_FALLBACK;
}
-static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
+static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
const struct iomap_ops *ops)
{
struct vm_area_struct *vma = vmf->vma;
@@ -1358,7 +1443,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
bool sync;
unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT;
struct inode *inode = mapping->host;
- int result = VM_FAULT_FALLBACK;
+ vm_fault_t result = VM_FAULT_FALLBACK;
struct iomap iomap = { 0 };
pgoff_t max_pgoff, pgoff;
void *entry;
@@ -1450,8 +1535,6 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
RADIX_DAX_PMD, write && !sync);
- if (IS_ERR(entry))
- goto finish_iomap;
/*
* If we are doing synchronous page fault and inode needs fsync,
@@ -1509,7 +1592,7 @@ out:
return result;
}
#else
-static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
+static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
const struct iomap_ops *ops)
{
return VM_FAULT_FALLBACK;
@@ -1529,7 +1612,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
* has done all the necessary locking for page fault to proceed
* successfully.
*/
-int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
+vm_fault_t dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
pfn_t *pfnp, int *iomap_errp, const struct iomap_ops *ops)
{
switch (pe_size) {
@@ -1553,14 +1636,14 @@ EXPORT_SYMBOL_GPL(dax_iomap_fault);
* DAX file. It takes care of marking corresponding radix tree entry as dirty
* as well.
*/
-static int dax_insert_pfn_mkwrite(struct vm_fault *vmf,
+static vm_fault_t dax_insert_pfn_mkwrite(struct vm_fault *vmf,
enum page_entry_size pe_size,
pfn_t pfn)
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
void *entry, **slot;
pgoff_t index = vmf->pgoff;
- int vmf_ret, error;
+ vm_fault_t ret;
xa_lock_irq(&mapping->i_pages);
entry = get_unlocked_mapping_entry(mapping, index, &slot);
@@ -1579,21 +1662,20 @@ static int dax_insert_pfn_mkwrite(struct vm_fault *vmf,
xa_unlock_irq(&mapping->i_pages);
switch (pe_size) {
case PE_SIZE_PTE:
- error = vm_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
- vmf_ret = dax_fault_return(error);
+ ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
break;
#ifdef CONFIG_FS_DAX_PMD
case PE_SIZE_PMD:
- vmf_ret = vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
+ ret = vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
pfn, true);
break;
#endif
default:
- vmf_ret = VM_FAULT_FALLBACK;
+ ret = VM_FAULT_FALLBACK;
}
put_locked_mapping_entry(mapping, index);
- trace_dax_insert_pfn_mkwrite(mapping->host, vmf, vmf_ret);
- return vmf_ret;
+ trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret);
+ return ret;
}
/**
@@ -1606,8 +1688,8 @@ static int dax_insert_pfn_mkwrite(struct vm_fault *vmf,
* stored persistently on the media and handles inserting of appropriate page
* table entry.
*/
-int dax_finish_sync_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
- pfn_t pfn)
+vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
+ enum page_entry_size pe_size, pfn_t pfn)
{
int err;
loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT;