From 38512aa98a3feb6acd7da8f0ed5dade5b592b426 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 7 Jun 2016 21:44:08 -0400 Subject: NFS: Don't flush caches for a getattr that races with writeback If there were outstanding writes then chalk up the unexpected change attribute on the server to them. Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 52e7d6869e3b..60051e62d3f1 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1729,12 +1729,15 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) if (inode->i_version != fattr->change_attr) { dprintk("NFS: change_attr change on server for file %s/%ld\n", inode->i_sb->s_id, inode->i_ino); - invalid |= NFS_INO_INVALID_ATTR - | NFS_INO_INVALID_DATA - | NFS_INO_INVALID_ACCESS - | NFS_INO_INVALID_ACL; - if (S_ISDIR(inode->i_mode)) - nfs_force_lookup_revalidate(inode); + /* Could it be a race with writeback? */ + if (nfsi->nrequests == 0) { + invalid |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_DATA + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL; + if (S_ISDIR(inode->i_mode)) + nfs_force_lookup_revalidate(inode); + } inode->i_version = fattr->change_attr; } } else { -- cgit v1.2.3 From 57b691819ee2b095da505b34abdcd3193d0af75c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 3 Jun 2016 17:07:19 -0400 Subject: NFS: Cache access checks more aggressively If an attribute revalidation fails, then we already know that we'll zap the access cache. If, OTOH, the inode isn't changing, there should be no need to eject access calls just because they are old. Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 52 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 31 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index aaf7bd0cbae2..210b33636fe4 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2228,21 +2228,37 @@ static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, st return NULL; } -static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res) +static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res, bool may_block) { struct nfs_inode *nfsi = NFS_I(inode); struct nfs_access_entry *cache; - int err = -ENOENT; + bool retry = true; + int err; spin_lock(&inode->i_lock); - if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS) - goto out_zap; - cache = nfs_access_search_rbtree(inode, cred); - if (cache == NULL) - goto out; - if (!nfs_have_delegated_attributes(inode) && - !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo)) - goto out_stale; + for(;;) { + if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS) + goto out_zap; + cache = nfs_access_search_rbtree(inode, cred); + err = -ENOENT; + if (cache == NULL) + goto out; + /* Found an entry, is our attribute cache valid? */ + if (!nfs_attribute_cache_expired(inode) && + !(nfsi->cache_validity & NFS_INO_INVALID_ATTR)) + break; + err = -ECHILD; + if (!may_block) + goto out; + if (!retry) + goto out_zap; + spin_unlock(&inode->i_lock); + err = __nfs_revalidate_inode(NFS_SERVER(inode), inode); + if (err) + return err; + spin_lock(&inode->i_lock); + retry = false; + } res->jiffies = cache->jiffies; res->cred = cache->cred; res->mask = cache->mask; @@ -2251,12 +2267,6 @@ static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, str out: spin_unlock(&inode->i_lock); return err; -out_stale: - rb_erase(&cache->rb_node, &nfsi->access_cache); - list_del(&cache->lru); - spin_unlock(&inode->i_lock); - nfs_access_free_entry(cache); - return -ENOENT; out_zap: spin_unlock(&inode->i_lock); nfs_access_zap_cache(inode); @@ -2283,13 +2293,12 @@ static int nfs_access_get_cached_rcu(struct inode *inode, struct rpc_cred *cred, cache = NULL; if (cache == NULL) goto out; - if (!nfs_have_delegated_attributes(inode) && - !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo)) + err = nfs_revalidate_inode_rcu(NFS_SERVER(inode), inode); + if (err) goto out; res->jiffies = cache->jiffies; res->cred = cache->cred; res->mask = cache->mask; - err = 0; out: rcu_read_unlock(); return err; @@ -2378,18 +2387,19 @@ EXPORT_SYMBOL_GPL(nfs_access_set_mask); static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask) { struct nfs_access_entry cache; + bool may_block = (mask & MAY_NOT_BLOCK) == 0; int status; trace_nfs_access_enter(inode); status = nfs_access_get_cached_rcu(inode, cred, &cache); if (status != 0) - status = nfs_access_get_cached(inode, cred, &cache); + status = nfs_access_get_cached(inode, cred, &cache, may_block); if (status == 0) goto out_cached; status = -ECHILD; - if (mask & MAY_NOT_BLOCK) + if (!may_block) goto out; /* Be clever: ask server to check for all possible rights */ -- cgit v1.2.3 From ca0daa277acac1029f74d9fea838c9e507398226 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 8 Jun 2016 17:08:28 -0400 Subject: NFS: Cache aggressively when file is open for writing Unless the user is using file locking, we must assume close-to-open cache consistency when the file is open for writing. Adjust the caching algorithm so that it does not clear the cache on out-of-order writes and/or attribute revalidations. Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 13 ++---------- fs/nfs/inode.c | 62 +++++++++++++++++++++++++++++++++++++++++----------------- 2 files changed, 46 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 717a8d6af52d..2d39d9f9da7d 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -779,11 +779,6 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) return status; } -static int -is_time_granular(struct timespec *ts) { - return ((ts->tv_sec == 0) && (ts->tv_nsec <= 1000)); -} - static int do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) { @@ -817,12 +812,8 @@ do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) * This makes locking act as a cache coherency point. */ nfs_sync_mapping(filp->f_mapping); - if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) { - if (is_time_granular(&NFS_SERVER(inode)->time_delta)) - __nfs_revalidate_inode(NFS_SERVER(inode), inode); - else - nfs_zap_caches(inode); - } + if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) + nfs_zap_mapping(inode, filp->f_mapping); out: return status; } diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 60051e62d3f1..4e65a5a8a01b 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -878,7 +878,10 @@ void nfs_inode_attach_open_context(struct nfs_open_context *ctx) struct nfs_inode *nfsi = NFS_I(inode); spin_lock(&inode->i_lock); - list_add(&ctx->list, &nfsi->open_files); + if (ctx->mode & FMODE_WRITE) + list_add(&ctx->list, &nfsi->open_files); + else + list_add_tail(&ctx->list, &nfsi->open_files); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL_GPL(nfs_inode_attach_open_context); @@ -1215,6 +1218,25 @@ int nfs_revalidate_mapping_protected(struct inode *inode, struct address_space * return __nfs_revalidate_mapping(inode, mapping, true); } +static bool nfs_file_has_writers(struct nfs_inode *nfsi) +{ + struct inode *inode = &nfsi->vfs_inode; + + assert_spin_locked(&inode->i_lock); + + if (!S_ISREG(inode->i_mode)) + return false; + if (list_empty(&nfsi->open_files)) + return false; + /* Note: This relies on nfsi->open_files being ordered with writers + * being placed at the head of the list. + * See nfs_inode_attach_open_context() + */ + return (list_first_entry(&nfsi->open_files, + struct nfs_open_context, + list)->mode & FMODE_WRITE) == FMODE_WRITE; +} + static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) { struct nfs_inode *nfsi = NFS_I(inode); @@ -1279,22 +1301,24 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) return -EIO; - if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && - inode->i_version != fattr->change_attr) - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + if (!nfs_file_has_writers(nfsi)) { + /* Verify a few of the more important attributes */ + if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && inode->i_version != fattr->change_attr) + invalid |= NFS_INO_INVALID_ATTR | NFS_INO_REVAL_PAGECACHE; - /* Verify a few of the more important attributes */ - if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime)) - invalid |= NFS_INO_INVALID_ATTR; + if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime)) + invalid |= NFS_INO_INVALID_ATTR; - if (fattr->valid & NFS_ATTR_FATTR_SIZE) { - cur_size = i_size_read(inode); - new_isize = nfs_size_to_loff_t(fattr->size); - if (cur_size != new_isize) - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + if ((fattr->valid & NFS_ATTR_FATTR_CTIME) && !timespec_equal(&inode->i_ctime, &fattr->ctime)) + invalid |= NFS_INO_INVALID_ATTR; + + if (fattr->valid & NFS_ATTR_FATTR_SIZE) { + cur_size = i_size_read(inode); + new_isize = nfs_size_to_loff_t(fattr->size); + if (cur_size != new_isize) + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + } } - if (nfsi->nrequests != 0) - invalid &= ~NFS_INO_REVAL_PAGECACHE; /* Have any file permissions changed? */ if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) @@ -1526,7 +1550,7 @@ EXPORT_SYMBOL_GPL(nfs_refresh_inode); static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr *fattr) { - unsigned long invalid = NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + unsigned long invalid = NFS_INO_INVALID_ATTR; /* * Don't revalidate the pagecache if we hold a delegation, but do @@ -1675,6 +1699,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) unsigned long invalid = 0; unsigned long now = jiffies; unsigned long save_cache_validity; + bool have_writers = nfs_file_has_writers(nfsi); bool cache_revalidated = true; dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%x)\n", @@ -1730,7 +1755,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) dprintk("NFS: change_attr change on server for file %s/%ld\n", inode->i_sb->s_id, inode->i_ino); /* Could it be a race with writeback? */ - if (nfsi->nrequests == 0) { + if (!have_writers) { invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_DATA | NFS_INO_INVALID_ACCESS @@ -1770,9 +1795,10 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) if (new_isize != cur_isize) { /* Do we perhaps have any outstanding writes, or has * the file grown beyond our last write? */ - if ((nfsi->nrequests == 0) || new_isize > cur_isize) { + if (nfsi->nrequests == 0 || new_isize > cur_isize) { i_size_write(inode, new_isize); - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; + if (!have_writers) + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; } dprintk("NFS: isize change on server for file %s/%ld " "(%Ld to %Ld)\n", -- cgit v1.2.3 From 6b56a89833fa7903595c8d138bb4927187315cba Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 1 Jun 2016 18:23:01 -0400 Subject: NFS: Kill NFS_INO_NFS_INO_FLUSHING: it is a performance killer filemap_datawrite() and friends already deal just fine with livelock. Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 8 -------- fs/nfs/nfstrace.h | 1 - fs/nfs/write.c | 11 ----------- 3 files changed, 20 deletions(-) (limited to 'fs') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 2d39d9f9da7d..29d7477a62e8 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -359,14 +359,6 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping, file, mapping->host->i_ino, len, (long long) pos); start: - /* - * Prevent starvation issues if someone is doing a consistency - * sync-to-disk - */ - ret = wait_on_bit_action(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING, - nfs_wait_bit_killable, TASK_KILLABLE); - if (ret) - return ret; /* * Wait for O_DIRECT to complete */ diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 0b9e5cc9a747..fe80a1c26340 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -37,7 +37,6 @@ { 1 << NFS_INO_ADVISE_RDPLUS, "ADVISE_RDPLUS" }, \ { 1 << NFS_INO_STALE, "STALE" }, \ { 1 << NFS_INO_INVALIDATING, "INVALIDATING" }, \ - { 1 << NFS_INO_FLUSHING, "FLUSHING" }, \ { 1 << NFS_INO_FSCACHE, "FSCACHE" }, \ { 1 << NFS_INO_LAYOUTCOMMIT, "NEED_LAYOUTCOMMIT" }, \ { 1 << NFS_INO_LAYOUTCOMMITTING, "LAYOUTCOMMIT" }) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index e1c74d3db64d..980d44f3a84c 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -657,16 +657,9 @@ static int nfs_writepages_callback(struct page *page, struct writeback_control * int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct inode *inode = mapping->host; - unsigned long *bitlock = &NFS_I(inode)->flags; struct nfs_pageio_descriptor pgio; int err; - /* Stop dirtying of new pages while we sync */ - err = wait_on_bit_lock_action(bitlock, NFS_INO_FLUSHING, - nfs_wait_bit_killable, TASK_KILLABLE); - if (err) - goto out_err; - nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); nfs_pageio_init_write(&pgio, inode, wb_priority(wbc), false, @@ -674,10 +667,6 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio); nfs_pageio_complete(&pgio); - clear_bit_unlock(NFS_INO_FLUSHING, bitlock); - smp_mb__after_atomic(); - wake_up_bit(bitlock, NFS_INO_FLUSHING); - if (err < 0) goto out_err; err = pgio.pg_error; -- cgit v1.2.3 From 811ed92ecc9f47eee90beabcf5c2133f2a6d2440 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 1 Jun 2016 18:25:56 -0400 Subject: NFS: writepage of a single page should not be synchronous It is almost always better to wait for more so that we can issue a bulk commit. Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 980d44f3a84c..b13d48881d3a 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -625,7 +625,7 @@ static int nfs_writepage_locked(struct page *page, int err; nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); - nfs_pageio_init_write(&pgio, inode, wb_priority(wbc), + nfs_pageio_init_write(&pgio, inode, 0, false, &nfs_async_write_completion_ops); err = nfs_do_writepage(page, wbc, &pgio, launder); nfs_pageio_complete(&pgio); -- cgit v1.2.3 From 93761d9863c332d1099d80629f89cf48eb745e48 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 2 Jun 2016 11:03:00 -0400 Subject: NFS: Don't hold the inode lock across fsync() Commits are no longer required to be serialised. Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 29d7477a62e8..249262b6bcbe 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -277,11 +277,9 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret != 0) break; - inode_lock(inode); ret = nfs_file_fsync_commit(file, start, end, datasync); if (!ret) ret = pnfs_sync_inode(inode, !!datasync); - inode_unlock(inode); /* * If nfs_file_fsync_commit detected a server reboot, then * resend all dirty pages that might have been covered by -- cgit v1.2.3 From 4f52b6bb8c57b9accafad526a429d6c0851cc62f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 2 Jun 2016 18:10:33 -0400 Subject: NFS: Don't call COMMIT in ->releasepage() While COMMIT has the potential to free up a lot of memory that is being taken by unstable writes, it isn't guaranteed to free up this particular page. Also, calling fsync() on the server is expensive and so we want to do it in a more controlled fashion, rather than have it triggered at random by the VM. Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 23 ----------------------- 1 file changed, 23 deletions(-) (limited to 'fs') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 249262b6bcbe..df4dd8e7e62e 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -460,31 +460,8 @@ static void nfs_invalidate_page(struct page *page, unsigned int offset, */ static int nfs_release_page(struct page *page, gfp_t gfp) { - struct address_space *mapping = page->mapping; - dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); - /* Always try to initiate a 'commit' if relevant, but only - * wait for it if the caller allows blocking. Even then, - * only wait 1 second and only if the 'bdi' is not congested. - * Waiting indefinitely can cause deadlocks when the NFS - * server is on this machine, when a new TCP connection is - * needed and in other rare cases. There is no particular - * need to wait extensively here. A short wait has the - * benefit that someone else can worry about the freezer. - */ - if (mapping) { - struct nfs_server *nfss = NFS_SERVER(mapping->host); - nfs_commit_inode(mapping->host, 0); - if (gfpflags_allow_blocking(gfp) && - !bdi_write_congested(&nfss->backing_dev_info)) { - wait_on_page_bit_killable_timeout(page, PG_private, - HZ); - if (PagePrivate(page)) - set_bdi_congested(&nfss->backing_dev_info, - BLK_RW_ASYNC); - } - } /* If PagePrivate() is set, then the page is not freeable */ if (PagePrivate(page)) return 0; -- cgit v1.2.3 From 73e6c5d854d3f7f75e8b46d3e54aeb5d83fe6b1f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 26 Jun 2016 12:27:25 -0400 Subject: pNFS/files: Fix layoutcommit after a commit to DS According to the errata https://www.rfc-editor.org/errata_search.php?rfc=5661&eid=2751 we should always send layout commit after a commit to DS. Fixes: bc7d4b8fd091 ("nfs/filelayout: set layoutcommit...") Signed-off-by: Trond Myklebust --- fs/nfs/filelayout/filelayout.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index aa59757389dc..b4c1407e8fe4 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -375,8 +375,7 @@ static int filelayout_commit_done_cb(struct rpc_task *task, return -EAGAIN; } - if (data->verf.committed == NFS_UNSTABLE) - pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb); + pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb); return 0; } -- cgit v1.2.3 From c001c87a63aa2f35358e33eb05e45e4cbcb34f54 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 26 Jun 2016 12:39:49 -0400 Subject: pNFS/flexfiles: Fix layoutcommit after a commit to DS We should always do a layoutcommit after commit to DS, except if the layout segment we're using has set FF_FLAGS_NO_LAYOUTCOMMIT. Fixes: d67ae825a59d ("pnfs/flexfiles: Add the FlexFile Layout Driver") Signed-off-by: Trond Myklebust --- fs/nfs/flexfilelayout/flexfilelayout.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 0e8018bc9880..2689c9e9dc3c 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1530,8 +1530,7 @@ static int ff_layout_commit_done_cb(struct rpc_task *task, return -EAGAIN; } - if (data->verf.committed == NFS_UNSTABLE - && ff_layout_need_layoutcommit(data->lseg)) + if (ff_layout_need_layoutcommit(data->lseg)) pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb); return 0; -- cgit v1.2.3 From bc28e1c2e3c8a4c5198ebfd8bbae0afd73dfafd5 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 26 Jun 2016 16:14:40 -0400 Subject: pNFS/flexfiles: Clean up calls to pnfs_set_layoutcommit() Let's just have one place where we check ff_layout_need_layoutcommit(). Signed-off-by: Trond Myklebust --- fs/nfs/flexfilelayout/flexfilelayout.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 2689c9e9dc3c..14f2ed3f1a5b 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1325,15 +1325,16 @@ ff_layout_need_layoutcommit(struct pnfs_layout_segment *lseg) * we always send layoutcommit after DS writes. */ static void -ff_layout_set_layoutcommit(struct nfs_pgio_header *hdr) +ff_layout_set_layoutcommit(struct inode *inode, + struct pnfs_layout_segment *lseg, + loff_t end_offset) { - if (!ff_layout_need_layoutcommit(hdr->lseg)) + if (!ff_layout_need_layoutcommit(lseg)) return; - pnfs_set_layoutcommit(hdr->inode, hdr->lseg, - hdr->mds_offset + hdr->res.count); - dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino, - (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb); + pnfs_set_layoutcommit(inode, lseg, end_offset); + dprintk("%s inode %lu pls_end_pos %llu\n", __func__, inode->i_ino, + (unsigned long long) NFS_I(inode)->layout->plh_lwb); } static bool @@ -1494,7 +1495,8 @@ static int ff_layout_write_done_cb(struct rpc_task *task, if (hdr->res.verf->committed == NFS_FILE_SYNC || hdr->res.verf->committed == NFS_DATA_SYNC) - ff_layout_set_layoutcommit(hdr); + ff_layout_set_layoutcommit(hdr->inode, hdr->lseg, + hdr->mds_offset + (loff_t)hdr->res.count); /* zero out fattr since we don't care DS attr at all */ hdr->fattr.valid = 0; @@ -1530,8 +1532,7 @@ static int ff_layout_commit_done_cb(struct rpc_task *task, return -EAGAIN; } - if (ff_layout_need_layoutcommit(data->lseg)) - pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb); + ff_layout_set_layoutcommit(data->inode, data->lseg, data->lwb); return 0; } -- cgit v1.2.3 From 2e18d4d822ea9cc811ea26a880cf2ed47cbf8889 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 26 Jun 2016 18:54:58 -0400 Subject: pNFS: Files and flexfiles always need to commit before layoutcommit So ensure that we mark the layout for commit once the write is done, and then ensure that the commit to ds is finished before sending layoutcommit. Note that by doing this, we're able to optimise away the commit for the case of servers that don't need layoutcommit in order to return updated attributes. Signed-off-by: Trond Myklebust --- fs/nfs/filelayout/filelayout.c | 9 ++++++--- fs/nfs/flexfilelayout/flexfilelayout.c | 7 +++++-- fs/nfs/nfs4xdr.c | 11 ++++++++--- fs/nfs/pnfs.c | 5 ++++- fs/nfs/pnfs_nfs.c | 7 +++++++ 5 files changed, 30 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index b4c1407e8fe4..25bd91a6e088 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -255,13 +255,16 @@ static int filelayout_read_done_cb(struct rpc_task *task, static void filelayout_set_layoutcommit(struct nfs_pgio_header *hdr) { + loff_t end_offs = 0; if (FILELAYOUT_LSEG(hdr->lseg)->commit_through_mds || - hdr->res.verf->committed != NFS_DATA_SYNC) + hdr->res.verf->committed == NFS_FILE_SYNC) return; + if (hdr->res.verf->committed == NFS_DATA_SYNC) + end_offs = hdr->mds_offset + (loff_t)hdr->res.count; - pnfs_set_layoutcommit(hdr->inode, hdr->lseg, - hdr->mds_offset + hdr->res.count); + /* Note: if the write is unstable, don't set end_offs until commit */ + pnfs_set_layoutcommit(hdr->inode, hdr->lseg, end_offs); dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino, (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb); } diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 14f2ed3f1a5b..e6206eaf2bdf 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1470,6 +1470,7 @@ static void ff_layout_read_release(void *data) static int ff_layout_write_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr) { + loff_t end_offs = 0; int err; trace_nfs4_pnfs_write(hdr, task->tk_status); @@ -1495,8 +1496,10 @@ static int ff_layout_write_done_cb(struct rpc_task *task, if (hdr->res.verf->committed == NFS_FILE_SYNC || hdr->res.verf->committed == NFS_DATA_SYNC) - ff_layout_set_layoutcommit(hdr->inode, hdr->lseg, - hdr->mds_offset + (loff_t)hdr->res.count); + end_offs = hdr->mds_offset + (loff_t)hdr->res.count; + + /* Note: if the write is unstable, don't set end_offs until commit */ + ff_layout_set_layoutcommit(hdr->inode, hdr->lseg, end_offs); /* zero out fattr since we don't care DS attr at all */ hdr->fattr.valid = 0; diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 661e753fe1c9..7bd3a5c09d31 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1985,9 +1985,14 @@ encode_layoutcommit(struct xdr_stream *xdr, p = xdr_encode_hyper(p, args->lastbytewritten + 1); /* length */ *p = cpu_to_be32(0); /* reclaim */ encode_nfs4_stateid(xdr, &args->stateid); - p = reserve_space(xdr, 20); - *p++ = cpu_to_be32(1); /* newoffset = TRUE */ - p = xdr_encode_hyper(p, args->lastbytewritten); + if (args->lastbytewritten != U64_MAX) { + p = reserve_space(xdr, 20); + *p++ = cpu_to_be32(1); /* newoffset = TRUE */ + p = xdr_encode_hyper(p, args->lastbytewritten); + } else { + p = reserve_space(xdr, 12); + *p++ = cpu_to_be32(0); /* newoffset = FALSE */ + } *p++ = cpu_to_be32(0); /* Never send time_modify_changed */ *p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */ diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 0c7e0d45a4de..62553182514e 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -2378,7 +2378,10 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) nfs_fattr_init(&data->fattr); data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask; data->res.fattr = &data->fattr; - data->args.lastbytewritten = end_pos - 1; + if (end_pos != 0) + data->args.lastbytewritten = end_pos - 1; + else + data->args.lastbytewritten = U64_MAX; data->res.server = NFS_SERVER(inode); if (ld->prepare_layoutcommit) { diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 0dfc476da3e1..0d10cc280a23 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -932,6 +932,13 @@ EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit); int pnfs_nfs_generic_sync(struct inode *inode, bool datasync) { + int ret; + + if (!pnfs_layoutcommit_outstanding(inode)) + return 0; + ret = nfs_commit_inode(inode, FLUSH_SYNC); + if (ret < 0) + return ret; if (datasync) return 0; return pnfs_layoutcommit_inode(inode, true); -- cgit v1.2.3 From ac46bd374c9a838874c450c528e2e922ee748ff9 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 5 Jul 2016 13:46:53 -0400 Subject: pNFS: Ensure we layoutcommit before revalidating attributes If we need to update the cached attributes, then we'd better make sure that we also layoutcommit first. Otherwise, the server may have stale attributes. Prior to this patch, the revalidation code tried to "fix" this problem by simply disabling attributes that would be affected by the layoutcommit. That approach breaks nfs_writeback_check_extend(), leading to a file size corruption. Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 4e65a5a8a01b..6c0618eb5d57 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -974,6 +974,13 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) if (NFS_STALE(inode)) goto out; + /* pNFS: Attributes aren't updated until we layoutcommit */ + if (S_ISREG(inode->i_mode)) { + status = pnfs_sync_inode(inode, false); + if (status) + goto out; + } + status = -ENOMEM; fattr = nfs_alloc_fattr(); if (fattr == NULL) @@ -1493,28 +1500,12 @@ static int nfs_inode_attrs_need_update(const struct inode *inode, const struct n ((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0); } -/* - * Don't trust the change_attribute, mtime, ctime or size if - * a pnfs LAYOUTCOMMIT is outstanding - */ -static void nfs_inode_attrs_handle_layoutcommit(struct inode *inode, - struct nfs_fattr *fattr) -{ - if (pnfs_layoutcommit_outstanding(inode)) - fattr->valid &= ~(NFS_ATTR_FATTR_CHANGE | - NFS_ATTR_FATTR_MTIME | - NFS_ATTR_FATTR_CTIME | - NFS_ATTR_FATTR_SIZE); -} - static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr) { int ret; trace_nfs_refresh_inode_enter(inode); - nfs_inode_attrs_handle_layoutcommit(inode, fattr); - if (nfs_inode_attrs_need_update(inode, fattr)) ret = nfs_update_inode(inode, fattr); else -- cgit v1.2.3 From 6712007734cbd64ff924af16fc236751d47ff80b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 5 Jul 2016 19:08:58 -0400 Subject: pNFS: pnfs_layoutcommit_outstanding() is no longer used when !CONFIG_NFS_V4_1 Cleanup... Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'fs') diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index b21bd0bee784..d6be5299a55a 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -716,13 +716,6 @@ pnfs_use_threshold(struct nfs4_threshold **dst, struct nfs4_threshold *src, return false; } -static inline bool -pnfs_layoutcommit_outstanding(struct inode *inode) -{ - return false; -} - - static inline struct nfs4_threshold *pnfs_mdsthreshold_alloc(void) { return NULL; -- cgit v1.2.3 From 8fc3c3862728373e0d0f5abccc6afc56c69e0c63 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 1 Jun 2016 21:32:24 -0400 Subject: NFS: Fix O_DIRECT verifier problems We should not be interested in looking at the value of the stable field, since that could take any value. Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 10 ++++++++-- fs/nfs/internal.h | 7 +++++++ fs/nfs/write.c | 2 +- 3 files changed, 16 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 979b3c4dee6a..d6d43b5eafb3 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -196,6 +196,12 @@ static void nfs_direct_set_hdr_verf(struct nfs_direct_req *dreq, WARN_ON_ONCE(verfp->committed < 0); } +static int nfs_direct_cmp_verf(const struct nfs_writeverf *v1, + const struct nfs_writeverf *v2) +{ + return nfs_write_verifier_cmp(&v1->verifier, &v2->verifier); +} + /* * nfs_direct_cmp_hdr_verf - compare verifier for pgio header * @dreq - direct request possibly spanning multiple servers @@ -215,7 +221,7 @@ static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq, nfs_direct_set_hdr_verf(dreq, hdr); return 0; } - return memcmp(verfp, &hdr->verf, sizeof(struct nfs_writeverf)); + return nfs_direct_cmp_verf(verfp, &hdr->verf); } /* @@ -238,7 +244,7 @@ static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq, if (verfp->committed < 0) return 1; - return memcmp(verfp, &data->verf, sizeof(struct nfs_writeverf)); + return nfs_direct_cmp_verf(verfp, &data->verf); } /** diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 5154fa65a2f2..150a8eb0f323 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -506,6 +506,13 @@ extern int nfs_migrate_page(struct address_space *, #define nfs_migrate_page NULL #endif +static inline int +nfs_write_verifier_cmp(const struct nfs_write_verifier *v1, + const struct nfs_write_verifier *v2) +{ + return memcmp(v1->data, v2->data, sizeof(v1->data)); +} + /* unlink.c */ extern struct rpc_task * nfs_async_rename(struct inode *old_dir, struct inode *new_dir, diff --git a/fs/nfs/write.c b/fs/nfs/write.c index b13d48881d3a..3087fb6f1983 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1789,7 +1789,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) /* Okay, COMMIT succeeded, apparently. Check the verifier * returned by the server against all stored verfs. */ - if (!memcmp(&req->wb_verf, &data->verf.verifier, sizeof(req->wb_verf))) { + if (!nfs_write_verifier_cmp(&req->wb_verf, &data->verf.verifier)) { /* We have a match */ nfs_inode_remove_request(req); dprintk(" OK\n"); -- cgit v1.2.3 From a5314a74928fa6dbc4503a8c64f43bb5c1c12ac1 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 1 Jun 2016 21:42:32 -0400 Subject: NFS: Ensure we reset the write verifier 'committed' value on resend. Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 2 ++ fs/nfs/internal.h | 17 +++++++++++++++++ 2 files changed, 19 insertions(+) (limited to 'fs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index d6d43b5eafb3..fb659bb50678 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -661,6 +661,8 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo); dreq->count = 0; + dreq->verf.committed = NFS_INVALID_STABLE_HOW; + nfs_clear_pnfs_ds_commit_verifiers(&dreq->ds_cinfo); for (i = 0; i < dreq->mirror_count; i++) dreq->mirrors[i].count = 0; get_dreq(dreq); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 150a8eb0f323..0eb5c924886d 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -499,6 +499,23 @@ int nfs_key_timeout_notify(struct file *filp, struct inode *inode); bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx); void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio); +#ifdef CONFIG_NFS_V4_1 +static inline +void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo) +{ + int i; + + for (i = 0; i < cinfo->nbuckets; i++) + cinfo->buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW; +} +#else +static inline +void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo) +{ +} +#endif + + #ifdef CONFIG_MIGRATION extern int nfs_migrate_page(struct address_space *, struct page *, struct page *, enum migrate_mode); -- cgit v1.2.3 From 2f3c7d87a347b12f725f6128b3097727b91b230e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 22 Jun 2016 14:38:06 -0400 Subject: NFS: Remove racy size manipulations in O_DIRECT On success, the RPC callbacks will ensure that we make the appropriate calls to nfs_writeback_update_inode() Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'fs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index fb659bb50678..826d4dace0e5 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -376,15 +376,6 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write) { struct inode *inode = dreq->inode; - if (dreq->iocb && write) { - loff_t pos = dreq->iocb->ki_pos + dreq->count; - - spin_lock(&inode->i_lock); - if (i_size_read(inode) < pos) - i_size_write(inode, pos); - spin_unlock(&inode->i_lock); - } - if (write) nfs_zap_mapping(inode, inode->i_mapping); @@ -1058,14 +1049,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) if (!result) { result = nfs_direct_wait(dreq); if (result > 0) { - struct inode *inode = mapping->host; - iocb->ki_pos = pos + result; - spin_lock(&inode->i_lock); - if (i_size_read(inode) < iocb->ki_pos) - i_size_write(inode, iocb->ki_pos); - spin_unlock(&inode->i_lock); - /* XXX: should check the generic_write_sync retval */ generic_write_sync(iocb, result); } -- cgit v1.2.3 From 89698b24d24f9c8b470a73351b0b7199c17e0153 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 23 Jun 2016 10:35:48 -0400 Subject: NFS Cleanup: move call to generic_write_checks() into fs/nfs/direct.c Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 12 ++++++++---- fs/nfs/file.c | 6 +----- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 826d4dace0e5..0169eca8eb42 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -988,6 +988,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) { ssize_t result = -EINVAL; + size_t count; struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; @@ -998,8 +999,11 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n", file, iov_iter_count(iter), (long long) iocb->ki_pos); - nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, - iov_iter_count(iter)); + result = generic_write_checks(iocb, iter); + if (result <= 0) + return result; + count = result; + nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count); pos = iocb->ki_pos; end = (pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT; @@ -1017,7 +1021,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) goto out_unlock; } - task_io_account_write(iov_iter_count(iter)); + task_io_account_write(count); result = -ENOMEM; dreq = nfs_direct_req_alloc(); @@ -1025,7 +1029,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) goto out_unlock; dreq->inode = inode; - dreq->bytes_left = dreq->max_count = iov_iter_count(iter); + dreq->bytes_left = dreq->max_count = count; dreq->io_start = pos; dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); l_ctx = nfs_get_lock_context(dreq->ctx); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index df4dd8e7e62e..c26847c84d00 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -629,12 +629,8 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) if (result) return result; - if (iocb->ki_flags & IOCB_DIRECT) { - result = generic_write_checks(iocb, from); - if (result <= 0) - return result; + if (iocb->ki_flags & IOCB_DIRECT) return nfs_file_direct_write(iocb, from); - } dprintk("NFS: write(%pD2, %zu@%Ld)\n", file, count, (long long) iocb->ki_pos); -- cgit v1.2.3 From 18290650b1c8655cfe6e0d63dd34942a037a130b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 23 Jun 2016 15:00:42 -0400 Subject: NFS: Move buffered I/O locking into nfs_file_write() Preparation for the patch that de-serialises O_DIRECT reads and writes. Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index c26847c84d00..46cf0afe3c0f 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -623,7 +623,6 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) struct inode *inode = file_inode(file); unsigned long written = 0; ssize_t result; - size_t count = iov_iter_count(from); result = nfs_key_timeout_notify(file, inode); if (result) @@ -633,9 +632,8 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) return nfs_file_direct_write(iocb, from); dprintk("NFS: write(%pD2, %zu@%Ld)\n", - file, count, (long long) iocb->ki_pos); + file, iov_iter_count(from), (long long) iocb->ki_pos); - result = -EBUSY; if (IS_SWAPFILE(inode)) goto out_swapfile; /* @@ -647,28 +645,33 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) goto out; } - result = count; - if (!count) + inode_lock(inode); + result = generic_write_checks(iocb, from); + if (result > 0) { + current->backing_dev_info = inode_to_bdi(inode); + result = generic_perform_write(file, from, iocb->ki_pos); + current->backing_dev_info = NULL; + } + inode_unlock(inode); + if (result <= 0) goto out; - result = generic_file_write_iter(iocb, from); - if (result > 0) - written = result; + written = generic_write_sync(iocb, result); + iocb->ki_pos += written; /* Return error values */ - if (result >= 0 && nfs_need_check_write(file, inode)) { + if (nfs_need_check_write(file, inode)) { int err = vfs_fsync(file, 0); if (err < 0) result = err; } - if (result > 0) - nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written); + nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written); out: return result; out_swapfile: printk(KERN_INFO "NFS: attempt to write to active swap file!\n"); - goto out; + return -EBUSY; } EXPORT_SYMBOL_GPL(nfs_file_write); -- cgit v1.2.3 From a5864c999de6703f7ce908f72337568520c6cad3 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 3 Jun 2016 17:07:19 -0400 Subject: NFS: Do not serialise O_DIRECT reads and writes Allow dio requests to be scheduled in parallel, but ensuring that they do not conflict with buffered I/O. Signed-off-by: Trond Myklebust --- fs/nfs/Makefile | 2 +- fs/nfs/direct.c | 41 ++++----------- fs/nfs/file.c | 12 +++-- fs/nfs/internal.h | 8 +++ fs/nfs/io.c | 147 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 173 insertions(+), 37 deletions(-) create mode 100644 fs/nfs/io.c (limited to 'fs') diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index 8664417955a2..6abdda209642 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -6,7 +6,7 @@ obj-$(CONFIG_NFS_FS) += nfs.o CFLAGS_nfstrace.o += -I$(src) nfs-y := client.o dir.o file.o getroot.o inode.o super.o \ - direct.o pagelist.o read.o symlink.o unlink.o \ + io.o direct.o pagelist.o read.o symlink.o unlink.o \ write.o namespace.o mount_clnt.o nfstrace.o nfs-$(CONFIG_ROOT_NFS) += nfsroot.o nfs-$(CONFIG_SYSCTL) += sysctl.o diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 0169eca8eb42..6d0e88096440 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -578,17 +578,12 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter) if (!count) goto out; - inode_lock(inode); - result = nfs_sync_mapping(mapping); - if (result) - goto out_unlock; - task_io_account_read(count); result = -ENOMEM; dreq = nfs_direct_req_alloc(); if (dreq == NULL) - goto out_unlock; + goto out; dreq->inode = inode; dreq->bytes_left = dreq->max_count = count; @@ -603,10 +598,12 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter) if (!is_sync_kiocb(iocb)) dreq->iocb = iocb; + nfs_start_io_direct(inode); + NFS_I(inode)->read_io += count; result = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos); - inode_unlock(inode); + nfs_end_io_direct(inode); if (!result) { result = nfs_direct_wait(dreq); @@ -614,13 +611,8 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter) iocb->ki_pos += result; } - nfs_direct_req_release(dreq); - return result; - out_release: nfs_direct_req_release(dreq); -out_unlock: - inode_unlock(inode); out: return result; } @@ -1008,25 +1000,12 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) pos = iocb->ki_pos; end = (pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT; - inode_lock(inode); - - result = nfs_sync_mapping(mapping); - if (result) - goto out_unlock; - - if (mapping->nrpages) { - result = invalidate_inode_pages2_range(mapping, - pos >> PAGE_SHIFT, end); - if (result) - goto out_unlock; - } - task_io_account_write(count); result = -ENOMEM; dreq = nfs_direct_req_alloc(); if (!dreq) - goto out_unlock; + goto out; dreq->inode = inode; dreq->bytes_left = dreq->max_count = count; @@ -1041,6 +1020,8 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) if (!is_sync_kiocb(iocb)) dreq->iocb = iocb; + nfs_start_io_direct(inode); + result = nfs_direct_write_schedule_iovec(dreq, iter, pos); if (mapping->nrpages) { @@ -1048,7 +1029,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) pos >> PAGE_SHIFT, end); } - inode_unlock(inode); + nfs_end_io_direct(inode); if (!result) { result = nfs_direct_wait(dreq); @@ -1058,13 +1039,9 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) generic_write_sync(iocb, result); } } - nfs_direct_req_release(dreq); - return result; - out_release: nfs_direct_req_release(dreq); -out_unlock: - inode_unlock(inode); +out: return result; } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 46cf0afe3c0f..9f8da9e1b23f 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -170,12 +170,14 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to) iocb->ki_filp, iov_iter_count(to), (unsigned long) iocb->ki_pos); - result = nfs_revalidate_mapping_protected(inode, iocb->ki_filp->f_mapping); + nfs_start_io_read(inode); + result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping); if (!result) { result = generic_file_read_iter(iocb, to); if (result > 0) nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result); } + nfs_end_io_read(inode); return result; } EXPORT_SYMBOL_GPL(nfs_file_read); @@ -191,12 +193,14 @@ nfs_file_splice_read(struct file *filp, loff_t *ppos, dprintk("NFS: splice_read(%pD2, %lu@%Lu)\n", filp, (unsigned long) count, (unsigned long long) *ppos); - res = nfs_revalidate_mapping_protected(inode, filp->f_mapping); + nfs_start_io_read(inode); + res = nfs_revalidate_mapping(inode, filp->f_mapping); if (!res) { res = generic_file_splice_read(filp, ppos, pipe, count, flags); if (res > 0) nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, res); } + nfs_end_io_read(inode); return res; } EXPORT_SYMBOL_GPL(nfs_file_splice_read); @@ -645,14 +649,14 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) goto out; } - inode_lock(inode); + nfs_start_io_write(inode); result = generic_write_checks(iocb, from); if (result > 0) { current->backing_dev_info = inode_to_bdi(inode); result = generic_perform_write(file, from, iocb->ki_pos); current->backing_dev_info = NULL; } - inode_unlock(inode); + nfs_end_io_write(inode); if (result <= 0) goto out; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 0eb5c924886d..159b64ede82a 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -411,6 +411,14 @@ extern void __exit unregister_nfs_fs(void); extern bool nfs_sb_active(struct super_block *sb); extern void nfs_sb_deactive(struct super_block *sb); +/* io.c */ +extern void nfs_start_io_read(struct inode *inode); +extern void nfs_end_io_read(struct inode *inode); +extern void nfs_start_io_write(struct inode *inode); +extern void nfs_end_io_write(struct inode *inode); +extern void nfs_start_io_direct(struct inode *inode); +extern void nfs_end_io_direct(struct inode *inode); + /* namespace.c */ #define NFS_PATH_CANONICAL 1 extern char *nfs_path(char **p, struct dentry *dentry, diff --git a/fs/nfs/io.c b/fs/nfs/io.c new file mode 100644 index 000000000000..1fc5d1ce327e --- /dev/null +++ b/fs/nfs/io.c @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2016 Trond Myklebust + * + * I/O and data path helper functionality. + */ + +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +/* Call with exclusively locked inode->i_rwsem */ +static void nfs_block_o_direct(struct nfs_inode *nfsi, struct inode *inode) +{ + if (test_bit(NFS_INO_ODIRECT, &nfsi->flags)) { + clear_bit(NFS_INO_ODIRECT, &nfsi->flags); + inode_dio_wait(inode); + } +} + +/** + * nfs_start_io_read - declare the file is being used for buffered reads + * @inode - file inode + * + * Declare that a buffered read operation is about to start, and ensure + * that we block all direct I/O. + * On exit, the function ensures that the NFS_INO_ODIRECT flag is unset, + * and holds a shared lock on inode->i_rwsem to ensure that the flag + * cannot be changed. + * In practice, this means that buffered read operations are allowed to + * execute in parallel, thanks to the shared lock, whereas direct I/O + * operations need to wait to grab an exclusive lock in order to set + * NFS_INO_ODIRECT. + * Note that buffered writes and truncates both take a write lock on + * inode->i_rwsem, meaning that those are serialised w.r.t. the reads. + */ +void +nfs_start_io_read(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + /* Be an optimist! */ + down_read(&inode->i_rwsem); + if (test_bit(NFS_INO_ODIRECT, &nfsi->flags) == 0) + return; + up_read(&inode->i_rwsem); + /* Slow path.... */ + down_write(&inode->i_rwsem); + nfs_block_o_direct(nfsi, inode); + downgrade_write(&inode->i_rwsem); +} + +/** + * nfs_end_io_read - declare that the buffered read operation is done + * @inode - file inode + * + * Declare that a buffered read operation is done, and release the shared + * lock on inode->i_rwsem. + */ +void +nfs_end_io_read(struct inode *inode) +{ + up_read(&inode->i_rwsem); +} + +/** + * nfs_start_io_write - declare the file is being used for buffered writes + * @inode - file inode + * + * Declare that a buffered read operation is about to start, and ensure + * that we block all direct I/O. + */ +void +nfs_start_io_write(struct inode *inode) +{ + down_write(&inode->i_rwsem); + nfs_block_o_direct(NFS_I(inode), inode); +} + +/** + * nfs_end_io_write - declare that the buffered write operation is done + * @inode - file inode + * + * Declare that a buffered write operation is done, and release the + * lock on inode->i_rwsem. + */ +void +nfs_end_io_write(struct inode *inode) +{ + up_write(&inode->i_rwsem); +} + +/* Call with exclusively locked inode->i_rwsem */ +static void nfs_block_buffered(struct nfs_inode *nfsi, struct inode *inode) +{ + if (!test_bit(NFS_INO_ODIRECT, &nfsi->flags)) { + set_bit(NFS_INO_ODIRECT, &nfsi->flags); + nfs_wb_all(inode); + } +} + +/** + * nfs_end_io_direct - declare the file is being used for direct i/o + * @inode - file inode + * + * Declare that a direct I/O operation is about to start, and ensure + * that we block all buffered I/O. + * On exit, the function ensures that the NFS_INO_ODIRECT flag is set, + * and holds a shared lock on inode->i_rwsem to ensure that the flag + * cannot be changed. + * In practice, this means that direct I/O operations are allowed to + * execute in parallel, thanks to the shared lock, whereas buffered I/O + * operations need to wait to grab an exclusive lock in order to clear + * NFS_INO_ODIRECT. + * Note that buffered writes and truncates both take a write lock on + * inode->i_rwsem, meaning that those are serialised w.r.t. O_DIRECT. + */ +void +nfs_start_io_direct(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + /* Be an optimist! */ + down_read(&inode->i_rwsem); + if (test_bit(NFS_INO_ODIRECT, &nfsi->flags) != 0) + return; + up_read(&inode->i_rwsem); + /* Slow path.... */ + down_write(&inode->i_rwsem); + nfs_block_buffered(nfsi, inode); + downgrade_write(&inode->i_rwsem); +} + +/** + * nfs_end_io_direct - declare that the direct i/o operation is done + * @inode - file inode + * + * Declare that a direct I/O operation is done, and release the shared + * lock on inode->i_rwsem. + */ +void +nfs_end_io_direct(struct inode *inode) +{ + up_read(&inode->i_rwsem); +} -- cgit v1.2.3 From f7b5c340aca87d736a6b15aa40bf135f1baab011 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 23 Jun 2016 09:29:47 -0400 Subject: NFS: Cleanup nfs_direct_complete() There is only one caller that sets the "write" argument to true, so just move the call to nfs_zap_mapping() and get rid of the now redundant argument. Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 6d0e88096440..c16d33eb1ddf 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -372,13 +372,10 @@ out: * Synchronous I/O uses a stack-allocated iocb. Thus we can't trust * the iocb is still valid here if this is a synchronous request. */ -static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write) +static void nfs_direct_complete(struct nfs_direct_req *dreq) { struct inode *inode = dreq->inode; - if (write) - nfs_zap_mapping(inode, inode->i_mapping); - inode_dio_end(inode); if (dreq->iocb) { @@ -431,7 +428,7 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr) } out_put: if (put_dreq(dreq)) - nfs_direct_complete(dreq, false); + nfs_direct_complete(dreq); hdr->release(hdr); } @@ -537,7 +534,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, } if (put_dreq(dreq)) - nfs_direct_complete(dreq, false); + nfs_direct_complete(dreq); return 0; } @@ -764,7 +761,8 @@ static void nfs_direct_write_schedule_work(struct work_struct *work) nfs_direct_write_reschedule(dreq); break; default: - nfs_direct_complete(dreq, true); + nfs_zap_mapping(dreq->inode, dreq->inode->i_mapping); + nfs_direct_complete(dreq); } } -- cgit v1.2.3 From f508d46ae41a796036aef566637685dbf83b554f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 23 Jun 2016 09:55:48 -0400 Subject: NFS: Remove redundant waits for O_DIRECT in fsync() and write_begin() We're now waiting immediately after taking the locks, so waiting in fsync() and write_begin() is either redundant or potentially subject to livelock (if not holding the lock). Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'fs') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 9f8da9e1b23f..0e9b4a068f13 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -276,7 +276,6 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) trace_nfs_fsync_enter(inode); - inode_dio_wait(inode); do { ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret != 0) @@ -361,11 +360,6 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping, file, mapping->host->i_ino, len, (long long) pos); start: - /* - * Wait for O_DIRECT to complete - */ - inode_dio_wait(mapping->host); - page = grab_cache_page_write_begin(mapping, index, flags); if (!page) return -ENOMEM; -- cgit v1.2.3 From be527494e02b89e03485955b30de6c1e976a07eb Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 22 Jun 2016 08:19:36 -0400 Subject: NFS: Remove unused function nfs_revalidate_mapping_protected() Clean up... Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 38 ++++---------------------------------- 1 file changed, 4 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 6c0618eb5d57..0e0500f2bb6b 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1131,14 +1131,12 @@ out: } /** - * __nfs_revalidate_mapping - Revalidate the pagecache + * nfs_revalidate_mapping - Revalidate the pagecache * @inode - pointer to host inode * @mapping - pointer to mapping - * @may_lock - take inode->i_mutex? */ -static int __nfs_revalidate_mapping(struct inode *inode, - struct address_space *mapping, - bool may_lock) +int nfs_revalidate_mapping(struct inode *inode, + struct address_space *mapping) { struct nfs_inode *nfsi = NFS_I(inode); unsigned long *bitlock = &nfsi->flags; @@ -1187,12 +1185,7 @@ static int __nfs_revalidate_mapping(struct inode *inode, nfsi->cache_validity &= ~NFS_INO_INVALID_DATA; spin_unlock(&inode->i_lock); trace_nfs_invalidate_mapping_enter(inode); - if (may_lock) { - inode_lock(inode); - ret = nfs_invalidate_mapping(inode, mapping); - inode_unlock(inode); - } else - ret = nfs_invalidate_mapping(inode, mapping); + ret = nfs_invalidate_mapping(inode, mapping); trace_nfs_invalidate_mapping_exit(inode, ret); clear_bit_unlock(NFS_INO_INVALIDATING, bitlock); @@ -1202,29 +1195,6 @@ out: return ret; } -/** - * nfs_revalidate_mapping - Revalidate the pagecache - * @inode - pointer to host inode - * @mapping - pointer to mapping - */ -int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) -{ - return __nfs_revalidate_mapping(inode, mapping, false); -} - -/** - * nfs_revalidate_mapping_protected - Revalidate the pagecache - * @inode - pointer to host inode - * @mapping - pointer to mapping - * - * Differs from nfs_revalidate_mapping() in that it grabs the inode->i_mutex - * while invalidating the mapping. - */ -int nfs_revalidate_mapping_protected(struct inode *inode, struct address_space *mapping) -{ - return __nfs_revalidate_mapping(inode, mapping, true); -} - static bool nfs_file_has_writers(struct nfs_inode *nfsi) { struct inode *inode = &nfsi->vfs_inode; -- cgit v1.2.3 From 651b0e702981304f77091b82870a01480705f4fe Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 25 Jun 2016 17:24:46 -0400 Subject: NFS: Do not aggressively cache file attributes in the case of O_DIRECT A file that is open for O_DIRECT is by definition not obeying close-to-open cache consistency semantics, so let's not cache the attributes too aggressively either. Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 9 +++++++-- fs/nfs/internal.h | 5 +++++ 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 0e0500f2bb6b..7688436b19ba 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1214,6 +1214,11 @@ static bool nfs_file_has_writers(struct nfs_inode *nfsi) list)->mode & FMODE_WRITE) == FMODE_WRITE; } +static bool nfs_file_has_buffered_writers(struct nfs_inode *nfsi) +{ + return nfs_file_has_writers(nfsi) && nfs_file_io_is_buffered(nfsi); +} + static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) { struct nfs_inode *nfsi = NFS_I(inode); @@ -1278,7 +1283,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) return -EIO; - if (!nfs_file_has_writers(nfsi)) { + if (!nfs_file_has_buffered_writers(nfsi)) { /* Verify a few of the more important attributes */ if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && inode->i_version != fattr->change_attr) invalid |= NFS_INO_INVALID_ATTR | NFS_INO_REVAL_PAGECACHE; @@ -1660,7 +1665,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) unsigned long invalid = 0; unsigned long now = jiffies; unsigned long save_cache_validity; - bool have_writers = nfs_file_has_writers(nfsi); + bool have_writers = nfs_file_has_buffered_writers(nfsi); bool cache_revalidated = true; dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%x)\n", diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 159b64ede82a..01dccf18da0a 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -419,6 +419,11 @@ extern void nfs_end_io_write(struct inode *inode); extern void nfs_start_io_direct(struct inode *inode); extern void nfs_end_io_direct(struct inode *inode); +static inline bool nfs_file_io_is_buffered(struct nfs_inode *nfsi) +{ + return test_bit(NFS_INO_ODIRECT, &nfsi->flags) == 0; +} + /* namespace.c */ #define NFS_PATH_CANONICAL 1 extern char *nfs_path(char **p, struct dentry *dentry, -- cgit v1.2.3 From 79566ef018f53a181f067afdf7bef9cc53f9d34b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 25 Jun 2016 17:45:40 -0400 Subject: NFS: Getattr doesn't require data sync semantics When retrieving stat() information, NFS unfortunately does require us to sync writes to disk in order to ensure that mtime and ctime are up to date. However we shouldn't have to ensure that those writes are persisted. Relaxing that requirement does mean that we may see an mtime/ctime change if the server reboots and forces us to replay all writes. The exception to this rule are pNFS clients that are required to send layoutcommit, however that is dealt with by the call to pnfs_sync_inode() in _nfs_revalidate_inode(). Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 7688436b19ba..35fda08dc4f6 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -661,9 +661,7 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) trace_nfs_getattr_enter(inode); /* Flush out writes to the server in order to update c/mtime. */ if (S_ISREG(inode->i_mode)) { - inode_lock(inode); - err = nfs_sync_inode(inode); - inode_unlock(inode); + err = filemap_write_and_wait(inode->i_mapping); if (err) goto out; } -- cgit v1.2.3 From 1e564d3dbd684a105582471cb9ff2aada64a9052 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 25 Jun 2016 17:50:53 -0400 Subject: NFSv4.2: Fix a race in nfs42_proc_deallocate() When punching holes in a file, we want to ensure the operation is serialised w.r.t. other writes, meaning that we want to call nfs_sync_inode() while holding the inode lock. Signed-off-by: Trond Myklebust --- fs/nfs/nfs42proc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index aa03ed09ba06..0f9f536e647b 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -113,15 +113,17 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len) if (!nfs_server_capable(inode, NFS_CAP_DEALLOCATE)) return -EOPNOTSUPP; - nfs_wb_all(inode); inode_lock(inode); + err = nfs_sync_inode(inode); + if (err) + goto out_unlock; err = nfs42_proc_fallocate(&msg, filep, offset, len); if (err == 0) truncate_pagecache_range(inode, offset, (offset + len) -1); if (err == -EOPNOTSUPP) NFS_SERVER(inode)->caps &= ~NFS_CAP_DEALLOCATE; - +out_unlock: inode_unlock(inode); return err; } -- cgit v1.2.3 From 837bb1d752d92ea4d870877ffbd6ec5cf76624b3 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 25 Jun 2016 18:12:03 -0400 Subject: NFSv4.2: Fix writeback races in nfs4_copy_file_range We need to ensure that any writes to the destination file are serialised with the copy, meaning that the writeback has to occur under the inode lock. Also relax the writeback requirement on the source, and rely on the stateid checking to tell us if the source rebooted. Add the helper nfs_filemap_write_and_wait_range() to call pnfs_sync_inode() as is appropriate for pNFS servers that may need a layoutcommit. Signed-off-by: Trond Myklebust --- fs/nfs/internal.h | 3 +++ fs/nfs/nfs42proc.c | 9 +++++++++ fs/nfs/nfs4file.c | 14 +------------- fs/nfs/write.c | 18 ++++++++++++++++++ 4 files changed, 31 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 01dccf18da0a..3b01c9146e15 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -512,6 +512,9 @@ int nfs_key_timeout_notify(struct file *filp, struct inode *inode); bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx); void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio); +int nfs_filemap_write_and_wait_range(struct address_space *mapping, + loff_t lstart, loff_t lend); + #ifdef CONFIG_NFS_V4_1 static inline void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo) diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 0f9f536e647b..b7d457cea03f 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -156,11 +156,20 @@ static ssize_t _nfs42_proc_copy(struct file *src, loff_t pos_src, if (status) return status; + status = nfs_filemap_write_and_wait_range(file_inode(src)->i_mapping, + pos_src, pos_src + (loff_t)count - 1); + if (status) + return status; + status = nfs4_set_rw_stateid(&args.dst_stateid, dst_lock->open_context, dst_lock, FMODE_WRITE); if (status) return status; + status = nfs_sync_inode(dst_inode); + if (status) + return status; + status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); if (status == -ENOTSUPP) diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 014b0e41ace5..7cdc0ab9e6f5 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -133,21 +133,9 @@ static ssize_t nfs4_copy_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t count, unsigned int flags) { - struct inode *in_inode = file_inode(file_in); - struct inode *out_inode = file_inode(file_out); - int ret; - - if (in_inode == out_inode) + if (file_inode(file_in) == file_inode(file_out)) return -EINVAL; - /* flush any pending writes */ - ret = nfs_sync_inode(in_inode); - if (ret) - return ret; - ret = nfs_sync_inode(out_inode); - if (ret) - return ret; - return nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count); } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 3087fb6f1983..538a473b324b 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1912,6 +1912,24 @@ out_mark_dirty: } EXPORT_SYMBOL_GPL(nfs_write_inode); +/* + * Wrapper for filemap_write_and_wait_range() + * + * Needed for pNFS in order to ensure data becomes visible to the + * client. + */ +int nfs_filemap_write_and_wait_range(struct address_space *mapping, + loff_t lstart, loff_t lend) +{ + int ret; + + ret = filemap_write_and_wait_range(mapping, lstart, lend); + if (ret == 0) + ret = pnfs_sync_inode(mapping->host, true); + return ret; +} +EXPORT_SYMBOL_GPL(nfs_filemap_write_and_wait_range); + /* * flush the inode to disk. */ -- cgit v1.2.3 From e95fc4a06983c14273a39d26aad9cc5a8a09ff64 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 25 Jun 2016 17:57:39 -0400 Subject: NFSv4.2: llseek(SEEK_HOLE) and llseek(SEEK_DATA) don't require data sync We want to ensure that we write the cached data to the server, but don't require it be synced to disk. If the server reboots, we will get a stateid error, which will cause us to retry anyway. Signed-off-by: Trond Myklebust --- fs/nfs/nfs42proc.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index b7d457cea03f..616dc254b38b 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -269,7 +269,11 @@ static loff_t _nfs42_proc_llseek(struct file *filep, if (status) return status; - nfs_wb_all(inode); + status = nfs_filemap_write_and_wait_range(inode->i_mapping, + offset, LLONG_MAX); + if (status) + return status; + status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); if (status == -ENOTSUPP) -- cgit v1.2.3 From 9a773e7c8de2a34ae682624624e95a96b121b6d1 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 23 Jun 2016 11:09:04 -0400 Subject: NFS nfs_vm_page_mkwrite: Don't freeze me, Bro... Prevent filesystem freezes while handling the write page fault. Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 0e9b4a068f13..039d58790629 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -569,6 +569,8 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) filp, filp->f_mapping->host->i_ino, (long long)page_offset(page)); + sb_start_pagefault(inode->i_sb); + /* make sure the cache has finished storing the page */ nfs_fscache_wait_on_page_write(NFS_I(inode), page); @@ -595,6 +597,7 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) out_unlock: unlock_page(page); out: + sb_end_pagefault(inode->i_sb); return ret; } -- cgit v1.2.3 From 8b7d9d09b24f4ef16f7ae34b6d9e59857fda0870 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 14 Jul 2016 12:42:40 -0400 Subject: NFSv4: Revert "Truncating file opens should also sync O_DIRECT writes" We're not holding any locks, so both nfs_wb_all() and inode_dio_wait() are unenforcible and have livelock potential. Just limit ourselves to flushing out the data. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 7cdc0ab9e6f5..d085ad794884 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -66,7 +66,7 @@ nfs4_file_open(struct inode *inode, struct file *filp) if (openflags & O_TRUNC) { attr.ia_valid |= ATTR_SIZE; attr.ia_size = 0; - nfs_sync_inode(inode); + filemap_write_and_wait(inode->i_mapping); } inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr, NULL); -- cgit v1.2.3 From 10b7e9ad44881fcd46ac24eb7374377c6e8962ed Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 18 Jul 2016 00:51:01 -0400 Subject: pNFS: Don't mark the inode as revalidated if a LAYOUTCOMMIT is outstanding We know that the attributes will need updating if there is still a LAYOUTCOMMIT outstanding. Reported-by: Christoph Hellwig Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 5 ++++- fs/nfs/pnfs.h | 7 +++++++ 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 35fda08dc4f6..9df45832e28b 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1664,7 +1664,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) unsigned long now = jiffies; unsigned long save_cache_validity; bool have_writers = nfs_file_has_buffered_writers(nfsi); - bool cache_revalidated = true; + bool cache_revalidated; dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%x)\n", __func__, inode->i_sb->s_id, inode->i_ino, @@ -1713,6 +1713,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) /* Do atomic weak cache consistency updates */ invalid |= nfs_wcc_update_inode(inode, fattr); + + cache_revalidated = !pnfs_layoutcommit_outstanding(inode); + /* More cache consistency checks */ if (fattr->valid & NFS_ATTR_FATTR_CHANGE) { if (inode->i_version != fattr->change_attr) { diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index d6be5299a55a..181283c4ebc3 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -628,6 +628,13 @@ pnfs_sync_inode(struct inode *inode, bool datasync) return 0; } +static inline bool +pnfs_layoutcommit_outstanding(struct inode *inode) +{ + return false; +} + + static inline bool pnfs_roc(struct inode *ino) { -- cgit v1.2.3 From e033fb51ebb2983ee17b4a1b96ccbaedb137d9e9 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 21 Jul 2016 09:43:43 -0400 Subject: pNFS/files: filelayout_write_done_cb must call nfs_writeback_update_inode() All write callbacks are required to call nfs_writeback_update_inode() upon success to ensure that file size changes are recorded, and the attribute cache is invalidated. Signed-off-by: Trond Myklebust --- fs/nfs/filelayout/filelayout.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 25bd91a6e088..a3fc48ba4931 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -357,6 +357,12 @@ static int filelayout_write_done_cb(struct rpc_task *task, } filelayout_set_layoutcommit(hdr); + + /* zero out the fattr */ + hdr->fattr.valid = 0; + if (task->tk_status >= 0) + nfs_writeback_update_inode(hdr); + return 0; } -- cgit v1.2.3