From bc78abbd55dd28e2287ec6d6502b842321a17c87 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 25 Sep 2018 12:28:55 +0300 Subject: fuse: Fix use-after-free in fuse_dev_do_read() We may pick freed req in this way: [cpu0] [cpu1] fuse_dev_do_read() fuse_dev_do_write() list_move_tail(&req->list, ...); ... spin_unlock(&fpq->lock); ... ... request_end(fc, req); ... fuse_put_request(fc, req); if (test_bit(FR_INTERRUPTED, ...)) queue_interrupt(fiq, req); Fix that by keeping req alive until we finish all manipulations. Reported-by: syzbot+4e975615ca01f2277bdd@syzkaller.appspotmail.com Signed-off-by: Kirill Tkhai Signed-off-by: Miklos Szeredi Fixes: 46c34a348b0a ("fuse: no fc->lock for pqueue parts") Cc: # v4.2 --- fs/fuse/dev.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 11ea2c4a38ab..675caed3e655 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1311,12 +1311,14 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, goto out_end; } list_move_tail(&req->list, &fpq->processing); + __fuse_get_request(req); spin_unlock(&fpq->lock); set_bit(FR_SENT, &req->flags); /* matches barrier in request_wait_answer() */ smp_mb__after_atomic(); if (test_bit(FR_INTERRUPTED, &req->flags)) queue_interrupt(fiq, req); + fuse_put_request(fc, req); return reqsize; -- cgit v1.2.3 From d2d2d4fb1f54eff0f3faa9762d84f6446a4bc5d0 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 25 Sep 2018 12:52:42 +0300 Subject: fuse: Fix use-after-free in fuse_dev_do_write() After we found req in request_find() and released the lock, everything may happen with the req in parallel: cpu0 cpu1 fuse_dev_do_write() fuse_dev_do_write() req = request_find(fpq, ...) ... spin_unlock(&fpq->lock) ... ... req = request_find(fpq, oh.unique) ... spin_unlock(&fpq->lock) queue_interrupt(&fc->iq, req); ... ... ... ... ... request_end(fc, req); fuse_put_request(fc, req); ... 
queue_interrupt(&fc->iq, req); Signed-off-by: Kirill Tkhai Signed-off-by: Miklos Szeredi Fixes: 46c34a348b0a ("fuse: no fc->lock for pqueue parts") Cc: # v4.2 --- fs/fuse/dev.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 675caed3e655..c2af8042f176 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1877,16 +1877,20 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, /* Is it an interrupt reply? */ if (req->intr_unique == oh.unique) { + __fuse_get_request(req); spin_unlock(&fpq->lock); err = -EINVAL; - if (nbytes != sizeof(struct fuse_out_header)) + if (nbytes != sizeof(struct fuse_out_header)) { + fuse_put_request(fc, req); goto err_finish; + } if (oh.error == -ENOSYS) fc->no_interrupt = 1; else if (oh.error == -EAGAIN) queue_interrupt(&fc->iq, req); + fuse_put_request(fc, req); fuse_copy_finish(cs); return nbytes; -- cgit v1.2.3 From 4c316f2f3ff315cb48efb7435621e5bfb81df96d Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 28 Sep 2018 16:43:22 +0200 Subject: fuse: set FR_SENT while locked Otherwise fuse_dev_do_write() could come in and finish off the request, and the set_bit(FR_SENT, ...) could trigger the WARN_ON(test_bit(FR_SENT, ...)) in request_end(). 
Signed-off-by: Miklos Szeredi Reported-by: syzbot+ef054c4d3f64cd7f7cec@syzkaller.appspotmail.com Fixes: 46c34a348b0a ("fuse: no fc->lock for pqueue parts") Cc: # v4.2 --- fs/fuse/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index c2af8042f176..34976b42f3e1 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1312,8 +1312,8 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, } list_move_tail(&req->list, &fpq->processing); __fuse_get_request(req); - spin_unlock(&fpq->lock); set_bit(FR_SENT, &req->flags); + spin_unlock(&fpq->lock); /* matches barrier in request_wait_answer() */ smp_mb__after_atomic(); if (test_bit(FR_INTERRUPTED, &req->flags)) -- cgit v1.2.3 From 908a572b80f6e9577b45e81b3dfe2e22111286b8 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 28 Sep 2018 16:43:22 +0200 Subject: fuse: fix blocked_waitq wakeup Using waitqueue_active() is racy. Make sure we issue a wake_up() unconditionally after storing into fc->blocked. After that it's okay to optimize with waitqueue_active() since the first wake up provides the necessary barrier for all waiters, not just the woken one.
Signed-off-by: Miklos Szeredi Fixes: 3c18ef8117f0 ("fuse: optimize wake_up") Cc: # v3.10 --- fs/fuse/dev.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 34976b42f3e1..51eb602a435b 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -391,12 +391,19 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) if (test_bit(FR_BACKGROUND, &req->flags)) { spin_lock(&fc->lock); clear_bit(FR_BACKGROUND, &req->flags); - if (fc->num_background == fc->max_background) + if (fc->num_background == fc->max_background) { fc->blocked = 0; - - /* Wake up next waiter, if any */ - if (!fc->blocked && waitqueue_active(&fc->blocked_waitq)) wake_up(&fc->blocked_waitq); + } else if (!fc->blocked) { + /* + * Wake up next waiter, if any. It's okay to use + * waitqueue_active(), as we've already synced up + * fc->blocked with waiters with the wake_up() call + * above. + */ + if (waitqueue_active(&fc->blocked_waitq)) + wake_up(&fc->blocked_waitq); + } if (fc->num_background == fc->congestion_threshold && fc->sb) { clear_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC); -- cgit v1.2.3 From 88bc7d5097a11d9bdcf08ecf85c81ba998353437 Mon Sep 17 00:00:00 2001 From: Niels de Vos Date: Tue, 21 Aug 2018 14:36:31 +0200 Subject: fuse: add support for copy_file_range() There are several FUSE filesystems that can implement server-side copy or other efficient copy/duplication/clone methods. The copy_file_range() syscall is the standard interface that users have access to while not depending on external libraries that bypass FUSE. 
Signed-off-by: Niels de Vos Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/fuse/fuse_i.h | 3 +++ 2 files changed, 80 insertions(+) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 32d0b883e74f..63136a2c23ab 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -3011,6 +3011,82 @@ out: return err; } +static ssize_t fuse_copy_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + size_t len, unsigned int flags) +{ + struct fuse_file *ff_in = file_in->private_data; + struct fuse_file *ff_out = file_out->private_data; + struct inode *inode_out = file_inode(file_out); + struct fuse_inode *fi_out = get_fuse_inode(inode_out); + struct fuse_conn *fc = ff_in->fc; + FUSE_ARGS(args); + struct fuse_copy_file_range_in inarg = { + .fh_in = ff_in->fh, + .off_in = pos_in, + .nodeid_out = ff_out->nodeid, + .fh_out = ff_out->fh, + .off_out = pos_out, + .len = len, + .flags = flags + }; + struct fuse_write_out outarg; + ssize_t err; + /* mark unstable when write-back is not used, and file_out gets + * extended */ + bool is_unstable = (!fc->writeback_cache) && + ((pos_out + len) > inode_out->i_size); + + if (fc->no_copy_file_range) + return -EOPNOTSUPP; + + inode_lock(inode_out); + + if (fc->writeback_cache) { + err = filemap_write_and_wait_range(inode_out->i_mapping, + pos_out, pos_out + len); + if (err) + goto out; + + fuse_sync_writes(inode_out); + } + + if (is_unstable) + set_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state); + + args.in.h.opcode = FUSE_COPY_FILE_RANGE; + args.in.h.nodeid = ff_in->nodeid; + args.in.numargs = 1; + args.in.args[0].size = sizeof(inarg); + args.in.args[0].value = &inarg; + args.out.numargs = 1; + args.out.args[0].size = sizeof(outarg); + args.out.args[0].value = &outarg; + err = fuse_simple_request(fc, &args); + if (err == -ENOSYS) { + fc->no_copy_file_range = 1; + err = -EOPNOTSUPP; + } + if (err) + goto out; + + if (fc->writeback_cache) { 
+ fuse_write_update_size(inode_out, pos_out + outarg.size); + file_update_time(file_out); + } + + fuse_invalidate_attr(inode_out); + + err = outarg.size; +out: + if (is_unstable) + clear_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state); + + inode_unlock(inode_out); + + return err; +} + static const struct file_operations fuse_file_operations = { .llseek = fuse_file_llseek, .read_iter = fuse_file_read_iter, @@ -3027,6 +3103,7 @@ static const struct file_operations fuse_file_operations = { .compat_ioctl = fuse_file_compat_ioctl, .poll = fuse_file_poll, .fallocate = fuse_file_fallocate, + .copy_file_range = fuse_copy_file_range, }; static const struct file_operations fuse_direct_io_file_operations = { diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index f78e9614bb5f..3e45d408a644 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -637,6 +637,9 @@ struct fuse_conn { /** Allow other than the mounter user to access the filesystem ? */ unsigned allow_other:1; + /** Does the filesystem support copy_file_range? */ + unsigned no_copy_file_range:1; + /** The number of requests waiting for completion */ atomic_t num_waiting; -- cgit v1.2.3 From e287179afe2190faa7b97915cb89215dde5e044b Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 31 Jul 2018 13:25:25 +0300 Subject: fuse: use list_first_entry() in flush_bg_queue() This cleanup patch makes the function to use the primitive instead of direct dereferencing. Also, move fiq dereferencing out of cycle, since it's always constant. 
Signed-off-by: Kirill Tkhai Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 51eb602a435b..6a7d3b4424e1 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -353,12 +353,13 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, static void flush_bg_queue(struct fuse_conn *fc) { + struct fuse_iqueue *fiq = &fc->iq; + while (fc->active_background < fc->max_background && !list_empty(&fc->bg_queue)) { struct fuse_req *req; - struct fuse_iqueue *fiq = &fc->iq; - req = list_entry(fc->bg_queue.next, struct fuse_req, list); + req = list_first_entry(&fc->bg_queue, struct fuse_req, list); list_del(&req->list); fc->active_background++; spin_lock(&fiq->waitq.lock); -- cgit v1.2.3 From 2a23f2b8adbe4bd584f936f7ac17a99750eed9d7 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 27 Aug 2018 18:29:29 +0300 Subject: fuse: use READ_ONCE on congestion_threshold and max_background Since they are of unsigned int type, it's allowed to read them unlocked during reporting to userspace. Let's underline this fact with READ_ONCE() macros.
Signed-off-by: Kirill Tkhai Signed-off-by: Miklos Szeredi --- fs/fuse/control.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 0b694655d988..acc35819aae6 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -107,7 +107,7 @@ static ssize_t fuse_conn_max_background_read(struct file *file, if (!fc) return 0; - val = fc->max_background; + val = READ_ONCE(fc->max_background); fuse_conn_put(fc); return fuse_conn_limit_read(file, buf, len, ppos, val); @@ -144,7 +144,7 @@ static ssize_t fuse_conn_congestion_threshold_read(struct file *file, if (!fc) return 0; - val = fc->congestion_threshold; + val = READ_ONCE(fc->congestion_threshold); fuse_conn_put(fc); return fuse_conn_limit_read(file, buf, len, ppos, val); -- cgit v1.2.3 From 2b30a533148af4f3865c0dcd619ad93ab3f4ba52 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 27 Aug 2018 18:29:37 +0300 Subject: fuse: add locking to max_background and congestion_threshold changes Function sequences like request_end()->flush_bg_queue() require that max_background and congestion_threshold are constant during their execution. Otherwise, checks like if (fc->num_background == fc->max_background) made at different times may not behave as expected.
Signed-off-by: Kirill Tkhai Signed-off-by: Miklos Szeredi --- fs/fuse/control.c | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/fuse/control.c b/fs/fuse/control.c index acc35819aae6..eaa0e2b21623 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -125,7 +125,12 @@ static ssize_t fuse_conn_max_background_write(struct file *file, if (ret > 0) { struct fuse_conn *fc = fuse_ctl_file_conn_get(file); if (fc) { + spin_lock(&fc->lock); fc->max_background = val; + fc->blocked = fc->num_background >= fc->max_background; + if (!fc->blocked) + wake_up(&fc->blocked_waitq); + spin_unlock(&fc->lock); fuse_conn_put(fc); } } @@ -155,18 +160,31 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file, size_t count, loff_t *ppos) { unsigned uninitialized_var(val); + struct fuse_conn *fc; ssize_t ret; ret = fuse_conn_limit_write(file, buf, count, ppos, &val, max_user_congthresh); - if (ret > 0) { - struct fuse_conn *fc = fuse_ctl_file_conn_get(file); - if (fc) { - fc->congestion_threshold = val; - fuse_conn_put(fc); + if (ret <= 0) + goto out; + fc = fuse_ctl_file_conn_get(file); + if (!fc) + goto out; + + spin_lock(&fc->lock); + fc->congestion_threshold = val; + if (fc->sb) { + if (fc->num_background < fc->congestion_threshold) { + clear_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC); + clear_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC); + } else { + set_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC); + set_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC); } } - + spin_unlock(&fc->lock); + fuse_conn_put(fc); +out: return ret; } -- cgit v1.2.3 From ae2dffa39485c6fd4f22321814c7287c274b473a Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 27 Aug 2018 18:29:46 +0300 Subject: fuse: introduce fc->bg_lock To reduce contention of fc->lock, this patch introduces bg_lock for protection of fields related to background queue. 
These are: max_background, congestion_threshold, num_background, active_background, bg_queue and blocked. This allows next patch to make async reads not requiring fc->lock, so async reads and writes will have better performance executed in parallel. Signed-off-by: Kirill Tkhai Signed-off-by: Miklos Szeredi --- fs/fuse/control.c | 8 ++++---- fs/fuse/dev.c | 20 ++++++++++++-------- fs/fuse/file.c | 2 +- fs/fuse/fuse_i.h | 8 ++++++-- fs/fuse/inode.c | 3 +++ 5 files changed, 26 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/fuse/control.c b/fs/fuse/control.c index eaa0e2b21623..989df5accaee 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -125,12 +125,12 @@ static ssize_t fuse_conn_max_background_write(struct file *file, if (ret > 0) { struct fuse_conn *fc = fuse_ctl_file_conn_get(file); if (fc) { - spin_lock(&fc->lock); + spin_lock(&fc->bg_lock); fc->max_background = val; fc->blocked = fc->num_background >= fc->max_background; if (!fc->blocked) wake_up(&fc->blocked_waitq); - spin_unlock(&fc->lock); + spin_unlock(&fc->bg_lock); fuse_conn_put(fc); } } @@ -171,7 +171,7 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file, if (!fc) goto out; - spin_lock(&fc->lock); + spin_lock(&fc->bg_lock); fc->congestion_threshold = val; if (fc->sb) { if (fc->num_background < fc->congestion_threshold) { @@ -182,7 +182,7 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file, set_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC); } } - spin_unlock(&fc->lock); + spin_unlock(&fc->bg_lock); fuse_conn_put(fc); out: return ret; diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 6a7d3b4424e1..d4b9ffc6544d 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -287,10 +287,10 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) * We get here in the unlikely case that a background * request was allocated but not sent */ - spin_lock(&fc->lock); + spin_lock(&fc->bg_lock); if (!fc->blocked) wake_up(&fc->blocked_waitq); - 
spin_unlock(&fc->lock); + spin_unlock(&fc->bg_lock); } if (test_bit(FR_WAITING, &req->flags)) { @@ -390,7 +390,7 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) WARN_ON(test_bit(FR_PENDING, &req->flags)); WARN_ON(test_bit(FR_SENT, &req->flags)); if (test_bit(FR_BACKGROUND, &req->flags)) { - spin_lock(&fc->lock); + spin_lock(&fc->bg_lock); clear_bit(FR_BACKGROUND, &req->flags); if (fc->num_background == fc->max_background) { fc->blocked = 0; @@ -413,7 +413,7 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) fc->num_background--; fc->active_background--; flush_bg_queue(fc); - spin_unlock(&fc->lock); + spin_unlock(&fc->bg_lock); } wake_up(&req->waitq); if (req->end) @@ -586,8 +586,8 @@ ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args) * * fc->connected must have been checked previously */ -void fuse_request_send_background_locked(struct fuse_conn *fc, - struct fuse_req *req) +void fuse_request_send_background_nocheck(struct fuse_conn *fc, + struct fuse_req *req) { BUG_ON(!test_bit(FR_BACKGROUND, &req->flags)); if (!test_bit(FR_WAITING, &req->flags)) { @@ -595,6 +595,7 @@ void fuse_request_send_background_locked(struct fuse_conn *fc, atomic_inc(&fc->num_waiting); } __set_bit(FR_ISREPLY, &req->flags); + spin_lock(&fc->bg_lock); fc->num_background++; if (fc->num_background == fc->max_background) fc->blocked = 1; @@ -604,6 +605,7 @@ void fuse_request_send_background_locked(struct fuse_conn *fc, } list_add_tail(&req->list, &fc->bg_queue); flush_bg_queue(fc); + spin_unlock(&fc->bg_lock); } void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req) @@ -611,7 +613,7 @@ void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req) BUG_ON(!req->end); spin_lock(&fc->lock); if (fc->connected) { - fuse_request_send_background_locked(fc, req); + fuse_request_send_background_nocheck(fc, req); spin_unlock(&fc->lock); } else { spin_unlock(&fc->lock); @@ -2118,7 +2120,6 @@ void 
fuse_abort_conn(struct fuse_conn *fc, bool is_abort) LIST_HEAD(to_end); fc->connected = 0; - fc->blocked = 0; fc->aborted = is_abort; fuse_set_initialized(fc); list_for_each_entry(fud, &fc->devices, entry) { @@ -2140,8 +2141,11 @@ void fuse_abort_conn(struct fuse_conn *fc, bool is_abort) list_splice_tail_init(&fpq->processing, &to_end); spin_unlock(&fpq->lock); } + spin_lock(&fc->bg_lock); + fc->blocked = 0; fc->max_background = UINT_MAX; flush_bg_queue(fc); + spin_unlock(&fc->bg_lock); spin_lock(&fiq->waitq.lock); fiq->connected = 0; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 63136a2c23ab..65351d43c2b6 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1502,7 +1502,7 @@ __acquires(fc->lock) req->in.args[1].size = inarg->size; fi->writectr++; - fuse_request_send_background_locked(fc, req); + fuse_request_send_background_nocheck(fc, req); return; out_free: diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 3e45d408a644..d6d55641a5a6 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -500,6 +500,10 @@ struct fuse_conn { /** The list of background requests set aside for later queuing */ struct list_head bg_queue; + /** Protects: max_background, congestion_threshold, num_background, + * active_background, bg_queue, blocked */ + spinlock_t bg_lock; + /** Flag indicating that INIT reply has been received. 
Allocating * any fuse request will be suspended until the flag is set */ int initialized; @@ -860,8 +864,8 @@ ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args); */ void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req); -void fuse_request_send_background_locked(struct fuse_conn *fc, - struct fuse_req *req); +void fuse_request_send_background_nocheck(struct fuse_conn *fc, + struct fuse_req *req); /* Abort all requests */ void fuse_abort_conn(struct fuse_conn *fc, bool is_abort); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index db9e60b7eb69..ed3f49628ce2 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -605,6 +605,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns) { memset(fc, 0, sizeof(*fc)); spin_lock_init(&fc->lock); + spin_lock_init(&fc->bg_lock); init_rwsem(&fc->killsb); refcount_set(&fc->count, 1); atomic_set(&fc->dev_count, 1); @@ -852,6 +853,7 @@ static void process_init_limits(struct fuse_conn *fc, struct fuse_init_out *arg) sanitize_global_limit(&max_user_bgreq); sanitize_global_limit(&max_user_congthresh); + spin_lock(&fc->bg_lock); if (arg->max_background) { fc->max_background = arg->max_background; @@ -865,6 +867,7 @@ static void process_init_limits(struct fuse_conn *fc, struct fuse_init_out *arg) fc->congestion_threshold > max_user_congthresh) fc->congestion_threshold = max_user_congthresh; } + spin_unlock(&fc->bg_lock); } static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) -- cgit v1.2.3 From 63825b4e1da5a3cba79d835a5925e5daf7db3a77 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 27 Aug 2018 18:29:56 +0300 Subject: fuse: do not take fc->lock in fuse_request_send_background() Currently, we take fc->lock there only to check for fc->connected. But this flag is changed only on connection abort, which is very rare operation. 
So allow checking fc->connected under just fc->bg_lock and use this lock (as well as fc->lock) when resetting fc->connected. Signed-off-by: Kirill Tkhai Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 48 ++++++++++++++++++++++++------------------------ fs/fuse/file.c | 4 +++- fs/fuse/fuse_i.h | 4 +--- 3 files changed, 28 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index d4b9ffc6544d..071feb8cb265 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -581,42 +581,38 @@ ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args) return ret; } -/* - * Called under fc->lock - * - * fc->connected must have been checked previously - */ -void fuse_request_send_background_nocheck(struct fuse_conn *fc, - struct fuse_req *req) +bool fuse_request_queue_background(struct fuse_conn *fc, struct fuse_req *req) { - BUG_ON(!test_bit(FR_BACKGROUND, &req->flags)); + bool queued = false; + + WARN_ON(!test_bit(FR_BACKGROUND, &req->flags)); if (!test_bit(FR_WAITING, &req->flags)) { __set_bit(FR_WAITING, &req->flags); atomic_inc(&fc->num_waiting); } __set_bit(FR_ISREPLY, &req->flags); spin_lock(&fc->bg_lock); - fc->num_background++; - if (fc->num_background == fc->max_background) - fc->blocked = 1; - if (fc->num_background == fc->congestion_threshold && fc->sb) { - set_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC); - set_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC); - } - list_add_tail(&req->list, &fc->bg_queue); - flush_bg_queue(fc); + if (likely(fc->connected)) { + fc->num_background++; + if (fc->num_background == fc->max_background) + fc->blocked = 1; + if (fc->num_background == fc->congestion_threshold && fc->sb) { + set_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC); + set_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC); + } + list_add_tail(&req->list, &fc->bg_queue); + flush_bg_queue(fc); + queued = true; + } spin_unlock(&fc->bg_lock); + + return queued; } void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req) { - 
BUG_ON(!req->end); - spin_lock(&fc->lock); - if (fc->connected) { - fuse_request_send_background_nocheck(fc, req); - spin_unlock(&fc->lock); - } else { - spin_unlock(&fc->lock); + WARN_ON(!req->end); + if (!fuse_request_queue_background(fc, req)) { req->out.h.error = -ENOTCONN; req->end(fc, req); fuse_put_request(fc, req); @@ -2119,7 +2115,11 @@ void fuse_abort_conn(struct fuse_conn *fc, bool is_abort) struct fuse_req *req, *next; LIST_HEAD(to_end); + /* Background queuing checks fc->connected under bg_lock */ + spin_lock(&fc->bg_lock); fc->connected = 0; + spin_unlock(&fc->bg_lock); + fc->aborted = is_abort; fuse_set_initialized(fc); list_for_each_entry(fud, &fc->devices, entry) { diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 65351d43c2b6..d15c14912e72 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1487,6 +1487,7 @@ __acquires(fc->lock) struct fuse_inode *fi = get_fuse_inode(req->inode); struct fuse_write_in *inarg = &req->misc.write.in; __u64 data_size = req->num_pages * PAGE_SIZE; + bool queued; if (!fc->connected) goto out_free; @@ -1502,7 +1503,8 @@ __acquires(fc->lock) req->in.args[1].size = inarg->size; fi->writectr++; - fuse_request_send_background_nocheck(fc, req); + queued = fuse_request_queue_background(fc, req); + WARN_ON(!queued); return; out_free: diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index d6d55641a5a6..6e6eab8127a4 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -863,9 +863,7 @@ ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args); * Send a request in the background */ void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req); - -void fuse_request_send_background_nocheck(struct fuse_conn *fc, - struct fuse_req *req); +bool fuse_request_queue_background(struct fuse_conn *fc, struct fuse_req *req); /* Abort all requests */ void fuse_abort_conn(struct fuse_conn *fc, bool is_abort); -- cgit v1.2.3 From c59fd85e4fd07fdf0ab523a5e9734f5338d6aa19 Mon Sep 17 00:00:00 2001 From: Kirill 
Tkhai Date: Tue, 11 Sep 2018 13:11:56 +0300 Subject: fuse: change interrupt requests allocation algorithm Using two unconnected IDs, req->in.h.unique and req->intr_unique, does not allow linking requests into a hash table: neither of them can be used as the key to calculate the hash. This patch changes the algorithm of allocation of IDs for a request. Plain requests obtain an even ID, while interrupt requests are encoded in the low bit. So, in the next patches we will be able to use the rest of the ID bits to calculate the hash, and the hash will be the same for plain and interrupt requests. Signed-off-by: Kirill Tkhai Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 071feb8cb265..38bb46ab2d7b 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -25,6 +25,10 @@ MODULE_ALIAS_MISCDEV(FUSE_MINOR); MODULE_ALIAS("devname:fuse"); +/* Ordinary requests have even IDs, while interrupts IDs are odd */ +#define FUSE_INT_REQ_BIT (1ULL << 0) +#define FUSE_REQ_ID_STEP (1ULL << 1) + static struct kmem_cache *fuse_req_cachep; static struct fuse_dev *fuse_get_dev(struct file *file) @@ -319,7 +323,8 @@ static unsigned len_args(unsigned numargs, struct fuse_arg *args) static u64 fuse_get_unique(struct fuse_iqueue *fiq) { - return ++fiq->reqctr; + fiq->reqctr += FUSE_REQ_ID_STEP; + return fiq->reqctr; } static void queue_request(struct fuse_iqueue *fiq, struct fuse_req *req) @@ -1090,7 +1095,7 @@ __releases(fiq->waitq.lock) int err; list_del_init(&req->intr_entry); - req->intr_unique = fuse_get_unique(fiq); + req->intr_unique = (req->in.h.unique | FUSE_INT_REQ_BIT); memset(&ih, 0, sizeof(ih)); memset(&arg, 0, sizeof(arg)); ih.len = reqsize; -- cgit v1.2.3 From 3a5358d1a1b70bb3360578f09894d6856629ecdf Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 11 Sep 2018 13:12:05 +0300 Subject: fuse: kill req->intr_unique This field is not needed after the previous patch, since we can
easily convert request ID to interrupt request ID and vice versa. Signed-off-by: Kirill Tkhai Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 11 +++++------ fs/fuse/fuse_i.h | 3 --- 2 files changed, 5 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 38bb46ab2d7b..eee43057b99b 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1095,12 +1095,11 @@ __releases(fiq->waitq.lock) int err; list_del_init(&req->intr_entry); - req->intr_unique = (req->in.h.unique | FUSE_INT_REQ_BIT); memset(&ih, 0, sizeof(ih)); memset(&arg, 0, sizeof(arg)); ih.len = reqsize; ih.opcode = FUSE_INTERRUPT; - ih.unique = req->intr_unique; + ih.unique = (req->in.h.unique | FUSE_INT_REQ_BIT); arg.unique = req->in.h.unique; spin_unlock(&fiq->waitq.lock); @@ -1808,7 +1807,7 @@ static struct fuse_req *request_find(struct fuse_pqueue *fpq, u64 unique) struct fuse_req *req; list_for_each_entry(req, &fpq->processing, list) { - if (req->in.h.unique == unique || req->intr_unique == unique) + if (req->in.h.unique == unique) return req; } return NULL; @@ -1882,12 +1881,12 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, if (!fpq->connected) goto err_unlock_pq; - req = request_find(fpq, oh.unique); + req = request_find(fpq, oh.unique & ~FUSE_INT_REQ_BIT); if (!req) goto err_unlock_pq; - /* Is it an interrupt reply? */ - if (req->intr_unique == oh.unique) { + /* Is it an interrupt reply ID? 
*/ + if (oh.unique & FUSE_INT_REQ_BIT) { __fuse_get_request(req); spin_unlock(&fpq->lock); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 6e6eab8127a4..1d7b5b7a051d 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -311,9 +311,6 @@ struct fuse_req { /** refcount */ refcount_t count; - /** Unique ID for the interrupt request */ - u64 intr_unique; - /* Request flags, updated with test/set/clear_bit() */ unsigned long flags; -- cgit v1.2.3 From be2ff42c5d6ebc8552c82a7d1697afae30510ed9 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 11 Sep 2018 13:12:14 +0300 Subject: fuse: Use hash table to link processing request We noticed a performance bottleneck in FUSE running our Virtuozzo storage over rdma. On some types of workload we observe 20% of time spent in request_find() in the profiler. This function iterates over a long request list, and it scales badly. The patch introduces a hash table to reduce the number of iterations we do in this function. The hash generating algorithm is taken from the hash_add() function, while a 256-entry table is used to store pending requests. This fixes the problem and improves the performance.
Reported-by: Alexey Kuznetsov Signed-off-by: Kirill Tkhai Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 21 +++++++++++++++++---- fs/fuse/fuse_i.h | 7 +++++-- fs/fuse/inode.c | 27 +++++++++++++++++++-------- 3 files changed, 41 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index eee43057b99b..91b4ecf85dc7 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -327,6 +327,11 @@ static u64 fuse_get_unique(struct fuse_iqueue *fiq) return fiq->reqctr; } +static unsigned int fuse_req_hash(u64 unique) +{ + return hash_long(unique & ~FUSE_INT_REQ_BIT, FUSE_PQ_HASH_BITS); +} + static void queue_request(struct fuse_iqueue *fiq, struct fuse_req *req) { req->in.h.len = sizeof(struct fuse_in_header) + @@ -1248,6 +1253,7 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, struct fuse_req *req; struct fuse_in *in; unsigned reqsize; + unsigned int hash; restart: spin_lock(&fiq->waitq.lock); @@ -1320,7 +1326,8 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, err = reqsize; goto out_end; } - list_move_tail(&req->list, &fpq->processing); + hash = fuse_req_hash(req->in.h.unique); + list_move_tail(&req->list, &fpq->processing[hash]); __fuse_get_request(req); set_bit(FR_SENT, &req->flags); spin_unlock(&fpq->lock); @@ -1804,9 +1811,10 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, /* Look up request on processing list by unique ID */ static struct fuse_req *request_find(struct fuse_pqueue *fpq, u64 unique) { + unsigned int hash = fuse_req_hash(unique); struct fuse_req *req; - list_for_each_entry(req, &fpq->processing, list) { + list_for_each_entry(req, &fpq->processing[hash], list) { if (req->in.h.unique == unique) return req; } @@ -2118,6 +2126,7 @@ void fuse_abort_conn(struct fuse_conn *fc, bool is_abort) struct fuse_dev *fud; struct fuse_req *req, *next; LIST_HEAD(to_end); + unsigned int i; /* Background queuing checks fc->connected under bg_lock */ 
spin_lock(&fc->bg_lock); @@ -2142,7 +2151,9 @@ void fuse_abort_conn(struct fuse_conn *fc, bool is_abort) } spin_unlock(&req->waitq.lock); } - list_splice_tail_init(&fpq->processing, &to_end); + for (i = 0; i < FUSE_PQ_HASH_SIZE; i++) + list_splice_tail_init(&fpq->processing[i], + &to_end); spin_unlock(&fpq->lock); } spin_lock(&fc->bg_lock); @@ -2185,10 +2196,12 @@ int fuse_dev_release(struct inode *inode, struct file *file) struct fuse_conn *fc = fud->fc; struct fuse_pqueue *fpq = &fud->pq; LIST_HEAD(to_end); + unsigned int i; spin_lock(&fpq->lock); WARN_ON(!list_empty(&fpq->io)); - list_splice_init(&fpq->processing, &to_end); + for (i = 0; i < FUSE_PQ_HASH_SIZE; i++) + list_splice_init(&fpq->processing[i], &to_end); spin_unlock(&fpq->lock); end_requests(fc, &to_end); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 1d7b5b7a051d..2c4272076f62 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -408,6 +408,9 @@ struct fuse_iqueue { struct fasync_struct *fasync; }; +#define FUSE_PQ_HASH_BITS 8 +#define FUSE_PQ_HASH_SIZE (1 << FUSE_PQ_HASH_BITS) + struct fuse_pqueue { /** Connection established */ unsigned connected; @@ -415,8 +418,8 @@ struct fuse_pqueue { /** Lock protecting accessess to members of this structure */ spinlock_t lock; - /** The list of requests being processed */ - struct list_head processing; + /** Hash table of requests being processed */ + struct list_head *processing; /** The list of requests under I/O */ struct list_head io; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index ed3f49628ce2..9383b47b3d9b 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -594,9 +594,11 @@ static void fuse_iqueue_init(struct fuse_iqueue *fiq) static void fuse_pqueue_init(struct fuse_pqueue *fpq) { - memset(fpq, 0, sizeof(struct fuse_pqueue)); + unsigned int i; + spin_lock_init(&fpq->lock); - INIT_LIST_HEAD(&fpq->processing); + for (i = 0; i < FUSE_PQ_HASH_SIZE; i++) + INIT_LIST_HEAD(&fpq->processing[i]); INIT_LIST_HEAD(&fpq->io); fpq->connected = 1; } 
@@ -1025,17 +1027,26 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) struct fuse_dev *fuse_dev_alloc(struct fuse_conn *fc) { struct fuse_dev *fud; + struct list_head *pq; fud = kzalloc(sizeof(struct fuse_dev), GFP_KERNEL); - if (fud) { - fud->fc = fuse_conn_get(fc); - fuse_pqueue_init(&fud->pq); + if (!fud) + return NULL; - spin_lock(&fc->lock); - list_add_tail(&fud->entry, &fc->devices); - spin_unlock(&fc->lock); + pq = kcalloc(FUSE_PQ_HASH_SIZE, sizeof(struct list_head), GFP_KERNEL); + if (!pq) { + kfree(fud); + return NULL; } + fud->pq.processing = pq; + fud->fc = fuse_conn_get(fc); + fuse_pqueue_init(&fud->pq); + + spin_lock(&fc->lock); + list_add_tail(&fud->entry, &fc->devices); + spin_unlock(&fc->lock); + return fud; } EXPORT_SYMBOL_GPL(fuse_dev_alloc); -- cgit v1.2.3 From d123d8e1833c5d854b56f2a7da17cafd0a901df8 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 28 Sep 2018 16:43:23 +0200 Subject: fuse: split out readdir.c Directory reading code is about to grow larger, so split it out from dir.c into a new source file. 
Signed-off-by: Miklos Szeredi --- fs/fuse/Makefile | 2 +- fs/fuse/dir.c | 259 +----------------------------------------------------- fs/fuse/fuse_i.h | 12 +++ fs/fuse/readdir.c | 259 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 274 insertions(+), 258 deletions(-) create mode 100644 fs/fuse/readdir.c (limited to 'fs') diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile index 60da84a86dab..f7b807bc1027 100644 --- a/fs/fuse/Makefile +++ b/fs/fuse/Makefile @@ -5,4 +5,4 @@ obj-$(CONFIG_FUSE_FS) += fuse.o obj-$(CONFIG_CUSE) += cuse.o -fuse-objs := dev.o dir.o file.o inode.o control.o xattr.o acl.o +fuse-objs := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 0979609d6eba..3a333b0ea9ad 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -16,22 +16,6 @@ #include #include -static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx) -{ - struct fuse_conn *fc = get_fuse_conn(dir); - struct fuse_inode *fi = get_fuse_inode(dir); - - if (!fc->do_readdirplus) - return false; - if (!fc->readdirplus_auto) - return true; - if (test_and_clear_bit(FUSE_I_ADVISE_RDPLUS, &fi->state)) - return true; - if (ctx->pos == 0) - return true; - return false; -} - static void fuse_advise_use_readdirplus(struct inode *dir) { struct fuse_inode *fi = get_fuse_inode(dir); @@ -80,8 +64,7 @@ static u64 time_to_jiffies(u64 sec, u32 nsec) * Set dentry and possibly attribute timeouts from the lookup/mk* * replies */ -static void fuse_change_entry_timeout(struct dentry *entry, - struct fuse_entry_out *o) +void fuse_change_entry_timeout(struct dentry *entry, struct fuse_entry_out *o) { fuse_dentry_settime(entry, time_to_jiffies(o->entry_valid, o->entry_valid_nsec)); @@ -92,7 +75,7 @@ static u64 attr_timeout(struct fuse_attr_out *o) return time_to_jiffies(o->attr_valid, o->attr_valid_nsec); } -static u64 entry_attr_timeout(struct fuse_entry_out *o) +u64 entry_attr_timeout(struct fuse_entry_out *o) { return 
time_to_jiffies(o->attr_valid, o->attr_valid_nsec); } @@ -262,11 +245,6 @@ invalid: goto out; } -static int invalid_nodeid(u64 nodeid) -{ - return !nodeid || nodeid == FUSE_ROOT_ID; -} - static int fuse_dentry_init(struct dentry *dentry) { dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry), GFP_KERNEL); @@ -1165,239 +1143,6 @@ static int fuse_permission(struct inode *inode, int mask) return err; } -static int parse_dirfile(char *buf, size_t nbytes, struct file *file, - struct dir_context *ctx) -{ - while (nbytes >= FUSE_NAME_OFFSET) { - struct fuse_dirent *dirent = (struct fuse_dirent *) buf; - size_t reclen = FUSE_DIRENT_SIZE(dirent); - if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX) - return -EIO; - if (reclen > nbytes) - break; - if (memchr(dirent->name, '/', dirent->namelen) != NULL) - return -EIO; - - if (!dir_emit(ctx, dirent->name, dirent->namelen, - dirent->ino, dirent->type)) - break; - - buf += reclen; - nbytes -= reclen; - ctx->pos = dirent->off; - } - - return 0; -} - -static int fuse_direntplus_link(struct file *file, - struct fuse_direntplus *direntplus, - u64 attr_version) -{ - struct fuse_entry_out *o = &direntplus->entry_out; - struct fuse_dirent *dirent = &direntplus->dirent; - struct dentry *parent = file->f_path.dentry; - struct qstr name = QSTR_INIT(dirent->name, dirent->namelen); - struct dentry *dentry; - struct dentry *alias; - struct inode *dir = d_inode(parent); - struct fuse_conn *fc; - struct inode *inode; - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); - - if (!o->nodeid) { - /* - * Unlike in the case of fuse_lookup, zero nodeid does not mean - * ENOENT. Instead, it only means the userspace filesystem did - * not want to return attributes/handle for this entry. - * - * So do nothing. - */ - return 0; - } - - if (name.name[0] == '.') { - /* - * We could potentially refresh the attributes of the directory - * and its parent? - */ - if (name.len == 1) - return 0; - if (name.name[1] == '.' 
&& name.len == 2) - return 0; - } - - if (invalid_nodeid(o->nodeid)) - return -EIO; - if (!fuse_valid_type(o->attr.mode)) - return -EIO; - - fc = get_fuse_conn(dir); - - name.hash = full_name_hash(parent, name.name, name.len); - dentry = d_lookup(parent, &name); - if (!dentry) { -retry: - dentry = d_alloc_parallel(parent, &name, &wq); - if (IS_ERR(dentry)) - return PTR_ERR(dentry); - } - if (!d_in_lookup(dentry)) { - struct fuse_inode *fi; - inode = d_inode(dentry); - if (!inode || - get_node_id(inode) != o->nodeid || - ((o->attr.mode ^ inode->i_mode) & S_IFMT)) { - d_invalidate(dentry); - dput(dentry); - goto retry; - } - if (is_bad_inode(inode)) { - dput(dentry); - return -EIO; - } - - fi = get_fuse_inode(inode); - spin_lock(&fc->lock); - fi->nlookup++; - spin_unlock(&fc->lock); - - forget_all_cached_acls(inode); - fuse_change_attributes(inode, &o->attr, - entry_attr_timeout(o), - attr_version); - /* - * The other branch comes via fuse_iget() - * which bumps nlookup inside - */ - } else { - inode = fuse_iget(dir->i_sb, o->nodeid, o->generation, - &o->attr, entry_attr_timeout(o), - attr_version); - if (!inode) - inode = ERR_PTR(-ENOMEM); - - alias = d_splice_alias(inode, dentry); - d_lookup_done(dentry); - if (alias) { - dput(dentry); - dentry = alias; - } - if (IS_ERR(dentry)) - return PTR_ERR(dentry); - } - if (fc->readdirplus_auto) - set_bit(FUSE_I_INIT_RDPLUS, &get_fuse_inode(inode)->state); - fuse_change_entry_timeout(dentry, o); - - dput(dentry); - return 0; -} - -static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file, - struct dir_context *ctx, u64 attr_version) -{ - struct fuse_direntplus *direntplus; - struct fuse_dirent *dirent; - size_t reclen; - int over = 0; - int ret; - - while (nbytes >= FUSE_NAME_OFFSET_DIRENTPLUS) { - direntplus = (struct fuse_direntplus *) buf; - dirent = &direntplus->dirent; - reclen = FUSE_DIRENTPLUS_SIZE(direntplus); - - if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX) - return -EIO; - if (reclen > 
nbytes) - break; - if (memchr(dirent->name, '/', dirent->namelen) != NULL) - return -EIO; - - if (!over) { - /* We fill entries into dstbuf only as much as - it can hold. But we still continue iterating - over remaining entries to link them. If not, - we need to send a FORGET for each of those - which we did not link. - */ - over = !dir_emit(ctx, dirent->name, dirent->namelen, - dirent->ino, dirent->type); - if (!over) - ctx->pos = dirent->off; - } - - buf += reclen; - nbytes -= reclen; - - ret = fuse_direntplus_link(file, direntplus, attr_version); - if (ret) - fuse_force_forget(file, direntplus->entry_out.nodeid); - } - - return 0; -} - -static int fuse_readdir(struct file *file, struct dir_context *ctx) -{ - int plus, err; - size_t nbytes; - struct page *page; - struct inode *inode = file_inode(file); - struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req; - u64 attr_version = 0; - bool locked; - - if (is_bad_inode(inode)) - return -EIO; - - req = fuse_get_req(fc, 1); - if (IS_ERR(req)) - return PTR_ERR(req); - - page = alloc_page(GFP_KERNEL); - if (!page) { - fuse_put_request(fc, req); - return -ENOMEM; - } - - plus = fuse_use_readdirplus(inode, ctx); - req->out.argpages = 1; - req->num_pages = 1; - req->pages[0] = page; - req->page_descs[0].length = PAGE_SIZE; - if (plus) { - attr_version = fuse_get_attr_version(fc); - fuse_read_fill(req, file, ctx->pos, PAGE_SIZE, - FUSE_READDIRPLUS); - } else { - fuse_read_fill(req, file, ctx->pos, PAGE_SIZE, - FUSE_READDIR); - } - locked = fuse_lock_inode(inode); - fuse_request_send(fc, req); - fuse_unlock_inode(inode, locked); - nbytes = req->out.args[0].size; - err = req->out.h.error; - fuse_put_request(fc, req); - if (!err) { - if (plus) { - err = parse_dirplusfile(page_address(page), nbytes, - file, ctx, - attr_version); - } else { - err = parse_dirfile(page_address(page), nbytes, file, - ctx); - } - } - - __free_page(page); - fuse_invalidate_atime(inode); - return err; -} - static const char 
*fuse_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 2c4272076f62..dfe10c2df6a9 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -704,6 +704,11 @@ static inline u64 get_node_id(struct inode *inode) return get_fuse_inode(inode)->nodeid; } +static inline int invalid_nodeid(u64 nodeid) +{ + return !nodeid || nodeid == FUSE_ROOT_ID; +} + /** Device operations */ extern const struct file_operations fuse_dev_operations; @@ -878,6 +883,9 @@ void fuse_invalidate_entry_cache(struct dentry *entry); void fuse_invalidate_atime(struct inode *inode); +u64 entry_attr_timeout(struct fuse_entry_out *o); +void fuse_change_entry_timeout(struct dentry *entry, struct fuse_entry_out *o); + /** * Acquire reference to fuse_conn */ @@ -997,4 +1005,8 @@ struct posix_acl; struct posix_acl *fuse_get_acl(struct inode *inode, int type); int fuse_set_acl(struct inode *inode, struct posix_acl *acl, int type); + +/* readdir.c */ +int fuse_readdir(struct file *file, struct dir_context *ctx); + #endif /* _FS_FUSE_I_H */ diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c new file mode 100644 index 000000000000..3e100e00e21e --- /dev/null +++ b/fs/fuse/readdir.c @@ -0,0 +1,259 @@ +/* + FUSE: Filesystem in Userspace + Copyright (C) 2001-2018 Miklos Szeredi + + This program can be distributed under the terms of the GNU GPL. + See the file COPYING. 
+*/ + + +#include "fuse_i.h" +#include + +static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx) +{ + struct fuse_conn *fc = get_fuse_conn(dir); + struct fuse_inode *fi = get_fuse_inode(dir); + + if (!fc->do_readdirplus) + return false; + if (!fc->readdirplus_auto) + return true; + if (test_and_clear_bit(FUSE_I_ADVISE_RDPLUS, &fi->state)) + return true; + if (ctx->pos == 0) + return true; + return false; +} + +static int parse_dirfile(char *buf, size_t nbytes, struct file *file, + struct dir_context *ctx) +{ + while (nbytes >= FUSE_NAME_OFFSET) { + struct fuse_dirent *dirent = (struct fuse_dirent *) buf; + size_t reclen = FUSE_DIRENT_SIZE(dirent); + if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX) + return -EIO; + if (reclen > nbytes) + break; + if (memchr(dirent->name, '/', dirent->namelen) != NULL) + return -EIO; + + if (!dir_emit(ctx, dirent->name, dirent->namelen, + dirent->ino, dirent->type)) + break; + + buf += reclen; + nbytes -= reclen; + ctx->pos = dirent->off; + } + + return 0; +} + +static int fuse_direntplus_link(struct file *file, + struct fuse_direntplus *direntplus, + u64 attr_version) +{ + struct fuse_entry_out *o = &direntplus->entry_out; + struct fuse_dirent *dirent = &direntplus->dirent; + struct dentry *parent = file->f_path.dentry; + struct qstr name = QSTR_INIT(dirent->name, dirent->namelen); + struct dentry *dentry; + struct dentry *alias; + struct inode *dir = d_inode(parent); + struct fuse_conn *fc; + struct inode *inode; + DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); + + if (!o->nodeid) { + /* + * Unlike in the case of fuse_lookup, zero nodeid does not mean + * ENOENT. Instead, it only means the userspace filesystem did + * not want to return attributes/handle for this entry. + * + * So do nothing. + */ + return 0; + } + + if (name.name[0] == '.') { + /* + * We could potentially refresh the attributes of the directory + * and its parent? + */ + if (name.len == 1) + return 0; + if (name.name[1] == '.' 
&& name.len == 2) + return 0; + } + + if (invalid_nodeid(o->nodeid)) + return -EIO; + if (!fuse_valid_type(o->attr.mode)) + return -EIO; + + fc = get_fuse_conn(dir); + + name.hash = full_name_hash(parent, name.name, name.len); + dentry = d_lookup(parent, &name); + if (!dentry) { +retry: + dentry = d_alloc_parallel(parent, &name, &wq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + } + if (!d_in_lookup(dentry)) { + struct fuse_inode *fi; + inode = d_inode(dentry); + if (!inode || + get_node_id(inode) != o->nodeid || + ((o->attr.mode ^ inode->i_mode) & S_IFMT)) { + d_invalidate(dentry); + dput(dentry); + goto retry; + } + if (is_bad_inode(inode)) { + dput(dentry); + return -EIO; + } + + fi = get_fuse_inode(inode); + spin_lock(&fc->lock); + fi->nlookup++; + spin_unlock(&fc->lock); + + forget_all_cached_acls(inode); + fuse_change_attributes(inode, &o->attr, + entry_attr_timeout(o), + attr_version); + /* + * The other branch comes via fuse_iget() + * which bumps nlookup inside + */ + } else { + inode = fuse_iget(dir->i_sb, o->nodeid, o->generation, + &o->attr, entry_attr_timeout(o), + attr_version); + if (!inode) + inode = ERR_PTR(-ENOMEM); + + alias = d_splice_alias(inode, dentry); + d_lookup_done(dentry); + if (alias) { + dput(dentry); + dentry = alias; + } + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + } + if (fc->readdirplus_auto) + set_bit(FUSE_I_INIT_RDPLUS, &get_fuse_inode(inode)->state); + fuse_change_entry_timeout(dentry, o); + + dput(dentry); + return 0; +} + +static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file, + struct dir_context *ctx, u64 attr_version) +{ + struct fuse_direntplus *direntplus; + struct fuse_dirent *dirent; + size_t reclen; + int over = 0; + int ret; + + while (nbytes >= FUSE_NAME_OFFSET_DIRENTPLUS) { + direntplus = (struct fuse_direntplus *) buf; + dirent = &direntplus->dirent; + reclen = FUSE_DIRENTPLUS_SIZE(direntplus); + + if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX) + return -EIO; + if (reclen > 
nbytes) + break; + if (memchr(dirent->name, '/', dirent->namelen) != NULL) + return -EIO; + + if (!over) { + /* We fill entries into dstbuf only as much as + it can hold. But we still continue iterating + over remaining entries to link them. If not, + we need to send a FORGET for each of those + which we did not link. + */ + over = !dir_emit(ctx, dirent->name, dirent->namelen, + dirent->ino, dirent->type); + if (!over) + ctx->pos = dirent->off; + } + + buf += reclen; + nbytes -= reclen; + + ret = fuse_direntplus_link(file, direntplus, attr_version); + if (ret) + fuse_force_forget(file, direntplus->entry_out.nodeid); + } + + return 0; +} + +int fuse_readdir(struct file *file, struct dir_context *ctx) +{ + int plus, err; + size_t nbytes; + struct page *page; + struct inode *inode = file_inode(file); + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req; + u64 attr_version = 0; + bool locked; + + if (is_bad_inode(inode)) + return -EIO; + + req = fuse_get_req(fc, 1); + if (IS_ERR(req)) + return PTR_ERR(req); + + page = alloc_page(GFP_KERNEL); + if (!page) { + fuse_put_request(fc, req); + return -ENOMEM; + } + + plus = fuse_use_readdirplus(inode, ctx); + req->out.argpages = 1; + req->num_pages = 1; + req->pages[0] = page; + req->page_descs[0].length = PAGE_SIZE; + if (plus) { + attr_version = fuse_get_attr_version(fc); + fuse_read_fill(req, file, ctx->pos, PAGE_SIZE, + FUSE_READDIRPLUS); + } else { + fuse_read_fill(req, file, ctx->pos, PAGE_SIZE, + FUSE_READDIR); + } + locked = fuse_lock_inode(inode); + fuse_request_send(fc, req); + fuse_unlock_inode(inode, locked); + nbytes = req->out.args[0].size; + err = req->out.h.error; + fuse_put_request(fc, req); + if (!err) { + if (plus) { + err = parse_dirplusfile(page_address(page), nbytes, + file, ctx, attr_version); + } else { + err = parse_dirfile(page_address(page), nbytes, file, + ctx); + } + } + + __free_page(page); + fuse_invalidate_atime(inode); + return err; +} -- cgit v1.2.3 From 
18172b10b674a7cd5340b2dd70202ce6622400bd Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 28 Sep 2018 16:43:23 +0200 Subject: fuse: extract fuse_emit() helper Prepare for cache filling by introducing a helper for emitting a single directory entry. Signed-off-by: Miklos Szeredi --- fs/fuse/readdir.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index 3e100e00e21e..65336c93c1f4 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -26,6 +26,13 @@ static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx) return false; } +static bool fuse_emit(struct file *file, struct dir_context *ctx, + struct fuse_dirent *dirent) +{ + return dir_emit(ctx, dirent->name, dirent->namelen, dirent->ino, + dirent->type); +} + static int parse_dirfile(char *buf, size_t nbytes, struct file *file, struct dir_context *ctx) { @@ -39,8 +46,7 @@ static int parse_dirfile(char *buf, size_t nbytes, struct file *file, if (memchr(dirent->name, '/', dirent->namelen) != NULL) return -EIO; - if (!dir_emit(ctx, dirent->name, dirent->namelen, - dirent->ino, dirent->type)) + if (!fuse_emit(file, ctx, dirent)) break; buf += reclen; @@ -183,8 +189,7 @@ static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file, we need to send a FORGET for each of those which we did not link. */ - over = !dir_emit(ctx, dirent->name, dirent->namelen, - dirent->ino, dirent->type); + over = !fuse_emit(file, ctx, dirent); if (!over) ctx->pos = dirent->off; } -- cgit v1.2.3 From 69e34551152a286f827d54dcb5700da6aeaac1fb Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 1 Oct 2018 10:07:04 +0200 Subject: fuse: allow caching readdir This patch just adds the cache filling functions, which are invoked if FOPEN_CACHE_DIR flag is set in the OPENDIR reply. Cache reading and cache invalidation are added by subsequent patches. The directory cache uses the page cache. 
Directory entries are packed into a page in the same format as in the READDIR reply. A page only contains whole entries, the space at the end of the page is cleared. The page is locked while being modified. Multiple parallel readdirs on the same directory can fill the cache; the only constraint is that continuity must be maintained (d_off of last entry points to position of current entry). Signed-off-by: Miklos Szeredi --- fs/fuse/fuse_i.h | 15 +++++++++ fs/fuse/inode.c | 4 +++ fs/fuse/readdir.c | 91 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 109 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index dfe10c2df6a9..d2fa7588533e 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -103,6 +103,21 @@ struct fuse_inode { /** List of writepage requestst (pending or sent) */ struct list_head writepages; + /* readdir cache */ + struct { + /* true if fully cached */ + bool cached; + + /* size of cache */ + loff_t size; + + /* position at end of cache (position of next entry) */ + loff_t pos; + + /* protects above fields */ + spinlock_t lock; + } rdc; + /** Miscellaneous bits describing inode state */ unsigned long state; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 9383b47b3d9b..892efe6351eb 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -100,6 +100,10 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) INIT_LIST_HEAD(&fi->queued_writes); INIT_LIST_HEAD(&fi->writepages); init_waitqueue_head(&fi->page_waitq); + spin_lock_init(&fi->rdc.lock); + fi->rdc.cached = false; + fi->rdc.size = 0; + fi->rdc.pos = 0; mutex_init(&fi->mutex); fi->forget = fuse_alloc_forget(); if (!fi->forget) { diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index 65336c93c1f4..6c5ada164f7e 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -9,6 +9,8 @@ #include "fuse_i.h" #include +#include +#include static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx) { @@ -26,9 
+28,91 @@ static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx) return false; } +static void fuse_add_dirent_to_cache(struct file *file, + struct fuse_dirent *dirent, loff_t pos) +{ + struct fuse_inode *fi = get_fuse_inode(file_inode(file)); + size_t reclen = FUSE_DIRENT_SIZE(dirent); + pgoff_t index; + struct page *page; + loff_t size; + unsigned int offset; + void *addr; + + spin_lock(&fi->rdc.lock); + /* + * Is cache already completed? Or this entry does not go at the end of + * cache? + */ + if (fi->rdc.cached || pos != fi->rdc.pos) { + spin_unlock(&fi->rdc.lock); + return; + } + size = fi->rdc.size; + offset = size & ~PAGE_MASK; + index = size >> PAGE_SHIFT; + /* Dirent doesn't fit in current page? Jump to next page. */ + if (offset + reclen > PAGE_SIZE) { + index++; + offset = 0; + } + spin_unlock(&fi->rdc.lock); + + if (offset) { + page = find_lock_page(file->f_mapping, index); + } else { + page = find_or_create_page(file->f_mapping, index, + mapping_gfp_mask(file->f_mapping)); + } + if (!page) + return; + + spin_lock(&fi->rdc.lock); + /* Raced with another readdir */ + if (fi->rdc.size != size || WARN_ON(fi->rdc.pos != pos)) + goto unlock; + + addr = kmap_atomic(page); + if (!offset) + clear_page(addr); + memcpy(addr + offset, dirent, reclen); + kunmap_atomic(addr); + fi->rdc.size = (index << PAGE_SHIFT) + offset + reclen; + fi->rdc.pos = dirent->off; +unlock: + spin_unlock(&fi->rdc.lock); + unlock_page(page); + put_page(page); +} + +static void fuse_readdir_cache_end(struct file *file, loff_t pos) +{ + struct fuse_inode *fi = get_fuse_inode(file_inode(file)); + loff_t end; + + spin_lock(&fi->rdc.lock); + /* does cache end position match current position? 
*/ + if (fi->rdc.pos != pos) { + spin_unlock(&fi->rdc.lock); + return; + } + + fi->rdc.cached = true; + end = ALIGN(fi->rdc.size, PAGE_SIZE); + spin_unlock(&fi->rdc.lock); + + /* truncate unused tail of cache */ + truncate_inode_pages(file->f_mapping, end); +} + static bool fuse_emit(struct file *file, struct dir_context *ctx, struct fuse_dirent *dirent) { + struct fuse_file *ff = file->private_data; + + if (ff->open_flags & FOPEN_CACHE_DIR) + fuse_add_dirent_to_cache(file, dirent, ctx->pos); + return dir_emit(ctx, dirent->name, dirent->namelen, dirent->ino, dirent->type); } @@ -249,7 +333,12 @@ int fuse_readdir(struct file *file, struct dir_context *ctx) err = req->out.h.error; fuse_put_request(fc, req); if (!err) { - if (plus) { + if (!nbytes) { + struct fuse_file *ff = file->private_data; + + if (ff->open_flags & FOPEN_CACHE_DIR) + fuse_readdir_cache_end(file, ctx->pos); + } else if (plus) { err = parse_dirplusfile(page_address(page), nbytes, file, ctx, attr_version); } else { -- cgit v1.2.3 From 5d7bc7e8680c7ca4c8a4f139ce2a54ccb8131ef0 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 1 Oct 2018 10:07:04 +0200 Subject: fuse: allow using readdir cache The cache is only used if it's completed, not while it's still being filled; this constraint could be lifted later, if it turns out to be useful. Introduce state in struct fuse_file that indicates the position within the cache. After a seek, reset the position to the beginning of the cache and search the cache for the current position. If the current position is not found in the cache, then fall back to uncached readdir. It can also happen that page(s) disappear from the cache, in which case we must also fall back to uncached readdir. 
Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 2 + fs/fuse/fuse_i.h | 15 ++++++ fs/fuse/readdir.c | 148 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 161 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index d15c14912e72..e10c0443c56f 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -59,6 +59,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) } INIT_LIST_HEAD(&ff->write_entry); + mutex_init(&ff->readdir.lock); refcount_set(&ff->count, 1); RB_CLEAR_NODE(&ff->polled_node); init_waitqueue_head(&ff->poll_wait); @@ -73,6 +74,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) void fuse_file_free(struct fuse_file *ff) { fuse_request_free(ff->reserved_req); + mutex_destroy(&ff->readdir.lock); kfree(ff); } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index d2fa7588533e..49e42635e3ac 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -163,6 +163,21 @@ struct fuse_file { /** Entry on inode's write_files list */ struct list_head write_entry; + /* Readdir related */ + struct { + /* + * Protects below fields against (crazy) parallel readdir on + * same open file. Uncontended in the normal case. 
+ */ + struct mutex lock; + + /* Dir stream position */ + loff_t pos; + + /* Offset in cache */ + loff_t cache_off; + } readdir; + /** RB node to be linked on fuse_conn->polled_files */ struct rb_node polled_node; diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index 6c5ada164f7e..5bdc0b945d72 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -289,7 +289,7 @@ static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file, return 0; } -int fuse_readdir(struct file *file, struct dir_context *ctx) +static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx) { int plus, err; size_t nbytes; @@ -300,9 +300,6 @@ int fuse_readdir(struct file *file, struct dir_context *ctx) u64 attr_version = 0; bool locked; - if (is_bad_inode(inode)) - return -EIO; - req = fuse_get_req(fc, 1); if (IS_ERR(req)) return PTR_ERR(req); @@ -351,3 +348,146 @@ int fuse_readdir(struct file *file, struct dir_context *ctx) fuse_invalidate_atime(inode); return err; } + +enum fuse_parse_result { + FOUND_ERR = -1, + FOUND_NONE = 0, + FOUND_SOME, + FOUND_ALL, +}; + +static enum fuse_parse_result fuse_parse_cache(struct fuse_file *ff, + void *addr, unsigned int size, + struct dir_context *ctx) +{ + unsigned int offset = ff->readdir.cache_off & ~PAGE_MASK; + enum fuse_parse_result res = FOUND_NONE; + + WARN_ON(offset >= size); + + for (;;) { + struct fuse_dirent *dirent = addr + offset; + unsigned int nbytes = size - offset; + size_t reclen = FUSE_DIRENT_SIZE(dirent); + + if (nbytes < FUSE_NAME_OFFSET || !dirent->namelen) + break; + + if (WARN_ON(dirent->namelen > FUSE_NAME_MAX)) + return FOUND_ERR; + if (WARN_ON(reclen > nbytes)) + return FOUND_ERR; + if (WARN_ON(memchr(dirent->name, '/', dirent->namelen) != NULL)) + return FOUND_ERR; + + if (ff->readdir.pos == ctx->pos) { + res = FOUND_SOME; + if (!dir_emit(ctx, dirent->name, dirent->namelen, + dirent->ino, dirent->type)) + return FOUND_ALL; + ctx->pos = dirent->off; + } + ff->readdir.pos = dirent->off; + 
ff->readdir.cache_off += reclen; + + offset += reclen; + } + + return res; +} + +#define UNCACHED 1 + +static int fuse_readdir_cached(struct file *file, struct dir_context *ctx) +{ + struct fuse_file *ff = file->private_data; + struct inode *inode = file_inode(file); + struct fuse_inode *fi = get_fuse_inode(inode); + enum fuse_parse_result res; + pgoff_t index; + unsigned int size; + struct page *page; + void *addr; + + /* Seeked? If so, reset the cache stream */ + if (ff->readdir.pos != ctx->pos) { + ff->readdir.pos = 0; + ff->readdir.cache_off = 0; + } + +retry: + spin_lock(&fi->rdc.lock); + if (!fi->rdc.cached) { + spin_unlock(&fi->rdc.lock); + return UNCACHED; + } + WARN_ON(fi->rdc.size < ff->readdir.cache_off); + + index = ff->readdir.cache_off >> PAGE_SHIFT; + + if (index == (fi->rdc.size >> PAGE_SHIFT)) + size = fi->rdc.size & ~PAGE_MASK; + else + size = PAGE_SIZE; + spin_unlock(&fi->rdc.lock); + + /* EOF? */ + if ((ff->readdir.cache_off & ~PAGE_MASK) == size) + return 0; + + page = find_get_page_flags(file->f_mapping, index, + FGP_ACCESSED | FGP_LOCK); + if (!page) { + /* + * Uh-oh: page gone missing, cache is useless + */ + return UNCACHED; + } + + addr = kmap(page); + res = fuse_parse_cache(ff, addr, size, ctx); + kunmap(page); + unlock_page(page); + put_page(page); + + if (res == FOUND_ERR) + return -EIO; + + if (res == FOUND_ALL) + return 0; + + if (size == PAGE_SIZE) { + /* We hit end of page: skip to next page. */ + ff->readdir.cache_off = ALIGN(ff->readdir.cache_off, PAGE_SIZE); + goto retry; + } + + /* + * End of cache reached. If found position, then we are done, otherwise + * need to fall back to uncached, since the position we were looking for + * wasn't in the cache. + */ + return res == FOUND_SOME ? 
0 : UNCACHED; +} + +int fuse_readdir(struct file *file, struct dir_context *ctx) +{ + struct fuse_file *ff = file->private_data; + struct inode *inode = file_inode(file); + int err; + + if (is_bad_inode(inode)) + return -EIO; + + mutex_lock(&ff->readdir.lock); + + err = UNCACHED; + if (ff->open_flags & FOPEN_CACHE_DIR) + err = fuse_readdir_cached(file, ctx); + if (err == UNCACHED) + err = fuse_readdir_uncached(file, ctx); + + mutex_unlock(&ff->readdir.lock); + + return err; +} -- cgit v1.2.3 From 3494927e090bf511e54eecaf33a8e56e5c0463db Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 1 Oct 2018 10:07:04 +0200 Subject: fuse: add readdir cache version Allow the cache to be invalidated when page(s) have gone missing. In this case increment the version of the cache and reset to an empty state. Add a version number to the directory stream in struct fuse_file as well, indicating the version of the cache it's supposed to be reading. If the cache version doesn't match the stream's version, then reset the stream to the beginning of the cache. 
Signed-off-by: Miklos Szeredi --- fs/fuse/fuse_i.h | 7 +++++++ fs/fuse/inode.c | 1 + fs/fuse/readdir.c | 45 ++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 52 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 49e42635e3ac..8b24805e62ee 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -114,6 +114,9 @@ struct fuse_inode { /* position at end of cache (position of next entry) */ loff_t pos; + /* version of the cache */ + u64 version; + /* protects above fields */ spinlock_t lock; } rdc; @@ -176,6 +179,10 @@ struct fuse_file { /* Offset in cache */ loff_t cache_off; + + /* Version of cache we are reading */ + u64 version; + } readdir; /** RB node to be linked on fuse_conn->polled_files */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 892efe6351eb..eef2ae713f75 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -104,6 +104,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) fi->rdc.cached = false; fi->rdc.size = 0; fi->rdc.pos = 0; + fi->rdc.version = 0; mutex_init(&fi->mutex); fi->forget = fuse_alloc_forget(); if (!fi->forget) { diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index 5bdc0b945d72..18318cc31c05 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -36,6 +36,7 @@ static void fuse_add_dirent_to_cache(struct file *file, pgoff_t index; struct page *page; loff_t size; + u64 version; unsigned int offset; void *addr; @@ -48,6 +49,7 @@ static void fuse_add_dirent_to_cache(struct file *file, spin_unlock(&fi->rdc.lock); return; } + version = fi->rdc.version; size = fi->rdc.size; offset = size & ~PAGE_MASK; index = size >> PAGE_SHIFT; @@ -69,7 +71,8 @@ static void fuse_add_dirent_to_cache(struct file *file, spin_lock(&fi->rdc.lock); /* Raced with another readdir */ - if (fi->rdc.size != size || WARN_ON(fi->rdc.pos != pos)) + if (fi->rdc.version != version || fi->rdc.size != size || + WARN_ON(fi->rdc.pos != pos)) goto unlock; addr = kmap_atomic(page); @@ -396,6 
+399,14 @@ static enum fuse_parse_result fuse_parse_cache(struct fuse_file *ff, return res; } +static void fuse_rdc_reset(struct fuse_inode *fi) +{ + fi->rdc.cached = false; + fi->rdc.version++; + fi->rdc.size = 0; + fi->rdc.pos = 0; +} + #define UNCACHED 1 static int fuse_readdir_cached(struct file *file, struct dir_context *ctx) @@ -421,6 +432,21 @@ retry: spin_unlock(&fi->rdc.lock); return UNCACHED; } + /* + * If cache version changed since the last getdents() call, then reset + * the cache stream. + */ + if (ff->readdir.version != fi->rdc.version) { + ff->readdir.pos = 0; + ff->readdir.cache_off = 0; + } + /* + * If at the beginning of the cache, then reset version to + * current. + */ + if (ff->readdir.pos == 0) + ff->readdir.version = fi->rdc.version; + WARN_ON(fi->rdc.size < ff->readdir.cache_off); index = ff->readdir.cache_off >> PAGE_SHIFT; @@ -437,13 +463,30 @@ retry: page = find_get_page_flags(file->f_mapping, index, FGP_ACCESSED | FGP_LOCK); + spin_lock(&fi->rdc.lock); if (!page) { /* * Uh-oh: page gone missing, cache is useless */ + if (fi->rdc.version == ff->readdir.version) + fuse_rdc_reset(fi); + spin_unlock(&fi->rdc.lock); return UNCACHED; } + /* Make sure it's still the same version after getting the page. */ + if (ff->readdir.version != fi->rdc.version) { + spin_unlock(&fi->rdc.lock); + unlock_page(page); + put_page(page); + goto retry; + } + spin_unlock(&fi->rdc.lock); + + /* + * Contents of the page are now protected against changing by holding + * the page lock. + */ addr = kmap(page); res = fuse_parse_cache(ff, addr, size, ctx); kunmap(page); -- cgit v1.2.3 From 7118883b44b8edfea732aadeb0d4424da3f152b2 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 1 Oct 2018 10:07:04 +0200 Subject: fuse: use mtime for readdir cache verification Store the modification time of the directory in the cache, obtained before starting to fill the cache. 
When reading the cache, verify that the directory hasn't changed, by checking if current modification time is the same as the one stored in the cache. This only needs to be done when the current file position is at the beginning of the directory, as mandated by POSIX. Signed-off-by: Miklos Szeredi --- fs/fuse/fuse_i.h | 3 +++ fs/fuse/readdir.c | 38 ++++++++++++++++++++++++++++++++++---- 2 files changed, 37 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 8b24805e62ee..3deb013a289e 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -117,6 +117,9 @@ struct fuse_inode { /* version of the cache */ u64 version; + /* modification time of directory when cache was started */ + struct timespec64 mtime; + /* protects above fields */ spinlock_t lock; } rdc; diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index 18318cc31c05..dafd6543cca2 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -399,8 +399,10 @@ static enum fuse_parse_result fuse_parse_cache(struct fuse_file *ff, return res; } -static void fuse_rdc_reset(struct fuse_inode *fi) +static void fuse_rdc_reset(struct inode *inode) { + struct fuse_inode *fi = get_fuse_inode(inode); + fi->rdc.cached = false; fi->rdc.version++; fi->rdc.size = 0; @@ -413,6 +415,7 @@ static int fuse_readdir_cached(struct file *file, struct dir_context *ctx) { struct fuse_file *ff = file->private_data; struct inode *inode = file_inode(file); + struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); enum fuse_parse_result res; pgoff_t index; @@ -426,12 +429,40 @@ static int fuse_readdir_cached(struct file *file, struct dir_context *ctx) ff->readdir.cache_off = 0; } + /* + * We're just about to start reading into the cache or reading the + * cache; both cases require an up-to-date mtime value. 
+ */ + if (!ctx->pos && fc->auto_inval_data) { + int err = fuse_update_attributes(inode, file); + + if (err) + return err; + } + retry: spin_lock(&fi->rdc.lock); +retry_locked: if (!fi->rdc.cached) { + /* Starting cache? Set cache mtime. */ + if (!ctx->pos && !fi->rdc.size) { + fi->rdc.mtime = inode->i_mtime; + } spin_unlock(&fi->rdc.lock); return UNCACHED; } + /* + * When at the beginning of the directory (i.e. just after opendir(3) or + * rewinddir(3)), then need to check whether directory contents have + * changed, and reset the cache if so. + */ + if (!ctx->pos) { + if (!timespec64_equal(&fi->rdc.mtime, &inode->i_mtime)) { + fuse_rdc_reset(inode); + goto retry_locked; + } + } + /* * If cache version changed since the last getdents() call, then reset * the cache stream. @@ -469,9 +500,8 @@ retry: * Uh-oh: page gone missing, cache is useless */ if (fi->rdc.version == ff->readdir.version) - fuse_rdc_reset(fi); - spin_unlock(&fi->rdc.lock); - return UNCACHED; + fuse_rdc_reset(inode); + goto retry_locked; } /* Make sure it's still the same version after getting the page. */ -- cgit v1.2.3 From 261aaba72fdba17b74a3a434d9f925b43d90e958 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 1 Oct 2018 10:07:05 +0200 Subject: fuse: use iversion for readdir cache verification Use the internal iversion counter to make sure modifications of the directory through this filesystem are not missed by the mtime check (due to mtime granularity). 
Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 21 ++++++++++++++------- fs/fuse/fuse_i.h | 3 +++ fs/fuse/readdir.c | 5 ++++- 3 files changed, 21 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 3a333b0ea9ad..6800fdc3e730 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -14,6 +14,7 @@ #include #include #include +#include #include static void fuse_advise_use_readdirplus(struct inode *dir) @@ -89,6 +90,12 @@ void fuse_invalidate_attr(struct inode *inode) get_fuse_inode(inode)->i_time = 0; } +static void fuse_dir_changed(struct inode *dir) +{ + fuse_invalidate_attr(dir); + inode_maybe_inc_iversion(dir, false); +} + /** * Mark the attributes as stale due to an atime change. Avoid the invalidate if * atime is not used. @@ -447,7 +454,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, kfree(forget); d_instantiate(entry, inode); fuse_change_entry_timeout(entry, &outentry); - fuse_invalidate_attr(dir); + fuse_dir_changed(dir); err = finish_open(file, entry, generic_file_open); if (err) { fuse_sync_release(ff, flags); @@ -561,7 +568,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_args *args, } else { fuse_change_entry_timeout(entry, &outarg); } - fuse_invalidate_attr(dir); + fuse_dir_changed(dir); return 0; out_put_forget_req: @@ -671,7 +678,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) drop_nlink(inode); spin_unlock(&fc->lock); fuse_invalidate_attr(inode); - fuse_invalidate_attr(dir); + fuse_dir_changed(dir); fuse_invalidate_entry_cache(entry); fuse_update_ctime(inode); } else if (err == -EINTR) @@ -693,7 +700,7 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) err = fuse_simple_request(fc, &args); if (!err) { clear_nlink(d_inode(entry)); - fuse_invalidate_attr(dir); + fuse_dir_changed(dir); fuse_invalidate_entry_cache(entry); } else if (err == -EINTR) fuse_invalidate_entry(entry); @@ -732,9 +739,9 @@ static int fuse_rename_common(struct inode 
*olddir, struct dentry *oldent, fuse_update_ctime(d_inode(newent)); } - fuse_invalidate_attr(olddir); + fuse_dir_changed(olddir); if (olddir != newdir) - fuse_invalidate_attr(newdir); + fuse_dir_changed(newdir); /* newent will end up negative */ if (!(flags & RENAME_EXCHANGE) && d_really_is_positive(newent)) { @@ -967,7 +974,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, if (!entry) goto unlock; - fuse_invalidate_attr(parent); + fuse_dir_changed(parent); fuse_invalidate_entry(entry); if (child_nodeid != 0 && d_really_is_positive(entry)) { diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 3deb013a289e..d9d1ea78efa6 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -120,6 +120,9 @@ struct fuse_inode { /* modification time of directory when cache was started */ struct timespec64 mtime; + /* iversion of directory when cache was started */ + u64 iversion; + /* protects above fields */ spinlock_t lock; } rdc; diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index dafd6543cca2..ab18b78f4755 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -8,6 +8,7 @@ #include "fuse_i.h" +#include #include #include #include @@ -447,6 +448,7 @@ retry_locked: /* Starting cache? Set cache mtime. */ if (!ctx->pos && !fi->rdc.size) { fi->rdc.mtime = inode->i_mtime; + fi->rdc.iversion = inode_query_iversion(inode); } spin_unlock(&fi->rdc.lock); return UNCACHED; @@ -457,7 +459,8 @@ retry_locked: * changed, and reset the cache if so. 
*/ if (!ctx->pos) { - if (!timespec64_equal(&fi->rdc.mtime, &inode->i_mtime)) { + if (inode_peek_iversion(inode) != fi->rdc.iversion || + !timespec64_equal(&fi->rdc.mtime, &inode->i_mtime)) { fuse_rdc_reset(inode); goto retry_locked; } -- cgit v1.2.3 From ab2257e9941b9ef28d4a4a451e4b146d40a21e18 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 1 Oct 2018 10:07:05 +0200 Subject: fuse: reduce size of struct fuse_inode Do this by grouping fields used for cached writes and putting them into a union with fields used for cached readdir (with obviously no overlap, since we don't have hybrid objects). Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 13 +++++++++++- fs/fuse/file.c | 8 ++++++++ fs/fuse/fuse_i.h | 62 +++++++++++++++++++++++++++++++------------------------- fs/fuse/inode.c | 16 ++++----------- 4 files changed, 58 insertions(+), 41 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 6800fdc3e730..d1b2f42d746e 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1414,8 +1414,11 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, file = NULL; } - if (attr->ia_valid & ATTR_SIZE) + if (attr->ia_valid & ATTR_SIZE) { + if (WARN_ON(!S_ISREG(inode->i_mode))) + return -EIO; is_truncate = true; + } if (is_truncate) { fuse_set_nowrite(inode); @@ -1619,8 +1622,16 @@ void fuse_init_common(struct inode *inode) void fuse_init_dir(struct inode *inode) { + struct fuse_inode *fi = get_fuse_inode(inode); + inode->i_op = &fuse_dir_inode_operations; inode->i_fop = &fuse_dir_operations; + + spin_lock_init(&fi->rdc.lock); + fi->rdc.cached = false; + fi->rdc.size = 0; + fi->rdc.pos = 0; + fi->rdc.version = 0; } void fuse_init_symlink(struct inode *inode) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index e10c0443c56f..b10d14baeb1f 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -3143,6 +3143,14 @@ static const struct address_space_operations fuse_file_aops = { void fuse_init_file_inode(struct inode *inode) { + struct fuse_inode *fi
= get_fuse_inode(inode); + inode->i_fop = &fuse_file_operations; inode->i_data.a_ops = &fuse_file_aops; + + INIT_LIST_HEAD(&fi->write_files); + INIT_LIST_HEAD(&fi->queued_writes); + fi->writectr = 0; + init_waitqueue_head(&fi->page_waitq); + INIT_LIST_HEAD(&fi->writepages); } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index d9d1ea78efa6..f5bdce84e766 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -87,45 +87,51 @@ struct fuse_inode { /** Version of last attribute change */ u64 attr_version; - /** Files usable in writepage. Protected by fc->lock */ - struct list_head write_files; + union { + /* Write related fields (regular file only) */ + struct { + /* Files usable in writepage. Protected by fc->lock */ + struct list_head write_files; - /** Writepages pending on truncate or fsync */ - struct list_head queued_writes; + /* Writepages pending on truncate or fsync */ + struct list_head queued_writes; - /** Number of sent writes, a negative bias (FUSE_NOWRITE) - * means more writes are blocked */ - int writectr; + /* Number of sent writes, a negative bias + * (FUSE_NOWRITE) means more writes are blocked */ + int writectr; - /** Waitq for writepage completion */ - wait_queue_head_t page_waitq; + /* Waitq for writepage completion */ + wait_queue_head_t page_waitq; - /** List of writepage requestst (pending or sent) */ - struct list_head writepages; + /* List of writepage requestst (pending or sent) */ + struct list_head writepages; + }; - /* readdir cache */ - struct { - /* true if fully cached */ - bool cached; + /* readdir cache (directory only) */ + struct { + /* true if fully cached */ + bool cached; - /* size of cache */ - loff_t size; + /* size of cache */ + loff_t size; - /* position at end of cache (position of next entry) */ - loff_t pos; + /* position at end of cache (position of next entry) */ + loff_t pos; - /* version of the cache */ - u64 version; + /* version of the cache */ + u64 version; - /* modification time of directory when cache was 
started */ - struct timespec64 mtime; + /* modification time of directory when cache was + * started */ + struct timespec64 mtime; - /* iversion of directory when cache was started */ - u64 iversion; + /* iversion of directory when cache was started */ + u64 iversion; - /* protects above fields */ - spinlock_t lock; - } rdc; + /* protects above fields */ + spinlock_t lock; + } rdc; + }; /** Miscellaneous bits describing inode state */ unsigned long state; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index eef2ae713f75..82db1ab53420 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -93,18 +93,8 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) fi->nodeid = 0; fi->nlookup = 0; fi->attr_version = 0; - fi->writectr = 0; fi->orig_ino = 0; fi->state = 0; - INIT_LIST_HEAD(&fi->write_files); - INIT_LIST_HEAD(&fi->queued_writes); - INIT_LIST_HEAD(&fi->writepages); - init_waitqueue_head(&fi->page_waitq); - spin_lock_init(&fi->rdc.lock); - fi->rdc.cached = false; - fi->rdc.size = 0; - fi->rdc.pos = 0; - fi->rdc.version = 0; mutex_init(&fi->mutex); fi->forget = fuse_alloc_forget(); if (!fi->forget) { @@ -124,8 +114,10 @@ static void fuse_i_callback(struct rcu_head *head) static void fuse_destroy_inode(struct inode *inode) { struct fuse_inode *fi = get_fuse_inode(inode); - BUG_ON(!list_empty(&fi->write_files)); - BUG_ON(!list_empty(&fi->queued_writes)); + if (S_ISREG(inode->i_mode)) { + WARN_ON(!list_empty(&fi->write_files)); + WARN_ON(!list_empty(&fi->queued_writes)); + } mutex_destroy(&fi->mutex); kfree(fi->forget); call_rcu(&inode->i_rcu, fuse_i_callback); -- cgit v1.2.3 From 8a7aa286ab67d7dfac8abbefab899597b5977c9a Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 1 Oct 2018 10:07:05 +0200 Subject: fuse: allocate page array more efficiently When allocating page array for a request the array for the page pointers and the array for page descriptors are allocated by two separate kmalloc() calls. Merge these into one allocation. 
Also instead of initializing the request and the page arrays with memset(), use the zeroing allocation variants. Reserved requests never carry pages (page array size is zero). Make that explicit by initializing the page array pointers to NULL and make sure the assumption remains true by adding a WARN_ON(). Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 44 ++++++++++++++++++-------------------------- 1 file changed, 18 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 91b4ecf85dc7..fefb9dd8a2f4 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -44,9 +44,6 @@ static void fuse_request_init(struct fuse_req *req, struct page **pages, struct fuse_page_desc *page_descs, unsigned npages) { - memset(req, 0, sizeof(*req)); - memset(pages, 0, sizeof(*pages) * npages); - memset(page_descs, 0, sizeof(*page_descs) * npages); INIT_LIST_HEAD(&req->list); INIT_LIST_HEAD(&req->intr_entry); init_waitqueue_head(&req->waitq); @@ -59,28 +56,22 @@ static void fuse_request_init(struct fuse_req *req, struct page **pages, static struct fuse_req *__fuse_request_alloc(unsigned npages, gfp_t flags) { - struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, flags); + struct fuse_req *req = kmem_cache_zalloc(fuse_req_cachep, flags); if (req) { - struct page **pages; - struct fuse_page_desc *page_descs; - - if (npages <= FUSE_REQ_INLINE_PAGES) { + struct page **pages = NULL; + struct fuse_page_desc *page_descs = NULL; + + if (npages > FUSE_REQ_INLINE_PAGES) { + pages = kzalloc(npages * (sizeof(*pages) + + sizeof(*page_descs)), flags); + if (!pages) { + kmem_cache_free(fuse_req_cachep, req); + return NULL; + } + page_descs = (void *) pages + npages * sizeof(*pages); + } else if (npages) { pages = req->inline_pages; page_descs = req->inline_page_descs; - } else { - pages = kmalloc_array(npages, sizeof(struct page *), - flags); - page_descs = - kmalloc_array(npages, - sizeof(struct fuse_page_desc), - flags); - } - - if (!pages || !page_descs) { - 
kfree(pages); - kfree(page_descs); - kmem_cache_free(fuse_req_cachep, req); - return NULL; } fuse_request_init(req, pages, page_descs, npages); @@ -101,10 +92,9 @@ struct fuse_req *fuse_request_alloc_nofs(unsigned npages) void fuse_request_free(struct fuse_req *req) { - if (req->pages != req->inline_pages) { + if (req->pages != req->inline_pages) kfree(req->pages); - kfree(req->page_descs); - } + kmem_cache_free(fuse_req_cachep, req); } @@ -239,8 +229,10 @@ static void put_reserved_req(struct fuse_conn *fc, struct fuse_req *req) struct file *file = req->stolen_file; struct fuse_file *ff = file->private_data; + WARN_ON(req->max_pages); spin_lock(&fc->lock); - fuse_request_init(req, req->pages, req->page_descs, req->max_pages); + memset(req, 0, sizeof(*req)); + fuse_request_init(req, NULL, NULL, 0); BUG_ON(ff->reserved_req); ff->reserved_req = req; wake_up_all(&fc->reserved_req_waitq); -- cgit v1.2.3 From 5da784cce4308ae10a79e3c8c41b13fb9568e4e0 Mon Sep 17 00:00:00 2001 From: Constantine Shulyupin Date: Thu, 6 Sep 2018 15:37:06 +0300 Subject: fuse: add max_pages to init_out Replace FUSE_MAX_PAGES_PER_REQ with the configurable parameter max_pages to improve performance. Old RFC with detailed description of the problem and many fixes by Mitsuo Hayasaka (mitsuo.hayasaka.hu@hitachi.com): - https://lkml.org/lkml/2012/7/5/136 We've encountered performance degradation and fixed it on a big and complex virtual environment. Environment to reproduce degradation and improvement: 1. Add lag to user mode FUSE Add nanosleep(&(struct timespec){ 0, 1000 }, NULL); to xmp_write_buf in passthrough_fh.c 2. patch UM fuse with configurable max_pages parameter. The patch will be provided later. 3.
run test script and perform test on tmpfs fuse_test() { cd /tmp mkdir -p fusemnt passthrough_fh -o max_pages=$1 /tmp/fusemnt grep fuse /proc/self/mounts dd conv=fdatasync oflag=dsync if=/dev/zero of=fusemnt/tmp/tmp \ count=1K bs=1M 2>&1 | grep -v records rm fusemnt/tmp/tmp killall passthrough_fh } Test results: passthrough_fh /tmp/fusemnt fuse.passthrough_fh \ rw,nosuid,nodev,relatime,user_id=0,group_id=0 0 0 1073741824 bytes (1.1 GB) copied, 1.73867 s, 618 MB/s passthrough_fh /tmp/fusemnt fuse.passthrough_fh \ rw,nosuid,nodev,relatime,user_id=0,group_id=0,max_pages=256 0 0 1073741824 bytes (1.1 GB) copied, 1.15643 s, 928 MB/s Obviously with bigger lag the difference between 'before' and 'after' will be more significant. Mitsuo Hayasaka, in 2012 (https://lkml.org/lkml/2012/7/5/136), observed improvement from 400-550 to 520-740. Signed-off-by: Constantine Shulyupin Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 5 +++-- fs/fuse/file.c | 59 ++++++++++++++++++++++++++++---------------------------- fs/fuse/fuse_i.h | 10 ++++++++-- fs/fuse/inode.c | 8 +++++++- 4 files changed, 48 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index fefb9dd8a2f4..69d4df78a417 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -61,6 +61,7 @@ static struct fuse_req *__fuse_request_alloc(unsigned npages, gfp_t flags) struct page **pages = NULL; struct fuse_page_desc *page_descs = NULL; + WARN_ON(npages > FUSE_MAX_MAX_PAGES); if (npages > FUSE_REQ_INLINE_PAGES) { pages = kzalloc(npages * (sizeof(*pages) + sizeof(*page_descs)), flags); @@ -1674,7 +1675,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, unsigned int num; unsigned int offset; size_t total_len = 0; - int num_pages; + unsigned int num_pages; offset = outarg->offset & ~PAGE_MASK; file_size = i_size_read(inode); @@ -1686,7 +1687,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, num = file_size - outarg->offset; num_pages = (num + offset + 
PAGE_SIZE - 1) >> PAGE_SHIFT; - num_pages = min(num_pages, FUSE_MAX_PAGES_PER_REQ); + num_pages = min(num_pages, fc->max_pages); req = fuse_get_req(fc, num_pages); if (IS_ERR(req)) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index b10d14baeb1f..035843b501fe 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -850,11 +850,11 @@ static int fuse_readpages_fill(void *_data, struct page *page) fuse_wait_on_page_writeback(inode, page->index); if (req->num_pages && - (req->num_pages == FUSE_MAX_PAGES_PER_REQ || + (req->num_pages == fc->max_pages || (req->num_pages + 1) * PAGE_SIZE > fc->max_read || req->pages[req->num_pages - 1]->index + 1 != page->index)) { - int nr_alloc = min_t(unsigned, data->nr_pages, - FUSE_MAX_PAGES_PER_REQ); + unsigned int nr_alloc = min_t(unsigned int, data->nr_pages, + fc->max_pages); fuse_send_readpages(req, data->file); if (fc->async_read) req = fuse_get_req_for_background(fc, nr_alloc); @@ -889,7 +889,7 @@ static int fuse_readpages(struct file *file, struct address_space *mapping, struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_fill_data data; int err; - int nr_alloc = min_t(unsigned, nr_pages, FUSE_MAX_PAGES_PER_REQ); + unsigned int nr_alloc = min_t(unsigned int, nr_pages, fc->max_pages); err = -EIO; if (is_bad_inode(inode)) @@ -1104,12 +1104,13 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req, return count > 0 ? 
count : err; } -static inline unsigned fuse_wr_pages(loff_t pos, size_t len) +static inline unsigned int fuse_wr_pages(loff_t pos, size_t len, + unsigned int max_pages) { - return min_t(unsigned, + return min_t(unsigned int, ((pos + len - 1) >> PAGE_SHIFT) - (pos >> PAGE_SHIFT) + 1, - FUSE_MAX_PAGES_PER_REQ); + max_pages); } static ssize_t fuse_perform_write(struct kiocb *iocb, @@ -1131,7 +1132,8 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, do { struct fuse_req *req; ssize_t count; - unsigned nr_pages = fuse_wr_pages(pos, iov_iter_count(ii)); + unsigned int nr_pages = fuse_wr_pages(pos, iov_iter_count(ii), + fc->max_pages); req = fuse_get_req(fc, nr_pages); if (IS_ERR(req)) { @@ -1321,11 +1323,6 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii, return ret < 0 ? ret : 0; } -static inline int fuse_iter_npages(const struct iov_iter *ii_p) -{ - return iov_iter_npages(ii_p, FUSE_MAX_PAGES_PER_REQ); -} - ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, loff_t *ppos, int flags) { @@ -1345,9 +1342,10 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, int err = 0; if (io->async) - req = fuse_get_req_for_background(fc, fuse_iter_npages(iter)); + req = fuse_get_req_for_background(fc, iov_iter_npages(iter, + fc->max_pages)); else - req = fuse_get_req(fc, fuse_iter_npages(iter)); + req = fuse_get_req(fc, iov_iter_npages(iter, fc->max_pages)); if (IS_ERR(req)) return PTR_ERR(req); @@ -1392,9 +1390,10 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, fuse_put_request(fc, req); if (io->async) req = fuse_get_req_for_background(fc, - fuse_iter_npages(iter)); + iov_iter_npages(iter, fc->max_pages)); else - req = fuse_get_req(fc, fuse_iter_npages(iter)); + req = fuse_get_req(fc, iov_iter_npages(iter, + fc->max_pages)); if (IS_ERR(req)) break; } @@ -1823,7 +1822,7 @@ static int fuse_writepages_fill(struct page *page, is_writeback = fuse_page_is_writeback(inode, page->index); if 
(req && req->num_pages && - (is_writeback || req->num_pages == FUSE_MAX_PAGES_PER_REQ || + (is_writeback || req->num_pages == fc->max_pages || (req->num_pages + 1) * PAGE_SIZE > fc->max_write || data->orig_pages[req->num_pages - 1]->index + 1 != page->index)) { fuse_writepages_send(data); @@ -1851,7 +1850,7 @@ static int fuse_writepages_fill(struct page *page, struct fuse_inode *fi = get_fuse_inode(inode); err = -ENOMEM; - req = fuse_request_alloc_nofs(FUSE_MAX_PAGES_PER_REQ); + req = fuse_request_alloc_nofs(fc->max_pages); if (!req) { __free_page(tmp_page); goto out_unlock; @@ -1908,6 +1907,7 @@ static int fuse_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct inode *inode = mapping->host; + struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_fill_wb_data data; int err; @@ -1920,7 +1920,7 @@ static int fuse_writepages(struct address_space *mapping, data.ff = NULL; err = -ENOMEM; - data.orig_pages = kcalloc(FUSE_MAX_PAGES_PER_REQ, + data.orig_pages = kcalloc(fc->max_pages, sizeof(struct page *), GFP_NOFS); if (!data.orig_pages) @@ -2391,10 +2391,11 @@ static int fuse_copy_ioctl_iovec_old(struct iovec *dst, void *src, } /* Make sure iov_length() won't overflow */ -static int fuse_verify_ioctl_iov(struct iovec *iov, size_t count) +static int fuse_verify_ioctl_iov(struct fuse_conn *fc, struct iovec *iov, + size_t count) { size_t n; - u32 max = FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT; + u32 max = fc->max_pages << PAGE_SHIFT; for (n = 0; n < count; n++, iov++) { if (iov->iov_len > (size_t) max) @@ -2518,7 +2519,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE); err = -ENOMEM; - pages = kcalloc(FUSE_MAX_PAGES_PER_REQ, sizeof(pages[0]), GFP_KERNEL); + pages = kcalloc(fc->max_pages, sizeof(pages[0]), GFP_KERNEL); iov_page = (struct iovec *) __get_free_page(GFP_KERNEL); if (!pages || !iov_page) goto out; @@ -2557,7 +2558,7 @@ long 
fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, /* make sure there are enough buffer pages and init request with them */ err = -ENOMEM; - if (max_pages > FUSE_MAX_PAGES_PER_REQ) + if (max_pages > fc->max_pages) goto out; while (num_pages < max_pages) { pages[num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); @@ -2644,11 +2645,11 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, in_iov = iov_page; out_iov = in_iov + in_iovs; - err = fuse_verify_ioctl_iov(in_iov, in_iovs); + err = fuse_verify_ioctl_iov(fc, in_iov, in_iovs); if (err) goto out; - err = fuse_verify_ioctl_iov(out_iov, out_iovs); + err = fuse_verify_ioctl_iov(fc, out_iov, out_iovs); if (err) goto out; @@ -2839,9 +2840,9 @@ static void fuse_do_truncate(struct file *file) fuse_do_setattr(file_dentry(file), &attr, file); } -static inline loff_t fuse_round_up(loff_t off) +static inline loff_t fuse_round_up(struct fuse_conn *fc, loff_t off) { - return round_up(off, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT); + return round_up(off, fc->max_pages << PAGE_SHIFT); } static ssize_t @@ -2870,7 +2871,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) if (async_dio && iov_iter_rw(iter) != WRITE && offset + count > i_size) { if (offset >= i_size) return 0; - iov_iter_truncate(iter, fuse_round_up(i_size - offset)); + iov_iter_truncate(iter, fuse_round_up(ff->fc, i_size - offset)); count = iov_iter_count(iter); } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index f5bdce84e766..3d578745c852 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -28,8 +28,11 @@ #include #include -/** Max number of pages that can be used in a single read request */ -#define FUSE_MAX_PAGES_PER_REQ 32 +/** Default max number of pages that can be used in a single read request */ +#define FUSE_DEFAULT_MAX_PAGES_PER_REQ 32 + +/** Maximum of max_pages received in init_out */ +#define FUSE_MAX_MAX_PAGES 256 /** Bias for fi->writectr, meaning new writepages must not be sent */ #define 
FUSE_NOWRITE INT_MIN @@ -525,6 +528,9 @@ struct fuse_conn { /** Maximum write size */ unsigned max_write; + /** Maxmum number of pages that can be used in a single request */ + unsigned int max_pages; + /** Input queue */ struct fuse_iqueue iq; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 82db1ab53420..8cebf4d5f51b 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -928,6 +928,11 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) } if (arg->flags & FUSE_ABORT_ERROR) fc->abort_err = 1; + if (arg->flags & FUSE_MAX_PAGES) { + fc->max_pages = + min_t(unsigned int, FUSE_MAX_MAX_PAGES, + max_t(unsigned int, arg->max_pages, 1)); + } } else { ra_pages = fc->max_read / PAGE_SIZE; fc->no_lock = 1; @@ -959,7 +964,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO | FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT | FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL | - FUSE_ABORT_ERROR; + FUSE_ABORT_ERROR | FUSE_MAX_PAGES; req->in.h.opcode = FUSE_INIT; req->in.numargs = 1; req->in.args[0].size = sizeof(*arg); @@ -1152,6 +1157,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) fc->user_id = d.user_id; fc->group_id = d.group_id; fc->max_read = max_t(unsigned, 4096, d.max_read); + fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ; /* Used by get_root_inode() */ sb->s_fs_info = fc; -- cgit v1.2.3 From e52a8250480acd3b26534793c61816e30d85fbb6 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 1 Oct 2018 10:07:06 +0200 Subject: fuse: realloc page array Writeback caching currently allocates requests with the maximum number of possible pages, while the actual number of pages per request depends on a couple of factors that cannot be determined when the request is allocated (whether page is already under writeback, whether page is contiguous with previous pages already added to a request). 
This patch allows such requests to start with no page allocation (all pages inline) and grow the page array on demand. If the max_pages tunable remains the default value, then this will mean just one allocation that is the same size as before. If the tunable is larger, then this adds at most 3 additional memory allocations (which is generously compensated by the improved performance from the larger request). Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 49 +++++++++++++++++++++++++++++++++++++++++++++---- fs/fuse/file.c | 8 +++++++- fs/fuse/fuse_i.h | 4 ++++ 3 files changed, 56 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 69d4df78a417..ae813e609932 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -54,6 +54,18 @@ static void fuse_request_init(struct fuse_req *req, struct page **pages, __set_bit(FR_PENDING, &req->flags); } +static struct page **fuse_req_pages_alloc(unsigned int npages, gfp_t flags, + struct fuse_page_desc **desc) +{ + struct page **pages; + + pages = kzalloc(npages * (sizeof(struct page *) + + sizeof(struct fuse_page_desc)), flags); + *desc = (void *) pages + npages * sizeof(struct page *); + + return pages; +} + static struct fuse_req *__fuse_request_alloc(unsigned npages, gfp_t flags) { struct fuse_req *req = kmem_cache_zalloc(fuse_req_cachep, flags); @@ -63,13 +75,12 @@ static struct fuse_req *__fuse_request_alloc(unsigned npages, gfp_t flags) WARN_ON(npages > FUSE_MAX_MAX_PAGES); if (npages > FUSE_REQ_INLINE_PAGES) { - pages = kzalloc(npages * (sizeof(*pages) + - sizeof(*page_descs)), flags); + pages = fuse_req_pages_alloc(npages, flags, + &page_descs); if (!pages) { kmem_cache_free(fuse_req_cachep, req); return NULL; } - page_descs = (void *) pages + npages * sizeof(*pages); } else if (npages) { pages = req->inline_pages; page_descs = req->inline_page_descs; @@ -91,11 +102,41 @@ struct fuse_req *fuse_request_alloc_nofs(unsigned npages) return __fuse_request_alloc(npages, GFP_NOFS); } -void 
fuse_request_free(struct fuse_req *req) +static void fuse_req_pages_free(struct fuse_req *req) { if (req->pages != req->inline_pages) kfree(req->pages); +} + +bool fuse_req_realloc_pages(struct fuse_conn *fc, struct fuse_req *req, + gfp_t flags) +{ + struct page **pages; + struct fuse_page_desc *page_descs; + unsigned int npages = min_t(unsigned int, + max_t(unsigned int, req->max_pages * 2, + FUSE_DEFAULT_MAX_PAGES_PER_REQ), + fc->max_pages); + WARN_ON(npages <= req->max_pages); + pages = fuse_req_pages_alloc(npages, flags, &page_descs); + if (!pages) + return false; + + memcpy(pages, req->pages, sizeof(struct page *) * req->max_pages); + memcpy(page_descs, req->page_descs, + sizeof(struct fuse_page_desc) * req->max_pages); + fuse_req_pages_free(req); + req->pages = pages; + req->page_descs = page_descs; + req->max_pages = npages; + + return true; +} + +void fuse_request_free(struct fuse_req *req) +{ + fuse_req_pages_free(req); kmem_cache_free(fuse_req_cachep, req); } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 035843b501fe..f5507198ea00 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1827,7 +1827,13 @@ static int fuse_writepages_fill(struct page *page, data->orig_pages[req->num_pages - 1]->index + 1 != page->index)) { fuse_writepages_send(data); data->req = NULL; + } else if (req && req->num_pages == req->max_pages) { + if (!fuse_req_realloc_pages(fc, req, GFP_NOFS)) { + fuse_writepages_send(data); + req = data->req = NULL; + } } + err = -ENOMEM; tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); if (!tmp_page) @@ -1850,7 +1856,7 @@ static int fuse_writepages_fill(struct page *page, struct fuse_inode *fi = get_fuse_inode(inode); err = -ENOMEM; - req = fuse_request_alloc_nofs(fc->max_pages); + req = fuse_request_alloc_nofs(FUSE_REQ_INLINE_PAGES); if (!req) { __free_page(tmp_page); goto out_unlock; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 3d578745c852..b7d96e7b5e0f 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -879,6 +879,10 @@ 
struct fuse_req *fuse_request_alloc(unsigned npages); struct fuse_req *fuse_request_alloc_nofs(unsigned npages); +bool fuse_req_realloc_pages(struct fuse_conn *fc, struct fuse_req *req, + gfp_t flags); + + /** * Free a request */ -- cgit v1.2.3 From 2f1e81965fd0f672c3246e751385cdfe8f86bbee Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 15 Oct 2018 15:43:06 +0200 Subject: fuse: allow fine grained attr cache invalidation This patch adds the infrastructure for more fine grained attribute invalidation. Currently only 'atime' is invalidated separately. The use of this infrastructure is extended to the statx(2) interface, which for now means that if only 'atime' is invalid and STATX_ATIME is not specified in the mask argument, then no GETATTR request will be generated. Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 18 +++++++++++++----- fs/fuse/fuse_i.h | 3 +++ fs/fuse/inode.c | 2 ++ 3 files changed, 18 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index d1b2f42d746e..2b6fc2b35649 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -81,13 +81,18 @@ u64 entry_attr_timeout(struct fuse_entry_out *o) return time_to_jiffies(o->attr_valid, o->attr_valid_nsec); } +static void fuse_invalidate_attr_mask(struct inode *inode, u32 mask) +{ + set_mask_bits(&get_fuse_inode(inode)->inval_mask, 0, mask); +} + /* * Mark the attributes as stale, so that at the next call to * ->getattr() they will be fetched from userspace */ void fuse_invalidate_attr(struct inode *inode) { - get_fuse_inode(inode)->i_time = 0; + fuse_invalidate_attr_mask(inode, STATX_BASIC_STATS); } static void fuse_dir_changed(struct inode *dir) @@ -103,7 +108,7 @@ static void fuse_dir_changed(struct inode *dir) void fuse_invalidate_atime(struct inode *inode) { if (!IS_RDONLY(inode)) - fuse_invalidate_attr(inode); + fuse_invalidate_attr_mask(inode, STATX_ATIME); } /* @@ -917,7 +922,8 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat, } static int 
fuse_update_get_attr(struct inode *inode, struct file *file, - struct kstat *stat, unsigned int flags) + struct kstat *stat, u32 request_mask, + unsigned int flags) { struct fuse_inode *fi = get_fuse_inode(inode); int err = 0; @@ -927,6 +933,8 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file, sync = true; else if (flags & AT_STATX_DONT_SYNC) sync = false; + else if (request_mask & READ_ONCE(fi->inval_mask)) + sync = true; else sync = time_before64(fi->i_time, get_jiffies_64()); @@ -944,7 +952,7 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file, int fuse_update_attributes(struct inode *inode, struct file *file) { - return fuse_update_get_attr(inode, file, NULL, 0); + return fuse_update_get_attr(inode, file, NULL, STATX_BASIC_STATS, 0); } int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, @@ -1566,7 +1574,7 @@ static int fuse_getattr(const struct path *path, struct kstat *stat, if (!fuse_allow_current_process(fc)) return -EACCES; - return fuse_update_get_attr(inode, NULL, stat, flags); + return fuse_update_get_attr(inode, NULL, stat, request_mask, flags); } static const struct inode_operations fuse_dir_inode_operations = { diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index b7d96e7b5e0f..0e32524e66bb 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -80,6 +80,9 @@ struct fuse_inode { /** Time in jiffies until the file attributes are valid */ u64 i_time; + /* Which attributes are invalid */ + u32 inval_mask; + /** The sticky bit in inode->i_mode may have been removed, so preserve the original mode */ umode_t orig_i_mode; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 8cebf4d5f51b..d5f845aefbc9 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -90,6 +90,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) fi = get_fuse_inode(inode); fi->i_time = 0; + fi->inval_mask = 0; fi->nodeid = 0; fi->nlookup = 0; fi->attr_version = 0; @@ -164,6 +165,7 @@ void 
fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, fi->attr_version = ++fc->attr_version; fi->i_time = attr_valid; + WRITE_ONCE(fi->inval_mask, 0); inode->i_ino = fuse_squash_ino(attr->ino); inode->i_mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); -- cgit v1.2.3 From 802dc0497be2b538ca4300704b45b59bffe29585 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 15 Oct 2018 15:43:06 +0200 Subject: fuse: don't need GETATTR after every READ If 'auto_inval_data' mode is active, then fuse_file_read_iter() will call fuse_update_attributes(), which will check the attribute validity and send a GETATTR request if some of the attributes are no longer valid. The page cache is then invalidated if the size or mtime have changed. Then, if a READ request was sent and reply received (which is the case if the data wasn't cached yet, or if the file is opened for O_DIRECT), the atime attribute is invalidated. This will result in the next read() also triggering a GETATTR, ... This can be fixed by only sending GETATTR if the mode or size are invalid, we don't need to do a refresh if only atime is invalid. More generally, none of the callers of fuse_update_attributes() need an up-to-date atime value, so for now just remove STATX_ATIME from the request mask when attributes are updated for internal use. 
Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 2b6fc2b35649..7b8f63e7489f 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -952,7 +952,9 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file, int fuse_update_attributes(struct inode *inode, struct file *file) { - return fuse_update_get_attr(inode, file, NULL, STATX_BASIC_STATS, 0); + /* Do *not* need to get atime for internal purposes */ + return fuse_update_get_attr(inode, file, NULL, + STATX_BASIC_STATS & ~STATX_ATIME, 0); } int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, -- cgit v1.2.3 From 9a2eb24d1a349f53e20f3d4503568551561eed23 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 15 Oct 2018 15:43:06 +0200 Subject: fuse: only invalidate atime in direct read After sending a synchronous READ request from __fuse_direct_read() we only need to invalidate atime; none of the other attributes should be changed by a read(). Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index f5507198ea00..58dbc39fea63 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1419,7 +1419,7 @@ static ssize_t __fuse_direct_read(struct fuse_io_priv *io, res = fuse_direct_io(io, iter, ppos, 0); - fuse_invalidate_attr(inode); + fuse_invalidate_atime(inode); return res; } -- cgit v1.2.3 From 5571f1e65486be025f73fa6aa30fb03725d362a2 Mon Sep 17 00:00:00 2001 From: Dan Schatzberg Date: Thu, 11 Oct 2018 08:17:00 -0700 Subject: fuse: enable caching of symlinks FUSE file reads are cached in the page cache, but symlink reads are not. This patch enables FUSE READLINK operations to be cached which can improve performance of some FUSE workloads. 
In particular, I'm working on a FUSE filesystem for access to source code and discovered that about a 10% improvement to build times is achieved with this patch (there are a lot of symlinks in the source tree). Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 108 ++++++++++++++++++++++++++++++++++++++++++------------- fs/fuse/fuse_i.h | 3 ++ fs/fuse/inode.c | 4 ++- 3 files changed, 89 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 7b8f63e7489f..47395b0c3b35 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1160,38 +1160,78 @@ static int fuse_permission(struct inode *inode, int mask) return err; } -static const char *fuse_get_link(struct dentry *dentry, - struct inode *inode, - struct delayed_call *done) +static int fuse_readlink_page(struct inode *inode, struct page *page) { struct fuse_conn *fc = get_fuse_conn(inode); - FUSE_ARGS(args); - char *link; - ssize_t ret; + struct fuse_req *req; + int err; - if (!dentry) - return ERR_PTR(-ECHILD); + req = fuse_get_req(fc, 1); + if (IS_ERR(req)) + return PTR_ERR(req); + + req->out.page_zeroing = 1; + req->out.argpages = 1; + req->num_pages = 1; + req->pages[0] = page; + req->page_descs[0].length = PAGE_SIZE - 1; + req->in.h.opcode = FUSE_READLINK; + req->in.h.nodeid = get_node_id(inode); + req->out.argvar = 1; + req->out.numargs = 1; + req->out.args[0].size = PAGE_SIZE - 1; + fuse_request_send(fc, req); + err = req->out.h.error; - link = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (!link) - return ERR_PTR(-ENOMEM); + if (!err) { + char *link = page_address(page); + size_t len = req->out.args[0].size; - args.in.h.opcode = FUSE_READLINK; - args.in.h.nodeid = get_node_id(inode); - args.out.argvar = 1; - args.out.numargs = 1; - args.out.args[0].size = PAGE_SIZE - 1; - args.out.args[0].value = link; - ret = fuse_simple_request(fc, &args); - if (ret < 0) { - kfree(link); - link = ERR_PTR(ret); - } else { - link[ret] = '\0'; - set_delayed_call(done, kfree_link, link); + BUG_ON(len >= 
PAGE_SIZE); + link[len] = '\0'; } + + fuse_put_request(fc, req); fuse_invalidate_atime(inode); - return link; + + return err; +} + +static const char *fuse_get_link(struct dentry *dentry, struct inode *inode, + struct delayed_call *callback) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct page *page; + int err; + + err = -EIO; + if (is_bad_inode(inode)) + goto out_err; + + if (fc->cache_symlinks) + return page_get_link(dentry, inode, callback); + + err = -ECHILD; + if (!dentry) + goto out_err; + + page = alloc_page(GFP_KERNEL); + err = -ENOMEM; + if (!page) + goto out_err; + + err = fuse_readlink_page(inode, page); + if (err) { + __free_page(page); + goto out_err; + } + + set_delayed_call(callback, page_put_link, page); + + return page_address(page); + +out_err: + return ERR_PTR(err); } static int fuse_dir_open(struct inode *inode, struct file *file) @@ -1644,7 +1684,25 @@ void fuse_init_dir(struct inode *inode) fi->rdc.version = 0; } +static int fuse_symlink_readpage(struct file *null, struct page *page) +{ + int err = fuse_readlink_page(page->mapping->host, page); + + if (!err) + SetPageUptodate(page); + + unlock_page(page); + + return err; +} + +static const struct address_space_operations fuse_symlink_aops = { + .readpage = fuse_symlink_readpage, +}; + void fuse_init_symlink(struct inode *inode) { inode->i_op = &fuse_symlink_inode_operations; + inode->i_data.a_ops = &fuse_symlink_aops; + inode_nohighmem(inode); } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 0e32524e66bb..e9f712e81c7d 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -613,6 +613,9 @@ struct fuse_conn { /** handle fs handles killing suid/sgid/cap on write/chown/trunc */ unsigned handle_killpriv:1; + /** cache READLINK responses in page cache */ + unsigned cache_symlinks:1; + /* * The following bitfields are only for optimization purposes * and hence races in setting them will not cause malfunction diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 
d5f845aefbc9..0b94b23b02d4 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -928,6 +928,8 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->posix_acl = 1; fc->sb->s_xattr = fuse_acl_xattr_handlers; } + if (arg->flags & FUSE_CACHE_SYMLINKS) + fc->cache_symlinks = 1; if (arg->flags & FUSE_ABORT_ERROR) fc->abort_err = 1; if (arg->flags & FUSE_MAX_PAGES) { @@ -966,7 +968,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO | FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT | FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL | - FUSE_ABORT_ERROR | FUSE_MAX_PAGES; + FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS; req->in.h.opcode = FUSE_INIT; req->in.numargs = 1; req->in.args[0].size = sizeof(*arg); -- cgit v1.2.3