From a1db2f7edef095a385d477ab81e780694d63eebd Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Wed, 12 Oct 2022 13:23:23 +0200 Subject: fs/fuse: Replace kmap() with kmap_local_page() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The use of kmap() is being deprecated in favor of kmap_local_page(). There are two main problems with kmap(): (1) It comes with an overhead as the mapping space is restricted and protected by a global lock for synchronization and (2) it also requires global TLB invalidation when the kmap’s pool wraps and it might block when the mapping space is fully utilized until a slot becomes available. With kmap_local_page() the mappings are per thread, CPU local, can take page faults, and can be called from any context (including interrupts). It is faster than kmap() in kernels with HIGHMEM enabled. Furthermore, the tasks can be preempted and, when they are scheduled to run again, the kernel virtual addresses are restored and still valid. Therefore, replace kmap() with kmap_local_page() in fuse_readdir_cached(), it being the only call site of kmap() currently left in fs/fuse. Cc: "Venkataramanan, Anirudh" Suggested-by: Ira Weiny Signed-off-by: Fabio M. De Francesco Signed-off-by: Miklos Szeredi --- fs/fuse/readdir.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/fuse') diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index e8deaacf1832..dc603479b30e 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -547,9 +547,9 @@ retry_locked: * Contents of the page are now protected against changing by holding * the page lock. */ - addr = kmap(page); + addr = kmap_local_page(page); res = fuse_parse_cache(ff, addr, size, ctx); - kunmap(page); + kunmap_local(addr); unlock_page(page); put_page(page); -- cgit v1.2.3 From 4f8d37020e1fd0bf6ee9381ba918135ef3712efd Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 28 Oct 2022 14:25:21 +0200 Subject: fuse: add "expire only" mode to FUSE_NOTIFY_INVAL_ENTRY Add a flag to entry expiration that lets the filesystem expire a dentry without kicking it out from the cache immediately. This makes a difference for overmounted dentries, where plain invalidation would detach all submounts before dropping the dentry from the cache. If only expiry is set on the dentry, then any overmounts are left alone and until ->d_revalidate() is called. Note: ->d_revalidate() is not called for the case of following a submount, so invalidation will only be triggered for the non-overmounted case. The dentry could also be mounted in a different mount instance, in which case any submounts will still be detached. Suggested-by: Jakob Blomer Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 4 ++-- fs/fuse/dir.c | 6 ++++-- fs/fuse/fuse_i.h | 2 +- 3 files changed, 7 insertions(+), 5 deletions(-) (limited to 'fs/fuse') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index b4a6e0a1b945..0a845fac3ba8 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1498,7 +1498,7 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size, buf[outarg.namelen] = 0; down_read(&fc->killsb); - err = fuse_reverse_inval_entry(fc, outarg.parent, 0, &name); + err = fuse_reverse_inval_entry(fc, outarg.parent, 0, &name, outarg.flags); up_read(&fc->killsb); kfree(buf); return err; @@ -1546,7 +1546,7 @@ static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size, buf[outarg.namelen] = 0; down_read(&fc->killsb); - err = fuse_reverse_inval_entry(fc, outarg.parent, outarg.child, &name); + err = fuse_reverse_inval_entry(fc, outarg.parent, outarg.child, &name, 0); up_read(&fc->killsb); kfree(buf); return err; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index bb97a384dc5d..d092c7d75929 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1170,7 +1170,7 @@ int fuse_update_attributes(struct inode *inode, struct file *file, u32 mask) } int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, - u64 child_nodeid, struct qstr *name) + u64 child_nodeid, struct qstr *name, u32 flags) { int err = -ENOTDIR; struct inode *parent; @@ -1197,7 +1197,9 @@ int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, goto unlock; fuse_dir_changed(parent); - fuse_invalidate_entry(entry); + if (!(flags & FUSE_EXPIRE_ONLY)) + d_invalidate(entry); + fuse_invalidate_entry_cache(entry); if (child_nodeid != 0 && d_really_is_positive(entry)) { inode_lock(d_inode(entry)); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 98a9cf531873..8789d94eda21 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1220,7 +1220,7 @@ int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid, * then the dentry is unhashed (d_delete()). */ int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, - u64 child_nodeid, struct qstr *name); + u64 child_nodeid, struct qstr *name, u32 flags); int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file, bool isdir); -- cgit v1.2.3 From ccc031e26afe60d2a5a3d93dabd9c978210825fb Mon Sep 17 00:00:00 2001 From: Jiachen Zhang Date: Wed, 28 Sep 2022 20:19:34 +0800 Subject: fuse: always revalidate rename target dentry The previous commit df8629af2934 ("fuse: always revalidate if exclusive create") ensures that the dentries are revalidated on O_EXCL creates. This commit complements it by also performing revalidation for rename target dentries. Otherwise, a rename target file that only exists in kernel dentry cache but not in the filesystem will result in EEXIST if RENAME_NOREPLACE flag is used. Signed-off-by: Jiachen Zhang Signed-off-by: Zhang Tianci Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/fuse') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index d092c7d75929..2c4b08a6ec81 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -214,7 +214,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) if (inode && fuse_is_bad(inode)) goto invalid; else if (time_before64(fuse_dentry_time(entry), get_jiffies_64()) || - (flags & (LOOKUP_EXCL | LOOKUP_REVAL))) { + (flags & (LOOKUP_EXCL | LOOKUP_REVAL | LOOKUP_RENAME_TARGET))) { struct fuse_entry_out outarg; FUSE_ARGS(args); struct fuse_forget_link *forget; -- cgit v1.2.3 From 0618021e34c6b1edc4fc754ec53ab7fdcc98aaec Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 14 Sep 2022 16:26:32 +0200 Subject: fuse: Remove user_ns check for FUSE_DEV_IOC_CLONE Commit 8ed1f0e22f49e ("fs/fuse: fix ioctl type confusion") fixed a type confusion bug by adding an ->f_op comparison. Based on some off-list discussion back then, another check was added to compare the f_cred->user_ns. This is not for security reasons, but was based on the idea that a FUSE device FD should be using the UID/GID mappings of its f_cred->user_ns, and those translations are done using fc->user_ns, which matches the f_cred->user_ns of the initial FUSE device FD thanks to the check in fuse_fill_super(). See also commit 8cb08329b0809 ("fuse: Support fuse filesystems outside of init_user_ns"). But FUSE_DEV_IOC_CLONE is, at a higher level, a *cloning* operation that copies an existing context (with a weird API that involves first opening /dev/fuse, then tying the resulting new FUSE device FD to an existing FUSE instance). So if an application is already passing FUSE FDs across userns boundaries and dealing with the resulting ID mapping complications somehow, it doesn't make much sense to block this cloning operation. I've heard that this check is an obstacle for some folks, and I don't see a good reason to keep it, so remove it. Signed-off-by: Jann Horn Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/fuse') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 0a845fac3ba8..c73d9c4132f6 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -2267,8 +2267,7 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd, * Check against file->f_op because CUSE * uses the same ioctl handler. */ - if (old->f_op == file->f_op && - old->f_cred->user_ns == file->f_cred->user_ns) + if (old->f_op == file->f_op) fud = fuse_get_dev(old); if (fud) { -- cgit v1.2.3 From 00d369bc2de5986504fc0d60bfeae4c342b2cad5 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 9 Sep 2022 11:40:21 +0200 Subject: fuse: port to vfs{g,u}id_t and associated helpers A while ago we introduced a dedicated vfs{g,u}id_t type in commit 1e5267cd0895 ("mnt_idmapping: add vfs{g,u}id_t"). We already switched over a good part of the VFS. Ultimately we will remove all legacy idmapped mount helpers that operate only on k{g,u}id_t in favor of the new type safe helpers that operate on vfs{g,u}id_t. Cc: Seth Forshee (Digital Ocean) Cc: Christoph Hellwig Signed-off-by: Christian Brauner (Microsoft) Signed-off-by: Miklos Szeredi --- fs/fuse/acl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/fuse') diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c index 337cb29a8dd5..84c1ca4bc1dc 100644 --- a/fs/fuse/acl.c +++ b/fs/fuse/acl.c @@ -98,7 +98,7 @@ int fuse_set_acl(struct user_namespace *mnt_userns, struct inode *inode, return ret; } - if (!in_group_p(i_gid_into_mnt(&init_user_ns, inode)) && + if (!vfsgid_in_group_p(i_gid_into_vfsgid(&init_user_ns, inode)) && !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID)) extra_flags |= FUSE_SETXATTR_ACL_KILL_SGID; -- cgit v1.2.3 From e2283a736676554f72dbdcb62fdc1d23daf7044f Mon Sep 17 00:00:00 2001 From: ye xingchen Date: Thu, 1 Sep 2022 07:42:59 +0000 Subject: fuse: remove the unneeded result variable Return the value fuse_dev_release() directly instead of storing it in another redundant variable. Reported-by: Zeal Robot Signed-off-by: ye xingchen Signed-off-by: Miklos Szeredi --- fs/fuse/cuse.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'fs/fuse') diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index c7d882a9fe33..a06fbb1a8a5b 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -545,7 +545,6 @@ static int cuse_channel_release(struct inode *inode, struct file *file) { struct fuse_dev *fud = file->private_data; struct cuse_conn *cc = fc_to_cc(fud->fc); - int rc; /* remove from the conntbl, no more access from this point on */ mutex_lock(&cuse_lock); @@ -560,9 +559,7 @@ static int cuse_channel_release(struct inode *inode, struct file *file) cdev_del(cc->cdev); } - rc = fuse_dev_release(inode, file); /* puts the base reference */ - - return rc; + return fuse_dev_release(inode, file); } static struct file_operations cuse_channel_fops; /* initialized during init */ -- cgit v1.2.3 From 153524053bbb0d27bb2e0be36d1b46862e9ce74c Mon Sep 17 00:00:00 2001 From: Dharmendra Singh Date: Fri, 17 Jun 2022 12:40:27 +0530 Subject: fuse: allow non-extending parallel direct writes on the same file In general, as of now, in FUSE, direct writes on the same file are serialized over inode lock i.e we hold inode lock for the full duration of the write request. I could not find in fuse code and git history a comment which clearly explains why this exclusive lock is taken for direct writes. Following might be the reasons for acquiring an exclusive lock but not be limited to 1) Our guess is some USER space fuse implementations might be relying on this lock for serialization. 2) The lock protects against file read/write size races. 3) Ruling out any issues arising from partial write failures. This patch relaxes the exclusive lock for direct non-extending writes only. File size extending writes might not need the lock either, but we are not entirely sure if there is a risk to introduce any kind of regression. Furthermore, benchmarking with fio does not show a difference between patch versions that take on file size extension a) an exclusive lock and b) a shared lock. A possible example of an issue with i_size extending writes are write error cases. Some writes might succeed and others might fail for file system internal reasons - for example ENOSPACE. With parallel file size extending writes it _might_ be difficult to revert the action of the failing write, especially to restore the right i_size. With these changes, we allow non-extending parallel direct writes on the same file with the help of a flag called FOPEN_PARALLEL_DIRECT_WRITES. If this flag is set on the file (flag is passed from libfuse to fuse kernel as part of file open/create), we do not take exclusive lock anymore, but instead use a shared lock that allows non-extending writes to run in parallel. FUSE implementations which rely on this inode lock for serialization can continue to do so and serialized direct writes are still the default. Implementations that do not do write serialization need to be updated and need to set the FOPEN_PARALLEL_DIRECT_WRITES flag in their file open/create reply. On patch review there were concerns that network file systems (or vfs multiple mounts of the same file system) might have issues with parallel writes. We believe this is not the case, as this is just a local lock, which network file systems could not rely on anyway. I.e. this lock is just for local consistency. Signed-off-by: Dharmendra Singh Signed-off-by: Bernd Schubert Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 43 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 40 insertions(+), 3 deletions(-) (limited to 'fs/fuse') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 89f4741728ba..d360d38ab616 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1563,14 +1563,47 @@ static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to) return res; } +static bool fuse_direct_write_extending_i_size(struct kiocb *iocb, + struct iov_iter *iter) +{ + struct inode *inode = file_inode(iocb->ki_filp); + + return iocb->ki_pos + iov_iter_count(iter) > i_size_read(inode); +} + static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); + struct file *file = iocb->ki_filp; + struct fuse_file *ff = file->private_data; struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); ssize_t res; + bool exclusive_lock = + !(ff->open_flags & FOPEN_PARALLEL_DIRECT_WRITES) || + iocb->ki_flags & IOCB_APPEND || + fuse_direct_write_extending_i_size(iocb, from); + + /* + * Take exclusive lock if + * - Parallel direct writes are disabled - a user space decision + * - Parallel direct writes are enabled and i_size is being extended. + * This might not be needed at all, but needs further investigation. + */ + if (exclusive_lock) + inode_lock(inode); + else { + inode_lock_shared(inode); + + /* A race with truncate might have come up as the decision for + * the lock type was done without holding the lock, check again. + */ + if (fuse_direct_write_extending_i_size(iocb, from)) { + inode_unlock_shared(inode); + inode_lock(inode); + exclusive_lock = true; + } + } - /* Don't allow parallel writes to the same file */ - inode_lock(inode); res = generic_write_checks(iocb, from); if (res > 0) { if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) { @@ -1581,7 +1614,10 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) fuse_write_update_attr(inode, iocb->ki_pos, res); } } - inode_unlock(inode); + if (exclusive_lock) + inode_unlock(inode); + else + inode_unlock_shared(inode); return res; } @@ -2931,6 +2967,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) if (iov_iter_rw(iter) == WRITE) { fuse_write_update_attr(inode, pos, ret); + /* For extending writes we already hold exclusive lock */ if (ret < 0 && offset + count > i_size) fuse_do_truncate(file); } -- cgit v1.2.3 From b138777786f780b9d5ce1989032608acbede0493 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Tue, 25 Oct 2022 09:10:17 -0700 Subject: fuse: Rearrange fuse_allow_current_process checks This is a followup to a previous commit of mine [0], which added the allow_sys_admin_access && capable(CAP_SYS_ADMIN) check. This patch rearranges the order of checks in fuse_allow_current_process without changing functionality. Commit 9ccf47b26b73 ("fuse: Add module param for CAP_SYS_ADMIN access bypassing allow_other") added allow_sys_admin_access && capable(CAP_SYS_ADMIN) check to the beginning of the function, with the reasoning that allow_sys_admin_access should be an 'escape hatch' for users with CAP_SYS_ADMIN, allowing them to skip any subsequent checks. However, placing this new check first results in many capable() calls when allow_sys_admin_access is set, where another check would've also returned 1. This can be problematic when a BPF program is tracing capable() calls. At Meta we ran into such a scenario recently. On a host where allow_sys_admin_access is set but most of the FUSE access is from processes which would pass other checks - i.e. they don't need CAP_SYS_ADMIN 'escape hatch' - this results in an unnecessary capable() call for each fs op. We also have a daemon tracing capable() with BPF and doing some data collection, so tracing these extraneous capable() calls has the potential to regress performance for an application doing many FUSE ops. So rearrange the order of these checks such that CAP_SYS_ADMIN 'escape hatch' is checked last. Add a small helper, fuse_permissible_uidgid, to make the logic easier to understand. Previously, if allow_other is set on the fuse_conn, uid/git checking doesn't happen as current_in_userns result is returned. These semantics are maintained here: fuse_permissible_uidgid check only happens if allow_other is not set. Signed-off-by: Dave Marchevsky Suggested-by: Andrii Nakryiko Reviewed-by: Christian Brauner (Microsoft) Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 35 ++++++++++++++++++++--------------- fs/fuse/fuse_i.h | 2 +- 2 files changed, 21 insertions(+), 16 deletions(-) (limited to 'fs/fuse') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 2c4b08a6ec81..aa67869e3444 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1237,6 +1237,18 @@ int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, return err; } +static inline bool fuse_permissible_uidgid(struct fuse_conn *fc) +{ + const struct cred *cred = current_cred(); + + return (uid_eq(cred->euid, fc->user_id) && + uid_eq(cred->suid, fc->user_id) && + uid_eq(cred->uid, fc->user_id) && + gid_eq(cred->egid, fc->group_id) && + gid_eq(cred->sgid, fc->group_id) && + gid_eq(cred->gid, fc->group_id)); +} + /* * Calling into a user-controlled filesystem gives the filesystem * daemon ptrace-like capabilities over the current process. This @@ -1250,26 +1262,19 @@ int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, * for which the owner of the mount has ptrace privilege. This * excludes processes started by other users, suid or sgid processes. */ -int fuse_allow_current_process(struct fuse_conn *fc) +bool fuse_allow_current_process(struct fuse_conn *fc) { - const struct cred *cred; - - if (allow_sys_admin_access && capable(CAP_SYS_ADMIN)) - return 1; + bool allow; if (fc->allow_other) - return current_in_userns(fc->user_ns); + allow = current_in_userns(fc->user_ns); + else + allow = fuse_permissible_uidgid(fc); - cred = current_cred(); - if (uid_eq(cred->euid, fc->user_id) && - uid_eq(cred->suid, fc->user_id) && - uid_eq(cred->uid, fc->user_id) && - gid_eq(cred->egid, fc->group_id) && - gid_eq(cred->sgid, fc->group_id) && - gid_eq(cred->gid, fc->group_id)) - return 1; + if (!allow && allow_sys_admin_access && capable(CAP_SYS_ADMIN)) + allow = true; - return 0; + return allow; } static int fuse_access(struct inode *inode, int mask) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 8789d94eda21..d339b1ace887 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1179,7 +1179,7 @@ bool fuse_invalid_attr(struct fuse_attr *attr); /** * Is current process allowed to perform filesystem operation? */ -int fuse_allow_current_process(struct fuse_conn *fc); +bool fuse_allow_current_process(struct fuse_conn *fc); u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id); -- cgit v1.2.3