From db3db63b1d17c98f69e894edaa2b0b364ecde7a9 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Sat, 4 Nov 2023 23:11:17 +0100 Subject: vfs: remove a redundant might_sleep in wait_on_inode wait_on_bit already does it. Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20231104221117.2584708-1-mjguzik@gmail.com Signed-off-by: Christian Brauner --- include/linux/writeback.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 083387c00f0c..6d0a14f7019d 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -193,7 +193,6 @@ void inode_io_list_del(struct inode *inode); /* writeback.h requires fs.h; it, too, is not included from here. */ static inline void wait_on_inode(struct inode *inode) { - might_sleep(); wait_on_bit(&inode->i_state, __I_NEW, TASK_UNINTERRUPTIBLE); } -- cgit v1.2.3 From 297945d9bc13a10e2ce39f0a3aad38c6812435a5 Mon Sep 17 00:00:00 2001 From: Abhinav Singh Date: Wed, 8 Nov 2023 10:15:50 +0530 Subject: fs : Fix warning using plain integer as NULL Sparse static analysis tools generate a warning with this message "Using plain integer as NULL pointer". In this case this warning is being shown because we are trying to initialize pointer to NULL using integer value 0. 
Signed-off-by: Abhinav Singh Link: https://lore.kernel.org/r/20231108044550.1006555-1-singhabhinav9051571833@gmail.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/dax.c | 2 +- fs/direct-io.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 3380b43cb6bb..423fc1607dfa 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1128,7 +1128,7 @@ static int dax_iomap_copy_around(loff_t pos, uint64_t length, size_t align_size, /* zero the edges if srcmap is a HOLE or IOMAP_UNWRITTEN */ bool zero_edge = srcmap->flags & IOMAP_F_SHARED || srcmap->type == IOMAP_UNWRITTEN; - void *saddr = 0; + void *saddr = NULL; int ret = 0; if (!zero_edge) { diff --git a/fs/direct-io.c b/fs/direct-io.c index 20533266ade6..60456263a338 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1114,7 +1114,7 @@ ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, loff_t offset = iocb->ki_pos; const loff_t end = offset + count; struct dio *dio; - struct dio_submit sdio = { 0, }; + struct dio_submit sdio = { NULL, }; struct buffer_head map_bh = { 0, }; struct blk_plug plug; unsigned long align = offset | iov_iter_alignment(iter); -- cgit v1.2.3 From d218569004b6f8242d176aad250ed66becc80cae Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Tue, 31 Oct 2023 18:47:28 +0700 Subject: fs: Clarify "non-RCY" in access_override_creds() comment The term was originally intended as a joke that stands for "non-racy". This trips new contributors who mistake it for an RCU typo [1]. Replace the term with more-explicit wording. 
Link: https://lore.kernel.org/r/20231030-debatten-nachrangig-f58abcdac530@brauner/ Signed-off-by: Bagas Sanjaya Link: https://lore.kernel.org/r/20231031114728.41485-1-bagasdotme@gmail.com Signed-off-by: Christian Brauner --- fs/open.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/open.c b/fs/open.c index 02dc608d40d8..0bd7fce21cbf 100644 --- a/fs/open.c +++ b/fs/open.c @@ -442,7 +442,8 @@ static const struct cred *access_override_creds(void) * 'get_current_cred()' function), that will clear the * non_rcu field, because now that other user may be * expecting RCU freeing. But normal thread-synchronous - * cred accesses will keep things non-RCY. + * cred accesses will keep things non-racy to avoid RCU + * freeing. */ override_cred->non_rcu = 1; -- cgit v1.2.3 From f73f6181eb057671e358ebac8ed7f0014f12efb8 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 30 Aug 2023 09:32:15 -0700 Subject: userns: eliminate many kernel-doc warnings Drop the kernel-doc "/**" notation from 8 structs or functions to prevent 22 kernel-doc warnings (samples below). 
user_namespace.c:239: warning: Function parameter or member 'map_up' not described in 'idmap_key' user_namespace.c:246: warning: Function parameter or member 'k' not described in 'cmp_map_id' user_namespace.c:277: warning: Function parameter or member 'extents' not described in 'map_id_range_down_max' user_namespace.c:295: warning: Function parameter or member 'extents' not described in 'map_id_range_down_base' user_namespace.c:344: warning: Function parameter or member 'extents' not described in 'map_id_up_base' user_namespace.c:364: warning: Function parameter or member 'extents' not described in 'map_id_up_max' user_namespace.c:776: warning: Function parameter or member 'map' not described in 'insert_extent' user_namespace.c:844: warning: Function parameter or member 'map' not described in 'sort_idmaps' Fixes: 6397fac4915a ("userns: bump idmap limits to 340") Signed-off-by: Randy Dunlap Link: https://lore.kernel.org/r/20230830163215.13193-1-rdunlap@infradead.org Cc: Eric Biederman Cc: Christian Brauner Signed-off-by: Christian Brauner --- kernel/user_namespace.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index eabe8bcc7042..625101249e4d 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -231,7 +231,7 @@ void __put_user_ns(struct user_namespace *ns) } EXPORT_SYMBOL(__put_user_ns); -/** +/* * struct idmap_key - holds the information necessary to find an idmapping in a * sorted idmap array. It is passed to cmp_map_id() as first argument. */ @@ -241,7 +241,7 @@ struct idmap_key { u32 count; /* == 0 unless used with map_id_range_down() */ }; -/** +/* * cmp_map_id - Function to be passed to bsearch() to find the requested * idmapping. Expects struct idmap_key to be passed via @k. */ @@ -271,7 +271,7 @@ static int cmp_map_id(const void *k, const void *e) return 1; } -/** +/* * map_id_range_down_max - Find idmap via binary search in ordered idmap array. 
* Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ @@ -288,7 +288,7 @@ map_id_range_down_max(unsigned extents, struct uid_gid_map *map, u32 id, u32 cou sizeof(struct uid_gid_extent), cmp_map_id); } -/** +/* * map_id_range_down_base - Find idmap via binary search in static extent array. * Can only be called if number of mappings is equal or less than * UID_GID_MAP_MAX_BASE_EXTENTS. @@ -337,7 +337,7 @@ static u32 map_id_down(struct uid_gid_map *map, u32 id) return map_id_range_down(map, id, 1); } -/** +/* * map_id_up_base - Find idmap via binary search in static extent array. * Can only be called if number of mappings is equal or less than * UID_GID_MAP_MAX_BASE_EXTENTS. @@ -358,7 +358,7 @@ map_id_up_base(unsigned extents, struct uid_gid_map *map, u32 id) return NULL; } -/** +/* * map_id_up_max - Find idmap via binary search in ordered idmap array. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ @@ -770,7 +770,7 @@ static bool mappings_overlap(struct uid_gid_map *new_map, return false; } -/** +/* * insert_extent - Safely insert a new idmap extent into struct uid_gid_map. * Takes care to allocate a 4K block of memory if the number of mappings exceeds * UID_GID_MAP_MAX_BASE_EXTENTS. @@ -839,7 +839,7 @@ static int cmp_extents_reverse(const void *a, const void *b) return 0; } -/** +/* * sort_idmaps - Sorts an array of idmap entries. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ -- cgit v1.2.3 From 6f672f7b3b9676853c3b074151ff0a156cdd7b07 Mon Sep 17 00:00:00 2001 From: YangXin Date: Sat, 18 Nov 2023 21:21:36 +0800 Subject: fs: namei: Fix spelling mistake "Retuns" to "Returns" There are two spelling mistakes in comments. Fix them. 
Signed-off-by: YangXin Link: https://lore.kernel.org/r/20231118132136.3084-1-yx.0xffff@gmail.com Signed-off-by: Christian Brauner --- fs/namei.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 71c13b2990b4..53db89e99f97 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2467,7 +2467,7 @@ static int handle_lookup_down(struct nameidata *nd) return PTR_ERR(step_into(nd, WALK_NOFOLLOW, nd->path.dentry)); } -/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */ +/* Returns 0 and nd will be valid on success; Returns error, otherwise. */ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path) { const char *s = path_init(nd, flags); @@ -2522,7 +2522,7 @@ int filename_lookup(int dfd, struct filename *name, unsigned flags, return retval; } -/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */ +/* Returns 0 and nd will be valid on success; Returns error, otherwise. */ static int path_parentat(struct nameidata *nd, unsigned flags, struct path *parent) { -- cgit v1.2.3 From d7802b734fe33e781437151033032d28291b809b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 21 Nov 2023 00:15:55 +0100 Subject: fs: add missing @mp parameter documentation Fix the W=1 build warning: ../fs/namespace.c:3050: warning: Function parameter or member 'mp' not described in 'can_move_mount_beneath' Signed-off-by: Christian Brauner --- fs/namespace.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/namespace.c b/fs/namespace.c index fbf0e596fcd3..b899cbbe24d9 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3026,6 +3026,7 @@ static inline bool path_overmounted(const struct path *path) * can_move_mount_beneath - check that we can mount beneath the top mount * @from: mount to mount beneath * @to: mount under which to mount + * @mp: mountpoint of @to * * - Make sure that @to->dentry is actually the root of a mount under * which we can mount another mount. 
-- cgit v1.2.3 From 600f111ef51dc2cbdb330b09d09f1856efa64912 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 17 Nov 2023 21:58:23 +0000 Subject: fs: Rename mapping private members It is hard to find where mapping->private_lock, mapping->private_list and mapping->private_data are used, due to private_XXX being a relatively common name for variables and structure members in the kernel. To fit with other members of struct address_space, rename them all to have an i_ prefix. Tested with an allmodconfig build. Signed-off-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/20231117215823.2821906-1-willy@infradead.org Acked-by: Darrick J. Wong Reviewed-by: Josef Bacik Signed-off-by: Christian Brauner --- fs/aio.c | 16 ++++---- fs/btrfs/extent_io.c | 52 ++++++++++++------------- fs/btrfs/subpage.c | 4 +- fs/buffer.c | 108 +++++++++++++++++++++++++-------------------------- fs/ext4/inode.c | 4 +- fs/gfs2/glock.c | 2 +- fs/gfs2/ops_fstype.c | 2 +- fs/hugetlbfs/inode.c | 4 +- fs/inode.c | 8 ++-- fs/nfs/write.c | 12 +++--- fs/nilfs2/inode.c | 4 +- fs/ntfs/aops.c | 10 ++--- include/linux/fs.h | 12 +++--- mm/hugetlb.c | 2 +- mm/migrate.c | 6 +-- 15 files changed, 123 insertions(+), 123 deletions(-) diff --git a/fs/aio.c b/fs/aio.c index f8589caef9c1..d02842156b35 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -266,7 +266,7 @@ static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages) return ERR_CAST(inode); inode->i_mapping->a_ops = &aio_ctx_aops; - inode->i_mapping->private_data = ctx; + inode->i_mapping->i_private_data = ctx; inode->i_size = PAGE_SIZE * nr_pages; file = alloc_file_pseudo(inode, aio_mnt, "[aio]", @@ -316,10 +316,10 @@ static void put_aio_ring_file(struct kioctx *ctx) /* Prevent further access to the kioctx from migratepages */ i_mapping = aio_ring_file->f_mapping; - spin_lock(&i_mapping->private_lock); - i_mapping->private_data = NULL; + spin_lock(&i_mapping->i_private_lock); + i_mapping->i_private_data = NULL; 
ctx->aio_ring_file = NULL; - spin_unlock(&i_mapping->private_lock); + spin_unlock(&i_mapping->i_private_lock); fput(aio_ring_file); } @@ -422,9 +422,9 @@ static int aio_migrate_folio(struct address_space *mapping, struct folio *dst, rc = 0; - /* mapping->private_lock here protects against the kioctx teardown. */ - spin_lock(&mapping->private_lock); - ctx = mapping->private_data; + /* mapping->i_private_lock here protects against the kioctx teardown. */ + spin_lock(&mapping->i_private_lock); + ctx = mapping->i_private_data; if (!ctx) { rc = -EINVAL; goto out; @@ -476,7 +476,7 @@ static int aio_migrate_folio(struct address_space *mapping, struct folio *dst, out_unlock: mutex_unlock(&ctx->ring_lock); out: - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); return rc; } #else diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 03cef28d9e37..3431a53bf3fd 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -870,7 +870,7 @@ static int attach_extent_buffer_page(struct extent_buffer *eb, * will not race with any other ebs. */ if (page->mapping) - lockdep_assert_held(&page->mapping->private_lock); + lockdep_assert_held(&page->mapping->i_private_lock); if (fs_info->nodesize >= PAGE_SIZE) { if (!PagePrivate(page)) @@ -1736,16 +1736,16 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc) * Take private lock to ensure the subpage won't be detached * in the meantime. 
*/ - spin_lock(&page->mapping->private_lock); + spin_lock(&page->mapping->i_private_lock); if (!PagePrivate(page)) { - spin_unlock(&page->mapping->private_lock); + spin_unlock(&page->mapping->i_private_lock); break; } spin_lock_irqsave(&subpage->lock, flags); if (!test_bit(bit_start + fs_info->subpage_info->dirty_offset, subpage->bitmaps)) { spin_unlock_irqrestore(&subpage->lock, flags); - spin_unlock(&page->mapping->private_lock); + spin_unlock(&page->mapping->i_private_lock); bit_start++; continue; } @@ -1759,7 +1759,7 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc) */ eb = find_extent_buffer_nolock(fs_info, start); spin_unlock_irqrestore(&subpage->lock, flags); - spin_unlock(&page->mapping->private_lock); + spin_unlock(&page->mapping->i_private_lock); /* * The eb has already reached 0 refs thus find_extent_buffer() @@ -1811,9 +1811,9 @@ static int submit_eb_page(struct page *page, struct btrfs_eb_write_context *ctx) if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) return submit_eb_subpage(page, wbc); - spin_lock(&mapping->private_lock); + spin_lock(&mapping->i_private_lock); if (!PagePrivate(page)) { - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); return 0; } @@ -1824,16 +1824,16 @@ static int submit_eb_page(struct page *page, struct btrfs_eb_write_context *ctx) * crashing the machine for something we can survive anyway. 
*/ if (WARN_ON(!eb)) { - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); return 0; } if (eb == ctx->eb) { - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); return 0; } ret = atomic_inc_not_zero(&eb->refs); - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); if (!ret) return 0; @@ -3056,7 +3056,7 @@ static bool page_range_has_eb(struct btrfs_fs_info *fs_info, struct page *page) { struct btrfs_subpage *subpage; - lockdep_assert_held(&page->mapping->private_lock); + lockdep_assert_held(&page->mapping->i_private_lock); if (PagePrivate(page)) { subpage = (struct btrfs_subpage *)page->private; @@ -3079,14 +3079,14 @@ static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *pag /* * For mapped eb, we're going to change the page private, which should - * be done under the private_lock. + * be done under the i_private_lock. */ if (mapped) - spin_lock(&page->mapping->private_lock); + spin_lock(&page->mapping->i_private_lock); if (!PagePrivate(page)) { if (mapped) - spin_unlock(&page->mapping->private_lock); + spin_unlock(&page->mapping->i_private_lock); return; } @@ -3110,7 +3110,7 @@ static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *pag detach_page_private(page); } if (mapped) - spin_unlock(&page->mapping->private_lock); + spin_unlock(&page->mapping->i_private_lock); return; } @@ -3133,7 +3133,7 @@ static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *pag if (!page_range_has_eb(fs_info, page)) btrfs_detach_subpage(fs_info, page); - spin_unlock(&page->mapping->private_lock); + spin_unlock(&page->mapping->i_private_lock); } /* Release all pages attached to the extent buffer */ @@ -3514,7 +3514,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, /* * Preallocate page->private for subpage case, so that we won't - * allocate memory with private_lock nor page lock hold. 
+ * allocate memory with i_private_lock nor page lock hold. * * The memory will be freed by attach_extent_buffer_page() or freed * manually if we exit earlier. @@ -3535,10 +3535,10 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, goto free_eb; } - spin_lock(&mapping->private_lock); + spin_lock(&mapping->i_private_lock); exists = grab_extent_buffer(fs_info, p); if (exists) { - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); unlock_page(p); put_page(p); mark_extent_buffer_accessed(exists, p); @@ -3558,7 +3558,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, * Thus needs no special handling in error path. */ btrfs_page_inc_eb_refs(fs_info, p); - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); WARN_ON(btrfs_page_test_dirty(fs_info, p, eb->start, eb->len)); eb->pages[i] = p; @@ -4563,12 +4563,12 @@ static int try_release_subpage_extent_buffer(struct page *page) * Finally to check if we have cleared page private, as if we have * released all ebs in the page, the page private should be cleared now. */ - spin_lock(&page->mapping->private_lock); + spin_lock(&page->mapping->i_private_lock); if (!PagePrivate(page)) ret = 1; else ret = 0; - spin_unlock(&page->mapping->private_lock); + spin_unlock(&page->mapping->i_private_lock); return ret; } @@ -4584,9 +4584,9 @@ int try_release_extent_buffer(struct page *page) * We need to make sure nobody is changing page->private, as we rely on * page->private as the pointer to extent buffer. 
*/ - spin_lock(&page->mapping->private_lock); + spin_lock(&page->mapping->i_private_lock); if (!PagePrivate(page)) { - spin_unlock(&page->mapping->private_lock); + spin_unlock(&page->mapping->i_private_lock); return 1; } @@ -4601,10 +4601,10 @@ int try_release_extent_buffer(struct page *page) spin_lock(&eb->refs_lock); if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { spin_unlock(&eb->refs_lock); - spin_unlock(&page->mapping->private_lock); + spin_unlock(&page->mapping->i_private_lock); return 0; } - spin_unlock(&page->mapping->private_lock); + spin_unlock(&page->mapping->i_private_lock); /* * If tree ref isn't set then we know the ref on this eb is a real ref, diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 1b999c6e4193..2347cf15278b 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -200,7 +200,7 @@ void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info, return; ASSERT(PagePrivate(page) && page->mapping); - lockdep_assert_held(&page->mapping->private_lock); + lockdep_assert_held(&page->mapping->i_private_lock); subpage = (struct btrfs_subpage *)page->private; atomic_inc(&subpage->eb_refs); @@ -215,7 +215,7 @@ void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info, return; ASSERT(PagePrivate(page) && page->mapping); - lockdep_assert_held(&page->mapping->private_lock); + lockdep_assert_held(&page->mapping->i_private_lock); subpage = (struct btrfs_subpage *)page->private; ASSERT(atomic_read(&subpage->eb_refs)); diff --git a/fs/buffer.c b/fs/buffer.c index 967f34b70aa8..5ffc44ab4854 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -180,11 +180,11 @@ EXPORT_SYMBOL(end_buffer_write_sync); * Various filesystems appear to want __find_get_block to be non-blocking. * But it's the page lock which protects the buffers. To get around this, * we get exclusion from try_to_free_buffers with the blockdev mapping's - * private_lock. + * i_private_lock. 
* - * Hack idea: for the blockdev mapping, private_lock contention + * Hack idea: for the blockdev mapping, i_private_lock contention * may be quite high. This code could TryLock the page, and if that - * succeeds, there is no need to take private_lock. + * succeeds, there is no need to take i_private_lock. */ static struct buffer_head * __find_get_block_slow(struct block_device *bdev, sector_t block) @@ -204,7 +204,7 @@ __find_get_block_slow(struct block_device *bdev, sector_t block) if (IS_ERR(folio)) goto out; - spin_lock(&bd_mapping->private_lock); + spin_lock(&bd_mapping->i_private_lock); head = folio_buffers(folio); if (!head) goto out_unlock; @@ -236,7 +236,7 @@ __find_get_block_slow(struct block_device *bdev, sector_t block) 1 << bd_inode->i_blkbits); } out_unlock: - spin_unlock(&bd_mapping->private_lock); + spin_unlock(&bd_mapping->i_private_lock); folio_put(folio); out: return ret; @@ -467,25 +467,25 @@ EXPORT_SYMBOL(mark_buffer_async_write); * * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(), * inode_has_buffers() and invalidate_inode_buffers() are provided for the - * management of a list of dependent buffers at ->i_mapping->private_list. + * management of a list of dependent buffers at ->i_mapping->i_private_list. * * Locking is a little subtle: try_to_free_buffers() will remove buffers * from their controlling inode's queue when they are being freed. But * try_to_free_buffers() will be operating against the *blockdev* mapping * at the time, not against the S_ISREG file which depends on those buffers. - * So the locking for private_list is via the private_lock in the address_space + * So the locking for i_private_list is via the i_private_lock in the address_space * which backs the buffers. Which is different from the address_space * against which the buffers are listed. So for a particular address_space, - * mapping->private_lock does *not* protect mapping->private_list! 
In fact, - * mapping->private_list will always be protected by the backing blockdev's - * ->private_lock. + * mapping->i_private_lock does *not* protect mapping->i_private_list! In fact, + * mapping->i_private_list will always be protected by the backing blockdev's + * ->i_private_lock. * * Which introduces a requirement: all buffers on an address_space's - * ->private_list must be from the same address_space: the blockdev's. + * ->i_private_list must be from the same address_space: the blockdev's. * - * address_spaces which do not place buffers at ->private_list via these - * utility functions are free to use private_lock and private_list for - * whatever they want. The only requirement is that list_empty(private_list) + * address_spaces which do not place buffers at ->i_private_list via these + * utility functions are free to use i_private_lock and i_private_list for + * whatever they want. The only requirement is that list_empty(i_private_list) * be true at clear_inode() time. * * FIXME: clear_inode should not call invalidate_inode_buffers(). The @@ -508,7 +508,7 @@ EXPORT_SYMBOL(mark_buffer_async_write); */ /* - * The buffer's backing address_space's private_lock must be held + * The buffer's backing address_space's i_private_lock must be held */ static void __remove_assoc_queue(struct buffer_head *bh) { @@ -519,7 +519,7 @@ static void __remove_assoc_queue(struct buffer_head *bh) int inode_has_buffers(struct inode *inode) { - return !list_empty(&inode->i_data.private_list); + return !list_empty(&inode->i_data.i_private_list); } /* @@ -561,7 +561,7 @@ repeat: * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers * @mapping: the mapping which wants those buffers written * - * Starts I/O against the buffers at mapping->private_list, and waits upon + * Starts I/O against the buffers at mapping->i_private_list, and waits upon * that I/O. * * Basically, this is a convenience function for fsync(). 
@@ -570,13 +570,13 @@ repeat: */ int sync_mapping_buffers(struct address_space *mapping) { - struct address_space *buffer_mapping = mapping->private_data; + struct address_space *buffer_mapping = mapping->i_private_data; - if (buffer_mapping == NULL || list_empty(&mapping->private_list)) + if (buffer_mapping == NULL || list_empty(&mapping->i_private_list)) return 0; - return fsync_buffers_list(&buffer_mapping->private_lock, - &mapping->private_list); + return fsync_buffers_list(&buffer_mapping->i_private_lock, + &mapping->i_private_list); } EXPORT_SYMBOL(sync_mapping_buffers); @@ -673,17 +673,17 @@ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode) struct address_space *buffer_mapping = bh->b_folio->mapping; mark_buffer_dirty(bh); - if (!mapping->private_data) { - mapping->private_data = buffer_mapping; + if (!mapping->i_private_data) { + mapping->i_private_data = buffer_mapping; } else { - BUG_ON(mapping->private_data != buffer_mapping); + BUG_ON(mapping->i_private_data != buffer_mapping); } if (!bh->b_assoc_map) { - spin_lock(&buffer_mapping->private_lock); + spin_lock(&buffer_mapping->i_private_lock); list_move_tail(&bh->b_assoc_buffers, - &mapping->private_list); + &mapping->i_private_list); bh->b_assoc_map = mapping; - spin_unlock(&buffer_mapping->private_lock); + spin_unlock(&buffer_mapping->i_private_lock); } } EXPORT_SYMBOL(mark_buffer_dirty_inode); @@ -706,7 +706,7 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode); * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean * page on the dirty page list. * - * We use private_lock to lock against try_to_free_buffers while using the + * We use i_private_lock to lock against try_to_free_buffers while using the * page's buffer list. Also use this to protect against clean buffers being * added to the page after it was set dirty. 
* @@ -718,7 +718,7 @@ bool block_dirty_folio(struct address_space *mapping, struct folio *folio) struct buffer_head *head; bool newly_dirty; - spin_lock(&mapping->private_lock); + spin_lock(&mapping->i_private_lock); head = folio_buffers(folio); if (head) { struct buffer_head *bh = head; @@ -734,7 +734,7 @@ bool block_dirty_folio(struct address_space *mapping, struct folio *folio) */ folio_memcg_lock(folio); newly_dirty = !folio_test_set_dirty(folio); - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); if (newly_dirty) __folio_mark_dirty(folio, mapping, 1); @@ -827,7 +827,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list) smp_mb(); if (buffer_dirty(bh)) { list_add(&bh->b_assoc_buffers, - &mapping->private_list); + &mapping->i_private_list); bh->b_assoc_map = mapping; } spin_unlock(lock); @@ -851,7 +851,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list) * probably unmounting the fs, but that doesn't mean we have already * done a sync(). Just drop the buffers from the inode list. * - * NOTE: we take the inode's blockdev's mapping's private_lock. Which + * NOTE: we take the inode's blockdev's mapping's i_private_lock. Which * assumes that all the buffers are against the blockdev. Not true * for reiserfs. 
*/ @@ -859,13 +859,13 @@ void invalidate_inode_buffers(struct inode *inode) { if (inode_has_buffers(inode)) { struct address_space *mapping = &inode->i_data; - struct list_head *list = &mapping->private_list; - struct address_space *buffer_mapping = mapping->private_data; + struct list_head *list = &mapping->i_private_list; + struct address_space *buffer_mapping = mapping->i_private_data; - spin_lock(&buffer_mapping->private_lock); + spin_lock(&buffer_mapping->i_private_lock); while (!list_empty(list)) __remove_assoc_queue(BH_ENTRY(list->next)); - spin_unlock(&buffer_mapping->private_lock); + spin_unlock(&buffer_mapping->i_private_lock); } } EXPORT_SYMBOL(invalidate_inode_buffers); @@ -882,10 +882,10 @@ int remove_inode_buffers(struct inode *inode) if (inode_has_buffers(inode)) { struct address_space *mapping = &inode->i_data; - struct list_head *list = &mapping->private_list; - struct address_space *buffer_mapping = mapping->private_data; + struct list_head *list = &mapping->i_private_list; + struct address_space *buffer_mapping = mapping->i_private_data; - spin_lock(&buffer_mapping->private_lock); + spin_lock(&buffer_mapping->i_private_lock); while (!list_empty(list)) { struct buffer_head *bh = BH_ENTRY(list->next); if (buffer_dirty(bh)) { @@ -894,7 +894,7 @@ int remove_inode_buffers(struct inode *inode) } __remove_assoc_queue(bh); } - spin_unlock(&buffer_mapping->private_lock); + spin_unlock(&buffer_mapping->i_private_lock); } return ret; } @@ -1064,11 +1064,11 @@ grow_dev_page(struct block_device *bdev, sector_t block, * lock to be atomic wrt __find_get_block(), which does not * run under the folio lock. */ - spin_lock(&inode->i_mapping->private_lock); + spin_lock(&inode->i_mapping->i_private_lock); link_dev_buffers(folio, bh); end_block = folio_init_buffers(folio, bdev, (sector_t)index << sizebits, size); - spin_unlock(&inode->i_mapping->private_lock); + spin_unlock(&inode->i_mapping->i_private_lock); done: ret = (block < end_block) ? 
1 : -ENXIO; failed: @@ -1168,7 +1168,7 @@ __getblk_slow(struct block_device *bdev, sector_t block, * and then attach the address_space's inode to its superblock's dirty * inode list. * - * mark_buffer_dirty() is atomic. It takes bh->b_folio->mapping->private_lock, + * mark_buffer_dirty() is atomic. It takes bh->b_folio->mapping->i_private_lock, * i_pages lock and mapping->host->i_lock. */ void mark_buffer_dirty(struct buffer_head *bh) @@ -1246,10 +1246,10 @@ void __bforget(struct buffer_head *bh) if (bh->b_assoc_map) { struct address_space *buffer_mapping = bh->b_folio->mapping; - spin_lock(&buffer_mapping->private_lock); + spin_lock(&buffer_mapping->i_private_lock); list_del_init(&bh->b_assoc_buffers); bh->b_assoc_map = NULL; - spin_unlock(&buffer_mapping->private_lock); + spin_unlock(&buffer_mapping->i_private_lock); } __brelse(bh); } @@ -1638,7 +1638,7 @@ EXPORT_SYMBOL(block_invalidate_folio); /* * We attach and possibly dirty the buffers atomically wrt - * block_dirty_folio() via private_lock. try_to_free_buffers + * block_dirty_folio() via i_private_lock. try_to_free_buffers * is already excluded via the folio lock. 
*/ struct buffer_head *create_empty_buffers(struct folio *folio, @@ -1656,7 +1656,7 @@ struct buffer_head *create_empty_buffers(struct folio *folio, } while (bh); tail->b_this_page = head; - spin_lock(&folio->mapping->private_lock); + spin_lock(&folio->mapping->i_private_lock); if (folio_test_uptodate(folio) || folio_test_dirty(folio)) { bh = head; do { @@ -1668,7 +1668,7 @@ struct buffer_head *create_empty_buffers(struct folio *folio, } while (bh != head); } folio_attach_private(folio, head); - spin_unlock(&folio->mapping->private_lock); + spin_unlock(&folio->mapping->i_private_lock); return head; } @@ -1715,7 +1715,7 @@ void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len) if (!folio_buffers(folio)) continue; /* - * We use folio lock instead of bd_mapping->private_lock + * We use folio lock instead of bd_mapping->i_private_lock * to pin buffers here since we can afford to sleep and * it scales better than a global spinlock lock. */ @@ -2883,7 +2883,7 @@ EXPORT_SYMBOL(sync_dirty_buffer); * are unused, and releases them if so. * * Exclusion against try_to_free_buffers may be obtained by either - * locking the folio or by holding its mapping's private_lock. + * locking the folio or by holding its mapping's i_private_lock. * * If the folio is dirty but all the buffers are clean then we need to * be sure to mark the folio clean as well. This is because the folio @@ -2894,7 +2894,7 @@ EXPORT_SYMBOL(sync_dirty_buffer); * The same applies to regular filesystem folios: if all the buffers are * clean then we set the folio clean and proceed. To do that, we require * total exclusion from block_dirty_folio(). That is obtained with - * private_lock. + * i_private_lock. * * try_to_free_buffers() is non-blocking. 
*/ @@ -2946,7 +2946,7 @@ bool try_to_free_buffers(struct folio *folio) goto out; } - spin_lock(&mapping->private_lock); + spin_lock(&mapping->i_private_lock); ret = drop_buffers(folio, &buffers_to_free); /* @@ -2959,13 +2959,13 @@ bool try_to_free_buffers(struct folio *folio) * the folio's buffers clean. We discover that here and clean * the folio also. * - * private_lock must be held over this entire operation in order + * i_private_lock must be held over this entire operation in order * to synchronise against block_dirty_folio and prevent the * dirty bit from being lost. */ if (ret) folio_cancel_dirty(folio); - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); out: if (buffers_to_free) { struct buffer_head *bh = buffers_to_free; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 61277f7f8722..0558c8c986d4 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1261,7 +1261,7 @@ static int write_end_fn(handle_t *handle, struct inode *inode, * We need to pick up the new inode size which generic_commit_write gave us * `file' can be NULL - eg, when called from page_symlink(). * - * ext4 never places buffers on inode->i_mapping->private_list. metadata + * ext4 never places buffers on inode->i_mapping->i_private_list. metadata * buffers are managed internally. */ static int ext4_write_end(struct file *file, @@ -3213,7 +3213,7 @@ static bool ext4_inode_datasync_dirty(struct inode *inode) } /* Any metadata buffers to write? 
*/ - if (!list_empty(&inode->i_mapping->private_list)) + if (!list_empty(&inode->i_mapping->i_private_list)) return true; return inode->i_state & I_DIRTY_DATASYNC; } diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index d6bf1f8c25dc..d8b619ed2f1e 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1213,7 +1213,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, mapping->host = s->s_bdev->bd_inode; mapping->flags = 0; mapping_set_gfp_mask(mapping, GFP_NOFS); - mapping->private_data = NULL; + mapping->i_private_data = NULL; mapping->writeback_index = 0; } diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index b108c5d26839..00ce89bdf32c 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -117,7 +117,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) mapping->host = sb->s_bdev->bd_inode; mapping->flags = 0; mapping_set_gfp_mask(mapping, GFP_NOFS); - mapping->private_data = NULL; + mapping->i_private_data = NULL; mapping->writeback_index = 0; spin_lock_init(&sdp->sd_log_lock); diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index f757d4f7ad98..05609ab15cbc 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -686,7 +686,7 @@ static void hugetlbfs_evict_inode(struct inode *inode) * at inode creation time. If this is a device special inode, * i_mapping may not point to the original address space. 
*/ - resv_map = (struct resv_map *)(&inode->i_data)->private_data; + resv_map = (struct resv_map *)(&inode->i_data)->i_private_data; /* Only regular and link inodes have associated reserve maps */ if (resv_map) resv_map_release(&resv_map->refs); @@ -1000,7 +1000,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, &hugetlbfs_i_mmap_rwsem_key); inode->i_mapping->a_ops = &hugetlbfs_aops; simple_inode_init_ts(inode); - inode->i_mapping->private_data = resv_map; + inode->i_mapping->i_private_data = resv_map; info->seals = F_SEAL_SEAL; switch (mode & S_IFMT) { default: diff --git a/fs/inode.c b/fs/inode.c index edcd8a61975f..788aa0aa542b 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -209,7 +209,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) atomic_set(&mapping->nr_thps, 0); #endif mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); - mapping->private_data = NULL; + mapping->i_private_data = NULL; mapping->writeback_index = 0; init_rwsem(&mapping->invalidate_lock); lockdep_set_class_and_name(&mapping->invalidate_lock, @@ -396,8 +396,8 @@ static void __address_space_init_once(struct address_space *mapping) { xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT); init_rwsem(&mapping->i_mmap_rwsem); - INIT_LIST_HEAD(&mapping->private_list); - spin_lock_init(&mapping->private_lock); + INIT_LIST_HEAD(&mapping->i_private_list); + spin_lock_init(&mapping->i_private_lock); mapping->i_mmap = RB_ROOT_CACHED; } @@ -618,7 +618,7 @@ void clear_inode(struct inode *inode) * nor even WARN_ON(!mapping_empty). 
*/ xa_unlock_irq(&inode->i_data.i_pages); - BUG_ON(!list_empty(&inode->i_data.private_list)); + BUG_ON(!list_empty(&inode->i_data.i_private_list)); BUG_ON(!(inode->i_state & I_FREEING)); BUG_ON(inode->i_state & I_CLEAR); BUG_ON(!list_empty(&inode->i_wb_list)); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index b664caea8b4e..7248705faef4 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -192,13 +192,13 @@ static struct nfs_page *nfs_folio_find_private_request(struct folio *folio) if (!folio_test_private(folio)) return NULL; - spin_lock(&mapping->private_lock); + spin_lock(&mapping->i_private_lock); req = nfs_folio_private_request(folio); if (req) { WARN_ON_ONCE(req->wb_head != req); kref_get(&req->wb_kref); } - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); return req; } @@ -769,13 +769,13 @@ static void nfs_inode_add_request(struct nfs_page *req) * Swap-space should not get truncated. Hence no need to plug the race * with invalidate/truncate. */ - spin_lock(&mapping->private_lock); + spin_lock(&mapping->i_private_lock); if (likely(!folio_test_swapcache(folio))) { set_bit(PG_MAPPED, &req->wb_flags); folio_set_private(folio); folio->private = req; } - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); atomic_long_inc(&nfsi->nrequests); /* this a head request for a page group - mark it as having an * extra reference so sub groups can follow suit. 
@@ -796,13 +796,13 @@ static void nfs_inode_remove_request(struct nfs_page *req) struct folio *folio = nfs_page_to_folio(req->wb_head); struct address_space *mapping = folio_file_mapping(folio); - spin_lock(&mapping->private_lock); + spin_lock(&mapping->i_private_lock); if (likely(folio && !folio_test_swapcache(folio))) { folio->private = NULL; folio_clear_private(folio); clear_bit(PG_MAPPED, &req->wb_head->wb_flags); } - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); } if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) { diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index f861f3a0bf5c..2ead36dfa2a3 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -214,7 +214,7 @@ static bool nilfs_dirty_folio(struct address_space *mapping, /* * The page may not be locked, eg if called from try_to_unmap_one() */ - spin_lock(&mapping->private_lock); + spin_lock(&mapping->i_private_lock); head = folio_buffers(folio); if (head) { struct buffer_head *bh = head; @@ -230,7 +230,7 @@ static bool nilfs_dirty_folio(struct address_space *mapping, } else if (ret) { nr_dirty = 1 << (folio_shift(folio) - inode->i_blkbits); } - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); if (nr_dirty) nilfs_set_file_dirty(inode, nr_dirty); diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index 71e31e789b29..548f3b51aa5f 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -1690,7 +1690,7 @@ const struct address_space_operations ntfs_mst_aops = { * * If the page does not have buffers, we create them and set them uptodate. * The page may not be locked which is why we need to handle the buffers under - * the mapping->private_lock. Once the buffers are marked dirty we no longer + * the mapping->i_private_lock. Once the buffers are marked dirty we no longer * need the lock since try_to_free_buffers() does not free dirty buffers. 
*/ void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) { @@ -1702,11 +1702,11 @@ void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) { BUG_ON(!PageUptodate(page)); end = ofs + ni->itype.index.block_size; bh_size = VFS_I(ni)->i_sb->s_blocksize; - spin_lock(&mapping->private_lock); + spin_lock(&mapping->i_private_lock); if (unlikely(!page_has_buffers(page))) { - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); bh = head = alloc_page_buffers(page, bh_size, true); - spin_lock(&mapping->private_lock); + spin_lock(&mapping->i_private_lock); if (likely(!page_has_buffers(page))) { struct buffer_head *tail; @@ -1730,7 +1730,7 @@ void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) { break; set_buffer_dirty(bh); } while ((bh = bh->b_this_page) != head); - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); filemap_dirty_folio(mapping, page_folio(page)); if (unlikely(buffers_to_free)) { do { diff --git a/include/linux/fs.h b/include/linux/fs.h index 98b7a7a8c42e..f171505940ff 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -463,9 +463,9 @@ extern const struct address_space_operations empty_aops; * @a_ops: Methods. * @flags: Error bits and flags (AS_*). * @wb_err: The most recent error which has occurred. - * @private_lock: For use by the owner of the address_space. - * @private_list: For use by the owner of the address_space. - * @private_data: For use by the owner of the address_space. + * @i_private_lock: For use by the owner of the address_space. + * @i_private_list: For use by the owner of the address_space. + * @i_private_data: For use by the owner of the address_space. 
*/ struct address_space { struct inode *host; @@ -484,9 +484,9 @@ struct address_space { unsigned long flags; struct rw_semaphore i_mmap_rwsem; errseq_t wb_err; - spinlock_t private_lock; - struct list_head private_list; - void *private_data; + spinlock_t i_private_lock; + struct list_head i_private_list; + void * i_private_data; } __attribute__((aligned(sizeof(long)))) __randomize_layout; /* * On most architectures that alignment is already the case; but diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 1169ef2f2176..38c4477fda6a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1141,7 +1141,7 @@ static inline struct resv_map *inode_resv_map(struct inode *inode) * The VERY common case is inode->mapping == &inode->i_data but, * this may not be true for device special inodes. */ - return (struct resv_map *)(&inode->i_data)->private_data; + return (struct resv_map *)(&inode->i_data)->i_private_data; } static struct resv_map *vma_resv_map(struct vm_area_struct *vma) diff --git a/mm/migrate.c b/mm/migrate.c index 35a88334bb3c..377f55ebf7f4 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -746,7 +746,7 @@ static int __buffer_migrate_folio(struct address_space *mapping, recheck_buffers: busy = false; - spin_lock(&mapping->private_lock); + spin_lock(&mapping->i_private_lock); bh = head; do { if (atomic_read(&bh->b_count)) { @@ -760,7 +760,7 @@ recheck_buffers: rc = -EAGAIN; goto unlock_buffers; } - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); invalidate_bh_lrus(); invalidated = true; goto recheck_buffers; @@ -787,7 +787,7 @@ recheck_buffers: rc = MIGRATEPAGE_SUCCESS; unlock_buffers: if (check_refs) - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); bh = head; do { unlock_buffer(bh); -- cgit v1.2.3 From 85884871921000b9bca2184077b1159771e50047 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 22 Nov 2023 13:48:22 +0100 Subject: i915: make inject_virtual_interrupt() void The single caller of 
inject_virtual_interrupt() ignores the return value anyway. This allows us to simplify eventfd_signal() in follow-up patches. Link: https://lore.kernel.org/r/20231122-vfs-eventfd-signal-v2-1-bd549b14ce0c@kernel.org Reviewed-by: Jan Kara Reviewed-by: Zhenyu Wang Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- drivers/gpu/drm/i915/gvt/interrupt.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/i915/gvt/interrupt.c b/drivers/gpu/drm/i915/gvt/interrupt.c index de3f5903d1a7..b32ba5f2b240 100644 --- a/drivers/gpu/drm/i915/gvt/interrupt.c +++ b/drivers/gpu/drm/i915/gvt/interrupt.c @@ -422,7 +422,7 @@ static void init_irq_map(struct intel_gvt_irq *irq) #define MSI_CAP_DATA(offset) (offset + 8) #define MSI_CAP_EN 0x1 -static int inject_virtual_interrupt(struct intel_vgpu *vgpu) +static void inject_virtual_interrupt(struct intel_vgpu *vgpu) { unsigned long offset = vgpu->gvt->device_info.msi_cap_offset; u16 control, data; @@ -434,10 +434,10 @@ static int inject_virtual_interrupt(struct intel_vgpu *vgpu) /* Do not generate MSI if MSIEN is disabled */ if (!(control & MSI_CAP_EN)) - return 0; + return; if (WARN(control & GENMASK(15, 1), "only support one MSI format\n")) - return -EINVAL; + return; trace_inject_msi(vgpu->id, addr, data); @@ -451,10 +451,9 @@ static int inject_virtual_interrupt(struct intel_vgpu *vgpu) * returned and don't inject interrupt into guest. 
*/ if (!test_bit(INTEL_VGPU_STATUS_ATTACHED, vgpu->status)) - return -ESRCH; - if (vgpu->msi_trigger && eventfd_signal(vgpu->msi_trigger, 1) != 1) - return -EFAULT; - return 0; + return; + if (vgpu->msi_trigger) + eventfd_signal(vgpu->msi_trigger, 1); } static void propagate_event(struct intel_gvt_irq *irq, -- cgit v1.2.3 From 3652117f854819a148ff0fbe4492587d3520b5e5 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 22 Nov 2023 13:48:23 +0100 Subject: eventfd: simplify eventfd_signal() Ever since the eventfd type was introduced back in 2007 in commit e1ad7468c77d ("signal/timer/event: eventfd core") the eventfd_signal() function only ever passed 1 as a value for @n. There's no point in keeping that additional argument. Link: https://lore.kernel.org/r/20231122-vfs-eventfd-signal-v2-2-bd549b14ce0c@kernel.org Acked-by: Xu Yilun Acked-by: Andrew Donnellan # ocxl Acked-by: Eric Farman # s390 Reviewed-by: Jan Kara Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- arch/x86/kvm/hyperv.c | 2 +- arch/x86/kvm/xen.c | 2 +- drivers/accel/habanalabs/common/device.c | 2 +- drivers/fpga/dfl.c | 2 +- drivers/gpu/drm/drm_syncobj.c | 6 +++--- drivers/gpu/drm/i915/gvt/interrupt.c | 2 +- drivers/infiniband/hw/mlx5/devx.c | 2 +- drivers/misc/ocxl/file.c | 2 +- drivers/s390/cio/vfio_ccw_chp.c | 2 +- drivers/s390/cio/vfio_ccw_drv.c | 4 ++-- drivers/s390/cio/vfio_ccw_ops.c | 6 +++--- drivers/s390/crypto/vfio_ap_ops.c | 2 +- drivers/usb/gadget/function/f_fs.c | 4 ++-- drivers/vdpa/vdpa_user/vduse_dev.c | 6 +++--- drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c | 2 +- drivers/vfio/pci/vfio_pci_core.c | 6 +++--- drivers/vfio/pci/vfio_pci_intrs.c | 12 ++++++------ drivers/vfio/platform/vfio_platform_irq.c | 4 ++-- drivers/vhost/vdpa.c | 4 ++-- drivers/vhost/vhost.c | 10 +++++----- drivers/vhost/vhost.h | 2 +- drivers/virt/acrn/ioeventfd.c | 2 +- drivers/xen/privcmd.c | 2 +- fs/aio.c | 2 +- fs/eventfd.c | 11 ++++------- include/linux/eventfd.h | 4 ++-- mm/memcontrol.c | 10 
+++++----- mm/vmpressure.c | 2 +- samples/vfio-mdev/mtty.c | 4 ++-- virt/kvm/eventfd.c | 4 ++-- 30 files changed, 61 insertions(+), 64 deletions(-) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 238afd7335e4..4943f6b2bbee 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -2388,7 +2388,7 @@ static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *h if (!eventfd) return HV_STATUS_INVALID_PORT_ID; - eventfd_signal(eventfd, 1); + eventfd_signal(eventfd); return HV_STATUS_SUCCESS; } diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index e53fad915a62..523bb6df5ac9 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -2088,7 +2088,7 @@ static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu *vcpu, u64 param, u64 *r) if (ret < 0 && ret != -ENOTCONN) return false; } else { - eventfd_signal(evtchnfd->deliver.eventfd.ctx, 1); + eventfd_signal(evtchnfd->deliver.eventfd.ctx); } *r = 0; diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c index 9711e8fc979d..3a89644f087c 100644 --- a/drivers/accel/habanalabs/common/device.c +++ b/drivers/accel/habanalabs/common/device.c @@ -2044,7 +2044,7 @@ static void hl_notifier_event_send(struct hl_notifier_event *notifier_event, u64 notifier_event->events_mask |= event_mask; if (notifier_event->eventfd) - eventfd_signal(notifier_event->eventfd, 1); + eventfd_signal(notifier_event->eventfd); mutex_unlock(&notifier_event->lock); } diff --git a/drivers/fpga/dfl.c b/drivers/fpga/dfl.c index dd7a783d53b5..e73f88050f08 100644 --- a/drivers/fpga/dfl.c +++ b/drivers/fpga/dfl.c @@ -1872,7 +1872,7 @@ static irqreturn_t dfl_irq_handler(int irq, void *arg) { struct eventfd_ctx *trigger = arg; - eventfd_signal(trigger, 1); + eventfd_signal(trigger); return IRQ_HANDLED; } diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/drm_syncobj.c index 01da6789d044..b9cc62982196 100644 --- a/drivers/gpu/drm/drm_syncobj.c +++
b/drivers/gpu/drm/drm_syncobj.c @@ -1365,7 +1365,7 @@ static void syncobj_eventfd_entry_fence_func(struct dma_fence *fence, struct syncobj_eventfd_entry *entry = container_of(cb, struct syncobj_eventfd_entry, fence_cb); - eventfd_signal(entry->ev_fd_ctx, 1); + eventfd_signal(entry->ev_fd_ctx); syncobj_eventfd_entry_free(entry); } @@ -1388,13 +1388,13 @@ syncobj_eventfd_entry_func(struct drm_syncobj *syncobj, entry->fence = fence; if (entry->flags & DRM_SYNCOBJ_WAIT_FLAGS_WAIT_AVAILABLE) { - eventfd_signal(entry->ev_fd_ctx, 1); + eventfd_signal(entry->ev_fd_ctx); syncobj_eventfd_entry_free(entry); } else { ret = dma_fence_add_callback(fence, &entry->fence_cb, syncobj_eventfd_entry_fence_func); if (ret == -ENOENT) { - eventfd_signal(entry->ev_fd_ctx, 1); + eventfd_signal(entry->ev_fd_ctx); syncobj_eventfd_entry_free(entry); } } diff --git a/drivers/gpu/drm/i915/gvt/interrupt.c b/drivers/gpu/drm/i915/gvt/interrupt.c index b32ba5f2b240..c8e7dfc9f791 100644 --- a/drivers/gpu/drm/i915/gvt/interrupt.c +++ b/drivers/gpu/drm/i915/gvt/interrupt.c @@ -453,7 +453,7 @@ static void inject_virtual_interrupt(struct intel_vgpu *vgpu) if (!test_bit(INTEL_VGPU_STATUS_ATTACHED, vgpu->status)) return; if (vgpu->msi_trigger) - eventfd_signal(vgpu->msi_trigger, 1); + eventfd_signal(vgpu->msi_trigger); } static void propagate_event(struct intel_gvt_irq *irq, diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index 8ba53edf2311..869369cb5b5f 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -2498,7 +2498,7 @@ static void dispatch_event_fd(struct list_head *fd_list, list_for_each_entry_rcu(item, fd_list, xa_list) { if (item->eventfd) - eventfd_signal(item->eventfd, 1); + eventfd_signal(item->eventfd); else deliver_event(item, data); } diff --git a/drivers/misc/ocxl/file.c b/drivers/misc/ocxl/file.c index ac69b7f361f5..7eb74711ac96 100644 --- a/drivers/misc/ocxl/file.c +++ b/drivers/misc/ocxl/file.c @@ -184,7 +184,7 @@ 
static irqreturn_t irq_handler(void *private) { struct eventfd_ctx *ev_ctx = private; - eventfd_signal(ev_ctx, 1); + eventfd_signal(ev_ctx); return IRQ_HANDLED; } diff --git a/drivers/s390/cio/vfio_ccw_chp.c b/drivers/s390/cio/vfio_ccw_chp.c index d3f3a611f95b..38c176cf6295 100644 --- a/drivers/s390/cio/vfio_ccw_chp.c +++ b/drivers/s390/cio/vfio_ccw_chp.c @@ -115,7 +115,7 @@ static ssize_t vfio_ccw_crw_region_read(struct vfio_ccw_private *private, /* Notify the guest if more CRWs are on our queue */ if (!list_empty(&private->crw) && private->crw_trigger) - eventfd_signal(private->crw_trigger, 1); + eventfd_signal(private->crw_trigger); return ret; } diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c index 43601816ea4e..bfb35cfce1ef 100644 --- a/drivers/s390/cio/vfio_ccw_drv.c +++ b/drivers/s390/cio/vfio_ccw_drv.c @@ -112,7 +112,7 @@ void vfio_ccw_sch_io_todo(struct work_struct *work) private->state = VFIO_CCW_STATE_IDLE; if (private->io_trigger) - eventfd_signal(private->io_trigger, 1); + eventfd_signal(private->io_trigger); } void vfio_ccw_crw_todo(struct work_struct *work) @@ -122,7 +122,7 @@ void vfio_ccw_crw_todo(struct work_struct *work) private = container_of(work, struct vfio_ccw_private, crw_work); if (!list_empty(&private->crw) && private->crw_trigger) - eventfd_signal(private->crw_trigger, 1); + eventfd_signal(private->crw_trigger); } /* diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c index cba4971618ff..ea532a8a4a0c 100644 --- a/drivers/s390/cio/vfio_ccw_ops.c +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -421,7 +421,7 @@ static int vfio_ccw_mdev_set_irqs(struct vfio_ccw_private *private, case VFIO_IRQ_SET_DATA_NONE: { if (*ctx) - eventfd_signal(*ctx, 1); + eventfd_signal(*ctx); return 0; } case VFIO_IRQ_SET_DATA_BOOL: @@ -432,7 +432,7 @@ static int vfio_ccw_mdev_set_irqs(struct vfio_ccw_private *private, return -EFAULT; if (trigger && *ctx) - eventfd_signal(*ctx, 1); + eventfd_signal(*ctx); return 0; 
} case VFIO_IRQ_SET_DATA_EVENTFD: @@ -612,7 +612,7 @@ static void vfio_ccw_mdev_request(struct vfio_device *vdev, unsigned int count) "Relaying device request to user (#%u)\n", count); - eventfd_signal(private->req_trigger, 1); + eventfd_signal(private->req_trigger); } else if (count == 0) { dev_notice(dev, "No device request channel registered, blocked until released by user\n"); diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c index 4db538a55192..542b5be73a6a 100644 --- a/drivers/s390/crypto/vfio_ap_ops.c +++ b/drivers/s390/crypto/vfio_ap_ops.c @@ -1794,7 +1794,7 @@ static void vfio_ap_mdev_request(struct vfio_device *vdev, unsigned int count) "Relaying device request to user (#%u)\n", count); - eventfd_signal(matrix_mdev->req_trigger, 1); + eventfd_signal(matrix_mdev->req_trigger); } else if (count == 0) { dev_notice(dev, "No device request registered, blocked until released by user\n"); diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index efe3e3b85769..fdd0fc7b8f25 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -831,7 +831,7 @@ static void ffs_user_copy_worker(struct work_struct *work) io_data->kiocb->ki_complete(io_data->kiocb, ret); if (io_data->ffs->ffs_eventfd && !kiocb_has_eventfd) - eventfd_signal(io_data->ffs->ffs_eventfd, 1); + eventfd_signal(io_data->ffs->ffs_eventfd); if (io_data->read) kfree(io_data->to_free); @@ -2738,7 +2738,7 @@ static void __ffs_event_add(struct ffs_data *ffs, ffs->ev.types[ffs->ev.count++] = type; wake_up_locked(&ffs->ev.waitq); if (ffs->ffs_eventfd) - eventfd_signal(ffs->ffs_eventfd, 1); + eventfd_signal(ffs->ffs_eventfd); } static void ffs_event_add(struct ffs_data *ffs, diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c index 0ddd4b8abecb..6cb5ce4a8b9a 100644 --- a/drivers/vdpa/vdpa_user/vduse_dev.c +++ b/drivers/vdpa/vdpa_user/vduse_dev.c @@ -493,7 +493,7 @@ static void 
vduse_vq_kick(struct vduse_virtqueue *vq) goto unlock; if (vq->kickfd) - eventfd_signal(vq->kickfd, 1); + eventfd_signal(vq->kickfd); else vq->kicked = true; unlock: @@ -911,7 +911,7 @@ static int vduse_kickfd_setup(struct vduse_dev *dev, eventfd_ctx_put(vq->kickfd); vq->kickfd = ctx; if (vq->ready && vq->kicked && vq->kickfd) { - eventfd_signal(vq->kickfd, 1); + eventfd_signal(vq->kickfd); vq->kicked = false; } spin_unlock(&vq->kick_lock); @@ -960,7 +960,7 @@ static bool vduse_vq_signal_irqfd(struct vduse_virtqueue *vq) spin_lock_irq(&vq->irq_lock); if (vq->ready && vq->cb.trigger) { - eventfd_signal(vq->cb.trigger, 1); + eventfd_signal(vq->cb.trigger); signal = true; } spin_unlock_irq(&vq->irq_lock); diff --git a/drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c b/drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c index c51229fccbd6..d62fbfff20b8 100644 --- a/drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c +++ b/drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c @@ -54,7 +54,7 @@ static irqreturn_t vfio_fsl_mc_irq_handler(int irq_num, void *arg) { struct vfio_fsl_mc_irq *mc_irq = (struct vfio_fsl_mc_irq *)arg; - eventfd_signal(mc_irq->trigger, 1); + eventfd_signal(mc_irq->trigger); return IRQ_HANDLED; } diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 1929103ee59a..1cbc990d42e0 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -443,7 +443,7 @@ static int vfio_pci_core_runtime_resume(struct device *dev) */ down_write(&vdev->memory_lock); if (vdev->pm_wake_eventfd_ctx) { - eventfd_signal(vdev->pm_wake_eventfd_ctx, 1); + eventfd_signal(vdev->pm_wake_eventfd_ctx); __vfio_pci_runtime_pm_exit(vdev); } up_write(&vdev->memory_lock); @@ -1883,7 +1883,7 @@ void vfio_pci_core_request(struct vfio_device *core_vdev, unsigned int count) pci_notice_ratelimited(pdev, "Relaying device request to user (#%u)\n", count); - eventfd_signal(vdev->req_trigger, 1); + eventfd_signal(vdev->req_trigger); } else if (count == 0) { pci_warn(pdev, "No device 
request channel registered, blocked until released by user\n"); @@ -2302,7 +2302,7 @@ pci_ers_result_t vfio_pci_core_aer_err_detected(struct pci_dev *pdev, mutex_lock(&vdev->igate); if (vdev->err_trigger) - eventfd_signal(vdev->err_trigger, 1); + eventfd_signal(vdev->err_trigger); mutex_unlock(&vdev->igate); diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index cbb4bcbfbf83..237beac83809 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -94,7 +94,7 @@ static void vfio_send_intx_eventfd(void *opaque, void *unused) ctx = vfio_irq_ctx_get(vdev, 0); if (WARN_ON_ONCE(!ctx)) return; - eventfd_signal(ctx->trigger, 1); + eventfd_signal(ctx->trigger); } } @@ -342,7 +342,7 @@ static irqreturn_t vfio_msihandler(int irq, void *arg) { struct eventfd_ctx *trigger = arg; - eventfd_signal(trigger, 1); + eventfd_signal(trigger); return IRQ_HANDLED; } @@ -689,11 +689,11 @@ static int vfio_pci_set_msi_trigger(struct vfio_pci_core_device *vdev, if (!ctx) continue; if (flags & VFIO_IRQ_SET_DATA_NONE) { - eventfd_signal(ctx->trigger, 1); + eventfd_signal(ctx->trigger); } else if (flags & VFIO_IRQ_SET_DATA_BOOL) { uint8_t *bools = data; if (bools[i - start]) - eventfd_signal(ctx->trigger, 1); + eventfd_signal(ctx->trigger); } } return 0; @@ -707,7 +707,7 @@ static int vfio_pci_set_ctx_trigger_single(struct eventfd_ctx **ctx, if (flags & VFIO_IRQ_SET_DATA_NONE) { if (*ctx) { if (count) { - eventfd_signal(*ctx, 1); + eventfd_signal(*ctx); } else { eventfd_ctx_put(*ctx); *ctx = NULL; @@ -722,7 +722,7 @@ static int vfio_pci_set_ctx_trigger_single(struct eventfd_ctx **ctx, trigger = *(uint8_t *)data; if (trigger && *ctx) - eventfd_signal(*ctx, 1); + eventfd_signal(*ctx); return 0; } else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { diff --git a/drivers/vfio/platform/vfio_platform_irq.c b/drivers/vfio/platform/vfio_platform_irq.c index 665197caed89..61a1bfb68ac7 100644 --- a/drivers/vfio/platform/vfio_platform_irq.c +++ 
b/drivers/vfio/platform/vfio_platform_irq.c @@ -155,7 +155,7 @@ static irqreturn_t vfio_automasked_irq_handler(int irq, void *dev_id) spin_unlock_irqrestore(&irq_ctx->lock, flags); if (ret == IRQ_HANDLED) - eventfd_signal(irq_ctx->trigger, 1); + eventfd_signal(irq_ctx->trigger); return ret; } @@ -164,7 +164,7 @@ static irqreturn_t vfio_irq_handler(int irq, void *dev_id) { struct vfio_platform_irq *irq_ctx = dev_id; - eventfd_signal(irq_ctx->trigger, 1); + eventfd_signal(irq_ctx->trigger); return IRQ_HANDLED; } diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 30df5c58db73..8d9f958946d6 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -178,7 +178,7 @@ static irqreturn_t vhost_vdpa_virtqueue_cb(void *private) struct eventfd_ctx *call_ctx = vq->call_ctx.ctx; if (call_ctx) - eventfd_signal(call_ctx, 1); + eventfd_signal(call_ctx); return IRQ_HANDLED; } @@ -189,7 +189,7 @@ static irqreturn_t vhost_vdpa_config_cb(void *private) struct eventfd_ctx *config_ctx = v->config_ctx; if (config_ctx) - eventfd_signal(config_ctx, 1); + eventfd_signal(config_ctx); return IRQ_HANDLED; } diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index e0c181ad17e3..045f666b4f12 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -2248,7 +2248,7 @@ int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log, len -= l; if (!len) { if (vq->log_ctx) - eventfd_signal(vq->log_ctx, 1); + eventfd_signal(vq->log_ctx); return 0; } } @@ -2271,7 +2271,7 @@ static int vhost_update_used_flags(struct vhost_virtqueue *vq) log_used(vq, (used - (void __user *)vq->used), sizeof vq->used->flags); if (vq->log_ctx) - eventfd_signal(vq->log_ctx, 1); + eventfd_signal(vq->log_ctx); } return 0; } @@ -2289,7 +2289,7 @@ static int vhost_update_avail_event(struct vhost_virtqueue *vq) log_used(vq, (used - (void __user *)vq->used), sizeof *vhost_avail_event(vq)); if (vq->log_ctx) - eventfd_signal(vq->log_ctx, 1); + eventfd_signal(vq->log_ctx); } return 0; } @@ 
-2715,7 +2715,7 @@ int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads, log_used(vq, offsetof(struct vring_used, idx), sizeof vq->used->idx); if (vq->log_ctx) - eventfd_signal(vq->log_ctx, 1); + eventfd_signal(vq->log_ctx); } return r; } @@ -2763,7 +2763,7 @@ void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq) { /* Signal the Guest tell them we used something up. */ if (vq->call_ctx.ctx && vhost_notify(dev, vq)) - eventfd_signal(vq->call_ctx.ctx, 1); + eventfd_signal(vq->call_ctx.ctx); } EXPORT_SYMBOL_GPL(vhost_signal); diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index f60d5f7bef94..9e942fcda5c3 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -249,7 +249,7 @@ void vhost_iotlb_map_free(struct vhost_iotlb *iotlb, #define vq_err(vq, fmt, ...) do { \ pr_debug(pr_fmt(fmt), ##__VA_ARGS__); \ if ((vq)->error_ctx) \ - eventfd_signal((vq)->error_ctx, 1);\ + eventfd_signal((vq)->error_ctx);\ } while (0) enum { diff --git a/drivers/virt/acrn/ioeventfd.c b/drivers/virt/acrn/ioeventfd.c index ac4037e9f947..4e845c6ca0b5 100644 --- a/drivers/virt/acrn/ioeventfd.c +++ b/drivers/virt/acrn/ioeventfd.c @@ -223,7 +223,7 @@ static int acrn_ioeventfd_handler(struct acrn_ioreq_client *client, mutex_lock(&client->vm->ioeventfds_lock); p = hsm_ioeventfd_match(client->vm, addr, val, size, req->type); if (p) - eventfd_signal(p->eventfd, 1); + eventfd_signal(p->eventfd); mutex_unlock(&client->vm->ioeventfds_lock); return 0; diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index 1ce7f3c7a950..7efc0c62e984 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -1147,7 +1147,7 @@ static irqreturn_t ioeventfd_interrupt(int irq, void *dev_id) if (ioreq->addr == kioeventfd->addr + VIRTIO_MMIO_QUEUE_NOTIFY && ioreq->size == kioeventfd->addr_len && (ioreq->data & QUEUE_NOTIFY_VQ_MASK) == kioeventfd->vq) { - eventfd_signal(kioeventfd->eventfd, 1); + eventfd_signal(kioeventfd->eventfd); state = 
STATE_IORESP_READY; break; } diff --git a/fs/aio.c b/fs/aio.c index d02842156b35..7c691cf84cc7 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1166,7 +1166,7 @@ static void aio_complete(struct aio_kiocb *iocb) * from IRQ context. */ if (iocb->ki_eventfd) - eventfd_signal(iocb->ki_eventfd, 1); + eventfd_signal(iocb->ki_eventfd); /* * We have to order our ring_info tail store above and test diff --git a/fs/eventfd.c b/fs/eventfd.c index 33a918f9566c..d2f7d2d8a351 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -72,22 +72,19 @@ __u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, __poll_t mask) } /** - * eventfd_signal - Adds @n to the eventfd counter. + * eventfd_signal - Increment the event counter * @ctx: [in] Pointer to the eventfd context. - * @n: [in] Value of the counter to be added to the eventfd internal counter. - * The value cannot be negative. * * This function is supposed to be called by the kernel in paths that do not * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX * value, and we signal this as overflow condition by returning a EPOLLERR * to poll(2). * - * Returns the amount by which the counter was incremented. This will be less - * than @n if the counter has overflowed. + * Returns the amount by which the counter was incremented. 
*/ -__u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n) +__u64 eventfd_signal(struct eventfd_ctx *ctx) { - return eventfd_signal_mask(ctx, n, 0); + return eventfd_signal_mask(ctx, 1, 0); } EXPORT_SYMBOL_GPL(eventfd_signal); diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h index b9d83652c097..562089431551 100644 --- a/include/linux/eventfd.h +++ b/include/linux/eventfd.h @@ -35,7 +35,7 @@ void eventfd_ctx_put(struct eventfd_ctx *ctx); struct file *eventfd_fget(int fd); struct eventfd_ctx *eventfd_ctx_fdget(int fd); struct eventfd_ctx *eventfd_ctx_fileget(struct file *file); -__u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n); +__u64 eventfd_signal(struct eventfd_ctx *ctx); __u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, __poll_t mask); int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait, __u64 *cnt); @@ -58,7 +58,7 @@ static inline struct eventfd_ctx *eventfd_ctx_fdget(int fd) return ERR_PTR(-ENOSYS); } -static inline int eventfd_signal(struct eventfd_ctx *ctx, __u64 n) +static inline int eventfd_signal(struct eventfd_ctx *ctx) { return -ENOSYS; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 774bd6e21e27..dfbb1d3b77b7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4378,7 +4378,7 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) * only one element of the array here. */ for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) - eventfd_signal(t->entries[i].eventfd, 1); + eventfd_signal(t->entries[i].eventfd); /* i = current_threshold + 1 */ i++; @@ -4390,7 +4390,7 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) * only one element of the array here. 
*/ for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) - eventfd_signal(t->entries[i].eventfd, 1); + eventfd_signal(t->entries[i].eventfd); /* Update current_threshold */ t->current_threshold = i - 1; @@ -4430,7 +4430,7 @@ static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) spin_lock(&memcg_oom_lock); list_for_each_entry(ev, &memcg->oom_notify, list) - eventfd_signal(ev->eventfd, 1); + eventfd_signal(ev->eventfd); spin_unlock(&memcg_oom_lock); return 0; @@ -4649,7 +4649,7 @@ static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, /* already in OOM ? */ if (memcg->under_oom) - eventfd_signal(eventfd, 1); + eventfd_signal(eventfd); spin_unlock(&memcg_oom_lock); return 0; @@ -4941,7 +4941,7 @@ static void memcg_event_remove(struct work_struct *work) event->unregister_event(memcg, event->eventfd); /* Notify userspace the event is going away. */ - eventfd_signal(event->eventfd, 1); + eventfd_signal(event->eventfd); eventfd_ctx_put(event->eventfd); kfree(event); diff --git a/mm/vmpressure.c b/mm/vmpressure.c index 22c6689d9302..bd5183dfd879 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -169,7 +169,7 @@ static bool vmpressure_event(struct vmpressure *vmpr, continue; if (level < ev->level) continue; - eventfd_signal(ev->efd, 1); + eventfd_signal(ev->efd); ret = true; } mutex_unlock(&vmpr->events_lock); diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c index 69ba0281f9e0..2284b3751240 100644 --- a/samples/vfio-mdev/mtty.c +++ b/samples/vfio-mdev/mtty.c @@ -234,10 +234,10 @@ static void mtty_trigger_interrupt(struct mdev_state *mdev_state) if (is_msi(mdev_state)) { if (mdev_state->msi_evtfd) - eventfd_signal(mdev_state->msi_evtfd, 1); + eventfd_signal(mdev_state->msi_evtfd); } else if (is_intx(mdev_state)) { if (mdev_state->intx_evtfd && !mdev_state->intx_mask) { - eventfd_signal(mdev_state->intx_evtfd, 1); + eventfd_signal(mdev_state->intx_evtfd); mdev_state->intx_mask = true; } } diff --git a/virt/kvm/eventfd.c 
b/virt/kvm/eventfd.c index 89912a17f5d5..c0e230f4c3e9 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -61,7 +61,7 @@ static void irqfd_resampler_notify(struct kvm_kernel_irqfd_resampler *resampler) list_for_each_entry_srcu(irqfd, &resampler->list, resampler_link, srcu_read_lock_held(&resampler->kvm->irq_srcu)) - eventfd_signal(irqfd->resamplefd, 1); + eventfd_signal(irqfd->resamplefd); } /* @@ -786,7 +786,7 @@ ioeventfd_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr, if (!ioeventfd_in_range(p, addr, len, val)) return -EOPNOTSUPP; - eventfd_signal(p->eventfd, 1); + eventfd_signal(p->eventfd); return 0; } -- cgit v1.2.3 From 120ae58593630819209a011a3f9c89f73bcc9894 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 22 Nov 2023 13:48:24 +0100 Subject: eventfd: simplify eventfd_signal_mask() The eventfd_signal_mask() helper was introduced for io_uring and similar to eventfd_signal() it always passed 1 for @n. So don't bother with that argument at all. 
Link: https://lore.kernel.org/r/20231122-vfs-eventfd-signal-v2-3-bd549b14ce0c@kernel.org Reviewed-by: Jan Kara Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- fs/eventfd.c | 7 ++++--- include/linux/eventfd.h | 5 ++--- io_uring/io_uring.c | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/eventfd.c b/fs/eventfd.c index d2f7d2d8a351..41109ba6bbe0 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -43,9 +43,10 @@ struct eventfd_ctx { int id; }; -__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, __poll_t mask) +__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask) { unsigned long flags; + __u64 n = 1; /* * Deadlock or stack overflow issues can happen if we recurse here @@ -68,7 +69,7 @@ __u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, __poll_t mask) current->in_eventfd = 0; spin_unlock_irqrestore(&ctx->wqh.lock, flags); - return n; + return n == 1; } /** @@ -84,7 +85,7 @@ __u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, __poll_t mask) */ __u64 eventfd_signal(struct eventfd_ctx *ctx) { - return eventfd_signal_mask(ctx, 1, 0); + return eventfd_signal_mask(ctx, 0); } EXPORT_SYMBOL_GPL(eventfd_signal); diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h index 562089431551..971943ecb2a6 100644 --- a/include/linux/eventfd.h +++ b/include/linux/eventfd.h @@ -36,7 +36,7 @@ struct file *eventfd_fget(int fd); struct eventfd_ctx *eventfd_ctx_fdget(int fd); struct eventfd_ctx *eventfd_ctx_fileget(struct file *file); __u64 eventfd_signal(struct eventfd_ctx *ctx); -__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, __poll_t mask); +__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask); int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait, __u64 *cnt); void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt); @@ -63,8 +63,7 @@ static inline int eventfd_signal(struct eventfd_ctx *ctx) return -ENOSYS; } -static inline int 
eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, - unsigned mask) +static inline int eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask) { return -ENOSYS; } diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index ed254076c723..70170a41eac4 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -558,7 +558,7 @@ static void io_eventfd_ops(struct rcu_head *rcu) int ops = atomic_xchg(&ev_fd->ops, 0); if (ops & BIT(IO_EVENTFD_OP_SIGNAL_BIT)) - eventfd_signal_mask(ev_fd->cq_ev_fd, 1, EPOLL_URING_WAKE); + eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE); /* IO_EVENTFD_OP_FREE_BIT may not be set here depending on callback * ordering in a race but if references are 0 we know we have to free @@ -594,7 +594,7 @@ static void io_eventfd_signal(struct io_ring_ctx *ctx) goto out; if (likely(eventfd_signal_allowed())) { - eventfd_signal_mask(ev_fd->cq_ev_fd, 1, EPOLL_URING_WAKE); + eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE); } else { atomic_inc(&ev_fd->refs); if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops)) -- cgit v1.2.3 From b7638ad0c7802ea854599ce753d0e6d20690f7e2 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 22 Nov 2023 13:48:25 +0100 Subject: eventfd: make eventfd_signal{_mask}() void No caller care about the return value. Link: https://lore.kernel.org/r/20231122-vfs-eventfd-signal-v2-4-bd549b14ce0c@kernel.org Reviewed-by: Jan Kara Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- fs/eventfd.c | 40 +++++++++++++++------------------------- include/linux/eventfd.h | 16 +++++++--------- 2 files changed, 22 insertions(+), 34 deletions(-) diff --git a/fs/eventfd.c b/fs/eventfd.c index 41109ba6bbe0..16bea05a7c78 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -43,10 +43,19 @@ struct eventfd_ctx { int id; }; -__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask) +/** + * eventfd_signal_mask - Increment the event counter + * @ctx: [in] Pointer to the eventfd context. 
+ * @mask: [in] poll mask + * + * This function is supposed to be called by the kernel in paths that do not + * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX + * value, and we signal this as overflow condition by returning a EPOLLERR + * to poll(2). + */ +void eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask) { unsigned long flags; - __u64 n = 1; /* * Deadlock or stack overflow issues can happen if we recurse here @@ -57,37 +66,18 @@ __u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask) * safe context. */ if (WARN_ON_ONCE(current->in_eventfd)) - return 0; + return; spin_lock_irqsave(&ctx->wqh.lock, flags); current->in_eventfd = 1; - if (ULLONG_MAX - ctx->count < n) - n = ULLONG_MAX - ctx->count; - ctx->count += n; + if (ctx->count < ULLONG_MAX) + ctx->count++; if (waitqueue_active(&ctx->wqh)) wake_up_locked_poll(&ctx->wqh, EPOLLIN | mask); current->in_eventfd = 0; spin_unlock_irqrestore(&ctx->wqh.lock, flags); - - return n == 1; -} - -/** - * eventfd_signal - Increment the event counter - * @ctx: [in] Pointer to the eventfd context. - * - * This function is supposed to be called by the kernel in paths that do not - * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX - * value, and we signal this as overflow condition by returning a EPOLLERR - * to poll(2). - * - * Returns the amount by which the counter was incremented. 
- */ -__u64 eventfd_signal(struct eventfd_ctx *ctx) -{ - return eventfd_signal_mask(ctx, 0); } -EXPORT_SYMBOL_GPL(eventfd_signal); +EXPORT_SYMBOL_GPL(eventfd_signal_mask); static void eventfd_free_ctx(struct eventfd_ctx *ctx) { diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h index 971943ecb2a6..e32bee4345fb 100644 --- a/include/linux/eventfd.h +++ b/include/linux/eventfd.h @@ -35,8 +35,7 @@ void eventfd_ctx_put(struct eventfd_ctx *ctx); struct file *eventfd_fget(int fd); struct eventfd_ctx *eventfd_ctx_fdget(int fd); struct eventfd_ctx *eventfd_ctx_fileget(struct file *file); -__u64 eventfd_signal(struct eventfd_ctx *ctx); -__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask); +void eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask); int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait, __u64 *cnt); void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt); @@ -58,14 +57,8 @@ static inline struct eventfd_ctx *eventfd_ctx_fdget(int fd) return ERR_PTR(-ENOSYS); } -static inline int eventfd_signal(struct eventfd_ctx *ctx) +static inline void eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask) { - return -ENOSYS; -} - -static inline int eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask) -{ - return -ENOSYS; } static inline void eventfd_ctx_put(struct eventfd_ctx *ctx) @@ -91,5 +84,10 @@ static inline void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt) #endif +static inline void eventfd_signal(struct eventfd_ctx *ctx) +{ + eventfd_signal_mask(ctx, 0); +} + #endif /* _LINUX_EVENTFD_H */ -- cgit v1.2.3 From 71eb6b6b0ba93b1467bccff57b5de746b09113d2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 22 Nov 2023 18:42:53 -0500 Subject: fs/aio: obey min_nr when doing wakeups I've been observing workloads where IPIs due to wakeups in aio_complete() are ~15% of total CPU time in the profile. 
Most of those wakeups are unnecessary when completion batching is in use in io_getevents(). This plumbs min_nr through via the wait entry, so that aio_complete() can avoid doing unnecessary wakeups. Signed-off-by: Kent Overstreet Link: https://lore.kernel.org/r/20231122234257.179390-1-kent.overstreet@linux.dev Cc: Benjamin LaHaise Cc: Christian Brauner Cc: Cc: Signed-off-by: Christian Brauner --- fs/aio.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 57 insertions(+), 10 deletions(-) diff --git a/fs/aio.c b/fs/aio.c index 7c691cf84cc7..918d80d4c5ad 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1106,6 +1106,11 @@ static inline void iocb_destroy(struct aio_kiocb *iocb) kmem_cache_free(kiocb_cachep, iocb); } +struct aio_waiter { + struct wait_queue_entry w; + size_t min_nr; +}; + /* aio_complete * Called when the io request on the given iocb is complete. */ @@ -1114,7 +1119,7 @@ static void aio_complete(struct aio_kiocb *iocb) struct kioctx *ctx = iocb->ki_ctx; struct aio_ring *ring; struct io_event *ev_page, *event; - unsigned tail, pos, head; + unsigned tail, pos, head, avail; unsigned long flags; /* @@ -1156,6 +1161,10 @@ static void aio_complete(struct aio_kiocb *iocb) ctx->completed_events++; if (ctx->completed_events > 1) refill_reqs_available(ctx, head, tail); + + avail = tail > head + ?
tail - head + : tail + ctx->nr_events - head; spin_unlock_irqrestore(&ctx->completion_lock, flags); pr_debug("added to ring %p at [%u]\n", iocb, tail); @@ -1176,8 +1185,18 @@ static void aio_complete(struct aio_kiocb *iocb) */ smp_mb(); - if (waitqueue_active(&ctx->wait)) - wake_up(&ctx->wait); + if (waitqueue_active(&ctx->wait)) { + struct aio_waiter *curr, *next; + unsigned long flags; + + spin_lock_irqsave(&ctx->wait.lock, flags); + list_for_each_entry_safe(curr, next, &ctx->wait.head, w.entry) + if (avail >= curr->min_nr) { + list_del_init_careful(&curr->w.entry); + wake_up_process(curr->w.private); + } + spin_unlock_irqrestore(&ctx->wait.lock, flags); + } } static inline void iocb_put(struct aio_kiocb *iocb) @@ -1290,7 +1309,9 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr, struct io_event __user *event, ktime_t until) { - long ret = 0; + struct hrtimer_sleeper t; + struct aio_waiter w; + long ret = 0, ret2 = 0; /* * Note that aio_read_events() is being called as the conditional - i.e. @@ -1306,12 +1327,38 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr, * the ringbuffer empty. So in practice we should be ok, but it's * something to be aware of when touching this code. 
*/ - if (until == 0) - aio_read_events(ctx, min_nr, nr, event, &ret); - else - wait_event_interruptible_hrtimeout(ctx->wait, - aio_read_events(ctx, min_nr, nr, event, &ret), - until); + aio_read_events(ctx, min_nr, nr, event, &ret); + if (until == 0 || ret < 0 || ret >= min_nr) + return ret; + + hrtimer_init_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + if (until != KTIME_MAX) { + hrtimer_set_expires_range_ns(&t.timer, until, current->timer_slack_ns); + hrtimer_sleeper_start_expires(&t, HRTIMER_MODE_REL); + } + + init_wait(&w.w); + + while (1) { + unsigned long nr_got = ret; + + w.min_nr = min_nr - ret; + + ret2 = prepare_to_wait_event(&ctx->wait, &w.w, TASK_INTERRUPTIBLE); + if (!ret2 && !t.task) + ret2 = -ETIME; + + if (aio_read_events(ctx, min_nr, nr, event, &ret) || ret2) + break; + + if (nr_got == ret) + schedule(); + } + + finish_wait(&ctx->wait, &w.w); + hrtimer_cancel(&t.timer); + destroy_hrtimer_on_stack(&t.timer); + return ret; } -- cgit v1.2.3 From e65a29f0235a438ece414d2d99bbf0d31aa97d04 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 22 Nov 2023 13:44:37 +0100 Subject: mnt_idmapping: remove check_fsmapping() The helper is a bit pointless. Just open-code the check. Link: https://lore.kernel.org/r/20231122-vfs-mnt_idmap-v1-1-dae4abdde5bd@kernel.org Signed-off-by: Christian Brauner --- fs/mnt_idmapping.c | 17 ++--------------- fs/namespace.c | 2 +- include/linux/mnt_idmapping.h | 3 --- 3 files changed, 3 insertions(+), 19 deletions(-) diff --git a/fs/mnt_idmapping.c b/fs/mnt_idmapping.c index 57d1dedf3f8f..2674942311c3 100644 --- a/fs/mnt_idmapping.c +++ b/fs/mnt_idmapping.c @@ -25,19 +25,6 @@ struct mnt_idmap nop_mnt_idmap = { }; EXPORT_SYMBOL_GPL(nop_mnt_idmap); -/** - * check_fsmapping - check whether an mount idmapping is allowed - * @idmap: idmap of the relevent mount - * @sb: super block of the filesystem - * - * Return: true if @idmap is allowed, false if not. 
- */ -bool check_fsmapping(const struct mnt_idmap *idmap, - const struct super_block *sb) -{ - return idmap->owner != sb->s_user_ns; -} - /** * initial_idmapping - check whether this is the initial mapping * @ns: idmapping to check @@ -94,8 +81,8 @@ static inline bool no_idmapping(const struct user_namespace *mnt_userns, */ vfsuid_t make_vfsuid(struct mnt_idmap *idmap, - struct user_namespace *fs_userns, - kuid_t kuid) + struct user_namespace *fs_userns, + kuid_t kuid) { uid_t uid; struct user_namespace *mnt_userns = idmap->owner; diff --git a/fs/namespace.c b/fs/namespace.c index b899cbbe24d9..78366f114515 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4289,7 +4289,7 @@ static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) * Creating an idmapped mount with the filesystem wide idmapping * doesn't make sense so block that. We don't allow mushy semantics. */ - if (!check_fsmapping(kattr->mnt_idmap, m->mnt_sb)) + if (kattr->mnt_userns == m->mnt_sb->s_user_ns) return -EINVAL; /* diff --git a/include/linux/mnt_idmapping.h b/include/linux/mnt_idmapping.h index b8da2db4ecd2..cd4d5c8781f5 100644 --- a/include/linux/mnt_idmapping.h +++ b/include/linux/mnt_idmapping.h @@ -244,7 +244,4 @@ static inline kgid_t mapped_fsgid(struct mnt_idmap *idmap, return from_vfsgid(idmap, fs_userns, VFSGIDT_INIT(current_fsgid())); } -bool check_fsmapping(const struct mnt_idmap *idmap, - const struct super_block *sb); - #endif /* _LINUX_MNT_IDMAPPING_H */ -- cgit v1.2.3 From 90fbd8b175ee75ee3d37d748b92bc317660b586d Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 22 Nov 2023 13:44:38 +0100 Subject: mnt_idmapping: remove nop check All mounts default to nop_mnt_idmap and we don't allow creating idmapped mounts that reuse the idmapping of the filesystem. So unless someone passes a non-superblock namespace to these helpers this check will always be false. Remove it and replace it with a simple check for nop_mnt_idmap. 
Link: https://lore.kernel.org/r/20231122-vfs-mnt_idmap-v1-2-dae4abdde5bd@kernel.org Signed-off-by: Christian Brauner --- fs/mnt_idmapping.c | 36 ++++++++---------------------------- 1 file changed, 8 insertions(+), 28 deletions(-) diff --git a/fs/mnt_idmapping.c b/fs/mnt_idmapping.c index 2674942311c3..35d78cb3c38a 100644 --- a/fs/mnt_idmapping.c +++ b/fs/mnt_idmapping.c @@ -39,26 +39,6 @@ static inline bool initial_idmapping(const struct user_namespace *ns) return ns == &init_user_ns; } -/** - * no_idmapping - check whether we can skip remapping a kuid/gid - * @mnt_userns: the mount's idmapping - * @fs_userns: the filesystem's idmapping - * - * This function can be used to check whether a remapping between two - * idmappings is required. - * An idmapped mount is a mount that has an idmapping attached to it that - * is different from the filsystem's idmapping and the initial idmapping. - * If the initial mapping is used or the idmapping of the mount and the - * filesystem are identical no remapping is required. - * - * Return: true if remapping can be skipped, false if not. - */ -static inline bool no_idmapping(const struct user_namespace *mnt_userns, - const struct user_namespace *fs_userns) -{ - return initial_idmapping(mnt_userns) || mnt_userns == fs_userns; -} - /** * make_vfsuid - map a filesystem kuid according to an idmapping * @idmap: the mount's idmapping @@ -68,8 +48,8 @@ static inline bool no_idmapping(const struct user_namespace *mnt_userns, * Take a @kuid and remap it from @fs_userns into @idmap. Use this * function when preparing a @kuid to be reported to userspace. * - * If no_idmapping() determines that this is not an idmapped mount we can - * simply return @kuid unchanged. + * If initial_idmapping() determines that this is not an idmapped mount + * we can simply return @kuid unchanged. 
* If initial_idmapping() tells us that the filesystem is not mounted with an * idmapping we know the value of @kuid won't change when calling * from_kuid() so we can simply retrieve the value via __kuid_val() @@ -87,7 +67,7 @@ vfsuid_t make_vfsuid(struct mnt_idmap *idmap, uid_t uid; struct user_namespace *mnt_userns = idmap->owner; - if (no_idmapping(mnt_userns, fs_userns)) + if (idmap == &nop_mnt_idmap) return VFSUIDT_INIT(kuid); if (initial_idmapping(fs_userns)) uid = __kuid_val(kuid); @@ -108,8 +88,8 @@ EXPORT_SYMBOL_GPL(make_vfsuid); * Take a @kgid and remap it from @fs_userns into @idmap. Use this * function when preparing a @kgid to be reported to userspace. * - * If no_idmapping() determines that this is not an idmapped mount we can - * simply return @kgid unchanged. + * If initial_idmapping() determines that this is not an idmapped mount + * we can simply return @kgid unchanged. * If initial_idmapping() tells us that the filesystem is not mounted with an * idmapping we know the value of @kgid won't change when calling * from_kgid() so we can simply retrieve the value via __kgid_val() @@ -125,7 +105,7 @@ vfsgid_t make_vfsgid(struct mnt_idmap *idmap, gid_t gid; struct user_namespace *mnt_userns = idmap->owner; - if (no_idmapping(mnt_userns, fs_userns)) + if (idmap == &nop_mnt_idmap) return VFSGIDT_INIT(kgid); if (initial_idmapping(fs_userns)) gid = __kgid_val(kgid); @@ -154,7 +134,7 @@ kuid_t from_vfsuid(struct mnt_idmap *idmap, uid_t uid; struct user_namespace *mnt_userns = idmap->owner; - if (no_idmapping(mnt_userns, fs_userns)) + if (idmap == &nop_mnt_idmap) return AS_KUIDT(vfsuid); uid = from_kuid(mnt_userns, AS_KUIDT(vfsuid)); if (uid == (uid_t)-1) @@ -182,7 +162,7 @@ kgid_t from_vfsgid(struct mnt_idmap *idmap, gid_t gid; struct user_namespace *mnt_userns = idmap->owner; - if (no_idmapping(mnt_userns, fs_userns)) + if (idmap == &nop_mnt_idmap) return AS_KGIDT(vfsgid); gid = from_kgid(mnt_userns, AS_KGIDT(vfsgid)); if (gid == (gid_t)-1) -- cgit v1.2.3 
From 783822e44594639848b78d4bb61dde26fba04e05 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 22 Nov 2023 13:44:39 +0100 Subject: mnt_idmapping: decouple from namespaces There's no reason we need to couple mnt idmapping to namespaces in the way we currently do. Copy the idmapping when an idmapped mount is created and don't take any reference on the namespace at all. We also can't easily refcount struct uid_gid_map because it needs to stay the size of a cacheline otherwise we risk performance regressions (Ignoring for a second that right now struct uid_gid_map isn't actually 64 byte but 72 but that's a fix for another patch series.). Link: https://lore.kernel.org/r/20231122-vfs-mnt_idmap-v1-3-dae4abdde5bd@kernel.org Reviewed-by: Josef Bacik Signed-off-by: Christian Brauner --- fs/mnt_idmapping.c | 106 +++++++++++++++++++++++++++++++++++++++++------- include/linux/uidgid.h | 13 ++++++ kernel/user_namespace.c | 4 +- 3 files changed, 106 insertions(+), 17 deletions(-) diff --git a/fs/mnt_idmapping.c b/fs/mnt_idmapping.c index 35d78cb3c38a..64c5205e2b5e 100644 --- a/fs/mnt_idmapping.c +++ b/fs/mnt_idmapping.c @@ -9,8 +9,16 @@ #include "internal.h" +/* + * Outside of this file vfs{g,u}id_t are always created from k{g,u}id_t, + * never from raw values. These are just internal helpers. + */ +#define VFSUIDT_INIT_RAW(val) (vfsuid_t){ val } +#define VFSGIDT_INIT_RAW(val) (vfsgid_t){ val } + struct mnt_idmap { - struct user_namespace *owner; + struct uid_gid_map uid_map; + struct uid_gid_map gid_map; refcount_t count; }; @@ -20,7 +28,6 @@ struct mnt_idmap { * mapped to {g,u}id 1, [...], {g,u}id 1000 to {g,u}id 1000, [...]. 
*/ struct mnt_idmap nop_mnt_idmap = { - .owner = &init_user_ns, .count = REFCOUNT_INIT(1), }; EXPORT_SYMBOL_GPL(nop_mnt_idmap); @@ -65,7 +72,6 @@ vfsuid_t make_vfsuid(struct mnt_idmap *idmap, kuid_t kuid) { uid_t uid; - struct user_namespace *mnt_userns = idmap->owner; if (idmap == &nop_mnt_idmap) return VFSUIDT_INIT(kuid); @@ -75,7 +81,7 @@ vfsuid_t make_vfsuid(struct mnt_idmap *idmap, uid = from_kuid(fs_userns, kuid); if (uid == (uid_t)-1) return INVALID_VFSUID; - return VFSUIDT_INIT(make_kuid(mnt_userns, uid)); + return VFSUIDT_INIT_RAW(map_id_down(&idmap->uid_map, uid)); } EXPORT_SYMBOL_GPL(make_vfsuid); @@ -103,7 +109,6 @@ vfsgid_t make_vfsgid(struct mnt_idmap *idmap, struct user_namespace *fs_userns, kgid_t kgid) { gid_t gid; - struct user_namespace *mnt_userns = idmap->owner; if (idmap == &nop_mnt_idmap) return VFSGIDT_INIT(kgid); @@ -113,7 +118,7 @@ vfsgid_t make_vfsgid(struct mnt_idmap *idmap, gid = from_kgid(fs_userns, kgid); if (gid == (gid_t)-1) return INVALID_VFSGID; - return VFSGIDT_INIT(make_kgid(mnt_userns, gid)); + return VFSGIDT_INIT_RAW(map_id_down(&idmap->gid_map, gid)); } EXPORT_SYMBOL_GPL(make_vfsgid); @@ -132,11 +137,10 @@ kuid_t from_vfsuid(struct mnt_idmap *idmap, struct user_namespace *fs_userns, vfsuid_t vfsuid) { uid_t uid; - struct user_namespace *mnt_userns = idmap->owner; if (idmap == &nop_mnt_idmap) return AS_KUIDT(vfsuid); - uid = from_kuid(mnt_userns, AS_KUIDT(vfsuid)); + uid = map_id_up(&idmap->uid_map, __vfsuid_val(vfsuid)); if (uid == (uid_t)-1) return INVALID_UID; if (initial_idmapping(fs_userns)) @@ -160,11 +164,10 @@ kgid_t from_vfsgid(struct mnt_idmap *idmap, struct user_namespace *fs_userns, vfsgid_t vfsgid) { gid_t gid; - struct user_namespace *mnt_userns = idmap->owner; if (idmap == &nop_mnt_idmap) return AS_KGIDT(vfsgid); - gid = from_kgid(mnt_userns, AS_KGIDT(vfsgid)); + gid = map_id_up(&idmap->gid_map, __vfsgid_val(vfsgid)); if (gid == (gid_t)-1) return INVALID_GID; if (initial_idmapping(fs_userns)) @@ -195,16 +198,91 
@@ int vfsgid_in_group_p(vfsgid_t vfsgid) #endif EXPORT_SYMBOL_GPL(vfsgid_in_group_p); +static int copy_mnt_idmap(struct uid_gid_map *map_from, + struct uid_gid_map *map_to) +{ + struct uid_gid_extent *forward, *reverse; + u32 nr_extents = READ_ONCE(map_from->nr_extents); + /* Pairs with smp_wmb() when writing the idmapping. */ + smp_rmb(); + + /* + * Don't blindly copy @map_to into @map_from if nr_extents is + * smaller or equal to UID_GID_MAP_MAX_BASE_EXTENTS. Since we + * read @nr_extents someone could have written an idmapping and + * then we might end up with inconsistent data. So just don't do + * anything at all. + */ + if (nr_extents == 0) + return 0; + + /* + * Here we know that nr_extents is greater than zero which means + * a map has been written. Since idmappings can't be changed + * once they have been written we know that we can safely copy + * from @map_to into @map_from. + */ + + if (nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) { + *map_to = *map_from; + return 0; + } + + forward = kmemdup(map_from->forward, + nr_extents * sizeof(struct uid_gid_extent), + GFP_KERNEL_ACCOUNT); + if (!forward) + return -ENOMEM; + + reverse = kmemdup(map_from->reverse, + nr_extents * sizeof(struct uid_gid_extent), + GFP_KERNEL_ACCOUNT); + if (!reverse) { + kfree(forward); + return -ENOMEM; + } + + /* + * The idmapping isn't exposed anywhere so we don't need to care + * about ordering between extent pointers and @nr_extents + * initialization. 
+ */ + map_to->forward = forward; + map_to->reverse = reverse; + map_to->nr_extents = nr_extents; + return 0; +} + +static void free_mnt_idmap(struct mnt_idmap *idmap) +{ + if (idmap->uid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { + kfree(idmap->uid_map.forward); + kfree(idmap->uid_map.reverse); + } + if (idmap->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { + kfree(idmap->gid_map.forward); + kfree(idmap->gid_map.reverse); + } + kfree(idmap); +} + struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns) { struct mnt_idmap *idmap; + int ret; idmap = kzalloc(sizeof(struct mnt_idmap), GFP_KERNEL_ACCOUNT); if (!idmap) return ERR_PTR(-ENOMEM); - idmap->owner = get_user_ns(mnt_userns); refcount_set(&idmap->count, 1); + ret = copy_mnt_idmap(&mnt_userns->uid_map, &idmap->uid_map); + if (!ret) + ret = copy_mnt_idmap(&mnt_userns->gid_map, &idmap->gid_map); + if (ret) { + free_mnt_idmap(idmap); + idmap = ERR_PTR(ret); + } return idmap; } @@ -234,9 +312,7 @@ EXPORT_SYMBOL_GPL(mnt_idmap_get); */ void mnt_idmap_put(struct mnt_idmap *idmap) { - if (idmap != &nop_mnt_idmap && refcount_dec_and_test(&idmap->count)) { - put_user_ns(idmap->owner); - kfree(idmap); - } + if (idmap != &nop_mnt_idmap && refcount_dec_and_test(&idmap->count)) + free_mnt_idmap(idmap); } EXPORT_SYMBOL_GPL(mnt_idmap_put); diff --git a/include/linux/uidgid.h b/include/linux/uidgid.h index b0542cd11aeb..415a7ca2b882 100644 --- a/include/linux/uidgid.h +++ b/include/linux/uidgid.h @@ -17,6 +17,7 @@ struct user_namespace; extern struct user_namespace init_user_ns; +struct uid_gid_map; typedef struct { uid_t val; @@ -138,6 +139,9 @@ static inline bool kgid_has_mapping(struct user_namespace *ns, kgid_t gid) return from_kgid(ns, gid) != (gid_t) -1; } +u32 map_id_down(struct uid_gid_map *map, u32 id); +u32 map_id_up(struct uid_gid_map *map, u32 id); + #else static inline kuid_t make_kuid(struct user_namespace *from, uid_t uid) @@ -186,6 +190,15 @@ static inline bool kgid_has_mapping(struct 
user_namespace *ns, kgid_t gid) return gid_valid(gid); } +static inline u32 map_id_down(struct uid_gid_map *map, u32 id) +{ + return id; +} + +static inline u32 map_id_up(struct uid_gid_map *map, u32 id) +{ + return id; +} #endif /* CONFIG_USER_NS */ #endif /* _LINUX_UIDGID_H */ diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 625101249e4d..ce4d99df5f0e 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -332,7 +332,7 @@ static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) return id; } -static u32 map_id_down(struct uid_gid_map *map, u32 id) +u32 map_id_down(struct uid_gid_map *map, u32 id) { return map_id_range_down(map, id, 1); } @@ -375,7 +375,7 @@ map_id_up_max(unsigned extents, struct uid_gid_map *map, u32 id) sizeof(struct uid_gid_extent), cmp_map_id); } -static u32 map_id_up(struct uid_gid_map *map, u32 id) +u32 map_id_up(struct uid_gid_map *map, u32 id) { struct uid_gid_extent *extent; unsigned extents = map->nr_extents; -- cgit v1.2.3 From 12c1b632d970c0138b4c5c65a1065e7d0604d272 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 22 Nov 2023 13:44:40 +0100 Subject: fs: reformat idmapped mounts entry Reformat idmapped mounts to clearly mark where it belongs. Link: https://lore.kernel.org/r/20231122-vfs-mnt_idmap-v1-4-dae4abdde5bd@kernel.org Signed-off-by: Christian Brauner --- MAINTAINERS | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 97f51d5ec1cf..d0a7b6f357ce 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8177,6 +8177,16 @@ F: fs/exportfs/ F: fs/fhandle.c F: include/linux/exportfs.h +FILESYSTEMS [IDMAPPED MOUNTS] +M: Christian Brauner +M: Seth Forshee +L: linux-fsdevel@vger.kernel.org +S: Maintained +F: Documentation/filesystems/idmappings.rst +F: fs/mnt_idmapping.c +F: include/linux/mnt_idmapping.* +F: tools/testing/selftests/mount_setattr/ + FILESYSTEMS [IOMAP] M: Christian Brauner R: Darrick J. 
Wong @@ -10252,16 +10262,6 @@ S: Maintained W: https://github.com/o2genum/ideapad-slidebar F: drivers/input/misc/ideapad_slidebar.c -IDMAPPED MOUNTS -M: Christian Brauner -M: Seth Forshee -L: linux-fsdevel@vger.kernel.org -S: Maintained -T: git git://git.kernel.org/pub/scm/linux/kernel/git/vfs/idmapping.git -F: Documentation/filesystems/idmappings.rst -F: include/linux/mnt_idmapping.* -F: tools/testing/selftests/mount_setattr/ - IDT VersaClock 5 CLOCK DRIVER M: Luca Ceresoli S: Maintained -- cgit v1.2.3 From 055ca83559912f2cfd91c9441427bac4caf3c74e Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Fri, 24 Nov 2023 16:08:22 +0100 Subject: fs/pipe: Fix lockdep false-positive in watchqueue pipe_write() When you try to splice between a normal pipe and a notification pipe, get_pipe_info(..., true) fails, so splice() falls back to treating the notification pipe like a normal pipe - so we end up in iter_file_splice_write(), which first locks the input pipe, then calls vfs_iter_write(), which locks the output pipe. Lockdep complains about that, because we're taking a pipe lock while already holding another pipe lock. I think this probably (?) can't actually lead to deadlocks, since you'd need another way to nest locking a normal pipe into locking a watch_queue pipe, but the lockdep annotations don't make that clear. Bail out earlier in pipe_write() for notification pipes, before taking the pipe lock. 
Reported-and-tested-by: Closes: https://syzkaller.appspot.com/bug?extid=011e4ea1da6692cf881c Fixes: c73be61cede5 ("pipe: Add general notification queue support") Signed-off-by: Jann Horn Link: https://lore.kernel.org/r/20231124150822.2121798-1-jannh@google.com Signed-off-by: Christian Brauner --- fs/pipe.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/fs/pipe.c b/fs/pipe.c index 804a7d789452..226e7f66b590 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -446,6 +446,18 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) bool was_empty = false; bool wake_next_writer = false; + /* + * Reject writing to watch queue pipes before the point where we lock + * the pipe. + * Otherwise, lockdep would be unhappy if the caller already has another + * pipe locked. + * If we had to support locking a normal pipe and a notification pipe at + * the same time, we could set up lockdep annotations for that, but + * since we don't actually need that, it's simpler to just bail here. + */ + if (pipe_has_watch_queue(pipe)) + return -EXDEV; + /* Null write succeeds. */ if (unlikely(total_len == 0)) return 0; @@ -458,11 +470,6 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) goto out; } - if (pipe_has_watch_queue(pipe)) { - ret = -EXDEV; - goto out; - } - /* * If it wasn't empty we try to merge new data into * the last buffer. -- cgit v1.2.3 From 7cb537b6f6d7d6529be04139178f929d9a63b918 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 26 Nov 2023 02:08:34 +0000 Subject: file: massage cleanup of files that failed to open A file that has never gotten FMODE_OPENED will never have RCU-accessed references, its final fput() is equivalent to file_free() and if it doesn't have FMODE_BACKING either, it can be done from any context and won't need task_work treatment. Now that we have SLAB_TYPESAFE_BY_RCU we can simplify this and have other callers benefit. 
The easiest way to achieve all of that is to make fput() recognize that case and call file_free() directly. No need to introduce a special primitive for that. It also means that things like a failing dentry_open() benefit from that as well. Signed-off-by: Al Viro [Christian Brauner : massage commit message] Link: https://lore.kernel.org/r/20231126020834.GC38156@ZenIV Signed-off-by: Christian Brauner --- fs/file_table.c | 16 ++++------------ fs/internal.h | 1 - fs/namei.c | 5 +---- 3 files changed, 5 insertions(+), 17 deletions(-) diff --git a/fs/file_table.c b/fs/file_table.c index de4a2915bfd4..6deac386486d 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -75,18 +75,6 @@ static inline void file_free(struct file *f) } } -void release_empty_file(struct file *f) -{ - WARN_ON_ONCE(f->f_mode & (FMODE_BACKING | FMODE_OPENED)); - if (atomic_long_dec_and_test(&f->f_count)) { - security_file_free(f); - put_cred(f->f_cred); - if (likely(!(f->f_mode & FMODE_NOACCOUNT))) - percpu_counter_dec(&nr_files); - kmem_cache_free(filp_cachep, f); - } -} - /* * Return the total number of open files in the system */ @@ -445,6 +433,10 @@ void fput(struct file *file) if (atomic_long_dec_and_test(&file->f_count)) { struct task_struct *task = current; + if (unlikely(!(file->f_mode & (FMODE_BACKING | FMODE_OPENED)))) { + file_free(file); + return; + } if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) { init_task_work(&file->f_rcuhead, ____fput); if (!task_work_add(task, &file->f_rcuhead, TWA_RESUME)) diff --git a/fs/internal.h b/fs/internal.h index 58e43341aebf..273e6fd40d1b 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -94,7 +94,6 @@ extern void chroot_fs_refs(const struct path *, const struct path *); struct file *alloc_empty_file(int flags, const struct cred *cred); struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred); struct file *alloc_empty_backing_file(int flags, const struct cred *cred); -void release_empty_file(struct file *f); static inline
void file_put_write_access(struct file *file) { diff --git a/fs/namei.c b/fs/namei.c index 71c13b2990b4..f0ead1858267 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3785,10 +3785,7 @@ static struct file *path_openat(struct nameidata *nd, WARN_ON(1); error = -EINVAL; } - if (unlikely(file->f_mode & FMODE_OPENED)) - fput(file); - else - release_empty_file(file); + fput(file); if (error == -EOPENSTALE) { if (flags & LOOKUP_RCU) error = -ECHILD; -- cgit v1.2.3 From 253ca8678d30bcf94410b54476fc1e0f1627a137 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 26 Nov 2023 12:24:38 -0800 Subject: Improve __fget_files_rcu() code generation (and thus __fget_light()) Commit 0ede61d8589c ("file: convert to SLAB_TYPESAFE_BY_RCU") caused a performance regression as reported by the kernel test robot. The __fget_light() function is one of those critical ones for some loads, and the code generation was unnecessarily impacted. Let's just write that function to better. Reported-by: kernel test robot Cc: Christian Brauner Cc: Jann Horn Cc: Mateusz Guzik Closes: https://lore.kernel.org/oe-lkp/202311201406.2022ca3f-oliver.sang@intel.com Signed-off-by: Linus Torvalds Link: https://lore.kernel.org/r/CAHk-=wiCJtLbFWNURB34b9a_R_unaH3CiMRXfkR0-iihB_z68A@mail.gmail.com Signed-off-by: Christian Brauner --- fs/file.c | 51 ++++++++++++++++++++++++++++++++----------------- include/linux/fdtable.h | 17 +++++++++++------ 2 files changed, 44 insertions(+), 24 deletions(-) diff --git a/fs/file.c b/fs/file.c index 5fb0b146e79e..50df31e104a5 100644 --- a/fs/file.c +++ b/fs/file.c @@ -959,31 +959,45 @@ static inline struct file *__fget_files_rcu(struct files_struct *files, struct file *file; struct fdtable *fdt = rcu_dereference_raw(files->fdt); struct file __rcu **fdentry; + unsigned long nospec_mask; - if (unlikely(fd >= fdt->max_fds)) - return NULL; - - fdentry = fdt->fd + array_index_nospec(fd, fdt->max_fds); + /* Mask is a 0 for invalid fd's, ~0 for valid ones */ + nospec_mask = 
array_index_mask_nospec(fd, fdt->max_fds); /* - * Ok, we have a file pointer. However, because we do - * this all locklessly under RCU, we may be racing with - * that file being closed. - * - * Such a race can take two forms: - * - * (a) the file ref already went down to zero and the - * file hasn't been reused yet or the file count - * isn't zero but the file has already been reused. + * fdentry points to the 'fd' offset, or fdt->fd[0]. + * Loading from fdt->fd[0] is always safe, because the + * array always exists. */ - file = __get_file_rcu(fdentry); + fdentry = fdt->fd + (fd & nospec_mask); + + /* Do the load, then mask any invalid result */ + file = rcu_dereference_raw(*fdentry); + file = (void *)(nospec_mask & (unsigned long)file); if (unlikely(!file)) return NULL; - if (unlikely(IS_ERR(file))) + /* + * Ok, we have a file pointer that was valid at + * some point, but it might have become stale since. + * + * We need to confirm it by incrementing the refcount + * and then check the lookup again. + * + * atomic_long_inc_not_zero() gives us a full memory + * barrier. We only really need an 'acquire' one to + * protect the loads below, but we don't have that. + */ + if (unlikely(!atomic_long_inc_not_zero(&file->f_count))) continue; /* + * Such a race can take two forms: + * + * (a) the file ref already went down to zero and the + * file hasn't been reused yet or the file count + * isn't zero but the file has already been reused. + * * (b) the file table entry has changed under us. * Note that we don't need to re-check the 'fdt->fd' * pointer having changed, because it always goes @@ -991,7 +1005,8 @@ static inline struct file *__fget_files_rcu(struct files_struct *files, * * If so, we need to put our ref and try again. 
*/ - if (unlikely(rcu_dereference_raw(files->fdt) != fdt)) { + if (unlikely(file != rcu_dereference_raw(*fdentry)) || + unlikely(rcu_dereference_raw(files->fdt) != fdt)) { fput(file); continue; } @@ -1128,13 +1143,13 @@ static unsigned long __fget_light(unsigned int fd, fmode_t mask) * atomic_read_acquire() pairs with atomic_dec_and_test() in * put_files_struct(). */ - if (atomic_read_acquire(&files->count) == 1) { + if (likely(atomic_read_acquire(&files->count) == 1)) { file = files_lookup_fd_raw(files, fd); if (!file || unlikely(file->f_mode & mask)) return 0; return (unsigned long)file; } else { - file = __fget(fd, mask); + file = __fget_files(files, fd, mask); if (!file) return 0; return FDPUT_FPUT | (unsigned long)file; diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index bc4c3287a65e..80bd7789bab1 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -83,12 +83,17 @@ struct dentry; static inline struct file *files_lookup_fd_raw(struct files_struct *files, unsigned int fd) { struct fdtable *fdt = rcu_dereference_raw(files->fdt); - - if (fd < fdt->max_fds) { - fd = array_index_nospec(fd, fdt->max_fds); - return rcu_dereference_raw(fdt->fd[fd]); - } - return NULL; + unsigned long mask = array_index_mask_nospec(fd, fdt->max_fds); + struct file *needs_masking; + + /* + * 'mask' is zero for an out-of-bounds fd, all ones for ok. + * 'fd&mask' is 'fd' for ok, or 0 for out of bounds. + * + * Accessing fdt->fd[0] is ok, but needs masking of the result. 
+ */ + needs_masking = rcu_dereference_raw(fdt->fd[fd&mask]); + return (struct file *)(mask & (unsigned long)needs_masking); } static inline struct file *files_lookup_fd_locked(struct files_struct *files, unsigned int fd) -- cgit v1.2.3 From a88c955fcfb49727d0ed86b47410f6555a8e69e4 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 30 Nov 2023 13:49:07 +0100 Subject: file: s/close_fd_get_file()/file_close_fd()/g That really shouldn't have "get" in there as that implies we're bumping the reference count which we don't do at all. We used to but not anymore. Now we're just closing the fd and picking that file from the fdtable without bumping the reference count. Update the wrong documentation while at it. Link: https://lore.kernel.org/r/20231130-vfs-files-fixes-v1-1-e73ca6f4ea83@kernel.org Reviewed-by: Jan Kara Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- drivers/android/binder.c | 2 +- fs/file.c | 14 +++++++++----- fs/open.c | 2 +- include/linux/fdtable.h | 2 +- 4 files changed, 12 insertions(+), 8 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 92128aae2d06..7658103ba760 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -1921,7 +1921,7 @@ static void binder_deferred_fd_close(int fd) if (!twcb) return; init_task_work(&twcb->twork, binder_do_fd_close); - twcb->file = close_fd_get_file(fd); + twcb->file = file_close_fd(fd); if (twcb->file) { // pin it until binder_do_fd_close(); see comments there get_file(twcb->file); diff --git a/fs/file.c b/fs/file.c index 50df31e104a5..66f04442a384 100644 --- a/fs/file.c +++ b/fs/file.c @@ -796,7 +796,7 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags) } /* - * See close_fd_get_file() below, this variant assumes current->files->file_lock + * See file_close_fd() below, this variant assumes current->files->file_lock * is held. 
*/ struct file *__close_fd_get_file(unsigned int fd) @@ -804,11 +804,15 @@ struct file *__close_fd_get_file(unsigned int fd) return pick_file(current->files, fd); } -/* - * variant of close_fd that gets a ref on the file for later fput. - * The caller must ensure that filp_close() called on the file. +/** + * file_close_fd - return file associated with fd + * @fd: file descriptor to retrieve file for + * + * Doesn't take a separate reference count. + * + * Returns: The file associated with @fd (NULL if @fd is not open) */ -struct file *close_fd_get_file(unsigned int fd) +struct file *file_close_fd(unsigned int fd) { struct files_struct *files = current->files; struct file *file; diff --git a/fs/open.c b/fs/open.c index 02dc608d40d8..48775329f3f1 100644 --- a/fs/open.c +++ b/fs/open.c @@ -1577,7 +1577,7 @@ SYSCALL_DEFINE1(close, unsigned int, fd) int retval; struct file *file; - file = close_fd_get_file(fd); + file = file_close_fd(fd); if (!file) return -EBADF; diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index 80bd7789bab1..78c8326d74ae 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -119,7 +119,7 @@ int iterate_fd(struct files_struct *, unsigned, extern int close_fd(unsigned int fd); extern int __close_range(unsigned int fd, unsigned int max_fd, unsigned int flags); -extern struct file *close_fd_get_file(unsigned int fd); +extern struct file *file_close_fd(unsigned int fd); extern int unshare_fd(unsigned long unshare_flags, unsigned int max_fds, struct files_struct **new_fdp); -- cgit v1.2.3 From 24fa3ae9467f49dd9698fd884f2c6b13cc8ea12d Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 30 Nov 2023 13:49:08 +0100 Subject: file: remove pointless wrapper Only io_uring uses __close_fd_get_file(). All it does is hide current->files but io_uring accesses files_struct directly right now anyway so it's a bit pointless. Just rename pick_file() to file_close_fd_locked() and let io_uring use it. 
Add a lockdep assert in there that we expect the caller to hold file_lock while we're at it. Link: https://lore.kernel.org/r/20231130-vfs-files-fixes-v1-2-e73ca6f4ea83@kernel.org Reviewed-by: Jens Axboe Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/file.c | 23 +++++++++-------------- fs/internal.h | 2 +- io_uring/openclose.c | 2 +- 3 files changed, 11 insertions(+), 16 deletions(-) diff --git a/fs/file.c b/fs/file.c index 66f04442a384..c8eaa0b29a08 100644 --- a/fs/file.c +++ b/fs/file.c @@ -629,19 +629,23 @@ void fd_install(unsigned int fd, struct file *file) EXPORT_SYMBOL(fd_install); /** - * pick_file - return file associatd with fd + * file_close_fd_locked - return file associated with fd * @files: file struct to retrieve file from * @fd: file descriptor to retrieve file for * + * Doesn't take a separate reference count. + * * Context: files_lock must be held. * * Returns: The file associated with @fd (NULL if @fd is not open) */ -static struct file *pick_file(struct files_struct *files, unsigned fd) +struct file *file_close_fd_locked(struct files_struct *files, unsigned fd) { struct fdtable *fdt = files_fdtable(files); struct file *file; + lockdep_assert_held(&files->file_lock); + if (fd >= fdt->max_fds) return NULL; @@ -660,7 +664,7 @@ int close_fd(unsigned fd) struct file *file; spin_lock(&files->file_lock); - file = pick_file(files, fd); + file = file_close_fd_locked(files, fd); spin_unlock(&files->file_lock); if (!file) return -EBADF; @@ -707,7 +711,7 @@ static inline void __range_close(struct files_struct *files, unsigned int fd, max_fd = min(max_fd, n); for (; fd <= max_fd; fd++) { - file = pick_file(files, fd); + file = file_close_fd_locked(files, fd); if (file) { spin_unlock(&files->file_lock); filp_close(file, files); @@ -795,15 +799,6 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags) return 0; } -/* - * See file_close_fd() below, this variant assumes current->files->file_lock - * is held. 
- */ -struct file *__close_fd_get_file(unsigned int fd) -{ - return pick_file(current->files, fd); -} - /** * file_close_fd - return file associated with fd * @fd: file descriptor to retrieve file for @@ -818,7 +813,7 @@ struct file *file_close_fd(unsigned int fd) struct file *file; spin_lock(&files->file_lock); - file = pick_file(files, fd); + file = file_close_fd_locked(files, fd); spin_unlock(&files->file_lock); return file; diff --git a/fs/internal.h b/fs/internal.h index 273e6fd40d1b..a7469ddba9b6 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -179,7 +179,7 @@ extern struct file *do_file_open_root(const struct path *, const char *, const struct open_flags *); extern struct open_how build_open_how(int flags, umode_t mode); extern int build_open_flags(const struct open_how *how, struct open_flags *op); -extern struct file *__close_fd_get_file(unsigned int fd); +struct file *file_close_fd_locked(struct files_struct *files, unsigned fd); long do_sys_ftruncate(unsigned int fd, loff_t length, int small); int chmod_common(const struct path *path, umode_t mode); diff --git a/io_uring/openclose.c b/io_uring/openclose.c index fb73adb89067..74fc22461f48 100644 --- a/io_uring/openclose.c +++ b/io_uring/openclose.c @@ -241,7 +241,7 @@ int io_close(struct io_kiocb *req, unsigned int issue_flags) return -EAGAIN; } - file = __close_fd_get_file(close->fd); + file = file_close_fd_locked(files, close->fd); spin_unlock(&files->file_lock); if (!file) goto err; -- cgit v1.2.3 From 372a34e66fb7f95124fadae9c600b231c35696a7 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 30 Nov 2023 13:49:09 +0100 Subject: fs: replace f_rcuhead with f_task_work The naming is actively misleading since we switched to SLAB_TYPESAFE_BY_RCU. rcu_head is #define callback_head. Use callback_head directly and rename f_rcuhead to f_task_work. Add comments in there to explain what it's used for. 
Link: https://lore.kernel.org/r/20231130-vfs-files-fixes-v1-3-e73ca6f4ea83@kernel.org Reviewed-by: Jan Kara Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- fs/file_table.c | 6 +++--- include/linux/fs.h | 4 +++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/file_table.c b/fs/file_table.c index 6deac386486d..3ba764d73fc9 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -407,7 +407,7 @@ static void delayed_fput(struct work_struct *unused) static void ____fput(struct callback_head *work) { - __fput(container_of(work, struct file, f_rcuhead)); + __fput(container_of(work, struct file, f_task_work)); } /* @@ -438,8 +438,8 @@ void fput(struct file *file) return; } if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) { - init_task_work(&file->f_rcuhead, ____fput); - if (!task_work_add(task, &file->f_rcuhead, TWA_RESUME)) + init_task_work(&file->f_task_work, ____fput); + if (!task_work_add(task, &file->f_task_work, TWA_RESUME)) return; /* * After this task has run exit_task_work(), diff --git a/include/linux/fs.h b/include/linux/fs.h index 98b7a7a8c42e..354fd02e0e11 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -991,8 +991,10 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) */ struct file { union { + /* fput() uses task work when closing and freeing file (default). */ + struct callback_head f_task_work; + /* fput() must use workqueue (most kernel threads). */ struct llist_node f_llist; - struct rcu_head f_rcuhead; unsigned int f_iocb_flags; }; -- cgit v1.2.3 From eac9189c96196574a83a553ca5a7543dd9f5fe3e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 30 Nov 2023 13:49:10 +0100 Subject: file: stop exposing receive_fd_user() Not every subsystem needs to have their own specialized helper. Just use the __receive_fd() helper. 
Link: https://lore.kernel.org/r/20231130-vfs-files-fixes-v1-4-e73ca6f4ea83@kernel.org Reviewed-by: Jan Kara Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- include/linux/file.h | 7 ------- include/net/scm.h | 9 +++++++++ net/compat.c | 2 +- net/core/scm.c | 2 +- 4 files changed, 11 insertions(+), 9 deletions(-) diff --git a/include/linux/file.h b/include/linux/file.h index 6e9099d29343..c0d5219c2852 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -101,13 +101,6 @@ extern int __receive_fd(struct file *file, int __user *ufd, extern int receive_fd(struct file *file, unsigned int o_flags); -static inline int receive_fd_user(struct file *file, int __user *ufd, - unsigned int o_flags) -{ - if (ufd == NULL) - return -EFAULT; - return __receive_fd(file, ufd, o_flags); -} int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags); extern void flush_delayed_fput(void); diff --git a/include/net/scm.h b/include/net/scm.h index e8c76b4be2fe..8aae2468bae0 100644 --- a/include/net/scm.h +++ b/include/net/scm.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -208,5 +209,13 @@ static inline void scm_recv_unix(struct socket *sock, struct msghdr *msg, scm_destroy_cred(scm); } +static inline int scm_recv_one_fd(struct file *f, int __user *ufd, + unsigned int flags) +{ + if (!ufd) + return -EFAULT; + return __receive_fd(f, ufd, flags); +} + #endif /* __LINUX_NET_SCM_H */ diff --git a/net/compat.c b/net/compat.c index 6564720f32b7..485db8ee9b28 100644 --- a/net/compat.c +++ b/net/compat.c @@ -297,7 +297,7 @@ void scm_detach_fds_compat(struct msghdr *msg, struct scm_cookie *scm) int err = 0, i; for (i = 0; i < fdmax; i++) { - err = receive_fd_user(scm->fp->fp[i], cmsg_data + i, o_flags); + err = scm_recv_one_fd(scm->fp->fp[i], cmsg_data + i, o_flags); if (err < 0) break; } diff --git a/net/core/scm.c b/net/core/scm.c index 880027ecf516..eec78e312550 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -319,7 
+319,7 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) } for (i = 0; i < fdmax; i++) { - err = receive_fd_user(scm->fp->fp[i], cmsg_data + i, o_flags); + err = scm_recv_one_fd(scm->fp->fp[i], cmsg_data + i, o_flags); if (err < 0) break; } -- cgit v1.2.3 From 4e94ddfe2aab72139acb8d5372fac9e6c3f3e383 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 30 Nov 2023 13:49:11 +0100 Subject: file: remove __receive_fd() Honestly, there's little value in having a helper with and without that int __user *ufd argument. It's just messy and doesn't really give us anything. Just expose receive_fd() with that argument and get rid of that helper. Link: https://lore.kernel.org/r/20231130-vfs-files-fixes-v1-5-e73ca6f4ea83@kernel.org Reviewed-by: Jan Kara Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- drivers/vdpa/vdpa_user/vduse_dev.c | 2 +- fs/file.c | 11 +++-------- include/linux/file.h | 5 +---- include/net/scm.h | 2 +- kernel/pid.c | 2 +- kernel/seccomp.c | 2 +- 6 files changed, 8 insertions(+), 16 deletions(-) diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c index 0ddd4b8abecb..fafd4610b185 100644 --- a/drivers/vdpa/vdpa_user/vduse_dev.c +++ b/drivers/vdpa/vdpa_user/vduse_dev.c @@ -1157,7 +1157,7 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, fput(f); break; } - ret = receive_fd(f, perm_to_file_flags(entry.perm)); + ret = receive_fd(f, NULL, perm_to_file_flags(entry.perm)); fput(f); break; } diff --git a/fs/file.c b/fs/file.c index c8eaa0b29a08..3b683b9101d8 100644 --- a/fs/file.c +++ b/fs/file.c @@ -1296,7 +1296,7 @@ out_unlock: } /** - * __receive_fd() - Install received file into file descriptor table + * receive_fd() - Install received file into file descriptor table * @file: struct file that was received from another process * @ufd: __user pointer to write new fd number to * @o_flags: the O_* flags to apply to the new fd entry @@ -1310,7 +1310,7 @@ out_unlock: * * Returns 
newly install fd or -ve on error. */ -int __receive_fd(struct file *file, int __user *ufd, unsigned int o_flags) +int receive_fd(struct file *file, int __user *ufd, unsigned int o_flags) { int new_fd; int error; @@ -1335,6 +1335,7 @@ int __receive_fd(struct file *file, int __user *ufd, unsigned int o_flags) __receive_sock(file); return new_fd; } +EXPORT_SYMBOL_GPL(receive_fd); int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags) { @@ -1350,12 +1351,6 @@ int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags) return new_fd; } -int receive_fd(struct file *file, unsigned int o_flags) -{ - return __receive_fd(file, NULL, o_flags); -} -EXPORT_SYMBOL_GPL(receive_fd); - static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags) { int err = -EBADF; diff --git a/include/linux/file.h b/include/linux/file.h index c0d5219c2852..6834a29338c4 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -96,10 +96,7 @@ DEFINE_CLASS(get_unused_fd, int, if (_T >= 0) put_unused_fd(_T), extern void fd_install(unsigned int fd, struct file *file); -extern int __receive_fd(struct file *file, int __user *ufd, - unsigned int o_flags); - -extern int receive_fd(struct file *file, unsigned int o_flags); +int receive_fd(struct file *file, int __user *ufd, unsigned int o_flags); int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags); diff --git a/include/net/scm.h b/include/net/scm.h index 8aae2468bae0..cf68acec4d70 100644 --- a/include/net/scm.h +++ b/include/net/scm.h @@ -214,7 +214,7 @@ static inline int scm_recv_one_fd(struct file *f, int __user *ufd, { if (!ufd) return -EFAULT; - return __receive_fd(f, ufd, flags); + return receive_fd(f, ufd, flags); } #endif /* __LINUX_NET_SCM_H */ diff --git a/kernel/pid.c b/kernel/pid.c index 6500ef956f2f..b52b10865454 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -700,7 +700,7 @@ static int pidfd_getfd(struct pid *pid, int fd) if (IS_ERR(file)) return PTR_ERR(file); - 
ret = receive_fd(file, O_CLOEXEC); + ret = receive_fd(file, NULL, O_CLOEXEC); fput(file); return ret; diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 255999ba9190..aca7b437882e 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -1072,7 +1072,7 @@ static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd, struct seccomp_kn */ list_del_init(&addfd->list); if (!addfd->setfd) - fd = receive_fd(addfd->file, addfd->flags); + fd = receive_fd(addfd->file, NULL, addfd->flags); else fd = receive_fd_replace(addfd->fd, addfd->file, addfd->flags); addfd->ret = fd; -- cgit v1.2.3 From e95aada4cb93d42e25c30a0ef9eb2923d9711d4a Mon Sep 17 00:00:00 2001 From: Lukas Schauer Date: Fri, 1 Dec 2023 11:11:28 +0100 Subject: pipe: wakeup wr_wait after setting max_usage In commit c73be61cede5 ("pipe: Add general notification queue support") a regression was introduced that would lock up resized pipes under certain conditions. See the reproducer in [1]. The commit resizing the pipe ring size was moved to a different function; doing that moved the wakeup for pipe->wr_wait before actually raising pipe->max_usage. If a pipe was full before the resize occurred it would result in the wakeup never actually triggering pipe_write. Set @max_usage and @nr_accounted before waking writers if this isn't a watch queue. 
Link: https://bugzilla.kernel.org/show_bug.cgi?id=212295 [1] Link: https://lore.kernel.org/r/20231201-orchideen-modewelt-e009de4562c6@brauner Fixes: c73be61cede5 ("pipe: Add general notification queue support") Reviewed-by: David Howells Cc: Signed-off-by: Lukas Schauer [Christian Brauner : rewrite to account for watch queues] Signed-off-by: Christian Brauner --- fs/pipe.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/pipe.c b/fs/pipe.c index 226e7f66b590..8d9286a1f2e8 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1324,6 +1324,11 @@ int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots) pipe->tail = tail; pipe->head = head; + if (!pipe_has_watch_queue(pipe)) { + pipe->max_usage = nr_slots; + pipe->nr_accounted = nr_slots; + } + spin_unlock_irq(&pipe->rd_wait.lock); /* This might have made more room for writers */ @@ -1375,8 +1380,6 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned int arg) if (ret < 0) goto out_revert_acct; - pipe->max_usage = nr_slots; - pipe->nr_accounted = nr_slots; return pipe->max_usage * PAGE_SIZE; out_revert_acct: -- cgit v1.2.3 From effa1870b29c39a520c115aa33fd1814e3610249 Mon Sep 17 00:00:00 2001 From: Hao Ge Date: Tue, 5 Dec 2023 14:45:45 +0800 Subject: fs/inode: Make relatime_need_update return bool relatime_need_update should return bool to be consistent with its caller, the function __atime_needs_update Signed-off-by: Hao Ge Link: https://lore.kernel.org/r/20231205064545.332322-1-gehao@kylinos.cn Signed-off-by: Christian Brauner --- fs/inode.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index 788aa0aa542b..961540b5f16e 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1834,37 +1834,37 @@ EXPORT_SYMBOL(bmap); * earlier than or equal to either the ctime or mtime, * or if at least a day has passed since the last atime update. 
*/ -static int relatime_need_update(struct vfsmount *mnt, struct inode *inode, +static bool relatime_need_update(struct vfsmount *mnt, struct inode *inode, struct timespec64 now) { struct timespec64 atime, mtime, ctime; if (!(mnt->mnt_flags & MNT_RELATIME)) - return 1; + return true; /* * Is mtime younger than or equal to atime? If yes, update atime: */ atime = inode_get_atime(inode); mtime = inode_get_mtime(inode); if (timespec64_compare(&mtime, &atime) >= 0) - return 1; + return true; /* * Is ctime younger than or equal to atime? If yes, update atime: */ ctime = inode_get_ctime(inode); if (timespec64_compare(&ctime, &atime) >= 0) - return 1; + return true; /* * Is the previous atime value older than a day? If yes, * update atime: */ if ((long)(now.tv_sec - atime.tv_sec) >= 24*60*60) - return 1; + return true; /* * Good, we can skip the atime update: */ - return 0; + return false; } /** -- cgit v1.2.3 From 67ca056bf1f6e6078c66032cded8ef74920d52d7 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 5 Dec 2023 12:43:37 +0100 Subject: fs: add Jan Kara as reviewer Jan's been really essential in help deal with reviews in a bunch of areas and we should really make him an official reviewer. This is long overdue imho. Link: https://lore.kernel.org/r/20231205-aufkam-neukunden-d14970a0a6cc@brauner Signed-off-by: Christian Brauner --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index d0a7b6f357ce..d60c4888e6df 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8157,6 +8157,7 @@ F: include/trace/events/fs_dax.h FILESYSTEMS (VFS and infrastructure) M: Alexander Viro M: Christian Brauner +R: Jan Kara L: linux-fsdevel@vger.kernel.org S: Maintained F: fs/* -- cgit v1.2.3 From 02105f18a26c985a47b40b7401541535ab78a1dd Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 5 Dec 2023 18:43:17 -0800 Subject: fs/hfsplus: wrapper.c: fix kernel-doc warnings Fix kernel-doc warnings found when using "W=1". 
wrapper.c:48: warning: No description found for return value of 'hfsplus_submit_bio' wrapper.c:49: warning: Function parameter or member 'opf' not described in 'hfsplus_submit_bio' wrapper.c:49: warning: Excess function parameter 'op' description in 'hfsplus_submit_bio' wrapper.c:49: warning: Excess function parameter 'op_flags' description in 'hfsplus_submit_bio' Signed-off-by: Randy Dunlap Link: https://lore.kernel.org/r/20231206024317.31020-1-rdunlap@infradead.org Cc: Cc: Alexander Viro Cc: Christian Brauner Cc: Jens Axboe Signed-off-by: Christian Brauner --- fs/hfsplus/wrapper.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index 0b791adf02e5..b0cb70400996 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -30,8 +30,7 @@ struct hfsplus_wd { * @sector: block to read or write, for blocks of HFSPLUS_SECTOR_SIZE bytes * @buf: buffer for I/O * @data: output pointer for location of requested data - * @op: direction of I/O - * @op_flags: request op flags + * @opf: request op flags * * The unit of I/O is hfsplus_min_io_size(sb), which may be bigger than * HFSPLUS_SECTOR_SIZE, and @buf must be sized accordingly. On reads @@ -43,6 +42,8 @@ struct hfsplus_wd { * that starts at the rounded-down address. As long as the data was * read using hfsplus_submit_bio() and the same buffer is used things * will work correctly. + * + * Returns: %0 on success else -errno code */ int hfsplus_submit_bio(struct super_block *sb, sector_t sector, void *buf, void **data, blk_opf_t opf) -- cgit v1.2.3 From 2b46a19db0a1769a25adc65d1784b68bd8b60046 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Fri, 8 Dec 2023 16:10:22 +0100 Subject: fs: super: use GFP_KERNEL instead of GFP_USER for super block allocation There is no reason to use a GFP_USER flag for struct super_block allocation in the alloc_super(). Instead, let's use GFP_KERNEL for that. 
From the memory management perspective, the only difference between GFP_USER and GFP_KERNEL is that GFP_USER allocations are tied to a cpuset, while GFP_KERNEL ones are not. There is no real issue and this is not a candidate to go to stable, but let's fix it for consistency's sake. Cc: Jan Kara Cc: Alexander Viro Cc: Christian Brauner Cc: Cc: Signed-off-by: Alexander Mikhalitsyn Link: https://lore.kernel.org/r/20231208151022.156273-1-aleksandr.mikhalitsyn@canonical.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/super.c b/fs/super.c index 076392396e72..6fe482371633 100644 --- a/fs/super.c +++ b/fs/super.c @@ -323,7 +323,7 @@ static void destroy_unused_super(struct super_block *s) static struct super_block *alloc_super(struct file_system_type *type, int flags, struct user_namespace *user_ns) { - struct super_block *s = kzalloc(sizeof(struct super_block), GFP_USER); + struct super_block *s = kzalloc(sizeof(struct super_block), GFP_KERNEL); static const struct super_operations default_op; int i; -- cgit v1.2.3 From 434225230081b2b3435703decfa9c4214aac22ce Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 10 Dec 2023 18:32:18 +0100 Subject: eventfd: Remove usage of the deprecated ida_simple_xx() API ida_alloc() and ida_free() should be preferred to the deprecated ida_simple_get() and ida_simple_remove(). This is less verbose. 
Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/575dcecd51097dd30c5515f9f0ed92076b4ef403.1702229520.git.christophe.jaillet@wanadoo.fr Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/eventfd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/eventfd.c b/fs/eventfd.c index 16bea05a7c78..ad8186d47ba7 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -82,7 +82,7 @@ EXPORT_SYMBOL_GPL(eventfd_signal_mask); static void eventfd_free_ctx(struct eventfd_ctx *ctx) { if (ctx->id >= 0) - ida_simple_remove(&eventfd_ida, ctx->id); + ida_free(&eventfd_ida, ctx->id); kfree(ctx); } @@ -395,7 +395,7 @@ static int do_eventfd(unsigned int count, int flags) init_waitqueue_head(&ctx->wqh); ctx->count = count; ctx->flags = flags; - ctx->id = ida_simple_get(&eventfd_ida, 0, 0, GFP_KERNEL); + ctx->id = ida_alloc(&eventfd_ida, GFP_KERNEL); flags &= EFD_SHARED_FCNTL_FLAGS; flags |= O_RDWR; -- cgit v1.2.3 From 3efdc78fdc21ab82694707eb234ab93f28d13ba8 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 13 Dec 2023 22:44:38 -0800 Subject: fs/proc: show correct device and inode numbers in /proc/pid/maps /proc/pid/maps shows device and inode numbers of vma->vm_file-s. Here is an issue. If a mapped file is on a stackable file system (e.g., overlayfs), vma->vm_file is a backing file whose f_inode is on the underlying filesystem. To show correct numbers, we need to get a user file and show its numbers. The same trick is used to show file paths in /proc/pid/maps. 
Cc: Alexander Mikhalitsyn Suggested-by: Amir Goldstein Signed-off-by: Andrei Vagin Link: https://lore.kernel.org/r/20231214064439.1023011-1-avagin@google.com Reviewed-by: Amir Goldstein Signed-off-by: Christian Brauner --- fs/proc/task_mmu.c | 3 ++- include/linux/fs.h | 18 +++++++++++++----- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index ef2eb12906da..5de0f70e52ff 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -273,7 +273,8 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) const char *name = NULL; if (file) { - struct inode *inode = file_inode(vma->vm_file); + const struct inode *inode = file_user_inode(vma->vm_file); + dev = inode->i_sb->s_dev; ino = inode->i_ino; pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT; diff --git a/include/linux/fs.h b/include/linux/fs.h index f171505940ff..a3a48a5d8728 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2523,20 +2523,28 @@ struct file *backing_file_open(const struct path *user_path, int flags, struct path *backing_file_user_path(struct file *f); /* - * file_user_path - get the path to display for memory mapped file - * * When mmapping a file on a stackable filesystem (e.g., overlayfs), the file * stored in ->vm_file is a backing file whose f_inode is on the underlying - * filesystem. When the mapped file path is displayed to user (e.g. via - * /proc//maps), this helper should be used to get the path to display - * to the user, which is the path of the fd that user has requested to map. + * filesystem. When the mapped file path and inode number are displayed to + * user (e.g. via /proc//maps), these helpers should be used to get the + * path and inode number to display to the user, which is the path of the fd + * that user has requested to map and the inode number that would be returned + * by fstat() on that same fd. 
*/ +/* Get the path to display in /proc/<pid>/maps */ static inline const struct path *file_user_path(struct file *f) { if (unlikely(f->f_mode & FMODE_BACKING)) return backing_file_user_path(f); return &f->f_path; } +/* Get the inode whose inode number to display in /proc/<pid>/maps */ +static inline const struct inode *file_user_inode(struct file *f) +{ + if (unlikely(f->f_mode & FMODE_BACKING)) + return d_inode(backing_file_user_path(f)->dentry); + return file_inode(f); +} static inline struct file *file_clone_open(struct file *file) { -- cgit v1.2.3 From b5a78c7127f2007cfc7ad322b6ce0aa4bf347138 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 13 Dec 2023 22:44:39 -0800 Subject: selftests/overlayfs: verify device and inode numbers in /proc/pid/maps When mapping a file on overlayfs, the file stored in ->vm_file is a backing file whose f_inode is on the underlying filesystem. We need to verify that /proc/pid/maps contains numbers of the overlayfs file, but not its backing file. Cc: Amir Goldstein Cc: Alexander Mikhalitsyn Signed-off-by: Andrei Vagin Link: https://lore.kernel.org/r/20231214064439.1023011-2-avagin@google.com Reviewed-by: Amir Goldstein Signed-off-by: Christian Brauner --- tools/testing/selftests/Makefile | 1 + .../selftests/filesystems/overlayfs/.gitignore | 2 + .../selftests/filesystems/overlayfs/Makefile | 7 + .../selftests/filesystems/overlayfs/dev_in_maps.c | 182 +++++++++++++++++++++ .../testing/selftests/filesystems/overlayfs/log.h | 26 +++ 5 files changed, 218 insertions(+) create mode 100644 tools/testing/selftests/filesystems/overlayfs/.gitignore create mode 100644 tools/testing/selftests/filesystems/overlayfs/Makefile create mode 100644 tools/testing/selftests/filesystems/overlayfs/dev_in_maps.c create mode 100644 tools/testing/selftests/filesystems/overlayfs/log.h diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 3b2061d1c1a5..0939a40abb28 100644 --- a/tools/testing/selftests/Makefile +++
b/tools/testing/selftests/Makefile @@ -26,6 +26,7 @@ TARGETS += filesystems TARGETS += filesystems/binderfs TARGETS += filesystems/epoll TARGETS += filesystems/fat +TARGETS += filesystems/overlayfs TARGETS += firmware TARGETS += fpu TARGETS += ftrace diff --git a/tools/testing/selftests/filesystems/overlayfs/.gitignore b/tools/testing/selftests/filesystems/overlayfs/.gitignore new file mode 100644 index 000000000000..52ae618fdd98 --- /dev/null +++ b/tools/testing/selftests/filesystems/overlayfs/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +dev_in_maps diff --git a/tools/testing/selftests/filesystems/overlayfs/Makefile b/tools/testing/selftests/filesystems/overlayfs/Makefile new file mode 100644 index 000000000000..56b2b48a765b --- /dev/null +++ b/tools/testing/selftests/filesystems/overlayfs/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0 + +TEST_GEN_PROGS := dev_in_maps + +CFLAGS := -Wall -Werror + +include ../../lib.mk diff --git a/tools/testing/selftests/filesystems/overlayfs/dev_in_maps.c b/tools/testing/selftests/filesystems/overlayfs/dev_in_maps.c new file mode 100644 index 000000000000..e19ab0e85709 --- /dev/null +++ b/tools/testing/selftests/filesystems/overlayfs/dev_in_maps.c @@ -0,0 +1,182 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../../kselftest.h" +#include "log.h" + +static int sys_fsopen(const char *fsname, unsigned int flags) +{ + return syscall(__NR_fsopen, fsname, flags); +} + +static int sys_fsconfig(int fd, unsigned int cmd, const char *key, const char *value, int aux) +{ + return syscall(__NR_fsconfig, fd, cmd, key, value, aux); +} + +static int sys_fsmount(int fd, unsigned int flags, unsigned int attr_flags) +{ + return syscall(__NR_fsmount, fd, flags, attr_flags); +} + +static int sys_move_mount(int from_dfd, const char *from_pathname, + int to_dfd, const 
char *to_pathname, + unsigned int flags) +{ + return syscall(__NR_move_mount, from_dfd, from_pathname, to_dfd, to_pathname, flags); +} + +static long get_file_dev_and_inode(void *addr, struct statx *stx) +{ + char buf[4096]; + FILE *mapf; + + mapf = fopen("/proc/self/maps", "r"); + if (mapf == NULL) + return pr_perror("fopen(/proc/self/maps)"); + + while (fgets(buf, sizeof(buf), mapf)) { + unsigned long start, end; + uint32_t maj, min; + __u64 ino; + + if (sscanf(buf, "%lx-%lx %*s %*s %x:%x %llu", + &start, &end, &maj, &min, &ino) != 5) + return pr_perror("unable to parse: %s", buf); + if (start == (unsigned long)addr) { + stx->stx_dev_major = maj; + stx->stx_dev_minor = min; + stx->stx_ino = ino; + return 0; + } + } + + return pr_err("unable to find the mapping"); +} + +static int ovl_mount(void) +{ + int tmpfs, fsfd, ovl; + + fsfd = sys_fsopen("tmpfs", 0); + if (fsfd == -1) + return pr_perror("fsopen(tmpfs)"); + + if (sys_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) == -1) + return pr_perror("FSCONFIG_CMD_CREATE"); + + tmpfs = sys_fsmount(fsfd, 0, 0); + if (tmpfs == -1) + return pr_perror("fsmount"); + + close(fsfd); + + /* overlayfs can't be constructed on top of a detached mount. 
*/ + if (sys_move_mount(tmpfs, "", AT_FDCWD, "/tmp", MOVE_MOUNT_F_EMPTY_PATH)) + return pr_perror("move_mount"); + close(tmpfs); + + if (mkdir("/tmp/w", 0755) == -1 || + mkdir("/tmp/u", 0755) == -1 || + mkdir("/tmp/l", 0755) == -1) + return pr_perror("mkdir"); + + fsfd = sys_fsopen("overlay", 0); + if (fsfd == -1) + return pr_perror("fsopen(overlay)"); + if (sys_fsconfig(fsfd, FSCONFIG_SET_STRING, "source", "test", 0) == -1 || + sys_fsconfig(fsfd, FSCONFIG_SET_STRING, "lowerdir", "/tmp/l", 0) == -1 || + sys_fsconfig(fsfd, FSCONFIG_SET_STRING, "upperdir", "/tmp/u", 0) == -1 || + sys_fsconfig(fsfd, FSCONFIG_SET_STRING, "workdir", "/tmp/w", 0) == -1) + return pr_perror("fsconfig"); + if (sys_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) == -1) + return pr_perror("fsconfig"); + ovl = sys_fsmount(fsfd, 0, 0); + if (ovl == -1) + return pr_perror("fsmount"); + + return ovl; +} + +/* + * Check that the file device and inode shown in /proc/pid/maps match values + * returned by stat(2). + */ +static int test(void) +{ + struct statx stx, mstx; + int ovl, fd; + void *addr; + + ovl = ovl_mount(); + if (ovl == -1) + return -1; + + fd = openat(ovl, "test", O_RDWR | O_CREAT, 0644); + if (fd == -1) + return pr_perror("openat"); + + addr = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_FILE | MAP_SHARED, fd, 0); + if (addr == MAP_FAILED) + return pr_perror("mmap"); + + if (get_file_dev_and_inode(addr, &mstx)) + return -1; + if (statx(fd, "", AT_EMPTY_PATH | AT_STATX_SYNC_AS_STAT, STATX_INO, &stx)) + return pr_perror("statx"); + + if (stx.stx_dev_major != mstx.stx_dev_major || + stx.stx_dev_minor != mstx.stx_dev_minor || + stx.stx_ino != mstx.stx_ino) + return pr_fail("unmatched dev:ino %x:%x:%llx (expected %x:%x:%llx)\n", + mstx.stx_dev_major, mstx.stx_dev_minor, mstx.stx_ino, + stx.stx_dev_major, stx.stx_dev_minor, stx.stx_ino); + + ksft_test_result_pass("devices are matched\n"); + return 0; +} + +int main(int argc, char **argv) +{ + int fsfd; + + fsfd = sys_fsopen("overlay", 
0); + if (fsfd == -1) { + ksft_test_result_skip("unable to create overlay mount\n"); + return 1; + } + close(fsfd); + + /* Create a new mount namespace to not care about cleaning test mounts. */ + if (unshare(CLONE_NEWNS) == -1) { + ksft_test_result_skip("unable to create a new mount namespace\n"); + return 1; + } + + if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) == -1) { + pr_perror("mount"); + return 1; + } + + ksft_set_plan(1); + + if (test()) + return 1; + + ksft_exit_pass(); + return 0; +} diff --git a/tools/testing/selftests/filesystems/overlayfs/log.h b/tools/testing/selftests/filesystems/overlayfs/log.h new file mode 100644 index 000000000000..db64df2a8483 --- /dev/null +++ b/tools/testing/selftests/filesystems/overlayfs/log.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __SELFTEST_TIMENS_LOG_H__ +#define __SELFTEST_TIMENS_LOG_H__ + +#define pr_msg(fmt, lvl, ...) \ + ksft_print_msg("[%s] (%s:%d)\t" fmt "\n", \ + lvl, __FILE__, __LINE__, ##__VA_ARGS__) + +#define pr_p(func, fmt, ...) func(fmt ": %m", ##__VA_ARGS__) + +#define pr_err(fmt, ...) \ + ({ \ + ksft_test_result_error(fmt "\n", ##__VA_ARGS__); \ + -1; \ + }) + +#define pr_fail(fmt, ...) \ + ({ \ + ksft_test_result_fail(fmt, ##__VA_ARGS__); \ + -1; \ + }) + +#define pr_perror(fmt, ...) pr_p(pr_err, fmt, ##__VA_ARGS__) + +#endif -- cgit v1.2.3 From 376870aa2344397d6fbc3e7be036f2f4e9ba77c1 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Fri, 15 Dec 2023 14:09:27 +0100 Subject: fs: fix doc comment typo fs tree wide Do the replacement: s/simply passs @nop_mnt_idmap/simply pass @nop_mnt_idmap/ in the fs/ tree. Found by chance while working on support for idmapped mounts in fuse. 
Cc: Jan Kara Cc: Alexander Viro Cc: Christian Brauner Cc: Cc: Signed-off-by: Alexander Mikhalitsyn Link: https://lore.kernel.org/r/20231215130927.136917-1-aleksandr.mikhalitsyn@canonical.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/attr.c | 2 +- fs/inode.c | 2 +- fs/namei.c | 22 +++++++++++----------- fs/posix_acl.c | 4 ++-- fs/stat.c | 2 +- 5 files changed, 16 insertions(+), 16 deletions(-) diff --git a/fs/attr.c b/fs/attr.c index bdf5deb06ea9..5a13f0c8495f 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -157,7 +157,7 @@ static bool chgrp_ok(struct mnt_idmap *idmap, * the vfsmount must be passed through @idmap. This function will then * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be - * performed on the raw inode simply passs @nop_mnt_idmap. + * performed on the raw inode simply pass @nop_mnt_idmap. * * Should be called as the first thing in ->setattr implementations, * possibly after taking additional locks. diff --git a/fs/inode.c b/fs/inode.c index 961540b5f16e..287c6269553d 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -2402,7 +2402,7 @@ EXPORT_SYMBOL(inode_init_owner); * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs @nop_mnt_idmap. + * raw inode simply pass @nop_mnt_idmap. */ bool inode_owner_or_capable(struct mnt_idmap *idmap, const struct inode *inode) diff --git a/fs/namei.c b/fs/namei.c index 53db89e99f97..a1124bf29b3b 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -289,7 +289,7 @@ EXPORT_SYMBOL(putname); * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. 
* On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs @nop_mnt_idmap. + * raw inode simply pass @nop_mnt_idmap. */ static int check_acl(struct mnt_idmap *idmap, struct inode *inode, int mask) @@ -334,7 +334,7 @@ static int check_acl(struct mnt_idmap *idmap, * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs @nop_mnt_idmap. + * raw inode simply pass @nop_mnt_idmap. */ static int acl_permission_check(struct mnt_idmap *idmap, struct inode *inode, int mask) @@ -395,7 +395,7 @@ static int acl_permission_check(struct mnt_idmap *idmap, * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs @nop_mnt_idmap. + * raw inode simply pass @nop_mnt_idmap. */ int generic_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) @@ -3158,7 +3158,7 @@ static inline umode_t vfs_prepare_mode(struct mnt_idmap *idmap, * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs @nop_mnt_idmap. + * raw inode simply pass @nop_mnt_idmap. */ int vfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool want_excl) @@ -3646,7 +3646,7 @@ static int do_open(struct nameidata *nd, * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. 
* On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs @nop_mnt_idmap. + * raw inode simply pass @nop_mnt_idmap. */ static int vfs_tmpfile(struct mnt_idmap *idmap, const struct path *parentpath, @@ -3954,7 +3954,7 @@ EXPORT_SYMBOL(user_path_create); * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs @nop_mnt_idmap. + * raw inode simply pass @nop_mnt_idmap. */ int vfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) @@ -4080,7 +4080,7 @@ SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, d * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs @nop_mnt_idmap. + * raw inode simply pass @nop_mnt_idmap. */ int vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) @@ -4161,7 +4161,7 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode) * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs @nop_mnt_idmap. + * raw inode simply pass @nop_mnt_idmap. */ int vfs_rmdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry) @@ -4290,7 +4290,7 @@ SYSCALL_DEFINE1(rmdir, const char __user *, pathname) * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. 
* On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs @nop_mnt_idmap. + * raw inode simply pass @nop_mnt_idmap. */ int vfs_unlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, struct inode **delegated_inode) @@ -4443,7 +4443,7 @@ SYSCALL_DEFINE1(unlink, const char __user *, pathname) * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs @nop_mnt_idmap. + * raw inode simply pass @nop_mnt_idmap. */ int vfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *oldname) @@ -4535,7 +4535,7 @@ SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newn * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the - * raw inode simply passs @nop_mnt_idmap. + * raw inode simply pass @nop_mnt_idmap. */ int vfs_link(struct dentry *old_dentry, struct mnt_idmap *idmap, struct inode *dir, struct dentry *new_dentry, diff --git a/fs/posix_acl.c b/fs/posix_acl.c index a05fe94970ce..e1af20893ebe 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -600,7 +600,7 @@ EXPORT_SYMBOL(__posix_acl_chmod); * the vfsmount must be passed through @idmap. This function will then * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be - * performed on the raw inode simply passs @nop_mnt_idmap. + * performed on the raw inode simply pass @nop_mnt_idmap. */ int posix_acl_chmod(struct mnt_idmap *idmap, struct dentry *dentry, @@ -700,7 +700,7 @@ EXPORT_SYMBOL_GPL(posix_acl_create); * the vfsmount must be passed through @idmap. 
This function will then * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be - * performed on the raw inode simply passs @nop_mnt_idmap. + * performed on the raw inode simply pass @nop_mnt_idmap. * * Called from set_acl inode operations. */ diff --git a/fs/stat.c b/fs/stat.c index 24bb0209e459..0ab525f80a49 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -41,7 +41,7 @@ * the vfsmount must be passed through @idmap. This function will then * take care to map the inode according to @idmap before filling in the * uid and gid filds. On non-idmapped mounts or if permission checking is to be - * performed on the raw inode simply passs @nop_mnt_idmap. + * performed on the raw inode simply pass @nop_mnt_idmap. */ void generic_fillattr(struct mnt_idmap *idmap, u32 request_mask, struct inode *inode, struct kstat *stat) -- cgit v1.2.3 From 4cf8249dc907398f694d310b89b494c144a4d9ec Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 18 Dec 2023 20:54:14 -0800 Subject: ntfs: dir.c: fix kernel-doc function parameter warnings Correct the kernel-doc function parameter warnings for function ntfs_dir_fsync() to prevent the following kernel-doc warnings: dir.c:1489: warning: Function parameter or member 'start' not described in 'ntfs_dir_fsync' dir.c:1489: warning: Function parameter or member 'end' not described in 'ntfs_dir_fsync' dir.c:1489: warning: Excess function parameter 'dentry' description in 'ntfs_dir_fsync' Signed-off-by: Randy Dunlap Link: https://lore.kernel.org/r/20231219045414.24670-1-rdunlap@infradead.org Reviewed-by: Namjae Jeon Cc: Anton Altaparmakov Cc: Namjae Jeon Cc: Cc: Andrew Morton Signed-off-by: Christian Brauner --- fs/ntfs/dir.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c index 4596c90e7b7c..629723a8d712 100644 --- a/fs/ntfs/dir.c +++ b/fs/ntfs/dir.c @@ -1462,7 +1462,8 @@ static int ntfs_dir_open(struct inode *vi, struct 
file *filp) /** * ntfs_dir_fsync - sync a directory to disk * @filp: directory to be synced - * @dentry: dentry describing the directory to sync + * @start: offset in bytes of the beginning of data range to sync + * @end: offset in bytes of the end of data range (inclusive) * @datasync: if non-zero only flush user data and not metadata * * Data integrity sync of a directory to disk. Used for fsync, fdatasync, and -- cgit v1.2.3 From 1bfc466b13cf6652ba227c282c27a30ffede69a5 Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Thu, 21 Dec 2023 12:01:21 +0300 Subject: watch_queue: fix kcalloc() arguments order When compiling with gcc version 14.0.0 20231220 (experimental) and W=1, I've noticed the following warning: kernel/watch_queue.c: In function 'watch_queue_set_size': kernel/watch_queue.c:273:32: warning: 'kcalloc' sizes specified with 'sizeof' in the earlier argument and not in the later argument [-Wcalloc-transposed-args] 273 | pages = kcalloc(sizeof(struct page *), nr_pages, GFP_KERNEL); | ^~~~~~ Since the 'n' and 'size' arguments of 'kcalloc()' are multiplied to calculate the final size, their actual order doesn't affect the result and so this is not a bug. But it's still worth fixing.
Signed-off-by: Dmitry Antipov Link: https://lore.kernel.org/r/20231221090139.12579-1-dmantipov@yandex.ru Signed-off-by: Christian Brauner --- kernel/watch_queue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index 778b4056700f..03b90d7d2175 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -270,7 +270,7 @@ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes) goto error; ret = -ENOMEM; - pages = kcalloc(sizeof(struct page *), nr_pages, GFP_KERNEL); + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); if (!pages) goto error; -- cgit v1.2.3 From dd8f87f21dc3da2eaf46e7401173f935b90b13a8 Mon Sep 17 00:00:00 2001 From: Edward Adam Davis Date: Tue, 26 Dec 2023 15:16:09 +0800 Subject: reiserfs: fix uninit-value in comp_keys The cpu_key was not initialized in reiserfs_delete_solid_item(), which triggered this issue. Reported-and-tested-by: Signed-off-by: Edward Adam Davis Link: https://lore.kernel.org/r/tencent_9EA7E746DE92DBC66049A62EDF6ED64CA706@qq.com Signed-off-by: Christian Brauner --- fs/reiserfs/stree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index 2138ee7d271d..5faf702f8d15 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -1407,7 +1407,7 @@ void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th, INITIALIZE_PATH(path); int item_len = 0; int tb_init = 0; - struct cpu_key cpu_key; + struct cpu_key cpu_key = {}; int retval; int quota_cut_bytes = 0; -- cgit v1.2.3