Diffstat (limited to 'fs')
-rw-r--r-- fs/Kconfig | 26
-rw-r--r-- fs/Makefile | 1
-rw-r--r-- fs/adfs/inode.c | 11
-rw-r--r-- fs/afs/Makefile | 2
-rw-r--r-- fs/afs/addr_list.c | 224
-rw-r--r-- fs/afs/addr_prefs.c | 531
-rw-r--r-- fs/afs/afs.h | 3
-rw-r--r-- fs/afs/callback.c | 141
-rw-r--r-- fs/afs/cell.c | 5
-rw-r--r-- fs/afs/cmservice.c | 5
-rw-r--r-- fs/afs/dir.c | 66
-rw-r--r-- fs/afs/dir_silly.c | 2
-rw-r--r-- fs/afs/file.c | 20
-rw-r--r-- fs/afs/fs_operation.c | 85
-rw-r--r-- fs/afs/fs_probe.c | 323
-rw-r--r-- fs/afs/fsclient.c | 74
-rw-r--r-- fs/afs/inode.c | 204
-rw-r--r-- fs/afs/internal.h | 370
-rw-r--r-- fs/afs/main.c | 1
-rw-r--r-- fs/afs/misc.c | 10
-rw-r--r-- fs/afs/proc.c | 102
-rw-r--r-- fs/afs/rotate.c | 520
-rw-r--r-- fs/afs/rxrpc.c | 107
-rw-r--r-- fs/afs/server.c | 135
-rw-r--r-- fs/afs/server_list.c | 174
-rw-r--r-- fs/afs/super.c | 7
-rw-r--r-- fs/afs/validation.c | 473
-rw-r--r-- fs/afs/vl_alias.c | 69
-rw-r--r-- fs/afs/vl_list.c | 29
-rw-r--r-- fs/afs/vl_probe.c | 60
-rw-r--r-- fs/afs/vl_rotate.c | 215
-rw-r--r-- fs/afs/vlclient.c | 143
-rw-r--r-- fs/afs/volume.c | 61
-rw-r--r-- fs/afs/write.c | 14
-rw-r--r-- fs/afs/yfsclient.c | 25
-rw-r--r-- fs/aio.c | 87
-rw-r--r-- fs/attr.c | 2
-rw-r--r-- fs/backing-file.c | 336
-rw-r--r-- fs/bcachefs/fs-ioctl.c | 4
-rw-r--r-- fs/bcachefs/fs.c | 2
-rw-r--r-- fs/bcachefs/super-io.c | 19
-rw-r--r-- fs/bcachefs/super_types.h | 1
-rw-r--r-- fs/bfs/file.c | 9
-rw-r--r-- fs/btrfs/accessors.c | 98
-rw-r--r-- fs/btrfs/accessors.h | 4
-rw-r--r-- fs/btrfs/bio.c | 17
-rw-r--r-- fs/btrfs/bio.h | 4
-rw-r--r-- fs/btrfs/block-group.c | 169
-rw-r--r-- fs/btrfs/block-group.h | 6
-rw-r--r-- fs/btrfs/btrfs_inode.h | 10
-rw-r--r-- fs/btrfs/compression.c | 139
-rw-r--r-- fs/btrfs/compression.h | 5
-rw-r--r-- fs/btrfs/ctree.c | 63
-rw-r--r-- fs/btrfs/ctree.h | 17
-rw-r--r-- fs/btrfs/defrag.c | 13
-rw-r--r-- fs/btrfs/delayed-inode.c | 109
-rw-r--r-- fs/btrfs/dev-replace.c | 28
-rw-r--r-- fs/btrfs/disk-io.c | 155
-rw-r--r-- fs/btrfs/disk-io.h | 3
-rw-r--r-- fs/btrfs/extent-io-tree.c | 119
-rw-r--r-- fs/btrfs/extent-io-tree.h | 18
-rw-r--r-- fs/btrfs/extent-tree.c | 104
-rw-r--r-- fs/btrfs/extent_io.c | 1087
-rw-r--r-- fs/btrfs/extent_io.h | 80
-rw-r--r-- fs/btrfs/extent_map.c | 195
-rw-r--r-- fs/btrfs/extent_map.h | 77
-rw-r--r-- fs/btrfs/file-item.c | 15
-rw-r--r-- fs/btrfs/file.c | 27
-rw-r--r-- fs/btrfs/free-space-cache.c | 4
-rw-r--r-- fs/btrfs/fs.h | 18
-rw-r--r-- fs/btrfs/inode.c | 155
-rw-r--r-- fs/btrfs/ioctl.c | 12
-rw-r--r-- fs/btrfs/lru_cache.c | 2
-rw-r--r-- fs/btrfs/lzo.c | 4
-rw-r--r-- fs/btrfs/messages.c | 2
-rw-r--r-- fs/btrfs/messages.h | 2
-rw-r--r-- fs/btrfs/ordered-data.c | 5
-rw-r--r-- fs/btrfs/ordered-data.h | 7
-rw-r--r-- fs/btrfs/qgroup.c | 2
-rw-r--r-- fs/btrfs/raid56.c | 7
-rw-r--r-- fs/btrfs/raid56.h | 2
-rw-r--r-- fs/btrfs/reflink.c | 6
-rw-r--r-- fs/btrfs/relocation.c | 7
-rw-r--r-- fs/btrfs/scrub.c | 63
-rw-r--r-- fs/btrfs/subpage.c | 373
-rw-r--r-- fs/btrfs/subpage.h | 82
-rw-r--r-- fs/btrfs/super.c | 2291
-rw-r--r-- fs/btrfs/super.h | 5
-rw-r--r-- fs/btrfs/sysfs.c | 4
-rw-r--r-- fs/btrfs/tests/btrfs-tests.c | 5
-rw-r--r-- fs/btrfs/tests/btrfs-tests.h | 1
-rw-r--r-- fs/btrfs/tests/extent-io-tests.c | 4
-rw-r--r-- fs/btrfs/tests/extent-map-tests.c | 143
-rw-r--r-- fs/btrfs/tests/inode-tests.c | 60
-rw-r--r-- fs/btrfs/tree-checker.h | 2
-rw-r--r-- fs/btrfs/tree-log.c | 17
-rw-r--r-- fs/btrfs/volumes.c | 934
-rw-r--r-- fs/btrfs/volumes.h | 47
-rw-r--r-- fs/btrfs/xattr.c | 55
-rw-r--r-- fs/btrfs/zlib.c | 6
-rw-r--r-- fs/btrfs/zoned.c | 66
-rw-r--r-- fs/btrfs/zoned.h | 12
-rw-r--r-- fs/btrfs/zstd.c | 7
-rw-r--r-- fs/buffer.c | 283
-rw-r--r-- fs/cachefiles/daemon.c | 15
-rw-r--r-- fs/cachefiles/interface.c | 7
-rw-r--r-- fs/cachefiles/internal.h | 59
-rw-r--r-- fs/cachefiles/io.c | 5
-rw-r--r-- fs/cachefiles/ondemand.c | 166
-rw-r--r-- fs/ceph/addr.c | 4
-rw-r--r-- fs/ceph/file.c | 13
-rw-r--r-- fs/coda/file.c | 2
-rw-r--r-- fs/dax.c | 2
-rw-r--r-- fs/dcache.c | 8
-rw-r--r-- fs/debugfs/file.c | 28
-rw-r--r-- fs/direct-io.c | 2
-rw-r--r-- fs/dlm/debug_fs.c | 6
-rw-r--r-- fs/dlm/lowcomms.c | 14
-rw-r--r-- fs/dlm/plock.c | 20
-rw-r--r-- fs/efivarfs/inode.c | 3
-rw-r--r-- fs/efivarfs/internal.h | 8
-rw-r--r-- fs/efivarfs/super.c | 66
-rw-r--r-- fs/efivarfs/vars.c | 5
-rw-r--r-- fs/eventfd.c | 46
-rw-r--r-- fs/exec.c | 3
-rw-r--r-- fs/ext2/inode.c | 2
-rw-r--r-- fs/ext4/inline.c | 3
-rw-r--r-- fs/ext4/inode.c | 10
-rw-r--r-- fs/ext4/ioctl.c | 4
-rw-r--r-- fs/ext4/page-io.c | 2
-rw-r--r-- fs/ext4/super.c | 8
-rw-r--r-- fs/f2fs/compress.c | 2
-rw-r--r-- fs/f2fs/file.c | 4
-rw-r--r-- fs/f2fs/inode.c | 2
-rw-r--r-- fs/file.c | 97
-rw-r--r-- fs/file_table.c | 22
-rw-r--r-- fs/freevxfs/vxfs_bmap.c | 8
-rw-r--r-- fs/freevxfs/vxfs_immed.c | 2
-rw-r--r-- fs/freevxfs/vxfs_lookup.c | 3
-rw-r--r-- fs/fuse/file.c | 5
-rw-r--r-- fs/gfs2/aops.c | 49
-rw-r--r-- fs/gfs2/dentry.c | 23
-rw-r--r-- fs/gfs2/export.c | 3
-rw-r--r-- fs/gfs2/file.c | 2
-rw-r--r-- fs/gfs2/glock.c | 49
-rw-r--r-- fs/gfs2/glock.h | 1
-rw-r--r-- fs/gfs2/glops.c | 4
-rw-r--r-- fs/gfs2/inode.c | 8
-rw-r--r-- fs/gfs2/lock_dlm.c | 8
-rw-r--r-- fs/gfs2/log.c | 63
-rw-r--r-- fs/gfs2/lops.c | 21
-rw-r--r-- fs/gfs2/meta_io.c | 9
-rw-r--r-- fs/gfs2/ops_fstype.c | 4
-rw-r--r-- fs/gfs2/quota.c | 22
-rw-r--r-- fs/gfs2/recovery.c | 2
-rw-r--r-- fs/gfs2/rgrp.c | 12
-rw-r--r-- fs/gfs2/super.c | 88
-rw-r--r-- fs/gfs2/sys.c | 2
-rw-r--r-- fs/gfs2/trans.c | 2
-rw-r--r-- fs/gfs2/util.c | 4
-rw-r--r-- fs/gfs2/util.h | 15
-rw-r--r-- fs/hfs/inode.c | 8
-rw-r--r-- fs/hfsplus/inode.c | 8
-rw-r--r-- fs/hfsplus/wrapper.c | 5
-rw-r--r-- fs/hugetlbfs/inode.c | 10
-rw-r--r-- fs/inode.c | 26
-rw-r--r-- fs/internal.h | 13
-rw-r--r-- fs/ioctl.c | 3
-rw-r--r-- fs/iomap/buffered-io.c | 14
-rw-r--r-- fs/jffs2/debug.c | 2
-rw-r--r-- fs/jfs/jfs_dmap.c | 65
-rw-r--r-- fs/jfs/jfs_dtree.c | 7
-rw-r--r-- fs/jfs/jfs_imap.c | 3
-rw-r--r-- fs/jfs/jfs_mount.c | 6
-rw-r--r-- fs/jfs/jfs_txnmgr.c | 2
-rw-r--r-- fs/minix/inode.c | 9
-rw-r--r-- fs/mnt_idmapping.c | 159
-rw-r--r-- fs/mount.h | 27
-rw-r--r-- fs/mpage.c | 62
-rw-r--r-- fs/namei.c | 31
-rw-r--r-- fs/namespace.c | 641
-rw-r--r-- fs/nfs/file.c | 2
-rw-r--r-- fs/nfs/nfs42xattr.c | 8
-rw-r--r-- fs/nfs/nfs4file.c | 5
-rw-r--r-- fs/nfs/write.c | 12
-rw-r--r-- fs/nfsd/filecache.c | 4
-rw-r--r-- fs/nfsd/vfs.c | 7
-rw-r--r-- fs/nilfs2/btnode.c | 62
-rw-r--r-- fs/nilfs2/cpfile.c | 28
-rw-r--r-- fs/nilfs2/dir.c | 244
-rw-r--r-- fs/nilfs2/file.c | 28
-rw-r--r-- fs/nilfs2/gcinode.c | 4
-rw-r--r-- fs/nilfs2/inode.c | 15
-rw-r--r-- fs/nilfs2/ioctl.c | 10
-rw-r--r-- fs/nilfs2/mdt.c | 23
-rw-r--r-- fs/nilfs2/namei.c | 38
-rw-r--r-- fs/nilfs2/nilfs.h | 20
-rw-r--r-- fs/nilfs2/page.c | 93
-rw-r--r-- fs/nilfs2/page.h | 12
-rw-r--r-- fs/nilfs2/segment.c | 158
-rw-r--r-- fs/nilfs2/sufile.c | 9
-rw-r--r-- fs/nilfs2/super.c | 8
-rw-r--r-- fs/notify/fanotify/fanotify.c | 34
-rw-r--r-- fs/notify/fanotify/fanotify.h | 16
-rw-r--r-- fs/notify/fanotify/fanotify_user.c | 124
-rw-r--r-- fs/notify/mark.c | 52
-rw-r--r-- fs/ntfs/aops.c | 20
-rw-r--r-- fs/ntfs/dir.c | 3
-rw-r--r-- fs/ocfs2/alloc.c | 2
-rw-r--r-- fs/ocfs2/aops.c | 17
-rw-r--r-- fs/ocfs2/export.c | 1
-rw-r--r-- fs/ocfs2/file.c | 2
-rw-r--r-- fs/ocfs2/ocfs2_trace.h | 2
-rw-r--r-- fs/open.c | 47
-rw-r--r-- fs/overlayfs/Kconfig | 1
-rw-r--r-- fs/overlayfs/copy_up.c | 30
-rw-r--r-- fs/overlayfs/file.c | 247
-rw-r--r-- fs/overlayfs/overlayfs.h | 8
-rw-r--r-- fs/overlayfs/super.c | 12
-rw-r--r-- fs/pipe.c | 24
-rw-r--r-- fs/pnode.c | 2
-rw-r--r-- fs/posix_acl.c | 4
-rw-r--r-- fs/proc/base.c | 29
-rw-r--r-- fs/proc/internal.h | 2
-rw-r--r-- fs/proc/task_mmu.c | 23
-rw-r--r-- fs/proc_namespace.c | 13
-rw-r--r-- fs/quota/dquot.c | 6
-rw-r--r-- fs/ramfs/file-nommu.c | 2
-rw-r--r-- fs/read_write.c | 235
-rw-r--r-- fs/readdir.c | 4
-rw-r--r-- fs/reiserfs/stree.c | 2
-rw-r--r-- fs/remap_range.c | 45
-rw-r--r-- fs/smb/client/cifsfs.c | 5
-rw-r--r-- fs/smb/client/file.c | 6
-rw-r--r-- fs/splice.c | 243
-rw-r--r-- fs/squashfs/file.c | 3
-rw-r--r-- fs/squashfs/file_direct.c | 6
-rw-r--r-- fs/stat.c | 11
-rw-r--r-- fs/super.c | 500
-rw-r--r-- fs/sysv/itree.c | 9
-rw-r--r-- fs/ufs/inode.c | 11
-rw-r--r-- fs/userfaultfd.c | 72
-rw-r--r-- fs/xfs/Makefile | 21
-rw-r--r-- fs/xfs/libxfs/xfs_ag.c | 38
-rw-r--r-- fs/xfs/libxfs/xfs_ag.h | 12
-rw-r--r-- fs/xfs/libxfs/xfs_ag_resv.c | 2
-rw-r--r-- fs/xfs/libxfs/xfs_alloc.c | 116
-rw-r--r-- fs/xfs/libxfs/xfs_alloc.h | 24
-rw-r--r-- fs/xfs/libxfs/xfs_alloc_btree.c | 13
-rw-r--r-- fs/xfs/libxfs/xfs_attr.c | 125
-rw-r--r-- fs/xfs/libxfs/xfs_attr_leaf.c | 238
-rw-r--r-- fs/xfs/libxfs/xfs_attr_leaf.h | 8
-rw-r--r-- fs/xfs/libxfs/xfs_attr_sf.h | 24
-rw-r--r-- fs/xfs/libxfs/xfs_bmap.c | 201
-rw-r--r-- fs/xfs/libxfs/xfs_bmap.h | 9
-rw-r--r-- fs/xfs/libxfs/xfs_bmap_btree.c | 123
-rw-r--r-- fs/xfs/libxfs/xfs_bmap_btree.h | 5
-rw-r--r-- fs/xfs/libxfs/xfs_btree.c | 28
-rw-r--r-- fs/xfs/libxfs/xfs_btree.h | 5
-rw-r--r-- fs/xfs/libxfs/xfs_btree_staging.c | 89
-rw-r--r-- fs/xfs/libxfs/xfs_btree_staging.h | 33
-rw-r--r-- fs/xfs/libxfs/xfs_da_btree.c | 69
-rw-r--r-- fs/xfs/libxfs/xfs_da_btree.h | 2
-rw-r--r-- fs/xfs/libxfs/xfs_da_format.h | 33
-rw-r--r-- fs/xfs/libxfs/xfs_defer.c | 453
-rw-r--r-- fs/xfs/libxfs/xfs_defer.h | 59
-rw-r--r-- fs/xfs/libxfs/xfs_dir2.c | 2
-rw-r--r-- fs/xfs/libxfs/xfs_dir2_block.c | 6
-rw-r--r-- fs/xfs/libxfs/xfs_dir2_priv.h | 3
-rw-r--r-- fs/xfs/libxfs/xfs_dir2_sf.c | 91
-rw-r--r-- fs/xfs/libxfs/xfs_format.h | 19
-rw-r--r-- fs/xfs/libxfs/xfs_health.h | 10
-rw-r--r-- fs/xfs/libxfs/xfs_ialloc.c | 36
-rw-r--r-- fs/xfs/libxfs/xfs_ialloc.h | 3
-rw-r--r-- fs/xfs/libxfs/xfs_ialloc_btree.c | 2
-rw-r--r-- fs/xfs/libxfs/xfs_iext_tree.c | 59
-rw-r--r-- fs/xfs/libxfs/xfs_inode_fork.c | 78
-rw-r--r-- fs/xfs/libxfs/xfs_inode_fork.h | 13
-rw-r--r-- fs/xfs/libxfs/xfs_log_recover.h | 8
-rw-r--r-- fs/xfs/libxfs/xfs_ondisk.h (renamed from fs/xfs/xfs_ondisk.h) | 22
-rw-r--r-- fs/xfs/libxfs/xfs_refcount.c | 57
-rw-r--r-- fs/xfs/libxfs/xfs_refcount.h | 12
-rw-r--r-- fs/xfs/libxfs/xfs_refcount_btree.c | 15
-rw-r--r-- fs/xfs/libxfs/xfs_rmap.c | 2
-rw-r--r-- fs/xfs/libxfs/xfs_rtbitmap.c | 120
-rw-r--r-- fs/xfs/libxfs/xfs_rtbitmap.h | 20
-rw-r--r-- fs/xfs/libxfs/xfs_sb.c | 6
-rw-r--r-- fs/xfs/libxfs/xfs_shared.h | 2
-rw-r--r-- fs/xfs/libxfs/xfs_symlink_remote.c | 12
-rw-r--r-- fs/xfs/libxfs/xfs_types.h | 8
-rw-r--r-- fs/xfs/scrub/agb_bitmap.c | 103
-rw-r--r-- fs/xfs/scrub/agb_bitmap.h | 68
-rw-r--r-- fs/xfs/scrub/agheader_repair.c | 19
-rw-r--r-- fs/xfs/scrub/alloc.c | 52
-rw-r--r-- fs/xfs/scrub/alloc_repair.c | 934
-rw-r--r-- fs/xfs/scrub/attr.c | 17
-rw-r--r-- fs/xfs/scrub/bitmap.c | 467
-rw-r--r-- fs/xfs/scrub/bitmap.h | 111
-rw-r--r-- fs/xfs/scrub/bmap.c | 162
-rw-r--r-- fs/xfs/scrub/bmap_repair.c | 867
-rw-r--r-- fs/xfs/scrub/common.c | 35
-rw-r--r-- fs/xfs/scrub/common.h | 56
-rw-r--r-- fs/xfs/scrub/cow_repair.c | 614
-rw-r--r-- fs/xfs/scrub/dir.c | 42
-rw-r--r-- fs/xfs/scrub/dqiterate.c | 211
-rw-r--r-- fs/xfs/scrub/fsb_bitmap.h | 37
-rw-r--r-- fs/xfs/scrub/health.c | 34
-rw-r--r-- fs/xfs/scrub/health.h | 2
-rw-r--r-- fs/xfs/scrub/ialloc.c | 39
-rw-r--r-- fs/xfs/scrub/ialloc_repair.c | 884
-rw-r--r-- fs/xfs/scrub/inode.c | 20
-rw-r--r-- fs/xfs/scrub/inode_repair.c | 1525
-rw-r--r-- fs/xfs/scrub/newbt.c | 559
-rw-r--r-- fs/xfs/scrub/newbt.h | 68
-rw-r--r-- fs/xfs/scrub/off_bitmap.h | 37
-rw-r--r-- fs/xfs/scrub/parent.c | 17
-rw-r--r-- fs/xfs/scrub/quota.c | 107
-rw-r--r-- fs/xfs/scrub/quota.h | 36
-rw-r--r-- fs/xfs/scrub/quota_repair.c | 575
-rw-r--r-- fs/xfs/scrub/readdir.c | 6
-rw-r--r-- fs/xfs/scrub/reap.c | 168
-rw-r--r-- fs/xfs/scrub/reap.h | 5
-rw-r--r-- fs/xfs/scrub/refcount.c | 2
-rw-r--r-- fs/xfs/scrub/refcount_repair.c | 794
-rw-r--r-- fs/xfs/scrub/repair.c | 391
-rw-r--r-- fs/xfs/scrub/repair.h | 99
-rw-r--r-- fs/xfs/scrub/rmap.c | 1
-rw-r--r-- fs/xfs/scrub/rtbitmap.c | 107
-rw-r--r-- fs/xfs/scrub/rtbitmap.h | 22
-rw-r--r-- fs/xfs/scrub/rtbitmap_repair.c | 202
-rw-r--r-- fs/xfs/scrub/rtsummary.c | 143
-rw-r--r-- fs/xfs/scrub/scrub.c | 62
-rw-r--r-- fs/xfs/scrub/scrub.h | 15
-rw-r--r-- fs/xfs/scrub/symlink.c | 22
-rw-r--r-- fs/xfs/scrub/trace.c | 3
-rw-r--r-- fs/xfs/scrub/trace.h | 488
-rw-r--r-- fs/xfs/scrub/xfarray.h | 22
-rw-r--r-- fs/xfs/xfs_aops.c | 2
-rw-r--r-- fs/xfs/xfs_attr_item.c | 295
-rw-r--r-- fs/xfs/xfs_attr_list.c | 13
-rw-r--r-- fs/xfs/xfs_bmap_item.c | 200
-rw-r--r-- fs/xfs/xfs_bmap_util.c | 141
-rw-r--r-- fs/xfs/xfs_bmap_util.h | 2
-rw-r--r-- fs/xfs/xfs_buf.c | 50
-rw-r--r-- fs/xfs/xfs_buf.h | 1
-rw-r--r-- fs/xfs/xfs_dir2_readdir.c | 9
-rw-r--r-- fs/xfs/xfs_dquot.c | 39
-rw-r--r-- fs/xfs/xfs_dquot.h | 8
-rw-r--r-- fs/xfs/xfs_extent_busy.c | 13
-rw-r--r-- fs/xfs/xfs_extent_busy.h | 2
-rw-r--r-- fs/xfs/xfs_extfree_item.c | 332
-rw-r--r-- fs/xfs/xfs_fsops.c | 63
-rw-r--r-- fs/xfs/xfs_fsops.h | 14
-rw-r--r-- fs/xfs/xfs_globals.c | 12
-rw-r--r-- fs/xfs/xfs_health.c | 8
-rw-r--r-- fs/xfs/xfs_inode.c | 65
-rw-r--r-- fs/xfs/xfs_inode.h | 2
-rw-r--r-- fs/xfs/xfs_inode_item.c | 13
-rw-r--r-- fs/xfs/xfs_ioctl.c | 115
-rw-r--r-- fs/xfs/xfs_log.c | 1
-rw-r--r-- fs/xfs/xfs_log_priv.h | 1
-rw-r--r-- fs/xfs/xfs_log_recover.c | 131
-rw-r--r-- fs/xfs/xfs_mount.c | 8
-rw-r--r-- fs/xfs/xfs_notify_failure.c | 108
-rw-r--r-- fs/xfs/xfs_qm.c | 2
-rw-r--r-- fs/xfs/xfs_quota.h | 5
-rw-r--r-- fs/xfs/xfs_refcount_item.c | 234
-rw-r--r-- fs/xfs/xfs_reflink.c | 2
-rw-r--r-- fs/xfs/xfs_rmap_item.c | 257
-rw-r--r-- fs/xfs/xfs_rtalloc.c | 659
-rw-r--r-- fs/xfs/xfs_rtalloc.h | 37
-rw-r--r-- fs/xfs/xfs_super.c | 30
-rw-r--r-- fs/xfs/xfs_symlink.c | 7
-rw-r--r-- fs/xfs/xfs_sysctl.h | 2
-rw-r--r-- fs/xfs/xfs_sysfs.c | 63
-rw-r--r-- fs/xfs/xfs_trace.h | 42
-rw-r--r-- fs/xfs/xfs_trans.c | 62
-rw-r--r-- fs/xfs/xfs_trans.h | 16
-rw-r--r-- fs/xfs/xfs_xattr.c | 6
-rw-r--r-- fs/zonefs/file.c | 2
380 files changed, 22820 insertions, 9608 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index 42837617a55b..a3159831ba98 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -18,6 +18,10 @@ config VALIDATE_FS_PARSER
config FS_IOMAP
bool
+# Stackable filesystems
+config FS_STACK
+ bool
+
config BUFFER_HEAD
bool
@@ -254,7 +258,7 @@ config TMPFS_QUOTA
config ARCH_SUPPORTS_HUGETLBFS
def_bool n
-config HUGETLBFS
+menuconfig HUGETLBFS
bool "HugeTLB file system support"
depends on X86 || SPARC64 || ARCH_SUPPORTS_HUGETLBFS || BROKEN
depends on (SYSFS || SYSCTL)
@@ -266,6 +270,17 @@ config HUGETLBFS
If unsure, say N.
+if HUGETLBFS
+config HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON
+ bool "HugeTLB Vmemmap Optimization (HVO) defaults to on"
+ default n
+ depends on HUGETLB_PAGE_OPTIMIZE_VMEMMAP
+ help
+ The HugeTLB Vmemmap Optimization (HVO) defaults to off. Say Y here to
+ enable HVO by default. It can be disabled via hugetlb_free_vmemmap=off
+ (boot command line) or hugetlb_optimize_vmemmap (sysctl).
+endif # HUGETLBFS
+
config HUGETLB_PAGE
def_bool HUGETLBFS
select XARRAY_MULTI
@@ -275,15 +290,6 @@ config HUGETLB_PAGE_OPTIMIZE_VMEMMAP
depends on ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP
depends on SPARSEMEM_VMEMMAP
-config HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON
- bool "HugeTLB Vmemmap Optimization (HVO) defaults to on"
- default n
- depends on HUGETLB_PAGE_OPTIMIZE_VMEMMAP
- help
- The HugeTLB VmemmapvOptimization (HVO) defaults to off. Say Y here to
- enable HVO by default. It can be disabled via hugetlb_free_vmemmap=off
- (boot command line) or hugetlb_optimize_vmemmap (sysctl).
-
config ARCH_HAS_GIGANTIC_PAGE
bool
diff --git a/fs/Makefile b/fs/Makefile
index 75522f88e763..a6962c588962 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -39,6 +39,7 @@ obj-$(CONFIG_COMPAT_BINFMT_ELF) += compat_binfmt_elf.o
obj-$(CONFIG_BINFMT_ELF_FDPIC) += binfmt_elf_fdpic.o
obj-$(CONFIG_BINFMT_FLAT) += binfmt_flat.o
+obj-$(CONFIG_FS_STACK) += backing-file.o
obj-$(CONFIG_FS_MBCACHE) += mbcache.o
obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o
obj-$(CONFIG_NFS_COMMON) += nfs_common/
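
Note: FS_STACK is not user-selectable. A stackable filesystem opts in with "select FS_STACK" in its own Kconfig entry, which in turn pulls fs/backing-file.o into the build via the Makefile rule above. A minimal sketch of a consumer entry, modelled on the one-line fs/overlayfs/Kconfig change visible in the diffstat (the overlayfs hunk itself is not shown on this page, so the exact placement is assumed):

config OVERLAY_FS
	tristate "Overlay filesystem support"
	select FS_STACK
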
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 3081edb09e46..a183e213a4a5 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -5,6 +5,7 @@
* Copyright (C) 1997-1999 Russell King
*/
#include <linux/buffer_head.h>
+#include <linux/mpage.h>
#include <linux/writeback.h>
#include "adfs.h"
@@ -33,9 +34,10 @@ abort_toobig:
return 0;
}
-static int adfs_writepage(struct page *page, struct writeback_control *wbc)
+static int adfs_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
{
- return block_write_full_page(page, adfs_get_block, wbc);
+ return mpage_writepages(mapping, wbc, adfs_get_block);
}
static int adfs_read_folio(struct file *file, struct folio *folio)
@@ -76,10 +78,11 @@ static const struct address_space_operations adfs_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
.read_folio = adfs_read_folio,
- .writepage = adfs_writepage,
+ .writepages = adfs_writepages,
.write_begin = adfs_write_begin,
.write_end = generic_write_end,
- .bmap = _adfs_bmap
+ .migrate_folio = buffer_migrate_folio,
+ .bmap = _adfs_bmap,
};
/*
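
Note: the adfs change above is one instance of a conversion repeated across this series: retiring the per-page ->writepage method in favour of a ->writepages implementation built on mpage_writepages(), and wiring up buffer_migrate_folio so page migration keeps working without ->writepage. A minimal sketch of the pattern for a hypothetical block-based filesystem ("myfs" names are placeholders; mpage_writepages() and buffer_migrate_folio are the real APIs used by the hunk):

#include <linux/fs.h>
#include <linux/mpage.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>

/* Filesystem-specific logical-to-physical block mapper (placeholder). */
extern int myfs_get_block(struct inode *inode, sector_t block,
			  struct buffer_head *bh_result, int create);

/* Write back a whole range of dirty folios in one call;
 * mpage_writepages() batches contiguous blocks into large BIOs via
 * the get_block callback, so no per-page ->writepage hook is needed. */
static int myfs_writepages(struct address_space *mapping,
			   struct writeback_control *wbc)
{
	return mpage_writepages(mapping, wbc, myfs_get_block);
}

static const struct address_space_operations myfs_aops = {
	.writepages	= myfs_writepages,
	.migrate_folio	= buffer_migrate_folio,
};
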
diff --git a/fs/afs/Makefile b/fs/afs/Makefile
index e8956b65d7ff..dcdc0f1bb76f 100644
--- a/fs/afs/Makefile
+++ b/fs/afs/Makefile
@@ -5,6 +5,7 @@
kafs-y := \
addr_list.o \
+ addr_prefs.o \
callback.o \
cell.o \
cmservice.o \
@@ -27,6 +28,7 @@ kafs-y := \
server.o \
server_list.o \
super.o \
+ validation.o \
vlclient.o \
vl_alias.o \
vl_list.o \
diff --git a/fs/afs/addr_list.c b/fs/afs/addr_list.c
index de1ae0bead3b..6d42f85c6be5 100644
--- a/fs/afs/addr_list.c
+++ b/fs/afs/addr_list.c
@@ -13,26 +13,55 @@
#include "internal.h"
#include "afs_fs.h"
+static void afs_free_addrlist(struct rcu_head *rcu)
+{
+ struct afs_addr_list *alist = container_of(rcu, struct afs_addr_list, rcu);
+ unsigned int i;
+
+ for (i = 0; i < alist->nr_addrs; i++)
+ rxrpc_kernel_put_peer(alist->addrs[i].peer);
+ trace_afs_alist(alist->debug_id, refcount_read(&alist->usage), afs_alist_trace_free);
+ kfree(alist);
+}
+
/*
* Release an address list.
*/
-void afs_put_addrlist(struct afs_addr_list *alist)
+void afs_put_addrlist(struct afs_addr_list *alist, enum afs_alist_trace reason)
+{
+ unsigned int debug_id;
+ bool dead;
+ int r;
+
+ if (!alist)
+ return;
+ debug_id = alist->debug_id;
+ dead = __refcount_dec_and_test(&alist->usage, &r);
+ trace_afs_alist(debug_id, r - 1, reason);
+ if (dead)
+ call_rcu(&alist->rcu, afs_free_addrlist);
+}
+
+struct afs_addr_list *afs_get_addrlist(struct afs_addr_list *alist, enum afs_alist_trace reason)
{
- if (alist && refcount_dec_and_test(&alist->usage))
- kfree_rcu(alist, rcu);
+ int r;
+
+ if (alist) {
+ __refcount_inc(&alist->usage, &r);
+ trace_afs_alist(alist->debug_id, r + 1, reason);
+ }
+ return alist;
}
/*
* Allocate an address list.
*/
-struct afs_addr_list *afs_alloc_addrlist(unsigned int nr,
- unsigned short service,
- unsigned short port)
+struct afs_addr_list *afs_alloc_addrlist(unsigned int nr)
{
struct afs_addr_list *alist;
- unsigned int i;
+ static atomic_t debug_id;
- _enter("%u,%u,%u", nr, service, port);
+ _enter("%u", nr);
if (nr > AFS_MAX_ADDRESSES)
nr = AFS_MAX_ADDRESSES;
@@ -43,17 +72,8 @@ struct afs_addr_list *afs_alloc_addrlist(unsigned int nr,
refcount_set(&alist->usage, 1);
alist->max_addrs = nr;
-
- for (i = 0; i < nr; i++) {
- struct sockaddr_rxrpc *srx = &alist->addrs[i];
- srx->srx_family = AF_RXRPC;
- srx->srx_service = service;
- srx->transport_type = SOCK_DGRAM;
- srx->transport_len = sizeof(srx->transport.sin6);
- srx->transport.sin6.sin6_family = AF_INET6;
- srx->transport.sin6.sin6_port = htons(port);
- }
-
+ alist->debug_id = atomic_inc_return(&debug_id);
+ trace_afs_alist(alist->debug_id, 1, afs_alist_trace_alloc);
return alist;
}
@@ -126,7 +146,7 @@ struct afs_vlserver_list *afs_parse_text_addrs(struct afs_net *net,
if (!vllist->servers[0].server)
goto error_vl;
- alist = afs_alloc_addrlist(nr, service, AFS_VL_PORT);
+ alist = afs_alloc_addrlist(nr);
if (!alist)
goto error;
@@ -197,9 +217,11 @@ struct afs_vlserver_list *afs_parse_text_addrs(struct afs_net *net,
}
if (family == AF_INET)
- afs_merge_fs_addr4(alist, x[0], xport);
+ ret = afs_merge_fs_addr4(net, alist, x[0], xport);
else
- afs_merge_fs_addr6(alist, x, xport);
+ ret = afs_merge_fs_addr6(net, alist, x, xport);
+ if (ret < 0)
+ goto error;
} while (p < end);
@@ -216,26 +238,13 @@ bad_address:
problem, p - text, (int)len, (int)len, text);
ret = -EINVAL;
error:
- afs_put_addrlist(alist);
+ afs_put_addrlist(alist, afs_alist_trace_put_parse_error);
error_vl:
afs_put_vlserverlist(net, vllist);
return ERR_PTR(ret);
}
/*
- * Compare old and new address lists to see if there's been any change.
- * - How to do this in better than O(Nlog(N)) time?
- * - We don't really want to sort the address list, but would rather take the
- * list as we got it so as not to undo record rotation by the DNS server.
- */
-#if 0
-static int afs_cmp_addr_list(const struct afs_addr_list *a1,
- const struct afs_addr_list *a2)
-{
-}
-#endif
-
-/*
* Perform a DNS query for VL servers and build a up an address list.
*/
struct afs_vlserver_list *afs_dns_query(struct afs_cell *cell, time64_t *_expiry)
@@ -271,25 +280,33 @@ struct afs_vlserver_list *afs_dns_query(struct afs_cell *cell, time64_t *_expiry
/*
* Merge an IPv4 entry into a fileserver address list.
*/
-void afs_merge_fs_addr4(struct afs_addr_list *alist, __be32 xdr, u16 port)
+int afs_merge_fs_addr4(struct afs_net *net, struct afs_addr_list *alist,
+ __be32 xdr, u16 port)
{
- struct sockaddr_rxrpc *srx;
- u32 addr = ntohl(xdr);
+ struct sockaddr_rxrpc srx;
+ struct rxrpc_peer *peer;
int i;
if (alist->nr_addrs >= alist->max_addrs)
- return;
+ return 0;
- for (i = 0; i < alist->nr_ipv4; i++) {
- struct sockaddr_in *a = &alist->addrs[i].transport.sin;
- u32 a_addr = ntohl(a->sin_addr.s_addr);
- u16 a_port = ntohs(a->sin_port);
+ srx.srx_family = AF_RXRPC;
+ srx.transport_type = SOCK_DGRAM;
+ srx.transport_len = sizeof(srx.transport.sin);
+ srx.transport.sin.sin_family = AF_INET;
+ srx.transport.sin.sin_port = htons(port);
+ srx.transport.sin.sin_addr.s_addr = xdr;
- if (addr == a_addr && port == a_port)
- return;
- if (addr == a_addr && port < a_port)
- break;
- if (addr < a_addr)
+ peer = rxrpc_kernel_lookup_peer(net->socket, &srx, GFP_KERNEL);
+ if (!peer)
+ return -ENOMEM;
+
+ for (i = 0; i < alist->nr_ipv4; i++) {
+ if (peer == alist->addrs[i].peer) {
+ rxrpc_kernel_put_peer(peer);
+ return 0;
+ }
+ if (peer <= alist->addrs[i].peer)
break;
}
@@ -298,38 +315,42 @@ void afs_merge_fs_addr4(struct afs_addr_list *alist, __be32 xdr, u16 port)
alist->addrs + i,
sizeof(alist->addrs[0]) * (alist->nr_addrs - i));
- srx = &alist->addrs[i];
- srx->srx_family = AF_RXRPC;
- srx->transport_type = SOCK_DGRAM;
- srx->transport_len = sizeof(srx->transport.sin);
- srx->transport.sin.sin_family = AF_INET;
- srx->transport.sin.sin_port = htons(port);
- srx->transport.sin.sin_addr.s_addr = xdr;
+ alist->addrs[i].peer = peer;
alist->nr_ipv4++;
alist->nr_addrs++;
+ return 0;
}
/*
* Merge an IPv6 entry into a fileserver address list.
*/
-void afs_merge_fs_addr6(struct afs_addr_list *alist, __be32 *xdr, u16 port)
+int afs_merge_fs_addr6(struct afs_net *net, struct afs_addr_list *alist,
+ __be32 *xdr, u16 port)
{
- struct sockaddr_rxrpc *srx;
- int i, diff;
+ struct sockaddr_rxrpc srx;
+ struct rxrpc_peer *peer;
+ int i;
if (alist->nr_addrs >= alist->max_addrs)
- return;
+ return 0;
- for (i = alist->nr_ipv4; i < alist->nr_addrs; i++) {
- struct sockaddr_in6 *a = &alist->addrs[i].transport.sin6;
- u16 a_port = ntohs(a->sin6_port);
+ srx.srx_family = AF_RXRPC;
+ srx.transport_type = SOCK_DGRAM;
+ srx.transport_len = sizeof(srx.transport.sin6);
+ srx.transport.sin6.sin6_family = AF_INET6;
+ srx.transport.sin6.sin6_port = htons(port);
+ memcpy(&srx.transport.sin6.sin6_addr, xdr, 16);
- diff = memcmp(xdr, &a->sin6_addr, 16);
- if (diff == 0 && port == a_port)
- return;
- if (diff == 0 && port < a_port)
- break;
- if (diff < 0)
+ peer = rxrpc_kernel_lookup_peer(net->socket, &srx, GFP_KERNEL);
+ if (!peer)
+ return -ENOMEM;
+
+ for (i = alist->nr_ipv4; i < alist->nr_addrs; i++) {
+ if (peer == alist->addrs[i].peer) {
+ rxrpc_kernel_put_peer(peer);
+ return 0;
+ }
+ if (peer <= alist->addrs[i].peer)
break;
}
@@ -337,68 +358,7 @@ void afs_merge_fs_addr6(struct afs_addr_list *alist, __be32 *xdr, u16 port)
memmove(alist->addrs + i + 1,
alist->addrs + i,
sizeof(alist->addrs[0]) * (alist->nr_addrs - i));
-
- srx = &alist->addrs[i];
- srx->srx_family = AF_RXRPC;
- srx->transport_type = SOCK_DGRAM;
- srx->transport_len = sizeof(srx->transport.sin6);
- srx->transport.sin6.sin6_family = AF_INET6;
- srx->transport.sin6.sin6_port = htons(port);
- memcpy(&srx->transport.sin6.sin6_addr, xdr, 16);
+ alist->addrs[i].peer = peer;
alist->nr_addrs++;
-}
-
-/*
- * Get an address to try.
- */
-bool afs_iterate_addresses(struct afs_addr_cursor *ac)
-{
- unsigned long set, failed;
- int index;
-
- if (!ac->alist)
- return false;
-
- set = ac->alist->responded;
- failed = ac->alist->failed;
- _enter("%lx-%lx-%lx,%d", set, failed, ac->tried, ac->index);
-
- ac->nr_iterations++;
-
- set &= ~(failed | ac->tried);
-
- if (!set)
- return false;
-
- index = READ_ONCE(ac->alist->preferred);
- if (test_bit(index, &set))
- goto selected;
-
- index = __ffs(set);
-
-selected:
- ac->index = index;
- set_bit(index, &ac->tried);
- ac->responded = false;
- return true;
-}
-
-/*
- * Release an address list cursor.
- */
-int afs_end_cursor(struct afs_addr_cursor *ac)
-{
- struct afs_addr_list *alist;
-
- alist = ac->alist;
- if (alist) {
- if (ac->responded &&
- ac->index != alist->preferred &&
- test_bit(ac->alist->preferred, &ac->tried))
- WRITE_ONCE(alist->preferred, ac->index);
- afs_put_addrlist(alist);
- ac->alist = NULL;
- }
-
- return ac->error;
+ return 0;
}
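
Note: the reworked get/put helpers above pair __refcount_inc()/__refcount_dec_and_test() with a tracepoint (the returned old count makes the logged value exact even under contention) and defer freeing through call_rcu(), since address lists are walked under the RCU read lock. A stripped-down sketch of that lifetime pattern with generic names (trace_obj_ref() stands in for a real tracepoint such as trace_afs_alist()):

#include <linux/refcount.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct obj {
	refcount_t	usage;
	struct rcu_head	rcu;
};

/* Placeholder for a real tracepoint. */
static inline void trace_obj_ref(const struct obj *o, int usage) {}

static void obj_free_rcu(struct rcu_head *rcu)
{
	kfree(container_of(rcu, struct obj, rcu));
}

static struct obj *obj_get(struct obj *o)
{
	int r;

	if (o) {
		__refcount_inc(&o->usage, &r);	/* r = count before the inc */
		trace_obj_ref(o, r + 1);	/* log the exact new count */
	}
	return o;
}

static void obj_put(struct obj *o)
{
	bool dead;
	int r;

	if (!o)
		return;
	dead = __refcount_dec_and_test(&o->usage, &r);
	trace_obj_ref(o, r - 1);
	if (dead)
		call_rcu(&o->rcu, obj_free_rcu); /* let lockless readers drain */
}
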
diff --git a/fs/afs/addr_prefs.c b/fs/afs/addr_prefs.c
new file mode 100644
index 000000000000..a189ff8a5034
--- /dev/null
+++ b/fs/afs/addr_prefs.c
@@ -0,0 +1,531 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Address preferences management
+ *
+ * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": addr_prefs: " fmt
+#include <linux/slab.h>
+#include <linux/ctype.h>
+#include <linux/inet.h>
+#include <linux/seq_file.h>
+#include <keys/rxrpc-type.h>
+#include "internal.h"
+
+static inline struct afs_net *afs_seq2net_single(struct seq_file *m)
+{
+ return afs_net(seq_file_single_net(m));
+}
+
+/*
+ * Split a NUL-terminated string up to the first newline around spaces. The
+ * source string will be modified to have NUL-terminations inserted.
+ */
+static int afs_split_string(char **pbuf, char *strv[], unsigned int maxstrv)
+{
+ unsigned int count = 0;
+ char *p = *pbuf;
+
+ maxstrv--; /* Allow for terminal NULL */
+ for (;;) {
+ /* Skip over spaces */
+ while (isspace(*p)) {
+ if (*p == '\n') {
+ p++;
+ break;
+ }
+ p++;
+ }
+ if (!*p)
+ break;
+
+ /* Mark start of word */
+ if (count >= maxstrv) {
+ pr_warn("Too many elements in string\n");
+ return -EINVAL;
+ }
+ strv[count++] = p;
+
+ /* Skip over word */
+ while (!isspace(*p))
+ p++;
+ if (!*p)
+ break;
+
+ /* Mark end of word */
+ if (*p == '\n') {
+ *p++ = 0;
+ break;
+ }
+ *p++ = 0;
+ }
+
+ *pbuf = p;
+ strv[count] = NULL;
+ return count;
+}
+
+/*
+ * Parse an address with an optional subnet mask.
+ */
+static int afs_parse_address(char *p, struct afs_addr_preference *pref)
+{
+ const char *stop;
+ unsigned long mask, tmp;
+ char *end = p + strlen(p);
+ bool bracket = false;
+
+ if (*p == '[') {
+ p++;
+ bracket = true;
+ }
+
+#if 0
+ if (*p == '[') {
+ p++;
+ q = memchr(p, ']', end - p);
+ if (!q) {
+ pr_warn("Can't find closing ']'\n");
+ return -EINVAL;
+ }
+ } else {
+ for (q = p; q < end; q++)
+ if (*q == '/')
+ break;
+ }
+#endif
+
+ if (in4_pton(p, end - p, (u8 *)&pref->ipv4_addr, -1, &stop)) {
+ pref->family = AF_INET;
+ mask = 32;
+ } else if (in6_pton(p, end - p, (u8 *)&pref->ipv6_addr, -1, &stop)) {
+ pref->family = AF_INET6;
+ mask = 128;
+ } else {
+ pr_warn("Can't determine address family\n");
+ return -EINVAL;
+ }
+
+ p = (char *)stop;
+ if (bracket) {
+ if (*p != ']') {
+ pr_warn("Can't find closing ']'\n");
+ return -EINVAL;
+ }
+ p++;
+ }
+
+ if (*p == '/') {
+ p++;
+ tmp = simple_strtoul(p, &p, 10);
+ if (tmp > mask) {
+ pr_warn("Subnet mask too large\n");
+ return -EINVAL;
+ }
+ if (tmp == 0) {
+ pr_warn("Subnet mask too small\n");
+ return -EINVAL;
+ }
+ mask = tmp;
+ }
+
+ if (*p) {
+ pr_warn("Invalid address\n");
+ return -EINVAL;
+ }
+
+ pref->subnet_mask = mask;
+ return 0;
+}
+
+enum cmp_ret {
+ CONTINUE_SEARCH,
+ INSERT_HERE,
+ EXACT_MATCH,
+ SUBNET_MATCH,
+};
+
+/*
+ * See if a candidate address matches a listed address.
+ */
+static enum cmp_ret afs_cmp_address_pref(const struct afs_addr_preference *a,
+ const struct afs_addr_preference *b)
+{
+ int subnet = min(a->subnet_mask, b->subnet_mask);
+ const __be32 *pa, *pb;
+ u32 mask, na, nb;
+ int diff;
+
+ if (a->family != b->family)
+ return INSERT_HERE;
+
+ switch (a->family) {
+ case AF_INET6:
+ pa = a->ipv6_addr.s6_addr32;
+ pb = b->ipv6_addr.s6_addr32;
+ break;
+ case AF_INET:
+ pa = &a->ipv4_addr.s_addr;
+ pb = &b->ipv4_addr.s_addr;
+ break;
+ }
+
+ while (subnet > 32) {
+ diff = ntohl(*pa++) - ntohl(*pb++);
+ if (diff < 0)
+ return INSERT_HERE; /* a<b */
+ if (diff > 0)
+ return CONTINUE_SEARCH; /* a>b */
+ subnet -= 32;
+ }
+
+ if (subnet == 0)
+ return EXACT_MATCH;
+
+ mask = 0xffffffffU << (32 - subnet);
+ na = ntohl(*pa);
+ nb = ntohl(*pb);
+ diff = (na & mask) - (nb & mask);
+ //kdebug("diff %08x %08x %08x %d", na, nb, mask, diff);
+ if (diff < 0)
+ return INSERT_HERE; /* a<b */
+ if (diff > 0)
+ return CONTINUE_SEARCH; /* a>b */
+ if (a->subnet_mask == b->subnet_mask)
+ return EXACT_MATCH;
+ if (a->subnet_mask > b->subnet_mask)
+ return SUBNET_MATCH; /* a binds tighter than b */
+ return CONTINUE_SEARCH; /* b binds tighter than a */
+}
+
+/*
+ * Insert an address preference.
+ */
+static int afs_insert_address_pref(struct afs_addr_preference_list **_preflist,
+ struct afs_addr_preference *pref,
+ int index)
+{
+ struct afs_addr_preference_list *preflist = *_preflist, *old = preflist;
+ size_t size, max_prefs;
+
+ _enter("{%u/%u/%u},%u", preflist->ipv6_off, preflist->nr, preflist->max_prefs, index);
+
+ if (preflist->nr == 255)
+ return -ENOSPC;
+ if (preflist->nr >= preflist->max_prefs) {
+ max_prefs = preflist->max_prefs + 1;
+ size = struct_size(preflist, prefs, max_prefs);
+ size = roundup_pow_of_two(size);
+ max_prefs = min_t(size_t, (size - sizeof(*preflist)) / sizeof(*pref), 255);
+ preflist = kmalloc(size, GFP_KERNEL);
+ if (!preflist)
+ return -ENOMEM;
+ *preflist = **_preflist;
+ preflist->max_prefs = max_prefs;
+ *_preflist = preflist;
+
+ if (index < preflist->nr)
+ memcpy(preflist->prefs + index + 1, old->prefs + index,
+ sizeof(*pref) * (preflist->nr - index));
+ if (index > 0)
+ memcpy(preflist->prefs, old->prefs, sizeof(*pref) * index);
+ } else {
+ if (index < preflist->nr)
+ memmove(preflist->prefs + index + 1, preflist->prefs + index,
+ sizeof(*pref) * (preflist->nr - index));
+ }
+
+ preflist->prefs[index] = *pref;
+ preflist->nr++;
+ if (pref->family == AF_INET)
+ preflist->ipv6_off++;
+ return 0;
+}
+
+/*
+ * Add an address preference.
+ * echo "add <proto> <IP>[/<mask>] <prior>" >/proc/fs/afs/addr_prefs
+ */
+static int afs_add_address_pref(struct afs_net *net, struct afs_addr_preference_list **_preflist,
+ int argc, char **argv)
+{
+ struct afs_addr_preference_list *preflist = *_preflist;
+ struct afs_addr_preference pref;
+ enum cmp_ret cmp;
+ int ret, i, stop;
+
+ if (argc != 3) {
+ pr_warn("Wrong number of params\n");
+ return -EINVAL;
+ }
+
+ if (strcmp(argv[0], "udp") != 0) {
+ pr_warn("Unsupported protocol\n");
+ return -EINVAL;
+ }
+
+ ret = afs_parse_address(argv[1], &pref);
+ if (ret < 0)
+ return ret;
+
+ ret = kstrtou16(argv[2], 10, &pref.prio);
+ if (ret < 0) {
+ pr_warn("Invalid priority\n");
+ return ret;
+ }
+
+ if (pref.family == AF_INET) {
+ i = 0;
+ stop = preflist->ipv6_off;
+ } else {
+ i = preflist->ipv6_off;
+ stop = preflist->nr;
+ }
+
+ for (; i < stop; i++) {
+ cmp = afs_cmp_address_pref(&pref, &preflist->prefs[i]);
+ switch (cmp) {
+ case CONTINUE_SEARCH:
+ continue;
+ case INSERT_HERE:
+ case SUBNET_MATCH:
+ return afs_insert_address_pref(_preflist, &pref, i);
+ case EXACT_MATCH:
+ preflist->prefs[i].prio = pref.prio;
+ return 0;
+ }
+ }
+
+ return afs_insert_address_pref(_preflist, &pref, i);
+}
+
+/*
+ * Delete an address preference.
+ */
+static int afs_delete_address_pref(struct afs_addr_preference_list **_preflist,
+ int index)
+{
+ struct afs_addr_preference_list *preflist = *_preflist;
+
+ _enter("{%u/%u/%u},%u", preflist->ipv6_off, preflist->nr, preflist->max_prefs, index);
+
+ if (preflist->nr == 0)
+ return -ENOENT;
+
+ if (index < preflist->nr - 1)
+ memmove(preflist->prefs + index, preflist->prefs + index + 1,
+ sizeof(preflist->prefs[0]) * (preflist->nr - index - 1));
+
+ if (index < preflist->ipv6_off)
+ preflist->ipv6_off--;
+ preflist->nr--;
+ return 0;
+}
+
+/*
+ * Delete an address preference.
+ * echo "del <proto> <IP>[/<mask>]" >/proc/fs/afs/addr_prefs
+ */
+static int afs_del_address_pref(struct afs_net *net, struct afs_addr_preference_list **_preflist,
+ int argc, char **argv)
+{
+ struct afs_addr_preference_list *preflist = *_preflist;
+ struct afs_addr_preference pref;
+ enum cmp_ret cmp;
+ int ret, i, stop;
+
+ if (argc != 2) {
+ pr_warn("Wrong number of params\n");
+ return -EINVAL;
+ }
+
+ if (strcmp(argv[0], "udp") != 0) {
+ pr_warn("Unsupported protocol\n");
+ return -EINVAL;
+ }
+
+ ret = afs_parse_address(argv[1], &pref);
+ if (ret < 0)
+ return ret;
+
+ if (pref.family == AF_INET) {
+ i = 0;
+ stop = preflist->ipv6_off;
+ } else {
+ i = preflist->ipv6_off;
+ stop = preflist->nr;
+ }
+
+ for (; i < stop; i++) {
+ cmp = afs_cmp_address_pref(&pref, &preflist->prefs[i]);
+ switch (cmp) {
+ case CONTINUE_SEARCH:
+ continue;
+ case INSERT_HERE:
+ case SUBNET_MATCH:
+ return 0;
+ case EXACT_MATCH:
+ return afs_delete_address_pref(_preflist, i);
+ }
+ }
+
+ return -ENOANO;
+}
+
+/*
+ * Handle writes to /proc/fs/afs/addr_prefs
+ */
+int afs_proc_addr_prefs_write(struct file *file, char *buf, size_t size)
+{
+ struct afs_addr_preference_list *preflist, *old;
+ struct seq_file *m = file->private_data;
+ struct afs_net *net = afs_seq2net_single(m);
+ size_t psize;
+ char *argv[5];
+ int ret, argc, max_prefs;
+
+ inode_lock(file_inode(file));
+
+ /* Allocate a candidate new list and initialise it from the old. */
+ old = rcu_dereference_protected(net->address_prefs,
+ lockdep_is_held(&file_inode(file)->i_rwsem));
+
+ if (old)
+ max_prefs = old->nr + 1;
+ else
+ max_prefs = 1;
+
+ psize = struct_size(old, prefs, max_prefs);
+ psize = roundup_pow_of_two(psize);
+ max_prefs = min_t(size_t, (psize - sizeof(*old)) / sizeof(old->prefs[0]), 255);
+
+ ret = -ENOMEM;
+ preflist = kmalloc(struct_size(preflist, prefs, max_prefs), GFP_KERNEL);
+ if (!preflist)
+ goto done;
+
+ if (old)
+ memcpy(preflist, old, struct_size(preflist, prefs, old->nr));
+ else
+ memset(preflist, 0, sizeof(*preflist));
+ preflist->max_prefs = max_prefs;
+
+ do {
+ argc = afs_split_string(&buf, argv, ARRAY_SIZE(argv));
+ if (argc < 0)
+ return argc;
+ if (argc < 2)
+ goto inval;
+
+ if (strcmp(argv[0], "add") == 0)
+ ret = afs_add_address_pref(net, &preflist, argc - 1, argv + 1);
+ else if (strcmp(argv[0], "del") == 0)
+ ret = afs_del_address_pref(net, &preflist, argc - 1, argv + 1);
+ else
+ goto inval;
+ if (ret < 0)
+ goto done;
+ } while (*buf);
+
+ preflist->version++;
+ rcu_assign_pointer(net->address_prefs, preflist);
+ /* Store prefs before version */
+ smp_store_release(&net->address_pref_version, preflist->version);
+ kfree_rcu(old, rcu);
+ preflist = NULL;
+ ret = 0;
+
+done:
+ kfree(preflist);
+ inode_unlock(file_inode(file));
+ _leave(" = %d", ret);
+ return ret;
+
+inval:
+ pr_warn("Invalid Command\n");
+ ret = -EINVAL;
+ goto done;
+}
+
+/*
+ * Mark the priorities on an address list if the address preferences table has
+ * changed. The caller must hold the RCU read lock.
+ */
+void afs_get_address_preferences_rcu(struct afs_net *net, struct afs_addr_list *alist)
+{
+ const struct afs_addr_preference_list *preflist =
+ rcu_dereference(net->address_prefs);
+ const struct sockaddr_in6 *sin6;
+ const struct sockaddr_in *sin;
+ const struct sockaddr *sa;
+ struct afs_addr_preference test;
+ enum cmp_ret cmp;
+ int i, j;
+
+ if (!preflist || !preflist->nr || !alist->nr_addrs ||
+ smp_load_acquire(&alist->addr_pref_version) == preflist->version)
+ return;
+
+ test.family = AF_INET;
+ test.subnet_mask = 32;
+ test.prio = 0;
+ for (i = 0; i < alist->nr_ipv4; i++) {
+ sa = rxrpc_kernel_remote_addr(alist->addrs[i].peer);
+ sin = (const struct sockaddr_in *)sa;
+ test.ipv4_addr = sin->sin_addr;
+ for (j = 0; j < preflist->ipv6_off; j++) {
+ cmp = afs_cmp_address_pref(&test, &preflist->prefs[j]);
+ switch (cmp) {
+ case CONTINUE_SEARCH:
+ continue;
+ case INSERT_HERE:
+ break;
+ case EXACT_MATCH:
+ case SUBNET_MATCH:
+ WRITE_ONCE(alist->addrs[i].prio, preflist->prefs[j].prio);
+ break;
+ }
+ }
+ }
+
+ test.family = AF_INET6;
+ test.subnet_mask = 128;
+ test.prio = 0;
+ for (; i < alist->nr_addrs; i++) {
+ sa = rxrpc_kernel_remote_addr(alist->addrs[i].peer);
+ sin6 = (const struct sockaddr_in6 *)sa;
+ test.ipv6_addr = sin6->sin6_addr;
+ for (j = preflist->ipv6_off; j < preflist->nr; j++) {
+ cmp = afs_cmp_address_pref(&test, &preflist->prefs[j]);
+ switch (cmp) {
+ case CONTINUE_SEARCH:
+ continue;
+ case INSERT_HERE:
+ break;
+ case EXACT_MATCH:
+ case SUBNET_MATCH:
+ WRITE_ONCE(alist->addrs[i].prio, preflist->prefs[j].prio);
+ break;
+ }
+ }
+ }
+
+ smp_store_release(&alist->addr_pref_version, preflist->version);
+}
+
+/*
+ * Mark the priorities on an address list if the address preferences table has
+ * changed. Avoid taking the RCU read lock if we can.
+ */
+void afs_get_address_preferences(struct afs_net *net, struct afs_addr_list *alist)
+{
+ if (!net->address_prefs ||
+ /* Load version before prefs */
+ smp_load_acquire(&net->address_pref_version) == alist->addr_pref_version)
+ return;
+
+ rcu_read_lock();
+ afs_get_address_preferences_rcu(net, alist);
+ rcu_read_unlock();
+}
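
Note: the write path above relies on ordered publication: rcu_assign_pointer() makes the new list visible before smp_store_release() bumps the version, so any reader that observes the new version with smp_load_acquire() (as afs_get_address_preferences() does to skip entering an RCU read-side section) is guaranteed to also see the new list behind the pointer. A reduced sketch of the pairing, with generic names and the update-side locking elided:

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct prefs {
	unsigned int	version;
	struct rcu_head	rcu;
	/* ... preference entries ... */
};

struct state {
	struct prefs __rcu	*prefs;
	unsigned int		version;	/* last published version */
};

/* Writer: caller serialises updates (the hunk uses inode_lock()). */
static void publish(struct state *s, struct prefs *new, struct prefs *old)
{
	new->version = old ? old->version + 1 : 1;
	rcu_assign_pointer(s->prefs, new);
	/* Store prefs before version. */
	smp_store_release(&s->version, new->version);
	if (old)
		kfree_rcu(old, rcu);
}

/* Reader fast path: if the version still matches what we cached, the
 * acquire pairs with the release above and the list needn't be reread. */
static bool still_current(struct state *s, unsigned int cached)
{
	/* Load version before prefs. */
	return smp_load_acquire(&s->version) == cached;
}
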
diff --git a/fs/afs/afs.h b/fs/afs/afs.h
index 81815724db6c..b488072aee87 100644
--- a/fs/afs/afs.h
+++ b/fs/afs/afs.h
@@ -165,7 +165,8 @@ struct afs_status_cb {
* AFS volume synchronisation information
*/
struct afs_volsync {
- time64_t creation; /* volume creation time */
+ time64_t creation; /* Volume creation time (or TIME64_MIN) */
+ time64_t update; /* Volume update time (or TIME64_MIN) */
};
/*
diff --git a/fs/afs/callback.c b/fs/afs/callback.c
index a484fa642808..99b2c8172021 100644
--- a/fs/afs/callback.c
+++ b/fs/afs/callback.c
@@ -33,22 +33,20 @@ void afs_invalidate_mmap_work(struct work_struct *work)
unmap_mapping_pages(vnode->netfs.inode.i_mapping, 0, 0, false);
}
-void afs_server_init_callback_work(struct work_struct *work)
+static void afs_volume_init_callback(struct afs_volume *volume)
{
- struct afs_server *server = container_of(work, struct afs_server, initcb_work);
struct afs_vnode *vnode;
- struct afs_cell *cell = server->cell;
- down_read(&cell->fs_open_mmaps_lock);
+ down_read(&volume->open_mmaps_lock);
- list_for_each_entry(vnode, &cell->fs_open_mmaps, cb_mmap_link) {
- if (vnode->cb_server == server) {
- clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
+ list_for_each_entry(vnode, &volume->open_mmaps, cb_mmap_link) {
+ if (vnode->cb_v_check != atomic_read(&volume->cb_v_break)) {
+ atomic64_set(&vnode->cb_expires_at, AFS_NO_CB_PROMISE);
queue_work(system_unbound_wq, &vnode->cb_work);
}
}
- up_read(&cell->fs_open_mmaps_lock);
+ up_read(&volume->open_mmaps_lock);
}
/*
@@ -57,15 +55,20 @@ void afs_server_init_callback_work(struct work_struct *work)
*/
void afs_init_callback_state(struct afs_server *server)
{
- rcu_read_lock();
- do {
- server->cb_s_break++;
- atomic_inc(&server->cell->fs_s_break);
- if (!list_empty(&server->cell->fs_open_mmaps))
- queue_work(system_unbound_wq, &server->initcb_work);
+ struct afs_server_entry *se;
- } while ((server = rcu_dereference(server->uuid_next)));
- rcu_read_unlock();
+ down_read(&server->cell->vs_lock);
+
+ list_for_each_entry(se, &server->volumes, slink) {
+ se->cb_expires_at = AFS_NO_CB_PROMISE;
+ se->volume->cb_expires_at = AFS_NO_CB_PROMISE;
+ trace_afs_cb_v_break(se->volume->vid, atomic_read(&se->volume->cb_v_break),
+ afs_cb_break_for_s_reinit);
+ if (!list_empty(&se->volume->open_mmaps))
+ afs_volume_init_callback(se->volume);
+ }
+
+ up_read(&server->cell->vs_lock);
}
/*
@@ -76,9 +79,9 @@ void __afs_break_callback(struct afs_vnode *vnode, enum afs_cb_break_reason reas
_enter("");
clear_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags);
- if (test_and_clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) {
+ if (atomic64_xchg(&vnode->cb_expires_at, AFS_NO_CB_PROMISE) != AFS_NO_CB_PROMISE) {
vnode->cb_break++;
- vnode->cb_v_break = vnode->volume->cb_v_break;
+ vnode->cb_v_check = atomic_read(&vnode->volume->cb_v_break);
afs_clear_permits(vnode);
if (vnode->lock_state == AFS_VNODE_LOCK_WAITING_FOR_CB)
@@ -110,13 +113,14 @@ static struct afs_volume *afs_lookup_volume_rcu(struct afs_cell *cell,
{
struct afs_volume *volume = NULL;
struct rb_node *p;
- int seq = 0;
+ int seq = 1;
- do {
+ for (;;) {
/* Unfortunately, rbtree walking doesn't give reliable results
* under just the RCU read lock, so we have to check for
* changes.
*/
+ seq++; /* 2 on the 1st/lockless path, otherwise odd */
read_seqbegin_or_lock(&cell->volume_lock, &seq);
p = rcu_dereference_raw(cell->volumes.rb_node);
@@ -132,35 +136,63 @@ static struct afs_volume *afs_lookup_volume_rcu(struct afs_cell *cell,
volume = NULL;
}
- } while (need_seqretry(&cell->volume_lock, seq));
+ if (volume && afs_try_get_volume(volume, afs_volume_trace_get_callback))
+ break;
+ if (!need_seqretry(&cell->volume_lock, seq))
+ break;
+ seq |= 1; /* Want a lock next time */
+ }
done_seqretry(&cell->volume_lock, seq);
return volume;
}
/*
+ * Allow the fileserver to break callbacks at the volume-level. This is
+ * typically done when, for example, a R/W volume is snapshotted to a R/O
+ * volume (the only way to change an R/O volume). It may also, however, happen
+ * when a volserver takes control of a volume (offlining it, moving it, etc.).
+ *
+ * Every file in that volume will need to be reevaluated.
+ */
+static void afs_break_volume_callback(struct afs_server *server,
+ struct afs_volume *volume)
+ __releases(RCU)
+{
+ struct afs_server_list *slist = rcu_dereference(volume->servers);
+ unsigned int i, cb_v_break;
+
+ write_lock(&volume->cb_v_break_lock);
+
+ for (i = 0; i < slist->nr_servers; i++)
+ if (slist->servers[i].server == server)
+ slist->servers[i].cb_expires_at = AFS_NO_CB_PROMISE;
+ volume->cb_expires_at = AFS_NO_CB_PROMISE;
+
+ cb_v_break = atomic_inc_return_release(&volume->cb_v_break);
+ trace_afs_cb_v_break(volume->vid, cb_v_break, afs_cb_break_for_volume_callback);
+
+ write_unlock(&volume->cb_v_break_lock);
+ rcu_read_unlock();
+
+ if (!list_empty(&volume->open_mmaps))
+ afs_volume_init_callback(volume);
+}
+
+/*
* allow the fileserver to explicitly break one callback
* - happens when
* - the backing file is changed
* - a lock is released
*/
-static void afs_break_one_callback(struct afs_volume *volume,
+static void afs_break_one_callback(struct afs_server *server,
+ struct afs_volume *volume,
struct afs_fid *fid)
{
struct super_block *sb;
struct afs_vnode *vnode;
struct inode *inode;
- if (fid->vnode == 0 && fid->unique == 0) {
- /* The callback break applies to an entire volume. */
- write_lock(&volume->cb_v_break_lock);
- volume->cb_v_break++;
- trace_afs_cb_break(fid, volume->cb_v_break,
- afs_cb_break_for_volume_callback, false);
- write_unlock(&volume->cb_v_break_lock);
- return;
- }
-
/* See if we can find a matching inode - even an I_NEW inode needs to
* be marked as it can have its callback broken before we finish
* setting up the local inode.
@@ -187,25 +219,35 @@ static void afs_break_some_callbacks(struct afs_server *server,
afs_volid_t vid = cbb->fid.vid;
size_t i;
+ rcu_read_lock();
volume = afs_lookup_volume_rcu(server->cell, vid);
+ if (cbb->fid.vnode == 0 && cbb->fid.unique == 0) {
+ afs_break_volume_callback(server, volume);
+ *_count -= 1;
+ if (*_count)
+ memmove(cbb, cbb + 1, sizeof(*cbb) * *_count);
+ } else {
+ /* TODO: Find all matching volumes if we couldn't match the server and
+ * break them anyway.
+ */
- /* TODO: Find all matching volumes if we couldn't match the server and
- * break them anyway.
- */
-
- for (i = *_count; i > 0; cbb++, i--) {
- if (cbb->fid.vid == vid) {
- _debug("- Fid { vl=%08llx n=%llu u=%u }",
- cbb->fid.vid,
- cbb->fid.vnode,
- cbb->fid.unique);
- --*_count;
- if (volume)
- afs_break_one_callback(volume, &cbb->fid);
- } else {
- *residue++ = *cbb;
+ for (i = *_count; i > 0; cbb++, i--) {
+ if (cbb->fid.vid == vid) {
+ _debug("- Fid { vl=%08llx n=%llu u=%u }",
+ cbb->fid.vid,
+ cbb->fid.vnode,
+ cbb->fid.unique);
+ --*_count;
+ if (volume)
+ afs_break_one_callback(server, volume, &cbb->fid);
+ } else {
+ *residue++ = *cbb;
+ }
}
+ rcu_read_unlock();
}
+
+ afs_put_volume(volume, afs_volume_trace_put_callback);
}
/*
@@ -218,11 +260,6 @@ void afs_break_callbacks(struct afs_server *server, size_t count,
ASSERT(server != NULL);
- rcu_read_lock();
-
while (count > 0)
afs_break_some_callbacks(server, callbacks, &count);
-
- rcu_read_unlock();
- return;
}
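
Note: the afs_lookup_volume_rcu() rework above uses the seqlock "lockless then locked" idiom: read_seqbegin_or_lock() treats an even sequence value as a lockless attempt and an odd one as "take the lock", so the loop starts even (seq becomes 2 on the first pass), pins its result with a reference to validate it, and forces seq odd before retrying so the second pass cannot livelock against writers. A generic sketch of the idiom (struct tree, do_walk() and try_get_item() are placeholders; the seqlock calls are the real API):

#include <linux/seqlock.h>
#include <linux/types.h>

struct item;
struct tree {
	seqlock_t	lock;
	/* ... the protected structure ... */
};

extern struct item *do_walk(struct tree *t, u64 key);	/* may see a torn tree */
extern bool try_get_item(struct item *item);		/* conditional ref get */

static struct item *lookup(struct tree *t, u64 key)
{
	struct item *found;
	int seq = 1;

	for (;;) {
		seq++;	/* 2 (even: lockless) on the first pass */
		read_seqbegin_or_lock(&t->lock, &seq);

		found = do_walk(t, key);

		/* Pinning the object proves the result is usable even if
		 * the walk raced with a writer. */
		if (found && try_get_item(found))
			break;
		if (!need_seqretry(&t->lock, seq))
			break;
		seq |= 1;	/* odd: take the lock next time */
	}
	done_seqretry(&t->lock, seq);
	return found;
}
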
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index 926cb1188eba..caa09875f520 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -161,13 +161,12 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net,
refcount_set(&cell->ref, 1);
atomic_set(&cell->active, 0);
INIT_WORK(&cell->manager, afs_manage_cell_work);
+ init_rwsem(&cell->vs_lock);
cell->volumes = RB_ROOT;
INIT_HLIST_HEAD(&cell->proc_volumes);
seqlock_init(&cell->volume_lock);
cell->fs_servers = RB_ROOT;
seqlock_init(&cell->fs_lock);
- INIT_LIST_HEAD(&cell->fs_open_mmaps);
- init_rwsem(&cell->fs_open_mmaps_lock);
rwlock_init(&cell->vl_servers_lock);
cell->flags = (1 << AFS_CELL_FL_CHECK_ALIAS);
@@ -817,7 +816,7 @@ done:
final_destruction:
/* The root volume is pinning the cell */
- afs_put_volume(cell->net, cell->root_volume, afs_volume_trace_put_cell_root);
+ afs_put_volume(cell->root_volume, afs_volume_trace_put_cell_root);
cell->root_volume = NULL;
afs_put_cell(cell, afs_cell_trace_put_destroy);
}
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index d4ddb20d6732..99a3f20bc786 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -146,10 +146,11 @@ static int afs_find_cm_server_by_peer(struct afs_call *call)
{
struct sockaddr_rxrpc srx;
struct afs_server *server;
+ struct rxrpc_peer *peer;
- rxrpc_kernel_get_peer(call->net->socket, call->rxcall, &srx);
+ peer = rxrpc_kernel_get_call_peer(call->net->socket, call->rxcall);
- server = afs_find_server(call->net, &srx);
+ server = afs_find_server(call->net, peer);
if (!server) {
trace_afs_cm_no_server(call, &srx);
return 0;
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 5219182e52e1..c14533ef108f 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -693,8 +693,9 @@ static void afs_do_lookup_success(struct afs_operation *op)
vp = &op->file[0];
abort_code = vp->scb.status.abort_code;
if (abort_code != 0) {
- op->ac.abort_code = abort_code;
- op->error = afs_abort_to_error(abort_code);
+ op->call_abort_code = abort_code;
+ afs_op_set_error(op, afs_abort_to_error(abort_code));
+ op->cumul_error.abort_code = abort_code;
}
break;
@@ -806,8 +807,8 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry,
cookie->fids[i].vid = dvnode->fid.vid;
cookie->ctx.actor = afs_lookup_filldir;
cookie->name = dentry->d_name;
- cookie->nr_fids = 2; /* slot 0 is saved for the fid we actually want
- * and slot 1 for the directory */
+ cookie->nr_fids = 2; /* slot 1 is saved for the fid we actually want
+ * and slot 0 for the directory */
if (!afs_server_supports_ibulk(dvnode))
cookie->one_only = true;
@@ -846,13 +847,14 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry,
_debug("nr_files %u", op->nr_files);
/* Need space for examining all the selected files */
- op->error = -ENOMEM;
if (op->nr_files > 2) {
op->more_files = kvcalloc(op->nr_files - 2,
sizeof(struct afs_vnode_param),
GFP_KERNEL);
- if (!op->more_files)
+ if (!op->more_files) {
+ afs_op_nomem(op);
goto out_op;
+ }
for (i = 2; i < op->nr_files; i++) {
vp = &op->more_files[i - 2];
@@ -878,14 +880,14 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry,
* lookups contained therein are stored in the reply without aborting
* the whole operation.
*/
- op->error = -ENOTSUPP;
+ afs_op_set_error(op, -ENOTSUPP);
if (!cookie->one_only) {
op->ops = &afs_inline_bulk_status_operation;
afs_begin_vnode_operation(op);
afs_wait_for_operation(op);
}
- if (op->error == -ENOTSUPP) {
+ if (afs_op_error(op) == -ENOTSUPP) {
/* We could try FS.BulkStatus next, but this aborts the entire
* op if any of the lookups fails - so, for the moment, revert
* to FS.FetchStatus for op->file[1].
@@ -895,10 +897,10 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry,
afs_begin_vnode_operation(op);
afs_wait_for_operation(op);
}
- inode = ERR_PTR(op->error);
+ inode = ERR_PTR(afs_op_error(op));
out_op:
- if (op->error == 0) {
+ if (!afs_op_error(op)) {
inode = &op->file[1].vnode->netfs.inode;
op->file[1].vnode = NULL;
}
@@ -1116,7 +1118,12 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
dir = AFS_FS_I(d_inode(parent));
/* validate the parent directory */
- afs_validate(dir, key);
+ ret = afs_validate(dir, key);
+ if (ret == -ERESTARTSYS) {
+ dput(parent);
+ key_put(key);
+ return ret;
+ }
if (test_bit(AFS_VNODE_DELETED, &dir->flags)) {
_debug("%pd: parent dir deleted", dentry);
@@ -1255,9 +1262,10 @@ void afs_check_for_remote_deletion(struct afs_operation *op)
{
struct afs_vnode *vnode = op->file[0].vnode;
- switch (op->ac.abort_code) {
+ switch (afs_op_abort_code(op)) {
case VNOVNODE:
set_bit(AFS_VNODE_DELETED, &vnode->flags);
+ clear_nlink(&vnode->netfs.inode);
afs_break_callback(vnode, afs_cb_break_for_deleted);
}
}
@@ -1273,20 +1281,20 @@ static void afs_vnode_new_inode(struct afs_operation *op)
_enter("");
- ASSERTCMP(op->error, ==, 0);
+ ASSERTCMP(afs_op_error(op), ==, 0);
inode = afs_iget(op, vp);
if (IS_ERR(inode)) {
/* ENOMEM or EINTR at a really inconvenient time - just abandon
* the new directory on the server.
*/
- op->error = PTR_ERR(inode);
+ afs_op_accumulate_error(op, PTR_ERR(inode), 0);
return;
}
vnode = AFS_FS_I(inode);
set_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags);
- if (!op->error)
+ if (!afs_op_error(op))
afs_cache_permit(vnode, op->key, vnode->cb_break, &vp->scb);
d_instantiate(op->dentry, inode);
}
@@ -1320,7 +1328,7 @@ static void afs_create_put(struct afs_operation *op)
{
_enter("op=%08x", op->debug_id);
- if (op->error)
+ if (afs_op_error(op))
d_drop(op->dentry);
}
@@ -1373,7 +1381,7 @@ static void afs_dir_remove_subdir(struct dentry *dentry)
clear_nlink(&vnode->netfs.inode);
set_bit(AFS_VNODE_DELETED, &vnode->flags);
- clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
+ atomic64_set(&vnode->cb_expires_at, AFS_NO_CB_PROMISE);
clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
}
}
@@ -1480,7 +1488,7 @@ static void afs_dir_remove_link(struct afs_operation *op)
struct dentry *dentry = op->dentry;
int ret;
- if (op->error != 0 ||
+ if (afs_op_error(op) ||
(op->file[1].scb.have_status && op->file[1].scb.have_error))
return;
if (d_really_is_positive(dentry))
@@ -1504,10 +1512,10 @@ static void afs_dir_remove_link(struct afs_operation *op)
ret = afs_validate(vnode, op->key);
if (ret != -ESTALE)
- op->error = ret;
+ afs_op_set_error(op, ret);
}
- _debug("nlink %d [val %d]", vnode->netfs.inode.i_nlink, op->error);
+ _debug("nlink %d [val %d]", vnode->netfs.inode.i_nlink, afs_op_error(op));
}
static void afs_unlink_success(struct afs_operation *op)
@@ -1538,7 +1546,7 @@ static void afs_unlink_edit_dir(struct afs_operation *op)
static void afs_unlink_put(struct afs_operation *op)
{
_enter("op=%08x", op->debug_id);
- if (op->unlink.need_rehash && op->error < 0 && op->error != -ENOENT)
+ if (op->unlink.need_rehash && afs_op_error(op) < 0 && afs_op_error(op) != -ENOENT)
d_rehash(op->dentry);
}
@@ -1579,7 +1587,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry)
/* Try to make sure we have a callback promise on the victim. */
ret = afs_validate(vnode, op->key);
if (ret < 0) {
- op->error = ret;
+ afs_op_set_error(op, ret);
goto error;
}
@@ -1588,7 +1596,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry)
spin_unlock(&dentry->d_lock);
/* Start asynchronous writeout of the inode */
write_inode_now(d_inode(dentry), 0);
- op->error = afs_sillyrename(dvnode, vnode, dentry, op->key);
+ afs_op_set_error(op, afs_sillyrename(dvnode, vnode, dentry, op->key));
goto error;
}
if (!d_unhashed(dentry)) {
@@ -1609,7 +1617,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry)
/* If there was a conflict with a third party, check the status of the
* unlinked vnode.
*/
- if (op->error == 0 && (op->flags & AFS_OPERATION_DIR_CONFLICT)) {
+ if (afs_op_error(op) == 0 && (op->flags & AFS_OPERATION_DIR_CONFLICT)) {
op->file[1].update_ctime = false;
op->fetch_status.which = 1;
op->ops = &afs_fetch_status_operation;
@@ -1691,7 +1699,7 @@ static void afs_link_success(struct afs_operation *op)
static void afs_link_put(struct afs_operation *op)
{
_enter("op=%08x", op->debug_id);
- if (op->error)
+ if (afs_op_error(op))
d_drop(op->dentry);
}
@@ -1889,7 +1897,7 @@ static void afs_rename_put(struct afs_operation *op)
if (op->rename.rehash)
d_rehash(op->rename.rehash);
dput(op->rename.tmp);
- if (op->error)
+ if (afs_op_error(op))
d_rehash(op->dentry);
}
@@ -1934,7 +1942,7 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
return PTR_ERR(op);
ret = afs_validate(vnode, op->key);
- op->error = ret;
+ afs_op_set_error(op, ret);
if (ret < 0)
goto error;
@@ -1971,7 +1979,7 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
op->rename.tmp = d_alloc(new_dentry->d_parent,
&new_dentry->d_name);
if (!op->rename.tmp) {
- op->error = -ENOMEM;
+ afs_op_nomem(op);
goto error;
}
@@ -1979,7 +1987,7 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
AFS_FS_I(d_inode(new_dentry)),
new_dentry, op->key);
if (ret) {
- op->error = ret;
+ afs_op_set_error(op, ret);
goto error;
}
diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c
index bb5807e87fa4..a1e581946b93 100644
--- a/fs/afs/dir_silly.c
+++ b/fs/afs/dir_silly.c
@@ -218,7 +218,7 @@ static int afs_do_silly_unlink(struct afs_vnode *dvnode, struct afs_vnode *vnode
/* If there was a conflict with a third party, check the status of the
* unlinked vnode.
*/
- if (op->error == 0 && (op->flags & AFS_OPERATION_DIR_CONFLICT)) {
+ if (op->cumul_error.error == 0 && (op->flags & AFS_OPERATION_DIR_CONFLICT)) {
op->file[1].update_ctime = false;
op->fetch_status.which = 1;
op->ops = &afs_fetch_status_operation;
diff --git a/fs/afs/file.c b/fs/afs/file.c
index d37dd201752b..30914e0d9cb2 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -243,12 +243,9 @@ static void afs_fetch_data_notify(struct afs_operation *op)
{
struct afs_read *req = op->fetch.req;
struct netfs_io_subrequest *subreq = req->subreq;
- int error = op->error;
+ int error = afs_op_error(op);
- if (error == -ECONNABORTED)
- error = afs_abort_to_error(op->ac.abort_code);
req->error = error;
-
if (subreq) {
__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
netfs_subreq_terminated(subreq, error ?: req->actual_len, false);
@@ -271,7 +268,7 @@ static void afs_fetch_data_success(struct afs_operation *op)
static void afs_fetch_data_put(struct afs_operation *op)
{
- op->fetch.req->error = op->error;
+ op->fetch.req->error = afs_op_error(op);
afs_put_read(op->fetch.req);
}
@@ -517,13 +514,12 @@ static bool afs_release_folio(struct folio *folio, gfp_t gfp)
static void afs_add_open_mmap(struct afs_vnode *vnode)
{
if (atomic_inc_return(&vnode->cb_nr_mmap) == 1) {
- down_write(&vnode->volume->cell->fs_open_mmaps_lock);
+ down_write(&vnode->volume->open_mmaps_lock);
if (list_empty(&vnode->cb_mmap_link))
- list_add_tail(&vnode->cb_mmap_link,
- &vnode->volume->cell->fs_open_mmaps);
+ list_add_tail(&vnode->cb_mmap_link, &vnode->volume->open_mmaps);
- up_write(&vnode->volume->cell->fs_open_mmaps_lock);
+ up_write(&vnode->volume->open_mmaps_lock);
}
}
@@ -532,12 +528,12 @@ static void afs_drop_open_mmap(struct afs_vnode *vnode)
if (!atomic_dec_and_test(&vnode->cb_nr_mmap))
return;
- down_write(&vnode->volume->cell->fs_open_mmaps_lock);
+ down_write(&vnode->volume->open_mmaps_lock);
if (atomic_read(&vnode->cb_nr_mmap) == 0)
list_del_init(&vnode->cb_mmap_link);
- up_write(&vnode->volume->cell->fs_open_mmaps_lock);
+ up_write(&vnode->volume->open_mmaps_lock);
flush_work(&vnode->cb_work);
}
@@ -573,7 +569,7 @@ static vm_fault_t afs_vm_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pg
{
struct afs_vnode *vnode = AFS_FS_I(file_inode(vmf->vma->vm_file));
- if (afs_pagecache_valid(vnode))
+ if (afs_check_validity(vnode))
return filemap_map_pages(vmf, start_pgoff, end_pgoff);
return 0;
}
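
Note: afs_add_open_mmap() and afs_drop_open_mmap() above keep each vnode on a per-volume list of mapped files, but take the rwsem only on the 0<->1 transitions of the mmap count; the list_empty()/atomic_read() re-checks under the lock close the window against a racing add or drop. The same counted-registration pattern in generic form:

#include <linux/atomic.h>
#include <linux/list.h>
#include <linux/rwsem.h>

struct registry {
	struct rw_semaphore	lock;
	struct list_head	list;
};

struct obj {
	atomic_t		nr_users;
	struct list_head	link;	/* initialised empty */
};

static void obj_first_use(struct registry *reg, struct obj *o)
{
	/* Only the 0 -> 1 transition links; later users skip the lock. */
	if (atomic_inc_return(&o->nr_users) == 1) {
		down_write(&reg->lock);
		if (list_empty(&o->link))	/* lost a race with a drop? */
			list_add_tail(&o->link, &reg->list);
		up_write(&reg->lock);
	}
}

static void obj_last_use(struct registry *reg, struct obj *o)
{
	if (!atomic_dec_and_test(&o->nr_users))
		return;
	down_write(&reg->lock);
	if (atomic_read(&o->nr_users) == 0)	/* re-check under the lock */
		list_del_init(&o->link);
	up_write(&reg->lock);
}
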
diff --git a/fs/afs/fs_operation.c b/fs/afs/fs_operation.c
index 7a3803ce3a22..3546b087e791 100644
--- a/fs/afs/fs_operation.c
+++ b/fs/afs/fs_operation.c
@@ -35,13 +35,15 @@ struct afs_operation *afs_alloc_operation(struct key *key, struct afs_volume *vo
key_get(key);
}
- op->key = key;
- op->volume = afs_get_volume(volume, afs_volume_trace_get_new_op);
- op->net = volume->cell->net;
- op->cb_v_break = volume->cb_v_break;
- op->debug_id = atomic_inc_return(&afs_operation_debug_counter);
- op->error = -EDESTADDRREQ;
- op->ac.error = SHRT_MAX;
+ op->key = key;
+ op->volume = afs_get_volume(volume, afs_volume_trace_get_new_op);
+ op->net = volume->cell->net;
+ op->cb_v_break = atomic_read(&volume->cb_v_break);
+ op->pre_volsync.creation = volume->creation_time;
+ op->pre_volsync.update = volume->update_time;
+ op->debug_id = atomic_inc_return(&afs_operation_debug_counter);
+ op->nr_iterations = -1;
+ afs_op_set_error(op, -EDESTADDRREQ);
_leave(" = [op=%08x]", op->debug_id);
return op;
@@ -71,7 +73,7 @@ static bool afs_get_io_locks(struct afs_operation *op)
swap(vnode, vnode2);
if (mutex_lock_interruptible(&vnode->io_lock) < 0) {
- op->error = -ERESTARTSYS;
+ afs_op_set_error(op, -ERESTARTSYS);
op->flags |= AFS_OPERATION_STOP;
_leave(" = f [I 0]");
return false;
@@ -80,7 +82,7 @@ static bool afs_get_io_locks(struct afs_operation *op)
if (vnode2) {
if (mutex_lock_interruptible_nested(&vnode2->io_lock, 1) < 0) {
- op->error = -ERESTARTSYS;
+ afs_op_set_error(op, -ERESTARTSYS);
op->flags |= AFS_OPERATION_STOP;
mutex_unlock(&vnode->io_lock);
op->flags &= ~AFS_OPERATION_LOCK_0;
@@ -147,7 +149,7 @@ bool afs_begin_vnode_operation(struct afs_operation *op)
afs_prepare_vnode(op, &op->file[0], 0);
afs_prepare_vnode(op, &op->file[1], 1);
- op->cb_v_break = op->volume->cb_v_break;
+ op->cb_v_break = atomic_read(&op->volume->cb_v_break);
_leave(" = true");
return true;
}
@@ -159,16 +161,16 @@ static void afs_end_vnode_operation(struct afs_operation *op)
{
_enter("");
- if (op->error == -EDESTADDRREQ ||
- op->error == -EADDRNOTAVAIL ||
- op->error == -ENETUNREACH ||
- op->error == -EHOSTUNREACH)
+ switch (afs_op_error(op)) {
+ case -EDESTADDRREQ:
+ case -EADDRNOTAVAIL:
+ case -ENETUNREACH:
+ case -EHOSTUNREACH:
afs_dump_edestaddrreq(op);
+ break;
+ }
afs_drop_io_locks(op);
-
- if (op->error == -ECONNABORTED)
- op->error = afs_abort_to_error(op->ac.abort_code);
}
/*
@@ -179,37 +181,43 @@ void afs_wait_for_operation(struct afs_operation *op)
_enter("");
while (afs_select_fileserver(op)) {
- op->cb_s_break = op->server->cb_s_break;
+ op->call_responded = false;
+ op->call_error = 0;
+ op->call_abort_code = 0;
if (test_bit(AFS_SERVER_FL_IS_YFS, &op->server->flags) &&
op->ops->issue_yfs_rpc)
op->ops->issue_yfs_rpc(op);
else if (op->ops->issue_afs_rpc)
op->ops->issue_afs_rpc(op);
else
- op->ac.error = -ENOTSUPP;
-
- if (op->call)
- op->error = afs_wait_for_call_to_complete(op->call, &op->ac);
+ op->call_error = -ENOTSUPP;
+
+ if (op->call) {
+ afs_wait_for_call_to_complete(op->call);
+ op->call_abort_code = op->call->abort_code;
+ op->call_error = op->call->error;
+ op->call_responded = op->call->responded;
+ afs_put_call(op->call);
+ }
}
- switch (op->error) {
- case 0:
+ if (op->call_responded)
+ set_bit(AFS_SERVER_FL_RESPONDING, &op->server->flags);
+
+ if (!afs_op_error(op)) {
_debug("success");
op->ops->success(op);
- break;
- case -ECONNABORTED:
+ } else if (op->cumul_error.aborted) {
if (op->ops->aborted)
op->ops->aborted(op);
- fallthrough;
- default:
+ } else {
if (op->ops->failed)
op->ops->failed(op);
- break;
}
afs_end_vnode_operation(op);
- if (op->error == 0 && op->ops->edit_dir) {
+ if (!afs_op_error(op) && op->ops->edit_dir) {
_debug("edit_dir");
op->ops->edit_dir(op);
}
@@ -221,7 +229,8 @@ void afs_wait_for_operation(struct afs_operation *op)
*/
int afs_put_operation(struct afs_operation *op)
{
- int i, ret = op->error;
+ struct afs_addr_list *alist;
+ int i, ret = afs_op_error(op);
_enter("op=%08x,%d", op->debug_id, ret);
@@ -243,9 +252,19 @@ int afs_put_operation(struct afs_operation *op)
kfree(op->more_files);
}
- afs_end_cursor(&op->ac);
+ if (op->estate) {
+ alist = op->estate->addresses;
+ if (alist) {
+ if (op->call_responded &&
+ op->addr_index != alist->preferred &&
+ test_bit(alist->preferred, &op->addr_tried))
+ WRITE_ONCE(alist->preferred, op->addr_index);
+ }
+ }
+
+ afs_clear_server_states(op);
afs_put_serverlist(op->net, op->server_list);
- afs_put_volume(op->net, op->volume, afs_volume_trace_put_put_op);
+ afs_put_volume(op->volume, afs_volume_trace_put_put_op);
key_put(op->key);
kfree(op);
return ret;
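afs_put_operation() now folds a learned preference back into the shared address list: if the call got its response from an address other than the current favourite, and the favourite was among the addresses tried along the way, the responding address becomes the new favourite. A plain-C mirror of that test (types simplified; 'tried' is the bitmask of tried addresses, as in the patch's op->addr_tried):

#include <stdbool.h>

struct addr_list {
	unsigned char preferred;	/* index of the favoured address */
};

static void maybe_promote(struct addr_list *alist, bool call_responded,
			  unsigned int addr_index, unsigned long tried)
{
	if (call_responded &&
	    addr_index != alist->preferred &&
	    (tried & (1UL << alist->preferred)))
		alist->preferred = addr_index;	/* WRITE_ONCE() in the kernel */
}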
diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c
index daaf3810cc92..580de4adaaf6 100644
--- a/fs/afs/fs_probe.c
+++ b/fs/afs/fs_probe.c
@@ -15,6 +15,42 @@
static unsigned int afs_fs_probe_fast_poll_interval = 30 * HZ;
static unsigned int afs_fs_probe_slow_poll_interval = 5 * 60 * HZ;
+struct afs_endpoint_state *afs_get_endpoint_state(struct afs_endpoint_state *estate,
+ enum afs_estate_trace where)
+{
+ if (estate) {
+ int r;
+
+ __refcount_inc(&estate->ref, &r);
+ trace_afs_estate(estate->server_id, estate->probe_seq, r, where);
+ }
+ return estate;
+}
+
+static void afs_endpoint_state_rcu(struct rcu_head *rcu)
+{
+ struct afs_endpoint_state *estate = container_of(rcu, struct afs_endpoint_state, rcu);
+
+ trace_afs_estate(estate->server_id, estate->probe_seq, refcount_read(&estate->ref),
+ afs_estate_trace_free);
+ afs_put_addrlist(estate->addresses, afs_alist_trace_put_estate);
+ kfree(estate);
+}
+
+void afs_put_endpoint_state(struct afs_endpoint_state *estate, enum afs_estate_trace where)
+{
+ if (estate) {
+ unsigned int server_id = estate->server_id, probe_seq = estate->probe_seq;
+ bool dead;
+ int r;
+
+ dead = __refcount_dec_and_test(&estate->ref, &r);
+ trace_afs_estate(server_id, probe_seq, r, where);
+ if (dead)
+ call_rcu(&estate->rcu, afs_endpoint_state_rcu);
+ }
+}
+
/*
* Start the probe polling timer. We have to supply it with an inc on the
* outstanding server count.
@@ -38,9 +74,10 @@ static void afs_schedule_fs_probe(struct afs_net *net,
/*
* Handle the completion of a set of probes.
*/
-static void afs_finished_fs_probe(struct afs_net *net, struct afs_server *server)
+static void afs_finished_fs_probe(struct afs_net *net, struct afs_server *server,
+ struct afs_endpoint_state *estate)
{
- bool responded = server->probe.responded;
+ bool responded = test_bit(AFS_ESTATE_RESPONDED, &estate->flags);
write_seqlock(&net->fs_lock);
if (responded) {
@@ -50,6 +87,7 @@ static void afs_finished_fs_probe(struct afs_net *net, struct afs_server *server
clear_bit(AFS_SERVER_FL_RESPONDING, &server->flags);
list_add_tail(&server->probe_link, &net->fs_probe_fast);
}
+
write_sequnlock(&net->fs_lock);
afs_schedule_fs_probe(net, server, !responded);
@@ -58,12 +96,13 @@ static void afs_finished_fs_probe(struct afs_net *net, struct afs_server *server
/*
* Handle the completion of a probe.
*/
-static void afs_done_one_fs_probe(struct afs_net *net, struct afs_server *server)
+static void afs_done_one_fs_probe(struct afs_net *net, struct afs_server *server,
+ struct afs_endpoint_state *estate)
{
_enter("");
- if (atomic_dec_and_test(&server->probe_outstanding))
- afs_finished_fs_probe(net, server);
+ if (atomic_dec_and_test(&estate->nr_probing))
+ afs_finished_fs_probe(net, server, estate);
wake_up_all(&server->probe_wq);
}
@@ -74,24 +113,22 @@ static void afs_done_one_fs_probe(struct afs_net *net, struct afs_server *server
*/
static void afs_fs_probe_not_done(struct afs_net *net,
struct afs_server *server,
- struct afs_addr_cursor *ac)
+ struct afs_endpoint_state *estate,
+ int index)
{
- struct afs_addr_list *alist = ac->alist;
- unsigned int index = ac->index;
-
_enter("");
trace_afs_io_error(0, -ENOMEM, afs_io_error_fs_probe_fail);
spin_lock(&server->probe_lock);
- server->probe.local_failure = true;
- if (server->probe.error == 0)
- server->probe.error = -ENOMEM;
+ set_bit(AFS_ESTATE_LOCAL_FAILURE, &estate->flags);
+ if (estate->error == 0)
+ estate->error = -ENOMEM;
- set_bit(index, &alist->failed);
+ set_bit(index, &estate->failed_set);
spin_unlock(&server->probe_lock);
- return afs_done_one_fs_probe(net, server);
+ return afs_done_one_fs_probe(net, server, estate);
}
/*
@@ -100,30 +137,34 @@ static void afs_fs_probe_not_done(struct afs_net *net,
*/
void afs_fileserver_probe_result(struct afs_call *call)
{
- struct afs_addr_list *alist = call->alist;
+ struct afs_endpoint_state *estate = call->probe;
+ struct afs_addr_list *alist = estate->addresses;
+ struct afs_address *addr = &alist->addrs[call->probe_index];
struct afs_server *server = call->server;
- unsigned int index = call->addr_ix;
- unsigned int rtt_us = 0, cap0;
+ unsigned int index = call->probe_index;
+ unsigned int rtt_us = -1, cap0;
int ret = call->error;
_enter("%pU,%u", &server->uuid, index);
+ WRITE_ONCE(addr->last_error, ret);
+
spin_lock(&server->probe_lock);
switch (ret) {
case 0:
- server->probe.error = 0;
+ estate->error = 0;
goto responded;
case -ECONNABORTED:
- if (!server->probe.responded) {
- server->probe.abort_code = call->abort_code;
- server->probe.error = ret;
+ if (!test_bit(AFS_ESTATE_RESPONDED, &estate->flags)) {
+ estate->abort_code = call->abort_code;
+ estate->error = ret;
}
goto responded;
case -ENOMEM:
case -ENONET:
- clear_bit(index, &alist->responded);
- server->probe.local_failure = true;
+ clear_bit(index, &estate->responsive_set);
+ set_bit(AFS_ESTATE_LOCAL_FAILURE, &estate->flags);
trace_afs_io_error(call->debug_id, ret, afs_io_error_fs_probe_fail);
goto out;
case -ECONNRESET: /* Responded, but call expired. */
@@ -136,29 +177,29 @@ void afs_fileserver_probe_result(struct afs_call *call)
case -ETIMEDOUT:
case -ETIME:
default:
- clear_bit(index, &alist->responded);
- set_bit(index, &alist->failed);
- if (!server->probe.responded &&
- (server->probe.error == 0 ||
- server->probe.error == -ETIMEDOUT ||
- server->probe.error == -ETIME))
- server->probe.error = ret;
+ clear_bit(index, &estate->responsive_set);
+ set_bit(index, &estate->failed_set);
+ if (!test_bit(AFS_ESTATE_RESPONDED, &estate->flags) &&
+ (estate->error == 0 ||
+ estate->error == -ETIMEDOUT ||
+ estate->error == -ETIME))
+ estate->error = ret;
trace_afs_io_error(call->debug_id, ret, afs_io_error_fs_probe_fail);
goto out;
}
responded:
- clear_bit(index, &alist->failed);
+ clear_bit(index, &estate->failed_set);
if (call->service_id == YFS_FS_SERVICE) {
- server->probe.is_yfs = true;
+ set_bit(AFS_ESTATE_IS_YFS, &estate->flags);
set_bit(AFS_SERVER_FL_IS_YFS, &server->flags);
- alist->addrs[index].srx_service = call->service_id;
+ server->service_id = call->service_id;
} else {
- server->probe.not_yfs = true;
- if (!server->probe.is_yfs) {
+ set_bit(AFS_ESTATE_NOT_YFS, &estate->flags);
+ if (!test_bit(AFS_ESTATE_IS_YFS, &estate->flags)) {
clear_bit(AFS_SERVER_FL_IS_YFS, &server->flags);
- alist->addrs[index].srx_service = call->service_id;
+ server->service_id = call->service_id;
}
cap0 = ntohl(call->tmp);
if (cap0 & AFS3_VICED_CAPABILITY_64BITFILES)
@@ -167,116 +208,136 @@ responded:
clear_bit(AFS_SERVER_FL_HAS_FS64, &server->flags);
}
- rxrpc_kernel_get_srtt(call->net->socket, call->rxcall, &rtt_us);
- if (rtt_us < server->probe.rtt) {
- server->probe.rtt = rtt_us;
+ rtt_us = rxrpc_kernel_get_srtt(addr->peer);
+ if (rtt_us < estate->rtt) {
+ estate->rtt = rtt_us;
server->rtt = rtt_us;
alist->preferred = index;
}
smp_wmb(); /* Set rtt before responded. */
- server->probe.responded = true;
- set_bit(index, &alist->responded);
+ set_bit(AFS_ESTATE_RESPONDED, &estate->flags);
+ set_bit(index, &estate->responsive_set);
set_bit(AFS_SERVER_FL_RESPONDING, &server->flags);
out:
spin_unlock(&server->probe_lock);
- _debug("probe %pU [%u] %pISpc rtt=%u ret=%d",
- &server->uuid, index, &alist->addrs[index].transport,
+ trace_afs_fs_probe(server, false, estate, index, call->error, call->abort_code, rtt_us);
+ _debug("probe[%x] %pU [%u] %pISpc rtt=%d ret=%d",
+ estate->probe_seq, &server->uuid, index,
+ rxrpc_kernel_remote_addr(alist->addrs[index].peer),
rtt_us, ret);
- return afs_done_one_fs_probe(call->net, server);
+ return afs_done_one_fs_probe(call->net, server, estate);
}
/*
- * Probe one or all of a fileserver's addresses to find out the best route and
- * to query its capabilities.
+ * Probe all of a fileserver's addresses to find out the best route and to
+ * query its capabilities.
*/
void afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server,
- struct key *key, bool all)
+ struct afs_addr_list *new_alist, struct key *key)
{
- struct afs_addr_cursor ac = {
- .index = 0,
- };
+ struct afs_endpoint_state *estate, *old;
+ struct afs_addr_list *alist;
+ unsigned long unprobed;
_enter("%pU", &server->uuid);
- read_lock(&server->fs_lock);
- ac.alist = rcu_dereference_protected(server->addresses,
- lockdep_is_held(&server->fs_lock));
- afs_get_addrlist(ac.alist);
- read_unlock(&server->fs_lock);
+ estate = kzalloc(sizeof(*estate), GFP_KERNEL);
+ if (!estate)
+ return;
+
+ refcount_set(&estate->ref, 1);
+ estate->server_id = server->debug_id;
+ estate->rtt = UINT_MAX;
+
+ write_lock(&server->fs_lock);
+
+ old = rcu_dereference_protected(server->endpoint_state,
+ lockdep_is_held(&server->fs_lock));
+ estate->responsive_set = old->responsive_set;
+ estate->addresses = afs_get_addrlist(new_alist ?: old->addresses,
+ afs_alist_trace_get_estate);
+ alist = estate->addresses;
+ estate->probe_seq = ++server->probe_counter;
+ atomic_set(&estate->nr_probing, alist->nr_addrs);
+
+ rcu_assign_pointer(server->endpoint_state, estate);
+ set_bit(AFS_ESTATE_SUPERSEDED, &old->flags);
+ write_unlock(&server->fs_lock);
+
+ trace_afs_estate(estate->server_id, estate->probe_seq, refcount_read(&estate->ref),
+ afs_estate_trace_alloc_probe);
+
+ afs_get_address_preferences(net, alist);
server->probed_at = jiffies;
- atomic_set(&server->probe_outstanding, all ? ac.alist->nr_addrs : 1);
- memset(&server->probe, 0, sizeof(server->probe));
- server->probe.rtt = UINT_MAX;
-
- ac.index = ac.alist->preferred;
- if (ac.index < 0 || ac.index >= ac.alist->nr_addrs)
- all = true;
-
- if (all) {
- for (ac.index = 0; ac.index < ac.alist->nr_addrs; ac.index++)
- if (!afs_fs_get_capabilities(net, server, &ac, key))
- afs_fs_probe_not_done(net, server, &ac);
- } else {
- if (!afs_fs_get_capabilities(net, server, &ac, key))
- afs_fs_probe_not_done(net, server, &ac);
+ unprobed = (1UL << alist->nr_addrs) - 1;
+ while (unprobed) {
+ unsigned int index = 0, i;
+ int best_prio = -1;
+
+ for (i = 0; i < alist->nr_addrs; i++) {
+ if (test_bit(i, &unprobed) &&
+ alist->addrs[i].prio > best_prio) {
+ index = i;
+ best_prio = alist->addrs[i].prio;
+ }
+ }
+ __clear_bit(index, &unprobed);
+
+ trace_afs_fs_probe(server, true, estate, index, 0, 0, 0);
+ if (!afs_fs_get_capabilities(net, server, estate, index, key))
+ afs_fs_probe_not_done(net, server, estate, index);
}
- afs_put_addrlist(ac.alist);
+ afs_put_endpoint_state(old, afs_estate_trace_put_probe);
}
/*
- * Wait for the first as-yet untried fileserver to respond.
+ * Wait for the first as-yet untried fileserver to respond, for the probe state
+ * to be superseded or for all probes to finish.
*/
-int afs_wait_for_fs_probes(struct afs_server_list *slist, unsigned long untried)
+int afs_wait_for_fs_probes(struct afs_operation *op, struct afs_server_state *states, bool intr)
{
- struct wait_queue_entry *waits;
- struct afs_server *server;
- unsigned int rtt = UINT_MAX, rtt_s;
- bool have_responders = false;
- int pref = -1, i;
+ struct afs_endpoint_state *estate;
+ struct afs_server_list *slist = op->server_list;
+ bool still_probing = true;
+ int ret = 0, i;
- _enter("%u,%lx", slist->nr_servers, untried);
+ _enter("%u", slist->nr_servers);
- /* Only wait for servers that have a probe outstanding. */
for (i = 0; i < slist->nr_servers; i++) {
- if (test_bit(i, &untried)) {
- server = slist->servers[i].server;
- if (!atomic_read(&server->probe_outstanding))
- __clear_bit(i, &untried);
- if (server->probe.responded)
- have_responders = true;
- }
+ estate = states[i].endpoint_state;
+ if (test_bit(AFS_ESTATE_SUPERSEDED, &estate->flags))
+ return 2;
+ if (atomic_read(&estate->nr_probing))
+ still_probing = true;
+ if (estate->responsive_set & states[i].untried_addrs)
+ return 1;
}
- if (have_responders || !untried)
+ if (!still_probing)
return 0;
- waits = kmalloc(array_size(slist->nr_servers, sizeof(*waits)), GFP_KERNEL);
- if (!waits)
- return -ENOMEM;
-
- for (i = 0; i < slist->nr_servers; i++) {
- if (test_bit(i, &untried)) {
- server = slist->servers[i].server;
- init_waitqueue_entry(&waits[i], current);
- add_wait_queue(&server->probe_wq, &waits[i]);
- }
- }
+ for (i = 0; i < slist->nr_servers; i++)
+ add_wait_queue(&slist->servers[i].server->probe_wq, &states[i].probe_waiter);
for (;;) {
- bool still_probing = false;
+ still_probing = false;
- set_current_state(TASK_INTERRUPTIBLE);
+ set_current_state(intr ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE);
for (i = 0; i < slist->nr_servers; i++) {
- if (test_bit(i, &untried)) {
- server = slist->servers[i].server;
- if (server->probe.responded)
- goto stop;
- if (atomic_read(&server->probe_outstanding))
- still_probing = true;
+ estate = states[i].endpoint_state;
+ if (test_bit(AFS_ESTATE_SUPERSEDED, &estate->flags)) {
+ ret = 2;
+ goto stop;
+ }
+ if (atomic_read(&estate->nr_probing))
+ still_probing = true;
+ if (estate->responsive_set & states[i].untried_addrs) {
+ ret = 1;
+ goto stop;
}
}
@@ -288,28 +349,12 @@ int afs_wait_for_fs_probes(struct afs_server_list *slist, unsigned long untried)
stop:
set_current_state(TASK_RUNNING);
- for (i = 0; i < slist->nr_servers; i++) {
- if (test_bit(i, &untried)) {
- server = slist->servers[i].server;
- rtt_s = READ_ONCE(server->rtt);
- if (test_bit(AFS_SERVER_FL_RESPONDING, &server->flags) &&
- rtt_s < rtt) {
- pref = i;
- rtt = rtt_s;
- }
-
- remove_wait_queue(&server->probe_wq, &waits[i]);
- }
- }
-
- kfree(waits);
-
- if (pref == -1 && signal_pending(current))
- return -ERESTARTSYS;
+ for (i = 0; i < slist->nr_servers; i++)
+ remove_wait_queue(&slist->servers[i].server->probe_wq, &states[i].probe_waiter);
- if (pref >= 0)
- slist->preferred = pref;
- return 0;
+ if (!ret && signal_pending(current))
+ ret = -ERESTARTSYS;
+ return ret;
}
/*
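The rewritten wait above no longer picks a preferred server itself; it reports one of three outcomes and leaves the choice to the rotation code. A sketch of that decision over simplified stand-in types (field names follow the patch):

#include <stdbool.h>

struct estate {
	unsigned long responsive_set;	/* bitset of addrs that responded */
	bool superseded;		/* a newer probe round replaced this one */
	int nr_probing;			/* probes still outstanding */
};

/*
 * Returns 2 if any endpoint state was superseded (restart rotation),
 * 1 if a responsive, as-yet-untried address appeared, 0 if every
 * probe has finished, and -1 if the caller should keep waiting.
 */
static int check_probes(const struct estate *states,
			const unsigned long *untried_addrs, int nr)
{
	bool still_probing = false;
	int i;

	for (i = 0; i < nr; i++) {
		if (states[i].superseded)
			return 2;
		if (states[i].nr_probing)
			still_probing = true;
		if (states[i].responsive_set & untried_addrs[i])
			return 1;
	}
	return still_probing ? -1 : 0;
}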
@@ -327,7 +372,7 @@ void afs_fs_probe_timer(struct timer_list *timer)
/*
* Dispatch a probe to a server.
*/
-static void afs_dispatch_fs_probe(struct afs_net *net, struct afs_server *server, bool all)
+static void afs_dispatch_fs_probe(struct afs_net *net, struct afs_server *server)
__releases(&net->fs_lock)
{
struct key *key = NULL;
@@ -340,7 +385,7 @@ static void afs_dispatch_fs_probe(struct afs_net *net, struct afs_server *server
afs_get_server(server, afs_server_trace_get_probe);
write_sequnlock(&net->fs_lock);
- afs_fs_probe_fileserver(net, server, key, all);
+ afs_fs_probe_fileserver(net, server, NULL, key);
afs_put_server(net, server, afs_server_trace_put_probe);
}
@@ -352,7 +397,7 @@ void afs_probe_fileserver(struct afs_net *net, struct afs_server *server)
{
write_seqlock(&net->fs_lock);
if (!list_empty(&server->probe_link))
- return afs_dispatch_fs_probe(net, server, true);
+ return afs_dispatch_fs_probe(net, server);
write_sequnlock(&net->fs_lock);
}
@@ -412,7 +457,7 @@ again:
_debug("probe %pU", &server->uuid);
if (server && (first_pass || !need_resched())) {
- afs_dispatch_fs_probe(net, server, server == fast);
+ afs_dispatch_fs_probe(net, server);
first_pass = false;
goto again;
}
@@ -436,12 +481,13 @@ again:
/*
* Wait for a probe on a particular fileserver to complete for 2s.
*/
-int afs_wait_for_one_fs_probe(struct afs_server *server, bool is_intr)
+int afs_wait_for_one_fs_probe(struct afs_server *server, struct afs_endpoint_state *estate,
+ unsigned long exclude, bool is_intr)
{
struct wait_queue_entry wait;
unsigned long timo = 2 * HZ;
- if (atomic_read(&server->probe_outstanding) == 0)
+ if (atomic_read(&estate->nr_probing) == 0)
goto dont_wait;
init_wait_entry(&wait, 0);
@@ -449,8 +495,9 @@ int afs_wait_for_one_fs_probe(struct afs_server *server, bool is_intr)
prepare_to_wait_event(&server->probe_wq, &wait,
is_intr ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE);
if (timo == 0 ||
- server->probe.responded ||
- atomic_read(&server->probe_outstanding) == 0 ||
+ test_bit(AFS_ESTATE_SUPERSEDED, &estate->flags) ||
+ (estate->responsive_set & ~exclude) ||
+ atomic_read(&estate->nr_probing) == 0 ||
(is_intr && signal_pending(current)))
break;
timo = schedule_timeout(timo);
@@ -459,7 +506,9 @@ int afs_wait_for_one_fs_probe(struct afs_server *server, bool is_intr)
finish_wait(&server->probe_wq, &wait);
dont_wait:
- if (server->probe.responded)
+ if (estate->responsive_set & ~exclude)
+ return 1;
+ if (test_bit(AFS_ESTATE_SUPERSEDED, &estate->flags))
return 0;
if (is_intr && signal_pending(current))
return -ERESTARTSYS;
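The dispatch loop added to afs_fs_probe_fileserver() earlier in this file probes every address, highest priority first, tracking what remains in an 'unprobed' bitmask. A self-contained userspace rendering of that selection order, with the priorities invented for the example:

#include <stdio.h>

int main(void)
{
	unsigned short prio[] = { 10, 50, 30 };	/* per-address priorities */
	unsigned int nr_addrs = 3;
	unsigned long unprobed = (1UL << nr_addrs) - 1;

	while (unprobed) {
		unsigned int index = 0, i;
		int best_prio = -1;

		/* Pick the highest-priority address not yet probed. */
		for (i = 0; i < nr_addrs; i++) {
			if ((unprobed & (1UL << i)) && prio[i] > best_prio) {
				index = i;
				best_prio = prio[i];
			}
		}
		unprobed &= ~(1UL << index);
		printf("probe addr %u (prio %d)\n", index, best_prio);
	}
	return 0;
}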
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 7d37f63ef0f0..79cd30775b7a 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -290,6 +290,7 @@ void afs_fs_fetch_status(struct afs_operation *op)
bp[2] = htonl(vp->fid.vnode);
bp[3] = htonl(vp->fid.unique);
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -442,6 +443,7 @@ static void afs_fs_fetch_data64(struct afs_operation *op)
bp[6] = 0;
bp[7] = htonl(lower_32_bits(req->len));
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -476,6 +478,7 @@ void afs_fs_fetch_data(struct afs_operation *op)
bp[4] = htonl(lower_32_bits(req->pos));
bp[5] = htonl(lower_32_bits(req->len));
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -559,6 +562,7 @@ void afs_fs_create_file(struct afs_operation *op)
*bp++ = htonl(op->create.mode & S_IALLUGO); /* unix mode */
*bp++ = 0; /* segment size */
+ call->fid = dvp->fid;
trace_afs_make_fs_call1(call, &dvp->fid, name);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -612,6 +616,7 @@ void afs_fs_make_dir(struct afs_operation *op)
*bp++ = htonl(op->create.mode & S_IALLUGO); /* unix mode */
*bp++ = 0; /* segment size */
+ call->fid = dvp->fid;
trace_afs_make_fs_call1(call, &dvp->fid, name);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -685,6 +690,7 @@ void afs_fs_remove_file(struct afs_operation *op)
bp = (void *) bp + padsz;
}
+ call->fid = dvp->fid;
trace_afs_make_fs_call1(call, &dvp->fid, name);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -732,6 +738,7 @@ void afs_fs_remove_dir(struct afs_operation *op)
bp = (void *) bp + padsz;
}
+ call->fid = dvp->fid;
trace_afs_make_fs_call1(call, &dvp->fid, name);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -812,6 +819,7 @@ void afs_fs_link(struct afs_operation *op)
*bp++ = htonl(vp->fid.vnode);
*bp++ = htonl(vp->fid.unique);
+ call->fid = vp->fid;
trace_afs_make_fs_call1(call, &vp->fid, name);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -907,6 +915,7 @@ void afs_fs_symlink(struct afs_operation *op)
*bp++ = htonl(S_IRWXUGO); /* unix mode */
*bp++ = 0; /* segment size */
+ call->fid = dvp->fid;
trace_afs_make_fs_call1(call, &dvp->fid, name);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1003,6 +1012,7 @@ void afs_fs_rename(struct afs_operation *op)
bp = (void *) bp + n_padsz;
}
+ call->fid = orig_dvp->fid;
trace_afs_make_fs_call2(call, &orig_dvp->fid, orig_name, new_name);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1090,6 +1100,7 @@ static void afs_fs_store_data64(struct afs_operation *op)
*bp++ = htonl(upper_32_bits(op->store.i_size));
*bp++ = htonl(lower_32_bits(op->store.i_size));
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1140,6 +1151,7 @@ void afs_fs_store_data(struct afs_operation *op)
*bp++ = htonl(lower_32_bits(op->store.size));
*bp++ = htonl(lower_32_bits(op->store.i_size));
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1206,6 +1218,7 @@ static void afs_fs_setattr_size64(struct afs_operation *op)
*bp++ = htonl(upper_32_bits(attr->ia_size)); /* new file length */
*bp++ = htonl(lower_32_bits(attr->ia_size));
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1247,6 +1260,7 @@ static void afs_fs_setattr_size(struct afs_operation *op)
*bp++ = 0; /* size of write */
*bp++ = htonl(attr->ia_size); /* new file length */
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1283,6 +1297,7 @@ void afs_fs_setattr(struct afs_operation *op)
xdr_encode_AFS_StoreStatus(&bp, op->setattr.attr);
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1446,6 +1461,7 @@ void afs_fs_get_volume_status(struct afs_operation *op)
bp[0] = htonl(FSGETVOLUMESTATUS);
bp[1] = htonl(vp->fid.vid);
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1528,6 +1544,7 @@ void afs_fs_set_lock(struct afs_operation *op)
*bp++ = htonl(vp->fid.unique);
*bp++ = htonl(op->lock.type);
+ call->fid = vp->fid;
trace_afs_make_fs_calli(call, &vp->fid, op->lock.type);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1554,6 +1571,7 @@ void afs_fs_extend_lock(struct afs_operation *op)
*bp++ = htonl(vp->fid.vnode);
*bp++ = htonl(vp->fid.unique);
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1580,6 +1598,7 @@ void afs_fs_release_lock(struct afs_operation *op)
*bp++ = htonl(vp->fid.vnode);
*bp++ = htonl(vp->fid.unique);
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1605,13 +1624,12 @@ static const struct afs_call_type afs_RXFSGiveUpAllCallBacks = {
/*
* Flush all the callbacks we have on a server.
*/
-int afs_fs_give_up_all_callbacks(struct afs_net *net,
- struct afs_server *server,
- struct afs_addr_cursor *ac,
- struct key *key)
+int afs_fs_give_up_all_callbacks(struct afs_net *net, struct afs_server *server,
+ struct afs_address *addr, struct key *key)
{
struct afs_call *call;
__be32 *bp;
+ int ret;
_enter("");
@@ -1619,15 +1637,22 @@ int afs_fs_give_up_all_callbacks(struct afs_net *net,
if (!call)
return -ENOMEM;
- call->key = key;
+ call->key = key;
+ call->peer = rxrpc_kernel_get_peer(addr->peer);
+ call->service_id = server->service_id;
/* marshall the parameters */
bp = call->request;
*bp++ = htonl(FSGIVEUPALLCALLBACKS);
call->server = afs_use_server(server, afs_server_trace_give_up_cb);
- afs_make_call(ac, call, GFP_NOFS);
- return afs_wait_for_call_to_complete(call, ac);
+ afs_make_call(call, GFP_NOFS);
+ afs_wait_for_call_to_complete(call);
+ ret = call->error;
+ if (call->responded)
+ set_bit(AFS_SERVER_FL_RESPONDING, &server->flags);
+ afs_put_call(call);
+ return ret;
}
/*
@@ -1689,6 +1714,12 @@ static int afs_deliver_fs_get_capabilities(struct afs_call *call)
return 0;
}
+static void afs_fs_get_capabilities_destructor(struct afs_call *call)
+{
+ afs_put_endpoint_state(call->probe, afs_estate_trace_put_getcaps);
+ afs_flat_call_destructor(call);
+}
+
/*
* FS.GetCapabilities operation type
*/
@@ -1697,7 +1728,7 @@ static const struct afs_call_type afs_RXFSGetCapabilities = {
.op = afs_FS_GetCapabilities,
.deliver = afs_deliver_fs_get_capabilities,
.done = afs_fileserver_probe_result,
- .destructor = afs_flat_call_destructor,
+ .destructor = afs_fs_get_capabilities_destructor,
};
/*
@@ -1707,7 +1738,8 @@ static const struct afs_call_type afs_RXFSGetCapabilities = {
* ->done() - otherwise we return false to indicate we didn't even try.
*/
bool afs_fs_get_capabilities(struct afs_net *net, struct afs_server *server,
- struct afs_addr_cursor *ac, struct key *key)
+ struct afs_endpoint_state *estate, unsigned int addr_index,
+ struct key *key)
{
struct afs_call *call;
__be32 *bp;
@@ -1718,10 +1750,14 @@ bool afs_fs_get_capabilities(struct afs_net *net, struct afs_server *server,
if (!call)
return false;
- call->key = key;
- call->server = afs_use_server(server, afs_server_trace_get_caps);
- call->upgrade = true;
- call->async = true;
+ call->key = key;
+ call->server = afs_use_server(server, afs_server_trace_get_caps);
+ call->peer = rxrpc_kernel_get_peer(estate->addresses->addrs[addr_index].peer);
+ call->probe = afs_get_endpoint_state(estate, afs_estate_trace_get_getcaps);
+ call->probe_index = addr_index;
+ call->service_id = server->service_id;
+ call->upgrade = true;
+ call->async = true;
call->max_lifespan = AFS_PROBE_MAX_LIFESPAN;
/* marshall the parameters */
@@ -1729,7 +1765,7 @@ bool afs_fs_get_capabilities(struct afs_net *net, struct afs_server *server,
*bp++ = htonl(FSGETCAPABILITIES);
trace_afs_make_fs_call(call, NULL);
- afs_make_call(ac, call, GFP_NOFS);
+ afs_make_call(call, GFP_NOFS);
afs_put_call(call);
return true;
}
@@ -1853,7 +1889,10 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call)
return ret;
bp = call->buffer;
- xdr_decode_AFSVolSync(&bp, &op->volsync);
+ /* Unfortunately, prior to OpenAFS-1.6, volsync here is filled
+ * with rubbish.
+ */
+ xdr_decode_AFSVolSync(&bp, NULL);
call->unmarshall++;
fallthrough;
@@ -1899,7 +1938,7 @@ void afs_fs_inline_bulk_status(struct afs_operation *op)
int i;
if (test_bit(AFS_SERVER_FL_NO_IBULK, &op->server->flags)) {
- op->error = -ENOTSUPP;
+ afs_op_set_error(op, -ENOTSUPP);
return;
}
@@ -1928,6 +1967,7 @@ void afs_fs_inline_bulk_status(struct afs_operation *op)
*bp++ = htonl(op->more_files[i].fid.unique);
}
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -2033,6 +2073,7 @@ void afs_fs_fetch_acl(struct afs_operation *op)
bp[2] = htonl(vp->fid.vnode);
bp[3] = htonl(vp->fid.unique);
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_KERNEL);
}
@@ -2078,6 +2119,7 @@ void afs_fs_store_acl(struct afs_operation *op)
if (acl->size != size)
memset((void *)&bp[5] + acl->size, 0, size - acl->size);
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_KERNEL);
}
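With afs_wait_for_call_to_complete() no longer returning the result, synchronous callers such as afs_fs_give_up_all_callbacks() wait, copy the error and responsiveness out of the call record, and only then drop their reference. A userspace sketch of that shape, with C11 atomics standing in for the kernel refcount and all names illustrative:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdlib.h>

struct call {
	atomic_int ref;
	int error;		/* filled in at call completion */
	bool responded;		/* T if the server answered (even with an abort) */
};

static void put_call(struct call *call)
{
	if (atomic_fetch_sub(&call->ref, 1) == 1)
		free(call);
}

static int run_call(struct call *call, bool *responded)
{
	int ret;

	/* ... block here until the call completes ... */
	ret = call->error;
	*responded = call->responded;
	put_call(call);		/* copy results out before dropping the ref */
	return ret;
}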
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 78efc9719349..4f04f6f33f46 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -85,8 +85,7 @@ static int afs_inode_init_from_status(struct afs_operation *op,
write_seqlock(&vnode->cb_lock);
- vnode->cb_v_break = op->cb_v_break;
- vnode->cb_s_break = op->cb_s_break;
+ vnode->cb_v_check = op->cb_v_break;
vnode->status = *status;
t = status->mtime_client;
@@ -146,11 +145,10 @@ static int afs_inode_init_from_status(struct afs_operation *op,
if (!vp->scb.have_cb) {
/* it's a symlink we just created (the fileserver
* didn't give us a callback) */
- vnode->cb_expires_at = ktime_get_real_seconds();
+ atomic64_set(&vnode->cb_expires_at, AFS_NO_CB_PROMISE);
} else {
- vnode->cb_expires_at = vp->scb.callback.expires_at;
vnode->cb_server = op->server;
- set_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
+ atomic64_set(&vnode->cb_expires_at, vp->scb.callback.expires_at);
}
write_sequnlock(&vnode->cb_lock);
@@ -214,7 +212,8 @@ static void afs_apply_status(struct afs_operation *op,
vnode->status = *status;
if (vp->dv_before + vp->dv_delta != status->data_version) {
- if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags))
+ if (vnode->cb_ro_snapshot == atomic_read(&vnode->volume->cb_ro_snapshot) &&
+ atomic64_read(&vnode->cb_expires_at) != AFS_NO_CB_PROMISE)
pr_warn("kAFS: vnode modified {%llx:%llu} %llx->%llx %s (op=%x)\n",
vnode->fid.vid, vnode->fid.vnode,
(unsigned long long)vp->dv_before + vp->dv_delta,
@@ -268,9 +267,9 @@ static void afs_apply_callback(struct afs_operation *op,
struct afs_vnode *vnode = vp->vnode;
if (!afs_cb_is_broken(vp->cb_break_before, vnode)) {
- vnode->cb_expires_at = cb->expires_at;
- vnode->cb_server = op->server;
- set_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
+ if (op->volume->type == AFSVL_RWVOL)
+ vnode->cb_server = op->server;
+ atomic64_set(&vnode->cb_expires_at, cb->expires_at);
}
}
@@ -331,7 +330,7 @@ static void afs_fetch_status_success(struct afs_operation *op)
if (vnode->netfs.inode.i_state & I_NEW) {
ret = afs_inode_init_from_status(op, vp, vnode);
- op->error = ret;
+ afs_op_set_error(op, ret);
if (ret == 0)
afs_cache_permit(vnode, op->key, vp->cb_break_before, &vp->scb);
} else {
@@ -542,7 +541,7 @@ struct inode *afs_root_iget(struct super_block *sb, struct key *key)
BUG_ON(!(inode->i_state & I_NEW));
vnode = AFS_FS_I(inode);
- vnode->cb_v_break = as->volume->cb_v_break,
+ vnode->cb_v_check = atomic_read(&as->volume->cb_v_break),
afs_set_netfs_context(vnode);
op = afs_alloc_operation(key, as->volume);
@@ -573,180 +572,6 @@ error:
}
/*
- * mark the data attached to an inode as obsolete due to a write on the server
- * - might also want to ditch all the outstanding writes and dirty pages
- */
-static void afs_zap_data(struct afs_vnode *vnode)
-{
- _enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode);
-
- afs_invalidate_cache(vnode, 0);
-
- /* nuke all the non-dirty pages that aren't locked, mapped or being
- * written back in a regular file and completely discard the pages in a
- * directory or symlink */
- if (S_ISREG(vnode->netfs.inode.i_mode))
- invalidate_remote_inode(&vnode->netfs.inode);
- else
- invalidate_inode_pages2(vnode->netfs.inode.i_mapping);
-}
-
-/*
- * Check to see if we have a server currently serving this volume and that it
- * hasn't been reinitialised or dropped from the list.
- */
-static bool afs_check_server_good(struct afs_vnode *vnode)
-{
- struct afs_server_list *slist;
- struct afs_server *server;
- bool good;
- int i;
-
- if (vnode->cb_fs_s_break == atomic_read(&vnode->volume->cell->fs_s_break))
- return true;
-
- rcu_read_lock();
-
- slist = rcu_dereference(vnode->volume->servers);
- for (i = 0; i < slist->nr_servers; i++) {
- server = slist->servers[i].server;
- if (server == vnode->cb_server) {
- good = (vnode->cb_s_break == server->cb_s_break);
- rcu_read_unlock();
- return good;
- }
- }
-
- rcu_read_unlock();
- return false;
-}
-
-/*
- * Check the validity of a vnode/inode.
- */
-bool afs_check_validity(struct afs_vnode *vnode)
-{
- enum afs_cb_break_reason need_clear = afs_cb_break_no_break;
- time64_t now = ktime_get_real_seconds();
- unsigned int cb_break;
- int seq = 0;
-
- do {
- read_seqbegin_or_lock(&vnode->cb_lock, &seq);
- cb_break = vnode->cb_break;
-
- if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) {
- if (vnode->cb_v_break != vnode->volume->cb_v_break)
- need_clear = afs_cb_break_for_v_break;
- else if (!afs_check_server_good(vnode))
- need_clear = afs_cb_break_for_s_reinit;
- else if (test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags))
- need_clear = afs_cb_break_for_zap;
- else if (vnode->cb_expires_at - 10 <= now)
- need_clear = afs_cb_break_for_lapsed;
- } else if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
- ;
- } else {
- need_clear = afs_cb_break_no_promise;
- }
-
- } while (need_seqretry(&vnode->cb_lock, seq));
-
- done_seqretry(&vnode->cb_lock, seq);
-
- if (need_clear == afs_cb_break_no_break)
- return true;
-
- write_seqlock(&vnode->cb_lock);
- if (need_clear == afs_cb_break_no_promise)
- vnode->cb_v_break = vnode->volume->cb_v_break;
- else if (cb_break == vnode->cb_break)
- __afs_break_callback(vnode, need_clear);
- else
- trace_afs_cb_miss(&vnode->fid, need_clear);
- write_sequnlock(&vnode->cb_lock);
- return false;
-}
-
-/*
- * Returns true if the pagecache is still valid. Does not sleep.
- */
-bool afs_pagecache_valid(struct afs_vnode *vnode)
-{
- if (unlikely(test_bit(AFS_VNODE_DELETED, &vnode->flags))) {
- if (vnode->netfs.inode.i_nlink)
- clear_nlink(&vnode->netfs.inode);
- return true;
- }
-
- if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags) &&
- afs_check_validity(vnode))
- return true;
-
- return false;
-}
-
-/*
- * validate a vnode/inode
- * - there are several things we need to check
- * - parent dir data changes (rm, rmdir, rename, mkdir, create, link,
- * symlink)
- * - parent dir metadata changed (security changes)
- * - dentry data changed (write, truncate)
- * - dentry metadata changed (security changes)
- */
-int afs_validate(struct afs_vnode *vnode, struct key *key)
-{
- int ret;
-
- _enter("{v={%llx:%llu} fl=%lx},%x",
- vnode->fid.vid, vnode->fid.vnode, vnode->flags,
- key_serial(key));
-
- if (afs_pagecache_valid(vnode))
- goto valid;
-
- down_write(&vnode->validate_lock);
-
- /* if the promise has expired, we need to check the server again to get
- * a new promise - note that if the (parent) directory's metadata was
- * changed then the security may be different and we may no longer have
- * access */
- if (!test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) {
- _debug("not promised");
- ret = afs_fetch_status(vnode, key, false, NULL);
- if (ret < 0) {
- if (ret == -ENOENT) {
- set_bit(AFS_VNODE_DELETED, &vnode->flags);
- ret = -ESTALE;
- }
- goto error_unlock;
- }
- _debug("new promise [fl=%lx]", vnode->flags);
- }
-
- if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
- _debug("file already deleted");
- ret = -ESTALE;
- goto error_unlock;
- }
-
- /* if the vnode's data version number changed then its contents are
- * different */
- if (test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags))
- afs_zap_data(vnode);
- up_write(&vnode->validate_lock);
-valid:
- _leave(" = 0");
- return 0;
-
-error_unlock:
- up_write(&vnode->validate_lock);
- _leave(" = %d", ret);
- return ret;
-}
-
-/*
* read the attributes of an inode
*/
int afs_getattr(struct mnt_idmap *idmap, const struct path *path,
@@ -755,13 +580,13 @@ int afs_getattr(struct mnt_idmap *idmap, const struct path *path,
struct inode *inode = d_inode(path->dentry);
struct afs_vnode *vnode = AFS_FS_I(inode);
struct key *key;
- int ret, seq = 0;
+ int ret, seq;
_enter("{ ino=%lu v=%u }", inode->i_ino, inode->i_generation);
if (vnode->volume &&
!(query_flags & AT_STATX_DONT_SYNC) &&
- !test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) {
+ atomic64_read(&vnode->cb_expires_at) == AFS_NO_CB_PROMISE) {
key = afs_request_key(vnode->volume->cell);
if (IS_ERR(key))
return PTR_ERR(key);
@@ -772,7 +597,7 @@ int afs_getattr(struct mnt_idmap *idmap, const struct path *path,
}
do {
- read_seqbegin_or_lock(&vnode->cb_lock, &seq);
+ seq = read_seqbegin(&vnode->cb_lock);
generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
if (test_bit(AFS_VNODE_SILLY_DELETED, &vnode->flags) &&
stat->nlink > 0)
@@ -784,9 +609,8 @@ int afs_getattr(struct mnt_idmap *idmap, const struct path *path,
*/
if (S_ISDIR(inode->i_mode))
stat->size = vnode->netfs.remote_i_size;
- } while (need_seqretry(&vnode->cb_lock, seq));
+ } while (read_seqretry(&vnode->cb_lock, seq));
- done_seqretry(&vnode->cb_lock, seq);
return 0;
}
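The inode changes above retire the AFS_VNODE_CB_PROMISED flag: cb_expires_at becomes an atomic 64-bit time with a TIME64_MIN sentinel (AFS_NO_CB_PROMISE), so the promise flag and its deadline can never disagree. A minimal sketch of the sentinel pattern using C11 atomics; the 10-second early-lapse margin mirrors the validity check being removed here, and the helper names are invented:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define AFS_NO_CB_PROMISE INT64_MIN

static _Atomic int64_t cb_expires_at = AFS_NO_CB_PROMISE;

/* Promise and expiry are one value: no flag/deadline mismatch window. */
static bool cb_promised(int64_t now)
{
	int64_t expiry = atomic_load(&cb_expires_at);

	return expiry != AFS_NO_CB_PROMISE && now < expiry - 10;
}

static void cb_promise(int64_t expires_at)
{
	atomic_store(&cb_expires_at, expires_at);
}

static void cb_break(void)
{
	atomic_store(&cb_expires_at, AFS_NO_CB_PROMISE);
}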
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 7385d62c8cf5..e33ace259cc6 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -33,6 +33,7 @@
struct pagevec;
struct afs_call;
struct afs_vnode;
+struct afs_server_probe;
/*
* Partial file-locking emulation mode. (The problem being that AFS3 only
@@ -73,21 +74,51 @@ enum afs_call_state {
};
/*
+ * Address preferences.
+ */
+struct afs_addr_preference {
+ union {
+ struct in_addr ipv4_addr; /* AF_INET address to compare against */
+ struct in6_addr ipv6_addr; /* AF_INET6 address to compare against */
+ };
+ sa_family_t family; /* Which address to use */
+ u16 prio; /* Priority */
+ u8 subnet_mask; /* How many bits to compare */
+};
+
+struct afs_addr_preference_list {
+ struct rcu_head rcu;
+ u16 version; /* Incremented when prefs list changes */
+ u8 ipv6_off; /* Offset of IPv6 addresses */
+ u8 nr; /* Number of addresses in total */
+ u8 max_prefs; /* Number of prefs allocated */
+ struct afs_addr_preference prefs[] __counted_by(max_prefs);
+};
+
+struct afs_address {
+ struct rxrpc_peer *peer;
+ short last_error; /* Last error from this address */
+ u16 prio; /* Address priority */
+};
+
+/*
* List of server addresses.
*/
struct afs_addr_list {
struct rcu_head rcu;
refcount_t usage;
u32 version; /* Version */
+ unsigned int debug_id;
+ unsigned int addr_pref_version; /* Version of address preference list */
unsigned char max_addrs;
unsigned char nr_addrs;
unsigned char preferred; /* Preferred address */
unsigned char nr_ipv4; /* Number of IPv4 addresses */
enum dns_record_source source:8;
enum dns_lookup_status status:8;
- unsigned long failed; /* Mask of addrs that failed locally/ICMP */
+ unsigned long probe_failed; /* Mask of addrs that failed locally/ICMP */
unsigned long responded; /* Mask of addrs that responded */
- struct sockaddr_rxrpc addrs[] __counted_by(max_addrs);
+ struct afs_address addrs[] __counted_by(max_addrs);
#define AFS_MAX_ADDRESSES ((unsigned int)(sizeof(unsigned long) * 8))
};
@@ -96,11 +127,11 @@ struct afs_addr_list {
*/
struct afs_call {
const struct afs_call_type *type; /* type of call */
- struct afs_addr_list *alist; /* Address is alist[addr_ix] */
wait_queue_head_t waitq; /* processes awaiting completion */
struct work_struct async_work; /* async I/O processor */
struct work_struct work; /* actual work processor */
struct rxrpc_call *rxcall; /* RxRPC call handle */
+ struct rxrpc_peer *peer; /* Remote endpoint */
struct key *key; /* security for this call */
struct afs_net *net; /* The network namespace */
struct afs_server *server; /* The fileserver record if fs op (pins ref) */
@@ -116,11 +147,14 @@ struct afs_call {
};
void *buffer; /* reply receive buffer */
union {
- long ret0; /* Value to reply with instead of 0 */
+ struct afs_endpoint_state *probe;
+ struct afs_addr_list *vl_probe;
struct afs_addr_list *ret_alist;
struct afs_vldb_entry *ret_vldb;
char *ret_str;
};
+ struct afs_fid fid; /* Primary vnode ID (or all zeroes) */
+ unsigned char probe_index; /* Address index in the probed address list */
struct afs_operation *op;
unsigned int server_index;
refcount_t ref;
@@ -133,13 +167,13 @@ struct afs_call {
unsigned reply_max; /* maximum size of reply */
unsigned count2; /* count used in unmarshalling */
unsigned char unmarshall; /* unmarshalling phase */
- unsigned char addr_ix; /* Address in ->alist */
bool drop_ref; /* T if need to drop ref for incoming call */
bool need_attention; /* T if RxRPC poked us */
bool async; /* T if asynchronous */
bool upgrade; /* T to request service upgrade */
bool intr; /* T if interruptible */
bool unmarshalling_error; /* T if an unmarshalling error occurred */
+ bool responded; /* Got a response from the call (may be abort) */
u16 service_id; /* Actual service ID (after upgrade) */
unsigned int debug_id; /* Trace ID */
u32 operation_ID; /* operation ID for an incoming call */
@@ -306,6 +340,8 @@ struct afs_net {
struct proc_dir_entry *proc_afs; /* /proc/net/afs directory */
struct afs_sysnames *sysnames;
rwlock_t sysnames_lock;
+ struct afs_addr_preference_list __rcu *address_prefs;
+ u16 address_pref_version;
/* Statistics counters */
atomic_t n_lookup; /* Number of lookups done */
@@ -379,6 +415,7 @@ struct afs_cell {
unsigned int debug_id;
/* The volumes belonging to this cell */
+ struct rw_semaphore vs_lock; /* Lock for server->volumes */
struct rb_root volumes; /* Tree of volumes on this server */
struct hlist_head proc_volumes; /* procfs volume list */
seqlock_t volume_lock; /* For volumes */
@@ -386,9 +423,6 @@ struct afs_cell {
/* Active fileserver interaction state. */
struct rb_root fs_servers; /* afs_server (by server UUID) */
seqlock_t fs_lock; /* For fs_servers */
- struct rw_semaphore fs_open_mmaps_lock;
- struct list_head fs_open_mmaps; /* List of vnodes that are mmapped */
- atomic_t fs_s_break; /* Counter of CB.InitCallBackState messages */
/* VL server list. */
rwlock_t vl_servers_lock; /* Lock on vl_servers */
@@ -412,13 +446,14 @@ struct afs_vlserver {
rwlock_t lock; /* Lock on addresses */
refcount_t ref;
unsigned int rtt; /* Server's current RTT in uS */
+ unsigned int debug_id;
/* Probe state */
wait_queue_head_t probe_wq;
atomic_t probe_outstanding;
spinlock_t probe_lock;
struct {
- unsigned int rtt; /* RTT in uS */
+ unsigned int rtt; /* Best RTT in uS (or UINT_MAX) */
u32 abort_code;
short error;
unsigned short flags;
@@ -428,6 +463,7 @@ struct afs_vlserver {
#define AFS_VLSERVER_PROBE_LOCAL_FAILURE 0x08 /* A local failure prevented a probe */
} probe;
+ u16 service_id; /* Service ID we're using */
u16 port;
u16 name_len; /* Length of name */
char name[]; /* Server name, case-flattened */
@@ -477,6 +513,7 @@ struct afs_vldb_entry {
#define AFS_VOL_VTM_RW 0x01 /* R/W version of the volume is available (on this server) */
#define AFS_VOL_VTM_RO 0x02 /* R/O version of the volume is available (on this server) */
#define AFS_VOL_VTM_BAK 0x04 /* backup version of the volume is available (on this server) */
+ u8 vlsf_flags[AFS_NMAXNSERVERS];
short error;
u8 nr_servers; /* Number of server records */
u8 name_len;
@@ -484,6 +521,32 @@ struct afs_vldb_entry {
};
/*
+ * Fileserver endpoint state. This records the addresses of a fileserver's
+ * endpoints and the state and result of a round of probing on them. This
+ * allows the rotation algorithm to access those results without them being
+ * erased by a subsequent round of probing.
+ */
+struct afs_endpoint_state {
+ struct rcu_head rcu;
+ struct afs_addr_list *addresses; /* The addresses being probed */
+ unsigned long responsive_set; /* Bitset of responsive endpoints */
+ unsigned long failed_set; /* Bitset of endpoints we failed to probe */
+ refcount_t ref;
+ unsigned int server_id; /* Debug ID of server */
+ unsigned int probe_seq; /* Probe sequence (from server::probe_counter) */
+ atomic_t nr_probing; /* Number of outstanding probes */
+ unsigned int rtt; /* Best RTT in uS (or UINT_MAX) */
+ s32 abort_code;
+ short error;
+ unsigned long flags;
+#define AFS_ESTATE_RESPONDED 0 /* Set if the server responded */
+#define AFS_ESTATE_SUPERSEDED 1 /* Set if this record has been superseded */
+#define AFS_ESTATE_IS_YFS 2 /* Set if probe upgraded to YFS */
+#define AFS_ESTATE_NOT_YFS 3 /* Set if probe didn't upgrade to YFS */
+#define AFS_ESTATE_LOCAL_FAILURE 4 /* Set if there was a local failure (eg. ENOMEM) */
+};
+
+/*
* Record of fileserver with which we're actively communicating.
*/
struct afs_server {
@@ -493,7 +556,6 @@ struct afs_server {
struct afs_uuid _uuid;
};
- struct afs_addr_list __rcu *addresses;
struct afs_cell *cell; /* Cell to which belongs (pins ref) */
struct rb_node uuid_rb; /* Link in net->fs_servers */
struct afs_server __rcu *uuid_next; /* Next server with same UUID */
@@ -502,7 +564,7 @@ struct afs_server {
struct hlist_node addr4_link; /* Link in net->fs_addresses4 */
struct hlist_node addr6_link; /* Link in net->fs_addresses6 */
struct hlist_node proc_link; /* Link in net->fs_proc */
- struct work_struct initcb_work; /* Work for CB.InitCallBackState* */
+ struct list_head volumes; /* RCU list of afs_server_entry objects */
struct afs_server *gc_next; /* Next server in manager's list */
time64_t unuse_time; /* Time at which last unused */
unsigned long flags;
@@ -520,44 +582,47 @@ struct afs_server {
refcount_t ref; /* Object refcount */
atomic_t active; /* Active user count */
u32 addr_version; /* Address list version */
+ u16 service_id; /* Service ID we're using. */
unsigned int rtt; /* Server's current RTT in uS */
unsigned int debug_id; /* Debugging ID for traces */
/* file service access */
rwlock_t fs_lock; /* access lock */
- /* callback promise management */
- unsigned cb_s_break; /* Break-everything counter. */
-
/* Probe state */
+ struct afs_endpoint_state __rcu *endpoint_state; /* Latest endpoint/probe state */
unsigned long probed_at; /* Time last probe was dispatched (jiffies) */
wait_queue_head_t probe_wq;
- atomic_t probe_outstanding;
+ unsigned int probe_counter; /* Number of probes issued */
spinlock_t probe_lock;
- struct {
- unsigned int rtt; /* RTT in uS */
- u32 abort_code;
- short error;
- bool responded:1;
- bool is_yfs:1;
- bool not_yfs:1;
- bool local_failure:1;
- } probe;
};
+enum afs_ro_replicating {
+ AFS_RO_NOT_REPLICATING, /* Not doing replication */
+ AFS_RO_REPLICATING_USE_OLD, /* Replicating; use old version */
+ AFS_RO_REPLICATING_USE_NEW, /* Replicating; switch to new version */
+} __mode(byte);
+
/*
* Replaceable volume server list.
*/
struct afs_server_entry {
struct afs_server *server;
+ struct afs_volume *volume;
+ struct list_head slink; /* Link in server->volumes */
+ time64_t cb_expires_at; /* Time at which volume-level callback expires */
+ unsigned long flags;
+#define AFS_SE_EXCLUDED 0 /* Set if server is to be excluded in rotation */
+#define AFS_SE_VOLUME_OFFLINE 1 /* Set if volume offline notice given */
+#define AFS_SE_VOLUME_BUSY 2 /* Set if volume busy notice given */
};
struct afs_server_list {
struct rcu_head rcu;
- afs_volid_t vids[AFS_MAXTYPES]; /* Volume IDs */
refcount_t usage;
+ bool attached; /* T if attached to servers */
+ enum afs_ro_replicating ro_replicating; /* RW->RO update (probably) in progress */
unsigned char nr_servers;
- unsigned char preferred; /* Preferred server */
unsigned short vnovol_mask; /* Servers to be skipped due to VNOVOL */
unsigned int seq; /* Set to ->servers_seq when installed */
rwlock_t lock;
@@ -568,25 +633,23 @@ struct afs_server_list {
* Live AFS volume management.
*/
struct afs_volume {
- union {
- struct rcu_head rcu;
- afs_volid_t vid; /* volume ID */
- };
+ struct rcu_head rcu;
+ afs_volid_t vid; /* The volume ID of this volume */
+ afs_volid_t vids[AFS_MAXTYPES]; /* All associated volume IDs */
refcount_t ref;
time64_t update_at; /* Time at which to next update */
struct afs_cell *cell; /* Cell to which belongs (pins ref) */
struct rb_node cell_node; /* Link in cell->volumes */
struct hlist_node proc_link; /* Link in cell->proc_volumes */
struct super_block __rcu *sb; /* Superblock on which inodes reside */
+ struct work_struct destructor; /* Deferred destructor */
unsigned long flags;
#define AFS_VOLUME_NEEDS_UPDATE 0 /* - T if an update needs performing */
#define AFS_VOLUME_UPDATING 1 /* - T if an update is in progress */
#define AFS_VOLUME_WAIT 2 /* - T if users must wait for update */
#define AFS_VOLUME_DELETED 3 /* - T if volume appears deleted */
-#define AFS_VOLUME_OFFLINE 4 /* - T if volume offline notice given */
-#define AFS_VOLUME_BUSY 5 /* - T if volume busy notice given */
-#define AFS_VOLUME_MAYBE_NO_IBULK 6 /* - T if some servers don't have InlineBulkStatus */
-#define AFS_VOLUME_RM_TREE 7 /* - Set if volume removed from cell->volumes */
+#define AFS_VOLUME_MAYBE_NO_IBULK 4 /* - T if some servers don't have InlineBulkStatus */
+#define AFS_VOLUME_RM_TREE 5 /* - Set if volume removed from cell->volumes */
#ifdef CONFIG_AFS_FSCACHE
struct fscache_volume *cache; /* Caching cookie */
#endif
@@ -594,8 +657,21 @@ struct afs_volume {
rwlock_t servers_lock; /* Lock for ->servers */
unsigned int servers_seq; /* Incremented each time ->servers changes */
- unsigned cb_v_break; /* Break-everything counter. */
+ /* RO release tracking */
+ struct mutex volsync_lock; /* Time/state evaluation lock */
+ time64_t creation_time; /* Volume creation time (or TIME64_MIN) */
+ time64_t update_time; /* Volume update time (or TIME64_MIN) */
+
+ /* Callback management */
+ struct mutex cb_check_lock; /* Lock to control race to check after v_break */
+ time64_t cb_expires_at; /* Earliest volume callback expiry time */
+ atomic_t cb_ro_snapshot; /* RO volume update-from-snapshot counter */
+ atomic_t cb_v_break; /* Volume-break event counter. */
+ atomic_t cb_v_check; /* Volume-break has-been-checked counter. */
+ atomic_t cb_scrub; /* Scrub-all-data event counter. */
rwlock_t cb_v_break_lock;
+ struct rw_semaphore open_mmaps_lock;
+ struct list_head open_mmaps; /* List of vnodes that are mmapped */
afs_voltype_t type; /* type of volume */
char type_force; /* force volume type (suppress R/O -> R/W) */
@@ -634,7 +710,6 @@ struct afs_vnode {
spinlock_t wb_lock; /* lock for wb_keys */
spinlock_t lock; /* waitqueue/flags lock */
unsigned long flags;
-#define AFS_VNODE_CB_PROMISED 0 /* Set if vnode has a callback promise */
#define AFS_VNODE_UNSET 1 /* set if vnode attributes not yet set */
#define AFS_VNODE_DIR_VALID 2 /* Set if dir contents are valid */
#define AFS_VNODE_ZAP_DATA 3 /* set if vnode's data should be invalidated */
@@ -660,13 +735,14 @@ struct afs_vnode {
struct list_head cb_mmap_link; /* Link in cell->fs_open_mmaps */
void *cb_server; /* Server with callback/filelock */
atomic_t cb_nr_mmap; /* Number of mmaps */
- unsigned int cb_fs_s_break; /* Mass server break counter (cell->fs_s_break) */
- unsigned int cb_s_break; /* Mass break counter on ->server */
- unsigned int cb_v_break; /* Mass break counter on ->volume */
+ unsigned int cb_ro_snapshot; /* RO volume release counter on ->volume */
+ unsigned int cb_scrub; /* Scrub counter on ->volume */
unsigned int cb_break; /* Break counter on vnode */
+ unsigned int cb_v_check; /* Break check counter on ->volume */
seqlock_t cb_lock; /* Lock for ->cb_server, ->status, ->cb_*break */
- time64_t cb_expires_at; /* time at which callback expires */
+ atomic64_t cb_expires_at; /* time at which callback expires */
+#define AFS_NO_CB_PROMISE TIME64_MIN
};
static inline struct fscache_cookie *afs_vnode_cache(struct afs_vnode *vnode)
@@ -714,40 +790,49 @@ struct afs_permits {
* Error prioritisation and accumulation.
*/
struct afs_error {
- short error; /* Accumulated error */
+ s32 abort_code; /* Cumulative abort code */
+ short error; /* Cumulative error */
bool responded; /* T if server responded */
-};
-
-/*
- * Cursor for iterating over a server's address list.
- */
-struct afs_addr_cursor {
- struct afs_addr_list *alist; /* Current address list (pins ref) */
- unsigned long tried; /* Tried addresses */
- signed char index; /* Current address */
- bool responded; /* T if the current address responded */
- unsigned short nr_iterations; /* Number of address iterations */
- short error;
- u32 abort_code;
+ bool aborted; /* T if ->error is from an abort */
};
/*
* Cursor for iterating over a set of volume location servers.
*/
struct afs_vl_cursor {
- struct afs_addr_cursor ac;
struct afs_cell *cell; /* The cell we're querying */
struct afs_vlserver_list *server_list; /* Current server list (pins ref) */
struct afs_vlserver *server; /* Server on which this resides */
+ struct afs_addr_list *alist; /* Current address list (pins ref) */
struct key *key; /* Key for the server */
- unsigned long untried; /* Bitmask of untried servers */
- short index; /* Current server */
- short error;
+ unsigned long untried_servers; /* Bitmask of untried servers */
+ unsigned long addr_tried; /* Tried addresses */
+ struct afs_error cumul_error; /* Cumulative error */
+ unsigned int debug_id;
+ s32 call_abort_code;
+ short call_error; /* Error from single call */
+ short server_index; /* Current server */
+ signed char addr_index; /* Current address */
unsigned short flags;
#define AFS_VL_CURSOR_STOP 0x0001 /* Set to cease iteration */
#define AFS_VL_CURSOR_RETRY 0x0002 /* Set to do a retry */
#define AFS_VL_CURSOR_RETRIED 0x0004 /* Set if started a retry */
- unsigned short nr_iterations; /* Number of server iterations */
+ short nr_iterations; /* Number of server iterations */
+ bool call_responded; /* T if the current address responded */
+};
+
+/*
+ * Fileserver state tracking for an operation. An array of these is kept,
+ * indexed by server index.
+ */
+struct afs_server_state {
+ /* Tracking of fileserver probe state. Other operations may interfere
+ * by probing a fileserver when accessing other volumes.
+ */
+ unsigned int probe_seq;
+ unsigned long untried_addrs; /* Addresses we haven't tried yet */
+ struct wait_queue_entry probe_waiter;
+ struct afs_endpoint_state *endpoint_state; /* Endpoint state being monitored */
};
/*
@@ -768,7 +853,7 @@ struct afs_vnode_param {
struct afs_fid fid; /* Fid to access */
struct afs_status_cb scb; /* Returned status and callback promise */
afs_dataversion_t dv_before; /* Data version before the call */
- unsigned int cb_break_before; /* cb_break + cb_s_break before the call */
+ unsigned int cb_break_before; /* cb_break before the call */
u8 dv_delta; /* Expected change in data version */
bool put_vnode:1; /* T if we have a ref on the vnode */
bool need_io_lock:1; /* T if we need the I/O lock on this */
@@ -793,17 +878,17 @@ struct afs_operation {
struct afs_volume *volume; /* Volume being accessed */
struct afs_vnode_param file[2];
struct afs_vnode_param *more_files;
- struct afs_volsync volsync;
+ struct afs_volsync pre_volsync; /* Volsync before op */
+ struct afs_volsync volsync; /* Volsync returned by op */
struct dentry *dentry; /* Dentry to be altered */
struct dentry *dentry_2; /* Second dentry to be altered */
struct timespec64 mtime; /* Modification time to record */
struct timespec64 ctime; /* Change time to set */
+ struct afs_error cumul_error; /* Cumulative error */
short nr_files; /* Number of entries in file[], more_files */
- short error;
unsigned int debug_id;
unsigned int cb_v_break; /* Volume break counter before op */
- unsigned int cb_s_break; /* Server break counter before op */
union {
struct {
@@ -848,13 +933,19 @@ struct afs_operation {
};
/* Fileserver iteration state */
- struct afs_addr_cursor ac;
struct afs_server_list *server_list; /* Current server list (pins ref) */
struct afs_server *server; /* Server we're using (ref pinned by server_list) */
+ struct afs_endpoint_state *estate; /* Current endpoint state (doesn't pin ref) */
+ struct afs_server_state *server_states; /* States of the servers involved */
struct afs_call *call;
- unsigned long untried; /* Bitmask of untried servers */
- short index; /* Current server */
- unsigned short nr_iterations; /* Number of server iterations */
+ unsigned long untried_servers; /* Bitmask of untried servers */
+ unsigned long addr_tried; /* Tried addresses */
+ s32 call_abort_code; /* Abort code from single call */
+ short call_error; /* Error from single call */
+ short server_index; /* Current server */
+ short nr_iterations; /* Number of server iterations */
+ signed char addr_index; /* Current address */
+ bool call_responded; /* T if the current address responded */
unsigned int flags;
#define AFS_OPERATION_STOP 0x0001 /* Set to cease iteration */
@@ -956,31 +1047,32 @@ static inline bool afs_is_folio_dirty_mmapped(unsigned long priv)
/*
* addr_list.c
*/
-static inline struct afs_addr_list *afs_get_addrlist(struct afs_addr_list *alist)
-{
- if (alist)
- refcount_inc(&alist->usage);
- return alist;
-}
-extern struct afs_addr_list *afs_alloc_addrlist(unsigned int,
- unsigned short,
- unsigned short);
-extern void afs_put_addrlist(struct afs_addr_list *);
+struct afs_addr_list *afs_get_addrlist(struct afs_addr_list *alist, enum afs_alist_trace reason);
+extern struct afs_addr_list *afs_alloc_addrlist(unsigned int nr);
+extern void afs_put_addrlist(struct afs_addr_list *alist, enum afs_alist_trace reason);
extern struct afs_vlserver_list *afs_parse_text_addrs(struct afs_net *,
const char *, size_t, char,
unsigned short, unsigned short);
+bool afs_addr_list_same(const struct afs_addr_list *a,
+ const struct afs_addr_list *b);
extern struct afs_vlserver_list *afs_dns_query(struct afs_cell *, time64_t *);
-extern bool afs_iterate_addresses(struct afs_addr_cursor *);
-extern int afs_end_cursor(struct afs_addr_cursor *);
-extern void afs_merge_fs_addr4(struct afs_addr_list *, __be32, u16);
-extern void afs_merge_fs_addr6(struct afs_addr_list *, __be32 *, u16);
+extern int afs_merge_fs_addr4(struct afs_net *net, struct afs_addr_list *addr,
+ __be32 xdr, u16 port);
+extern int afs_merge_fs_addr6(struct afs_net *net, struct afs_addr_list *addr,
+ __be32 *xdr, u16 port);
+
+/*
+ * addr_prefs.c
+ */
+int afs_proc_addr_prefs_write(struct file *file, char *buf, size_t size);
+void afs_get_address_preferences_rcu(struct afs_net *net, struct afs_addr_list *alist);
+void afs_get_address_preferences(struct afs_net *net, struct afs_addr_list *alist);
/*
* callback.c
*/
extern void afs_invalidate_mmap_work(struct work_struct *);
-extern void afs_server_init_callback_work(struct work_struct *work);
extern void afs_init_callback_state(struct afs_server *);
extern void __afs_break_callback(struct afs_vnode *, enum afs_cb_break_reason);
extern void afs_break_callback(struct afs_vnode *, enum afs_cb_break_reason);
@@ -988,13 +1080,15 @@ extern void afs_break_callbacks(struct afs_server *, size_t, struct afs_callback
static inline unsigned int afs_calc_vnode_cb_break(struct afs_vnode *vnode)
{
- return vnode->cb_break + vnode->cb_v_break;
+ return vnode->cb_break + vnode->cb_ro_snapshot + vnode->cb_scrub;
}
static inline bool afs_cb_is_broken(unsigned int cb_break,
const struct afs_vnode *vnode)
{
- return cb_break != (vnode->cb_break + vnode->volume->cb_v_break);
+ return cb_break != (vnode->cb_break +
+ atomic_read(&vnode->volume->cb_ro_snapshot) +
+ atomic_read(&vnode->volume->cb_scrub));
}
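
Taken together, these two helpers implement the snapshot-and-compare pattern the client uses for callback promises: the caller records the break sum before issuing an RPC and distrusts the result if any break arrived in the meantime. A minimal sketch of that pattern (hypothetical caller, not part of this diff):

        unsigned int cb_break_before = afs_calc_vnode_cb_break(vnode);

        /* ... issue the RPC that fetches status / gets a callback promise ... */

        if (afs_cb_is_broken(cb_break_before, vnode)) {
                /* A callback break, RO-snapshot or scrub event landed in
                 * the window; the fetched status cannot be trusted. */
        }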
/*
@@ -1110,10 +1204,11 @@ extern void afs_fs_get_volume_status(struct afs_operation *);
extern void afs_fs_set_lock(struct afs_operation *);
extern void afs_fs_extend_lock(struct afs_operation *);
extern void afs_fs_release_lock(struct afs_operation *);
-extern int afs_fs_give_up_all_callbacks(struct afs_net *, struct afs_server *,
- struct afs_addr_cursor *, struct key *);
-extern bool afs_fs_get_capabilities(struct afs_net *, struct afs_server *,
- struct afs_addr_cursor *, struct key *);
+int afs_fs_give_up_all_callbacks(struct afs_net *net, struct afs_server *server,
+ struct afs_address *addr, struct key *key);
+bool afs_fs_get_capabilities(struct afs_net *net, struct afs_server *server,
+ struct afs_endpoint_state *estate, unsigned int addr_index,
+ struct key *key);
extern void afs_fs_inline_bulk_status(struct afs_operation *);
struct afs_acl {
@@ -1133,11 +1228,6 @@ extern bool afs_begin_vnode_operation(struct afs_operation *);
extern void afs_wait_for_operation(struct afs_operation *);
extern int afs_do_sync_operation(struct afs_operation *);
-static inline void afs_op_nomem(struct afs_operation *op)
-{
- op->error = -ENOMEM;
-}
-
static inline void afs_op_set_vnode(struct afs_operation *op, unsigned int n,
struct afs_vnode *vnode)
{
@@ -1154,12 +1244,17 @@ static inline void afs_op_set_fid(struct afs_operation *op, unsigned int n,
/*
* fs_probe.c
*/
+struct afs_endpoint_state *afs_get_endpoint_state(struct afs_endpoint_state *estate,
+ enum afs_estate_trace where);
+void afs_put_endpoint_state(struct afs_endpoint_state *estate, enum afs_estate_trace where);
extern void afs_fileserver_probe_result(struct afs_call *);
-extern void afs_fs_probe_fileserver(struct afs_net *, struct afs_server *, struct key *, bool);
-extern int afs_wait_for_fs_probes(struct afs_server_list *, unsigned long);
+void afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server,
+ struct afs_addr_list *new_addrs, struct key *key);
+int afs_wait_for_fs_probes(struct afs_operation *op, struct afs_server_state *states, bool intr);
extern void afs_probe_fileserver(struct afs_net *, struct afs_server *);
extern void afs_fs_probe_dispatcher(struct work_struct *);
-extern int afs_wait_for_one_fs_probe(struct afs_server *, bool);
+int afs_wait_for_one_fs_probe(struct afs_server *server, struct afs_endpoint_state *estate,
+ unsigned long exclude, bool is_intr);
extern void afs_fs_probe_cleanup(struct afs_net *);
/*
@@ -1173,9 +1268,6 @@ extern int afs_ilookup5_test_by_fid(struct inode *, void *);
extern struct inode *afs_iget_pseudo_dir(struct super_block *, bool);
extern struct inode *afs_iget(struct afs_operation *, struct afs_vnode_param *);
extern struct inode *afs_root_iget(struct super_block *, struct key *);
-extern bool afs_check_validity(struct afs_vnode *);
-extern int afs_validate(struct afs_vnode *, struct key *);
-bool afs_pagecache_valid(struct afs_vnode *);
extern int afs_getattr(struct mnt_idmap *idmap, const struct path *,
struct kstat *, u32, unsigned int);
extern int afs_setattr(struct mnt_idmap *idmap, struct dentry *, struct iattr *);
@@ -1231,6 +1323,31 @@ static inline void __afs_stat(atomic_t *s)
extern int afs_abort_to_error(u32);
extern void afs_prioritise_error(struct afs_error *, int, u32);
+static inline void afs_op_nomem(struct afs_operation *op)
+{
+ op->cumul_error.error = -ENOMEM;
+}
+
+static inline int afs_op_error(const struct afs_operation *op)
+{
+ return op->cumul_error.error;
+}
+
+static inline s32 afs_op_abort_code(const struct afs_operation *op)
+{
+ return op->cumul_error.abort_code;
+}
+
+static inline int afs_op_set_error(struct afs_operation *op, int error)
+{
+ return op->cumul_error.error = error;
+}
+
+static inline void afs_op_accumulate_error(struct afs_operation *op, int error, s32 abort_code)
+{
+ afs_prioritise_error(&op->cumul_error, error, abort_code);
+}
+
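A hypothetical call site, to illustrate the intended division of labour between these helpers: purely local verdicts go through afs_op_set_error(), while remote failures are folded in together with their abort code so that afs_prioritise_error() can rank them:

        static void example_record_failures(struct afs_operation *op)
        {
                /* A local failure simply overwrites the cumulative error. */
                afs_op_set_error(op, -ENOMEM);

                /* A remote abort is ranked against whatever is already
                 * recorded rather than overwriting it. */
                afs_op_accumulate_error(op, -ECONNABORTED, VBUSY);

                pr_info("op failed: %d (abort %d)\n",
                        afs_op_error(op), afs_op_abort_code(op));
        }
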
/*
* mntpt.c
*/
@@ -1261,6 +1378,7 @@ static inline void afs_put_sysnames(struct afs_sysnames *sysnames) {}
/*
* rotate.c
*/
+void afs_clear_server_states(struct afs_operation *op);
extern bool afs_select_fileserver(struct afs_operation *);
extern void afs_dump_edestaddrreq(const struct afs_operation *);
@@ -1273,8 +1391,8 @@ extern int __net_init afs_open_socket(struct afs_net *);
extern void __net_exit afs_close_socket(struct afs_net *);
extern void afs_charge_preallocation(struct work_struct *);
extern void afs_put_call(struct afs_call *);
-extern void afs_make_call(struct afs_addr_cursor *, struct afs_call *, gfp_t);
-extern long afs_wait_for_call_to_complete(struct afs_call *, struct afs_addr_cursor *);
+void afs_make_call(struct afs_call *call, gfp_t gfp);
+void afs_wait_for_call_to_complete(struct afs_call *call);
extern struct afs_call *afs_alloc_flat_call(struct afs_net *,
const struct afs_call_type *,
size_t, size_t);
@@ -1287,12 +1405,16 @@ extern int afs_protocol_error(struct afs_call *, enum afs_eproto_cause);
static inline void afs_make_op_call(struct afs_operation *op, struct afs_call *call,
gfp_t gfp)
{
- op->call = call;
- op->type = call->type;
- call->op = op;
- call->key = op->key;
- call->intr = !(op->flags & AFS_OPERATION_UNINTR);
- afs_make_call(&op->ac, call, gfp);
+ struct afs_addr_list *alist = op->estate->addresses;
+
+ op->call = call;
+ op->type = call->type;
+ call->op = op;
+ call->key = op->key;
+ call->intr = !(op->flags & AFS_OPERATION_UNINTR);
+ call->peer = rxrpc_kernel_get_peer(alist->addrs[op->addr_index].peer);
+ call->service_id = op->server->service_id;
+ afs_make_call(call, gfp);
}
static inline void afs_extract_begin(struct afs_call *call, void *buf, size_t size)
@@ -1401,8 +1523,7 @@ extern void __exit afs_clean_up_permit_cache(void);
*/
extern spinlock_t afs_server_peer_lock;
-extern struct afs_server *afs_find_server(struct afs_net *,
- const struct sockaddr_rxrpc *);
+extern struct afs_server *afs_find_server(struct afs_net *, const struct rxrpc_peer *);
extern struct afs_server *afs_find_server_by_uuid(struct afs_net *, const uuid_t *);
extern struct afs_server *afs_lookup_server(struct afs_cell *, struct key *, const uuid_t *, u32);
extern struct afs_server *afs_get_server(struct afs_server *, enum afs_server_trace);
@@ -1414,7 +1535,7 @@ extern void afs_manage_servers(struct work_struct *);
extern void afs_servers_timer(struct timer_list *);
extern void afs_fs_probe_timer(struct timer_list *);
extern void __net_exit afs_purge_servers(struct afs_net *);
-extern bool afs_check_server_record(struct afs_operation *, struct afs_server *);
+bool afs_check_server_record(struct afs_operation *op, struct afs_server *server, struct key *key);
static inline void afs_inc_servers_outstanding(struct afs_net *net)
{
@@ -1442,10 +1563,14 @@ static inline struct afs_server_list *afs_get_serverlist(struct afs_server_list
}
extern void afs_put_serverlist(struct afs_net *, struct afs_server_list *);
-extern struct afs_server_list *afs_alloc_server_list(struct afs_cell *, struct key *,
- struct afs_vldb_entry *,
- u8);
+struct afs_server_list *afs_alloc_server_list(struct afs_volume *volume,
+ struct key *key,
+ struct afs_vldb_entry *vldb);
extern bool afs_annotate_server_list(struct afs_server_list *, struct afs_server_list *);
+void afs_attach_volume_to_servers(struct afs_volume *volume, struct afs_server_list *slist);
+void afs_reattach_volume_to_servers(struct afs_volume *volume, struct afs_server_list *slist,
+ struct afs_server_list *old);
+void afs_detach_volume_from_servers(struct afs_volume *volume, struct afs_server_list *slist);
/*
* super.c
@@ -1454,13 +1579,24 @@ extern int __init afs_fs_init(void);
extern void afs_fs_exit(void);
/*
+ * validation.c
+ */
+bool afs_check_validity(const struct afs_vnode *vnode);
+int afs_update_volume_state(struct afs_operation *op);
+int afs_validate(struct afs_vnode *vnode, struct key *key);
+
+/*
* vlclient.c
*/
extern struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_vl_cursor *,
const char *, int);
extern struct afs_addr_list *afs_vl_get_addrs_u(struct afs_vl_cursor *, const uuid_t *);
-extern struct afs_call *afs_vl_get_capabilities(struct afs_net *, struct afs_addr_cursor *,
- struct key *, struct afs_vlserver *, unsigned int);
+struct afs_call *afs_vl_get_capabilities(struct afs_net *net,
+ struct afs_addr_list *alist,
+ unsigned int addr_index,
+ struct key *key,
+ struct afs_vlserver *server,
+ unsigned int server_index);
extern struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_vl_cursor *, const uuid_t *);
extern char *afs_yfsvl_get_cell_name(struct afs_vl_cursor *);
@@ -1516,7 +1652,7 @@ extern int afs_activate_volume(struct afs_volume *);
extern void afs_deactivate_volume(struct afs_volume *);
bool afs_try_get_volume(struct afs_volume *volume, enum afs_volume_trace reason);
extern struct afs_volume *afs_get_volume(struct afs_volume *, enum afs_volume_trace);
-extern void afs_put_volume(struct afs_net *, struct afs_volume *, enum afs_volume_trace);
+void afs_put_volume(struct afs_volume *volume, enum afs_volume_trace reason);
extern int afs_check_volume_status(struct afs_volume *, struct afs_operation *);
/*
@@ -1603,7 +1739,7 @@ static inline void afs_update_dentry_version(struct afs_operation *op,
struct afs_vnode_param *dir_vp,
struct dentry *dentry)
{
- if (!op->error)
+ if (!op->cumul_error.error)
dentry->d_fsdata =
(void *)(unsigned long)dir_vp->scb.status.data_version;
}
diff --git a/fs/afs/main.c b/fs/afs/main.c
index 6425c81d07de..1b3bd21c168a 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -156,6 +156,7 @@ static void __net_exit afs_net_exit(struct net *net_ns)
afs_close_socket(net);
afs_proc_cleanup(net);
afs_put_sysnames(net->sysnames);
+ kfree_rcu(rcu_access_pointer(net->address_prefs), rcu);
}
static struct pernet_operations afs_net_ops = {
diff --git a/fs/afs/misc.c b/fs/afs/misc.c
index 805328ca5428..b8180bf2281f 100644
--- a/fs/afs/misc.c
+++ b/fs/afs/misc.c
@@ -116,6 +116,8 @@ void afs_prioritise_error(struct afs_error *e, int error, u32 abort_code)
{
switch (error) {
case 0:
+ e->aborted = false;
+ e->error = 0;
return;
default:
if (e->error == -ETIMEDOUT ||
@@ -161,12 +163,16 @@ void afs_prioritise_error(struct afs_error *e, int error, u32 abort_code)
if (e->responded)
return;
e->error = error;
+ e->aborted = false;
return;
case -ECONNABORTED:
- error = afs_abort_to_error(abort_code);
- fallthrough;
+ e->error = afs_abort_to_error(abort_code);
+ e->aborted = true;
+ e->responded = true;
+ return;
case -ENETRESET: /* Responded, but we seem to have changed address */
+ e->aborted = false;
e->responded = true;
e->error = error;
return;
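
The practical upshot of the reworked -ECONNABORTED branch is that an abort from a responding server becomes sticky. A short sequence illustrating the ranking (hedged: the errno substituted for the abort comes from afs_abort_to_error() and is not spelled out here):

        struct afs_error e = {};

        afs_prioritise_error(&e, -ENETUNREACH, 0);      /* recorded: no route yet */
        afs_prioritise_error(&e, -ECONNABORTED, VBUSY); /* server responded: e.error
                                                         * becomes the translated abort,
                                                         * e.responded = true */
        afs_prioritise_error(&e, -ECONNREFUSED, 0);     /* ignored: a responded error
                                                         * outranks connection failures */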
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index 2a0c83d71565..3bd02571f30d 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -147,6 +147,55 @@ inval:
}
/*
+ * Display the list of addr_prefs known to the namespace.
+ */
+static int afs_proc_addr_prefs_show(struct seq_file *m, void *v)
+{
+ struct afs_addr_preference_list *preflist;
+ struct afs_addr_preference *pref;
+ struct afs_net *net = afs_seq2net_single(m);
+ union {
+ struct sockaddr_in sin;
+ struct sockaddr_in6 sin6;
+ } addr;
+ unsigned int i;
+ char buf[44]; /* Maximum ipv6 + max subnet is 43 */
+
+ rcu_read_lock();
+ preflist = rcu_dereference(net->address_prefs);
+
+ if (!preflist) {
+ seq_puts(m, "NO PREFS\n");
+ goto out;
+ }
+
+ seq_printf(m, "PROT SUBNET PRIOR (v=%u n=%u/%u/%u)\n",
+ preflist->version, preflist->ipv6_off, preflist->nr, preflist->max_prefs);
+
+ memset(&addr, 0, sizeof(addr));
+
+ for (i = 0; i < preflist->nr; i++) {
+ pref = &preflist->prefs[i];
+
+ addr.sin.sin_family = pref->family;
+ if (pref->family == AF_INET) {
+ memcpy(&addr.sin.sin_addr, &pref->ipv4_addr,
+ sizeof(addr.sin.sin_addr));
+ snprintf(buf, sizeof(buf), "%pISc/%u", &addr.sin, pref->subnet_mask);
+ seq_printf(m, "UDP %-43.43s %5u\n", buf, pref->prio);
+ } else {
+ memcpy(&addr.sin6.sin6_addr, &pref->ipv6_addr,
+ sizeof(addr.sin6.sin6_addr));
+ snprintf(buf, sizeof(buf), "%pISc/%u", &addr.sin6, pref->subnet_mask);
+ seq_printf(m, "UDP %-43.43s %5u\n", buf, pref->prio);
+ }
+ }
+
+out:
+ rcu_read_unlock();
+ return 0;
+}
+
+/*
* Display the name of the current workstation cell.
*/
static int afs_proc_rootcell_show(struct seq_file *m, void *v)
@@ -307,7 +356,7 @@ static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v)
for (i = 0; i < alist->nr_addrs; i++)
seq_printf(m, " %c %pISpc\n",
alist->preferred == i ? '>' : '-',
- &alist->addrs[i].transport);
+ rxrpc_kernel_remote_addr(alist->addrs[i].peer));
}
seq_printf(m, " info: fl=%lx rtt=%d\n", vlserver->flags, vlserver->rtt);
seq_printf(m, " probe: fl=%x e=%d ac=%d out=%d\n",
@@ -375,32 +424,45 @@ static const struct seq_operations afs_proc_cell_vlservers_ops = {
*/
static int afs_proc_servers_show(struct seq_file *m, void *v)
{
- struct afs_server *server;
+ struct afs_endpoint_state *estate;
struct afs_addr_list *alist;
+ struct afs_server *server;
+ unsigned long failed;
int i;
if (v == SEQ_START_TOKEN) {
- seq_puts(m, "UUID REF ACT\n");
+ seq_puts(m, "UUID REF ACT CELL\n");
return 0;
}
server = list_entry(v, struct afs_server, proc_link);
- alist = rcu_dereference(server->addresses);
- seq_printf(m, "%pU %3d %3d\n",
+ estate = rcu_dereference(server->endpoint_state);
+ alist = estate->addresses;
+ seq_printf(m, "%pU %3d %3d %s\n",
&server->uuid,
refcount_read(&server->ref),
- atomic_read(&server->active));
- seq_printf(m, " - info: fl=%lx rtt=%u brk=%x\n",
- server->flags, server->rtt, server->cb_s_break);
- seq_printf(m, " - probe: last=%d out=%d\n",
- (int)(jiffies - server->probed_at) / HZ,
- atomic_read(&server->probe_outstanding));
- seq_printf(m, " - ALIST v=%u rsp=%lx f=%lx\n",
- alist->version, alist->responded, alist->failed);
- for (i = 0; i < alist->nr_addrs; i++)
- seq_printf(m, " [%x] %pISpc%s\n",
- i, &alist->addrs[i].transport,
- alist->preferred == i ? "*" : "");
+ atomic_read(&server->active),
+ server->cell->name);
+ seq_printf(m, " - info: fl=%lx rtt=%u\n",
+ server->flags, server->rtt);
+ seq_printf(m, " - probe: last=%d\n",
+ (int)(jiffies - server->probed_at) / HZ);
+ failed = estate->failed_set;
+ seq_printf(m, " - ESTATE pq=%x np=%u rsp=%lx f=%lx\n",
+ estate->probe_seq, atomic_read(&estate->nr_probing),
+ estate->responsive_set, estate->failed_set);
+ seq_printf(m, " - ALIST v=%u ap=%u\n",
+ alist->version, alist->addr_pref_version);
+ for (i = 0; i < alist->nr_addrs; i++) {
+ const struct afs_address *addr = &alist->addrs[i];
+
+ seq_printf(m, " [%x] %pISpc%s rtt=%d err=%d p=%u\n",
+ i, rxrpc_kernel_remote_addr(addr->peer),
+ alist->preferred == i ? "*" :
+ test_bit(i, &failed) ? "!" : "",
+ rxrpc_kernel_get_srtt(addr->peer),
+ addr->last_error, addr->prio);
+ }
return 0;
}
@@ -681,7 +743,11 @@ int afs_proc_init(struct afs_net *net)
&afs_proc_sysname_ops,
afs_proc_sysname_write,
sizeof(struct seq_net_private),
- NULL))
+ NULL) ||
+ !proc_create_net_single_write("addr_prefs", 0644, p,
+ afs_proc_addr_prefs_show,
+ afs_proc_addr_prefs_write,
+ NULL))
goto error_tree;
net->proc_afs = p;
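
Given the two format strings above, reading the new addr_prefs file would produce output along these lines (the preference values are hypothetical; the subnet column is padded per %-43.43s and the priority per %5u):

        PROT SUBNET                                      PRIOR (v=1 n=1/2/4)
        UDP  192.168.0.0/16                              3000
        UDP  2001:db8::/32                               1000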
diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c
index a840c3588ebb..700a27bc8c25 100644
--- a/fs/afs/rotate.c
+++ b/fs/afs/rotate.c
@@ -13,6 +13,19 @@
#include <linux/sched/signal.h>
#include "internal.h"
#include "afs_fs.h"
+#include "protocol_uae.h"
+
+void afs_clear_server_states(struct afs_operation *op)
+{
+ unsigned int i;
+
+ if (op->server_states) {
+ for (i = 0; i < op->server_list->nr_servers; i++)
+ afs_put_endpoint_state(op->server_states[i].endpoint_state,
+ afs_estate_trace_put_server_state);
+ kfree(op->server_states);
+ }
+}
/*
* Begin iteration through a server list, starting with the vnode's last used
@@ -25,14 +38,41 @@ static bool afs_start_fs_iteration(struct afs_operation *op,
void *cb_server;
int i;
+ trace_afs_rotate(op, afs_rotate_trace_start, 0);
+
read_lock(&op->volume->servers_lock);
op->server_list = afs_get_serverlist(
rcu_dereference_protected(op->volume->servers,
lockdep_is_held(&op->volume->servers_lock)));
read_unlock(&op->volume->servers_lock);
- op->untried = (1UL << op->server_list->nr_servers) - 1;
- op->index = READ_ONCE(op->server_list->preferred);
+ op->server_states = kcalloc(op->server_list->nr_servers, sizeof(op->server_states[0]),
+ GFP_KERNEL);
+ if (!op->server_states) {
+ afs_op_nomem(op);
+ trace_afs_rotate(op, afs_rotate_trace_nomem, 0);
+ return false;
+ }
+
+ rcu_read_lock();
+ for (i = 0; i < op->server_list->nr_servers; i++) {
+ struct afs_endpoint_state *estate;
+ struct afs_server_state *s = &op->server_states[i];
+
+ server = op->server_list->servers[i].server;
+ estate = rcu_dereference(server->endpoint_state);
+ s->endpoint_state = afs_get_endpoint_state(estate,
+ afs_estate_trace_get_server_state);
+ s->probe_seq = estate->probe_seq;
+ s->untried_addrs = (1UL << estate->addresses->nr_addrs) - 1;
+ init_waitqueue_entry(&s->probe_waiter, current);
+ afs_get_address_preferences(op->net, estate->addresses);
+ }
+ rcu_read_unlock();
+
+ op->untried_servers = (1UL << op->server_list->nr_servers) - 1;
+ op->server_index = -1;
cb_server = vnode->cb_server;
if (cb_server) {
@@ -40,7 +80,7 @@ static bool afs_start_fs_iteration(struct afs_operation *op,
for (i = 0; i < op->server_list->nr_servers; i++) {
server = op->server_list->servers[i].server;
if (server == cb_server) {
- op->index = i;
+ op->server_index = i;
goto found_interest;
}
}
@@ -50,7 +90,8 @@ static bool afs_start_fs_iteration(struct afs_operation *op,
* and have to return an error.
*/
if (op->flags & AFS_OPERATION_CUR_ONLY) {
- op->error = -ESTALE;
+ afs_op_set_error(op, -ESTALE);
+ trace_afs_rotate(op, afs_rotate_trace_stale_lock, 0);
return false;
}
@@ -58,7 +99,7 @@ static bool afs_start_fs_iteration(struct afs_operation *op,
write_seqlock(&vnode->cb_lock);
ASSERTCMP(cb_server, ==, vnode->cb_server);
vnode->cb_server = NULL;
- if (test_and_clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags))
+ if (atomic64_xchg(&vnode->cb_expires_at, AFS_NO_CB_PROMISE) != AFS_NO_CB_PROMISE)
vnode->cb_break++;
write_sequnlock(&vnode->cb_lock);
}
@@ -70,7 +111,7 @@ found_interest:
/*
* Post volume busy note.
*/
-static void afs_busy(struct afs_volume *volume, u32 abort_code)
+static void afs_busy(struct afs_operation *op, u32 abort_code)
{
const char *m;
@@ -81,7 +122,8 @@ static void afs_busy(struct afs_volume *volume, u32 abort_code)
default: m = "busy"; break;
}
- pr_notice("kAFS: Volume %llu '%s' is %s\n", volume->vid, volume->name, m);
+ pr_notice("kAFS: Volume %llu '%s' on server %pU is %s\n",
+ op->volume->vid, op->volume->name, &op->server->uuid, m);
}
/*
@@ -89,10 +131,11 @@ static void afs_busy(struct afs_volume *volume, u32 abort_code)
*/
static bool afs_sleep_and_retry(struct afs_operation *op)
{
+ trace_afs_rotate(op, afs_rotate_trace_busy_sleep, 0);
if (!(op->flags & AFS_OPERATION_UNINTR)) {
msleep_interruptible(1000);
if (signal_pending(current)) {
- op->error = -ERESTARTSYS;
+ afs_op_set_error(op, -ERESTARTSYS);
return false;
}
} else {
@@ -111,62 +154,105 @@ bool afs_select_fileserver(struct afs_operation *op)
struct afs_addr_list *alist;
struct afs_server *server;
struct afs_vnode *vnode = op->file[0].vnode;
- struct afs_error e;
- u32 rtt;
- int error = op->ac.error, i;
+ unsigned long set, failed;
+ s32 abort_code = op->call_abort_code;
+ int best_prio = 0;
+ int error = op->call_error, addr_index, i, j;
+
+ op->nr_iterations++;
- _enter("%lx[%d],%lx[%d],%d,%d",
- op->untried, op->index,
- op->ac.tried, op->ac.index,
- error, op->ac.abort_code);
+ _enter("OP=%x+%x,%llx,%u{%lx},%u{%lx},%d,%d",
+ op->debug_id, op->nr_iterations, op->volume->vid,
+ op->server_index, op->untried_servers,
+ op->addr_index, op->addr_tried,
+ error, abort_code);
if (op->flags & AFS_OPERATION_STOP) {
+ trace_afs_rotate(op, afs_rotate_trace_stopped, 0);
_leave(" = f [stopped]");
return false;
}
- op->nr_iterations++;
-
- /* Evaluate the result of the previous operation, if there was one. */
- switch (error) {
- case SHRT_MAX:
+ if (op->nr_iterations == 0)
goto start;
+ WRITE_ONCE(op->estate->addresses->addrs[op->addr_index].last_error, error);
+ trace_afs_rotate(op, afs_rotate_trace_iter, op->call_error);
+
+ /* Evaluate the result of the previous operation, if there was one. */
+ switch (op->call_error) {
case 0:
+ clear_bit(AFS_SE_VOLUME_OFFLINE,
+ &op->server_list->servers[op->server_index].flags);
+ clear_bit(AFS_SE_VOLUME_BUSY,
+ &op->server_list->servers[op->server_index].flags);
+ op->cumul_error.responded = true;
+
+ /* We succeeded, but we may need to redo the op from another
+ * server if we're looking at a set of RO volumes where some of
+ * the servers have not yet been brought up to date lest we
+ * regress the data. We only switch to the new version once
+ * >=50% of the servers are updated.
+ */
+ error = afs_update_volume_state(op);
+ if (error != 0) {
+ if (error == 1) {
+ afs_sleep_and_retry(op);
+ goto restart_from_beginning;
+ }
+ afs_op_set_error(op, error);
+ goto failed;
+ }
+ fallthrough;
default:
/* Success or local failure. Stop. */
- op->error = error;
+ afs_op_set_error(op, error);
op->flags |= AFS_OPERATION_STOP;
+ trace_afs_rotate(op, afs_rotate_trace_stop, error);
_leave(" = f [okay/local %d]", error);
return false;
case -ECONNABORTED:
/* The far side rejected the operation on some grounds. This
* might involve the server being busy or the volume having been moved.
+ *
+ * Note that various V* errors should not be sent to a cache manager
+ * by a fileserver as they should be translated to more modern UAE*
+ * errors instead. IBM AFS and OpenAFS fileservers, however, do leak
+ * these abort codes.
*/
- switch (op->ac.abort_code) {
+ trace_afs_rotate(op, afs_rotate_trace_aborted, abort_code);
+ op->cumul_error.responded = true;
+ switch (abort_code) {
case VNOVOL:
/* This fileserver doesn't know about the volume.
* - May indicate that the VL is wrong - retry once and compare
* the results.
* - May indicate that the fileserver couldn't attach to the vol.
+ * - The volume might have been temporarily removed so that it can
+ * be replaced by a volume restore. "vos" might have ended one
+ * transaction and has yet to create the next.
+ * - The volume might not be blessed or might not be in-service
+ * (administrative action).
*/
if (op->flags & AFS_OPERATION_VNOVOL) {
- op->error = -EREMOTEIO;
+ afs_op_accumulate_error(op, -EREMOTEIO, abort_code);
goto next_server;
}
write_lock(&op->volume->servers_lock);
- op->server_list->vnovol_mask |= 1 << op->index;
+ op->server_list->vnovol_mask |= 1 << op->server_index;
write_unlock(&op->volume->servers_lock);
set_bit(AFS_VOLUME_NEEDS_UPDATE, &op->volume->flags);
error = afs_check_volume_status(op->volume, op);
- if (error < 0)
- goto failed_set_error;
+ if (error < 0) {
+ afs_op_set_error(op, error);
+ goto failed;
+ }
if (test_bit(AFS_VOLUME_DELETED, &op->volume->flags)) {
- op->error = -ENOMEDIUM;
+ afs_op_set_error(op, -ENOMEDIUM);
goto failed;
}
@@ -174,7 +260,7 @@ bool afs_select_fileserver(struct afs_operation *op)
* it's the fileserver having trouble.
*/
if (rcu_access_pointer(op->volume->servers) == op->server_list) {
- op->error = -EREMOTEIO;
+ afs_op_accumulate_error(op, -EREMOTEIO, abort_code);
goto next_server;
}
@@ -183,50 +269,99 @@ bool afs_select_fileserver(struct afs_operation *op)
_leave(" = t [vnovol]");
return true;
- case VSALVAGE: /* TODO: Should this return an error or iterate? */
case VVOLEXISTS:
- case VNOSERVICE:
case VONLINE:
- case VDISKFULL:
- case VOVERQUOTA:
- op->error = afs_abort_to_error(op->ac.abort_code);
+ /* These should not be returned from the fileserver. */
+ pr_warn("Fileserver returned unexpected abort %d\n",
+ abort_code);
+ afs_op_accumulate_error(op, -EREMOTEIO, abort_code);
goto next_server;
+ case VNOSERVICE:
+ /* Prior to AFS 3.2 VNOSERVICE was returned from the fileserver
+ * if the volume was neither in-service nor administratively
+ * blessed. All usage was replaced by VNOVOL because AFS 3.1 and
+ * earlier cache managers did not handle VNOSERVICE and assumed
+ * it was the client OS's errno 105.
+ *
+ * Starting with OpenAFS 1.4.8 VNOSERVICE was repurposed as the
+ * fileserver idle dead time error which was sent in place of
+ * RX_CALL_TIMEOUT (-3). The error was intended to be sent if the
+ * fileserver took too long to send a reply to the client.
+ * RX_CALL_TIMEOUT would have caused the cache manager to mark the
+ * server down, whereas VNOSERVICE since AFS 3.2 would cause the cache
+ * manager to temporarily (up to 15 minutes) mark the volume
+ * instance as unusable.
+ *
+ * The idle dead logic resulted in cache inconsistency since a
+ * state-changing call that the cache manager assumed was dead
+ * could still be processed to completion by the fileserver. This
+ * logic was removed in OpenAFS 1.8.0 and VNOSERVICE is no longer
+ * returned. However, many 1.4.8 through 1.6.24 fileservers are
+ * still in existence.
+ *
+ * AuriStorFS fileservers have never returned VNOSERVICE.
+ *
+ * VNOSERVICE should be treated as an alias for RX_CALL_TIMEOUT.
+ */
+ case RX_CALL_TIMEOUT:
+ afs_op_accumulate_error(op, -ETIMEDOUT, abort_code);
+ goto next_server;
+
+ case VSALVAGING: /* This error should not be leaked to cache managers
+ * but is from OpenAFS demand attach fileservers.
+ * It should be treated as an alias for VOFFLINE.
+ */
+ case VSALVAGE: /* VSALVAGE should be treated as a synonym of VOFFLINE */
case VOFFLINE:
- if (!test_and_set_bit(AFS_VOLUME_OFFLINE, &op->volume->flags)) {
- afs_busy(op->volume, op->ac.abort_code);
- clear_bit(AFS_VOLUME_BUSY, &op->volume->flags);
+ /* The volume is in use by the volserver or another volume utility
+ * for an operation that might alter the contents. The volume is
+ * expected to come back but it might take a long time (could be
+ * days).
+ */
+ if (!test_and_set_bit(AFS_SE_VOLUME_OFFLINE,
+ &op->server_list->servers[op->server_index].flags)) {
+ afs_busy(op, abort_code);
+ clear_bit(AFS_SE_VOLUME_BUSY,
+ &op->server_list->servers[op->server_index].flags);
}
if (op->flags & AFS_OPERATION_NO_VSLEEP) {
- op->error = -EADV;
- goto failed;
- }
- if (op->flags & AFS_OPERATION_CUR_ONLY) {
- op->error = -ESTALE;
+ afs_op_set_error(op, -EADV);
goto failed;
}
goto busy;
- case VSALVAGING:
- case VRESTARTING:
+ case VRESTARTING: /* The fileserver is either shutting down or starting up. */
case VBUSY:
- /* Retry after going round all the servers unless we
- * have a file lock we need to maintain.
+ /* The volume is in use by the volserver or another volume
+ * utility for an operation that is not expected to alter the
+ * contents of the volume. VBUSY does not need to be returned
+ * for a ROVOL or BACKVOL bound to an ITBusy volserver
+ * transaction. The fileserver is permitted to continue serving
+ * content from ROVOLs and BACKVOLs during an ITBusy transaction
+ * because the content will not change. However, many fileserver
+ * releases do return VBUSY for ROVOL and BACKVOL instances under
+ * many circumstances.
+ *
+ * Retry after going round all the servers unless we have a file
+ * lock we need to maintain.
*/
if (op->flags & AFS_OPERATION_NO_VSLEEP) {
- op->error = -EBUSY;
+ afs_op_set_error(op, -EBUSY);
goto failed;
}
- if (!test_and_set_bit(AFS_VOLUME_BUSY, &op->volume->flags)) {
- afs_busy(op->volume, op->ac.abort_code);
- clear_bit(AFS_VOLUME_OFFLINE, &op->volume->flags);
+ if (!test_and_set_bit(AFS_SE_VOLUME_BUSY,
+ &op->server_list->servers[op->server_index].flags)) {
+ afs_busy(op, abort_code);
+ clear_bit(AFS_SE_VOLUME_OFFLINE,
+ &op->server_list->servers[op->server_index].flags);
}
busy:
if (op->flags & AFS_OPERATION_CUR_ONLY) {
if (!afs_sleep_and_retry(op))
goto failed;
- /* Retry with same server & address */
+ /* Retry with same server & address */
_leave(" = t [vbusy]");
return true;
}
@@ -243,7 +378,7 @@ bool afs_select_fileserver(struct afs_operation *op)
* honour, just in case someone sets up a loop.
*/
if (op->flags & AFS_OPERATION_VMOVED) {
- op->error = -EREMOTEIO;
+ afs_op_set_error(op, -EREMOTEIO);
goto failed;
}
op->flags |= AFS_OPERATION_VMOVED;
@@ -251,8 +386,10 @@ bool afs_select_fileserver(struct afs_operation *op)
set_bit(AFS_VOLUME_WAIT, &op->volume->flags);
set_bit(AFS_VOLUME_NEEDS_UPDATE, &op->volume->flags);
error = afs_check_volume_status(op->volume, op);
- if (error < 0)
- goto failed_set_error;
+ if (error < 0) {
+ afs_op_set_error(op, error);
+ goto failed;
+ }
/* If the server list didn't change, then the VLDB is
* out of sync with the fileservers. This is hopefully
@@ -264,22 +401,50 @@ bool afs_select_fileserver(struct afs_operation *op)
* TODO: Retry a few times with sleeps.
*/
if (rcu_access_pointer(op->volume->servers) == op->server_list) {
- op->error = -ENOMEDIUM;
+ afs_op_accumulate_error(op, -ENOMEDIUM, abort_code);
goto failed;
}
goto restart_from_beginning;
+ case UAEIO:
+ case VIO:
+ afs_op_accumulate_error(op, -EREMOTEIO, abort_code);
+ if (op->volume->type != AFSVL_RWVOL)
+ goto next_server;
+ goto failed;
+
+ case VDISKFULL:
+ case UAENOSPC:
+ /* The partition is full. Only applies to RWVOLs.
+ * Translate locally and return ENOSPC.
+ * No replicas to failover to.
+ */
+ afs_op_set_error(op, -ENOSPC);
+ goto failed_but_online;
+
+ case VOVERQUOTA:
+ case UAEDQUOT:
+ /* Volume is full. Only applies to RWVOLs.
+ * Translate locally and return EDQUOT.
+ * No replicas to failover to.
+ */
+ afs_op_set_error(op, -EDQUOT);
+ goto failed_but_online;
+
default:
- clear_bit(AFS_VOLUME_OFFLINE, &op->volume->flags);
- clear_bit(AFS_VOLUME_BUSY, &op->volume->flags);
- op->error = afs_abort_to_error(op->ac.abort_code);
+ afs_op_accumulate_error(op, error, abort_code);
+ failed_but_online:
+ clear_bit(AFS_SE_VOLUME_OFFLINE,
+ &op->server_list->servers[op->server_index].flags);
+ clear_bit(AFS_SE_VOLUME_BUSY,
+ &op->server_list->servers[op->server_index].flags);
goto failed;
}
case -ETIMEDOUT:
case -ETIME:
- if (op->error != -EDESTADDRREQ)
+ if (afs_op_error(op) != -EDESTADDRREQ)
goto iterate_address;
fallthrough;
case -ERFKILL:
@@ -289,7 +454,7 @@ bool afs_select_fileserver(struct afs_operation *op)
case -EHOSTDOWN:
case -ECONNREFUSED:
_debug("no conn");
- op->error = error;
+ afs_op_accumulate_error(op, error, 0);
goto iterate_address;
case -ENETRESET:
@@ -298,24 +463,31 @@ bool afs_select_fileserver(struct afs_operation *op)
fallthrough;
case -ECONNRESET:
_debug("call reset");
- op->error = error;
+ afs_op_set_error(op, error);
goto failed;
}
restart_from_beginning:
+ trace_afs_rotate(op, afs_rotate_trace_restart, 0);
_debug("restart");
- afs_end_cursor(&op->ac);
+ op->estate = NULL;
op->server = NULL;
+ afs_clear_server_states(op);
+ op->server_states = NULL;
afs_put_serverlist(op->net, op->server_list);
op->server_list = NULL;
start:
_debug("start");
+ ASSERTCMP(op->estate, ==, NULL);
/* See if we need to do an update of the volume record. Note that the
* volume may have moved or even have been deleted.
*/
error = afs_check_volume_status(op->volume, op);
- if (error < 0)
- goto failed_set_error;
+ trace_afs_rotate(op, afs_rotate_trace_check_vol_status, error);
+ if (error < 0) {
+ afs_op_set_error(op, error);
+ goto failed;
+ }
if (!afs_start_fs_iteration(op, vnode))
goto failed;
@@ -323,52 +495,83 @@ start:
_debug("__ VOL %llx __", op->volume->vid);
pick_server:
- _debug("pick [%lx]", op->untried);
+ _debug("pick [%lx]", op->untried_servers);
+ ASSERTCMP(op->estate, ==, NULL);
- error = afs_wait_for_fs_probes(op->server_list, op->untried);
- if (error < 0)
- goto failed_set_error;
+ error = afs_wait_for_fs_probes(op, op->server_states,
+ !(op->flags & AFS_OPERATION_UNINTR));
+ switch (error) {
+ case 0: /* No untried responsive servers and no outstanding probes */
+ trace_afs_rotate(op, afs_rotate_trace_probe_none, 0);
+ goto no_more_servers;
+ case 1: /* Got a response */
+ trace_afs_rotate(op, afs_rotate_trace_probe_response, 0);
+ break;
+ case 2: /* Probe data superseded */
+ trace_afs_rotate(op, afs_rotate_trace_probe_superseded, 0);
+ goto restart_from_beginning;
+ default:
+ trace_afs_rotate(op, afs_rotate_trace_probe_error, error);
+ afs_op_set_error(op, error);
+ goto failed;
+ }
- /* Pick the untried server with the lowest RTT. If we have outstanding
- * callbacks, we stick with the server we're already using if we can.
+ /* Pick the untried server with the highest priority untried endpoint.
+ * If we have outstanding callbacks, we stick with the server we're
+ * already using if we can.
*/
if (op->server) {
- _debug("server %u", op->index);
- if (test_bit(op->index, &op->untried))
+ _debug("server %u", op->server_index);
+ if (test_bit(op->server_index, &op->untried_servers))
goto selected_server;
op->server = NULL;
_debug("no server");
}
- op->index = -1;
- rtt = U32_MAX;
+ rcu_read_lock();
+ op->server_index = -1;
+ best_prio = -1;
for (i = 0; i < op->server_list->nr_servers; i++) {
- struct afs_server *s = op->server_list->servers[i].server;
+ struct afs_endpoint_state *es;
+ struct afs_server_entry *se = &op->server_list->servers[i];
+ struct afs_addr_list *sal;
+ struct afs_server *s = se->server;
- if (!test_bit(i, &op->untried) ||
+ if (!test_bit(i, &op->untried_servers) ||
+ test_bit(AFS_SE_EXCLUDED, &se->flags) ||
!test_bit(AFS_SERVER_FL_RESPONDING, &s->flags))
continue;
- if (s->probe.rtt < rtt) {
- op->index = i;
- rtt = s->probe.rtt;
+ es = op->server_states[i].endpoint_state;
+ sal = es->addresses;
+
+ afs_get_address_preferences_rcu(op->net, sal);
+ for (j = 0; j < sal->nr_addrs; j++) {
+ if (!sal->addrs[j].peer)
+ continue;
+ if (sal->addrs[j].prio > best_prio) {
+ op->server_index = i;
+ best_prio = sal->addrs[j].prio;
+ }
}
}
+ rcu_read_unlock();
- if (op->index == -1)
+ if (op->server_index == -1)
goto no_more_servers;
selected_server:
- _debug("use %d", op->index);
- __clear_bit(op->index, &op->untried);
+ trace_afs_rotate(op, afs_rotate_trace_selected_server, best_prio);
+ _debug("use %d prio %u", op->server_index, best_prio);
+ __clear_bit(op->server_index, &op->untried_servers);
/* We're starting on a different fileserver from the list. We need to
* check it, create a callback intercept, find its address list and
* probe its capabilities before we use it.
*/
- ASSERTCMP(op->ac.alist, ==, NULL);
- server = op->server_list->servers[op->index].server;
+ ASSERTCMP(op->estate, ==, NULL);
+ server = op->server_list->servers[op->server_index].server;
- if (!afs_check_server_record(op, server))
+ if (!afs_check_server_record(op, server, op->key))
goto failed;
_debug("USING SERVER: %pU", &server->uuid);
@@ -377,58 +580,73 @@ selected_server:
op->server = server;
if (vnode->cb_server != server) {
vnode->cb_server = server;
- vnode->cb_s_break = server->cb_s_break;
- vnode->cb_fs_s_break = atomic_read(&server->cell->fs_s_break);
- vnode->cb_v_break = vnode->volume->cb_v_break;
- clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
+ vnode->cb_v_check = atomic_read(&vnode->volume->cb_v_break);
+ atomic64_set(&vnode->cb_expires_at, AFS_NO_CB_PROMISE);
}
- read_lock(&server->fs_lock);
- alist = rcu_dereference_protected(server->addresses,
- lockdep_is_held(&server->fs_lock));
- afs_get_addrlist(alist);
- read_unlock(&server->fs_lock);
-
retry_server:
- memset(&op->ac, 0, sizeof(op->ac));
-
- if (!op->ac.alist)
- op->ac.alist = alist;
- else
- afs_put_addrlist(alist);
-
- op->ac.index = -1;
+ op->addr_tried = 0;
+ op->addr_index = -1;
iterate_address:
- ASSERT(op->ac.alist);
/* Iterate over the current server's address list to try and find an
* address on which it will respond to us.
*/
- if (!afs_iterate_addresses(&op->ac))
- goto out_of_addresses;
+ op->estate = op->server_states[op->server_index].endpoint_state;
+ set = READ_ONCE(op->estate->responsive_set);
+ failed = READ_ONCE(op->estate->failed_set);
+ _debug("iterate ES=%x rs=%lx fs=%lx", op->estate->probe_seq, set, failed);
+ set &= ~(failed | op->addr_tried);
+ trace_afs_rotate(op, afs_rotate_trace_iterate_addr, set);
+ if (!set)
+ goto wait_for_more_probe_results;
+
+ alist = op->estate->addresses;
+ addr_index = READ_ONCE(alist->preferred);
+ if (!test_bit(addr_index, &set))
+ addr_index = __ffs(set);
+ best_prio = alist->addrs[addr_index].prio;
+ for (i = 0; i < alist->nr_addrs; i++) {
+ /* A strictly higher-priority untried address overrides the
+ * preferred/first-set pick; ties keep the preferred address. */
+ if (test_bit(i, &set) && alist->addrs[i].prio > best_prio) {
+ addr_index = i;
+ best_prio = alist->addrs[i].prio;
+ }
+ }
- _debug("address [%u] %u/%u %pISp",
- op->index, op->ac.index, op->ac.alist->nr_addrs,
- &op->ac.alist->addrs[op->ac.index].transport);
+ op->addr_index = addr_index;
+ set_bit(addr_index, &op->addr_tried);
+
+ op->volsync.creation = TIME64_MIN;
+ op->volsync.update = TIME64_MIN;
+ op->call_responded = false;
+ _debug("address [%u] %u/%u %pISp",
+ op->server_index, addr_index, alist->nr_addrs,
+ rxrpc_kernel_remote_addr(alist->addrs[op->addr_index].peer));
_leave(" = t");
return true;
-out_of_addresses:
+wait_for_more_probe_results:
+ error = afs_wait_for_one_fs_probe(op->server, op->estate, op->addr_tried,
+ !(op->flags & AFS_OPERATION_UNINTR));
+ if (!error)
+ goto iterate_address;
+
/* We've now had a failure to respond on all of a server's addresses -
* immediately probe them again and consider retrying the server.
*/
+ trace_afs_rotate(op, afs_rotate_trace_probe_fileserver, 0);
afs_probe_fileserver(op->net, op->server);
if (op->flags & AFS_OPERATION_RETRY_SERVER) {
- alist = op->ac.alist;
- error = afs_wait_for_one_fs_probe(
- op->server, !(op->flags & AFS_OPERATION_UNINTR));
+ error = afs_wait_for_one_fs_probe(op->server, op->estate, op->addr_tried,
+ !(op->flags & AFS_OPERATION_UNINTR));
switch (error) {
case 0:
op->flags &= ~AFS_OPERATION_RETRY_SERVER;
+ trace_afs_rotate(op, afs_rotate_trace_retry_server, 0);
goto retry_server;
case -ERESTARTSYS:
- goto failed_set_error;
+ afs_op_set_error(op, error);
+ goto failed;
case -ETIME:
case -EDESTADDRREQ:
goto next_server;
@@ -436,34 +654,51 @@ out_of_addresses:
}
next_server:
+ trace_afs_rotate(op, afs_rotate_trace_next_server, 0);
_debug("next");
- afs_end_cursor(&op->ac);
+ ASSERT(op->estate);
+ alist = op->estate->addresses;
+ if (op->call_responded &&
+ op->addr_index != READ_ONCE(alist->preferred) &&
+ test_bit(alist->preferred, &op->addr_tried))
+ WRITE_ONCE(alist->preferred, op->addr_index);
+ op->estate = NULL;
goto pick_server;
no_more_servers:
/* That's all the servers poked to no good effect. Try again if some
* of them were busy.
*/
- if (op->flags & AFS_OPERATION_VBUSY)
+ trace_afs_rotate(op, afs_rotate_trace_no_more_servers, 0);
+ if (op->flags & AFS_OPERATION_VBUSY) {
+ afs_sleep_and_retry(op);
+ op->flags &= ~AFS_OPERATION_VBUSY;
goto restart_from_beginning;
+ }
- e.error = -EDESTADDRREQ;
- e.responded = false;
+ rcu_read_lock();
for (i = 0; i < op->server_list->nr_servers; i++) {
- struct afs_server *s = op->server_list->servers[i].server;
+ struct afs_endpoint_state *estate;
- afs_prioritise_error(&e, READ_ONCE(s->probe.error),
- s->probe.abort_code);
+ estate = op->server_states[i].endpoint_state;
+ error = READ_ONCE(estate->error);
+ if (error < 0)
+ afs_op_accumulate_error(op, error, estate->abort_code);
}
+ rcu_read_unlock();
- error = e.error;
-
-failed_set_error:
- op->error = error;
failed:
+ trace_afs_rotate(op, afs_rotate_trace_failed, 0);
op->flags |= AFS_OPERATION_STOP;
- afs_end_cursor(&op->ac);
- _leave(" = f [failed %d]", op->error);
+ if (op->estate) {
+ alist = op->estate->addresses;
+ if (op->call_responded &&
+ op->addr_index != READ_ONCE(alist->preferred) &&
+ test_bit(alist->preferred, &op->addr_tried))
+ WRITE_ONCE(alist->preferred, op->addr_index);
+ op->estate = NULL;
+ }
+ _leave(" = f [failed %d]", afs_op_error(op));
return false;
}
@@ -482,37 +717,40 @@ void afs_dump_edestaddrreq(const struct afs_operation *op)
rcu_read_lock();
pr_notice("EDESTADDR occurred\n");
- pr_notice("FC: cbb=%x cbb2=%x fl=%x err=%hd\n",
+ pr_notice("OP: cbb=%x cbb2=%x fl=%x err=%d\n",
op->file[0].cb_break_before,
- op->file[1].cb_break_before, op->flags, op->error);
- pr_notice("FC: ut=%lx ix=%d ni=%u\n",
- op->untried, op->index, op->nr_iterations);
+ op->file[1].cb_break_before, op->flags, op->cumul_error.error);
+ pr_notice("OP: ut=%lx ix=%d ni=%u\n",
+ op->untried_servers, op->server_index, op->nr_iterations);
+ pr_notice("OP: call er=%d ac=%d r=%u\n",
+ op->call_error, op->call_abort_code, op->call_responded);
if (op->server_list) {
const struct afs_server_list *sl = op->server_list;
- pr_notice("FC: SL nr=%u pr=%u vnov=%hx\n",
- sl->nr_servers, sl->preferred, sl->vnovol_mask);
+
+ pr_notice("FC: SL nr=%u vnov=%hx\n",
+ sl->nr_servers, sl->vnovol_mask);
for (i = 0; i < sl->nr_servers; i++) {
const struct afs_server *s = sl->servers[i].server;
+ const struct afs_endpoint_state *e =
+ rcu_dereference(s->endpoint_state);
+ const struct afs_addr_list *a = e->addresses;
+
pr_notice("FC: server fl=%lx av=%u %pU\n",
s->flags, s->addr_version, &s->uuid);
- if (s->addresses) {
- const struct afs_addr_list *a =
- rcu_dereference(s->addresses);
+ pr_notice("FC: - pq=%x R=%lx F=%lx\n",
+ e->probe_seq, e->responsive_set, e->failed_set);
+ if (a) {
pr_notice("FC: - av=%u nr=%u/%u/%u pr=%u\n",
a->version,
a->nr_ipv4, a->nr_addrs, a->max_addrs,
a->preferred);
- pr_notice("FC: - R=%lx F=%lx\n",
- a->responded, a->failed);
- if (a == op->ac.alist)
+ if (a == e->addresses)
pr_notice("FC: - current\n");
}
}
}
- pr_notice("AC: t=%lx ax=%u ac=%d er=%d r=%u ni=%u\n",
- op->ac.tried, op->ac.index, op->ac.abort_code, op->ac.error,
- op->ac.responded, op->ac.nr_iterations);
+ pr_notice("AC: t=%lx ax=%d\n", op->addr_tried, op->addr_index);
rcu_read_unlock();
}
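
For orientation, afs_select_fileserver() is driven from a dispatch loop of roughly this shape (a simplified sketch modelled on the operation-dispatch code, which lies outside this diff; the issue_afs_rpc hook is assumed):

        while (afs_select_fileserver(op)) {
                op->call_responded = false;
                op->call_error = 0;
                op->call_abort_code = 0;

                op->ops->issue_afs_rpc(op);     /* assumed to set op->call */

                if (op->call) {
                        afs_wait_for_call_to_complete(op->call);
                        op->call_error = op->call->error;
                        op->call_abort_code = op->call->abort_code;
                        op->call_responded = op->call->responded;
                        afs_put_call(op->call);
                        op->call = NULL;
                }
        }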
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index d642d06a453b..c453428f3c8b 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -178,6 +178,8 @@ void afs_put_call(struct afs_call *call)
ASSERT(!work_pending(&call->async_work));
ASSERT(call->type->name != NULL);
+ rxrpc_kernel_put_peer(call->peer);
+
if (call->rxcall) {
rxrpc_kernel_shutdown_call(net->socket, call->rxcall);
rxrpc_kernel_put_call(net->socket, call->rxcall);
@@ -187,7 +189,6 @@ void afs_put_call(struct afs_call *call)
call->type->destructor(call);
afs_unuse_server_notime(call->net, call->server, afs_server_trace_put_call);
- afs_put_addrlist(call->alist);
kfree(call->request);
trace_afs_call(call->debug_id, afs_call_trace_free, 0, o,
@@ -294,9 +295,8 @@ static void afs_notify_end_request_tx(struct sock *sock,
* Initiate a call and synchronously queue up the parameters for dispatch. Any
* error is stored into the call struct, which the caller must check for.
*/
-void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
+void afs_make_call(struct afs_call *call, gfp_t gfp)
{
- struct sockaddr_rxrpc *srx = &ac->alist->addrs[ac->index];
struct rxrpc_call *rxcall;
struct msghdr msg;
struct kvec iov[1];
@@ -304,7 +304,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
s64 tx_total_len;
int ret;
- _enter(",{%pISp},", &srx->transport);
+ _enter(",{%pISp+%u},", rxrpc_kernel_remote_addr(call->peer), call->service_id);
ASSERT(call->type != NULL);
ASSERT(call->type->name != NULL);
@@ -313,8 +313,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
call, call->type->name, key_serial(call->key),
atomic_read(&call->net->nr_outstanding_calls));
- call->addr_ix = ac->index;
- call->alist = afs_get_addrlist(ac->alist);
+ trace_afs_make_call(call);
/* Work out the length we're going to transmit. This is awkward for
* calls such as FS.StoreData where there's an extra injection of data
@@ -333,7 +332,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
}
/* create a call */
- rxcall = rxrpc_kernel_begin_call(call->net->socket, srx, call->key,
+ rxcall = rxrpc_kernel_begin_call(call->net->socket, call->peer, call->key,
(unsigned long)call,
tx_total_len,
call->max_lifespan,
@@ -341,6 +340,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
(call->async ?
afs_wake_up_async_call :
afs_wake_up_call_waiter),
+ call->service_id,
call->upgrade,
(call->intr ? RXRPC_PREINTERRUPTIBLE :
RXRPC_UNINTERRUPTIBLE),
@@ -390,7 +390,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
/* Note that at this point, we may have received the reply or an abort
* - and an asynchronous call may already have completed.
*
- * afs_wait_for_call_to_complete(call, ac)
+ * afs_wait_for_call_to_complete(call)
* must be called to synchronously clean up.
*/
return;
@@ -406,8 +406,7 @@ error_do_abort:
rxrpc_kernel_recv_data(call->net->socket, rxcall,
&msg.msg_iter, &len, false,
&call->abort_code, &call->service_id);
- ac->abort_code = call->abort_code;
- ac->responded = true;
+ call->responded = true;
}
call->error = ret;
trace_afs_call_done(call);
@@ -427,7 +426,7 @@ error_kill_call:
afs_set_call_complete(call, ret, 0);
}
- ac->error = ret;
+ call->error = ret;
call->state = AFS_CALL_COMPLETE;
_leave(" = %d", ret);
}
@@ -461,7 +460,7 @@ static void afs_log_error(struct afs_call *call, s32 remote_abort)
max = m + 1;
pr_notice("kAFS: Peer reported %s failure on %s [%pISp]\n",
msg, call->type->name,
- &call->alist->addrs[call->addr_ix].transport);
+ rxrpc_kernel_remote_addr(call->peer));
}
}
@@ -508,6 +507,7 @@ static void afs_deliver_to_call(struct afs_call *call)
ret = -EBADMSG;
switch (ret) {
case 0:
+ call->responded = true;
afs_queue_call_work(call);
if (state == AFS_CALL_CL_PROC_REPLY) {
if (call->op)
@@ -522,9 +522,11 @@ static void afs_deliver_to_call(struct afs_call *call)
goto out;
case -ECONNABORTED:
ASSERTCMP(state, ==, AFS_CALL_COMPLETE);
+ call->responded = true;
afs_log_error(call, call->abort_code);
goto done;
case -ENOTSUPP:
+ call->responded = true;
abort_code = RXGEN_OPCODE;
rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
abort_code, ret,
@@ -571,50 +573,46 @@ call_complete:
}
/*
- * Wait synchronously for a call to complete and clean up the call struct.
+ * Wait synchronously for a call to complete.
*/
-long afs_wait_for_call_to_complete(struct afs_call *call,
- struct afs_addr_cursor *ac)
+void afs_wait_for_call_to_complete(struct afs_call *call)
{
- long ret;
bool rxrpc_complete = false;
- DECLARE_WAITQUEUE(myself, current);
-
_enter("");
- ret = call->error;
- if (ret < 0)
- goto out;
+ if (!afs_check_call_state(call, AFS_CALL_COMPLETE)) {
+ DECLARE_WAITQUEUE(myself, current);
+
+ add_wait_queue(&call->waitq, &myself);
+ for (;;) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+
+ /* deliver any messages that are in the queue */
+ if (!afs_check_call_state(call, AFS_CALL_COMPLETE) &&
+ call->need_attention) {
+ call->need_attention = false;
+ __set_current_state(TASK_RUNNING);
+ afs_deliver_to_call(call);
+ continue;
+ }
- add_wait_queue(&call->waitq, &myself);
- for (;;) {
- set_current_state(TASK_UNINTERRUPTIBLE);
-
- /* deliver any messages that are in the queue */
- if (!afs_check_call_state(call, AFS_CALL_COMPLETE) &&
- call->need_attention) {
- call->need_attention = false;
- __set_current_state(TASK_RUNNING);
- afs_deliver_to_call(call);
- continue;
- }
+ if (afs_check_call_state(call, AFS_CALL_COMPLETE))
+ break;
- if (afs_check_call_state(call, AFS_CALL_COMPLETE))
- break;
+ if (!rxrpc_kernel_check_life(call->net->socket, call->rxcall)) {
+ /* rxrpc terminated the call. */
+ rxrpc_complete = true;
+ break;
+ }
- if (!rxrpc_kernel_check_life(call->net->socket, call->rxcall)) {
- /* rxrpc terminated the call. */
- rxrpc_complete = true;
- break;
+ schedule();
}
- schedule();
+ remove_wait_queue(&call->waitq, &myself);
+ __set_current_state(TASK_RUNNING);
}
- remove_wait_queue(&call->waitq, &myself);
- __set_current_state(TASK_RUNNING);
-
if (!afs_check_call_state(call, AFS_CALL_COMPLETE)) {
if (rxrpc_complete) {
afs_set_call_complete(call, call->error, call->abort_code);
@@ -627,29 +625,6 @@ long afs_wait_for_call_to_complete(struct afs_call *call,
afs_set_call_complete(call, -EINTR, 0);
}
}
-
- spin_lock_bh(&call->state_lock);
- ac->abort_code = call->abort_code;
- ac->error = call->error;
- spin_unlock_bh(&call->state_lock);
-
- ret = ac->error;
- switch (ret) {
- case 0:
- ret = call->ret0;
- call->ret0 = 0;
-
- fallthrough;
- case -ECONNABORTED:
- ac->responded = true;
- break;
- }
-
-out:
- _debug("call complete");
- afs_put_call(call);
- _leave(" = %p", (void *)ret);
- return ret;
}
/*
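With the address cursor gone, a caller now fixes the target up front by pinning an rxrpc peer and a service ID on the call itself, as afs_make_op_call() in internal.h does. A hedged sketch of a synchronous call under those assumptions (call, key, alist, addr_index and server are stand-ins):

        call->key = key;
        call->peer = rxrpc_kernel_get_peer(alist->addrs[addr_index].peer);
        call->service_id = server->service_id;

        afs_make_call(call, GFP_KERNEL);
        afs_wait_for_call_to_complete(call);

        ret = call->error;      /* on -ECONNABORTED, consult call->abort_code */
        afs_put_call(call);     /* also drops the peer reference */
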
diff --git a/fs/afs/server.c b/fs/afs/server.c
index b5237206eac3..e169121f603e 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -21,13 +21,13 @@ static void __afs_put_server(struct afs_net *, struct afs_server *);
/*
* Find a server by one of its addresses.
*/
-struct afs_server *afs_find_server(struct afs_net *net,
- const struct sockaddr_rxrpc *srx)
+struct afs_server *afs_find_server(struct afs_net *net, const struct rxrpc_peer *peer)
{
+ const struct afs_endpoint_state *estate;
const struct afs_addr_list *alist;
struct afs_server *server = NULL;
unsigned int i;
- int seq = 0, diff;
+ int seq = 1;
rcu_read_lock();
@@ -35,39 +35,15 @@ struct afs_server *afs_find_server(struct afs_net *net,
if (server)
afs_unuse_server_notime(net, server, afs_server_trace_put_find_rsq);
server = NULL;
+ seq++; /* 2 on the 1st/lockless path, otherwise odd */
read_seqbegin_or_lock(&net->fs_addr_lock, &seq);
- if (srx->transport.family == AF_INET6) {
- const struct sockaddr_in6 *a = &srx->transport.sin6, *b;
- hlist_for_each_entry_rcu(server, &net->fs_addresses6, addr6_link) {
- alist = rcu_dereference(server->addresses);
- for (i = alist->nr_ipv4; i < alist->nr_addrs; i++) {
- b = &alist->addrs[i].transport.sin6;
- diff = ((u16 __force)a->sin6_port -
- (u16 __force)b->sin6_port);
- if (diff == 0)
- diff = memcmp(&a->sin6_addr,
- &b->sin6_addr,
- sizeof(struct in6_addr));
- if (diff == 0)
- goto found;
- }
- }
- } else {
- const struct sockaddr_in *a = &srx->transport.sin, *b;
- hlist_for_each_entry_rcu(server, &net->fs_addresses4, addr4_link) {
- alist = rcu_dereference(server->addresses);
- for (i = 0; i < alist->nr_ipv4; i++) {
- b = &alist->addrs[i].transport.sin;
- diff = ((u16 __force)a->sin_port -
- (u16 __force)b->sin_port);
- if (diff == 0)
- diff = ((u32 __force)a->sin_addr.s_addr -
- (u32 __force)b->sin_addr.s_addr);
- if (diff == 0)
- goto found;
- }
- }
+ hlist_for_each_entry_rcu(server, &net->fs_addresses6, addr6_link) {
+ estate = rcu_dereference(server->endpoint_state);
+ alist = estate->addresses;
+ for (i = 0; i < alist->nr_addrs; i++)
+ if (alist->addrs[i].peer == peer)
+ goto found;
}
server = NULL;
@@ -90,7 +66,7 @@ struct afs_server *afs_find_server_by_uuid(struct afs_net *net, const uuid_t *uu
{
struct afs_server *server = NULL;
struct rb_node *p;
- int diff, seq = 0;
+ int diff, seq = 1;
_enter("%pU", uuid);
@@ -102,7 +78,7 @@ struct afs_server *afs_find_server_by_uuid(struct afs_net *net, const uuid_t *uu
if (server)
afs_unuse_server(net, server, afs_server_trace_put_uuid_rsq);
server = NULL;
-
+ seq++; /* 2 on the 1st/lockless path, otherwise odd */
read_seqbegin_or_lock(&net->fs_lock, &seq);
p = net->fs_servers.rb_node;
@@ -137,6 +113,7 @@ struct afs_server *afs_find_server_by_uuid(struct afs_net *net, const uuid_t *uu
static struct afs_server *afs_install_server(struct afs_cell *cell,
struct afs_server *candidate)
{
+ const struct afs_endpoint_state *estate;
const struct afs_addr_list *alist;
struct afs_server *server, *next;
struct afs_net *net = cell->net;
@@ -188,8 +165,9 @@ static struct afs_server *afs_install_server(struct afs_cell *cell,
added_dup:
write_seqlock(&net->fs_addr_lock);
- alist = rcu_dereference_protected(server->addresses,
- lockdep_is_held(&net->fs_addr_lock.lock));
+ estate = rcu_dereference_protected(server->endpoint_state,
+ lockdep_is_held(&net->fs_addr_lock.lock));
+ alist = estate->addresses;
/* Secondly, if the server has any IPv4 and/or IPv6 addresses, install
* it in the IPv4 and/or IPv6 reverse-map lists.
@@ -219,6 +197,7 @@ static struct afs_server *afs_alloc_server(struct afs_cell *cell,
const uuid_t *uuid,
struct afs_addr_list *alist)
{
+ struct afs_endpoint_state *estate;
struct afs_server *server;
struct afs_net *net = cell->net;
@@ -228,25 +207,41 @@ static struct afs_server *afs_alloc_server(struct afs_cell *cell,
if (!server)
goto enomem;
+ estate = kzalloc(sizeof(struct afs_endpoint_state), GFP_KERNEL);
+ if (!estate)
+ goto enomem_server;
+
refcount_set(&server->ref, 1);
atomic_set(&server->active, 1);
server->debug_id = atomic_inc_return(&afs_server_debug_id);
- RCU_INIT_POINTER(server->addresses, alist);
server->addr_version = alist->version;
server->uuid = *uuid;
rwlock_init(&server->fs_lock);
- INIT_WORK(&server->initcb_work, afs_server_init_callback_work);
+ INIT_LIST_HEAD(&server->volumes);
init_waitqueue_head(&server->probe_wq);
INIT_LIST_HEAD(&server->probe_link);
spin_lock_init(&server->probe_lock);
server->cell = cell;
server->rtt = UINT_MAX;
+ server->service_id = FS_SERVICE;
+
+ server->probe_counter = 1;
+ server->probed_at = jiffies - LONG_MAX / 2;
+ refcount_set(&estate->ref, 1);
+ estate->addresses = alist;
+ estate->server_id = server->debug_id;
+ estate->probe_seq = 1;
+ rcu_assign_pointer(server->endpoint_state, estate);
afs_inc_servers_outstanding(net);
trace_afs_server(server->debug_id, 1, 1, afs_server_trace_alloc);
+ trace_afs_estate(estate->server_id, estate->probe_seq, refcount_read(&estate->ref),
+ afs_estate_trace_alloc_server);
_leave(" = %p", server);
return server;
+enomem_server:
+ kfree(server);
enomem:
_leave(" = NULL [nomem]");
return NULL;
@@ -301,20 +296,20 @@ struct afs_server *afs_lookup_server(struct afs_cell *cell, struct key *key,
candidate = afs_alloc_server(cell, uuid, alist);
if (!candidate) {
- afs_put_addrlist(alist);
+ afs_put_addrlist(alist, afs_alist_trace_put_server_oom);
return ERR_PTR(-ENOMEM);
}
server = afs_install_server(cell, candidate);
if (server != candidate) {
- afs_put_addrlist(alist);
+ afs_put_addrlist(alist, afs_alist_trace_put_server_dup);
kfree(candidate);
} else {
/* Immediately dispatch an asynchronous probe to each interface
* on the fileserver. This will make sure the repeat-probing
* service is started.
*/
- afs_fs_probe_fileserver(cell->net, server, key, true);
+ afs_fs_probe_fileserver(cell->net, server, alist, key);
}
return server;
@@ -447,7 +442,8 @@ static void afs_server_rcu(struct rcu_head *rcu)
trace_afs_server(server->debug_id, refcount_read(&server->ref),
atomic_read(&server->active), afs_server_trace_free);
- afs_put_addrlist(rcu_access_pointer(server->addresses));
+ afs_put_endpoint_state(rcu_access_pointer(server->endpoint_state),
+ afs_estate_trace_put_server);
kfree(server);
}
@@ -459,14 +455,10 @@ static void __afs_put_server(struct afs_net *net, struct afs_server *server)
static void afs_give_up_callbacks(struct afs_net *net, struct afs_server *server)
{
- struct afs_addr_list *alist = rcu_access_pointer(server->addresses);
- struct afs_addr_cursor ac = {
- .alist = alist,
- .index = alist->preferred,
- .error = 0,
- };
-
- afs_fs_give_up_all_callbacks(net, server, &ac, NULL);
+ struct afs_endpoint_state *estate = rcu_access_pointer(server->endpoint_state);
+ struct afs_addr_list *alist = estate->addresses;
+
+ afs_fs_give_up_all_callbacks(net, server, &alist->addrs[alist->preferred], NULL);
}
/*
@@ -477,7 +469,6 @@ static void afs_destroy_server(struct afs_net *net, struct afs_server *server)
if (test_bit(AFS_SERVER_FL_MAY_HAVE_CB, &server->flags))
afs_give_up_callbacks(net, server);
- flush_work(&server->initcb_work);
afs_put_server(net, server, afs_server_trace_destroy);
}
@@ -636,9 +627,12 @@ void afs_purge_servers(struct afs_net *net)
* Get an update for a server's address list.
*/
static noinline bool afs_update_server_record(struct afs_operation *op,
- struct afs_server *server)
+ struct afs_server *server,
+ struct key *key)
{
- struct afs_addr_list *alist, *discard;
+ struct afs_endpoint_state *estate;
+ struct afs_addr_list *alist;
+ bool has_addrs;
_enter("");
@@ -648,29 +642,27 @@ static noinline bool afs_update_server_record(struct afs_operation *op,
alist = afs_vl_lookup_addrs(op->volume->cell, op->key, &server->uuid);
if (IS_ERR(alist)) {
+ rcu_read_lock();
+ estate = rcu_dereference(server->endpoint_state);
+ has_addrs = estate->addresses;
+ rcu_read_unlock();
+
if ((PTR_ERR(alist) == -ERESTARTSYS ||
PTR_ERR(alist) == -EINTR) &&
(op->flags & AFS_OPERATION_UNINTR) &&
- server->addresses) {
+ has_addrs) {
_leave(" = t [intr]");
return true;
}
- op->error = PTR_ERR(alist);
- _leave(" = f [%d]", op->error);
+ afs_op_set_error(op, PTR_ERR(alist));
+ _leave(" = f [%d]", afs_op_error(op));
return false;
}
- discard = alist;
- if (server->addr_version != alist->version) {
- write_lock(&server->fs_lock);
- discard = rcu_dereference_protected(server->addresses,
- lockdep_is_held(&server->fs_lock));
- rcu_assign_pointer(server->addresses, alist);
- server->addr_version = alist->version;
- write_unlock(&server->fs_lock);
- }
+ if (server->addr_version != alist->version)
+ afs_fs_probe_fileserver(op->net, server, alist, key);
- afs_put_addrlist(discard);
+ afs_put_addrlist(alist, afs_alist_trace_put_server_update);
_leave(" = t");
return true;
}
@@ -678,7 +670,8 @@ static noinline bool afs_update_server_record(struct afs_operation *op,
/*
* See if a server's address list needs updating.
*/
-bool afs_check_server_record(struct afs_operation *op, struct afs_server *server)
+bool afs_check_server_record(struct afs_operation *op, struct afs_server *server,
+ struct key *key)
{
bool success;
int ret, retries = 0;
@@ -698,7 +691,7 @@ retry:
update:
if (!test_and_set_bit_lock(AFS_SERVER_FL_UPDATING, &server->flags)) {
clear_bit(AFS_SERVER_FL_NEEDS_UPDATE, &server->flags);
- success = afs_update_server_record(op, server);
+ success = afs_update_server_record(op, server, key);
clear_bit_unlock(AFS_SERVER_FL_UPDATING, &server->flags);
wake_up_bit(&server->flags, AFS_SERVER_FL_UPDATING);
_leave(" = %d", success);
@@ -710,7 +703,7 @@ wait:
(op->flags & AFS_OPERATION_UNINTR) ?
TASK_UNINTERRUPTIBLE : TASK_INTERRUPTIBLE);
if (ret == -ERESTARTSYS) {
- op->error = ret;
+ afs_op_set_error(op, ret);
_leave(" = f [intr]");
return false;
}
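
The hunks above replace the server's bare ->addresses pointer with an RCU-managed, refcounted struct afs_endpoint_state that owns the address list. A minimal sketch of that general pattern, with hypothetical type and field names (only the standard RCU/refcount helpers are real kernel API):

	struct endpoint_state {
		refcount_t	ref;
		struct rcu_head	rcu;
		/* ... address list, probe state ... */
	};

	struct server {
		spinlock_t	lock;
		struct endpoint_state __rcu *estate;
	};

	/* Reader: pin the currently-published state without blocking writers. */
	static struct endpoint_state *get_estate(struct server *server)
	{
		struct endpoint_state *estate;

		rcu_read_lock();
		estate = rcu_dereference(server->estate);
		refcount_inc(&estate->ref);	/* assumes ref > 0 while RCU-visible */
		rcu_read_unlock();
		return estate;
	}

	/* Writer: publish a replacement, then drop the old state's ref. */
	static void replace_estate(struct server *server, struct endpoint_state *new)
	{
		struct endpoint_state *old;

		spin_lock(&server->lock);
		old = rcu_dereference_protected(server->estate,
						lockdep_is_held(&server->lock));
		rcu_assign_pointer(server->estate, new);
		spin_unlock(&server->lock);

		if (old && refcount_dec_and_test(&old->ref))
			kfree_rcu(old, rcu);	/* free only after a grace period */
	}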
diff --git a/fs/afs/server_list.c b/fs/afs/server_list.c
index b59896b1de0a..7e7e567a7f8a 100644
--- a/fs/afs/server_list.c
+++ b/fs/afs/server_list.c
@@ -24,35 +24,62 @@ void afs_put_serverlist(struct afs_net *net, struct afs_server_list *slist)
/*
* Build a server list from a VLDB record.
*/
-struct afs_server_list *afs_alloc_server_list(struct afs_cell *cell,
+struct afs_server_list *afs_alloc_server_list(struct afs_volume *volume,
struct key *key,
- struct afs_vldb_entry *vldb,
- u8 type_mask)
+ struct afs_vldb_entry *vldb)
{
struct afs_server_list *slist;
struct afs_server *server;
- int ret = -ENOMEM, nr_servers = 0, i, j;
-
- for (i = 0; i < vldb->nr_servers; i++)
- if (vldb->fs_mask[i] & type_mask)
- nr_servers++;
+ unsigned int type_mask = 1 << volume->type;
+ bool use_newrepsites = false;
+ int ret = -ENOMEM, nr_servers = 0, newrep = 0, i, j, usable = 0;
+
+ /* Work out if we're going to restrict to NEWREPSITE-marked servers or
+ * not. If at least one site is marked as NEWREPSITE, then it's likely
+ * that "vos release" is busy updating RO sites. We cut over from one
+ * to the other when >=50% of the sites have been updated. Sites that
+ * are in the process of being updated are marked DONTUSE.
+ */
+ for (i = 0; i < vldb->nr_servers; i++) {
+ if (!(vldb->fs_mask[i] & type_mask))
+ continue;
+ nr_servers++;
+ if (vldb->vlsf_flags[i] & AFS_VLSF_DONTUSE)
+ continue;
+ usable++;
+ if (vldb->vlsf_flags[i] & AFS_VLSF_NEWREPSITE)
+ newrep++;
+ }
slist = kzalloc(struct_size(slist, servers, nr_servers), GFP_KERNEL);
if (!slist)
goto error;
+ if (newrep) {
+ if (newrep < usable / 2) {
+ slist->ro_replicating = AFS_RO_REPLICATING_USE_OLD;
+ } else {
+ slist->ro_replicating = AFS_RO_REPLICATING_USE_NEW;
+ use_newrepsites = true;
+ }
+ }
+
refcount_set(&slist->usage, 1);
rwlock_init(&slist->lock);
- for (i = 0; i < AFS_MAXTYPES; i++)
- slist->vids[i] = vldb->vid[i];
-
/* Make sure a record exists for each server in the list. */
for (i = 0; i < vldb->nr_servers; i++) {
+ unsigned long se_flags = 0;
+ bool newrepsite = vldb->vlsf_flags[i] & AFS_VLSF_NEWREPSITE;
+
if (!(vldb->fs_mask[i] & type_mask))
continue;
+ if (vldb->vlsf_flags[i] & AFS_VLSF_DONTUSE)
+ __set_bit(AFS_SE_EXCLUDED, &se_flags);
+ if (newrep && (newrepsite ^ use_newrepsites))
+ __set_bit(AFS_SE_EXCLUDED, &se_flags);
- server = afs_lookup_server(cell, key, &vldb->fs_server[i],
+ server = afs_lookup_server(volume->cell, key, &vldb->fs_server[i],
vldb->addr_version[i]);
if (IS_ERR(server)) {
ret = PTR_ERR(server);
@@ -70,7 +97,7 @@ struct afs_server_list *afs_alloc_server_list(struct afs_cell *cell,
break;
if (j < slist->nr_servers) {
if (slist->servers[j].server == server) {
- afs_put_server(cell->net, server,
+ afs_put_server(volume->cell->net, server,
afs_server_trace_put_slist_isort);
continue;
}
@@ -81,6 +108,9 @@ struct afs_server_list *afs_alloc_server_list(struct afs_cell *cell,
}
slist->servers[j].server = server;
+ slist->servers[j].volume = volume;
+ slist->servers[j].flags = se_flags;
+ slist->servers[j].cb_expires_at = AFS_NO_CB_PROMISE;
slist->nr_servers++;
}
@@ -92,7 +122,7 @@ struct afs_server_list *afs_alloc_server_list(struct afs_cell *cell,
return slist;
error_2:
- afs_put_serverlist(cell->net, slist);
+ afs_put_serverlist(volume->cell->net, slist);
error:
return ERR_PTR(ret);
}
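
The 50% cutover above is self-contained enough to restate: skip servers of the wrong volume type, ignore DONTUSE sites when counting, and switch to the NEWREPSITE set once it covers at least half of the usable sites. A standalone sketch (AFS_VLSF_* are the flags the patch tests; everything else here is hypothetical):

	enum ro_replicating { RO_REPL_NONE, RO_REPL_USE_OLD, RO_REPL_USE_NEW };

	static enum ro_replicating classify_ro_sites(const u32 *vlsf_flags, int nr)
	{
		int usable = 0, newrep = 0, i;

		for (i = 0; i < nr; i++) {
			if (vlsf_flags[i] & AFS_VLSF_DONTUSE)
				continue;	/* site still being updated */
			usable++;
			if (vlsf_flags[i] & AFS_VLSF_NEWREPSITE)
				newrep++;
		}
		if (!newrep)
			return RO_REPL_NONE;	/* no "vos release" in flight */
		return newrep < usable / 2 ? RO_REPL_USE_OLD : RO_REPL_USE_NEW;
	}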
@@ -103,27 +133,117 @@ error:
bool afs_annotate_server_list(struct afs_server_list *new,
struct afs_server_list *old)
{
- struct afs_server *cur;
- int i, j;
+ unsigned long mask = 1UL << AFS_SE_EXCLUDED;
+ int i;
- if (old->nr_servers != new->nr_servers)
+ if (old->nr_servers != new->nr_servers ||
+ old->ro_replicating != new->ro_replicating)
goto changed;
- for (i = 0; i < old->nr_servers; i++)
+ for (i = 0; i < old->nr_servers; i++) {
if (old->servers[i].server != new->servers[i].server)
goto changed;
-
+ if ((old->servers[i].flags & mask) != (new->servers[i].flags & mask))
+ goto changed;
+ }
return false;
-
changed:
- /* Maintain the same preferred server as before if possible. */
- cur = old->servers[old->preferred].server;
- for (j = 0; j < new->nr_servers; j++) {
- if (new->servers[j].server == cur) {
- new->preferred = j;
- break;
+ return true;
+}
+
+/*
+ * Attach a volume to the servers it is going to use.
+ */
+void afs_attach_volume_to_servers(struct afs_volume *volume, struct afs_server_list *slist)
+{
+ struct afs_server_entry *se, *pe;
+ struct afs_server *server;
+ struct list_head *p;
+ unsigned int i;
+
+ down_write(&volume->cell->vs_lock);
+
+ for (i = 0; i < slist->nr_servers; i++) {
+ se = &slist->servers[i];
+ server = se->server;
+
+ list_for_each(p, &server->volumes) {
+ pe = list_entry(p, struct afs_server_entry, slink);
+ if (volume->vid <= pe->volume->vid)
+ break;
}
+ list_add_tail(&se->slink, p);
}
- return true;
+ slist->attached = true;
+ up_write(&volume->cell->vs_lock);
+}
+
+/*
+ * Reattach a volume to the servers it is going to use when server list is
+ * replaced. We try to switch the attachment points to avoid rewalking the
+ * lists.
+ */
+void afs_reattach_volume_to_servers(struct afs_volume *volume, struct afs_server_list *new,
+ struct afs_server_list *old)
+{
+ unsigned int n = 0, o = 0;
+
+ down_write(&volume->cell->vs_lock);
+
+ while (n < new->nr_servers || o < old->nr_servers) {
+ struct afs_server_entry *pn = n < new->nr_servers ? &new->servers[n] : NULL;
+ struct afs_server_entry *po = o < old->nr_servers ? &old->servers[o] : NULL;
+ struct afs_server_entry *s;
+ struct list_head *p;
+ int diff;
+
+ if (pn && po && pn->server == po->server) {
+ pn->cb_expires_at = po->cb_expires_at;
+ list_replace(&po->slink, &pn->slink);
+ n++;
+ o++;
+ continue;
+ }
+
+ if (pn && po)
+ diff = memcmp(&pn->server->uuid, &po->server->uuid,
+ sizeof(pn->server->uuid));
+ else
+ diff = pn ? -1 : 1;
+
+ if (diff < 0) {
+ list_for_each(p, &pn->server->volumes) {
+ s = list_entry(p, struct afs_server_entry, slink);
+ if (volume->vid <= s->volume->vid)
+ break;
+ }
+ list_add_tail(&pn->slink, p);
+ n++;
+ } else {
+ list_del(&po->slink);
+ o++;
+ }
+ }
+
+ up_write(&volume->cell->vs_lock);
+}
+
+/*
+ * Detach a volume from the servers it has been using.
+ */
+void afs_detach_volume_from_servers(struct afs_volume *volume, struct afs_server_list *slist)
+{
+ unsigned int i;
+
+ if (!slist->attached)
+ return;
+
+ down_write(&volume->cell->vs_lock);
+
+ for (i = 0; i < slist->nr_servers; i++)
+ list_del(&slist->servers[i].slink);
+
+ slist->attached = false;
+ up_write(&volume->cell->vs_lock);
}
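
afs_attach_volume_to_servers() keeps each server's list of entries sorted by volume ID, which is what lets afs_reattach_volume_to_servers() merge old and new lists in a single pass. The insertion idiom, reduced to its essentials with hypothetical types (list_for_each/list_entry/list_add_tail are the standard <linux/list.h> helpers):

	struct entry {
		struct list_head	link;
		u64			vid;
	};

	/* Insert @se keeping the list sorted by ->vid, ascending. */
	static void insert_by_vid(struct list_head *head, struct entry *se)
	{
		struct entry *pe;
		struct list_head *p;

		list_for_each(p, head) {
			pe = list_entry(p, struct entry, link);
			if (se->vid <= pe->vid)
				break;
		}
		/* @p is now the first entry with vid >= ours (or back at @head),
		 * and list_add_tail() inserts immediately before @p.
		 */
		list_add_tail(&se->link, p);
	}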
diff --git a/fs/afs/super.c b/fs/afs/super.c
index a01a0fb2cdbb..ae2d66a52add 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -381,8 +381,7 @@ static int afs_validate_fc(struct fs_context *fc)
ctx->key = key;
if (ctx->volume) {
- afs_put_volume(ctx->net, ctx->volume,
- afs_volume_trace_put_validate_fc);
+ afs_put_volume(ctx->volume, afs_volume_trace_put_validate_fc);
ctx->volume = NULL;
}
@@ -529,7 +528,7 @@ static void afs_destroy_sbi(struct afs_super_info *as)
{
if (as) {
struct afs_net *net = afs_net(as->net_ns);
- afs_put_volume(net, as->volume, afs_volume_trace_put_destroy_sbi);
+ afs_put_volume(as->volume, afs_volume_trace_put_destroy_sbi);
afs_unuse_cell(net, as->cell, afs_cell_trace_unuse_sbi);
put_net(as->net_ns);
kfree(as);
@@ -615,7 +614,7 @@ static void afs_free_fc(struct fs_context *fc)
struct afs_fs_context *ctx = fc->fs_private;
afs_destroy_sbi(fc->s_fs_info);
- afs_put_volume(ctx->net, ctx->volume, afs_volume_trace_put_free_fc);
+ afs_put_volume(ctx->volume, afs_volume_trace_put_free_fc);
afs_unuse_cell(ctx->net, ctx->cell, afs_cell_trace_unuse_fc);
key_put(ctx->key);
kfree(ctx);
diff --git a/fs/afs/validation.c b/fs/afs/validation.c
new file mode 100644
index 000000000000..46b37f2cce7d
--- /dev/null
+++ b/fs/afs/validation.c
@@ -0,0 +1,473 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* vnode and volume validity verification.
+ *
+ * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include "internal.h"
+
+/*
+ * Data validation is managed through a number of mechanisms from the server:
+ *
+ * (1) On first contact with a server (such as if it has just been rebooted),
+ * the server sends us a CB.InitCallBackState* request.
+ *
+ * (2) On a RW volume, in response to certain vnode (inode)-accessing RPC
+ * calls, the server maintains a time-limited per-vnode promise that it
+ * will send us a CB.CallBack request if a third party alters the vnodes
+ * accessed.
+ *
+ * Note that a vnode-level callback may also be sent for other reasons,
+ * such as filelock release.
+ *
+ * (3) On a RO (or Backup) volume, in response to certain vnode-accessing RPC
+ * calls, each server maintains a time-limited per-volume promise that it
+ * will send us a CB.CallBack request if the RO volume is updated to a
+ * snapshot of the RW volume ("vos release"). This is an atomic event
+ * that cuts over all instances of the RO volume across multiple servers
+ * simultaneously.
+ *
+ * Note that a volume-level callback may also be sent for other reasons,
+ * such as the volumeserver taking over control of the volume from the
+ * fileserver.
+ *
+ * Note also that each server maintains an independent time limit on an
+ * independent callback.
+ *
+ * (4) Certain RPC calls include a volume information record "VolSync" in
+ * their reply. This contains a creation date for the volume that should
+ * remain unchanged for a RW volume (but will be changed if the volume is
+ * restored from backup) or will be bumped to the time of snapshotting
+ * when a RO volume is released.
+ *
+ * In order to track these events, the following are provided:
+ *
+ * ->cb_v_break. A counter of events that might mean that the contents of
+ * a volume have been altered since we last checked a vnode.
+ *
+ * ->cb_v_check. A counter of the number of events that we've sent a
+ * query to the server for. Everything's up to date if this equals
+ * cb_v_break.
+ *
+ * ->cb_scrub. A counter of the number of regression events for which we
+ * have to completely wipe the cache.
+ *
+ * ->cb_ro_snapshot. A counter of the number of times that we've
+ * recognised that a RO volume has been updated.
+ *
+ * ->cb_break. A counter of events that might mean that the contents of a
+ * vnode have been altered.
+ *
+ * ->cb_expires_at. The time at which the callback promise expires or
+ * AFS_NO_CB_PROMISE if we have no promise.
+ *
+ * The way we manage things is:
+ *
+ * (1) When a volume-level CB.CallBack occurs, we increment ->cb_v_break on
+ * the volume and reset ->cb_expires_at (ie. set AFS_NO_CB_PROMISE) on the
+ * volume and volume's server record.
+ *
+ * (2) When a CB.InitCallBackState occurs, we treat this as a volume-level
+ * callback break on all the volumes that have been using that server
+ * (ie. increment ->cb_v_break and reset ->cb_expires_at).
+ *
+ * (3) When a vnode-level CB.CallBack occurs, we increment ->cb_break on the
+ * vnode and reset its ->cb_expires_at. If the vnode is mmapped, we also
+ * dispatch a work item to unmap all PTEs to the vnode's pagecache to
+ * force reentry to the filesystem for revalidation.
+ *
+ * (4) When entering the filesystem, we call afs_validate() to check the
+ * validity of a vnode. This first checks to see if ->cb_v_check and
+ * ->cb_v_break match, and if they don't, we lock volume->cb_check_lock
+ * exclusively and perform an FS.FetchStatus on the vnode.
+ *
+ * After checking the volume, we check the vnode. If there's a mismatch
+ * between the volume counters and the vnode's mirrors of those counters,
+ * we lock vnode->validate_lock and issue an FS.FetchStatus on the vnode.
+ *
+ * (5) When the reply from FS.FetchStatus arrives, the VolSync record is
+ * parsed:
+ *
+ * (A) If the Creation timestamp has changed on a RW volume or regressed
+ * on a RO volume, we try to increment ->cb_scrub; if it advances on a
+ * RO volume, we assume "vos release" happened and try to increment
+ * ->cb_ro_snapshot.
+ *
+ * (B) If the Update timestamp has regressed, we try to increment
+ * ->cb_scrub.
+ *
+ * Note that in both of these cases, we only do the increment if we can
+ * cmpxchg the value of the timestamp from the value we noted before the
+ * op. This tries to prevent parallel ops from fighting one another.
+ *
+ * volume->cb_v_check is then set to ->cb_v_break.
+ *
+ * (6) The AFSCallBack record included in the FS.FetchStatus reply is also
+ * parsed and used to set the promise in ->cb_expires_at for the vnode,
+ * the volume and the volume's server record.
+ *
+ * (7) If ->cb_scrub is seen to have advanced, we invalidate the pagecache for
+ * the vnode.
+ */
+
+/*
+ * Check the validity of a vnode/inode and its parent volume.
+ */
+bool afs_check_validity(const struct afs_vnode *vnode)
+{
+ const struct afs_volume *volume = vnode->volume;
+ time64_t deadline = ktime_get_real_seconds() + 10;
+
+ if (atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break) ||
+ atomic64_read(&vnode->cb_expires_at) <= deadline ||
+ volume->cb_expires_at <= deadline ||
+ vnode->cb_ro_snapshot != atomic_read(&volume->cb_ro_snapshot) ||
+ vnode->cb_scrub != atomic_read(&volume->cb_scrub) ||
+ test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) {
+ _debug("inval");
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * See if the server we've just talked to is currently excluded.
+ */
+static bool __afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume)
+{
+ const struct afs_server_entry *se;
+ const struct afs_server_list *slist;
+ bool is_excluded = true;
+ int i;
+
+ rcu_read_lock();
+
+ slist = rcu_dereference(volume->servers);
+ for (i = 0; i < slist->nr_servers; i++) {
+ se = &slist->servers[i];
+ if (op->server == se->server) {
+ is_excluded = test_bit(AFS_SE_EXCLUDED, &se->flags);
+ break;
+ }
+ }
+
+ rcu_read_unlock();
+ return is_excluded;
+}
+
+/*
+ * Update the volume's server list when the creation time changes and see if
+ * the server we've just talked to is currently excluded.
+ */
+static int afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume)
+{
+ int ret;
+
+ if (__afs_is_server_excluded(op, volume))
+ return 1;
+
+ set_bit(AFS_VOLUME_NEEDS_UPDATE, &volume->flags);
+ ret = afs_check_volume_status(op->volume, op);
+ if (ret < 0)
+ return ret;
+
+ return __afs_is_server_excluded(op, volume);
+}
+
+/*
+ * Handle a change to the volume creation time in the VolSync record.
+ */
+static int afs_update_volume_creation_time(struct afs_operation *op, struct afs_volume *volume)
+{
+ unsigned int snap;
+ time64_t cur = volume->creation_time;
+ time64_t old = op->pre_volsync.creation;
+ time64_t new = op->volsync.creation;
+ int ret;
+
+ _enter("%llx,%llx,%llx->%llx", volume->vid, cur, old, new);
+
+ if (cur == TIME64_MIN) {
+ volume->creation_time = new;
+ return 0;
+ }
+
+ if (new == cur)
+ return 0;
+
+ /* Try to advance the creation timestamp from what we had before the
+ * operation to what we got back from the server. This should
+ * hopefully ensure that in a race between multiple operations only one
+ * of them will do this.
+ */
+ if (cur != old)
+ return 0;
+
+ /* If the creation time changes in an unexpected way, we need to scrub
+ * our caches. For a RW vol, this will only change if the volume is
+ * restored from a backup; for a RO/Backup vol, this will advance when
+ * the volume is updated to a new snapshot (eg. "vos release").
+ */
+ if (volume->type == AFSVL_RWVOL)
+ goto regressed;
+ if (volume->type == AFSVL_BACKVOL) {
+ if (new < old)
+ goto regressed;
+ goto advance;
+ }
+
+ /* We have an RO volume, we need to query the VL server and look at the
+ * server flags to see if RW->RO replication is in progress.
+ */
+ ret = afs_is_server_excluded(op, volume);
+ if (ret < 0)
+ return ret;
+ if (ret > 0) {
+ snap = atomic_read(&volume->cb_ro_snapshot);
+ trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_volume_excluded);
+ return ret;
+ }
+
+advance:
+ snap = atomic_inc_return(&volume->cb_ro_snapshot);
+ trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_for_vos_release);
+ volume->creation_time = new;
+ return 0;
+
+regressed:
+ atomic_inc(&volume->cb_scrub);
+ trace_afs_cb_v_break(volume->vid, 0, afs_cb_break_for_creation_regress);
+ volume->creation_time = new;
+ return 0;
+}
+
+/*
+ * Handle a change to the volume update time in the VolSync record.
+ */
+static void afs_update_volume_update_time(struct afs_operation *op, struct afs_volume *volume)
+{
+ enum afs_cb_break_reason reason = afs_cb_break_no_break;
+ time64_t cur = volume->update_time;
+ time64_t old = op->pre_volsync.update;
+ time64_t new = op->volsync.update;
+
+ _enter("%llx,%llx,%llx->%llx", volume->vid, cur, old, new);
+
+ if (cur == TIME64_MIN) {
+ volume->update_time = new;
+ return;
+ }
+
+ if (new == cur)
+ return;
+
+ /* If the volume update time changes in an unexpected way, we need to
+ * scrub our caches. For a RW vol, this will advance on every
+ * modification op; for a RO/Backup vol, this will advance when the
+ * volume is updated to a new snapshot (eg. "vos release").
+ */
+ if (new < old)
+ reason = afs_cb_break_for_update_regress;
+
+ /* Try to advance the update timestamp from what we had before the
+ * operation to what we got back from the server. This should
+ * hopefully ensure that in a race between multiple operations only one
+ * of them will do this.
+ */
+ if (cur == old) {
+ if (reason == afs_cb_break_for_update_regress) {
+ atomic_inc(&volume->cb_scrub);
+ trace_afs_cb_v_break(volume->vid, 0, reason);
+ }
+ volume->update_time = new;
+ }
+}
+
+static int afs_update_volume_times(struct afs_operation *op, struct afs_volume *volume)
+{
+ int ret = 0;
+
+ if (likely(op->volsync.creation == volume->creation_time &&
+ op->volsync.update == volume->update_time))
+ return 0;
+
+ mutex_lock(&volume->volsync_lock);
+ if (op->volsync.creation != volume->creation_time) {
+ ret = afs_update_volume_creation_time(op, volume);
+ if (ret < 0)
+ goto out;
+ }
+ if (op->volsync.update != volume->update_time)
+ afs_update_volume_update_time(op, volume);
+out:
+ mutex_unlock(&volume->volsync_lock);
+ return ret;
+}
+
+/*
+ * Update the state of a volume, including recording the expiration time of the
+ * callback promise. Returns 1 to redo the operation from the start.
+ */
+int afs_update_volume_state(struct afs_operation *op)
+{
+ struct afs_server_list *slist = op->server_list;
+ struct afs_server_entry *se = &slist->servers[op->server_index];
+ struct afs_callback *cb = &op->file[0].scb.callback;
+ struct afs_volume *volume = op->volume;
+ unsigned int cb_v_break = atomic_read(&volume->cb_v_break);
+ unsigned int cb_v_check = atomic_read(&volume->cb_v_check);
+ int ret;
+
+ _enter("%llx", op->volume->vid);
+
+ if (op->volsync.creation != TIME64_MIN || op->volsync.update != TIME64_MIN) {
+ ret = afs_update_volume_times(op, volume);
+ if (ret != 0) {
+ _leave(" = %d", ret);
+ return ret;
+ }
+ }
+
+ if (op->cb_v_break == cb_v_break &&
+ (op->file[0].scb.have_cb || op->file[1].scb.have_cb)) {
+ time64_t expires_at = cb->expires_at;
+
+ if (!op->file[0].scb.have_cb)
+ expires_at = op->file[1].scb.callback.expires_at;
+
+ se->cb_expires_at = expires_at;
+ volume->cb_expires_at = expires_at;
+ }
+ if (cb_v_check < op->cb_v_break)
+ atomic_cmpxchg(&volume->cb_v_check, cb_v_check, op->cb_v_break);
+ return 0;
+}
+
+/*
+ * mark the data attached to an inode as obsolete due to a write on the server
+ * - might also want to ditch all the outstanding writes and dirty pages
+ */
+static void afs_zap_data(struct afs_vnode *vnode)
+{
+ _enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode);
+
+ afs_invalidate_cache(vnode, 0);
+
+ /* nuke all the non-dirty pages that aren't locked, mapped or being
+ * written back in a regular file and completely discard the pages in a
+ * directory or symlink */
+ if (S_ISREG(vnode->netfs.inode.i_mode))
+ invalidate_remote_inode(&vnode->netfs.inode);
+ else
+ invalidate_inode_pages2(vnode->netfs.inode.i_mapping);
+}
+
+/*
+ * validate a vnode/inode
+ * - there are several things we need to check
+ * - parent dir data changes (rm, rmdir, rename, mkdir, create, link,
+ * symlink)
+ * - parent dir metadata changed (security changes)
+ * - dentry data changed (write, truncate)
+ * - dentry metadata changed (security changes)
+ */
+int afs_validate(struct afs_vnode *vnode, struct key *key)
+{
+ struct afs_volume *volume = vnode->volume;
+ unsigned int cb_ro_snapshot, cb_scrub;
+ time64_t deadline = ktime_get_real_seconds() + 10;
+ bool zap = false, locked_vol = false;
+ int ret;
+
+ _enter("{v={%llx:%llu} fl=%lx},%x",
+ vnode->fid.vid, vnode->fid.vnode, vnode->flags,
+ key_serial(key));
+
+ if (afs_check_validity(vnode))
+ return 0;
+
+ ret = down_write_killable(&vnode->validate_lock);
+ if (ret < 0)
+ goto error;
+
+ /* Validate a volume after the v_break has changed or the volume
+ * callback expired. We only want to do this once per volume per
+ * v_break change. The actual work will be done when parsing the
+ * status fetch reply.
+ */
+ if (volume->cb_expires_at <= deadline ||
+ atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break)) {
+ ret = mutex_lock_interruptible(&volume->cb_check_lock);
+ if (ret < 0)
+ goto error_unlock;
+ locked_vol = true;
+ }
+
+ cb_ro_snapshot = atomic_read(&volume->cb_ro_snapshot);
+ cb_scrub = atomic_read(&volume->cb_scrub);
+ if (vnode->cb_ro_snapshot != cb_ro_snapshot ||
+ vnode->cb_scrub != cb_scrub)
+ unmap_mapping_pages(vnode->netfs.inode.i_mapping, 0, 0, false);
+
+ if (vnode->cb_ro_snapshot != cb_ro_snapshot ||
+ vnode->cb_scrub != cb_scrub ||
+ volume->cb_expires_at <= deadline ||
+ atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break) ||
+ atomic64_read(&vnode->cb_expires_at) <= deadline
+ ) {
+ ret = afs_fetch_status(vnode, key, false, NULL);
+ if (ret < 0) {
+ if (ret == -ENOENT) {
+ set_bit(AFS_VNODE_DELETED, &vnode->flags);
+ ret = -ESTALE;
+ }
+ goto error_unlock;
+ }
+
+ _debug("new promise [fl=%lx]", vnode->flags);
+ }
+
+ /* We can drop the volume lock now. */
+ if (locked_vol) {
+ mutex_unlock(&volume->cb_check_lock);
+ locked_vol = false;
+ }
+
+ cb_ro_snapshot = atomic_read(&volume->cb_ro_snapshot);
+ cb_scrub = atomic_read(&volume->cb_scrub);
+ _debug("vnode inval %x==%x %x==%x",
+ vnode->cb_ro_snapshot, cb_ro_snapshot,
+ vnode->cb_scrub, cb_scrub);
+ if (vnode->cb_scrub != cb_scrub)
+ zap = true;
+ vnode->cb_ro_snapshot = cb_ro_snapshot;
+ vnode->cb_scrub = cb_scrub;
+
+ if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
+ _debug("file already deleted");
+ ret = -ESTALE;
+ goto error_unlock;
+ }
+
+ /* if the vnode's data version number changed then its contents are
+ * different */
+ zap |= test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags);
+ if (zap)
+ afs_zap_data(vnode);
+ up_write(&vnode->validate_lock);
+ _leave(" = 0");
+ return 0;
+
+error_unlock:
+ if (locked_vol)
+ mutex_unlock(&volume->cb_check_lock);
+ up_write(&vnode->validate_lock);
+error:
+ _leave(" = %d", ret);
+ return ret;
+}
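
Point (5) of the scheme documented above — only the operation that sampled the old timestamp gets to advance it — can be isolated from the surrounding policy. A minimal sketch of the discipline, under the same mutex shape as afs_update_volume_times() but with hypothetical names:

	static DEFINE_MUTEX(volsync_lock);
	static time64_t cached_creation = TIME64_MIN;	/* TIME64_MIN == not yet sampled */

	/* @old: value sampled before the RPC; @new: value from the RPC reply.
	 * Returns true if this caller performed the update.
	 */
	static bool advance_creation_time(time64_t old, time64_t new)
	{
		bool advanced = false;

		mutex_lock(&volsync_lock);
		if (cached_creation == TIME64_MIN) {
			cached_creation = new;		/* first observation: adopt it */
			advanced = true;
		} else if (cached_creation == old && new != old) {
			cached_creation = new;		/* we witnessed the transition */
			advanced = true;
		}
		mutex_unlock(&volsync_lock);
		return advanced;
	}

A racing operation that sampled a stale @old simply falls through, so one server-side change is counted at most once.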
diff --git a/fs/afs/vl_alias.c b/fs/afs/vl_alias.c
index f04a80e4f5c3..9f36e14f1c2d 100644
--- a/fs/afs/vl_alias.c
+++ b/fs/afs/vl_alias.c
@@ -33,55 +33,6 @@ static struct afs_volume *afs_sample_volume(struct afs_cell *cell, struct key *k
}
/*
- * Compare two addresses.
- */
-static int afs_compare_addrs(const struct sockaddr_rxrpc *srx_a,
- const struct sockaddr_rxrpc *srx_b)
-{
- short port_a, port_b;
- int addr_a, addr_b, diff;
-
- diff = (short)srx_a->transport_type - (short)srx_b->transport_type;
- if (diff)
- goto out;
-
- switch (srx_a->transport_type) {
- case AF_INET: {
- const struct sockaddr_in *a = &srx_a->transport.sin;
- const struct sockaddr_in *b = &srx_b->transport.sin;
- addr_a = ntohl(a->sin_addr.s_addr);
- addr_b = ntohl(b->sin_addr.s_addr);
- diff = addr_a - addr_b;
- if (diff == 0) {
- port_a = ntohs(a->sin_port);
- port_b = ntohs(b->sin_port);
- diff = port_a - port_b;
- }
- break;
- }
-
- case AF_INET6: {
- const struct sockaddr_in6 *a = &srx_a->transport.sin6;
- const struct sockaddr_in6 *b = &srx_b->transport.sin6;
- diff = memcmp(&a->sin6_addr, &b->sin6_addr, 16);
- if (diff == 0) {
- port_a = ntohs(a->sin6_port);
- port_b = ntohs(b->sin6_port);
- diff = port_a - port_b;
- }
- break;
- }
-
- default:
- WARN_ON(1);
- diff = 1;
- }
-
-out:
- return diff;
-}
-
-/*
* Compare the address lists of a pair of fileservers.
*/
static int afs_compare_fs_alists(const struct afs_server *server_a,
@@ -90,13 +41,13 @@ static int afs_compare_fs_alists(const struct afs_server *server_a,
const struct afs_addr_list *la, *lb;
int a = 0, b = 0, addr_matches = 0;
- la = rcu_dereference(server_a->addresses);
- lb = rcu_dereference(server_b->addresses);
+ la = rcu_dereference(server_a->endpoint_state)->addresses;
+ lb = rcu_dereference(server_b->endpoint_state)->addresses;
while (a < la->nr_addrs && b < lb->nr_addrs) {
- const struct sockaddr_rxrpc *srx_a = &la->addrs[a];
- const struct sockaddr_rxrpc *srx_b = &lb->addrs[b];
- int diff = afs_compare_addrs(srx_a, srx_b);
+ unsigned long pa = (unsigned long)la->addrs[a].peer;
+ unsigned long pb = (unsigned long)lb->addrs[b].peer;
+ long diff = pa - pb;
if (diff < 0) {
a++;
@@ -126,7 +77,7 @@ static int afs_compare_volume_slists(const struct afs_volume *vol_a,
lb = rcu_dereference(vol_b->servers);
for (i = 0; i < AFS_MAXTYPES; i++)
- if (la->vids[i] != lb->vids[i])
+ if (vol_a->vids[i] != vol_b->vids[i])
return 0;
while (a < la->nr_servers && b < lb->nr_servers) {
@@ -205,7 +156,7 @@ static int afs_query_for_alias_one(struct afs_cell *cell, struct key *key,
/* And see if it's in the new cell. */
volume = afs_sample_volume(cell, key, pvol->name, pvol->name_len);
if (IS_ERR(volume)) {
- afs_put_volume(cell->net, pvol, afs_volume_trace_put_query_alias);
+ afs_put_volume(pvol, afs_volume_trace_put_query_alias);
if (PTR_ERR(volume) != -ENOMEDIUM)
return PTR_ERR(volume);
/* That volume is not in the new cell, so not an alias */
@@ -223,8 +174,8 @@ static int afs_query_for_alias_one(struct afs_cell *cell, struct key *key,
rcu_read_unlock();
}
- afs_put_volume(cell->net, volume, afs_volume_trace_put_query_alias);
- afs_put_volume(cell->net, pvol, afs_volume_trace_put_query_alias);
+ afs_put_volume(volume, afs_volume_trace_put_query_alias);
+ afs_put_volume(pvol, afs_volume_trace_put_query_alias);
return ret;
}
@@ -285,7 +236,7 @@ static char *afs_vl_get_cell_name(struct afs_cell *cell, struct key *key)
while (afs_select_vlserver(&vc)) {
if (!test_bit(AFS_VLSERVER_FL_IS_YFS, &vc.server->flags)) {
- vc.ac.error = -EOPNOTSUPP;
+ vc.call_error = -EOPNOTSUPP;
skipped = true;
continue;
}
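
With addresses now interned as shared rxrpc peer objects — one object per distinct transport address — the alias check above can compare pointers rather than decode sockaddrs, and the removed afs_compare_addrs() goes away entirely. The merge-style walk over two lists sorted by the same key, as a sketch:

	/* Count entries common to two arrays of interned pointers, both
	 * sorted by pointer value.  Runs in O(na + nb), mirroring the walk
	 * in afs_compare_fs_alists().
	 */
	static int count_common(void *const *a, int na, void *const *b, int nb)
	{
		int i = 0, j = 0, matches = 0;

		while (i < na && j < nb) {
			if (a[i] == b[j]) {
				matches++;
				i++;
				j++;
			} else if ((unsigned long)a[i] < (unsigned long)b[j]) {
				i++;
			} else {
				j++;
			}
		}
		return matches;
	}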
diff --git a/fs/afs/vl_list.c b/fs/afs/vl_list.c
index acc48216136a..9b1c20daac53 100644
--- a/fs/afs/vl_list.c
+++ b/fs/afs/vl_list.c
@@ -13,6 +13,7 @@ struct afs_vlserver *afs_alloc_vlserver(const char *name, size_t name_len,
unsigned short port)
{
struct afs_vlserver *vlserver;
+ static atomic_t debug_ids;
vlserver = kzalloc(struct_size(vlserver, name, name_len + 1),
GFP_KERNEL);
@@ -21,8 +22,10 @@ struct afs_vlserver *afs_alloc_vlserver(const char *name, size_t name_len,
rwlock_init(&vlserver->lock);
init_waitqueue_head(&vlserver->probe_wq);
spin_lock_init(&vlserver->probe_lock);
+ vlserver->debug_id = atomic_inc_return(&debug_ids);
vlserver->rtt = UINT_MAX;
vlserver->name_len = name_len;
+ vlserver->service_id = VL_SERVICE;
vlserver->port = port;
memcpy(vlserver->name, name, name_len);
}
@@ -33,7 +36,8 @@ static void afs_vlserver_rcu(struct rcu_head *rcu)
{
struct afs_vlserver *vlserver = container_of(rcu, struct afs_vlserver, rcu);
- afs_put_addrlist(rcu_access_pointer(vlserver->addresses));
+ afs_put_addrlist(rcu_access_pointer(vlserver->addresses),
+ afs_alist_trace_put_vlserver);
kfree_rcu(vlserver, rcu);
}
@@ -83,14 +87,15 @@ static u16 afs_extract_le16(const u8 **_b)
/*
* Build a VL server address list from a DNS queried server list.
*/
-static struct afs_addr_list *afs_extract_vl_addrs(const u8 **_b, const u8 *end,
+static struct afs_addr_list *afs_extract_vl_addrs(struct afs_net *net,
+ const u8 **_b, const u8 *end,
u8 nr_addrs, u16 port)
{
struct afs_addr_list *alist;
const u8 *b = *_b;
int ret = -EINVAL;
- alist = afs_alloc_addrlist(nr_addrs, VL_SERVICE, port);
+ alist = afs_alloc_addrlist(nr_addrs);
if (!alist)
return ERR_PTR(-ENOMEM);
if (nr_addrs == 0)
@@ -109,7 +114,9 @@ static struct afs_addr_list *afs_extract_vl_addrs(const u8 **_b, const u8 *end,
goto error;
}
memcpy(x, b, 4);
- afs_merge_fs_addr4(alist, x[0], port);
+ ret = afs_merge_fs_addr4(net, alist, x[0], port);
+ if (ret < 0)
+ goto error;
b += 4;
break;
@@ -119,7 +126,9 @@ static struct afs_addr_list *afs_extract_vl_addrs(const u8 **_b, const u8 *end,
goto error;
}
memcpy(x, b, 16);
- afs_merge_fs_addr6(alist, x, port);
+ ret = afs_merge_fs_addr6(net, alist, x, port);
+ if (ret < 0)
+ goto error;
b += 16;
break;
@@ -140,7 +149,7 @@ static struct afs_addr_list *afs_extract_vl_addrs(const u8 **_b, const u8 *end,
error:
*_b = b;
- afs_put_addrlist(alist);
+ afs_put_addrlist(alist, afs_alist_trace_put_parse_error);
return ERR_PTR(ret);
}
@@ -247,7 +256,7 @@ struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *cell,
/* Extract the addresses - note that we can't skip this as we
* have to advance the payload pointer.
*/
- addrs = afs_extract_vl_addrs(&b, end, bs.nr_addrs, bs.port);
+ addrs = afs_extract_vl_addrs(cell->net, &b, end, bs.nr_addrs, bs.port);
if (IS_ERR(addrs)) {
ret = PTR_ERR(addrs);
goto error_2;
@@ -255,7 +264,7 @@ struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *cell,
if (vllist->nr_servers >= nr_servers) {
_debug("skip %u >= %u", vllist->nr_servers, nr_servers);
- afs_put_addrlist(addrs);
+ afs_put_addrlist(addrs, afs_alist_trace_put_parse_empty);
afs_put_vlserver(cell->net, server);
continue;
}
@@ -264,7 +273,7 @@ struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *cell,
addrs->status = bs.status;
if (addrs->nr_addrs == 0) {
- afs_put_addrlist(addrs);
+ afs_put_addrlist(addrs, afs_alist_trace_put_parse_empty);
if (!rcu_access_pointer(server->addresses)) {
afs_put_vlserver(cell->net, server);
continue;
@@ -276,7 +285,7 @@ struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *cell,
old = rcu_replace_pointer(server->addresses, old,
lockdep_is_held(&server->lock));
write_unlock(&server->lock);
- afs_put_addrlist(old);
+ afs_put_addrlist(old, afs_alist_trace_put_vlserver_old);
}
diff --git a/fs/afs/vl_probe.c b/fs/afs/vl_probe.c
index 58452b86e672..3d2e0c925460 100644
--- a/fs/afs/vl_probe.c
+++ b/fs/afs/vl_probe.c
@@ -46,11 +46,12 @@ static void afs_done_one_vl_probe(struct afs_vlserver *server, bool wake_up)
*/
void afs_vlserver_probe_result(struct afs_call *call)
{
- struct afs_addr_list *alist = call->alist;
+ struct afs_addr_list *alist = call->vl_probe;
struct afs_vlserver *server = call->vlserver;
+ struct afs_address *addr = &alist->addrs[call->probe_index];
unsigned int server_index = call->server_index;
unsigned int rtt_us = 0;
- unsigned int index = call->addr_ix;
+ unsigned int index = call->probe_index;
bool have_result = false;
int ret = call->error;
@@ -89,7 +90,7 @@ void afs_vlserver_probe_result(struct afs_call *call)
case -ETIME:
default:
clear_bit(index, &alist->responded);
- set_bit(index, &alist->failed);
+ set_bit(index, &alist->probe_failed);
if (!(server->probe.flags & AFS_VLSERVER_PROBE_RESPONDED) &&
(server->probe.error == 0 ||
server->probe.error == -ETIMEDOUT ||
@@ -101,21 +102,21 @@ void afs_vlserver_probe_result(struct afs_call *call)
responded:
set_bit(index, &alist->responded);
- clear_bit(index, &alist->failed);
+ clear_bit(index, &alist->probe_failed);
if (call->service_id == YFS_VL_SERVICE) {
server->probe.flags |= AFS_VLSERVER_PROBE_IS_YFS;
set_bit(AFS_VLSERVER_FL_IS_YFS, &server->flags);
- alist->addrs[index].srx_service = call->service_id;
+ server->service_id = call->service_id;
} else {
server->probe.flags |= AFS_VLSERVER_PROBE_NOT_YFS;
if (!(server->probe.flags & AFS_VLSERVER_PROBE_IS_YFS)) {
clear_bit(AFS_VLSERVER_FL_IS_YFS, &server->flags);
- alist->addrs[index].srx_service = call->service_id;
+ server->service_id = call->service_id;
}
}
- rxrpc_kernel_get_srtt(call->net->socket, call->rxcall, &rtt_us);
+ rtt_us = rxrpc_kernel_get_srtt(addr->peer);
if (rtt_us < server->probe.rtt) {
server->probe.rtt = rtt_us;
server->rtt = rtt_us;
@@ -130,8 +131,10 @@ responded:
out:
spin_unlock(&server->probe_lock);
- _debug("probe [%u][%u] %pISpc rtt=%u ret=%d",
- server_index, index, &alist->addrs[index].transport, rtt_us, ret);
+ trace_afs_vl_probe(server, false, alist, index, call->error, call->abort_code, rtt_us);
+ _debug("probe [%u][%u] %pISpc rtt=%d ret=%d",
+ server_index, index, rxrpc_kernel_remote_addr(addr->peer),
+ rtt_us, ret);
afs_done_one_vl_probe(server, have_result);
}
@@ -146,35 +149,52 @@ static bool afs_do_probe_vlserver(struct afs_net *net,
unsigned int server_index,
struct afs_error *_e)
{
- struct afs_addr_cursor ac = {
- .index = 0,
- };
+ struct afs_addr_list *alist;
struct afs_call *call;
+ unsigned long unprobed;
+ unsigned int index, i;
bool in_progress = false;
+ int best_prio;
_enter("%s", server->name);
read_lock(&server->lock);
- ac.alist = rcu_dereference_protected(server->addresses,
- lockdep_is_held(&server->lock));
+ alist = rcu_dereference_protected(server->addresses,
+ lockdep_is_held(&server->lock));
+ afs_get_addrlist(alist, afs_alist_trace_get_vlprobe);
read_unlock(&server->lock);
- atomic_set(&server->probe_outstanding, ac.alist->nr_addrs);
+ atomic_set(&server->probe_outstanding, alist->nr_addrs);
memset(&server->probe, 0, sizeof(server->probe));
server->probe.rtt = UINT_MAX;
- for (ac.index = 0; ac.index < ac.alist->nr_addrs; ac.index++) {
- call = afs_vl_get_capabilities(net, &ac, key, server,
+ unprobed = (1UL << alist->nr_addrs) - 1;
+ while (unprobed) {
+ best_prio = -1;
+ index = 0;
+ for (i = 0; i < alist->nr_addrs; i++) {
+ if (test_bit(i, &unprobed) &&
+ alist->addrs[i].prio > best_prio) {
+ index = i;
+ best_prio = alist->addrs[i].prio;
+ }
+ }
+ __clear_bit(index, &unprobed);
+
+ trace_afs_vl_probe(server, true, alist, index, 0, 0, 0);
+ call = afs_vl_get_capabilities(net, alist, index, key, server,
server_index);
if (!IS_ERR(call)) {
+ afs_prioritise_error(_e, call->error, call->abort_code);
afs_put_call(call);
in_progress = true;
} else {
- afs_prioritise_error(_e, PTR_ERR(call), ac.abort_code);
+ afs_prioritise_error(_e, PTR_ERR(call), 0);
afs_done_one_vl_probe(server, false);
}
}
+ afs_put_addrlist(alist, afs_alist_trace_put_vlprobe);
return in_progress;
}
@@ -185,12 +205,10 @@ int afs_send_vl_probes(struct afs_net *net, struct key *key,
struct afs_vlserver_list *vllist)
{
struct afs_vlserver *server;
- struct afs_error e;
+ struct afs_error e = {};
bool in_progress = false;
int i;
- e.error = 0;
- e.responded = false;
for (i = 0; i < vllist->nr_servers; i++) {
server = vllist->servers[i].server;
if (test_bit(AFS_VLSERVER_FL_PROBED, &server->flags))
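
The probe loop in afs_do_probe_vlserver() now fires probes in descending priority order rather than array order, tracking what remains in a bitmask. The selection loop on its own (send_probe() is hypothetical; as in the patch, the bitmask assumes nr_addrs < BITS_PER_LONG, which the address list cap guarantees):

	static void send_probe(unsigned int index);	/* hypothetical */

	static void probe_in_prio_order(const short *prio, unsigned int nr_addrs)
	{
		unsigned long unprobed = (1UL << nr_addrs) - 1;
		unsigned int i, index;
		int best;

		while (unprobed) {
			/* Pick the highest-priority address not yet probed. */
			best = -1;
			index = 0;
			for (i = 0; i < nr_addrs; i++) {
				if (test_bit(i, &unprobed) && prio[i] > best) {
					index = i;
					best = prio[i];
				}
			}
			__clear_bit(index, &unprobed);
			send_probe(index);
		}
	}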
diff --git a/fs/afs/vl_rotate.c b/fs/afs/vl_rotate.c
index eb415ce56360..d8f79f6ada3d 100644
--- a/fs/afs/vl_rotate.c
+++ b/fs/afs/vl_rotate.c
@@ -17,18 +17,21 @@
bool afs_begin_vlserver_operation(struct afs_vl_cursor *vc, struct afs_cell *cell,
struct key *key)
{
+ static atomic_t debug_ids;
+
memset(vc, 0, sizeof(*vc));
vc->cell = cell;
vc->key = key;
- vc->error = -EDESTADDRREQ;
- vc->ac.error = SHRT_MAX;
+ vc->cumul_error.error = -EDESTADDRREQ;
+ vc->nr_iterations = -1;
if (signal_pending(current)) {
- vc->error = -EINTR;
+ vc->cumul_error.error = -EINTR;
vc->flags |= AFS_VL_CURSOR_STOP;
return false;
}
+ vc->debug_id = atomic_inc_return(&debug_ids);
return true;
}
@@ -52,7 +55,7 @@ static bool afs_start_vl_iteration(struct afs_vl_cursor *vc)
&cell->dns_lookup_count,
smp_load_acquire(&cell->dns_lookup_count)
!= dns_lookup_count) < 0) {
- vc->error = -ERESTARTSYS;
+ vc->cumul_error.error = -ERESTARTSYS;
return false;
}
}
@@ -60,12 +63,12 @@ static bool afs_start_vl_iteration(struct afs_vl_cursor *vc)
/* Status load is ordered after lookup counter load */
if (cell->dns_status == DNS_LOOKUP_GOT_NOT_FOUND) {
pr_warn("No record of cell %s\n", cell->name);
- vc->error = -ENOENT;
+ vc->cumul_error.error = -ENOENT;
return false;
}
if (cell->dns_source == DNS_RECORD_UNAVAILABLE) {
- vc->error = -EDESTADDRREQ;
+ vc->cumul_error.error = -EDESTADDRREQ;
return false;
}
}
@@ -78,8 +81,8 @@ static bool afs_start_vl_iteration(struct afs_vl_cursor *vc)
if (!vc->server_list->nr_servers)
return false;
- vc->untried = (1UL << vc->server_list->nr_servers) - 1;
- vc->index = -1;
+ vc->untried_servers = (1UL << vc->server_list->nr_servers) - 1;
+ vc->server_index = -1;
return true;
}
@@ -89,54 +92,57 @@ static bool afs_start_vl_iteration(struct afs_vl_cursor *vc)
*/
bool afs_select_vlserver(struct afs_vl_cursor *vc)
{
- struct afs_addr_list *alist;
+ struct afs_addr_list *alist = vc->alist;
struct afs_vlserver *vlserver;
- struct afs_error e;
- u32 rtt;
- int error = vc->ac.error, i;
+ unsigned long set, failed;
+ unsigned int rtt;
+ s32 abort_code = vc->call_abort_code;
+ int error = vc->call_error, i;
+
+ vc->nr_iterations++;
- _enter("%lx[%d],%lx[%d],%d,%d",
- vc->untried, vc->index,
- vc->ac.tried, vc->ac.index,
- error, vc->ac.abort_code);
+ _enter("VC=%x+%x,%d{%lx},%d{%lx},%d,%d",
+ vc->debug_id, vc->nr_iterations, vc->server_index, vc->untried_servers,
+ vc->addr_index, vc->addr_tried,
+ error, abort_code);
if (vc->flags & AFS_VL_CURSOR_STOP) {
_leave(" = f [stopped]");
return false;
}
- vc->nr_iterations++;
+ if (vc->nr_iterations == 0)
+ goto start;
+
+ WRITE_ONCE(alist->addrs[vc->addr_index].last_error, error);
/* Evaluate the result of the previous operation, if there was one. */
switch (error) {
- case SHRT_MAX:
- goto start;
-
default:
case 0:
/* Success or local failure. Stop. */
- vc->error = error;
+ vc->cumul_error.error = error;
vc->flags |= AFS_VL_CURSOR_STOP;
- _leave(" = f [okay/local %d]", vc->ac.error);
+ _leave(" = f [okay/local %d]", vc->cumul_error.error);
return false;
case -ECONNABORTED:
/* The far side rejected the operation on some grounds. This
* might involve the server being busy or the volume having been moved.
*/
- switch (vc->ac.abort_code) {
+ switch (abort_code) {
case AFSVL_IO:
case AFSVL_BADVOLOPER:
case AFSVL_NOMEM:
/* The server went weird. */
- vc->error = -EREMOTEIO;
+ afs_prioritise_error(&vc->cumul_error, -EREMOTEIO, abort_code);
//write_lock(&vc->cell->vl_servers_lock);
- //vc->server_list->weird_mask |= 1 << vc->index;
+ //vc->server_list->weird_mask |= 1 << vc->server_index;
//write_unlock(&vc->cell->vl_servers_lock);
goto next_server;
default:
- vc->error = afs_abort_to_error(vc->ac.abort_code);
+ afs_prioritise_error(&vc->cumul_error, error, abort_code);
goto failed;
}
@@ -149,12 +155,12 @@ bool afs_select_vlserver(struct afs_vl_cursor *vc)
case -ETIMEDOUT:
case -ETIME:
_debug("no conn %d", error);
- vc->error = error;
+ afs_prioritise_error(&vc->cumul_error, error, 0);
goto iterate_address;
case -ECONNRESET:
_debug("call reset");
- vc->error = error;
+ afs_prioritise_error(&vc->cumul_error, error, 0);
vc->flags |= AFS_VL_CURSOR_RETRY;
goto next_server;
@@ -165,7 +171,13 @@ bool afs_select_vlserver(struct afs_vl_cursor *vc)
restart_from_beginning:
_debug("restart");
- afs_end_cursor(&vc->ac);
+ if (vc->call_responded &&
+ vc->addr_index != vc->alist->preferred &&
+ test_bit(alist->preferred, &vc->addr_tried))
+ WRITE_ONCE(alist->preferred, vc->addr_index);
+ afs_put_addrlist(alist, afs_alist_trace_put_vlrotate_restart);
+ alist = vc->alist = NULL;
+
afs_put_vlserverlist(vc->cell->net, vc->server_list);
vc->server_list = NULL;
if (vc->flags & AFS_VL_CURSOR_RETRIED)
@@ -173,53 +185,58 @@ restart_from_beginning:
vc->flags |= AFS_VL_CURSOR_RETRIED;
start:
_debug("start");
+ ASSERTCMP(alist, ==, NULL);
if (!afs_start_vl_iteration(vc))
goto failed;
error = afs_send_vl_probes(vc->cell->net, vc->key, vc->server_list);
- if (error < 0)
- goto failed_set_error;
+ if (error < 0) {
+ afs_prioritise_error(&vc->cumul_error, error, 0);
+ goto failed;
+ }
pick_server:
- _debug("pick [%lx]", vc->untried);
+ _debug("pick [%lx]", vc->untried_servers);
+ ASSERTCMP(alist, ==, NULL);
- error = afs_wait_for_vl_probes(vc->server_list, vc->untried);
- if (error < 0)
- goto failed_set_error;
+ error = afs_wait_for_vl_probes(vc->server_list, vc->untried_servers);
+ if (error < 0) {
+ afs_prioritise_error(&vc->cumul_error, error, 0);
+ goto failed;
+ }
/* Pick the untried server with the lowest RTT. */
- vc->index = vc->server_list->preferred;
- if (test_bit(vc->index, &vc->untried))
+ vc->server_index = vc->server_list->preferred;
+ if (test_bit(vc->server_index, &vc->untried_servers))
goto selected_server;
- vc->index = -1;
- rtt = U32_MAX;
+ vc->server_index = -1;
+ rtt = UINT_MAX;
for (i = 0; i < vc->server_list->nr_servers; i++) {
struct afs_vlserver *s = vc->server_list->servers[i].server;
- if (!test_bit(i, &vc->untried) ||
+ if (!test_bit(i, &vc->untried_servers) ||
!test_bit(AFS_VLSERVER_FL_RESPONDING, &s->flags))
continue;
- if (s->probe.rtt < rtt) {
- vc->index = i;
+ if (s->probe.rtt <= rtt) {
+ vc->server_index = i;
rtt = s->probe.rtt;
}
}
- if (vc->index == -1)
+ if (vc->server_index == -1)
goto no_more_servers;
selected_server:
- _debug("use %d", vc->index);
- __clear_bit(vc->index, &vc->untried);
+ _debug("use %d", vc->server_index);
+ __clear_bit(vc->server_index, &vc->untried_servers);
/* We're starting on a different vlserver from the list. We need to
* check it, find its address list and probe its capabilities before we
* use it.
*/
- ASSERTCMP(vc->ac.alist, ==, NULL);
- vlserver = vc->server_list->servers[vc->index].server;
+ vlserver = vc->server_list->servers[vc->server_index].server;
vc->server = vlserver;
_debug("USING VLSERVER: %s", vlserver->name);
@@ -227,34 +244,48 @@ selected_server:
read_lock(&vlserver->lock);
alist = rcu_dereference_protected(vlserver->addresses,
lockdep_is_held(&vlserver->lock));
- afs_get_addrlist(alist);
+ vc->alist = afs_get_addrlist(alist, afs_alist_trace_get_vlrotate_set);
read_unlock(&vlserver->lock);
- memset(&vc->ac, 0, sizeof(vc->ac));
-
- if (!vc->ac.alist)
- vc->ac.alist = alist;
- else
- afs_put_addrlist(alist);
-
- vc->ac.index = -1;
+ vc->addr_tried = 0;
+ vc->addr_index = -1;
iterate_address:
- ASSERT(vc->ac.alist);
/* Iterate over the current server's address list to try and find an
* address on which it will respond to us.
*/
- if (!afs_iterate_addresses(&vc->ac))
+ set = READ_ONCE(alist->responded);
+ failed = READ_ONCE(alist->probe_failed);
+ vc->addr_index = READ_ONCE(alist->preferred);
+
+ _debug("%lx-%lx-%lx,%d", set, failed, vc->addr_tried, vc->addr_index);
+
+ set &= ~(failed | vc->addr_tried);
+
+ if (!set)
goto next_server;
- _debug("VL address %d/%d", vc->ac.index, vc->ac.alist->nr_addrs);
+ if (!test_bit(vc->addr_index, &set))
+ vc->addr_index = __ffs(set);
+
+ set_bit(vc->addr_index, &vc->addr_tried);
+ vc->alist = alist;
- _leave(" = t %pISpc", &vc->ac.alist->addrs[vc->ac.index].transport);
+ _debug("VL address %d/%d", vc->addr_index, alist->nr_addrs);
+
+ vc->call_responded = false;
+ _leave(" = t %pISpc", rxrpc_kernel_remote_addr(alist->addrs[vc->addr_index].peer));
return true;
next_server:
_debug("next");
- afs_end_cursor(&vc->ac);
+ ASSERT(alist);
+ if (vc->call_responded &&
+ vc->addr_index != alist->preferred &&
+ test_bit(alist->preferred, &vc->addr_tried))
+ WRITE_ONCE(alist->preferred, vc->addr_index);
+ afs_put_addrlist(alist, afs_alist_trace_put_vlrotate_next);
+ alist = vc->alist = NULL;
goto pick_server;
no_more_servers:
@@ -264,25 +295,26 @@ no_more_servers:
if (vc->flags & AFS_VL_CURSOR_RETRY)
goto restart_from_beginning;
- e.error = -EDESTADDRREQ;
- e.responded = false;
for (i = 0; i < vc->server_list->nr_servers; i++) {
struct afs_vlserver *s = vc->server_list->servers[i].server;
if (test_bit(AFS_VLSERVER_FL_RESPONDING, &s->flags))
- e.responded = true;
- afs_prioritise_error(&e, READ_ONCE(s->probe.error),
+ vc->cumul_error.responded = true;
+ afs_prioritise_error(&vc->cumul_error, READ_ONCE(s->probe.error),
s->probe.abort_code);
}
- error = e.error;
-
-failed_set_error:
- vc->error = error;
failed:
+ if (alist) {
+ if (vc->call_responded &&
+ vc->addr_index != alist->preferred &&
+ test_bit(alist->preferred, &vc->addr_tried))
+ WRITE_ONCE(alist->preferred, vc->addr_index);
+ afs_put_addrlist(alist, afs_alist_trace_put_vlrotate_fail);
+ alist = vc->alist = NULL;
+ }
vc->flags |= AFS_VL_CURSOR_STOP;
- afs_end_cursor(&vc->ac);
- _leave(" = f [failed %d]", vc->error);
+ _leave(" = f [failed %d]", vc->cumul_error.error);
return false;
}
@@ -305,7 +337,10 @@ static void afs_vl_dump_edestaddrreq(const struct afs_vl_cursor *vc)
pr_notice("DNS: src=%u st=%u lc=%x\n",
cell->dns_source, cell->dns_status, cell->dns_lookup_count);
pr_notice("VC: ut=%lx ix=%u ni=%hu fl=%hx err=%hd\n",
- vc->untried, vc->index, vc->nr_iterations, vc->flags, vc->error);
+ vc->untried_servers, vc->server_index, vc->nr_iterations,
+ vc->flags, vc->cumul_error.error);
+ pr_notice("VC: call er=%d ac=%d r=%u\n",
+ vc->call_error, vc->call_abort_code, vc->call_responded);
if (vc->server_list) {
const struct afs_vlserver_list *sl = vc->server_list;
@@ -322,16 +357,14 @@ static void afs_vl_dump_edestaddrreq(const struct afs_vl_cursor *vc)
a->nr_ipv4, a->nr_addrs, a->max_addrs,
a->preferred);
pr_notice("VC: - R=%lx F=%lx\n",
- a->responded, a->failed);
- if (a == vc->ac.alist)
+ a->responded, a->probe_failed);
+ if (a == vc->alist)
pr_notice("VC: - current\n");
}
}
}
- pr_notice("AC: t=%lx ax=%u ac=%d er=%d r=%u ni=%u\n",
- vc->ac.tried, vc->ac.index, vc->ac.abort_code, vc->ac.error,
- vc->ac.responded, vc->ac.nr_iterations);
+ pr_notice("AC: t=%lx ax=%u\n", vc->addr_tried, vc->addr_index);
rcu_read_unlock();
}
@@ -342,17 +375,25 @@ int afs_end_vlserver_operation(struct afs_vl_cursor *vc)
{
struct afs_net *net = vc->cell->net;
- if (vc->error == -EDESTADDRREQ ||
- vc->error == -EADDRNOTAVAIL ||
- vc->error == -ENETUNREACH ||
- vc->error == -EHOSTUNREACH)
+ _enter("VC=%x+%x", vc->debug_id, vc->nr_iterations);
+
+ switch (vc->cumul_error.error) {
+ case -EDESTADDRREQ:
+ case -EADDRNOTAVAIL:
+ case -ENETUNREACH:
+ case -EHOSTUNREACH:
afs_vl_dump_edestaddrreq(vc);
+ break;
+ }
- afs_end_cursor(&vc->ac);
+ if (vc->alist) {
+ if (vc->call_responded &&
+ vc->addr_index != vc->alist->preferred &&
+ test_bit(vc->alist->preferred, &vc->addr_tried))
+ WRITE_ONCE(vc->alist->preferred, vc->addr_index);
+ afs_put_addrlist(vc->alist, afs_alist_trace_put_vlrotate_end);
+ vc->alist = NULL;
+ }
afs_put_vlserverlist(net, vc->server_list);
-
- if (vc->error == -ECONNABORTED)
- vc->error = afs_abort_to_error(vc->ac.abort_code);
-
- return vc->error;
+ return vc->cumul_error.error;
}
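
The rotation state that used to live in afs_addr_cursor is now three bitmasks plus an index, and choosing the next address becomes pure bit arithmetic: start from the probe responders, mask off failures and addresses already tried this rotation, and keep the remembered favourite if it is still eligible. A sketch of just that selection:

	/* Return the next address to try, or -1 if none remain eligible.
	 * @tried is updated with the chosen index.
	 */
	static int pick_next_addr(unsigned long responded, unsigned long probe_failed,
				  unsigned long *tried, unsigned int preferred)
	{
		unsigned long set = responded & ~(probe_failed | *tried);
		unsigned int index;

		if (!set)
			return -1;		/* move on to the next server */
		index = test_bit(preferred, &set) ? preferred : __ffs(set);
		__set_bit(index, tried);
		return index;
	}

On the way out, the rotation promotes whichever address actually responded to ->preferred — the WRITE_ONCE(alist->preferred, vc->addr_index) pattern repeated at each exit path above.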
diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c
index 00fca3c66ba6..cac75f89b64a 100644
--- a/fs/afs/vlclient.c
+++ b/fs/afs/vlclient.c
@@ -18,8 +18,7 @@ static int afs_deliver_vl_get_entry_by_name_u(struct afs_call *call)
{
struct afs_uvldbentry__xdr *uvldb;
struct afs_vldb_entry *entry;
- bool new_only = false;
- u32 tmp, nr_servers, vlflags;
+ u32 nr_servers, vlflags;
int i, ret;
_enter("");
@@ -41,27 +40,14 @@ static int afs_deliver_vl_get_entry_by_name_u(struct afs_call *call)
entry->name[i] = 0;
entry->name_len = strlen(entry->name);
- /* If there is a new replication site that we can use, ignore all the
- * sites that aren't marked as new.
- */
- for (i = 0; i < nr_servers; i++) {
- tmp = ntohl(uvldb->serverFlags[i]);
- if (!(tmp & AFS_VLSF_DONTUSE) &&
- (tmp & AFS_VLSF_NEWREPSITE))
- new_only = true;
- }
-
vlflags = ntohl(uvldb->flags);
for (i = 0; i < nr_servers; i++) {
struct afs_uuid__xdr *xdr;
struct afs_uuid *uuid;
+ u32 tmp = ntohl(uvldb->serverFlags[i]);
int j;
int n = entry->nr_servers;
- tmp = ntohl(uvldb->serverFlags[i]);
- if (tmp & AFS_VLSF_DONTUSE ||
- (new_only && !(tmp & AFS_VLSF_NEWREPSITE)))
- continue;
if (tmp & AFS_VLSF_RWVOL) {
entry->fs_mask[n] |= AFS_VOL_VTM_RW;
if (vlflags & AFS_VLF_BACKEXISTS)
@@ -82,6 +68,7 @@ static int afs_deliver_vl_get_entry_by_name_u(struct afs_call *call)
for (j = 0; j < 6; j++)
uuid->node[j] = (u8)ntohl(xdr->node[j]);
+ entry->vlsf_flags[n] = tmp;
entry->addr_version[n] = ntohl(uvldb->serverUnique[i]);
entry->nr_servers++;
}
@@ -106,12 +93,6 @@ static int afs_deliver_vl_get_entry_by_name_u(struct afs_call *call)
return 0;
}
-static void afs_destroy_vl_get_entry_by_name_u(struct afs_call *call)
-{
- kfree(call->ret_vldb);
- afs_flat_call_destructor(call);
-}
-
/*
* VL.GetEntryByNameU operation type.
*/
@@ -119,7 +100,7 @@ static const struct afs_call_type afs_RXVLGetEntryByNameU = {
.name = "VL.GetEntryByNameU",
.op = afs_VL_GetEntryByNameU,
.deliver = afs_deliver_vl_get_entry_by_name_u,
- .destructor = afs_destroy_vl_get_entry_by_name_u,
+ .destructor = afs_flat_call_destructor,
};
/*
@@ -155,6 +136,8 @@ struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_vl_cursor *vc,
call->key = vc->key;
call->ret_vldb = entry;
call->max_lifespan = AFS_VL_MAX_LIFESPAN;
+ call->peer = rxrpc_kernel_get_peer(vc->alist->addrs[vc->addr_index].peer);
+ call->service_id = vc->server->service_id;
/* Marshall the parameters */
bp = call->request;
@@ -165,8 +148,17 @@ struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_vl_cursor *vc,
memset((void *)bp + volnamesz, 0, padsz);
trace_afs_make_vl_call(call);
- afs_make_call(&vc->ac, call, GFP_KERNEL);
- return (struct afs_vldb_entry *)afs_wait_for_call_to_complete(call, &vc->ac);
+ afs_make_call(call, GFP_KERNEL);
+ afs_wait_for_call_to_complete(call);
+ vc->call_abort_code = call->abort_code;
+ vc->call_error = call->error;
+ vc->call_responded = call->responded;
+ afs_put_call(call);
+ if (vc->call_error) {
+ kfree(entry);
+ return ERR_PTR(vc->call_error);
+ }
+ return entry;
}
/*
@@ -208,7 +200,7 @@ static int afs_deliver_vl_get_addrs_u(struct afs_call *call)
count = ntohl(*bp);
nentries = min(nentries, count);
- alist = afs_alloc_addrlist(nentries, FS_SERVICE, AFS_FS_PORT);
+ alist = afs_alloc_addrlist(nentries);
if (!alist)
return -ENOMEM;
alist->version = uniquifier;
@@ -230,9 +222,13 @@ static int afs_deliver_vl_get_addrs_u(struct afs_call *call)
alist = call->ret_alist;
bp = call->buffer;
count = min(call->count, 4U);
- for (i = 0; i < count; i++)
- if (alist->nr_addrs < call->count2)
- afs_merge_fs_addr4(alist, *bp++, AFS_FS_PORT);
+ for (i = 0; i < count; i++) {
+ if (alist->nr_addrs < call->count2) {
+ ret = afs_merge_fs_addr4(call->net, alist, *bp++, AFS_FS_PORT);
+ if (ret < 0)
+ return ret;
+ }
+ }
call->count -= count;
if (call->count > 0)
@@ -245,12 +241,6 @@ static int afs_deliver_vl_get_addrs_u(struct afs_call *call)
return 0;
}
-static void afs_vl_get_addrs_u_destructor(struct afs_call *call)
-{
- afs_put_addrlist(call->ret_alist);
- return afs_flat_call_destructor(call);
-}
-
/*
* VL.GetAddrsU operation type.
*/
@@ -258,7 +248,7 @@ static const struct afs_call_type afs_RXVLGetAddrsU = {
.name = "VL.GetAddrsU",
.op = afs_VL_GetAddrsU,
.deliver = afs_deliver_vl_get_addrs_u,
- .destructor = afs_vl_get_addrs_u_destructor,
+ .destructor = afs_flat_call_destructor,
};
/*
@@ -269,6 +259,7 @@ struct afs_addr_list *afs_vl_get_addrs_u(struct afs_vl_cursor *vc,
const uuid_t *uuid)
{
struct afs_ListAddrByAttributes__xdr *r;
+ struct afs_addr_list *alist;
const struct afs_uuid *u = (const struct afs_uuid *)uuid;
struct afs_call *call;
struct afs_net *net = vc->cell->net;
@@ -286,6 +277,8 @@ struct afs_addr_list *afs_vl_get_addrs_u(struct afs_vl_cursor *vc,
call->key = vc->key;
call->ret_alist = NULL;
call->max_lifespan = AFS_VL_MAX_LIFESPAN;
+ call->peer = rxrpc_kernel_get_peer(vc->alist->addrs[vc->addr_index].peer);
+ call->service_id = vc->server->service_id;
/* Marshall the parameters */
bp = call->request;
@@ -304,8 +297,18 @@ struct afs_addr_list *afs_vl_get_addrs_u(struct afs_vl_cursor *vc,
r->uuid.node[i] = htonl(u->node[i]);
trace_afs_make_vl_call(call);
- afs_make_call(&vc->ac, call, GFP_KERNEL);
- return (struct afs_addr_list *)afs_wait_for_call_to_complete(call, &vc->ac);
+ afs_make_call(call, GFP_KERNEL);
+ afs_wait_for_call_to_complete(call);
+ vc->call_abort_code = call->abort_code;
+ vc->call_error = call->error;
+ vc->call_responded = call->responded;
+ alist = call->ret_alist;
+ afs_put_call(call);
+ if (vc->call_error) {
+ afs_put_addrlist(alist, afs_alist_trace_put_getaddru);
+ return ERR_PTR(vc->call_error);
+ }
+ return alist;
}
/*
@@ -355,6 +358,7 @@ static int afs_deliver_vl_get_capabilities(struct afs_call *call)
static void afs_destroy_vl_get_capabilities(struct afs_call *call)
{
+ afs_put_addrlist(call->vl_probe, afs_alist_trace_put_vlgetcaps);
afs_put_vlserver(call->net, call->vlserver);
afs_flat_call_destructor(call);
}
@@ -378,7 +382,8 @@ static const struct afs_call_type afs_RXVLGetCapabilities = {
* other end supports.
*/
struct afs_call *afs_vl_get_capabilities(struct afs_net *net,
- struct afs_addr_cursor *ac,
+ struct afs_addr_list *alist,
+ unsigned int addr_index,
struct key *key,
struct afs_vlserver *server,
unsigned int server_index)
@@ -395,6 +400,10 @@ struct afs_call *afs_vl_get_capabilities(struct afs_net *net,
call->key = key;
call->vlserver = afs_get_vlserver(server);
call->server_index = server_index;
+ call->peer = rxrpc_kernel_get_peer(alist->addrs[addr_index].peer);
+ call->vl_probe = afs_get_addrlist(alist, afs_alist_trace_get_vlgetcaps);
+ call->probe_index = addr_index;
+ call->service_id = server->service_id;
call->upgrade = true;
call->async = true;
call->max_lifespan = AFS_PROBE_MAX_LIFESPAN;
@@ -405,7 +414,7 @@ struct afs_call *afs_vl_get_capabilities(struct afs_net *net,
/* Can't take a ref on server */
trace_afs_make_vl_call(call);
- afs_make_call(ac, call, GFP_KERNEL);
+ afs_make_call(call, GFP_KERNEL);
return call;
}
@@ -450,7 +459,7 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call)
if (call->count > YFS_MAXENDPOINTS)
return afs_protocol_error(call, afs_eproto_yvl_fsendpt_num);
- alist = afs_alloc_addrlist(call->count, FS_SERVICE, AFS_FS_PORT);
+ alist = afs_alloc_addrlist(call->count);
if (!alist)
return -ENOMEM;
alist->version = uniquifier;
@@ -488,14 +497,18 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call)
if (ntohl(bp[0]) != sizeof(__be32) * 2)
return afs_protocol_error(
call, afs_eproto_yvl_fsendpt4_len);
- afs_merge_fs_addr4(alist, bp[1], ntohl(bp[2]));
+ ret = afs_merge_fs_addr4(call->net, alist, bp[1], ntohl(bp[2]));
+ if (ret < 0)
+ return ret;
bp += 3;
break;
case YFS_ENDPOINT_IPV6:
if (ntohl(bp[0]) != sizeof(__be32) * 5)
return afs_protocol_error(
call, afs_eproto_yvl_fsendpt6_len);
- afs_merge_fs_addr6(alist, bp + 1, ntohl(bp[5]));
+ ret = afs_merge_fs_addr6(call->net, alist, bp + 1, ntohl(bp[5]));
+ if (ret < 0)
+ return ret;
bp += 6;
break;
default:
@@ -610,7 +623,7 @@ static const struct afs_call_type afs_YFSVLGetEndpoints = {
.name = "YFSVL.GetEndpoints",
.op = afs_YFSVL_GetEndpoints,
.deliver = afs_deliver_yfsvl_get_endpoints,
- .destructor = afs_vl_get_addrs_u_destructor,
+ .destructor = afs_flat_call_destructor,
};
/*
@@ -620,6 +633,7 @@ static const struct afs_call_type afs_YFSVLGetEndpoints = {
struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_vl_cursor *vc,
const uuid_t *uuid)
{
+ struct afs_addr_list *alist;
struct afs_call *call;
struct afs_net *net = vc->cell->net;
__be32 *bp;
@@ -635,6 +649,8 @@ struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_vl_cursor *vc,
call->key = vc->key;
call->ret_alist = NULL;
call->max_lifespan = AFS_VL_MAX_LIFESPAN;
+ call->peer = rxrpc_kernel_get_peer(vc->alist->addrs[vc->addr_index].peer);
+ call->service_id = vc->server->service_id;
/* Marshall the parameters */
bp = call->request;
@@ -643,8 +659,18 @@ struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_vl_cursor *vc,
memcpy(bp, uuid, sizeof(*uuid)); /* Type opr_uuid */
trace_afs_make_vl_call(call);
- afs_make_call(&vc->ac, call, GFP_KERNEL);
- return (struct afs_addr_list *)afs_wait_for_call_to_complete(call, &vc->ac);
+ afs_make_call(call, GFP_KERNEL);
+ afs_wait_for_call_to_complete(call);
+ vc->call_abort_code = call->abort_code;
+ vc->call_error = call->error;
+ vc->call_responded = call->responded;
+ alist = call->ret_alist;
+ afs_put_call(call);
+ if (vc->call_error) {
+ afs_put_addrlist(alist, afs_alist_trace_put_getaddru);
+ return ERR_PTR(vc->call_error);
+ }
+ return alist;
}
/*
@@ -709,12 +735,6 @@ static int afs_deliver_yfsvl_get_cell_name(struct afs_call *call)
return 0;
}
-static void afs_destroy_yfsvl_get_cell_name(struct afs_call *call)
-{
- kfree(call->ret_str);
- afs_flat_call_destructor(call);
-}
-
/*
* VL.GetCapabilities operation type
*/
@@ -722,7 +742,7 @@ static const struct afs_call_type afs_YFSVLGetCellName = {
.name = "YFSVL.GetCellName",
.op = afs_YFSVL_GetCellName,
.deliver = afs_deliver_yfsvl_get_cell_name,
- .destructor = afs_destroy_yfsvl_get_cell_name,
+ .destructor = afs_flat_call_destructor,
};
/*
@@ -737,6 +757,7 @@ char *afs_yfsvl_get_cell_name(struct afs_vl_cursor *vc)
struct afs_call *call;
struct afs_net *net = vc->cell->net;
__be32 *bp;
+ char *cellname;
_enter("");
@@ -747,6 +768,8 @@ char *afs_yfsvl_get_cell_name(struct afs_vl_cursor *vc)
call->key = vc->key;
call->ret_str = NULL;
call->max_lifespan = AFS_VL_MAX_LIFESPAN;
+ call->peer = rxrpc_kernel_get_peer(vc->alist->addrs[vc->addr_index].peer);
+ call->service_id = vc->server->service_id;
/* marshall the parameters */
bp = call->request;
@@ -754,6 +777,16 @@ char *afs_yfsvl_get_cell_name(struct afs_vl_cursor *vc)
/* Can't take a ref on server */
trace_afs_make_vl_call(call);
- afs_make_call(&vc->ac, call, GFP_KERNEL);
- return (char *)afs_wait_for_call_to_complete(call, &vc->ac);
+ afs_make_call(call, GFP_KERNEL);
+ afs_wait_for_call_to_complete(call);
+ vc->call_abort_code = call->abort_code;
+ vc->call_error = call->error;
+ vc->call_responded = call->responded;
+ cellname = call->ret_str;
+ afs_put_call(call);
+ if (vc->call_error) {
+ kfree(cellname);
+ return ERR_PTR(vc->call_error);
+ }
+ return cellname;
}
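The three VL RPC wrappers above (VL.GetAddrsU, YFSVL.GetEndpoints and YFSVL.GetCellName) now end with the same synchronous completion sequence: wait for the call, copy the completion state into the cursor, grab the result, drop the call reference and turn a cursor error into an ERR_PTR. A condensed sketch of that sequence; the factoring is illustrative only, since each wrapper has to release a different result type on the error path, which is presumably why the code is not shared:

	afs_make_call(call, GFP_KERNEL);
	afs_wait_for_call_to_complete(call);
	vc->call_abort_code = call->abort_code;    /* capture before the put */
	vc->call_error      = call->error;
	vc->call_responded  = call->responded;
	result = call->ret_alist;                  /* or call->ret_str */
	afs_put_call(call);                        /* drops the call's refs */
	if (vc->call_error) {
		/* release 'result': afs_put_addrlist() or kfree() */
		return ERR_PTR(vc->call_error);
	}
	return result;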
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index 115c081a8e2c..020ecd45e476 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -11,6 +11,8 @@
static unsigned __read_mostly afs_volume_record_life = 60 * 60;
+static void afs_destroy_volume(struct work_struct *work);
+
/*
* Insert a volume into a cell. If there's an existing volume record, that is
* returned instead with a ref held.
@@ -72,11 +74,11 @@ static void afs_remove_volume_from_cell(struct afs_volume *volume)
*/
static struct afs_volume *afs_alloc_volume(struct afs_fs_context *params,
struct afs_vldb_entry *vldb,
- unsigned long type_mask)
+ struct afs_server_list **_slist)
{
struct afs_server_list *slist;
struct afs_volume *volume;
- int ret = -ENOMEM;
+ int ret = -ENOMEM, i;
volume = kzalloc(sizeof(struct afs_volume), GFP_KERNEL);
if (!volume)
@@ -88,20 +90,30 @@ static struct afs_volume *afs_alloc_volume(struct afs_fs_context *params,
volume->type = params->type;
volume->type_force = params->force;
volume->name_len = vldb->name_len;
+ volume->creation_time = TIME64_MIN;
+ volume->update_time = TIME64_MIN;
refcount_set(&volume->ref, 1);
INIT_HLIST_NODE(&volume->proc_link);
+ INIT_WORK(&volume->destructor, afs_destroy_volume);
rwlock_init(&volume->servers_lock);
+ mutex_init(&volume->volsync_lock);
+ mutex_init(&volume->cb_check_lock);
rwlock_init(&volume->cb_v_break_lock);
+ INIT_LIST_HEAD(&volume->open_mmaps);
+ init_rwsem(&volume->open_mmaps_lock);
memcpy(volume->name, vldb->name, vldb->name_len + 1);
- slist = afs_alloc_server_list(params->cell, params->key, vldb, type_mask);
+ for (i = 0; i < AFS_MAXTYPES; i++)
+ volume->vids[i] = vldb->vid[i];
+
+ slist = afs_alloc_server_list(volume, params->key, vldb);
if (IS_ERR(slist)) {
ret = PTR_ERR(slist);
goto error_1;
}
- refcount_set(&slist->usage, 1);
+ *_slist = slist;
rcu_assign_pointer(volume->servers, slist);
trace_afs_volume(volume->vid, 1, afs_volume_trace_alloc);
return volume;
@@ -117,18 +129,20 @@ error_0:
* Look up or allocate a volume record.
*/
static struct afs_volume *afs_lookup_volume(struct afs_fs_context *params,
- struct afs_vldb_entry *vldb,
- unsigned long type_mask)
+ struct afs_vldb_entry *vldb)
{
+ struct afs_server_list *slist;
struct afs_volume *candidate, *volume;
- candidate = afs_alloc_volume(params, vldb, type_mask);
+ candidate = afs_alloc_volume(params, vldb, &slist);
if (IS_ERR(candidate))
return candidate;
volume = afs_insert_volume_into_cell(params->cell, candidate);
- if (volume != candidate)
- afs_put_volume(params->net, candidate, afs_volume_trace_put_cell_dup);
+ if (volume == candidate)
+ afs_attach_volume_to_servers(volume, slist);
+ else
+ afs_put_volume(candidate, afs_volume_trace_put_cell_dup);
return volume;
}
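The new *_slist out-parameter exists because of the insertion race handled above: if the candidate volume loses to one already in the cell, its freshly built server list must not be attached. The same flow restated with reference ownership noted in comments (the comments are editorial):

	candidate = afs_alloc_volume(params, vldb, &slist);  /* slist: one ref, ours */
	if (IS_ERR(candidate))
		return candidate;
	volume = afs_insert_volume_into_cell(params->cell, candidate);
	if (volume == candidate)
		afs_attach_volume_to_servers(volume, slist);  /* we won the race */
	else
		/* loser: dropping the candidate tears down slist with it */
		afs_put_volume(candidate, afs_volume_trace_put_cell_dup);
	return volume;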
@@ -208,8 +222,7 @@ struct afs_volume *afs_create_volume(struct afs_fs_context *params)
goto error;
}
- type_mask = 1UL << params->type;
- volume = afs_lookup_volume(params, vldb, type_mask);
+ volume = afs_lookup_volume(params, vldb);
error:
kfree(vldb);
@@ -219,16 +232,20 @@ error:
/*
* Destroy a volume record
*/
-static void afs_destroy_volume(struct afs_net *net, struct afs_volume *volume)
+static void afs_destroy_volume(struct work_struct *work)
{
+ struct afs_volume *volume = container_of(work, struct afs_volume, destructor);
+ struct afs_server_list *slist = rcu_access_pointer(volume->servers);
+
_enter("%p", volume);
#ifdef CONFIG_AFS_FSCACHE
ASSERTCMP(volume->cache, ==, NULL);
#endif
+ afs_detach_volume_from_servers(volume, slist);
afs_remove_volume_from_cell(volume);
- afs_put_serverlist(net, rcu_access_pointer(volume->servers));
+ afs_put_serverlist(volume->cell->net, slist);
afs_put_cell(volume->cell, afs_cell_trace_put_vol);
trace_afs_volume(volume->vid, refcount_read(&volume->ref),
afs_volume_trace_free);
@@ -270,8 +287,7 @@ struct afs_volume *afs_get_volume(struct afs_volume *volume,
/*
* Drop a reference on a volume record.
*/
-void afs_put_volume(struct afs_net *net, struct afs_volume *volume,
- enum afs_volume_trace reason)
+void afs_put_volume(struct afs_volume *volume, enum afs_volume_trace reason)
{
if (volume) {
afs_volid_t vid = volume->vid;
@@ -281,7 +297,7 @@ void afs_put_volume(struct afs_net *net, struct afs_volume *volume,
zero = __refcount_dec_and_test(&volume->ref, &r);
trace_afs_volume(vid, r - 1, reason);
if (zero)
- afs_destroy_volume(net, volume);
+ schedule_work(&volume->destructor);
}
}
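Dropping the last reference now only schedules the destructor instead of running it inline; afs_destroy_volume() detaches the volume from its servers and touches the cell, work that is better done from process context than from whatever context the final put happens in (the exact motivation is an assumption here). The shape of the pattern:

	INIT_WORK(&volume->destructor, afs_destroy_volume);   /* at alloc time */
	...
	zero = __refcount_dec_and_test(&volume->ref, &r);
	trace_afs_volume(vid, r - 1, reason);
	if (zero)
		schedule_work(&volume->destructor);           /* defer teardown */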
@@ -362,8 +378,7 @@ static int afs_update_volume_status(struct afs_volume *volume, struct key *key)
}
/* See if the volume's server list got updated. */
- new = afs_alloc_server_list(volume->cell, key,
- vldb, (1 << volume->type));
+ new = afs_alloc_server_list(volume, key, vldb);
if (IS_ERR(new)) {
ret = PTR_ERR(new);
goto error_vldb;
@@ -382,11 +397,17 @@ static int afs_update_volume_status(struct afs_volume *volume, struct key *key)
discard = old;
}
- volume->update_at = ktime_get_real_seconds() + afs_volume_record_life;
+ /* Check more often if replication is ongoing. */
+ if (new->ro_replicating)
+ volume->update_at = ktime_get_real_seconds() + 10 * 60;
+ else
+ volume->update_at = ktime_get_real_seconds() + afs_volume_record_life;
write_unlock(&volume->servers_lock);
- ret = 0;
+ if (discard == old)
+ afs_reattach_volume_to_servers(volume, new, old);
afs_put_serverlist(volume->cell->net, discard);
+ ret = 0;
error_vldb:
kfree(vldb);
error:
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 4a168781936b..61d34ad2ca7d 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -242,7 +242,7 @@ static void afs_kill_pages(struct address_space *mapping,
folio_clear_uptodate(folio);
folio_end_writeback(folio);
folio_lock(folio);
- generic_error_remove_page(mapping, &folio->page);
+ generic_error_remove_folio(mapping, folio);
folio_unlock(folio);
folio_put(folio);
@@ -366,7 +366,7 @@ static void afs_store_data_success(struct afs_operation *op)
op->ctime = op->file[0].scb.status.mtime_client;
afs_vnode_commit_status(op, &op->file[0]);
- if (op->error == 0) {
+ if (!afs_op_error(op)) {
if (!op->store.laundering)
afs_pages_written_back(vnode, op->store.pos, op->store.size);
afs_stat_v(vnode, n_stores);
@@ -428,7 +428,7 @@ try_next_key:
afs_wait_for_operation(op);
- switch (op->error) {
+ switch (afs_op_error(op)) {
case -EACCES:
case -EPERM:
case -ENOKEY:
@@ -447,7 +447,7 @@ try_next_key:
}
afs_put_wb_key(wbk);
- _leave(" = %d", op->error);
+ _leave(" = %d", afs_op_error(op));
return afs_put_operation(op);
}
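The raw op->error reads are replaced by afs_op_error() throughout. The accessor itself is not part of this hunk; presumably it is a small inline over the operation's accumulated error state, roughly like the sketch below (the field name is an assumption):

	/* Assumed shape of the accessor; not shown in this diff. */
	static inline int afs_op_error(const struct afs_operation *op)
	{
		return op->cumul_error.error;   /* field name is an assumption */
	}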
@@ -559,8 +559,7 @@ static void afs_extend_writeback(struct address_space *mapping,
if (!folio_clear_dirty_for_io(folio))
BUG();
- if (folio_start_writeback(folio))
- BUG();
+ folio_start_writeback(folio);
afs_folio_start_fscache(caching, folio);
*_count -= folio_nr_pages(folio);
@@ -595,8 +594,7 @@ static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping,
_enter(",%lx,%llx-%llx", folio_index(folio), start, end);
- if (folio_start_writeback(folio))
- BUG();
+ folio_start_writeback(folio);
afs_folio_start_fscache(caching, folio);
count -= folio_nr_pages(folio);
diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c
index 11571cca86c1..f521e66d3bf6 100644
--- a/fs/afs/yfsclient.c
+++ b/fs/afs/yfsclient.c
@@ -245,12 +245,15 @@ static void xdr_decode_YFSVolSync(const __be32 **_bp,
struct afs_volsync *volsync)
{
struct yfs_xdr_YFSVolSync *x = (void *)*_bp;
- u64 creation;
+ u64 creation, update;
if (volsync) {
creation = xdr_to_u64(x->vol_creation_date);
do_div(creation, 10 * 1000 * 1000);
volsync->creation = creation;
+ update = xdr_to_u64(x->vol_update_date);
+ do_div(update, 10 * 1000 * 1000);
+ volsync->update = update;
}
*_bp += xdr_size(x);
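The YFS volume timestamps arrive in 100 ns units; do_div() by 10 * 1000 * 1000 (10^7) converts them to whole seconds while staying safe for 64-by-32 division on 32-bit targets. For example, 16,094,931,930,000,000 hundred-nanosecond units / 10,000,000 = 1,609,493,193 s, i.e. early 2021:

	update = xdr_to_u64(x->vol_update_date);  /* 100 ns units */
	do_div(update, 10 * 1000 * 1000);         /* -> whole seconds */
	volsync->update = update;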
@@ -490,6 +493,7 @@ void yfs_fs_fetch_data(struct afs_operation *op)
bp = xdr_encode_u64(bp, req->len);
yfs_check_req(call, bp);
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -572,6 +576,7 @@ void yfs_fs_create_file(struct afs_operation *op)
bp = xdr_encode_u32(bp, yfs_LockNone); /* ViceLockType */
yfs_check_req(call, bp);
+ call->fid = dvp->fid;
trace_afs_make_fs_call1(call, &dvp->fid, name);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -620,6 +625,7 @@ void yfs_fs_make_dir(struct afs_operation *op)
bp = xdr_encode_YFSStoreStatus(bp, &op->create.mode, &op->mtime);
yfs_check_req(call, bp);
+ call->fid = dvp->fid;
trace_afs_make_fs_call1(call, &dvp->fid, name);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -704,6 +710,7 @@ void yfs_fs_remove_file2(struct afs_operation *op)
bp = xdr_encode_name(bp, name);
yfs_check_req(call, bp);
+ call->fid = dvp->fid;
trace_afs_make_fs_call1(call, &dvp->fid, name);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -773,6 +780,7 @@ void yfs_fs_remove_file(struct afs_operation *op)
bp = xdr_encode_name(bp, name);
yfs_check_req(call, bp);
+ call->fid = dvp->fid;
trace_afs_make_fs_call1(call, &dvp->fid, name);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -814,6 +822,7 @@ void yfs_fs_remove_dir(struct afs_operation *op)
bp = xdr_encode_name(bp, name);
yfs_check_req(call, bp);
+ call->fid = dvp->fid;
trace_afs_make_fs_call1(call, &dvp->fid, name);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -887,6 +896,7 @@ void yfs_fs_link(struct afs_operation *op)
bp = xdr_encode_YFSFid(bp, &vp->fid);
yfs_check_req(call, bp);
+ call->fid = vp->fid;
trace_afs_make_fs_call1(call, &vp->fid, name);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -968,6 +978,7 @@ void yfs_fs_symlink(struct afs_operation *op)
bp = xdr_encode_YFSStoreStatus(bp, &mode, &op->mtime);
yfs_check_req(call, bp);
+ call->fid = dvp->fid;
trace_afs_make_fs_call1(call, &dvp->fid, name);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1047,6 +1058,7 @@ void yfs_fs_rename(struct afs_operation *op)
bp = xdr_encode_name(bp, new_name);
yfs_check_req(call, bp);
+ call->fid = orig_dvp->fid;
trace_afs_make_fs_call2(call, &orig_dvp->fid, orig_name, new_name);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1102,6 +1114,7 @@ void yfs_fs_store_data(struct afs_operation *op)
bp = xdr_encode_u64(bp, op->store.i_size);
yfs_check_req(call, bp);
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1158,6 +1171,7 @@ static void yfs_fs_setattr_size(struct afs_operation *op)
bp = xdr_encode_u64(bp, attr->ia_size); /* new file length */
yfs_check_req(call, bp);
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1196,6 +1210,7 @@ void yfs_fs_setattr(struct afs_operation *op)
bp = xdr_encode_YFS_StoreStatus(bp, attr);
yfs_check_req(call, bp);
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1366,6 +1381,7 @@ void yfs_fs_get_volume_status(struct afs_operation *op)
bp = xdr_encode_u64(bp, vp->fid.vid);
yfs_check_req(call, bp);
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1430,6 +1446,7 @@ void yfs_fs_set_lock(struct afs_operation *op)
bp = xdr_encode_u32(bp, op->lock.type);
yfs_check_req(call, bp);
+ call->fid = vp->fid;
trace_afs_make_fs_calli(call, &vp->fid, op->lock.type);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1460,6 +1477,7 @@ void yfs_fs_extend_lock(struct afs_operation *op)
bp = xdr_encode_YFSFid(bp, &vp->fid);
yfs_check_req(call, bp);
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1490,6 +1508,7 @@ void yfs_fs_release_lock(struct afs_operation *op)
bp = xdr_encode_YFSFid(bp, &vp->fid);
yfs_check_req(call, bp);
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1556,6 +1575,7 @@ void yfs_fs_fetch_status(struct afs_operation *op)
bp = xdr_encode_YFSFid(bp, &vp->fid);
yfs_check_req(call, bp);
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1736,6 +1756,7 @@ void yfs_fs_inline_bulk_status(struct afs_operation *op)
bp = xdr_encode_YFSFid(bp, &op->more_files[i].fid);
yfs_check_req(call, bp);
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1898,6 +1919,7 @@ void yfs_fs_fetch_opaque_acl(struct afs_operation *op)
bp = xdr_encode_YFSFid(bp, &vp->fid);
yfs_check_req(call, bp);
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_KERNEL);
}
@@ -1948,6 +1970,7 @@ void yfs_fs_store_opaque_acl2(struct afs_operation *op)
bp += size / sizeof(__be32);
yfs_check_req(call, bp);
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_KERNEL);
}
diff --git a/fs/aio.c b/fs/aio.c
index f8589caef9c1..ffe65c1aab4e 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -266,7 +266,7 @@ static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
return ERR_CAST(inode);
inode->i_mapping->a_ops = &aio_ctx_aops;
- inode->i_mapping->private_data = ctx;
+ inode->i_mapping->i_private_data = ctx;
inode->i_size = PAGE_SIZE * nr_pages;
file = alloc_file_pseudo(inode, aio_mnt, "[aio]",
@@ -316,10 +316,10 @@ static void put_aio_ring_file(struct kioctx *ctx)
/* Prevent further access to the kioctx from migratepages */
i_mapping = aio_ring_file->f_mapping;
- spin_lock(&i_mapping->private_lock);
- i_mapping->private_data = NULL;
+ spin_lock(&i_mapping->i_private_lock);
+ i_mapping->i_private_data = NULL;
ctx->aio_ring_file = NULL;
- spin_unlock(&i_mapping->private_lock);
+ spin_unlock(&i_mapping->i_private_lock);
fput(aio_ring_file);
}
@@ -422,9 +422,9 @@ static int aio_migrate_folio(struct address_space *mapping, struct folio *dst,
rc = 0;
- /* mapping->private_lock here protects against the kioctx teardown. */
- spin_lock(&mapping->private_lock);
- ctx = mapping->private_data;
+ /* mapping->i_private_lock here protects against the kioctx teardown. */
+ spin_lock(&mapping->i_private_lock);
+ ctx = mapping->i_private_data;
if (!ctx) {
rc = -EINVAL;
goto out;
@@ -476,7 +476,7 @@ static int aio_migrate_folio(struct address_space *mapping, struct folio *dst,
out_unlock:
mutex_unlock(&ctx->ring_lock);
out:
- spin_unlock(&mapping->private_lock);
+ spin_unlock(&mapping->i_private_lock);
return rc;
}
#else
@@ -1106,6 +1106,11 @@ static inline void iocb_destroy(struct aio_kiocb *iocb)
kmem_cache_free(kiocb_cachep, iocb);
}
+struct aio_waiter {
+ struct wait_queue_entry w;
+ size_t min_nr;
+};
+
/* aio_complete
* Called when the io request on the given iocb is complete.
*/
@@ -1114,7 +1119,7 @@ static void aio_complete(struct aio_kiocb *iocb)
struct kioctx *ctx = iocb->ki_ctx;
struct aio_ring *ring;
struct io_event *ev_page, *event;
- unsigned tail, pos, head;
+ unsigned tail, pos, head, avail;
unsigned long flags;
/*
@@ -1156,6 +1161,10 @@ static void aio_complete(struct aio_kiocb *iocb)
ctx->completed_events++;
if (ctx->completed_events > 1)
refill_reqs_available(ctx, head, tail);
+
+ avail = tail > head
+ ? tail - head
+ : tail + ctx->nr_events - head;
spin_unlock_irqrestore(&ctx->completion_lock, flags);
pr_debug("added to ring %p at [%u]\n", iocb, tail);
@@ -1166,7 +1175,7 @@ static void aio_complete(struct aio_kiocb *iocb)
* from IRQ context.
*/
if (iocb->ki_eventfd)
- eventfd_signal(iocb->ki_eventfd, 1);
+ eventfd_signal(iocb->ki_eventfd);
/*
* We have to order our ring_info tail store above and test
@@ -1176,8 +1185,18 @@ static void aio_complete(struct aio_kiocb *iocb)
*/
smp_mb();
- if (waitqueue_active(&ctx->wait))
- wake_up(&ctx->wait);
+ if (waitqueue_active(&ctx->wait)) {
+ struct aio_waiter *curr, *next;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ctx->wait.lock, flags);
+ list_for_each_entry_safe(curr, next, &ctx->wait.head, w.entry)
+ if (avail >= curr->min_nr) {
+ list_del_init_careful(&curr->w.entry);
+ wake_up_process(curr->w.private);
+ }
+ spin_unlock_irqrestore(&ctx->wait.lock, flags);
+ }
}
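The design change here is to wake only satisfied waiters: previously wake_up() roused every sleeper on each completion even if a sleeper still needed more events. Now aio_complete() computes how many events sit in the ring (handling wrap) and walks the waitqueue under its own lock, waking only waiters whose min_nr is met. The core of it, with editorial comments:

	avail = tail > head ? tail - head
			    : tail + ctx->nr_events - head;   /* ring wrapped */
	...
	spin_lock_irqsave(&ctx->wait.lock, flags);
	list_for_each_entry_safe(curr, next, &ctx->wait.head, w.entry)
		if (avail >= curr->min_nr) {                  /* enough for this waiter */
			list_del_init_careful(&curr->w.entry);
			wake_up_process(curr->w.private);     /* w.private is the task */
		}
	spin_unlock_irqrestore(&ctx->wait.lock, flags);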
static inline void iocb_put(struct aio_kiocb *iocb)
@@ -1290,7 +1309,9 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr,
struct io_event __user *event,
ktime_t until)
{
- long ret = 0;
+ struct hrtimer_sleeper t;
+ struct aio_waiter w;
+ long ret = 0, ret2 = 0;
/*
* Note that aio_read_events() is being called as the conditional - i.e.
@@ -1306,12 +1327,38 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr,
* the ringbuffer empty. So in practice we should be ok, but it's
* something to be aware of when touching this code.
*/
- if (until == 0)
- aio_read_events(ctx, min_nr, nr, event, &ret);
- else
- wait_event_interruptible_hrtimeout(ctx->wait,
- aio_read_events(ctx, min_nr, nr, event, &ret),
- until);
+ aio_read_events(ctx, min_nr, nr, event, &ret);
+ if (until == 0 || ret < 0 || ret >= min_nr)
+ return ret;
+
+ hrtimer_init_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ if (until != KTIME_MAX) {
+ hrtimer_set_expires_range_ns(&t.timer, until, current->timer_slack_ns);
+ hrtimer_sleeper_start_expires(&t, HRTIMER_MODE_REL);
+ }
+
+ init_wait(&w.w);
+
+ while (1) {
+ unsigned long nr_got = ret;
+
+ w.min_nr = min_nr - ret;
+
+ ret2 = prepare_to_wait_event(&ctx->wait, &w.w, TASK_INTERRUPTIBLE);
+ if (!ret2 && !t.task)
+ ret2 = -ETIME;
+
+ if (aio_read_events(ctx, min_nr, nr, event, &ret) || ret2)
+ break;
+
+ if (nr_got == ret)
+ schedule();
+ }
+
+ finish_wait(&ctx->wait, &w.w);
+ hrtimer_cancel(&t.timer);
+ destroy_hrtimer_on_stack(&t.timer);
+
return ret;
}
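The open-coded loop replaces wait_event_interruptible_hrtimeout() so the waiter can re-arm itself with a shrinking min_nr as events trickle in. Its exit conditions, annotated (a sketch; the comments are editorial):

	w.min_nr = min_nr - ret;      /* events still needed by this waiter */
	ret2 = prepare_to_wait_event(&ctx->wait, &w.w, TASK_INTERRUPTIBLE);
	if (!ret2 && !t.task)         /* sleeper's task cleared => timer expired */
		ret2 = -ETIME;
	if (aio_read_events(ctx, min_nr, nr, event, &ret) || ret2)
		break;                /* got enough, signalled, or timed out */
	if (nr_got == ret)            /* no progress this pass: actually sleep */
		schedule();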
@@ -1498,7 +1545,7 @@ static ssize_t aio_setup_rw(int rw, const struct iocb *iocb,
size_t len = iocb->aio_nbytes;
if (!vectored) {
- ssize_t ret = import_single_range(rw, buf, len, *iovec, iter);
+ ssize_t ret = import_ubuf(rw, buf, len, iter);
*iovec = NULL;
return ret;
}
diff --git a/fs/attr.c b/fs/attr.c
index bdf5deb06ea9..5a13f0c8495f 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -157,7 +157,7 @@ static bool chgrp_ok(struct mnt_idmap *idmap,
* the vfsmount must be passed through @idmap. This function will then
* take care to map the inode according to @idmap before checking
* permissions. On non-idmapped mounts or if permission checking is to be
- * performed on the raw inode simply passs @nop_mnt_idmap.
+ * performed on the raw inode simply pass @nop_mnt_idmap.
*
* Should be called as the first thing in ->setattr implementations,
* possibly after taking additional locks.
diff --git a/fs/backing-file.c b/fs/backing-file.c
new file mode 100644
index 000000000000..a681f38d84d8
--- /dev/null
+++ b/fs/backing-file.c
@@ -0,0 +1,336 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Common helpers for stackable filesystems and backing files.
+ *
+ * Forked from fs/overlayfs/file.c.
+ *
+ * Copyright (C) 2017 Red Hat, Inc.
+ * Copyright (C) 2023 CTERA Networks.
+ */
+
+#include <linux/fs.h>
+#include <linux/backing-file.h>
+#include <linux/splice.h>
+#include <linux/mm.h>
+
+#include "internal.h"
+
+/**
+ * backing_file_open - open a backing file for kernel internal use
+ * @user_path: path that the user requested to open
+ * @flags: open flags
+ * @real_path: path of the backing file
+ * @cred: credentials for open
+ *
+ * Open a backing file for a stackable filesystem (e.g., overlayfs).
+ * @user_path may be on the stackable filesystem and @real_path on the
+ * underlying filesystem. In this case, we want to be able to return the
+ * @user_path of the stackable filesystem. This is done by embedding the
+ * returned file into a container structure that also stores the stacked
+ * file's path, which can be retrieved using backing_file_user_path().
+ */
+struct file *backing_file_open(const struct path *user_path, int flags,
+ const struct path *real_path,
+ const struct cred *cred)
+{
+ struct file *f;
+ int error;
+
+ f = alloc_empty_backing_file(flags, cred);
+ if (IS_ERR(f))
+ return f;
+
+ path_get(user_path);
+ *backing_file_user_path(f) = *user_path;
+ error = vfs_open(real_path, f);
+ if (error) {
+ fput(f);
+ f = ERR_PTR(error);
+ }
+
+ return f;
+}
+EXPORT_SYMBOL_GPL(backing_file_open);
+
+struct backing_aio {
+ struct kiocb iocb;
+ refcount_t ref;
+ struct kiocb *orig_iocb;
+ /* used for aio completion */
+ void (*end_write)(struct file *);
+ struct work_struct work;
+ long res;
+};
+
+static struct kmem_cache *backing_aio_cachep;
+
+#define BACKING_IOCB_MASK \
+ (IOCB_NOWAIT | IOCB_HIPRI | IOCB_DSYNC | IOCB_SYNC | IOCB_APPEND)
+
+static rwf_t iocb_to_rw_flags(int flags)
+{
+ return (__force rwf_t)(flags & BACKING_IOCB_MASK);
+}
+
+static void backing_aio_put(struct backing_aio *aio)
+{
+ if (refcount_dec_and_test(&aio->ref)) {
+ fput(aio->iocb.ki_filp);
+ kmem_cache_free(backing_aio_cachep, aio);
+ }
+}
+
+static void backing_aio_cleanup(struct backing_aio *aio, long res)
+{
+ struct kiocb *iocb = &aio->iocb;
+ struct kiocb *orig_iocb = aio->orig_iocb;
+
+ if (aio->end_write)
+ aio->end_write(orig_iocb->ki_filp);
+
+ orig_iocb->ki_pos = iocb->ki_pos;
+ backing_aio_put(aio);
+}
+
+static void backing_aio_rw_complete(struct kiocb *iocb, long res)
+{
+ struct backing_aio *aio = container_of(iocb, struct backing_aio, iocb);
+ struct kiocb *orig_iocb = aio->orig_iocb;
+
+ if (iocb->ki_flags & IOCB_WRITE)
+ kiocb_end_write(iocb);
+
+ backing_aio_cleanup(aio, res);
+ orig_iocb->ki_complete(orig_iocb, res);
+}
+
+static void backing_aio_complete_work(struct work_struct *work)
+{
+ struct backing_aio *aio = container_of(work, struct backing_aio, work);
+
+ backing_aio_rw_complete(&aio->iocb, aio->res);
+}
+
+static void backing_aio_queue_completion(struct kiocb *iocb, long res)
+{
+ struct backing_aio *aio = container_of(iocb, struct backing_aio, iocb);
+
+ /*
+ * Punt to a work queue to serialize updates of mtime/size.
+ */
+ aio->res = res;
+ INIT_WORK(&aio->work, backing_aio_complete_work);
+ queue_work(file_inode(aio->orig_iocb->ki_filp)->i_sb->s_dio_done_wq,
+ &aio->work);
+}
+
+static int backing_aio_init_wq(struct kiocb *iocb)
+{
+ struct super_block *sb = file_inode(iocb->ki_filp)->i_sb;
+
+ if (sb->s_dio_done_wq)
+ return 0;
+
+ return sb_init_dio_done_wq(sb);
+}
+
+
+ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter,
+ struct kiocb *iocb, int flags,
+ struct backing_file_ctx *ctx)
+{
+ struct backing_aio *aio = NULL;
+ const struct cred *old_cred;
+ ssize_t ret;
+
+ if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING)))
+ return -EIO;
+
+ if (!iov_iter_count(iter))
+ return 0;
+
+ if (iocb->ki_flags & IOCB_DIRECT &&
+ !(file->f_mode & FMODE_CAN_ODIRECT))
+ return -EINVAL;
+
+ old_cred = override_creds(ctx->cred);
+ if (is_sync_kiocb(iocb)) {
+ rwf_t rwf = iocb_to_rw_flags(flags);
+
+ ret = vfs_iter_read(file, iter, &iocb->ki_pos, rwf);
+ } else {
+ ret = -ENOMEM;
+ aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL);
+ if (!aio)
+ goto out;
+
+ aio->orig_iocb = iocb;
+ kiocb_clone(&aio->iocb, iocb, get_file(file));
+ aio->iocb.ki_complete = backing_aio_rw_complete;
+ refcount_set(&aio->ref, 2);
+ ret = vfs_iocb_iter_read(file, &aio->iocb, iter);
+ backing_aio_put(aio);
+ if (ret != -EIOCBQUEUED)
+ backing_aio_cleanup(aio, ret);
+ }
+out:
+ revert_creds(old_cred);
+
+ if (ctx->accessed)
+ ctx->accessed(ctx->user_file);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(backing_file_read_iter);
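The refcount_set(&aio->ref, 2) above implements a classic submit/complete split: one reference belongs to the submitting context and one to the completion callback, so the backing_aio (and the file pin it holds) is freed by whichever side finishes last. In miniature:

	refcount_set(&aio->ref, 2);              /* submitter + completion */
	ret = vfs_iocb_iter_read(file, &aio->iocb, iter);
	backing_aio_put(aio);                    /* drop the submitter's ref */
	if (ret != -EIOCBQUEUED)
		backing_aio_cleanup(aio, ret);   /* sync path: completion never
						    ran, drop its ref too */

On the async path, backing_aio_rw_complete() drops the second reference via backing_aio_cleanup() once the I/O finishes.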
+
+ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter,
+ struct kiocb *iocb, int flags,
+ struct backing_file_ctx *ctx)
+{
+ const struct cred *old_cred;
+ ssize_t ret;
+
+ if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING)))
+ return -EIO;
+
+ if (!iov_iter_count(iter))
+ return 0;
+
+ ret = file_remove_privs(ctx->user_file);
+ if (ret)
+ return ret;
+
+ if (iocb->ki_flags & IOCB_DIRECT &&
+ !(file->f_mode & FMODE_CAN_ODIRECT))
+ return -EINVAL;
+
+ /*
+ * Stacked filesystems don't support deferred completions, don't copy
+ * this property in case it is set by the issuer.
+ */
+ flags &= ~IOCB_DIO_CALLER_COMP;
+
+ old_cred = override_creds(ctx->cred);
+ if (is_sync_kiocb(iocb)) {
+ rwf_t rwf = iocb_to_rw_flags(flags);
+
+ ret = vfs_iter_write(file, iter, &iocb->ki_pos, rwf);
+ if (ctx->end_write)
+ ctx->end_write(ctx->user_file);
+ } else {
+ struct backing_aio *aio;
+
+ ret = backing_aio_init_wq(iocb);
+ if (ret)
+ goto out;
+
+ ret = -ENOMEM;
+ aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL);
+ if (!aio)
+ goto out;
+
+ aio->orig_iocb = iocb;
+ aio->end_write = ctx->end_write;
+ kiocb_clone(&aio->iocb, iocb, get_file(file));
+ aio->iocb.ki_flags = flags;
+ aio->iocb.ki_complete = backing_aio_queue_completion;
+ refcount_set(&aio->ref, 2);
+ ret = vfs_iocb_iter_write(file, &aio->iocb, iter);
+ backing_aio_put(aio);
+ if (ret != -EIOCBQUEUED)
+ backing_aio_cleanup(aio, ret);
+ }
+out:
+ revert_creds(old_cred);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(backing_file_write_iter);
+
+ssize_t backing_file_splice_read(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe, size_t len,
+ unsigned int flags,
+ struct backing_file_ctx *ctx)
+{
+ const struct cred *old_cred;
+ ssize_t ret;
+
+ if (WARN_ON_ONCE(!(in->f_mode & FMODE_BACKING)))
+ return -EIO;
+
+ old_cred = override_creds(ctx->cred);
+ ret = vfs_splice_read(in, ppos, pipe, len, flags);
+ revert_creds(old_cred);
+
+ if (ctx->accessed)
+ ctx->accessed(ctx->user_file);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(backing_file_splice_read);
+
+ssize_t backing_file_splice_write(struct pipe_inode_info *pipe,
+ struct file *out, loff_t *ppos, size_t len,
+ unsigned int flags,
+ struct backing_file_ctx *ctx)
+{
+ const struct cred *old_cred;
+ ssize_t ret;
+
+ if (WARN_ON_ONCE(!(out->f_mode & FMODE_BACKING)))
+ return -EIO;
+
+ ret = file_remove_privs(ctx->user_file);
+ if (ret)
+ return ret;
+
+ old_cred = override_creds(ctx->cred);
+ file_start_write(out);
+ ret = iter_file_splice_write(pipe, out, ppos, len, flags);
+ file_end_write(out);
+ revert_creds(old_cred);
+
+ if (ctx->end_write)
+ ctx->end_write(ctx->user_file);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(backing_file_splice_write);
+
+int backing_file_mmap(struct file *file, struct vm_area_struct *vma,
+ struct backing_file_ctx *ctx)
+{
+ const struct cred *old_cred;
+ int ret;
+
+ if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING)) ||
+ WARN_ON_ONCE(ctx->user_file != vma->vm_file))
+ return -EIO;
+
+ if (!file->f_op->mmap)
+ return -ENODEV;
+
+ vma_set_file(vma, file);
+
+ old_cred = override_creds(ctx->cred);
+ ret = call_mmap(vma->vm_file, vma);
+ revert_creds(old_cred);
+
+ if (ctx->accessed)
+ ctx->accessed(ctx->user_file);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(backing_file_mmap);
+
+static int __init backing_aio_init(void)
+{
+ backing_aio_cachep = kmem_cache_create("backing_aio",
+ sizeof(struct backing_aio),
+ 0, SLAB_HWCACHE_ALIGN, NULL);
+ if (!backing_aio_cachep)
+ return -ENOMEM;
+
+ return 0;
+}
+fs_initcall(backing_aio_init);
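A hedged usage sketch from a stacking filesystem's point of view. The fields of struct backing_file_ctx are inferred from the accesses above (ctx->cred, ctx->user_file, ctx->accessed, ctx->end_write), and every name starting with my_ or mounter_ is hypothetical:

	struct backing_file_ctx ctx = {
		.cred      = mounter_creds,    /* creds to override to */
		.user_file = file,             /* the stacked, user-visible file */
		.accessed  = my_touch_atime,   /* optional callback; hypothetical */
	};
	struct file *realfile;

	realfile = backing_file_open(&file->f_path, file->f_flags,
				     &realpath, mounter_creds);
	if (IS_ERR(realfile))
		return PTR_ERR(realfile);
	ret = backing_file_read_iter(realfile, iter, iocb, iocb->ki_flags, &ctx);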
diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c
index 14d5cc6f90d7..94e5a567fa44 100644
--- a/fs/bcachefs/fs-ioctl.c
+++ b/fs/bcachefs/fs-ioctl.c
@@ -289,14 +289,14 @@ static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg)
switch (flags) {
case FSOP_GOING_FLAGS_DEFAULT:
- ret = freeze_bdev(c->vfs_sb->s_bdev);
+ ret = bdev_freeze(c->vfs_sb->s_bdev);
if (ret)
goto err;
bch2_journal_flush(&c->journal);
c->vfs_sb->s_flags |= SB_RDONLY;
bch2_fs_emergency_read_only(c);
- thaw_bdev(c->vfs_sb->s_bdev);
+ bdev_thaw(c->vfs_sb->s_bdev);
break;
case FSOP_GOING_FLAGS_LOGFLUSH:
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 49da8db1d9e9..c1895df1bffe 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -1131,7 +1131,7 @@ static const struct address_space_operations bch_address_space_operations = {
#ifdef CONFIG_MIGRATION
.migrate_folio = filemap_migrate_folio,
#endif
- .error_remove_page = generic_error_remove_page,
+ .error_remove_folio = generic_error_remove_folio,
};
struct bcachefs_fid {
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index 4c98d8cc2a79..78013deda9df 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -164,8 +164,8 @@ void bch2_sb_field_delete(struct bch_sb_handle *sb,
void bch2_free_super(struct bch_sb_handle *sb)
{
kfree(sb->bio);
- if (!IS_ERR_OR_NULL(sb->bdev))
- blkdev_put(sb->bdev, sb->holder);
+ if (!IS_ERR_OR_NULL(sb->bdev_handle))
+ bdev_release(sb->bdev_handle);
kfree(sb->holder);
kfree(sb->sb_name);
@@ -725,21 +725,22 @@ retry:
if (!opt_get(*opts, nochanges))
sb->mode |= BLK_OPEN_WRITE;
- sb->bdev = blkdev_get_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
- if (IS_ERR(sb->bdev) &&
- PTR_ERR(sb->bdev) == -EACCES &&
+ sb->bdev_handle = bdev_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
+ if (IS_ERR(sb->bdev_handle) &&
+ PTR_ERR(sb->bdev_handle) == -EACCES &&
opt_get(*opts, read_only)) {
sb->mode &= ~BLK_OPEN_WRITE;
- sb->bdev = blkdev_get_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
- if (!IS_ERR(sb->bdev))
+ sb->bdev_handle = bdev_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
+ if (!IS_ERR(sb->bdev_handle))
opt_set(*opts, nochanges, true);
}
- if (IS_ERR(sb->bdev)) {
- ret = PTR_ERR(sb->bdev);
+ if (IS_ERR(sb->bdev_handle)) {
+ ret = PTR_ERR(sb->bdev_handle);
goto out;
}
+ sb->bdev = sb->bdev_handle->bdev;
ret = bch2_sb_realloc(sb, 0);
if (ret) {
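The bdev_handle conversion pairs bdev_open_by_path() with bdev_release() in place of the old blkdev_get_by_path()/blkdev_put() pair; the raw block_device is reached through the handle, which is why it is cached next to sb->bdev. The pairing in isolation:

	sb->bdev_handle = bdev_open_by_path(path, sb->mode, sb->holder,
					    &bch2_sb_handle_bdev_ops);
	if (IS_ERR(sb->bdev_handle))
		return PTR_ERR(sb->bdev_handle);
	sb->bdev = sb->bdev_handle->bdev;    /* raw bdev lives in the handle */
	...
	bdev_release(sb->bdev_handle);       /* replaces blkdev_put(bdev, holder) */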
diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h
index 9c1fd4ca2b10..b2119686e2e1 100644
--- a/fs/bcachefs/super_types.h
+++ b/fs/bcachefs/super_types.h
@@ -4,6 +4,7 @@
struct bch_sb_handle {
struct bch_sb *sb;
+ struct bdev_handle *bdev_handle;
struct block_device *bdev;
char *sb_name;
struct bio *bio;
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index adc2230079c6..a778411574a9 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -11,6 +11,7 @@
*/
#include <linux/fs.h>
+#include <linux/mpage.h>
#include <linux/buffer_head.h>
#include "bfs.h"
@@ -150,9 +151,10 @@ out:
return err;
}
-static int bfs_writepage(struct page *page, struct writeback_control *wbc)
+static int bfs_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
{
- return block_write_full_page(page, bfs_get_block, wbc);
+ return mpage_writepages(mapping, wbc, bfs_get_block);
}
static int bfs_read_folio(struct file *file, struct folio *folio)
@@ -190,9 +192,10 @@ const struct address_space_operations bfs_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
.read_folio = bfs_read_folio,
- .writepage = bfs_writepage,
+ .writepages = bfs_writepages,
.write_begin = bfs_write_begin,
.write_end = generic_write_end,
+ .migrate_folio = buffer_migrate_folio,
.bmap = bfs_bmap,
};
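Design note: with ->writepage gone, bfs writeback happens only in batches through mpage_writepages(), which coalesces runs of contiguous blocks into larger bios instead of issuing one bio per page the way block_write_full_page() did. Adding .migrate_folio = buffer_migrate_folio keeps page migration working now that there is no ->writepage for it to fall back on (the fallback detail is an assumption). The whole conversion fits in one wrapper:

	static int bfs_writepages(struct address_space *mapping,
				  struct writeback_control *wbc)
	{
		/* mpage_writepages() walks the dirty folios and batches
		 * contiguous blocks resolved through bfs_get_block(). */
		return mpage_writepages(mapping, wbc, bfs_get_block);
	}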
diff --git a/fs/btrfs/accessors.c b/fs/btrfs/accessors.c
index 206cf1612c1d..1925a0919ca6 100644
--- a/fs/btrfs/accessors.c
+++ b/fs/btrfs/accessors.c
@@ -27,7 +27,7 @@ static bool check_setget_bounds(const struct extent_buffer *eb,
void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *eb)
{
token->eb = eb;
- token->kaddr = page_address(eb->pages[0]);
+ token->kaddr = folio_address(eb->folios[0]);
token->offset = 0;
}
@@ -50,7 +50,7 @@ void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *e
* an offset into the extent buffer page array, cast to a specific type. This
* gives us all the type checking.
*
- * The extent buffer pages stored in the array pages do not form a contiguous
+ * The extent buffer pages stored in the array folios may not form a contiguous
* physical range, but the API functions assume the linear offset to the range
* from 0 to metadata node size.
*/
@@ -60,28 +60,30 @@ u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \
const void *ptr, unsigned long off) \
{ \
const unsigned long member_offset = (unsigned long)ptr + off; \
- const unsigned long idx = get_eb_page_index(member_offset); \
- const unsigned long oip = get_eb_offset_in_page(token->eb, \
- member_offset); \
+ const unsigned long idx = get_eb_folio_index(token->eb, member_offset); \
+ const unsigned long oil = get_eb_offset_in_folio(token->eb, \
+ member_offset);\
+ const int unit_size = folio_size(token->eb->folios[0]); \
+ const int unit_shift = folio_shift(token->eb->folios[0]); \
const int size = sizeof(u##bits); \
u8 lebytes[sizeof(u##bits)]; \
- const int part = PAGE_SIZE - oip; \
+ const int part = unit_size - oil; \
\
ASSERT(token); \
ASSERT(token->kaddr); \
ASSERT(check_setget_bounds(token->eb, ptr, off, size)); \
if (token->offset <= member_offset && \
- member_offset + size <= token->offset + PAGE_SIZE) { \
- return get_unaligned_le##bits(token->kaddr + oip); \
+ member_offset + size <= token->offset + unit_size) { \
+ return get_unaligned_le##bits(token->kaddr + oil); \
} \
- token->kaddr = page_address(token->eb->pages[idx]); \
- token->offset = idx << PAGE_SHIFT; \
- if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE ) \
- return get_unaligned_le##bits(token->kaddr + oip); \
+ token->kaddr = folio_address(token->eb->folios[idx]); \
+ token->offset = idx << unit_shift; \
+ if (INLINE_EXTENT_BUFFER_PAGES == 1 || oil + size <= unit_size) \
+ return get_unaligned_le##bits(token->kaddr + oil); \
\
- memcpy(lebytes, token->kaddr + oip, part); \
- token->kaddr = page_address(token->eb->pages[idx + 1]); \
- token->offset = (idx + 1) << PAGE_SHIFT; \
+ memcpy(lebytes, token->kaddr + oil, part); \
+ token->kaddr = folio_address(token->eb->folios[idx + 1]); \
+ token->offset = (idx + 1) << unit_shift; \
memcpy(lebytes + part, token->kaddr, size - part); \
return get_unaligned_le##bits(lebytes); \
} \
@@ -89,19 +91,21 @@ u##bits btrfs_get_##bits(const struct extent_buffer *eb, \
const void *ptr, unsigned long off) \
{ \
const unsigned long member_offset = (unsigned long)ptr + off; \
- const unsigned long oip = get_eb_offset_in_page(eb, member_offset); \
- const unsigned long idx = get_eb_page_index(member_offset); \
- char *kaddr = page_address(eb->pages[idx]); \
+ const unsigned long idx = get_eb_folio_index(eb, member_offset);\
+ const unsigned long oil = get_eb_offset_in_folio(eb, \
+ member_offset);\
+ const int unit_size = folio_size(eb->folios[0]); \
+ char *kaddr = folio_address(eb->folios[idx]); \
const int size = sizeof(u##bits); \
- const int part = PAGE_SIZE - oip; \
+ const int part = unit_size - oil; \
u8 lebytes[sizeof(u##bits)]; \
\
ASSERT(check_setget_bounds(eb, ptr, off, size)); \
- if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) \
- return get_unaligned_le##bits(kaddr + oip); \
+ if (INLINE_EXTENT_BUFFER_PAGES == 1 || oil + size <= unit_size) \
+ return get_unaligned_le##bits(kaddr + oil); \
\
- memcpy(lebytes, kaddr + oip, part); \
- kaddr = page_address(eb->pages[idx + 1]); \
+ memcpy(lebytes, kaddr + oil, part); \
+ kaddr = folio_address(eb->folios[idx + 1]); \
memcpy(lebytes + part, kaddr, size - part); \
return get_unaligned_le##bits(lebytes); \
} \
@@ -110,53 +114,59 @@ void btrfs_set_token_##bits(struct btrfs_map_token *token, \
u##bits val) \
{ \
const unsigned long member_offset = (unsigned long)ptr + off; \
- const unsigned long idx = get_eb_page_index(member_offset); \
- const unsigned long oip = get_eb_offset_in_page(token->eb, \
- member_offset); \
+ const unsigned long idx = get_eb_folio_index(token->eb, member_offset); \
+ const unsigned long oil = get_eb_offset_in_folio(token->eb, \
+ member_offset);\
+ const int unit_size = folio_size(token->eb->folios[0]); \
+ const int unit_shift = folio_shift(token->eb->folios[0]); \
const int size = sizeof(u##bits); \
u8 lebytes[sizeof(u##bits)]; \
- const int part = PAGE_SIZE - oip; \
+ const int part = unit_size - oil; \
\
ASSERT(token); \
ASSERT(token->kaddr); \
ASSERT(check_setget_bounds(token->eb, ptr, off, size)); \
if (token->offset <= member_offset && \
- member_offset + size <= token->offset + PAGE_SIZE) { \
- put_unaligned_le##bits(val, token->kaddr + oip); \
+ member_offset + size <= token->offset + unit_size) { \
+ put_unaligned_le##bits(val, token->kaddr + oil); \
return; \
} \
- token->kaddr = page_address(token->eb->pages[idx]); \
- token->offset = idx << PAGE_SHIFT; \
- if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) { \
- put_unaligned_le##bits(val, token->kaddr + oip); \
+ token->kaddr = folio_address(token->eb->folios[idx]); \
+ token->offset = idx << unit_shift; \
+ if (INLINE_EXTENT_BUFFER_PAGES == 1 || \
+ oil + size <= unit_size) { \
+ put_unaligned_le##bits(val, token->kaddr + oil); \
return; \
} \
put_unaligned_le##bits(val, lebytes); \
- memcpy(token->kaddr + oip, lebytes, part); \
- token->kaddr = page_address(token->eb->pages[idx + 1]); \
- token->offset = (idx + 1) << PAGE_SHIFT; \
+ memcpy(token->kaddr + oil, lebytes, part); \
+ token->kaddr = folio_address(token->eb->folios[idx + 1]); \
+ token->offset = (idx + 1) << unit_shift; \
memcpy(token->kaddr, lebytes + part, size - part); \
} \
void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \
unsigned long off, u##bits val) \
{ \
const unsigned long member_offset = (unsigned long)ptr + off; \
- const unsigned long oip = get_eb_offset_in_page(eb, member_offset); \
- const unsigned long idx = get_eb_page_index(member_offset); \
- char *kaddr = page_address(eb->pages[idx]); \
+ const unsigned long idx = get_eb_folio_index(eb, member_offset);\
+ const unsigned long oil = get_eb_offset_in_folio(eb, \
+ member_offset);\
+ const int unit_size = folio_size(eb->folios[0]); \
+ char *kaddr = folio_address(eb->folios[idx]); \
const int size = sizeof(u##bits); \
- const int part = PAGE_SIZE - oip; \
+ const int part = unit_size - oil; \
u8 lebytes[sizeof(u##bits)]; \
\
ASSERT(check_setget_bounds(eb, ptr, off, size)); \
- if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) { \
- put_unaligned_le##bits(val, kaddr + oip); \
+ if (INLINE_EXTENT_BUFFER_PAGES == 1 || \
+ oil + size <= unit_size) { \
+ put_unaligned_le##bits(val, kaddr + oil); \
return; \
} \
\
put_unaligned_le##bits(val, lebytes); \
- memcpy(kaddr + oip, lebytes, part); \
- kaddr = page_address(eb->pages[idx + 1]); \
+ memcpy(kaddr + oil, lebytes, part); \
+ kaddr = folio_address(eb->folios[idx + 1]); \
memcpy(kaddr, lebytes + part, size - part); \
}
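The helpers now measure offsets in folio-sized units rather than PAGE_SIZE, so a value can still straddle two folios when the extent buffer is backed by order-0 folios. A hedged sketch of the straddling read for a u32, using get_eb_folio_index()/get_eb_offset_in_folio() as above:

	const int unit_size = folio_size(eb->folios[0]);
	unsigned long idx = get_eb_folio_index(eb, member_offset);
	unsigned long oil = get_eb_offset_in_folio(eb, member_offset);
	int part = unit_size - oil;           /* bytes left in this folio */
	u8 lebytes[4];
	u32 val;

	if (oil + 4 <= unit_size)             /* fast path: fits in one folio */
		return get_unaligned_le32(folio_address(eb->folios[idx]) + oil);
	memcpy(lebytes, folio_address(eb->folios[idx]) + oil, part);
	memcpy(lebytes + part, folio_address(eb->folios[idx + 1]), 4 - part);
	val = get_unaligned_le32(lebytes);    /* reassemble the LE value */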
diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h
index aa0844535644..ed7aa32972ad 100644
--- a/fs/btrfs/accessors.h
+++ b/fs/btrfs/accessors.h
@@ -90,14 +90,14 @@ static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\
#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \
static inline u##bits btrfs_##name(const struct extent_buffer *eb) \
{ \
- const type *p = page_address(eb->pages[0]) + \
+ const type *p = folio_address(eb->folios[0]) + \
offset_in_page(eb->start); \
return get_unaligned_le##bits(&p->member); \
} \
static inline void btrfs_set_##name(const struct extent_buffer *eb, \
u##bits val) \
{ \
- type *p = page_address(eb->pages[0]) + offset_in_page(eb->start); \
+ type *p = folio_address(eb->folios[0]) + offset_in_page(eb->start); \
put_unaligned_le##bits(val, &p->member); \
}
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index 4f3b693a16b1..928f512cdb4a 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -194,6 +194,12 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio,
struct bio_vec *bv = bio_first_bvec_all(&repair_bbio->bio);
int mirror = repair_bbio->mirror_num;
+ /*
+	 * We can only trigger this for data bios, which don't support large
+	 * folios yet.
+ */
+ ASSERT(folio_order(page_folio(bv->bv_page)) == 0);
+
if (repair_bbio->bio.bi_status ||
!btrfs_data_csum_ok(repair_bbio, dev, 0, bv)) {
bio_reset(&repair_bbio->bio, NULL, REQ_OP_READ);
@@ -215,7 +221,7 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio,
btrfs_repair_io_failure(fs_info, btrfs_ino(inode),
repair_bbio->file_offset, fs_info->sectorsize,
repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT,
- bv->bv_page, bv->bv_offset, mirror);
+ page_folio(bv->bv_page), bv->bv_offset, mirror);
} while (mirror != fbio->bbio->mirror_num);
done:
@@ -626,7 +632,7 @@ static bool should_async_write(struct btrfs_bio *bbio)
/*
* Submit bio to an async queue.
*
- * Return true if the work has been succesfuly submitted, else false.
+ * Return true if the work has been successfully submitted, else false.
*/
static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio,
struct btrfs_io_context *bioc,
@@ -767,8 +773,8 @@ void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num)
* freeing the bio.
*/
int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
- u64 length, u64 logical, struct page *page,
- unsigned int pg_offset, int mirror_num)
+ u64 length, u64 logical, struct folio *folio,
+ unsigned int folio_offset, int mirror_num)
{
struct btrfs_io_stripe smap = { 0 };
struct bio_vec bvec;
@@ -799,7 +805,8 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
bio_init(&bio, smap.dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
bio.bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT;
- __bio_add_page(&bio, page, length, pg_offset);
+ ret = bio_add_folio(&bio, folio, length, folio_offset);
+ ASSERT(ret);
ret = submit_bio_wait(&bio);
if (ret) {
/* try to remap that extent elsewhere? */
diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h
index ca79decee060..bbaed317161a 100644
--- a/fs/btrfs/bio.h
+++ b/fs/btrfs/bio.h
@@ -105,7 +105,7 @@ void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status);
void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num);
void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace);
int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
- u64 length, u64 logical, struct page *page,
- unsigned int pg_offset, int mirror_num);
+ u64 length, u64 logical, struct folio *folio,
+ unsigned int folio_offset, int mirror_num);
#endif
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 6e5dc68ff661..a9be9ac99222 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -168,7 +168,7 @@ void btrfs_put_block_group(struct btrfs_block_group *cache)
cache);
kfree(cache->free_space_ctl);
- kfree(cache->physical_map);
+ btrfs_free_chunk_map(cache->physical_map);
kfree(cache);
}
}
@@ -1047,7 +1047,7 @@ static int remove_block_group_item(struct btrfs_trans_handle *trans,
}
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
- u64 group_start, struct extent_map *em)
+ struct btrfs_chunk_map *map)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_path *path;
@@ -1059,10 +1059,10 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
int index;
int factor;
struct btrfs_caching_control *caching_ctl = NULL;
- bool remove_em;
+ bool remove_map;
bool remove_rsv = false;
- block_group = btrfs_lookup_block_group(fs_info, group_start);
+ block_group = btrfs_lookup_block_group(fs_info, map->start);
BUG_ON(!block_group);
BUG_ON(!block_group->ro);
@@ -1252,7 +1252,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
* entries because we already removed them all when we called
* btrfs_remove_free_space_cache().
*
- * And we must not remove the extent map from the fs_info->mapping_tree
+ * And we must not remove the chunk map from the fs_info->mapping_tree
* to prevent the same logical address range and physical device space
* ranges from being reused for a new block group. This is needed to
* avoid races with trimming and scrub.
@@ -1268,19 +1268,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
* in place until the extents have been discarded completely when
* the transaction commit has completed.
*/
- remove_em = (atomic_read(&block_group->frozen) == 0);
+ remove_map = (atomic_read(&block_group->frozen) == 0);
spin_unlock(&block_group->lock);
- if (remove_em) {
- struct extent_map_tree *em_tree;
-
- em_tree = &fs_info->mapping_tree;
- write_lock(&em_tree->lock);
- remove_extent_mapping(em_tree, em);
- write_unlock(&em_tree->lock);
- /* once for the tree */
- free_extent_map(em);
- }
+ if (remove_map)
+ btrfs_remove_chunk_map(fs_info, map);
out:
/* Once for the lookup reference */
@@ -1295,15 +1287,12 @@ struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
struct btrfs_fs_info *fs_info, const u64 chunk_offset)
{
struct btrfs_root *root = btrfs_block_group_root(fs_info);
- struct extent_map_tree *em_tree = &fs_info->mapping_tree;
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
unsigned int num_items;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, chunk_offset, 1);
- read_unlock(&em_tree->lock);
- ASSERT(em && em->start == chunk_offset);
+ map = btrfs_find_chunk_map(fs_info, chunk_offset, 1);
+ ASSERT(map != NULL);
+ ASSERT(map->start == chunk_offset);
/*
* We need to reserve 3 + N units from the metadata space info in order
@@ -1324,9 +1313,8 @@ struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
* more device items and remove one chunk item), but this is done at
* btrfs_remove_chunk() through a call to check_system_chunk().
*/
- map = em->map_lookup;
num_items = 3 + map->num_stripes;
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return btrfs_start_transaction_fallback_global_rsv(root, num_items);
}
@@ -1927,8 +1915,7 @@ void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg)
static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
struct btrfs_path *path)
{
- struct extent_map_tree *em_tree;
- struct extent_map *em;
+ struct btrfs_chunk_map *map;
struct btrfs_block_group_item bg;
struct extent_buffer *leaf;
int slot;
@@ -1938,23 +1925,20 @@ static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
slot = path->slots[0];
leaf = path->nodes[0];
- em_tree = &fs_info->mapping_tree;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, key->objectid, key->offset);
- read_unlock(&em_tree->lock);
- if (!em) {
+ map = btrfs_find_chunk_map(fs_info, key->objectid, key->offset);
+ if (!map) {
btrfs_err(fs_info,
"logical %llu len %llu found bg but no related chunk",
key->objectid, key->offset);
return -ENOENT;
}
- if (em->start != key->objectid || em->len != key->offset) {
+ if (map->start != key->objectid || map->chunk_len != key->offset) {
btrfs_err(fs_info,
"block group %llu len %llu mismatch with chunk %llu len %llu",
- key->objectid, key->offset, em->start, em->len);
+ key->objectid, key->offset, map->start, map->chunk_len);
ret = -EUCLEAN;
- goto out_free_em;
+ goto out_free_map;
}
read_extent_buffer(leaf, &bg, btrfs_item_ptr_offset(leaf, slot),
@@ -1962,16 +1946,16 @@ static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
flags = btrfs_stack_block_group_flags(&bg) &
BTRFS_BLOCK_GROUP_TYPE_MASK;
- if (flags != (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
+ if (flags != (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
btrfs_err(fs_info,
"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
key->objectid, key->offset, flags,
- (BTRFS_BLOCK_GROUP_TYPE_MASK & em->map_lookup->type));
+ (BTRFS_BLOCK_GROUP_TYPE_MASK & map->type));
ret = -EUCLEAN;
}
-out_free_em:
- free_extent_map(em);
+out_free_map:
+ btrfs_free_chunk_map(map);
return ret;
}
@@ -2024,8 +2008,7 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
u64 physical, u64 **logical, int *naddrs, int *stripe_len)
{
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
u64 *buf;
u64 bytenr;
u64 data_stripe_length;
@@ -2033,14 +2016,13 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
int i, nr = 0;
int ret = 0;
- em = btrfs_get_chunk_map(fs_info, chunk_start, 1);
- if (IS_ERR(em))
+ map = btrfs_get_chunk_map(fs_info, chunk_start, 1);
+ if (IS_ERR(map))
return -EIO;
- map = em->map_lookup;
- data_stripe_length = em->orig_block_len;
+ data_stripe_length = map->stripe_size;
io_stripe_size = BTRFS_STRIPE_LEN;
- chunk_start = em->start;
+ chunk_start = map->start;
/* For RAID5/6 adjust to a full IO stripe length */
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
@@ -2094,7 +2076,7 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
*naddrs = nr;
*stripe_len = io_stripe_size;
out:
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ret;
}
@@ -2199,49 +2181,47 @@ static struct btrfs_block_group *btrfs_create_block_group_cache(
*/
static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
{
- struct extent_map_tree *map_tree = &fs_info->mapping_tree;
- struct extent_map *em;
- struct btrfs_block_group *bg;
u64 start = 0;
int ret = 0;
while (1) {
- read_lock(&map_tree->lock);
+ struct btrfs_chunk_map *map;
+ struct btrfs_block_group *bg;
+
/*
- * lookup_extent_mapping will return the first extent map
- * intersecting the range, so setting @len to 1 is enough to
+ * btrfs_find_chunk_map() will return the first chunk map
+ * intersecting the range, so setting @length to 1 is enough to
* get the first chunk.
*/
- em = lookup_extent_mapping(map_tree, start, 1);
- read_unlock(&map_tree->lock);
- if (!em)
+ map = btrfs_find_chunk_map(fs_info, start, 1);
+ if (!map)
break;
- bg = btrfs_lookup_block_group(fs_info, em->start);
+ bg = btrfs_lookup_block_group(fs_info, map->start);
if (!bg) {
btrfs_err(fs_info,
"chunk start=%llu len=%llu doesn't have corresponding block group",
- em->start, em->len);
+ map->start, map->chunk_len);
ret = -EUCLEAN;
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
break;
}
- if (bg->start != em->start || bg->length != em->len ||
+ if (bg->start != map->start || bg->length != map->chunk_len ||
(bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
- (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
+ (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
btrfs_err(fs_info,
"chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
- em->start, em->len,
- em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
+ map->start, map->chunk_len,
+ map->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
bg->start, bg->length,
bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
ret = -EUCLEAN;
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
btrfs_put_block_group(bg);
break;
}
- start = em->start + em->len;
- free_extent_map(em);
+ start = map->start + map->chunk_len;
+ btrfs_free_chunk_map(map);
btrfs_put_block_group(bg);
}
return ret;
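The loop above visits every chunk map with repeated one-byte range lookups: btrfs_find_chunk_map() returns the first map intersecting [start, start + 1) with a reference held, and advancing start past the chunk's end walks the whole tree. The idiom in isolation:

	u64 start = 0;
	struct btrfs_chunk_map *map;

	while ((map = btrfs_find_chunk_map(fs_info, start, 1))) {
		/* ... validate the map against its block group ... */
		start = map->start + map->chunk_len;  /* step past this chunk */
		btrfs_free_chunk_map(map);            /* drop the lookup ref */
	}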
@@ -2369,28 +2349,25 @@ error:
static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
{
- struct extent_map_tree *em_tree = &fs_info->mapping_tree;
struct rb_node *node;
int ret = 0;
- for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
- struct extent_map *em;
- struct map_lookup *map;
+ for (node = rb_first_cached(&fs_info->mapping_tree); node; node = rb_next(node)) {
+ struct btrfs_chunk_map *map;
struct btrfs_block_group *bg;
- em = rb_entry(node, struct extent_map, rb_node);
- map = em->map_lookup;
- bg = btrfs_create_block_group_cache(fs_info, em->start);
+ map = rb_entry(node, struct btrfs_chunk_map, rb_node);
+ bg = btrfs_create_block_group_cache(fs_info, map->start);
if (!bg) {
ret = -ENOMEM;
break;
}
/* Fill dummy cache as FULL */
- bg->length = em->len;
+ bg->length = map->chunk_len;
bg->flags = map->type;
bg->cached = BTRFS_CACHE_FINISHED;
- bg->used = em->len;
+ bg->used = map->chunk_len;
bg->flags = map->type;
ret = btrfs_add_block_group_cache(fs_info, bg);
/*
@@ -2618,19 +2595,14 @@ static int insert_dev_extents(struct btrfs_trans_handle *trans,
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_device *device;
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
u64 dev_offset;
- u64 stripe_size;
int i;
int ret = 0;
- em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
- if (IS_ERR(em))
- return PTR_ERR(em);
-
- map = em->map_lookup;
- stripe_size = em->orig_block_len;
+ map = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
/*
* Take the device list mutex to prevent races with the final phase of
@@ -2647,13 +2619,13 @@ static int insert_dev_extents(struct btrfs_trans_handle *trans,
dev_offset = map->stripes[i].physical;
ret = insert_dev_extent(trans, device, chunk_offset, dev_offset,
- stripe_size);
+ map->stripe_size);
if (ret)
break;
}
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ret;
}
@@ -2910,7 +2882,7 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
goto unlock_out;
/*
- * Skip chunk alloction if the bg is SYSTEM, this is to avoid system
+ * Skip chunk allocation if the bg is SYSTEM, this is to avoid system
* chunk allocation storm to exhaust the system chunk array. Otherwise
* we still want to try our best to mark the block group read-only.
*/
@@ -4406,8 +4378,6 @@ void btrfs_freeze_block_group(struct btrfs_block_group *cache)
void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
- struct extent_map_tree *em_tree;
- struct extent_map *em;
bool cleanup;
spin_lock(&block_group->lock);
@@ -4416,17 +4386,16 @@ void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
spin_unlock(&block_group->lock);
if (cleanup) {
- em_tree = &fs_info->mapping_tree;
- write_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, block_group->start,
- 1);
- BUG_ON(!em); /* logic error, can't happen */
- remove_extent_mapping(em_tree, em);
- write_unlock(&em_tree->lock);
-
- /* once for us and once for the tree */
- free_extent_map(em);
- free_extent_map(em);
+ struct btrfs_chunk_map *map;
+
+ map = btrfs_find_chunk_map(fs_info, block_group->start, 1);
+ /* Logic error, can't happen. */
+ ASSERT(map);
+
+ btrfs_remove_chunk_map(fs_info, map);
+
+ /* Once for our lookup reference. */
+ btrfs_free_chunk_map(map);
/*
* We may have left one free space entry and other possible
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index 2bdbcb834f95..c4a1f01cc1c2 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -5,6 +5,8 @@
#include "free-space-cache.h"
+struct btrfs_chunk_map;
+
enum btrfs_disk_cache_state {
BTRFS_DC_WRITTEN,
BTRFS_DC_ERROR,
@@ -243,7 +245,7 @@ struct btrfs_block_group {
u64 zone_unusable;
u64 zone_capacity;
u64 meta_write_pointer;
- struct map_lookup *physical_map;
+ struct btrfs_chunk_map *physical_map;
struct list_head active_bg_list;
struct work_struct zone_finish_work;
struct extent_buffer *last_eb;
@@ -297,7 +299,7 @@ struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
struct btrfs_fs_info *fs_info,
const u64 chunk_offset);
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
- u64 group_start, struct extent_map *em);
+ struct btrfs_chunk_map *map);
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
void btrfs_mark_bg_unused(struct btrfs_block_group *bg);
void btrfs_reclaim_bgs_work(struct work_struct *work);
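The btrfs_chunk_map API that replaces extent maps here is reference counted: every successful btrfs_find_chunk_map() or btrfs_get_chunk_map() lookup takes a reference that the caller must drop with btrfs_free_chunk_map(). A condensed sketch of the walk pattern used throughout this series (the walk_chunk_maps() wrapper is hypothetical; the helpers and the start/chunk_len fields are as shown above):

	static void walk_chunk_maps(struct btrfs_fs_info *fs_info)
	{
		u64 start = 0;

		while (true) {
			struct btrfs_chunk_map *map;

			/* Takes a reference on the returned map. */
			map = btrfs_find_chunk_map(fs_info, start, 1);
			if (!map)
				break;
			/* ... inspect map->type, map->stripes, ... */
			start = map->start + map->chunk_len;
			/* Drops the lookup reference. */
			btrfs_free_chunk_map(map);
		}
	}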
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 5572ae52444e..7f7c5a92d2b8 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -69,6 +69,8 @@ enum {
BTRFS_INODE_VERITY_IN_PROGRESS,
/* Set when this inode is a free space inode. */
BTRFS_INODE_FREE_SPACE_INODE,
+ /* Set when there are no capabilities in xattrs for the inode. */
+ BTRFS_INODE_NO_CAP_XATTR,
};
/* in memory btrfs inode */
@@ -107,9 +109,11 @@ struct btrfs_inode {
/*
* Keep track of where the inode has extent items mapped in order to
- * make sure the i_size adjustments are accurate
+ * make sure the i_size adjustments are accurate. Not required when the
+ * filesystem has NO_HOLES; that status can't change while mounted, as
+ * it's a mkfs-time feature.
*/
- struct extent_io_tree file_extent_tree;
+ struct extent_io_tree *file_extent_tree;
/* held while logging the inode in tree-log.c */
struct mutex log_mutex;
@@ -487,7 +491,7 @@ struct inode *btrfs_iget_path(struct super_block *s, u64 ino,
struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root);
struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
struct page *page, size_t pg_offset,
- u64 start, u64 end);
+ u64 start, u64 len);
int btrfs_update_inode(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode);
int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 19b22b4653c8..193168214eeb 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -20,6 +20,7 @@
#include <linux/slab.h>
#include <linux/sched/mm.h>
#include <linux/log2.h>
+#include <linux/shrinker.h>
#include <crypto/hash.h>
#include "misc.h"
#include "ctree.h"
@@ -163,13 +164,107 @@ static int compression_decompress(int type, struct list_head *ws,
static void btrfs_free_compressed_pages(struct compressed_bio *cb)
{
for (unsigned int i = 0; i < cb->nr_pages; i++)
- put_page(cb->compressed_pages[i]);
+ btrfs_free_compr_page(cb->compressed_pages[i]);
kfree(cb->compressed_pages);
}
static int btrfs_decompress_bio(struct compressed_bio *cb);
-static void end_compressed_bio_read(struct btrfs_bio *bbio)
+/*
+ * Global cache of recently unused pages for compression/decompression.
+ */
+static struct btrfs_compr_pool {
+ struct shrinker *shrinker;
+ spinlock_t lock;
+ struct list_head list;
+ int count;
+ int thresh;
+} compr_pool;
+
+static unsigned long btrfs_compr_pool_count(struct shrinker *sh, struct shrink_control *sc)
+{
+ int ret;
+
+ /*
+ * Read both values only once: if the subtraction were expanded inline
+ * in the return statement, the values could change between the reads
+ * and we could accidentally return a negative number even though the
+ * first comparison saw a positive one.
+ */
+ ret = READ_ONCE(compr_pool.count) - READ_ONCE(compr_pool.thresh);
+
+ return ret > 0 ? ret : 0;
+}
+
+static unsigned long btrfs_compr_pool_scan(struct shrinker *sh, struct shrink_control *sc)
+{
+ struct list_head remove;
+ struct list_head *tmp, *next;
+ int freed;
+
+ if (compr_pool.count == 0)
+ return SHRINK_STOP;
+
+ INIT_LIST_HEAD(&remove);
+
+ /* For now, simply drain the whole list. */
+ spin_lock(&compr_pool.lock);
+ list_splice_init(&compr_pool.list, &remove);
+ freed = compr_pool.count;
+ compr_pool.count = 0;
+ spin_unlock(&compr_pool.lock);
+
+ list_for_each_safe(tmp, next, &remove) {
+ struct page *page = list_entry(tmp, struct page, lru);
+
+ ASSERT(page_ref_count(page) == 1);
+ put_page(page);
+ }
+
+ return freed;
+}
+
+/*
+ * Common page allocation wrappers for the compression code.
+ */
+struct page *btrfs_alloc_compr_page(void)
+{
+ struct page *page = NULL;
+
+ spin_lock(&compr_pool.lock);
+ if (compr_pool.count > 0) {
+ page = list_first_entry(&compr_pool.list, struct page, lru);
+ list_del_init(&page->lru);
+ compr_pool.count--;
+ }
+ spin_unlock(&compr_pool.lock);
+
+ if (page)
+ return page;
+
+ return alloc_page(GFP_NOFS);
+}
+
+void btrfs_free_compr_page(struct page *page)
+{
+ bool do_free = false;
+
+ spin_lock(&compr_pool.lock);
+ if (compr_pool.count > compr_pool.thresh) {
+ do_free = true;
+ } else {
+ list_add(&page->lru, &compr_pool.list);
+ compr_pool.count++;
+ }
+ spin_unlock(&compr_pool.lock);
+
+ if (!do_free)
+ return;
+
+ ASSERT(page_ref_count(page) == 1);
+ put_page(page);
+}
+
+static void end_bbio_compressed_read(struct btrfs_bio *bbio)
{
struct compressed_bio *cb = to_compressed_bio(bbio);
blk_status_t status = bbio->bio.bi_status;
@@ -211,8 +306,8 @@ static noinline void end_compressed_writeback(const struct compressed_bio *cb)
for (i = 0; i < ret; i++) {
struct folio *folio = fbatch.folios[i];
- btrfs_page_clamp_clear_writeback(fs_info, &folio->page,
- cb->start, cb->len);
+ btrfs_folio_clamp_clear_writeback(fs_info, folio,
+ cb->start, cb->len);
}
folio_batch_release(&fbatch);
}
@@ -242,7 +337,7 @@ static void btrfs_finish_compressed_write_work(struct work_struct *work)
* This also calls the writeback end hooks for the file pages so that metadata
* and checksums can be updated in the file.
*/
-static void end_compressed_bio_write(struct btrfs_bio *bbio)
+static void end_bbio_compressed_write(struct btrfs_bio *bbio)
{
struct compressed_bio *cb = to_compressed_bio(bbio);
struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
@@ -289,7 +384,7 @@ void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered,
cb = alloc_compressed_bio(inode, ordered->file_offset,
REQ_OP_WRITE | write_flags,
- end_compressed_bio_write);
+ end_bbio_compressed_write);
cb->start = ordered->file_offset;
cb->len = ordered->num_bytes;
cb->compressed_pages = compressed_pages;
@@ -446,7 +541,8 @@ static noinline int add_ra_bio_pages(struct inode *inode,
* subpage::readers and to unlock the page.
*/
if (fs_info->sectorsize < PAGE_SIZE)
- btrfs_subpage_start_reader(fs_info, page, cur, add_size);
+ btrfs_subpage_start_reader(fs_info, page_folio(page),
+ cur, add_size);
put_page(page);
cur += add_size;
}
@@ -489,11 +585,11 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio)
goto out;
}
- ASSERT(em->compress_type != BTRFS_COMPRESS_NONE);
+ ASSERT(extent_map_is_compressed(em));
compressed_len = em->block_len;
cb = alloc_compressed_bio(inode, file_offset, REQ_OP_READ,
- end_compressed_bio_read);
+ end_bbio_compressed_read);
cb->start = em->orig_start;
em_len = em->len;
@@ -501,7 +597,7 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio)
cb->len = bbio->bio.bi_iter.bi_size;
cb->compressed_len = compressed_len;
- cb->compress_type = em->compress_type;
+ cb->compress_type = extent_map_compression(em);
cb->orig_bbio = bbio;
free_extent_map(em);
@@ -513,7 +609,7 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio)
goto out_free_bio;
}
- ret2 = btrfs_alloc_page_array(cb->nr_pages, cb->compressed_pages);
+ ret2 = btrfs_alloc_page_array(cb->nr_pages, cb->compressed_pages, 0);
if (ret2) {
ret = BLK_STS_RESOURCE;
goto out_free_compressed_pages;
@@ -960,15 +1056,36 @@ int __init btrfs_init_compress(void)
offsetof(struct compressed_bio, bbio.bio),
BIOSET_NEED_BVECS))
return -ENOMEM;
+
+ compr_pool.shrinker = shrinker_alloc(SHRINKER_NONSLAB, "btrfs-compr-pages");
+ if (!compr_pool.shrinker)
+ return -ENOMEM;
+
btrfs_init_workspace_manager(BTRFS_COMPRESS_NONE);
btrfs_init_workspace_manager(BTRFS_COMPRESS_ZLIB);
btrfs_init_workspace_manager(BTRFS_COMPRESS_LZO);
zstd_init_workspace_manager();
+
+ spin_lock_init(&compr_pool.lock);
+ INIT_LIST_HEAD(&compr_pool.list);
+ compr_pool.count = 0;
+ /* 128K / 4K = 32 pages, and with 8 threads that's 256 pages. */
+ compr_pool.thresh = BTRFS_MAX_COMPRESSED / PAGE_SIZE * 8;
+ compr_pool.shrinker->count_objects = btrfs_compr_pool_count;
+ compr_pool.shrinker->scan_objects = btrfs_compr_pool_scan;
+ compr_pool.shrinker->batch = 32;
+ compr_pool.shrinker->seeks = DEFAULT_SEEKS;
+ shrinker_register(compr_pool.shrinker);
+
return 0;
}
void __cold btrfs_exit_compress(void)
{
+ /* For now the scan callback drains all pages and ignores its parameters. */
+ btrfs_compr_pool_scan(NULL, NULL);
+ shrinker_free(compr_pool.shrinker);
+
btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_NONE);
btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_ZLIB);
btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_LZO);
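A usage sketch for the new page pool (the compress_one_sector() caller is hypothetical; btrfs_alloc_compr_page() and btrfs_free_compr_page() are as implemented above). Allocation prefers the pool and falls back to alloc_page(GFP_NOFS); freeing caches the page until the pool threshold is reached, beyond which it goes back to the page allocator:

	static int compress_one_sector(void)
	{
		struct page *page;

		page = btrfs_alloc_compr_page();	/* pool hit, or alloc_page(GFP_NOFS) */
		if (!page)
			return -ENOMEM;
		/* ... fill the page with compressed data and submit it ... */
		btrfs_free_compr_page(page);	/* pooled below thresh, else put_page() */
		return 0;
	}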
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 03bb9d143fa7..93cc92974dee 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -32,6 +32,8 @@ static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0);
#define BTRFS_ZLIB_DEFAULT_LEVEL 3
+struct page;
+
struct compressed_bio {
/* Number of compressed pages in the array */
unsigned int nr_pages;
@@ -96,6 +98,9 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio);
unsigned int btrfs_compress_str2level(unsigned int type, const char *str);
+struct page *btrfs_alloc_compr_page(void);
+void btrfs_free_compr_page(struct page *page);
+
enum btrfs_compression_type {
BTRFS_COMPRESS_NONE = 0,
BTRFS_COMPRESS_ZLIB = 1,
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 35c1d24d4a78..e65e012bac55 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -370,33 +370,41 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
/*
* check if the tree block can be shared by multiple trees
*/
-int btrfs_block_can_be_shared(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct extent_buffer *buf)
+bool btrfs_block_can_be_shared(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *buf)
{
+ const u64 buf_gen = btrfs_header_generation(buf);
+
/*
* Tree blocks not in shareable trees and tree roots are never shared.
* If a block was allocated after the last snapshot and the block was
* not allocated by tree relocation, we know the block is not shared.
*/
- if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
- buf != root->node &&
- (btrfs_header_generation(buf) <=
- btrfs_root_last_snapshot(&root->root_item) ||
- btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) {
- if (buf != root->commit_root)
- return 1;
- /*
- * An extent buffer that used to be the commit root may still be
- * shared because the tree height may have increased and it
- * became a child of a higher level root. This can happen when
- * snapshotting a subvolume created in the current transaction.
- */
- if (btrfs_header_generation(buf) == trans->transid)
- return 1;
- }
- return 0;
+ if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
+ return false;
+
+ if (buf == root->node)
+ return false;
+
+ if (buf_gen > btrfs_root_last_snapshot(&root->root_item) &&
+ !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))
+ return false;
+
+ if (buf != root->commit_root)
+ return true;
+
+ /*
+ * An extent buffer that used to be the commit root may still be shared
+ * because the tree height may have increased and it became a child of a
+ * higher level root. This can happen when snapshotting a subvolume
+ * created in the current transaction.
+ */
+ if (buf_gen == trans->transid)
+ return true;
+
+ return false;
}
static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
@@ -812,7 +820,8 @@ int btrfs_bin_search(struct extent_buffer *eb, int first_slot,
}
while (low < high) {
- unsigned long oip;
+ const int unit_size = folio_size(eb->folios[0]);
+ unsigned long oil;
unsigned long offset;
struct btrfs_disk_key *tmp;
struct btrfs_disk_key unaligned;
@@ -820,14 +829,14 @@ int btrfs_bin_search(struct extent_buffer *eb, int first_slot,
mid = (low + high) / 2;
offset = p + mid * item_size;
- oip = offset_in_page(offset);
+ oil = get_eb_offset_in_folio(eb, offset);
- if (oip + key_size <= PAGE_SIZE) {
- const unsigned long idx = get_eb_page_index(offset);
- char *kaddr = page_address(eb->pages[idx]);
+ if (oil + key_size <= unit_size) {
+ const unsigned long idx = get_eb_folio_index(eb, offset);
+ char *kaddr = folio_address(eb->folios[idx]);
- oip = get_eb_offset_in_page(eb, offset);
- tmp = (struct btrfs_disk_key *)(kaddr + oip);
+ oil = get_eb_offset_in_folio(eb, offset);
+ tmp = (struct btrfs_disk_key *)(kaddr + oil);
} else {
read_extent_buffer(eb, &unaligned, offset, key_size);
tmp = &unaligned;
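The btrfs_bin_search() change above illustrates the general folio access rule for extent buffers: a structure is mapped in place only when it does not cross a folio boundary, otherwise it is copied into a local buffer. A distilled sketch (the read_key() helper is hypothetical; the other helpers are from this diff):

	static struct btrfs_disk_key *read_key(const struct extent_buffer *eb,
					       unsigned long offset,
					       struct btrfs_disk_key *local)
	{
		const unsigned long oil = get_eb_offset_in_folio(eb, offset);

		if (oil + sizeof(*local) <= folio_size(eb->folios[0])) {
			const unsigned long idx = get_eb_folio_index(eb, offset);
			char *kaddr = folio_address(eb->folios[idx]);

			/* The key sits fully inside one folio: map it in place. */
			return (struct btrfs_disk_key *)(kaddr + oil);
		}
		/* The key straddles a folio boundary: copy it out first. */
		read_extent_buffer(eb, local, offset, sizeof(*local));
		return local;
	}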
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 196c005c31f6..70e828d33177 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -212,8 +212,6 @@ struct btrfs_root {
u64 last_trans;
- u32 type;
-
u64 free_objectid;
struct btrfs_key defrag_progress;
@@ -224,18 +222,15 @@ struct btrfs_root {
struct list_head root_list;
- spinlock_t log_extents_lock[2];
- struct list_head logged_list[2];
-
spinlock_t inode_lock;
/* red-black tree that keeps track of in-memory inodes */
struct rb_root inode_tree;
/*
- * radix tree that keeps track of delayed nodes of every inode,
- * protected by inode_lock
+ * Xarray that keeps track of delayed nodes of every inode, protected
+ * by @inode_lock.
*/
- struct radix_tree_root delayed_nodes_tree;
+ struct xarray delayed_nodes;
/*
* right now this just gets used so that a root has its own devid
* for stat. It may be used for more later
@@ -561,9 +556,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
struct extent_buffer **cow_ret, u64 new_root_objectid);
-int btrfs_block_can_be_shared(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct extent_buffer *buf);
+bool btrfs_block_can_be_shared(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *buf);
int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_path *path, int level, int slot);
void btrfs_extend_item(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c
index 5244561e2016..c276b136ab63 100644
--- a/fs/btrfs/defrag.c
+++ b/fs/btrfs/defrag.c
@@ -775,7 +775,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
* this em, as either we don't care about the generation, or the
* merged extent map will be rejected anyway.
*/
- if (em && test_bit(EXTENT_FLAG_MERGED, &em->flags) &&
+ if (em && (em->flags & EXTENT_FLAG_MERGED) &&
newer_than && em->generation >= newer_than) {
free_extent_map(em);
em = NULL;
@@ -802,7 +802,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info,
const struct extent_map *em)
{
- if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+ if (extent_map_is_compressed(em))
return BTRFS_MAX_COMPRESSED;
return fs_info->max_extent_size;
}
@@ -828,7 +828,7 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
/* No more em or hole */
if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
goto out;
- if (test_bit(EXTENT_FLAG_PREALLOC, &next->flags))
+ if (next->flags & EXTENT_FLAG_PREALLOC)
goto out;
/*
* If the next extent is at its max capacity, defragging current extent
@@ -996,10 +996,9 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
em->len <= inode->root->fs_info->max_inline)
goto next;
- /* Skip hole/delalloc/preallocated extents */
+ /* Skip holes and preallocated extents. */
if (em->block_start == EXTENT_MAP_HOLE ||
- em->block_start == EXTENT_MAP_DELALLOC ||
- test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ (em->flags & EXTENT_FLAG_PREALLOC))
goto next;
/* Skip older extent */
@@ -1190,7 +1189,7 @@ static int defrag_one_locked_target(struct btrfs_inode *inode,
/* Update the page status */
for (i = start_index - first_index; i <= last_index - first_index; i++) {
ClearPageChecked(pages[i]);
- btrfs_page_clamp_set_dirty(fs_info, pages[i], start, len);
+ btrfs_folio_clamp_set_dirty(fs_info, page_folio(pages[i]), start, len);
}
btrfs_delalloc_release_extents(inode, len);
extent_changeset_free(data_reserved);
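These defrag hunks reflect that extent map flags moved from atomic test_bit() bits to a plain bit mask, so they can be tested with ordinary bitwise operators. A sketch (the em_should_skip() helper is hypothetical; the flag names and extent_map_is_compressed() are from this diff):

	static bool em_should_skip(const struct extent_map *em)
	{
		/* Plain bitwise tests, no atomic test_bit() needed anymore. */
		if (em->flags & EXTENT_FLAG_PREALLOC)
			return true;
		return extent_map_is_compressed(em);
	}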
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 7381241334e8..08102883f560 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -71,7 +71,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
}
spin_lock(&root->inode_lock);
- node = radix_tree_lookup(&root->delayed_nodes_tree, ino);
+ node = xa_load(&root->delayed_nodes, ino);
if (node) {
if (btrfs_inode->delayed_node) {
@@ -83,9 +83,9 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
/*
* It's possible that we're racing into the middle of removing
- * this node from the radix tree. In this case, the refcount
+ * this node from the xarray. In this case, the refcount
* was zero and it should never go back to one. Just return
- * NULL like it was never in the radix at all; our release
+ * NULL like it was never in the xarray at all; our release
* function is in the process of removing it.
*
* Some implementations of refcount_inc refuse to bump the
@@ -93,7 +93,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
* here, refcount_inc() may decide to just WARN_ONCE() instead
* of actually bumping the refcount.
*
- * If this node is properly in the radix, we want to bump the
+ * If this node is properly in the xarray, we want to bump the
* refcount twice, once for the inode and once for this get
* operation.
*/
@@ -120,6 +120,7 @@ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
struct btrfs_root *root = btrfs_inode->root;
u64 ino = btrfs_ino(btrfs_inode);
int ret;
+ void *ptr;
again:
node = btrfs_get_delayed_node(btrfs_inode);
@@ -131,26 +132,30 @@ again:
return ERR_PTR(-ENOMEM);
btrfs_init_delayed_node(node, root, ino);
- /* cached in the btrfs inode and can be accessed */
+ /* Cached in the inode (one ref) and returned to the caller (one ref). */
refcount_set(&node->refs, 2);
- ret = radix_tree_preload(GFP_NOFS);
- if (ret) {
+ /* Allocate and reserve the slot; from now on xa_load() can return NULL for it. */
+ ret = xa_reserve(&root->delayed_nodes, ino, GFP_NOFS);
+ if (ret == -ENOMEM) {
kmem_cache_free(delayed_node_cache, node);
- return ERR_PTR(ret);
+ return ERR_PTR(-ENOMEM);
}
-
spin_lock(&root->inode_lock);
- ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node);
- if (ret == -EEXIST) {
+ ptr = xa_load(&root->delayed_nodes, ino);
+ if (ptr) {
+ /* Somebody inserted it, go back and read it. */
spin_unlock(&root->inode_lock);
kmem_cache_free(delayed_node_cache, node);
- radix_tree_preload_end();
+ node = NULL;
goto again;
}
+ ptr = xa_store(&root->delayed_nodes, ino, node, GFP_ATOMIC);
+ ASSERT(xa_err(ptr) != -EINVAL);
+ ASSERT(xa_err(ptr) != -ENOMEM);
+ ASSERT(ptr == NULL);
btrfs_inode->delayed_node = node;
spin_unlock(&root->inode_lock);
- radix_tree_preload_end();
return node;
}
@@ -269,8 +274,7 @@ static void __btrfs_release_delayed_node(
* back up. We can delete it now.
*/
ASSERT(refcount_read(&delayed_node->refs) == 0);
- radix_tree_delete(&root->delayed_nodes_tree,
- delayed_node->inode_id);
+ xa_erase(&root->delayed_nodes, delayed_node->inode_id);
spin_unlock(&root->inode_lock);
kmem_cache_free(delayed_node_cache, delayed_node);
}
@@ -1036,14 +1040,33 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
if (!test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags))
goto out;
- path->slots[0]++;
- if (path->slots[0] >= btrfs_header_nritems(leaf))
- goto search;
-again:
+ /*
+ * Now we're going to delete the INODE_REF/EXTREF, which should be the
+ * only ref left. Check if the next item is an INODE_REF/EXTREF.
+ *
+ * But if we're already at the last item, release the path and search
+ * for the last INODE_REF/EXTREF.
+ */
+ if (path->slots[0] + 1 >= btrfs_header_nritems(leaf)) {
+ key.objectid = node->inode_id;
+ key.type = BTRFS_INODE_EXTREF_KEY;
+ key.offset = (u64)-1;
+
+ btrfs_release_path(path);
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret < 0)
+ goto err_out;
+ ASSERT(ret > 0);
+ ASSERT(path->slots[0] > 0);
+ ret = 0;
+ path->slots[0]--;
+ leaf = path->nodes[0];
+ } else {
+ path->slots[0]++;
+ }
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
if (key.objectid != node->inode_id)
goto out;
-
if (key.type != BTRFS_INODE_REF_KEY &&
key.type != BTRFS_INODE_EXTREF_KEY)
goto out;
@@ -1070,22 +1093,6 @@ err_out:
btrfs_abort_transaction(trans, ret);
return ret;
-
-search:
- btrfs_release_path(path);
-
- key.type = BTRFS_INODE_EXTREF_KEY;
- key.offset = -1;
-
- ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
- if (ret < 0)
- goto err_out;
- ASSERT(ret);
-
- ret = 0;
- leaf = path->nodes[0];
- path->slots[0]--;
- goto again;
}
static inline int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
@@ -2035,34 +2042,36 @@ void btrfs_kill_delayed_inode_items(struct btrfs_inode *inode)
void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
{
- u64 inode_id = 0;
+ unsigned long index = 0;
struct btrfs_delayed_node *delayed_nodes[8];
- int i, n;
while (1) {
+ struct btrfs_delayed_node *node;
+ int count;
+
spin_lock(&root->inode_lock);
- n = radix_tree_gang_lookup(&root->delayed_nodes_tree,
- (void **)delayed_nodes, inode_id,
- ARRAY_SIZE(delayed_nodes));
- if (!n) {
+ if (xa_empty(&root->delayed_nodes)) {
spin_unlock(&root->inode_lock);
- break;
+ return;
}
- inode_id = delayed_nodes[n - 1]->inode_id + 1;
- for (i = 0; i < n; i++) {
+ count = 0;
+ xa_for_each_start(&root->delayed_nodes, index, node, index) {
/*
* Don't increase refs in case the node is dead and
* about to be removed from the tree in the loop below
*/
- if (!refcount_inc_not_zero(&delayed_nodes[i]->refs))
- delayed_nodes[i] = NULL;
+ if (refcount_inc_not_zero(&node->refs)) {
+ delayed_nodes[count] = node;
+ count++;
+ }
+ if (count >= ARRAY_SIZE(delayed_nodes))
+ break;
}
spin_unlock(&root->inode_lock);
+ index++;
- for (i = 0; i < n; i++) {
- if (!delayed_nodes[i])
- continue;
+ for (int i = 0; i < count; i++) {
__btrfs_kill_delayed_node(delayed_nodes[i]);
btrfs_release_delayed_node(delayed_nodes[i]);
}
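The conversion above follows the usual xarray insertion pattern: xa_reserve() preallocates the slot outside the spinlock so that the xa_store() done under the lock cannot fail with -ENOMEM. A condensed sketch, with the names from the hunk:

	int ret;

	ret = xa_reserve(&root->delayed_nodes, ino, GFP_NOFS);
	if (ret)
		return ERR_PTR(ret);

	spin_lock(&root->inode_lock);
	if (xa_load(&root->delayed_nodes, ino)) {
		/* Somebody else inserted it first: retry and reuse theirs. */
		spin_unlock(&root->inode_lock);
		goto again;
	}
	/* Cannot fail with -ENOMEM thanks to the reservation above. */
	xa_store(&root->delayed_nodes, ino, node, GFP_ATOMIC);
	spin_unlock(&root->inode_lock);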
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index f9544fda38e9..1502d664c892 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -550,8 +550,7 @@ bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
u64 physical)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
u64 chunk_offset = cache->start;
int num_extents, cur_extent;
int i;
@@ -567,9 +566,8 @@ bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
}
spin_unlock(&cache->lock);
- em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
- ASSERT(!IS_ERR(em));
- map = em->map_lookup;
+ map = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
+ ASSERT(!IS_ERR(map));
num_extents = 0;
cur_extent = 0;
@@ -583,7 +581,7 @@ bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
cur_extent = i;
}
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
if (num_extents > 1 && cur_extent < num_extents - 1) {
/*
@@ -812,25 +810,23 @@ static void btrfs_dev_replace_update_device_in_mapping_tree(
struct btrfs_device *srcdev,
struct btrfs_device *tgtdev)
{
- struct extent_map_tree *em_tree = &fs_info->mapping_tree;
- struct extent_map *em;
- struct map_lookup *map;
u64 start = 0;
int i;
- write_lock(&em_tree->lock);
+ write_lock(&fs_info->mapping_tree_lock);
do {
- em = lookup_extent_mapping(em_tree, start, (u64)-1);
- if (!em)
+ struct btrfs_chunk_map *map;
+
+ map = btrfs_find_chunk_map_nolock(fs_info, start, U64_MAX);
+ if (!map)
break;
- map = em->map_lookup;
for (i = 0; i < map->num_stripes; i++)
if (srcdev == map->stripes[i].dev)
map->stripes[i].dev = tgtdev;
- start = em->start + em->len;
- free_extent_map(em);
+ start = map->start + map->chunk_len;
+ btrfs_free_chunk_map(map);
} while (start);
- write_unlock(&em_tree->lock);
+ write_unlock(&fs_info->mapping_tree_lock);
}
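Note the locking variant: since the stripe update must be atomic with respect to all other chunk map users, the loop holds fs_info->mapping_tree_lock for writing and therefore needs btrfs_find_chunk_map_nolock(). A condensed sketch of that shape, using only the helpers from the hunk above:

	struct btrfs_chunk_map *map;
	u64 start = 0;
	int i;

	write_lock(&fs_info->mapping_tree_lock);
	while ((map = btrfs_find_chunk_map_nolock(fs_info, start, U64_MAX))) {
		/* The write lock excludes all other users of the tree. */
		for (i = 0; i < map->num_stripes; i++)
			if (map->stripes[i].dev == srcdev)
				map->stripes[i].dev = tgtdev;
		start = map->start + map->chunk_len;
		btrfs_free_chunk_map(map);
	}
	write_unlock(&fs_info->mapping_tree_lock);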
static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 62cb97f7c94f..c6907d533fe8 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -74,20 +74,37 @@ static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info)
static void csum_tree_block(struct extent_buffer *buf, u8 *result)
{
struct btrfs_fs_info *fs_info = buf->fs_info;
- const int num_pages = num_extent_pages(buf);
- const int first_page_part = min_t(u32, PAGE_SIZE, fs_info->nodesize);
+ int num_pages;
+ u32 first_page_part;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
char *kaddr;
int i;
shash->tfm = fs_info->csum_shash;
crypto_shash_init(shash);
- kaddr = page_address(buf->pages[0]) + offset_in_page(buf->start);
+
+ if (buf->addr) {
+ /* Pages are contiguous, handle them as a big one. */
+ kaddr = buf->addr;
+ first_page_part = fs_info->nodesize;
+ num_pages = 1;
+ } else {
+ kaddr = folio_address(buf->folios[0]);
+ first_page_part = min_t(u32, PAGE_SIZE, fs_info->nodesize);
+ num_pages = num_extent_pages(buf);
+ }
+
crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
first_page_part - BTRFS_CSUM_SIZE);
+ /*
+ * Only the multiple single-page folios case reaches this loop.
+ *
+ * The nodesize <= PAGE_SIZE and large folio cases are already fully
+ * handled by the crypto_shash_update() above.
+ */
for (i = 1; i < num_pages && INLINE_EXTENT_BUFFER_PAGES > 1; i++) {
- kaddr = page_address(buf->pages[i]);
+ kaddr = folio_address(buf->folios[i]);
crypto_shash_update(shash, kaddr, PAGE_SIZE);
}
memset(result, 0, BTRFS_CSUM_SIZE);
@@ -166,20 +183,22 @@ static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb,
int mirror_num)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
- int i, num_pages = num_extent_pages(eb);
+ int num_folios = num_extent_folios(eb);
int ret = 0;
if (sb_rdonly(fs_info->sb))
return -EROFS;
- for (i = 0; i < num_pages; i++) {
- struct page *p = eb->pages[i];
- u64 start = max_t(u64, eb->start, page_offset(p));
- u64 end = min_t(u64, eb->start + eb->len, page_offset(p) + PAGE_SIZE);
+ for (int i = 0; i < num_folios; i++) {
+ struct folio *folio = eb->folios[i];
+ u64 start = max_t(u64, eb->start, folio_pos(folio));
+ u64 end = min_t(u64, eb->start + eb->len,
+ folio_pos(folio) + folio_size(folio));
u32 len = end - start;
ret = btrfs_repair_io_failure(fs_info, 0, start, len,
- start, p, offset_in_page(start), mirror_num);
+ start, folio, offset_in_folio(folio, start),
+ mirror_num);
if (ret)
break;
}
@@ -254,15 +273,20 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio)
if (WARN_ON_ONCE(bbio->bio.bi_iter.bi_size != eb->len))
return BLK_STS_IOERR;
- if (test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags)) {
- WARN_ON_ONCE(found_start != 0);
+ /*
+ * If an extent_buffer is marked as EXTENT_BUFFER_ZONED_ZEROOUT, don't
+ * checksum it but zero-out its content. This is done to preserve
+ * ordering of I/O without unnecessarily writing out data.
+ */
+ if (test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags)) {
+ memzero_extent_buffer(eb, 0, eb->len);
return BLK_STS_OK;
}
if (WARN_ON_ONCE(found_start != eb->start))
return BLK_STS_IOERR;
- if (WARN_ON(!btrfs_page_test_uptodate(fs_info, eb->pages[0], eb->start,
- eb->len)))
+ if (WARN_ON(!btrfs_folio_test_uptodate(fs_info, eb->folios[0],
+ eb->start, eb->len)))
return BLK_STS_IOERR;
ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid,
@@ -371,8 +395,8 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb,
}
csum_tree_block(eb, result);
- header_csum = page_address(eb->pages[0]) +
- get_eb_offset_in_page(eb, offsetof(struct btrfs_header, csum));
+ header_csum = folio_address(eb->folios[0]) +
+ get_eb_offset_in_folio(eb, offsetof(struct btrfs_header, csum));
if (memcmp(result, header_csum, csum_size) != 0) {
btrfs_warn_rl(fs_info,
@@ -639,7 +663,8 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
root->nr_delalloc_inodes = 0;
root->nr_ordered_extents = 0;
root->inode_tree = RB_ROOT;
- INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
+ /* GFP flags are compatible with XA_FLAGS_*. */
+ xa_init_flags(&root->delayed_nodes, GFP_ATOMIC);
btrfs_init_root_block_rsv(root);
@@ -650,14 +675,10 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
INIT_LIST_HEAD(&root->ordered_extents);
INIT_LIST_HEAD(&root->ordered_root);
INIT_LIST_HEAD(&root->reloc_dirty_list);
- INIT_LIST_HEAD(&root->logged_list[0]);
- INIT_LIST_HEAD(&root->logged_list[1]);
spin_lock_init(&root->inode_lock);
spin_lock_init(&root->delalloc_lock);
spin_lock_init(&root->ordered_extent_lock);
spin_lock_init(&root->accounting_lock);
- spin_lock_init(&root->log_extents_lock[0]);
- spin_lock_init(&root->log_extents_lock[1]);
spin_lock_init(&root->qgroup_meta_rsv_lock);
mutex_init(&root->objectid_mutex);
mutex_init(&root->log_mutex);
@@ -2618,9 +2639,6 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
*/
btrfs_set_super_log_root(sb, 0);
- /* We can't trust the free space cache either */
- btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);
-
btrfs_warn(fs_info, "try to load backup roots slot %d", i);
ret = read_backup_root(fs_info, i);
backup_index = ret;
@@ -2724,7 +2742,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
INIT_LIST_HEAD(&fs_info->allocated_ebs);
spin_lock_init(&fs_info->eb_leak_lock);
#endif
- extent_map_tree_init(&fs_info->mapping_tree);
+ fs_info->mapping_tree = RB_ROOT_CACHED;
+ rwlock_init(&fs_info->mapping_tree_lock);
btrfs_init_block_rsv(&fs_info->global_block_rsv,
BTRFS_BLOCK_RSV_GLOBAL);
btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
@@ -2794,6 +2813,9 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
fs_info->sectorsize_bits = ilog2(4096);
fs_info->stripesize = 4096;
+ /* Default compression algorithm when the user mounts with -o compress. */
+ fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
+
fs_info->max_extent_size = BTRFS_MAX_EXTENT_SIZE;
spin_lock_init(&fs_info->swapfile_pins_lock);
@@ -2931,17 +2953,6 @@ out:
}
/*
- * Some options only have meaning at mount time and shouldn't persist across
- * remounts, or be displayed. Clear these at the end of mount and remount
- * code paths.
- */
-void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info)
-{
- btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT);
- btrfs_clear_opt(fs_info->mount_opt, CLEAR_CACHE);
-}
-
-/*
* Mounting logic specific to read-write file systems. Shared by open_ctree
* and btrfs_remount when remounting from read-only to read-write.
*/
@@ -2953,7 +2964,11 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info)
if (btrfs_test_opt(fs_info, CLEAR_CACHE) &&
btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
- rebuild_free_space_tree = true;
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
+ btrfs_warn(fs_info,
+ "'clear_cache' option is ignored with extent tree v2");
+ else
+ rebuild_free_space_tree = true;
} else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID)) {
btrfs_warn(fs_info, "free space tree is invalid");
@@ -3276,13 +3291,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR)
WRITE_ONCE(fs_info->fs_error, -EUCLEAN);
- /*
- * In the long term, we'll store the compression type in the super
- * block, and it'll be used for per file compression control.
- */
- fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
-
-
/* Set up fs_info before parsing mount options */
nodesize = btrfs_super_nodesize(disk_super);
sectorsize = btrfs_super_sectorsize(disk_super);
@@ -3296,28 +3304,30 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
fs_info->stripesize = stripesize;
- ret = btrfs_parse_options(fs_info, options, sb->s_flags);
- if (ret)
+ /*
+ * Handle the space caching options appropriately now that we have the
+ * super block loaded and validated.
+ */
+ btrfs_set_free_space_cache_settings(fs_info);
+
+ if (!btrfs_check_options(fs_info, &fs_info->mount_opt, sb->s_flags)) {
+ ret = -EINVAL;
goto fail_alloc;
+ }
ret = btrfs_check_features(fs_info, !sb_rdonly(sb));
if (ret < 0)
goto fail_alloc;
+ /*
+ * At this point our mount options are validated; if ->max_inline was
+ * set to something non-standard, make sure we truncate it to the sectorsize.
+ */
+ fs_info->max_inline = min_t(u64, fs_info->max_inline, fs_info->sectorsize);
+
if (sectorsize < PAGE_SIZE) {
struct btrfs_subpage_info *subpage_info;
- /*
- * V1 space cache has some hardcoded PAGE_SIZE usage, and is
- * going to be deprecated.
- *
- * Force to use v2 cache for subpage case.
- */
- btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE);
- btrfs_set_and_info(fs_info, FREE_SPACE_TREE,
- "forcing free space tree for sector size %u with page size %lu",
- sectorsize, PAGE_SIZE);
-
btrfs_warn(fs_info,
"read-write for sector size %u with page size %lu is experimental",
sectorsize, PAGE_SIZE);
@@ -3494,29 +3504,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
goto fail_cleaner;
}
- if (!btrfs_test_opt(fs_info, NOSSD) &&
- !fs_info->fs_devices->rotating) {
- btrfs_set_and_info(fs_info, SSD, "enabling ssd optimizations");
- }
-
- /*
- * For devices supporting discard turn on discard=async automatically,
- * unless it's already set or disabled. This could be turned off by
- * nodiscard for the same mount.
- *
- * The zoned mode piggy backs on the discard functionality for
- * resetting a zone. There is no reason to delay the zone reset as it is
- * fast enough. So, do not enable async discard for zoned mode.
- */
- if (!(btrfs_test_opt(fs_info, DISCARD_SYNC) ||
- btrfs_test_opt(fs_info, DISCARD_ASYNC) ||
- btrfs_test_opt(fs_info, NODISCARD)) &&
- fs_info->fs_devices->discardable &&
- !btrfs_is_zoned(fs_info)) {
- btrfs_set_and_info(fs_info, DISCARD_ASYNC,
- "auto enabling async discard");
- }
-
ret = btrfs_read_qgroup_config(fs_info);
if (ret)
goto fail_trans_kthread;
@@ -3542,7 +3529,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
}
if (sb_rdonly(sb))
- goto clear_oneshot;
+ return 0;
ret = btrfs_start_pre_rw_mount(fs_info);
if (ret) {
@@ -3570,8 +3557,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
if (test_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags))
wake_up_process(fs_info->cleaner_kthread);
-clear_oneshot:
- btrfs_clear_oneshot_options(fs_info);
return 0;
fail_qgroup:
@@ -3608,7 +3593,7 @@ fail_sb_buffer:
btrfs_stop_all_workers(fs_info);
btrfs_free_block_groups(fs_info);
fail_alloc:
- btrfs_mapping_tree_free(&fs_info->mapping_tree);
+ btrfs_mapping_tree_free(fs_info);
iput(fs_info->btree_inode);
fail:
@@ -4391,7 +4376,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
iput(fs_info->btree_inode);
- btrfs_mapping_tree_free(&fs_info->mapping_tree);
+ btrfs_mapping_tree_free(fs_info);
btrfs_close_devices(fs_info->fs_devices);
}
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 50dab8f639dc..9413726b329b 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -37,9 +37,6 @@ struct extent_buffer *btrfs_find_create_tree_block(
struct btrfs_fs_info *fs_info,
u64 bytenr, u64 owner_root,
int level);
-void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
- struct extent_buffer *buf);
-void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info);
int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info);
int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
const struct btrfs_super_block *disk_sb);
diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c
index ea149be28dff..e3ee5449cc4a 100644
--- a/fs/btrfs/extent-io-tree.c
+++ b/fs/btrfs/extent-io-tree.c
@@ -58,12 +58,13 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller,
struct extent_io_tree *tree,
u64 start, u64 end)
{
- struct btrfs_inode *inode = tree->inode;
+ const struct btrfs_inode *inode;
u64 isize;
- if (!inode)
+ if (tree->owner != IO_TREE_INODE_IO)
return;
+ inode = extent_io_tree_to_inode_const(tree);
isize = i_size_read(&inode->vfs_inode);
if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
btrfs_debug_rl(inode->root->fs_info,
@@ -78,31 +79,46 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller,
#define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0)
#endif
+
/*
- * For the file_extent_tree, we want to hold the inode lock when we lookup and
- * update the disk_i_size, but lockdep will complain because our io_tree we hold
- * the tree lock and get the inode lock when setting delalloc. These two things
- * are unrelated, so make a class for the file_extent_tree so we don't get the
- * two locking patterns mixed up.
+ * The only tree allowed to set the inode is IO_TREE_INODE_IO.
*/
-static struct lock_class_key file_extent_tree_class;
+static bool is_inode_io_tree(const struct extent_io_tree *tree)
+{
+ return tree->owner == IO_TREE_INODE_IO;
+}
+
+/* Return the inode if it's valid for the given tree, otherwise NULL. */
+struct btrfs_inode *extent_io_tree_to_inode(struct extent_io_tree *tree)
+{
+ if (tree->owner == IO_TREE_INODE_IO)
+ return tree->inode;
+ return NULL;
+}
+
+/* Read-only access to the inode. */
+const struct btrfs_inode *extent_io_tree_to_inode_const(const struct extent_io_tree *tree)
+{
+ if (tree->owner == IO_TREE_INODE_IO)
+ return tree->inode;
+ return NULL;
+}
-struct tree_entry {
- u64 start;
- u64 end;
- struct rb_node rb_node;
-};
+/* For read-only access to fs_info. */
+const struct btrfs_fs_info *extent_io_tree_to_fs_info(const struct extent_io_tree *tree)
+{
+ if (tree->owner == IO_TREE_INODE_IO)
+ return tree->inode->root->fs_info;
+ return tree->fs_info;
+}
void extent_io_tree_init(struct btrfs_fs_info *fs_info,
struct extent_io_tree *tree, unsigned int owner)
{
- tree->fs_info = fs_info;
tree->state = RB_ROOT;
spin_lock_init(&tree->lock);
- tree->inode = NULL;
+ tree->fs_info = fs_info;
tree->owner = owner;
- if (owner == IO_TREE_INODE_FILE_EXTENT)
- lockdep_set_class(&tree->lock, &file_extent_tree_class);
}
/*
@@ -329,10 +345,14 @@ static inline struct extent_state *tree_search(struct extent_io_tree *tree, u64
return tree_search_for_insert(tree, offset, NULL, NULL);
}
-static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
+static void extent_io_tree_panic(const struct extent_io_tree *tree,
+ const struct extent_state *state,
+ const char *opname,
+ int err)
{
- btrfs_panic(tree->fs_info, err,
- "locking error: extent tree was modified by another thread while locked");
+ btrfs_panic(extent_io_tree_to_fs_info(tree), err,
+ "extent io tree error on %s state start %llu end %llu",
+ opname, state->start, state->end);
}
static void merge_prev_state(struct extent_io_tree *tree, struct extent_state *state)
@@ -341,8 +361,9 @@ static void merge_prev_state(struct extent_io_tree *tree, struct extent_state *s
prev = prev_state(state);
if (prev && prev->end == state->start - 1 && prev->state == state->state) {
- if (tree->inode)
- btrfs_merge_delalloc_extent(tree->inode, state, prev);
+ if (is_inode_io_tree(tree))
+ btrfs_merge_delalloc_extent(extent_io_tree_to_inode(tree),
+ state, prev);
state->start = prev->start;
rb_erase(&prev->rb_node, &tree->state);
RB_CLEAR_NODE(&prev->rb_node);
@@ -356,8 +377,9 @@ static void merge_next_state(struct extent_io_tree *tree, struct extent_state *s
next = next_state(state);
if (next && next->start == state->end + 1 && next->state == state->state) {
- if (tree->inode)
- btrfs_merge_delalloc_extent(tree->inode, state, next);
+ if (is_inode_io_tree(tree))
+ btrfs_merge_delalloc_extent(extent_io_tree_to_inode(tree),
+ state, next);
state->end = next->end;
rb_erase(&next->rb_node, &tree->state);
RB_CLEAR_NODE(&next->rb_node);
@@ -390,8 +412,8 @@ static void set_state_bits(struct extent_io_tree *tree,
u32 bits_to_set = bits & ~EXTENT_CTLBITS;
int ret;
- if (tree->inode)
- btrfs_set_delalloc_extent(tree->inode, state, bits);
+ if (is_inode_io_tree(tree))
+ btrfs_set_delalloc_extent(extent_io_tree_to_inode(tree), state, bits);
ret = add_extent_changeset(state, bits_to_set, changeset, 1);
BUG_ON(ret < 0);
@@ -436,9 +458,10 @@ static struct extent_state *insert_state(struct extent_io_tree *tree,
if (state->end < entry->start) {
if (try_merge && end == entry->start &&
state->state == entry->state) {
- if (tree->inode)
- btrfs_merge_delalloc_extent(tree->inode,
- state, entry);
+ if (is_inode_io_tree(tree))
+ btrfs_merge_delalloc_extent(
+ extent_io_tree_to_inode(tree),
+ state, entry);
entry->start = state->start;
merge_prev_state(tree, entry);
state->state = 0;
@@ -448,9 +471,10 @@ static struct extent_state *insert_state(struct extent_io_tree *tree,
} else if (state->end > entry->end) {
if (try_merge && entry->end == start &&
state->state == entry->state) {
- if (tree->inode)
- btrfs_merge_delalloc_extent(tree->inode,
- state, entry);
+ if (is_inode_io_tree(tree))
+ btrfs_merge_delalloc_extent(
+ extent_io_tree_to_inode(tree),
+ state, entry);
entry->end = state->end;
merge_next_state(tree, entry);
state->state = 0;
@@ -458,9 +482,6 @@ static struct extent_state *insert_state(struct extent_io_tree *tree,
}
node = &(*node)->rb_right;
} else {
- btrfs_err(tree->fs_info,
- "found node %llu %llu on insert of %llu %llu",
- entry->start, entry->end, state->start, state->end);
return ERR_PTR(-EEXIST);
}
}
@@ -505,8 +526,9 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
struct rb_node *parent = NULL;
struct rb_node **node;
- if (tree->inode)
- btrfs_split_delalloc_extent(tree->inode, orig, split);
+ if (is_inode_io_tree(tree))
+ btrfs_split_delalloc_extent(extent_io_tree_to_inode(tree), orig,
+ split);
prealloc->start = orig->start;
prealloc->end = split - 1;
@@ -553,8 +575,9 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
u32 bits_to_clear = bits & ~EXTENT_CTLBITS;
int ret;
- if (tree->inode)
- btrfs_clear_delalloc_extent(tree->inode, state, bits);
+ if (is_inode_io_tree(tree))
+ btrfs_clear_delalloc_extent(extent_io_tree_to_inode(tree), state,
+ bits);
ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
BUG_ON(ret < 0);
@@ -695,7 +718,7 @@ hit_next:
goto search_again;
err = split_state(tree, state, prealloc, start);
if (err)
- extent_io_tree_panic(tree, err);
+ extent_io_tree_panic(tree, state, "split", err);
prealloc = NULL;
if (err)
@@ -717,7 +740,7 @@ hit_next:
goto search_again;
err = split_state(tree, state, prealloc, end + 1);
if (err)
- extent_io_tree_panic(tree, err);
+ extent_io_tree_panic(tree, state, "split", err);
if (wake)
wake_up(&state->wq);
@@ -939,6 +962,8 @@ int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
struct extent_state *state;
int ret = 1;
+ ASSERT(!btrfs_fs_incompat(extent_io_tree_to_fs_info(tree), NO_HOLES));
+
spin_lock(&tree->lock);
state = find_first_extent_bit_state(tree, start, bits);
if (state) {
@@ -1152,7 +1177,7 @@ hit_next:
goto search_again;
err = split_state(tree, state, prealloc, start);
if (err)
- extent_io_tree_panic(tree, err);
+ extent_io_tree_panic(tree, state, "split", err);
prealloc = NULL;
if (err)
@@ -1200,7 +1225,7 @@ hit_next:
inserted_state = insert_state(tree, prealloc, bits, changeset);
if (IS_ERR(inserted_state)) {
err = PTR_ERR(inserted_state);
- extent_io_tree_panic(tree, err);
+ extent_io_tree_panic(tree, prealloc, "insert", err);
}
cache_state(inserted_state, cached_state);
@@ -1228,7 +1253,7 @@ hit_next:
goto search_again;
err = split_state(tree, state, prealloc, end + 1);
if (err)
- extent_io_tree_panic(tree, err);
+ extent_io_tree_panic(tree, state, "split", err);
set_state_bits(tree, prealloc, bits, changeset);
cache_state(prealloc, cached_state);
@@ -1382,7 +1407,7 @@ hit_next:
}
err = split_state(tree, state, prealloc, start);
if (err)
- extent_io_tree_panic(tree, err);
+ extent_io_tree_panic(tree, state, "split", err);
prealloc = NULL;
if (err)
goto out;
@@ -1430,7 +1455,7 @@ hit_next:
inserted_state = insert_state(tree, prealloc, bits, NULL);
if (IS_ERR(inserted_state)) {
err = PTR_ERR(inserted_state);
- extent_io_tree_panic(tree, err);
+ extent_io_tree_panic(tree, prealloc, "insert", err);
}
cache_state(inserted_state, cached_state);
if (inserted_state == prealloc)
@@ -1453,7 +1478,7 @@ hit_next:
err = split_state(tree, state, prealloc, end + 1);
if (err)
- extent_io_tree_panic(tree, err);
+ extent_io_tree_panic(tree, state, "split", err);
set_state_bits(tree, prealloc, bits, NULL);
cache_state(prealloc, cached_state);
diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h
index 5602b0137fcd..ebe6390d65e9 100644
--- a/fs/btrfs/extent-io-tree.h
+++ b/fs/btrfs/extent-io-tree.h
@@ -87,9 +87,17 @@ enum {
struct extent_io_tree {
struct rb_root state;
- struct btrfs_fs_info *fs_info;
- /* Inode associated with this tree, or NULL. */
- struct btrfs_inode *inode;
+ /*
+ * The fs_info is needed for trace points; a tree attached to an inode
+ * needs the inode instead.
+ *
+ * If owner == IO_TREE_INODE_IO, then the inode member is valid and
+ * fs_info can be accessed as inode->root->fs_info.
+ */
+ union {
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_inode *inode;
+ };
/* Who owns this io tree, should be one of IO_TREE_* */
u8 owner;
@@ -112,6 +120,10 @@ struct extent_state {
#endif
};
+struct btrfs_inode *extent_io_tree_to_inode(struct extent_io_tree *tree);
+const struct btrfs_inode *extent_io_tree_to_inode_const(const struct extent_io_tree *tree);
+const struct btrfs_fs_info *extent_io_tree_to_fs_info(const struct extent_io_tree *tree);
+
void extent_io_tree_init(struct btrfs_fs_info *fs_info,
struct extent_io_tree *tree, unsigned int owner);
void extent_io_tree_release(struct extent_io_tree *tree);
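The union above is discriminated by the owner field, which is why direct member access is replaced by the accessors. A sketch of a caller honoring the discriminant (the tree_inode_size() helper is hypothetical):

	static u64 tree_inode_size(struct extent_io_tree *tree)
	{
		struct btrfs_inode *inode = extent_io_tree_to_inode(tree);

		/* Only IO_TREE_INODE_IO trees carry a valid inode member. */
		if (!inode)
			return 0;
		return i_size_read(&inode->vfs_inode);
	}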
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 01423670bc8a..f396aba92c57 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3447,6 +3447,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_ref generic_ref = { 0 };
+ struct btrfs_block_group *bg;
int ret;
btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF,
@@ -3460,67 +3461,64 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
BUG_ON(ret); /* -ENOMEM */
}
- if (last_ref && btrfs_header_generation(buf) == trans->transid) {
- struct btrfs_block_group *cache;
- bool must_pin = false;
-
- if (root_id != BTRFS_TREE_LOG_OBJECTID) {
- ret = check_ref_cleanup(trans, buf->start);
- if (!ret) {
- btrfs_redirty_list_add(trans->transaction, buf);
- goto out;
- }
- }
+ if (!last_ref)
+ return;
- cache = btrfs_lookup_block_group(fs_info, buf->start);
+ if (btrfs_header_generation(buf) != trans->transid)
+ goto out;
- if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
- pin_down_extent(trans, cache, buf->start, buf->len, 1);
- btrfs_put_block_group(cache);
+ if (root_id != BTRFS_TREE_LOG_OBJECTID) {
+ ret = check_ref_cleanup(trans, buf->start);
+ if (!ret)
goto out;
- }
+ }
- /*
- * If there are tree mod log users we may have recorded mod log
- * operations for this node. If we re-allocate this node we
- * could replay operations on this node that happened when it
- * existed in a completely different root. For example if it
- * was part of root A, then was reallocated to root B, and we
- * are doing a btrfs_old_search_slot(root b), we could replay
- * operations that happened when the block was part of root A,
- * giving us an inconsistent view of the btree.
- *
- * We are safe from races here because at this point no other
- * node or root points to this extent buffer, so if after this
- * check a new tree mod log user joins we will not have an
- * existing log of operations on this node that we have to
- * contend with.
- */
- if (test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
- must_pin = true;
+ bg = btrfs_lookup_block_group(fs_info, buf->start);
- if (must_pin || btrfs_is_zoned(fs_info)) {
- btrfs_redirty_list_add(trans->transaction, buf);
- pin_down_extent(trans, cache, buf->start, buf->len, 1);
- btrfs_put_block_group(cache);
- goto out;
- }
+ if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
+ pin_down_extent(trans, bg, buf->start, buf->len, 1);
+ btrfs_put_block_group(bg);
+ goto out;
+ }
- WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
+ /*
+ * If there are tree mod log users we may have recorded mod log
+ * operations for this node. If we re-allocate this node we
+ * could replay operations on this node that happened when it
+ * existed in a completely different root. For example if it
+ * was part of root A, then was reallocated to root B, and we
+ * are doing a btrfs_old_search_slot(root b), we could replay
+ * operations that happened when the block was part of root A,
+ * giving us an inconsistent view of the btree.
+ *
+ * We are safe from races here because at this point no other
+ * node or root points to this extent buffer, so if after this
+ * check a new tree mod log user joins we will not have an
+ * existing log of operations on this node that we have to
+ * contend with.
+ */
- btrfs_add_free_space(cache, buf->start, buf->len);
- btrfs_free_reserved_bytes(cache, buf->len, 0);
- btrfs_put_block_group(cache);
- trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
+ if (test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)
+ || btrfs_is_zoned(fs_info)) {
+ pin_down_extent(trans, bg, buf->start, buf->len, 1);
+ btrfs_put_block_group(bg);
+ goto out;
}
+
+ WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
+
+ btrfs_add_free_space(bg, buf->start, buf->len);
+ btrfs_free_reserved_bytes(bg, buf->len, 0);
+ btrfs_put_block_group(bg);
+ trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
+
out:
- if (last_ref) {
- /*
- * Deleting the buffer, clear the corrupt flag since it doesn't
- * matter anymore.
- */
- clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
- }
+
+ /*
+ * Deleting the buffer, clear the corrupt flag since it doesn't
+ * matter anymore.
+ */
+ clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
}
/* Can return -ENOMEM */
@@ -5061,7 +5059,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
__btrfs_tree_lock(buf, nest);
btrfs_clear_buffer_dirty(trans, buf);
clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
- clear_bit(EXTENT_BUFFER_NO_CHECK, &buf->bflags);
+ clear_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &buf->bflags);
set_extent_buffer_uptodate(buf);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 8f724c54fc8e..cfd2967f04a2 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -184,22 +184,23 @@ static void process_one_page(struct btrfs_fs_info *fs_info,
struct page *page, struct page *locked_page,
unsigned long page_ops, u64 start, u64 end)
{
+ struct folio *folio = page_folio(page);
u32 len;
ASSERT(end + 1 - start != 0 && end + 1 - start < U32_MAX);
len = end + 1 - start;
if (page_ops & PAGE_SET_ORDERED)
- btrfs_page_clamp_set_ordered(fs_info, page, start, len);
+ btrfs_folio_clamp_set_ordered(fs_info, folio, start, len);
if (page_ops & PAGE_START_WRITEBACK) {
- btrfs_page_clamp_clear_dirty(fs_info, page, start, len);
- btrfs_page_clamp_set_writeback(fs_info, page, start, len);
+ btrfs_folio_clamp_clear_dirty(fs_info, folio, start, len);
+ btrfs_folio_clamp_set_writeback(fs_info, folio, start, len);
}
if (page_ops & PAGE_END_WRITEBACK)
- btrfs_page_clamp_clear_writeback(fs_info, page, start, len);
+ btrfs_folio_clamp_clear_writeback(fs_info, folio, start, len);
if (page != locked_page && (page_ops & PAGE_UNLOCK))
- btrfs_page_end_writer_lock(fs_info, page, start, len);
+ btrfs_folio_end_writer_lock(fs_info, folio, start, len);
}
static void __process_pages_contig(struct address_space *mapping,
@@ -271,19 +272,20 @@ static noinline int lock_delalloc_pages(struct inode *inode,
goto out;
for (i = 0; i < found_folios; i++) {
- struct page *page = &fbatch.folios[i]->page;
+ struct folio *folio = fbatch.folios[i];
+ struct page *page = folio_page(folio, 0);
u32 len = end + 1 - start;
if (page == locked_page)
continue;
- if (btrfs_page_start_writer_lock(fs_info, page, start,
- len))
+ if (btrfs_folio_start_writer_lock(fs_info, folio, start,
+ len))
goto out;
if (!PageDirty(page) || page->mapping != mapping) {
- btrfs_page_end_writer_lock(fs_info, page, start,
- len);
+ btrfs_folio_end_writer_lock(fs_info, folio, start,
+ len);
goto out;
}
@@ -432,60 +434,65 @@ static bool btrfs_verify_page(struct page *page, u64 start)
static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
{
struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+ struct folio *folio = page_folio(page);
ASSERT(page_offset(page) <= start &&
start + len <= page_offset(page) + PAGE_SIZE);
if (uptodate && btrfs_verify_page(page, start))
- btrfs_page_set_uptodate(fs_info, page, start, len);
+ btrfs_folio_set_uptodate(fs_info, folio, start, len);
else
- btrfs_page_clear_uptodate(fs_info, page, start, len);
+ btrfs_folio_clear_uptodate(fs_info, folio, start, len);
- if (!btrfs_is_subpage(fs_info, page))
+ if (!btrfs_is_subpage(fs_info, page->mapping))
unlock_page(page);
else
- btrfs_subpage_end_reader(fs_info, page, start, len);
+ btrfs_subpage_end_reader(fs_info, folio, start, len);
}
/*
- * after a writepage IO is done, we need to:
- * clear the uptodate bits on error
- * clear the writeback bits in the extent tree for this IO
- * end_page_writeback if the page has no more pending IO
+ * After a write IO is done, we need to:
+ *
+ * - clear the uptodate bits on error
+ * - clear the writeback bits in the extent tree for the range
+ * - folio_end_writeback() if there is no more pending I/O for the folio
*
* Scheduling is not allowed, so the extent state tree is expected
* to have one and only one object corresponding to this IO.
*/
-static void end_bio_extent_writepage(struct btrfs_bio *bbio)
+static void end_bbio_data_write(struct btrfs_bio *bbio)
{
struct bio *bio = &bbio->bio;
int error = blk_status_to_errno(bio->bi_status);
- struct bio_vec *bvec;
- struct bvec_iter_all iter_all;
+ struct folio_iter fi;
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, iter_all) {
- struct page *page = bvec->bv_page;
- struct inode *inode = page->mapping->host;
+ bio_for_each_folio_all(fi, bio) {
+ struct folio *folio = fi.folio;
+ struct inode *inode = folio->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
const u32 sectorsize = fs_info->sectorsize;
- u64 start = page_offset(page) + bvec->bv_offset;
- u32 len = bvec->bv_len;
+ u64 start = folio_pos(folio) + fi.offset;
+ u32 len = fi.length;
+
+ /* Only order 0 (single page) folios are allowed for data. */
+ ASSERT(folio_order(folio) == 0);
/* Our read/write should always be sector aligned. */
- if (!IS_ALIGNED(bvec->bv_offset, sectorsize))
+ if (!IS_ALIGNED(fi.offset, sectorsize))
btrfs_err(fs_info,
- "partial page write in btrfs with offset %u and length %u",
- bvec->bv_offset, bvec->bv_len);
- else if (!IS_ALIGNED(bvec->bv_len, sectorsize))
+ "partial page write in btrfs with offset %zu and length %zu",
+ fi.offset, fi.length);
+ else if (!IS_ALIGNED(fi.length, sectorsize))
btrfs_info(fs_info,
- "incomplete page write with offset %u and length %u",
- bvec->bv_offset, bvec->bv_len);
+ "incomplete page write with offset %zu and length %zu",
+ fi.offset, fi.length);
- btrfs_finish_ordered_extent(bbio->ordered, page, start, len, !error);
+ btrfs_finish_ordered_extent(bbio->ordered,
+ folio_page(folio, 0), start, len, !error);
if (error)
- mapping_set_error(page->mapping, error);
- btrfs_page_clear_writeback(fs_info, page, start, len);
+ mapping_set_error(folio->mapping, error);
+ btrfs_folio_clear_writeback(fs_info, folio, start, len);
}
bio_put(bio);
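
[Illustrative aside, not part of the patch: every end_io conversion in this
series follows the same bvec -> folio_iter shape. A minimal sketch, assuming
only a folio-backed bio; sketch_end_io is a hypothetical name:]

static void sketch_end_io(struct bio *bio)
{
	struct folio_iter fi;

	/*
	 * fi.folio, fi.offset and fi.length describe the part of each
	 * folio this bio touched; offset and length are folio-relative,
	 * replacing the old bv_page/bv_offset/bv_len triplet.
	 */
	bio_for_each_folio_all(fi, bio) {
		u64 start = folio_pos(fi.folio) + fi.offset;
		u32 len = fi.length;

		/* Per-range completion handling would go here. */
		(void)start;
		(void)len;
	}
}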
@@ -562,98 +569,102 @@ update:
static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page)
{
- ASSERT(PageLocked(page));
- if (!btrfs_is_subpage(fs_info, page))
+ struct folio *folio = page_folio(page);
+
+ ASSERT(folio_test_locked(folio));
+ if (!btrfs_is_subpage(fs_info, folio->mapping))
return;
- ASSERT(PagePrivate(page));
- btrfs_subpage_start_reader(fs_info, page, page_offset(page), PAGE_SIZE);
+ ASSERT(folio_test_private(folio));
+ btrfs_subpage_start_reader(fs_info, folio, page_offset(page), PAGE_SIZE);
}
/*
- * after a readpage IO is done, we need to:
- * clear the uptodate bits on error
- * set the uptodate bits if things worked
- * set the page up to date if all extents in the tree are uptodate
- * clear the lock bit in the extent tree
- * unlock the page if there are no other extents locked for it
+ * After a data read IO is done, we need to:
+ *
+ * - clear the uptodate bits on error
+ * - set the uptodate bits if things worked
+ * - set the folio up to date if all extents in the tree are uptodate
+ * - clear the lock bit in the extent tree
+ * - unlock the folio if there are no other extents locked for it
*
* Scheduling is not allowed, so the extent state tree is expected
* to have one and only one object corresponding to this IO.
*/
-static void end_bio_extent_readpage(struct btrfs_bio *bbio)
+static void end_bbio_data_read(struct btrfs_bio *bbio)
{
struct bio *bio = &bbio->bio;
- struct bio_vec *bvec;
struct processed_extent processed = { 0 };
+ struct folio_iter fi;
/*
* The offset to the beginning of a bio, since one bio can never be
* larger than UINT_MAX, u32 here is enough.
*/
u32 bio_offset = 0;
- struct bvec_iter_all iter_all;
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, iter_all) {
+ bio_for_each_folio_all(fi, &bbio->bio) {
bool uptodate = !bio->bi_status;
- struct page *page = bvec->bv_page;
- struct inode *inode = page->mapping->host;
+ struct folio *folio = fi.folio;
+ struct inode *inode = folio->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
const u32 sectorsize = fs_info->sectorsize;
u64 start;
u64 end;
u32 len;
+ /* For now only order 0 folios are supported for data. */
+ ASSERT(folio_order(folio) == 0);
btrfs_debug(fs_info,
- "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
- bio->bi_iter.bi_sector, bio->bi_status,
+ "%s: bi_sector=%llu, err=%d, mirror=%u",
+ __func__, bio->bi_iter.bi_sector, bio->bi_status,
bbio->mirror_num);
/*
* We always issue full-sector reads, but if some block in a
- * page fails to read, blk_update_request() will advance
+ * folio fails to read, blk_update_request() will advance
* bv_offset and adjust bv_len to compensate. Print a warning
* for unaligned offsets, and an error if they don't add up to
* a full sector.
*/
- if (!IS_ALIGNED(bvec->bv_offset, sectorsize))
+ if (!IS_ALIGNED(fi.offset, sectorsize))
btrfs_err(fs_info,
- "partial page read in btrfs with offset %u and length %u",
- bvec->bv_offset, bvec->bv_len);
- else if (!IS_ALIGNED(bvec->bv_offset + bvec->bv_len,
- sectorsize))
+ "partial page read in btrfs with offset %zu and length %zu",
+ fi.offset, fi.length);
+ else if (!IS_ALIGNED(fi.offset + fi.length, sectorsize))
btrfs_info(fs_info,
- "incomplete page read with offset %u and length %u",
- bvec->bv_offset, bvec->bv_len);
+ "incomplete page read with offset %zu and length %zu",
+ fi.offset, fi.length);
- start = page_offset(page) + bvec->bv_offset;
- end = start + bvec->bv_len - 1;
- len = bvec->bv_len;
+ start = folio_pos(folio) + fi.offset;
+ end = start + fi.length - 1;
+ len = fi.length;
if (likely(uptodate)) {
loff_t i_size = i_size_read(inode);
- pgoff_t end_index = i_size >> PAGE_SHIFT;
+ pgoff_t end_index = i_size >> folio_shift(folio);
/*
* Zero out the remaining part if this range straddles
* i_size.
*
- * Here we should only zero the range inside the bvec,
+ * Here we should only zero the range inside the folio,
* not touch anything else.
*
* NOTE: i_size is exclusive while end is inclusive.
*/
- if (page->index == end_index && i_size <= end) {
- u32 zero_start = max(offset_in_page(i_size),
- offset_in_page(start));
+ if (folio_index(folio) == end_index && i_size <= end) {
+ u32 zero_start = max(offset_in_folio(folio, i_size),
+ offset_in_folio(folio, start));
+ u32 zero_len = offset_in_folio(folio, end) + 1 -
+ zero_start;
- zero_user_segment(page, zero_start,
- offset_in_page(end) + 1);
+ folio_zero_range(folio, zero_start, zero_len);
}
}
/* Update page status and unlock. */
- end_page_read(page, uptodate, start, len);
+ end_page_read(folio_page(folio, 0), uptodate, start, len);
endio_readpage_release_extent(&processed, BTRFS_I(inode),
start, end, uptodate);
@@ -672,19 +683,22 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio)
* @nr_pages: number of pages to allocate
* @page_array: the array to fill with pages; any existing non-null entries in
* the array will be skipped
+ * @extra_gfp: the extra GFP flags for the allocation.
*
* Return: 0 if all pages were able to be allocated;
* -ENOMEM otherwise, the partially allocated pages would be freed and
* the array slots zeroed
*/
-int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array)
+int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array,
+ gfp_t extra_gfp)
{
unsigned int allocated;
for (allocated = 0; allocated < nr_pages;) {
unsigned int last = allocated;
- allocated = alloc_pages_bulk_array(GFP_NOFS, nr_pages, page_array);
+ allocated = alloc_pages_bulk_array(GFP_NOFS | extra_gfp,
+ nr_pages, page_array);
if (allocated == nr_pages)
return 0;
@@ -707,6 +721,26 @@ int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array)
return 0;
}
+/*
+ * Populate needed folios for the extent buffer.
+ *
+ * For now, the folios populated are always order 0 (i.e., single page).
+ */
+static int alloc_eb_folio_array(struct extent_buffer *eb, gfp_t extra_gfp)
+{
+ struct page *page_array[INLINE_EXTENT_BUFFER_PAGES] = { 0 };
+ int num_pages = num_extent_pages(eb);
+ int ret;
+
+ ret = btrfs_alloc_page_array(num_pages, page_array, extra_gfp);
+ if (ret < 0)
+ return ret;
+
+ for (int i = 0; i < num_pages; i++)
+ eb->folios[i] = page_folio(page_array[i]);
+ return 0;
+}
+
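
[Illustrative aside: the order-0 guarantee is what makes the page/folio
bridging above safe. A minimal sketch of the equivalence under that
assumption; sketch_order0_equivalence is a hypothetical helper:]

static void sketch_order0_equivalence(struct folio *folio)
{
	/*
	 * An order-0 folio and its single page are the same allocation,
	 * so page_folio()/folio_page() convert back and forth without
	 * loss. This lets alloc_eb_folio_array() keep using the bulk
	 * page allocator while the eb stores folios.
	 */
	struct page *page = folio_page(folio, 0);

	ASSERT(page_folio(page) == folio);
	ASSERT(folio_nr_pages(folio) == 1);
	ASSERT(folio_size(folio) == PAGE_SIZE);
}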
static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl,
struct page *page, u64 disk_bytenr,
unsigned int pg_offset)
@@ -861,9 +895,9 @@ static void submit_extent_page(struct btrfs_bio_ctrl *bio_ctrl,
} while (size);
}
-static int attach_extent_buffer_page(struct extent_buffer *eb,
- struct page *page,
- struct btrfs_subpage *prealloc)
+static int attach_extent_buffer_folio(struct extent_buffer *eb,
+ struct folio *folio,
+ struct btrfs_subpage *prealloc)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
int ret = 0;
@@ -874,65 +908,66 @@ static int attach_extent_buffer_page(struct extent_buffer *eb,
* For cloned or dummy extent buffers, their pages are not mapped and
* will not race with any other ebs.
*/
- if (page->mapping)
- lockdep_assert_held(&page->mapping->private_lock);
+ if (folio->mapping)
+ lockdep_assert_held(&folio->mapping->i_private_lock);
if (fs_info->nodesize >= PAGE_SIZE) {
- if (!PagePrivate(page))
- attach_page_private(page, eb);
+ if (!folio_test_private(folio))
+ folio_attach_private(folio, eb);
else
- WARN_ON(page->private != (unsigned long)eb);
+ WARN_ON(folio_get_private(folio) != eb);
return 0;
}
/* Already mapped, just free prealloc */
- if (PagePrivate(page)) {
+ if (folio_test_private(folio)) {
btrfs_free_subpage(prealloc);
return 0;
}
if (prealloc)
/* Has preallocated memory for subpage */
- attach_page_private(page, prealloc);
+ folio_attach_private(folio, prealloc);
else
/* Do new allocation to attach subpage */
- ret = btrfs_attach_subpage(fs_info, page,
- BTRFS_SUBPAGE_METADATA);
+ ret = btrfs_attach_subpage(fs_info, folio, BTRFS_SUBPAGE_METADATA);
return ret;
}
int set_page_extent_mapped(struct page *page)
{
+ struct folio *folio = page_folio(page);
struct btrfs_fs_info *fs_info;
ASSERT(page->mapping);
- if (PagePrivate(page))
+ if (folio_test_private(folio))
return 0;
fs_info = btrfs_sb(page->mapping->host->i_sb);
- if (btrfs_is_subpage(fs_info, page))
- return btrfs_attach_subpage(fs_info, page, BTRFS_SUBPAGE_DATA);
+ if (btrfs_is_subpage(fs_info, page->mapping))
+ return btrfs_attach_subpage(fs_info, folio, BTRFS_SUBPAGE_DATA);
- attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE);
+ folio_attach_private(folio, (void *)EXTENT_FOLIO_PRIVATE);
return 0;
}
void clear_page_extent_mapped(struct page *page)
{
+ struct folio *folio = page_folio(page);
struct btrfs_fs_info *fs_info;
ASSERT(page->mapping);
- if (!PagePrivate(page))
+ if (!folio_test_private(folio))
return;
fs_info = btrfs_sb(page->mapping->host->i_sb);
- if (btrfs_is_subpage(fs_info, page))
- return btrfs_detach_subpage(fs_info, page);
+ if (btrfs_is_subpage(fs_info, page->mapping))
+ return btrfs_detach_subpage(fs_info, folio);
- detach_page_private(page);
+ folio_detach_private(folio);
}
static struct extent_map *
@@ -1001,7 +1036,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
memzero_page(page, zero_offset, iosize);
}
}
- bio_ctrl->end_io_func = end_bio_extent_readpage;
+ bio_ctrl->end_io_func = end_bbio_data_read;
begin_page_read(fs_info, page);
while (cur <= end) {
enum btrfs_compression_type compress_type = BTRFS_COMPRESS_NONE;
@@ -1027,8 +1062,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
BUG_ON(extent_map_end(em) <= cur);
BUG_ON(end < cur);
- if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
- compress_type = em->compress_type;
+ compress_type = extent_map_compression(em);
iosize = min(extent_map_end(em) - cur, end - cur + 1);
iosize = ALIGN(iosize, blocksize);
@@ -1037,7 +1071,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
else
disk_bytenr = em->block_start + extent_offset;
block_start = em->block_start;
- if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ if (em->flags & EXTENT_FLAG_PREALLOC)
block_start = EXTENT_MAP_HOLE;
/*
@@ -1074,7 +1108,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
* is a corner case so we prioritize correctness over
* non-optimal behavior (submitting 2 bios for the same extent).
*/
- if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) &&
+ if (compress_type != BTRFS_COMPRESS_NONE &&
prev_em_start && *prev_em_start != (u64)-1 &&
*prev_em_start != em->start)
force_bio_submit = true;
@@ -1240,7 +1274,8 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
static void find_next_dirty_byte(struct btrfs_fs_info *fs_info,
struct page *page, u64 *start, u64 *end)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ struct folio *folio = page_folio(page);
+ struct btrfs_subpage *subpage = folio_get_private(folio);
struct btrfs_subpage_info *spi = fs_info->subpage_info;
u64 orig_start = *start;
/* Declare as unsigned long so we can use bitmap ops */
@@ -1252,7 +1287,7 @@ static void find_next_dirty_byte(struct btrfs_fs_info *fs_info,
* For regular sector size == page size case, since one page only
* contains one sector, we return the page offset directly.
*/
- if (!btrfs_is_subpage(fs_info, page)) {
+ if (!btrfs_is_subpage(fs_info, page->mapping)) {
*start = page_offset(page);
*end = page_offset(page) + PAGE_SIZE;
return;
@@ -1305,7 +1340,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
return 1;
}
- bio_ctrl->end_io_func = end_bio_extent_writepage;
+ bio_ctrl->end_io_func = end_bbio_data_write;
while (cur <= end) {
u32 len = end - cur + 1;
u64 disk_bytenr;
@@ -1325,7 +1360,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
* writeback the sectors with subpage dirty bits,
* causing writeback without ordered extent.
*/
- btrfs_page_clear_dirty(fs_info, page, cur, len);
+ btrfs_folio_clear_dirty(fs_info, page_folio(page), cur, len);
break;
}
@@ -1352,7 +1387,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
block_start = em->block_start;
disk_bytenr = em->block_start + extent_offset;
- ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
+ ASSERT(!extent_map_is_compressed(em));
ASSERT(block_start != EXTENT_MAP_HOLE);
ASSERT(block_start != EXTENT_MAP_INLINE);
@@ -1377,7 +1412,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
* So clear subpage dirty bit here so next time we won't submit
* page for range already written to disk.
*/
- btrfs_page_clear_dirty(fs_info, page, cur, iosize);
+ btrfs_folio_clear_dirty(fs_info, page_folio(page), cur, iosize);
submit_extent_page(bio_ctrl, disk_bytenr, page, iosize,
cur - page_offset(page));
@@ -1385,7 +1420,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
nr++;
}
- btrfs_page_assert_not_dirty(fs_info, page);
+ btrfs_folio_assert_not_dirty(fs_info, page_folio(page));
*nr_ret = nr;
return 0;
@@ -1607,24 +1642,23 @@ static struct extent_buffer *find_extent_buffer_nolock(
return NULL;
}
-static void extent_buffer_write_end_io(struct btrfs_bio *bbio)
+static void end_bbio_meta_write(struct btrfs_bio *bbio)
{
struct extent_buffer *eb = bbio->private;
struct btrfs_fs_info *fs_info = eb->fs_info;
bool uptodate = !bbio->bio.bi_status;
- struct bvec_iter_all iter_all;
- struct bio_vec *bvec;
+ struct folio_iter fi;
u32 bio_offset = 0;
if (!uptodate)
set_btree_ioerr(eb);
- bio_for_each_segment_all(bvec, &bbio->bio, iter_all) {
+ bio_for_each_folio_all(fi, &bbio->bio) {
u64 start = eb->start + bio_offset;
- struct page *page = bvec->bv_page;
- u32 len = bvec->bv_len;
+ struct folio *folio = fi.folio;
+ u32 len = fi.length;
- btrfs_page_clear_writeback(fs_info, page, start, len);
+ btrfs_folio_clear_writeback(fs_info, folio, start, len);
bio_offset += len;
}
@@ -1673,36 +1707,44 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb,
bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES,
REQ_OP_WRITE | REQ_META | wbc_to_write_flags(wbc),
- eb->fs_info, extent_buffer_write_end_io, eb);
+ eb->fs_info, end_bbio_meta_write, eb);
bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT;
bio_set_dev(&bbio->bio, fs_info->fs_devices->latest_dev->bdev);
wbc_init_bio(wbc, &bbio->bio);
bbio->inode = BTRFS_I(eb->fs_info->btree_inode);
bbio->file_offset = eb->start;
if (fs_info->nodesize < PAGE_SIZE) {
- struct page *p = eb->pages[0];
+ struct folio *folio = eb->folios[0];
+ bool ret;
- lock_page(p);
- btrfs_subpage_set_writeback(fs_info, p, eb->start, eb->len);
- if (btrfs_subpage_clear_and_test_dirty(fs_info, p, eb->start,
+ folio_lock(folio);
+ btrfs_subpage_set_writeback(fs_info, folio, eb->start, eb->len);
+ if (btrfs_subpage_clear_and_test_dirty(fs_info, folio, eb->start,
eb->len)) {
- clear_page_dirty_for_io(p);
+ folio_clear_dirty_for_io(folio);
wbc->nr_to_write--;
}
- __bio_add_page(&bbio->bio, p, eb->len, eb->start - page_offset(p));
- wbc_account_cgroup_owner(wbc, p, eb->len);
- unlock_page(p);
+ ret = bio_add_folio(&bbio->bio, folio, eb->len,
+ eb->start - folio_pos(folio));
+ ASSERT(ret);
+ wbc_account_cgroup_owner(wbc, folio_page(folio, 0), eb->len);
+ folio_unlock(folio);
} else {
- for (int i = 0; i < num_extent_pages(eb); i++) {
- struct page *p = eb->pages[i];
-
- lock_page(p);
- clear_page_dirty_for_io(p);
- set_page_writeback(p);
- __bio_add_page(&bbio->bio, p, PAGE_SIZE, 0);
- wbc_account_cgroup_owner(wbc, p, PAGE_SIZE);
- wbc->nr_to_write--;
- unlock_page(p);
+ int num_folios = num_extent_folios(eb);
+
+ for (int i = 0; i < num_folios; i++) {
+ struct folio *folio = eb->folios[i];
+ bool ret;
+
+ folio_lock(folio);
+ folio_clear_dirty_for_io(folio);
+ folio_start_writeback(folio);
+ ret = bio_add_folio(&bbio->bio, folio, folio_size(folio), 0);
+ ASSERT(ret);
+ wbc_account_cgroup_owner(wbc, folio_page(folio, 0),
+ folio_size(folio));
+ wbc->nr_to_write -= folio_nr_pages(folio);
+ folio_unlock(folio);
}
}
btrfs_submit_bio(bbio, 0);
@@ -1725,6 +1767,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb,
static int submit_eb_subpage(struct page *page, struct writeback_control *wbc)
{
struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+ struct folio *folio = page_folio(page);
int submitted = 0;
u64 page_start = page_offset(page);
int bit_start = 0;
@@ -1732,7 +1775,7 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc)
/* Lock and write each dirty extent buffers in the range */
while (bit_start < fs_info->subpage_info->bitmap_nr_bits) {
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ struct btrfs_subpage *subpage = folio_get_private(folio);
struct extent_buffer *eb;
unsigned long flags;
u64 start;
@@ -1741,16 +1784,16 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc)
* Take private lock to ensure the subpage won't be detached
* in the meantime.
*/
- spin_lock(&page->mapping->private_lock);
- if (!PagePrivate(page)) {
- spin_unlock(&page->mapping->private_lock);
+ spin_lock(&page->mapping->i_private_lock);
+ if (!folio_test_private(folio)) {
+ spin_unlock(&page->mapping->i_private_lock);
break;
}
spin_lock_irqsave(&subpage->lock, flags);
if (!test_bit(bit_start + fs_info->subpage_info->dirty_offset,
subpage->bitmaps)) {
spin_unlock_irqrestore(&subpage->lock, flags);
- spin_unlock(&page->mapping->private_lock);
+ spin_unlock(&page->mapping->i_private_lock);
bit_start++;
continue;
}
@@ -1764,7 +1807,7 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc)
*/
eb = find_extent_buffer_nolock(fs_info, start);
spin_unlock_irqrestore(&subpage->lock, flags);
- spin_unlock(&page->mapping->private_lock);
+ spin_unlock(&page->mapping->i_private_lock);
/*
* The eb has already reached 0 refs thus find_extent_buffer()
@@ -1807,38 +1850,39 @@ static int submit_eb_page(struct page *page, struct btrfs_eb_write_context *ctx)
{
struct writeback_control *wbc = ctx->wbc;
struct address_space *mapping = page->mapping;
+ struct folio *folio = page_folio(page);
struct extent_buffer *eb;
int ret;
- if (!PagePrivate(page))
+ if (!folio_test_private(folio))
return 0;
if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
return submit_eb_subpage(page, wbc);
- spin_lock(&mapping->private_lock);
- if (!PagePrivate(page)) {
- spin_unlock(&mapping->private_lock);
+ spin_lock(&mapping->i_private_lock);
+ if (!folio_test_private(folio)) {
+ spin_unlock(&mapping->i_private_lock);
return 0;
}
- eb = (struct extent_buffer *)page->private;
+ eb = folio_get_private(folio);
/*
* Shouldn't happen and normally this would be a BUG_ON but no point
* crashing the machine for something we can survive anyway.
*/
if (WARN_ON(!eb)) {
- spin_unlock(&mapping->private_lock);
+ spin_unlock(&mapping->i_private_lock);
return 0;
}
if (eb == ctx->eb) {
- spin_unlock(&mapping->private_lock);
+ spin_unlock(&mapping->i_private_lock);
return 0;
}
ret = atomic_inc_not_zero(&eb->refs);
- spin_unlock(&mapping->private_lock);
+ spin_unlock(&mapping->i_private_lock);
if (!ret)
return 0;
@@ -2201,7 +2245,7 @@ void extent_write_locked_range(struct inode *inode, struct page *locked_page,
cur, cur_len, !ret);
mapping_set_error(page->mapping, ret);
}
- btrfs_page_unlock_writer(fs_info, page, cur, cur_len);
+ btrfs_folio_unlock_writer(fs_info, page_folio(page), cur, cur_len);
if (ret < 0)
found_error = true;
next_page:
@@ -2352,7 +2396,7 @@ int try_release_extent_mapping(struct page *page, gfp_t mask)
write_unlock(&map->lock);
break;
}
- if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
+ if ((em->flags & EXTENT_FLAG_PINNED) ||
em->start != start) {
write_unlock(&map->lock);
free_extent_map(em);
@@ -2369,7 +2413,7 @@ int try_release_extent_mapping(struct page *page, gfp_t mask)
* extra reference on the em.
*/
if (list_empty(&em->list) ||
- test_bit(EXTENT_FLAG_LOGGING, &em->flags))
+ (em->flags & EXTENT_FLAG_LOGGING))
goto remove_em;
/*
* If it's in the list of modified extents, remove it
@@ -3058,14 +3102,14 @@ static int extent_buffer_under_io(const struct extent_buffer *eb)
test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
}
-static bool page_range_has_eb(struct btrfs_fs_info *fs_info, struct page *page)
+static bool folio_range_has_eb(struct btrfs_fs_info *fs_info, struct folio *folio)
{
struct btrfs_subpage *subpage;
- lockdep_assert_held(&page->mapping->private_lock);
+ lockdep_assert_held(&folio->mapping->i_private_lock);
- if (PagePrivate(page)) {
- subpage = (struct btrfs_subpage *)page->private;
+ if (folio_test_private(folio)) {
+ subpage = folio_get_private(folio);
if (atomic_read(&subpage->eb_refs))
return true;
/*
@@ -3078,21 +3122,21 @@ static bool page_range_has_eb(struct btrfs_fs_info *fs_info, struct page *page)
return false;
}
-static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *page)
+static void detach_extent_buffer_folio(struct extent_buffer *eb, struct folio *folio)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
const bool mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
/*
- * For mapped eb, we're going to change the page private, which should
- * be done under the private_lock.
+ * For mapped eb, we're going to change the folio private, which should
+ * be done under the i_private_lock.
*/
if (mapped)
- spin_lock(&page->mapping->private_lock);
+ spin_lock(&folio->mapping->i_private_lock);
- if (!PagePrivate(page)) {
+ if (!folio_test_private(folio)) {
if (mapped)
- spin_unlock(&page->mapping->private_lock);
+ spin_unlock(&folio->mapping->i_private_lock);
return;
}
@@ -3101,66 +3145,58 @@ static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *pag
* We do this since we'll remove the pages after we've
* removed the eb from the radix tree, so we could race
* and have this page now attached to the new eb. So
- * only clear page_private if it's still connected to
+ * only clear folio private if it's still connected to
* this eb.
*/
- if (PagePrivate(page) &&
- page->private == (unsigned long)eb) {
+ if (folio_test_private(folio) && folio_get_private(folio) == eb) {
BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
- BUG_ON(PageDirty(page));
- BUG_ON(PageWriteback(page));
- /*
- * We need to make sure we haven't be attached
- * to a new eb.
- */
- detach_page_private(page);
+ BUG_ON(folio_test_dirty(folio));
+ BUG_ON(folio_test_writeback(folio));
+ /* We need to make sure we haven't been attached to a new eb. */
+ folio_detach_private(folio);
}
if (mapped)
- spin_unlock(&page->mapping->private_lock);
+ spin_unlock(&folio->mapping->i_private_lock);
return;
}
/*
- * For subpage, we can have dummy eb with page private. In this case,
- * we can directly detach the private as such page is only attached to
- * one dummy eb, no sharing.
+ * For subpage, we can have dummy eb with folio private attached. In
+ * this case, we can directly detach the private as such folio is only
+ * attached to one dummy eb, no sharing.
*/
if (!mapped) {
- btrfs_detach_subpage(fs_info, page);
+ btrfs_detach_subpage(fs_info, folio);
return;
}
- btrfs_page_dec_eb_refs(fs_info, page);
+ btrfs_folio_dec_eb_refs(fs_info, folio);
/*
- * We can only detach the page private if there are no other ebs in the
+ * We can only detach the folio private if there are no other ebs in the
* page range and no unfinished IO.
*/
- if (!page_range_has_eb(fs_info, page))
- btrfs_detach_subpage(fs_info, page);
+ if (!folio_range_has_eb(fs_info, folio))
+ btrfs_detach_subpage(fs_info, folio);
- spin_unlock(&page->mapping->private_lock);
+ spin_unlock(&folio->mapping->i_private_lock);
}
/* Release all pages attached to the extent buffer */
static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb)
{
- int i;
- int num_pages;
-
ASSERT(!extent_buffer_under_io(eb));
- num_pages = num_extent_pages(eb);
- for (i = 0; i < num_pages; i++) {
- struct page *page = eb->pages[i];
+ for (int i = 0; i < INLINE_EXTENT_BUFFER_PAGES; i++) {
+ struct folio *folio = eb->folios[i];
- if (!page)
+ if (!folio)
continue;
- detach_extent_buffer_page(eb, page);
+ detach_extent_buffer_folio(eb, folio);
- /* One for when we allocated the page */
- put_page(page);
+ /* One for when we allocated the folio. */
+ folio_put(folio);
}
}
@@ -3198,9 +3234,8 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
{
- int i;
struct extent_buffer *new;
- int num_pages = num_extent_pages(src);
+ int num_folios = num_extent_folios(src);
int ret;
new = __alloc_extent_buffer(src->fs_info, src->start, src->len);
@@ -3214,22 +3249,22 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
*/
set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
- ret = btrfs_alloc_page_array(num_pages, new->pages);
+ ret = alloc_eb_folio_array(new, 0);
if (ret) {
btrfs_release_extent_buffer(new);
return NULL;
}
- for (i = 0; i < num_pages; i++) {
+ for (int i = 0; i < num_folios; i++) {
+ struct folio *folio = new->folios[i];
int ret;
- struct page *p = new->pages[i];
- ret = attach_extent_buffer_page(new, p, NULL);
+ ret = attach_extent_buffer_folio(new, folio, NULL);
if (ret < 0) {
btrfs_release_extent_buffer(new);
return NULL;
}
- WARN_ON(PageDirty(p));
+ WARN_ON(folio_test_dirty(folio));
}
copy_extent_buffer_full(new, src);
set_extent_buffer_uptodate(new);
@@ -3241,23 +3276,20 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start, unsigned long len)
{
struct extent_buffer *eb;
- int num_pages;
- int i;
+ int num_folios = 0;
int ret;
eb = __alloc_extent_buffer(fs_info, start, len);
if (!eb)
return NULL;
- num_pages = num_extent_pages(eb);
- ret = btrfs_alloc_page_array(num_pages, eb->pages);
+ ret = alloc_eb_folio_array(eb, 0);
if (ret)
goto err;
- for (i = 0; i < num_pages; i++) {
- struct page *p = eb->pages[i];
-
- ret = attach_extent_buffer_page(eb, p, NULL);
+ num_folios = num_extent_folios(eb);
+ for (int i = 0; i < num_folios; i++) {
+ ret = attach_extent_buffer_folio(eb, eb->folios[i], NULL);
if (ret < 0)
goto err;
}
@@ -3268,10 +3300,10 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
return eb;
err:
- for (i = 0; i < num_pages; i++) {
- if (eb->pages[i]) {
- detach_extent_buffer_page(eb, eb->pages[i]);
- __free_page(eb->pages[i]);
+ for (int i = 0; i < num_folios; i++) {
+ if (eb->folios[i]) {
+ detach_extent_buffer_folio(eb, eb->folios[i]);
+ __folio_put(eb->folios[i]);
}
}
__free_extent_buffer(eb);
@@ -3320,20 +3352,14 @@ static void check_buffer_tree_ref(struct extent_buffer *eb)
spin_unlock(&eb->refs_lock);
}
-static void mark_extent_buffer_accessed(struct extent_buffer *eb,
- struct page *accessed)
+static void mark_extent_buffer_accessed(struct extent_buffer *eb)
{
- int num_pages, i;
+ int num_folios = num_extent_folios(eb);
check_buffer_tree_ref(eb);
- num_pages = num_extent_pages(eb);
- for (i = 0; i < num_pages; i++) {
- struct page *p = eb->pages[i];
-
- if (p != accessed)
- mark_page_accessed(p);
- }
+ for (int i = 0; i < num_folios; i++)
+ folio_mark_accessed(eb->folios[i]);
}
struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
@@ -3361,7 +3387,7 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
spin_lock(&eb->refs_lock);
spin_unlock(&eb->refs_lock);
}
- mark_extent_buffer_accessed(eb, NULL);
+ mark_extent_buffer_accessed(eb);
return eb;
}
@@ -3410,6 +3436,7 @@ free_eb:
static struct extent_buffer *grab_extent_buffer(
struct btrfs_fs_info *fs_info, struct page *page)
{
+ struct folio *folio = page_folio(page);
struct extent_buffer *exists;
/*
@@ -3421,21 +3448,21 @@ static struct extent_buffer *grab_extent_buffer(
return NULL;
/* Page not yet attached to an extent buffer */
- if (!PagePrivate(page))
+ if (!folio_test_private(folio))
return NULL;
/*
* We could have already allocated an eb for this page and attached one
* so lets see if we can get a ref on the existing eb, and if we can we
* know it's good and we can just return that one, else we know we can
- * just overwrite page->private.
+ * just overwrite folio private.
*/
- exists = (struct extent_buffer *)page->private;
+ exists = folio_get_private(folio);
if (atomic_inc_not_zero(&exists->refs))
return exists;
WARN_ON(PageDirty(page));
- detach_page_private(page);
+ folio_detach_private(folio);
return NULL;
}
@@ -3469,19 +3496,88 @@ static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start)
return 0;
}
+
+/*
+ * Return 0 if eb->folios[i] is attached to the btree inode successfully.
+ * Return >0 if there is already another extent buffer for the range,
+ * and @found_eb_ret will be updated.
+ * Return -EAGAIN if the filemap has an existing folio with a different
+ * size than @eb.
+ * The caller needs to free the existing folios and retry using the same order.
+ */
+static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i,
+ struct extent_buffer **found_eb_ret)
+{
+
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ struct address_space *mapping = fs_info->btree_inode->i_mapping;
+ const unsigned long index = eb->start >> PAGE_SHIFT;
+ struct folio *existing_folio;
+ int ret;
+
+ ASSERT(found_eb_ret);
+
+ /* Caller should ensure the folio exists. */
+ ASSERT(eb->folios[i]);
+
+retry:
+ ret = filemap_add_folio(mapping, eb->folios[i], index + i,
+ GFP_NOFS | __GFP_NOFAIL);
+ if (!ret)
+ return 0;
+
+ existing_folio = filemap_lock_folio(mapping, index + i);
+ /* The page cache only exists for a very short time, just retry. */
+ if (IS_ERR(existing_folio))
+ goto retry;
+
+ /* For now, we should only have single-page folios for btree inode. */
+ ASSERT(folio_nr_pages(existing_folio) == 1);
+
+ if (folio_size(existing_folio) != folio_size(eb->folios[0])) {
+ folio_unlock(existing_folio);
+ folio_put(existing_folio);
+ return -EAGAIN;
+ }
+
+ if (fs_info->nodesize < PAGE_SIZE) {
+ /*
+ * We're going to reuse the existing page, can drop our page
+ * and subpage structure now.
+ */
+ __free_page(folio_page(eb->folios[i], 0));
+ eb->folios[i] = existing_folio;
+ } else {
+ struct extent_buffer *existing_eb;
+
+ existing_eb = grab_extent_buffer(fs_info,
+ folio_page(existing_folio, 0));
+ if (existing_eb) {
+ /* The extent buffer still exists, we can use it directly. */
+ *found_eb_ret = existing_eb;
+ folio_unlock(existing_folio);
+ folio_put(existing_folio);
+ return 1;
+ }
+ /* The extent buffer no longer exists, we can reuse the folio. */
+ __free_page(folio_page(eb->folios[i], 0));
+ eb->folios[i] = existing_folio;
+ }
+ return 0;
+}
+
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start, u64 owner_root, int level)
{
unsigned long len = fs_info->nodesize;
- int num_pages;
- int i;
- unsigned long index = start >> PAGE_SHIFT;
+ int num_folios;
+ int attached = 0;
struct extent_buffer *eb;
- struct extent_buffer *exists = NULL;
- struct page *p;
+ struct extent_buffer *existing_eb = NULL;
struct address_space *mapping = fs_info->btree_inode->i_mapping;
struct btrfs_subpage *prealloc = NULL;
u64 lockdep_owner = owner_root;
+ bool page_contig = true;
int uptodate = 1;
int ret;
@@ -3516,11 +3612,9 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
btrfs_set_buffer_lockdep_class(lockdep_owner, eb, level);
- num_pages = num_extent_pages(eb);
-
/*
- * Preallocate page->private for subpage case, so that we won't
- * allocate memory with private_lock nor page lock hold.
+ * Preallocate folio private for subpage case, so that we won't
+ * allocate memory with i_private_lock or the page lock held.
*
* The memory will be freed by attach_extent_buffer_page() or freed
* manually if we exit earlier.
@@ -3528,47 +3622,89 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
if (fs_info->nodesize < PAGE_SIZE) {
prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA);
if (IS_ERR(prealloc)) {
- exists = ERR_CAST(prealloc);
- goto free_eb;
+ ret = PTR_ERR(prealloc);
+ goto out;
}
}
- for (i = 0; i < num_pages; i++, index++) {
- p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
- if (!p) {
- exists = ERR_PTR(-ENOMEM);
- btrfs_free_subpage(prealloc);
- goto free_eb;
+reallocate:
+ /* Allocate all pages first. */
+ ret = alloc_eb_folio_array(eb, __GFP_NOFAIL);
+ if (ret < 0) {
+ btrfs_free_subpage(prealloc);
+ goto out;
+ }
+
+ num_folios = num_extent_folios(eb);
+ /* Attach all pages to the filemap. */
+ for (int i = 0; i < num_folios; i++) {
+ struct folio *folio;
+
+ ret = attach_eb_folio_to_filemap(eb, i, &existing_eb);
+ if (ret > 0) {
+ ASSERT(existing_eb);
+ goto out;
}
- spin_lock(&mapping->private_lock);
- exists = grab_extent_buffer(fs_info, p);
- if (exists) {
- spin_unlock(&mapping->private_lock);
- unlock_page(p);
- put_page(p);
- mark_extent_buffer_accessed(exists, p);
- btrfs_free_subpage(prealloc);
- goto free_eb;
+ /*
+ * TODO: Special handling for a corner case where the order of
+ * the folios mismatches between the new eb and the filemap.
+ *
+ * This happens when:
+ *
+ * - the new eb is using a higher order folio
+ *
+ * - the filemap is still using 0-order folios for the range
+ *   This can happen if the previous eb allocation could not get
+ *   a higher order folio for the range.
+ *
+ * - the existing eb has already been freed
+ *
+ * In this case, we have to free the existing folios first, and
+ * re-allocate using the same order.
+ * Thankfully this is not going to happen yet, as we're still
+ * using 0-order folios.
+ */
+ if (unlikely(ret == -EAGAIN)) {
+ ASSERT(0);
+ goto reallocate;
}
+ attached++;
+
+ /*
+ * eb->folios[] is only reliable after attach_eb_folio_to_filemap(),
+ * as we may choose to reuse the existing page cache folio and free
+ * the page we allocated.
+ */
+ folio = eb->folios[i];
+ spin_lock(&mapping->i_private_lock);
/* Should not fail, as we have preallocated the memory */
- ret = attach_extent_buffer_page(eb, p, prealloc);
+ ret = attach_extent_buffer_folio(eb, folio, prealloc);
ASSERT(!ret);
/*
* To inform we have extra eb under allocation, so that
- * detach_extent_buffer_page() won't release the page private
+ * detach_extent_buffer_page() won't release the folio private
* when the eb hasn't yet been inserted into radix tree.
*
* The ref will be decreased when the eb released the page, in
* detach_extent_buffer_page().
* Thus needs no special handling in error path.
*/
- btrfs_page_inc_eb_refs(fs_info, p);
- spin_unlock(&mapping->private_lock);
+ btrfs_folio_inc_eb_refs(fs_info, folio);
+ spin_unlock(&mapping->i_private_lock);
+
+ WARN_ON(btrfs_folio_test_dirty(fs_info, folio, eb->start, eb->len));
+
+ /*
+ * Check if the current page is physically contiguous with the
+ * previous eb page.
+ * At this stage, either we allocated a large folio, in which case
+ * @i can only be 0, or we fell back to per-page allocation.
+ */
+ if (i && folio_page(eb->folios[i - 1], 0) + 1 != folio_page(folio, 0))
+ page_contig = false;
- WARN_ON(btrfs_page_test_dirty(fs_info, p, eb->start, eb->len));
- eb->pages[i] = p;
- if (!btrfs_page_test_uptodate(fs_info, p, eb->start, eb->len))
+ if (!btrfs_folio_test_uptodate(fs_info, folio, eb->start, eb->len))
uptodate = 0;
/*
@@ -3581,12 +3717,13 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
}
if (uptodate)
set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
+ /* All pages are physically contiguous, can skip cross page handling. */
+ if (page_contig)
+ eb->addr = folio_address(eb->folios[0]) + offset_in_page(eb->start);
again:
ret = radix_tree_preload(GFP_NOFS);
- if (ret) {
- exists = ERR_PTR(ret);
- goto free_eb;
- }
+ if (ret)
+ goto out;
spin_lock(&fs_info->buffer_lock);
ret = radix_tree_insert(&fs_info->buffer_radix,
@@ -3594,9 +3731,10 @@ again:
spin_unlock(&fs_info->buffer_lock);
radix_tree_preload_end();
if (ret == -EEXIST) {
- exists = find_extent_buffer(fs_info, start);
- if (exists)
- goto free_eb;
+ ret = 0;
+ existing_eb = find_extent_buffer(fs_info, start);
+ if (existing_eb)
+ goto out;
else
goto again;
}
@@ -3609,19 +3747,46 @@ again:
* btree_release_folio will correctly detect that a page belongs to a
* live buffer and won't free them prematurely.
*/
- for (i = 0; i < num_pages; i++)
- unlock_page(eb->pages[i]);
+ for (int i = 0; i < num_folios; i++)
+ unlock_page(folio_page(eb->folios[i], 0));
return eb;
-free_eb:
+out:
WARN_ON(!atomic_dec_and_test(&eb->refs));
- for (i = 0; i < num_pages; i++) {
- if (eb->pages[i])
- unlock_page(eb->pages[i]);
+
+ /*
+ * Any attached folios need to be detached before we unlock them. This
+ * is because when we insert our new folios into the mapping, we attach
+ * our eb to each folio. If we then fail to insert a folio, we will look
+ * up the folio already at that index and grab the eb attached to it. We
+ * do not want that lookup to grab this eb, as we're getting ready to
+ * free it. So we have to detach it first and then unlock it.
+ *
+ * We have to drop our reference and NULL it out here because in the
+ * subpage case detaching does a btrfs_folio_dec_eb_refs() for our eb.
+ * Below when we call btrfs_release_extent_buffer() we will call
+ * detach_extent_buffer_folio() on our remaining pages in the !subpage
+ * case. If we left eb->folios[i] populated in the subpage case we'd
+ * double put our reference and be super sad.
+ */
+ for (int i = 0; i < attached; i++) {
+ ASSERT(eb->folios[i]);
+ detach_extent_buffer_folio(eb, eb->folios[i]);
+ unlock_page(folio_page(eb->folios[i], 0));
+ folio_put(eb->folios[i]);
+ eb->folios[i] = NULL;
}
+ /*
+ * Now all pages of that extent buffer are unmapped; set the UNMAPPED
+ * flag so it can be cleaned up without utilizing page->mapping.
+ */
+ set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
btrfs_release_extent_buffer(eb);
- return exists;
+ if (ret < 0)
+ return ERR_PTR(ret);
+ ASSERT(existing_eb);
+ return existing_eb;
}
static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
@@ -3713,31 +3878,30 @@ void free_extent_buffer_stale(struct extent_buffer *eb)
release_extent_buffer(eb);
}
-static void btree_clear_page_dirty(struct page *page)
+static void btree_clear_folio_dirty(struct folio *folio)
{
- ASSERT(PageDirty(page));
- ASSERT(PageLocked(page));
- clear_page_dirty_for_io(page);
- xa_lock_irq(&page->mapping->i_pages);
- if (!PageDirty(page))
- __xa_clear_mark(&page->mapping->i_pages,
- page_index(page), PAGECACHE_TAG_DIRTY);
- xa_unlock_irq(&page->mapping->i_pages);
+ ASSERT(folio_test_dirty(folio));
+ ASSERT(folio_test_locked(folio));
+ folio_clear_dirty_for_io(folio);
+ xa_lock_irq(&folio->mapping->i_pages);
+ if (!folio_test_dirty(folio))
+ __xa_clear_mark(&folio->mapping->i_pages,
+ folio_index(folio), PAGECACHE_TAG_DIRTY);
+ xa_unlock_irq(&folio->mapping->i_pages);
}
static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
- struct page *page = eb->pages[0];
+ struct folio *folio = eb->folios[0];
bool last;
- /* btree_clear_page_dirty() needs page locked */
- lock_page(page);
- last = btrfs_subpage_clear_and_test_dirty(fs_info, page, eb->start,
- eb->len);
+ /* btree_clear_folio_dirty() needs the folio locked. */
+ folio_lock(folio);
+ last = btrfs_subpage_clear_and_test_dirty(fs_info, folio, eb->start, eb->len);
if (last)
- btree_clear_page_dirty(page);
- unlock_page(page);
+ btree_clear_folio_dirty(folio);
+ folio_unlock(folio);
WARN_ON(atomic_read(&eb->refs) == 0);
}
@@ -3745,15 +3909,27 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
struct extent_buffer *eb)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
- int i;
- int num_pages;
- struct page *page;
+ int num_folios;
btrfs_assert_tree_write_locked(eb);
if (trans && btrfs_header_generation(eb) != trans->transid)
return;
+ /*
+ * Instead of clearing the dirty flag off of the buffer, mark it as
+ * EXTENT_BUFFER_ZONED_ZEROOUT. This allows us to preserve
+ * write-ordering in zoned mode, without the need to later re-dirty
+ * the extent_buffer.
+ *
+ * The actual zeroout of the buffer will happen later in
+ * btree_csum_one_bio.
+ */
+ if (btrfs_is_zoned(fs_info)) {
+ set_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags);
+ return;
+ }
+
if (!test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags))
return;
@@ -3763,30 +3939,29 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
if (eb->fs_info->nodesize < PAGE_SIZE)
return clear_subpage_extent_buffer_dirty(eb);
- num_pages = num_extent_pages(eb);
+ num_folios = num_extent_folios(eb);
+ for (int i = 0; i < num_folios; i++) {
+ struct folio *folio = eb->folios[i];
- for (i = 0; i < num_pages; i++) {
- page = eb->pages[i];
- if (!PageDirty(page))
+ if (!folio_test_dirty(folio))
continue;
- lock_page(page);
- btree_clear_page_dirty(page);
- unlock_page(page);
+ folio_lock(folio);
+ btree_clear_folio_dirty(folio);
+ folio_unlock(folio);
}
WARN_ON(atomic_read(&eb->refs) == 0);
}
void set_extent_buffer_dirty(struct extent_buffer *eb)
{
- int i;
- int num_pages;
+ int num_folios;
bool was_dirty;
check_buffer_tree_ref(eb);
was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
- num_pages = num_extent_pages(eb);
+ num_folios = num_extent_folios(eb);
WARN_ON(atomic_read(&eb->refs) == 0);
WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
@@ -3805,34 +3980,32 @@ void set_extent_buffer_dirty(struct extent_buffer *eb)
* the above race.
*/
if (subpage)
- lock_page(eb->pages[0]);
- for (i = 0; i < num_pages; i++)
- btrfs_page_set_dirty(eb->fs_info, eb->pages[i],
- eb->start, eb->len);
+ lock_page(folio_page(eb->folios[0], 0));
+ for (int i = 0; i < num_folios; i++)
+ btrfs_folio_set_dirty(eb->fs_info, eb->folios[i],
+ eb->start, eb->len);
if (subpage)
- unlock_page(eb->pages[0]);
+ unlock_page(folio_page(eb->folios[0], 0));
percpu_counter_add_batch(&eb->fs_info->dirty_metadata_bytes,
eb->len,
eb->fs_info->dirty_metadata_batch);
}
#ifdef CONFIG_BTRFS_DEBUG
- for (i = 0; i < num_pages; i++)
- ASSERT(PageDirty(eb->pages[i]));
+ for (int i = 0; i < num_folios; i++)
+ ASSERT(folio_test_dirty(eb->folios[i]));
#endif
}
void clear_extent_buffer_uptodate(struct extent_buffer *eb)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
- struct page *page;
- int num_pages;
- int i;
+ int num_folios = num_extent_folios(eb);
clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
- num_pages = num_extent_pages(eb);
- for (i = 0; i < num_pages; i++) {
- page = eb->pages[i];
- if (!page)
+ for (int i = 0; i < num_folios; i++) {
+ struct folio *folio = eb->folios[i];
+
+ if (!folio)
continue;
/*
@@ -3840,44 +4013,40 @@ void clear_extent_buffer_uptodate(struct extent_buffer *eb)
* btrfs_is_subpage() can not handle cloned/dummy metadata.
*/
if (fs_info->nodesize >= PAGE_SIZE)
- ClearPageUptodate(page);
+ folio_clear_uptodate(folio);
else
- btrfs_subpage_clear_uptodate(fs_info, page, eb->start,
- eb->len);
+ btrfs_subpage_clear_uptodate(fs_info, folio,
+ eb->start, eb->len);
}
}
void set_extent_buffer_uptodate(struct extent_buffer *eb)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
- struct page *page;
- int num_pages;
- int i;
+ int num_folios = num_extent_folios(eb);
set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
- num_pages = num_extent_pages(eb);
- for (i = 0; i < num_pages; i++) {
- page = eb->pages[i];
+ for (int i = 0; i < num_folios; i++) {
+ struct folio *folio = eb->folios[i];
/*
* This is special handling for metadata subpage, as regular
* btrfs_is_subpage() can not handle cloned/dummy metadata.
*/
if (fs_info->nodesize >= PAGE_SIZE)
- SetPageUptodate(page);
+ folio_mark_uptodate(folio);
else
- btrfs_subpage_set_uptodate(fs_info, page, eb->start,
- eb->len);
+ btrfs_subpage_set_uptodate(fs_info, folio,
+ eb->start, eb->len);
}
}
-static void extent_buffer_read_end_io(struct btrfs_bio *bbio)
+static void end_bbio_meta_read(struct btrfs_bio *bbio)
{
struct extent_buffer *eb = bbio->private;
struct btrfs_fs_info *fs_info = eb->fs_info;
bool uptodate = !bbio->bio.bi_status;
- struct bvec_iter_all iter_all;
- struct bio_vec *bvec;
+ struct folio_iter fi;
u32 bio_offset = 0;
eb->read_mirror = bbio->mirror_num;
@@ -3893,15 +4062,15 @@ static void extent_buffer_read_end_io(struct btrfs_bio *bbio)
set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
}
- bio_for_each_segment_all(bvec, &bbio->bio, iter_all) {
+ bio_for_each_folio_all(fi, &bbio->bio) {
+ struct folio *folio = fi.folio;
u64 start = eb->start + bio_offset;
- struct page *page = bvec->bv_page;
- u32 len = bvec->bv_len;
+ u32 len = fi.length;
if (uptodate)
- btrfs_page_set_uptodate(fs_info, page, start, len);
+ btrfs_folio_set_uptodate(fs_info, folio, start, len);
else
- btrfs_page_clear_uptodate(fs_info, page, start, len);
+ btrfs_folio_clear_uptodate(fs_info, folio, start, len);
bio_offset += len;
}
@@ -3917,8 +4086,8 @@ static void extent_buffer_read_end_io(struct btrfs_bio *bbio)
int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
struct btrfs_tree_parent_check *check)
{
- int num_pages = num_extent_pages(eb), i;
struct btrfs_bio *bbio;
+ bool ret;
if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
return 0;
@@ -3942,17 +4111,24 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES,
REQ_OP_READ | REQ_META, eb->fs_info,
- extent_buffer_read_end_io, eb);
+ end_bbio_meta_read, eb);
bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT;
bbio->inode = BTRFS_I(eb->fs_info->btree_inode);
bbio->file_offset = eb->start;
memcpy(&bbio->parent_check, check, sizeof(*check));
if (eb->fs_info->nodesize < PAGE_SIZE) {
- __bio_add_page(&bbio->bio, eb->pages[0], eb->len,
- eb->start - page_offset(eb->pages[0]));
+ ret = bio_add_folio(&bbio->bio, eb->folios[0], eb->len,
+ eb->start - folio_pos(eb->folios[0]));
+ ASSERT(ret);
} else {
- for (i = 0; i < num_pages; i++)
- __bio_add_page(&bbio->bio, eb->pages[i], PAGE_SIZE, 0);
+ int num_folios = num_extent_folios(eb);
+
+ for (int i = 0; i < num_folios; i++) {
+ struct folio *folio = eb->folios[i];
+
+ ret = bio_add_folio(&bbio->bio, folio, folio_size(folio), 0);
+ ASSERT(ret);
+ }
}
btrfs_submit_bio(bbio, mirror_num);
@@ -3999,29 +4175,33 @@ static inline int check_eb_range(const struct extent_buffer *eb,
void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
unsigned long start, unsigned long len)
{
+ const int unit_size = folio_size(eb->folios[0]);
size_t cur;
size_t offset;
- struct page *page;
- char *kaddr;
char *dst = (char *)dstv;
- unsigned long i = get_eb_page_index(start);
+ unsigned long i = get_eb_folio_index(eb, start);
if (check_eb_range(eb, start, len)) {
/*
* Invalid range hit, reset the memory, so callers won't get
- * some random garbage for their uninitialzed memory.
+ * some random garbage for their uninitialized memory.
*/
memset(dstv, 0, len);
return;
}
- offset = get_eb_offset_in_page(eb, start);
+ if (eb->addr) {
+ memcpy(dstv, eb->addr + start, len);
+ return;
+ }
+
+ offset = get_eb_offset_in_folio(eb, start);
while (len > 0) {
- page = eb->pages[i];
+ char *kaddr;
- cur = min(len, (PAGE_SIZE - offset));
- kaddr = page_address(page);
+ cur = min(len, unit_size - offset);
+ kaddr = folio_address(eb->folios[i]);
memcpy(dst, kaddr + offset, cur);
dst += cur;
@@ -4035,24 +4215,29 @@ int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
void __user *dstv,
unsigned long start, unsigned long len)
{
+ const int unit_size = folio_size(eb->folios[0]);
size_t cur;
size_t offset;
- struct page *page;
- char *kaddr;
char __user *dst = (char __user *)dstv;
- unsigned long i = get_eb_page_index(start);
+ unsigned long i = get_eb_folio_index(eb, start);
int ret = 0;
WARN_ON(start > eb->len);
WARN_ON(start + len > eb->start + eb->len);
- offset = get_eb_offset_in_page(eb, start);
+ if (eb->addr) {
+ if (copy_to_user_nofault(dstv, eb->addr + start, len))
+ ret = -EFAULT;
+ return ret;
+ }
+
+ offset = get_eb_offset_in_folio(eb, start);
while (len > 0) {
- page = eb->pages[i];
+ char *kaddr;
- cur = min(len, (PAGE_SIZE - offset));
- kaddr = page_address(page);
+ cur = min(len, unit_size - offset);
+ kaddr = folio_address(eb->folios[i]);
if (copy_to_user_nofault(dst, kaddr + offset, cur)) {
ret = -EFAULT;
break;
@@ -4070,25 +4255,25 @@ int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
unsigned long start, unsigned long len)
{
+ const int unit_size = folio_size(eb->folios[0]);
size_t cur;
size_t offset;
- struct page *page;
char *kaddr;
char *ptr = (char *)ptrv;
- unsigned long i = get_eb_page_index(start);
+ unsigned long i = get_eb_folio_index(eb, start);
int ret = 0;
if (check_eb_range(eb, start, len))
return -EINVAL;
- offset = get_eb_offset_in_page(eb, start);
-
- while (len > 0) {
- page = eb->pages[i];
+ if (eb->addr)
+ return memcmp(ptrv, eb->addr + start, len);
- cur = min(len, (PAGE_SIZE - offset));
+ offset = get_eb_offset_in_folio(eb, start);
- kaddr = page_address(page);
+ while (len > 0) {
+ cur = min(len, unit_size - offset);
+ kaddr = folio_address(eb->folios[i]);
ret = memcmp(ptr, kaddr + offset, cur);
if (ret)
break;
@@ -4107,10 +4292,12 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
* For regular sector size == PAGE_SIZE case, check if @page is uptodate.
* For subpage case, check if the range covered by the eb has EXTENT_UPTODATE.
*/
-static void assert_eb_page_uptodate(const struct extent_buffer *eb,
- struct page *page)
+static void assert_eb_folio_uptodate(const struct extent_buffer *eb, int i)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
+ struct folio *folio = eb->folios[i];
+
+ ASSERT(folio);
/*
* If we are using the commit root we could potentially clear a page
@@ -4124,11 +4311,14 @@ static void assert_eb_page_uptodate(const struct extent_buffer *eb,
return;
if (fs_info->nodesize < PAGE_SIZE) {
- if (WARN_ON(!btrfs_subpage_test_uptodate(fs_info, page,
+ struct folio *folio = eb->folios[0];
+
+ ASSERT(i == 0);
+ if (WARN_ON(!btrfs_subpage_test_uptodate(fs_info, folio,
eb->start, eb->len)))
- btrfs_subpage_dump_bitmap(fs_info, page, eb->start, eb->len);
+ btrfs_subpage_dump_bitmap(fs_info, folio, eb->start, eb->len);
} else {
- WARN_ON(!PageUptodate(page));
+ WARN_ON(!folio_test_uptodate(folio));
}
}
@@ -4136,29 +4326,34 @@ static void __write_extent_buffer(const struct extent_buffer *eb,
const void *srcv, unsigned long start,
unsigned long len, bool use_memmove)
{
+ const int unit_size = folio_size(eb->folios[0]);
size_t cur;
size_t offset;
- struct page *page;
char *kaddr;
char *src = (char *)srcv;
- unsigned long i = get_eb_page_index(start);
+ unsigned long i = get_eb_folio_index(eb, start);
/* For unmapped (dummy) ebs, no need to check their uptodate status. */
const bool check_uptodate = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
- WARN_ON(test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags));
-
if (check_eb_range(eb, start, len))
return;
- offset = get_eb_offset_in_page(eb, start);
+ if (eb->addr) {
+ if (use_memmove)
+ memmove(eb->addr + start, srcv, len);
+ else
+ memcpy(eb->addr + start, srcv, len);
+ return;
+ }
+
+ offset = get_eb_offset_in_folio(eb, start);
while (len > 0) {
- page = eb->pages[i];
if (check_uptodate)
- assert_eb_page_uptodate(eb, page);
+ assert_eb_folio_uptodate(eb, i);
- cur = min(len, PAGE_SIZE - offset);
- kaddr = page_address(page);
+ cur = min(len, unit_size - offset);
+ kaddr = folio_address(eb->folios[i]);
if (use_memmove)
memmove(kaddr + offset, src, cur);
else
@@ -4180,16 +4375,21 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv,
static void memset_extent_buffer(const struct extent_buffer *eb, int c,
unsigned long start, unsigned long len)
{
+ const int unit_size = folio_size(eb->folios[0]);
unsigned long cur = start;
+ if (eb->addr) {
+ memset(eb->addr + start, c, len);
+ return;
+ }
+
while (cur < start + len) {
- unsigned long index = get_eb_page_index(cur);
- unsigned int offset = get_eb_offset_in_page(eb, cur);
- unsigned int cur_len = min(start + len - cur, PAGE_SIZE - offset);
- struct page *page = eb->pages[index];
+ unsigned long index = get_eb_folio_index(eb, cur);
+ unsigned int offset = get_eb_offset_in_folio(eb, cur);
+ unsigned int cur_len = min(start + len - cur, unit_size - offset);
- assert_eb_page_uptodate(eb, page);
- memset(page_address(page) + offset, c, cur_len);
+ assert_eb_folio_uptodate(eb, index);
+ memset(folio_address(eb->folios[index]) + offset, c, cur_len);
cur += cur_len;
}
@@ -4206,15 +4406,16 @@ void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
void copy_extent_buffer_full(const struct extent_buffer *dst,
const struct extent_buffer *src)
{
+ const int unit_size = folio_size(src->folios[0]);
unsigned long cur = 0;
ASSERT(dst->len == src->len);
while (cur < src->len) {
- unsigned long index = get_eb_page_index(cur);
- unsigned long offset = get_eb_offset_in_page(src, cur);
- unsigned long cur_len = min(src->len, PAGE_SIZE - offset);
- void *addr = page_address(src->pages[index]) + offset;
+ unsigned long index = get_eb_folio_index(src, cur);
+ unsigned long offset = get_eb_offset_in_folio(src, cur);
+ unsigned long cur_len = min(src->len, unit_size - offset);
+ void *addr = folio_address(src->folios[index]) + offset;
write_extent_buffer(dst, addr, cur, cur_len);
@@ -4227,12 +4428,12 @@ void copy_extent_buffer(const struct extent_buffer *dst,
unsigned long dst_offset, unsigned long src_offset,
unsigned long len)
{
+ const int unit_size = folio_size(dst->folios[0]);
u64 dst_len = dst->len;
size_t cur;
size_t offset;
- struct page *page;
char *kaddr;
- unsigned long i = get_eb_page_index(dst_offset);
+ unsigned long i = get_eb_folio_index(dst, dst_offset);
if (check_eb_range(dst, dst_offset, len) ||
check_eb_range(src, src_offset, len))
@@ -4240,15 +4441,14 @@ void copy_extent_buffer(const struct extent_buffer *dst,
WARN_ON(src->len != dst_len);
- offset = get_eb_offset_in_page(dst, dst_offset);
+ offset = get_eb_offset_in_folio(dst, dst_offset);
while (len > 0) {
- page = dst->pages[i];
- assert_eb_page_uptodate(dst, page);
+ assert_eb_folio_uptodate(dst, i);
- cur = min(len, (unsigned long)(PAGE_SIZE - offset));
+ cur = min(len, (unsigned long)(unit_size - offset));
- kaddr = page_address(page);
+ kaddr = folio_address(dst->folios[i]);
read_extent_buffer(src, kaddr + offset, src_offset, cur);
src_offset += cur;
@@ -4259,22 +4459,22 @@ void copy_extent_buffer(const struct extent_buffer *dst,
}
/*
- * Calculate the page and offset of the byte containing the given bit number.
+ * Calculate the folio and offset of the byte containing the given bit number.
*
* @eb: the extent buffer
* @start: offset of the bitmap item in the extent buffer
* @nr: bit number
- * @page_index: return index of the page in the extent buffer that contains
+ * @folio_index: return index of the folio in the extent buffer that contains
* the given bit number
- * @page_offset: return offset into the page given by page_index
+ * @folio_offset: return offset into the folio given by folio_index
*
* This helper hides the ugliness of finding the byte in an extent buffer which
* contains a given bit.
*/
static inline void eb_bitmap_offset(const struct extent_buffer *eb,
unsigned long start, unsigned long nr,
- unsigned long *page_index,
- size_t *page_offset)
+ unsigned long *folio_index,
+ size_t *folio_offset)
{
size_t byte_offset = BIT_BYTE(nr);
size_t offset;
@@ -4284,10 +4484,10 @@ static inline void eb_bitmap_offset(const struct extent_buffer *eb,
* the bitmap item in the extent buffer + the offset of the byte in the
* bitmap item.
*/
- offset = start + offset_in_page(eb->start) + byte_offset;
+ offset = start + offset_in_folio(eb->folios[0], eb->start) + byte_offset;
- *page_index = offset >> PAGE_SHIFT;
- *page_offset = offset_in_page(offset);
+ *folio_index = offset >> folio_shift(eb->folios[0]);
+ *folio_offset = offset_in_folio(eb->folios[0], offset);
}
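
[Worked example for the offset math above, with assumed numbers: on a 64K
page system with 4K nodesize, take an eb starting 4096 bytes into its first
folio, start = 64 and nr = 13. Then byte_offset = BIT_BYTE(13) = 1, so
offset = 64 + 4096 + 1 = 4161, giving *folio_index = 4161 >> 16 = 0 and
*folio_offset = 4161; extent_buffer_test_bit() then reads
(kaddr[4161] >> (13 & 7)) & 1.]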
/*
@@ -4300,25 +4500,23 @@ static inline void eb_bitmap_offset(const struct extent_buffer *eb,
int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
unsigned long nr)
{
- u8 *kaddr;
- struct page *page;
unsigned long i;
size_t offset;
+ u8 *kaddr;
eb_bitmap_offset(eb, start, nr, &i, &offset);
- page = eb->pages[i];
- assert_eb_page_uptodate(eb, page);
- kaddr = page_address(page);
+ assert_eb_folio_uptodate(eb, i);
+ kaddr = folio_address(eb->folios[i]);
return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1)));
}
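/*
 * Illustrative sketch (not part of the patch above): the bitmap addressing
 * used by eb_bitmap_offset()/extent_buffer_test_bit(), in self-contained
 * userspace C. The flat buffer is a stand-in for the folio layout.
 */
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#define BITS_PER_BYTE 8
#define BIT_BYTE(nr)  ((nr) / BITS_PER_BYTE)

/* Test bit 'nr' of a bitmap item that starts 'start' bytes into 'buf'. */
static int test_bitmap_bit(const uint8_t *buf, size_t start, size_t nr)
{
	size_t byte = start + BIT_BYTE(nr);

	return 1U & (buf[byte] >> (nr & (BITS_PER_BYTE - 1)));
}

int main(void)
{
	uint8_t buf[16] = { 0 };

	buf[4] |= 1U << 3;			/* set bit 3 of byte 4 */
	assert(test_bitmap_bit(buf, 4, 3));	/* bitmap item starts at byte 4 */
	assert(!test_bitmap_bit(buf, 4, 2));
	return 0;
}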
static u8 *extent_buffer_get_byte(const struct extent_buffer *eb, unsigned long bytenr)
{
- unsigned long index = get_eb_page_index(bytenr);
+ unsigned long index = get_eb_folio_index(eb, bytenr);
if (check_eb_range(eb, bytenr, 1))
return NULL;
- return page_address(eb->pages[index]) + get_eb_offset_in_page(eb, bytenr);
+ return folio_address(eb->folios[index]) + get_eb_offset_in_folio(eb, bytenr);
}
/*
@@ -4403,19 +4601,30 @@ void memcpy_extent_buffer(const struct extent_buffer *dst,
unsigned long dst_offset, unsigned long src_offset,
unsigned long len)
{
+ const int unit_size = folio_size(dst->folios[0]);
unsigned long cur_off = 0;
if (check_eb_range(dst, dst_offset, len) ||
check_eb_range(dst, src_offset, len))
return;
+ if (dst->addr) {
+ const bool use_memmove = areas_overlap(src_offset, dst_offset, len);
+
+ if (use_memmove)
+ memmove(dst->addr + dst_offset, dst->addr + src_offset, len);
+ else
+ memcpy(dst->addr + dst_offset, dst->addr + src_offset, len);
+ return;
+ }
+
while (cur_off < len) {
unsigned long cur_src = cur_off + src_offset;
- unsigned long pg_index = get_eb_page_index(cur_src);
- unsigned long pg_off = get_eb_offset_in_page(dst, cur_src);
+ unsigned long folio_index = get_eb_folio_index(dst, cur_src);
+ unsigned long folio_off = get_eb_offset_in_folio(dst, cur_src);
unsigned long cur_len = min(src_offset + len - cur_src,
- PAGE_SIZE - pg_off);
- void *src_addr = page_address(dst->pages[pg_index]) + pg_off;
+ unit_size - folio_off);
+ void *src_addr = folio_address(dst->folios[folio_index]) + folio_off;
const bool use_memmove = areas_overlap(src_offset + cur_off,
dst_offset + cur_off, cur_len);
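/*
 * Illustrative sketch (not part of the patch above): the memmove-vs-memcpy
 * choice made by the new dst->addr fast path, with areas_overlap()
 * re-implemented locally; its semantics are assumed to match the btrfs
 * helper of the same name.
 */
#include <assert.h>
#include <stdbool.h>
#include <string.h>

static bool areas_overlap(unsigned long src, unsigned long dst,
			  unsigned long len)
{
	unsigned long distance = (src > dst) ? src - dst : dst - src;

	return distance < len;
}

/* Copy within one flat buffer, as the contiguous-mapping fast path does. */
static void copy_within(char *base, unsigned long dst_off,
			unsigned long src_off, unsigned long len)
{
	if (areas_overlap(src_off, dst_off, len))
		memmove(base + dst_off, base + src_off, len);
	else
		memcpy(base + dst_off, base + src_off, len);
}

int main(void)
{
	char buf[8] = "abcdef";

	copy_within(buf, 2, 0, 4);	/* ranges overlap: memmove is taken */
	assert(memcmp(buf, "ababcd", 6) == 0);
	return 0;
}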
@@ -4441,24 +4650,29 @@ void memmove_extent_buffer(const struct extent_buffer *dst,
return;
}
+ if (dst->addr) {
+ memmove(dst->addr + dst_offset, dst->addr + src_offset, len);
+ return;
+ }
+
while (len > 0) {
unsigned long src_i;
size_t cur;
- size_t dst_off_in_page;
- size_t src_off_in_page;
+ size_t dst_off_in_folio;
+ size_t src_off_in_folio;
void *src_addr;
bool use_memmove;
- src_i = get_eb_page_index(src_end);
+ src_i = get_eb_folio_index(dst, src_end);
- dst_off_in_page = get_eb_offset_in_page(dst, dst_end);
- src_off_in_page = get_eb_offset_in_page(dst, src_end);
+ dst_off_in_folio = get_eb_offset_in_folio(dst, dst_end);
+ src_off_in_folio = get_eb_offset_in_folio(dst, src_end);
- cur = min_t(unsigned long, len, src_off_in_page + 1);
- cur = min(cur, dst_off_in_page + 1);
+ cur = min_t(unsigned long, len, src_off_in_folio + 1);
+ cur = min(cur, dst_off_in_folio + 1);
- src_addr = page_address(dst->pages[src_i]) + src_off_in_page -
- cur + 1;
+ src_addr = folio_address(dst->folios[src_i]) + src_off_in_folio -
+ cur + 1;
use_memmove = areas_overlap(src_end - cur + 1, dst_end - cur + 1,
cur);
@@ -4520,7 +4734,7 @@ static int try_release_subpage_extent_buffer(struct page *page)
struct extent_buffer *eb = NULL;
/*
- * Unlike try_release_extent_buffer() which uses page->private
+ * Unlike try_release_extent_buffer() which uses folio private
* to grab buffer, for subpage case we rely on radix tree, thus
* we need to ensure radix tree consistency.
*
@@ -4560,43 +4774,44 @@ static int try_release_subpage_extent_buffer(struct page *page)
/*
* Here we don't care about the return value, we will always
- * check the page private at the end. And
+ * check the folio private at the end. And
* release_extent_buffer() will release the refs_lock.
*/
release_extent_buffer(eb);
}
/*
- * Finally to check if we have cleared page private, as if we have
- * released all ebs in the page, the page private should be cleared now.
+ * Finally, check whether we have cleared folio private: once all
+ * ebs in the page have been released, folio private should be cleared.
*/
- spin_lock(&page->mapping->private_lock);
- if (!PagePrivate(page))
+ spin_lock(&page->mapping->i_private_lock);
+ if (!folio_test_private(page_folio(page)))
ret = 1;
else
ret = 0;
- spin_unlock(&page->mapping->private_lock);
+ spin_unlock(&page->mapping->i_private_lock);
return ret;
}
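/*
 * Illustrative sketch (not part of the patch above): the "folio private
 * points at the eb" lookup shape that the release paths here depend on.
 * This is a hedged illustration, not a drop-in kernel function; a real
 * caller must also take a reference on the eb before dropping the lock.
 */
static struct extent_buffer *eb_from_folio(struct address_space *mapping,
					   struct folio *folio)
{
	struct extent_buffer *eb = NULL;

	spin_lock(&mapping->i_private_lock);
	if (folio_test_private(folio))
		eb = folio_get_private(folio);	/* stashed when attached */
	spin_unlock(&mapping->i_private_lock);

	return eb;
}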
int try_release_extent_buffer(struct page *page)
{
+ struct folio *folio = page_folio(page);
struct extent_buffer *eb;
if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
return try_release_subpage_extent_buffer(page);
/*
- * We need to make sure nobody is changing page->private, as we rely on
- * page->private as the pointer to extent buffer.
+ * We need to make sure nobody is changing folio private, as we rely on
+ * folio private as the pointer to extent buffer.
*/
- spin_lock(&page->mapping->private_lock);
- if (!PagePrivate(page)) {
- spin_unlock(&page->mapping->private_lock);
+ spin_lock(&page->mapping->i_private_lock);
+ if (!folio_test_private(folio)) {
+ spin_unlock(&page->mapping->i_private_lock);
return 1;
}
- eb = (struct extent_buffer *)page->private;
+ eb = folio_get_private(folio);
BUG_ON(!eb);
/*
@@ -4607,10 +4822,10 @@ int try_release_extent_buffer(struct page *page)
spin_lock(&eb->refs_lock);
if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
spin_unlock(&eb->refs_lock);
- spin_unlock(&page->mapping->private_lock);
+ spin_unlock(&page->mapping->i_private_lock);
return 0;
}
- spin_unlock(&page->mapping->private_lock);
+ spin_unlock(&page->mapping->i_private_lock);
/*
* If tree ref isn't set then we know the ref on this eb is a real ref,
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 2171057a4477..46050500529b 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -28,7 +28,8 @@ enum {
EXTENT_BUFFER_IN_TREE,
/* write IO error */
EXTENT_BUFFER_WRITE_ERR,
- EXTENT_BUFFER_NO_CHECK,
+ /* Indicate the extent buffer is written zeroed out (for zoned) */
+ EXTENT_BUFFER_ZONED_ZEROOUT,
/* Indicate that extent buffer pages are being read */
EXTENT_BUFFER_READING,
};
@@ -43,10 +44,10 @@ enum {
};
/*
- * page->private values. Every page that is controlled by the extent
- * map has page->private set to one.
+ * Folio private values. Every page that is controlled by the extent map has
+ * folio private set to this value.
*/
-#define EXTENT_PAGE_PRIVATE 1
+#define EXTENT_FOLIO_PRIVATE 1
/*
* The extent buffer bitmap operations are done with byte granularity instead of
@@ -77,6 +78,13 @@ struct extent_buffer {
unsigned long len;
unsigned long bflags;
struct btrfs_fs_info *fs_info;
+
+ /*
+ * The address where the eb can be accessed without any cross-page handling.
+ * This can be NULL if not possible.
+ */
+ void *addr;
+
spinlock_t refs_lock;
atomic_t refs;
int read_mirror;
@@ -86,7 +94,12 @@ struct extent_buffer {
struct rw_semaphore lock;
- struct page *pages[INLINE_EXTENT_BUFFER_PAGES];
+ /*
+ * Pointers to all the folios of the extent buffer.
+ *
+ * For now the folio is always order 0 (i.e. a single page).
+ */
+ struct folio *folios[INLINE_EXTENT_BUFFER_PAGES];
#ifdef CONFIG_BTRFS_DEBUG
struct list_head leak_list;
pid_t lock_owner;
@@ -108,29 +121,43 @@ struct btrfs_eb_write_context {
*
* Will handle both sectorsize == PAGE_SIZE and sectorsize < PAGE_SIZE cases.
*/
-static inline size_t get_eb_offset_in_page(const struct extent_buffer *eb,
- unsigned long offset)
+static inline size_t get_eb_offset_in_folio(const struct extent_buffer *eb,
+ unsigned long offset)
{
/*
- * For sectorsize == PAGE_SIZE case, eb->start will always be aligned
- * to PAGE_SIZE, thus adding it won't cause any difference.
+ * 1) sectorsize == PAGE_SIZE and nodesize >= PAGE_SIZE case
+ * 1.1) One large folio covering the whole eb
+ * The eb->start is aligned to folio size, thus adding it
+ * won't cause any difference.
+ * 1.2) Several page sized folios
+ * The eb->start is aligned to folio (page) size, thus
+ * adding it won't cause any difference.
*
- * For sectorsize < PAGE_SIZE, we must only read the data that belongs
- * to the eb, thus we have to take the eb->start into consideration.
+ * 2) sectorsize < PAGE_SIZE and nodesize < PAGE_SIZE case
+ * In this case there would only be one page sized folio, and there
+ * may be several different extent buffers in the page/folio.
+ * We need to add eb->start to properly access the offset inside
+ * that eb.
*/
- return offset_in_page(offset + eb->start);
+ return offset_in_folio(eb->folios[0], offset + eb->start);
}
-static inline unsigned long get_eb_page_index(unsigned long offset)
+static inline unsigned long get_eb_folio_index(const struct extent_buffer *eb,
+ unsigned long offset)
{
/*
- * For sectorsize == PAGE_SIZE case, plain >> PAGE_SHIFT is enough.
+ * 1) sectorsize == PAGE_SIZE and nodesize >= PAGE_SIZE case
+ * 1.1) One large folio covering the whole eb.
+ * the folio_shift would be large enough to always make us
+ * return 0 as index.
+ * 1.2) Several page sized folios
+ * The folio_shift() would be PAGE_SHIFT, giving us the correct
+ * index.
*
- * For sectorsize < PAGE_SIZE case, we only support 64K PAGE_SIZE,
- * and have ensured that all tree blocks are contained in one page,
- * thus we always get index == 0.
+ * 2) sectorsize < PAGE_SIZE and nodesize < PAGE_SIZE case
+ * The folio would only be page sized, and always give us 0 as index.
*/
- return offset >> PAGE_SHIFT;
+ return offset >> folio_shift(eb->folios[0]);
}
/*
@@ -230,6 +257,20 @@ static inline int num_extent_pages(const struct extent_buffer *eb)
return (eb->len >> PAGE_SHIFT) ?: 1;
}
+/*
+ * This can only be determined at runtime by checking eb::folios[0],
+ * as we can have either one large folio covering the whole eb
+ * (nodesize <= PAGE_SIZE, or a high-order folio) or multiple
+ * single-page folios.
+ */
+static inline int num_extent_folios(const struct extent_buffer *eb)
+{
+ if (folio_order(eb->folios[0]))
+ return 1;
+ return num_extent_pages(eb);
+}
+
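/*
 * Illustrative usage (not part of the patch above): iterating over an eb's
 * backing folios with the helper, assuming eb->folios[] is fully populated.
 */
for (int i = 0; i < num_extent_folios(eb); i++) {
	struct folio *folio = eb->folios[i];

	/* ... per-folio work, e.g. folio_mark_dirty(folio) ... */
}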
static inline int extent_buffer_uptodate(const struct extent_buffer *eb)
{
return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
@@ -294,7 +335,8 @@ int extent_invalidate_folio(struct extent_io_tree *tree,
void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
struct extent_buffer *buf);
-int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array);
+int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array,
+ gfp_t extra_gfp);
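/*
 * Illustrative usage (not part of the patch above) of the new extra_gfp
 * parameter. The assumption here is that extra_gfp is OR-ed into the
 * allocator's base GFP mask; the converted call sites in this diff simply
 * pass 0 to keep the old behaviour.
 */
struct page *pages[16] = { NULL };

if (btrfs_alloc_page_array(16, pages, 0))	/* 0 == no extra flags */
	return -ENOMEM;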
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
bool find_lock_delalloc_range(struct inode *inode,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index a6d8368ed0ed..b61099bf97a8 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -50,7 +50,6 @@ struct extent_map *alloc_extent_map(void)
if (!em)
return NULL;
RB_CLEAR_NODE(&em->rb_node);
- em->compress_type = BTRFS_COMPRESS_NONE;
refcount_set(&em->refs, 1);
INIT_LIST_HEAD(&em->list);
return em;
@@ -67,8 +66,6 @@ void free_extent_map(struct extent_map *em)
if (refcount_dec_and_test(&em->refs)) {
WARN_ON(extent_map_in_tree(em));
WARN_ON(!list_empty(&em->list));
- if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags))
- kfree(em->map_lookup);
kmem_cache_free(extent_map_cache, em);
}
}
@@ -182,50 +179,50 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
return NULL;
}
-/* Check to see if two extent_map structs are adjacent and safe to merge. */
-static int mergable_maps(struct extent_map *prev, struct extent_map *next)
+static inline u64 extent_map_block_end(const struct extent_map *em)
{
- if (test_bit(EXTENT_FLAG_PINNED, &prev->flags))
- return 0;
+ if (em->block_start + em->block_len < em->block_start)
+ return (u64)-1;
+ return em->block_start + em->block_len;
+}
- /*
- * don't merge compressed extents, we need to know their
- * actual size
- */
- if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags))
- return 0;
+static bool can_merge_extent_map(const struct extent_map *em)
+{
+ if (em->flags & EXTENT_FLAG_PINNED)
+ return false;
+
+ /* Don't merge compressed extents, we need to know their actual size. */
+ if (extent_map_is_compressed(em))
+ return false;
- if (test_bit(EXTENT_FLAG_LOGGING, &prev->flags) ||
- test_bit(EXTENT_FLAG_LOGGING, &next->flags))
- return 0;
+ if (em->flags & EXTENT_FLAG_LOGGING)
+ return false;
/*
* We don't want to merge stuff that hasn't been written to the log yet
* since it may not reflect exactly what is on disk, and that would be
* bad.
*/
- if (!list_empty(&prev->list) || !list_empty(&next->list))
- return 0;
-
- ASSERT(next->block_start != EXTENT_MAP_DELALLOC &&
- prev->block_start != EXTENT_MAP_DELALLOC);
-
- if (prev->map_lookup || next->map_lookup)
- ASSERT(test_bit(EXTENT_FLAG_FS_MAPPING, &prev->flags) &&
- test_bit(EXTENT_FLAG_FS_MAPPING, &next->flags));
-
- if (extent_map_end(prev) == next->start &&
- prev->flags == next->flags &&
- prev->map_lookup == next->map_lookup &&
- ((next->block_start == EXTENT_MAP_HOLE &&
- prev->block_start == EXTENT_MAP_HOLE) ||
- (next->block_start == EXTENT_MAP_INLINE &&
- prev->block_start == EXTENT_MAP_INLINE) ||
- (next->block_start < EXTENT_MAP_LAST_BYTE - 1 &&
- next->block_start == extent_map_block_end(prev)))) {
- return 1;
- }
- return 0;
+ if (!list_empty(&em->list))
+ return false;
+
+ return true;
+}
+
+/* Check to see if two extent_map structs are adjacent and safe to merge. */
+static bool mergeable_maps(const struct extent_map *prev, const struct extent_map *next)
+{
+ if (extent_map_end(prev) != next->start)
+ return false;
+
+ if (prev->flags != next->flags)
+ return false;
+
+ if (next->block_start < EXTENT_MAP_LAST_BYTE - 1)
+ return next->block_start == extent_map_block_end(prev);
+
+ /* HOLES and INLINE extents. */
+ return next->block_start == prev->block_start;
}
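/*
 * Illustrative sketch (not part of the patch above): a self-contained
 * model of the split predicate. can_merge-style checks look at per-map
 * state; mergeable() checks the pairwise geometry. Field names are trimmed
 * to what the logic needs, and the "real extent" threshold is simplified.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define MAP_HOLE    ((uint64_t)-3)	/* stand-in for the special values */
#define FLAG_PINNED 0x1u

struct map { uint64_t start, len, block_start, block_len; uint32_t flags; };

static uint64_t map_end(const struct map *m)   { return m->start + m->len; }
static uint64_t block_end(const struct map *m) { return m->block_start + m->block_len; }

static bool mergeable(const struct map *prev, const struct map *next)
{
	if (map_end(prev) != next->start || prev->flags != next->flags)
		return false;
	if (next->block_start < MAP_HOLE)	/* real, on-disk extents */
		return next->block_start == block_end(prev);
	return next->block_start == prev->block_start;	/* holes/inline */
}

int main(void)
{
	struct map a = { 0,    4096, 1 << 20,          4096, 0 };
	struct map b = { 4096, 4096, (1 << 20) + 4096, 4096, 0 };

	assert(mergeable(&a, &b));	/* logically and physically adjacent */
	b.flags = FLAG_PINNED;
	assert(!mergeable(&a, &b));	/* differing flags block the merge */
	return 0;
}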
static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
@@ -244,11 +241,14 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
if (refcount_read(&em->refs) > 2)
return;
+ if (!can_merge_extent_map(em))
+ return;
+
if (em->start != 0) {
rb = rb_prev(&em->rb_node);
if (rb)
merge = rb_entry(rb, struct extent_map, rb_node);
- if (rb && mergable_maps(merge, em)) {
+ if (rb && can_merge_extent_map(merge) && mergeable_maps(merge, em)) {
em->start = merge->start;
em->orig_start = merge->orig_start;
em->len += merge->len;
@@ -257,7 +257,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
em->mod_start = merge->mod_start;
em->generation = max(em->generation, merge->generation);
- set_bit(EXTENT_FLAG_MERGED, &em->flags);
+ em->flags |= EXTENT_FLAG_MERGED;
rb_erase_cached(&merge->rb_node, &tree->map);
RB_CLEAR_NODE(&merge->rb_node);
@@ -268,14 +268,14 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
rb = rb_next(&em->rb_node);
if (rb)
merge = rb_entry(rb, struct extent_map, rb_node);
- if (rb && mergable_maps(em, merge)) {
+ if (rb && can_merge_extent_map(merge) && mergeable_maps(em, merge)) {
em->len += merge->len;
em->block_len += merge->block_len;
rb_erase_cached(&merge->rb_node, &tree->map);
RB_CLEAR_NODE(&merge->rb_node);
em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
em->generation = max(em->generation, merge->generation);
- set_bit(EXTENT_FLAG_MERGED, &em->flags);
+ em->flags |= EXTENT_FLAG_MERGED;
free_extent_map(merge);
}
}
@@ -283,7 +283,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
/*
* Unpin an extent from the cache.
*
- * @tree: tree to unpin the extent in
+ * @inode: the inode from which we are unpinning an extent range
* @start: logical offset in the file
* @len: length of the extent
* @gen: generation that this extent has been modified in
@@ -292,9 +292,10 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
* to the generation that actually added the file item to the inode so we know
* we need to sync this extent when we call fsync().
*/
-int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
- u64 gen)
+int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen)
{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct extent_map_tree *tree = &inode->extent_tree;
int ret = 0;
struct extent_map *em;
bool prealloc = false;
@@ -302,19 +303,28 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
write_lock(&tree->lock);
em = lookup_extent_mapping(tree, start, len);
- WARN_ON(!em || em->start != start);
-
- if (!em)
+ if (WARN_ON(!em)) {
+ btrfs_warn(fs_info,
+"no extent map found for inode %llu (root %lld) when unpinning extent range [%llu, %llu), generation %llu",
+ btrfs_ino(inode), btrfs_root_id(inode->root),
+ start, len, gen);
goto out;
+ }
+
+ if (WARN_ON(em->start != start))
+ btrfs_warn(fs_info,
+"found extent map for inode %llu (root %lld) with unexpected start offset %llu when unpinning extent range [%llu, %llu), generation %llu",
+ btrfs_ino(inode), btrfs_root_id(inode->root),
+ em->start, start, len, gen);
em->generation = gen;
- clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+ em->flags &= ~EXTENT_FLAG_PINNED;
em->mod_start = em->start;
em->mod_len = em->len;
- if (test_bit(EXTENT_FLAG_FILLING, &em->flags)) {
+ if (em->flags & EXTENT_FLAG_FILLING) {
prealloc = true;
- clear_bit(EXTENT_FLAG_FILLING, &em->flags);
+ em->flags &= ~EXTENT_FLAG_FILLING;
}
try_merge_map(tree, em);
@@ -335,7 +345,7 @@ void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em)
{
lockdep_assert_held_write(&tree->lock);
- clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
+ em->flags &= ~EXTENT_FLAG_LOGGING;
if (extent_map_in_tree(em))
try_merge_map(tree, em);
}
@@ -348,45 +358,14 @@ static inline void setup_extent_mapping(struct extent_map_tree *tree,
em->mod_start = em->start;
em->mod_len = em->len;
+ ASSERT(list_empty(&em->list));
+
if (modified)
- list_move(&em->list, &tree->modified_extents);
+ list_add(&em->list, &tree->modified_extents);
else
try_merge_map(tree, em);
}
-static void extent_map_device_set_bits(struct extent_map *em, unsigned bits)
-{
- struct map_lookup *map = em->map_lookup;
- u64 stripe_size = em->orig_block_len;
- int i;
-
- for (i = 0; i < map->num_stripes; i++) {
- struct btrfs_io_stripe *stripe = &map->stripes[i];
- struct btrfs_device *device = stripe->dev;
-
- set_extent_bit(&device->alloc_state, stripe->physical,
- stripe->physical + stripe_size - 1,
- bits | EXTENT_NOWAIT, NULL);
- }
-}
-
-static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits)
-{
- struct map_lookup *map = em->map_lookup;
- u64 stripe_size = em->orig_block_len;
- int i;
-
- for (i = 0; i < map->num_stripes; i++) {
- struct btrfs_io_stripe *stripe = &map->stripes[i];
- struct btrfs_device *device = stripe->dev;
-
- __clear_extent_bit(&device->alloc_state, stripe->physical,
- stripe->physical + stripe_size - 1,
- bits | EXTENT_NOWAIT,
- NULL, NULL);
- }
-}
-
/*
* Add new extent map to the extent tree
*
@@ -400,8 +379,8 @@ static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits)
* into the tree directly, with an additional reference taken, or a
* reference dropped if the merge attempt was successful.
*/
-int add_extent_mapping(struct extent_map_tree *tree,
- struct extent_map *em, int modified)
+static int add_extent_mapping(struct extent_map_tree *tree,
+ struct extent_map *em, int modified)
{
int ret = 0;
@@ -412,10 +391,6 @@ int add_extent_mapping(struct extent_map_tree *tree,
goto out;
setup_extent_mapping(tree, em, modified);
- if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags)) {
- extent_map_device_set_bits(em, CHUNK_ALLOCATED);
- extent_map_device_clear_bits(em, CHUNK_TRIMMED);
- }
out:
return ret;
}
@@ -495,12 +470,10 @@ void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
{
lockdep_assert_held_write(&tree->lock);
- WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
+ WARN_ON(em->flags & EXTENT_FLAG_PINNED);
rb_erase_cached(&em->rb_node, &tree->map);
- if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
+ if (!(em->flags & EXTENT_FLAG_LOGGING))
list_del_init(&em->list);
- if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags))
- extent_map_device_clear_bits(em, CHUNK_ALLOCATED);
RB_CLEAR_NODE(&em->rb_node);
}
@@ -511,9 +484,9 @@ static void replace_extent_mapping(struct extent_map_tree *tree,
{
lockdep_assert_held_write(&tree->lock);
- WARN_ON(test_bit(EXTENT_FLAG_PINNED, &cur->flags));
+ WARN_ON(cur->flags & EXTENT_FLAG_PINNED);
ASSERT(extent_map_in_tree(cur));
- if (!test_bit(EXTENT_FLAG_LOGGING, &cur->flags))
+ if (!(cur->flags & EXTENT_FLAG_LOGGING))
list_del_init(&cur->list);
rb_replace_node_cached(&cur->rb_node, &new->rb_node, &tree->map);
RB_CLEAR_NODE(&cur->rb_node);
@@ -576,7 +549,7 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree,
em->start = start;
em->len = end - start;
if (em->block_start < EXTENT_MAP_LAST_BYTE &&
- !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+ !extent_map_is_compressed(em)) {
em->block_start += start_diff;
em->block_len = em->len;
}
@@ -626,8 +599,6 @@ int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info,
if (ret == -EEXIST) {
struct extent_map *existing;
- ret = 0;
-
existing = search_extent_mapping(em_tree, start, len);
trace_btrfs_handle_em_exist(fs_info, existing, em, start, len);
@@ -681,8 +652,7 @@ static void drop_all_extent_maps_fast(struct extent_map_tree *tree)
node = rb_first_cached(&tree->map);
em = rb_entry(node, struct extent_map, rb_node);
- clear_bit(EXTENT_FLAG_PINNED, &em->flags);
- clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
+ em->flags &= ~(EXTENT_FLAG_PINNED | EXTENT_FLAG_LOGGING);
remove_extent_mapping(tree, em);
free_extent_map(em);
cond_resched_rwlock_write(&tree->lock);
@@ -758,19 +728,18 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
}
}
- if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
+ if (skip_pinned && (em->flags & EXTENT_FLAG_PINNED)) {
start = em_end;
goto next;
}
flags = em->flags;
- clear_bit(EXTENT_FLAG_PINNED, &em->flags);
/*
* In case we split the extent map, we want to preserve the
* EXTENT_FLAG_LOGGING flag on our extent map, but we don't want
* it on the new extent maps.
*/
- clear_bit(EXTENT_FLAG_LOGGING, &flags);
+ em->flags &= ~(EXTENT_FLAG_PINNED | EXTENT_FLAG_LOGGING);
modified = !list_empty(&em->list);
/*
@@ -781,7 +750,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
goto remove_em;
gen = em->generation;
- compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+ compressed = extent_map_is_compressed(em);
if (em->start < start) {
if (!split) {
@@ -814,7 +783,6 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
split->generation = gen;
split->flags = flags;
- split->compress_type = em->compress_type;
replace_extent_mapping(em_tree, em, split, modified);
free_extent_map(split);
split = split2;
@@ -831,7 +799,6 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
split->len = em_end - end;
split->block_start = em->block_start;
split->flags = flags;
- split->compress_type = em->compress_type;
split->generation = gen;
if (em->block_start < EXTENT_MAP_LAST_BYTE) {
@@ -997,14 +964,14 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
}
ASSERT(em->len == len);
- ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
+ ASSERT(!extent_map_is_compressed(em));
ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE);
- ASSERT(test_bit(EXTENT_FLAG_PINNED, &em->flags));
- ASSERT(!test_bit(EXTENT_FLAG_LOGGING, &em->flags));
+ ASSERT(em->flags & EXTENT_FLAG_PINNED);
+ ASSERT(!(em->flags & EXTENT_FLAG_LOGGING));
ASSERT(!list_empty(&em->list));
flags = em->flags;
- clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+ em->flags &= ~EXTENT_FLAG_PINNED;
/* First, replace the em with a new extent_map starting from em->start. */
split_pre->start = em->start;
@@ -1015,7 +982,6 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
split_pre->orig_block_len = split_pre->block_len;
split_pre->ram_bytes = split_pre->len;
split_pre->flags = flags;
- split_pre->compress_type = em->compress_type;
split_pre->generation = em->generation;
replace_extent_mapping(em_tree, em, split_pre, 1);
@@ -1034,7 +1000,6 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
split_mid->orig_block_len = split_mid->block_len;
split_mid->ram_bytes = split_mid->len;
split_mid->flags = flags;
- split_mid->compress_type = em->compress_type;
split_mid->generation = em->generation;
add_extent_mapping(em_tree, split_mid, 1);
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 35d27c756e08..e380fc08bbe4 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -5,30 +5,33 @@
#include <linux/rbtree.h>
#include <linux/refcount.h>
+#include "compression.h"
#define EXTENT_MAP_LAST_BYTE ((u64)-4)
#define EXTENT_MAP_HOLE ((u64)-3)
#define EXTENT_MAP_INLINE ((u64)-2)
-/* used only during fiemap calls */
-#define EXTENT_MAP_DELALLOC ((u64)-1)
/* bits for the extent_map::flags field */
enum {
/* this entry not yet on disk, don't free it */
- EXTENT_FLAG_PINNED,
- EXTENT_FLAG_COMPRESSED,
+ ENUM_BIT(EXTENT_FLAG_PINNED),
+ ENUM_BIT(EXTENT_FLAG_COMPRESS_ZLIB),
+ ENUM_BIT(EXTENT_FLAG_COMPRESS_LZO),
+ ENUM_BIT(EXTENT_FLAG_COMPRESS_ZSTD),
/* pre-allocated extent */
- EXTENT_FLAG_PREALLOC,
+ ENUM_BIT(EXTENT_FLAG_PREALLOC),
/* Logging this extent */
- EXTENT_FLAG_LOGGING,
+ ENUM_BIT(EXTENT_FLAG_LOGGING),
/* Filling in a preallocated extent */
- EXTENT_FLAG_FILLING,
- /* filesystem extent mapping type */
- EXTENT_FLAG_FS_MAPPING,
+ ENUM_BIT(EXTENT_FLAG_FILLING),
/* This em is merged from two or more physically adjacent ems */
- EXTENT_FLAG_MERGED,
+ ENUM_BIT(EXTENT_FLAG_MERGED),
};
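/*
 * Note (not part of the patch above): ENUM_BIT() turns each enum entry
 * into a single-bit mask while keeping the declaration order, roughly as
 * sketched below; see fs/btrfs/misc.h for the exact definition.
 */
#define ENUM_BIT(name)					\
	__ ## name ## _BIT,				\
	name = (1U << __ ## name ## _BIT),		\
	__ ## name ## _SEQ = __ ## name ## _BIT

enum {
	ENUM_BIT(FLAG_A),	/* FLAG_A == 1U << 0 */
	ENUM_BIT(FLAG_B),	/* FLAG_B == 1U << 1 */
};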
+/*
+ * Keep this structure as compact as possible, as we can have a really
+ * large number of allocated extent maps at any time.
+ */
struct extent_map {
struct rb_node rb_node;
@@ -49,11 +52,8 @@ struct extent_map {
* For non-merged extents, it's from btrfs_file_extent_item::generation.
*/
u64 generation;
- unsigned long flags;
- /* Used for chunk mappings, flag EXTENT_FLAG_FS_MAPPING must be set */
- struct map_lookup *map_lookup;
+ u32 flags;
refcount_t refs;
- unsigned int compress_type;
struct list_head list;
};
@@ -65,30 +65,57 @@ struct extent_map_tree {
struct btrfs_inode;
+static inline void extent_map_set_compression(struct extent_map *em,
+ enum btrfs_compression_type type)
+{
+ if (type == BTRFS_COMPRESS_ZLIB)
+ em->flags |= EXTENT_FLAG_COMPRESS_ZLIB;
+ else if (type == BTRFS_COMPRESS_LZO)
+ em->flags |= EXTENT_FLAG_COMPRESS_LZO;
+ else if (type == BTRFS_COMPRESS_ZSTD)
+ em->flags |= EXTENT_FLAG_COMPRESS_ZSTD;
+}
+
+static inline enum btrfs_compression_type extent_map_compression(const struct extent_map *em)
+{
+ if (em->flags & EXTENT_FLAG_COMPRESS_ZLIB)
+ return BTRFS_COMPRESS_ZLIB;
+
+ if (em->flags & EXTENT_FLAG_COMPRESS_LZO)
+ return BTRFS_COMPRESS_LZO;
+
+ if (em->flags & EXTENT_FLAG_COMPRESS_ZSTD)
+ return BTRFS_COMPRESS_ZSTD;
+
+ return BTRFS_COMPRESS_NONE;
+}
+
+/*
+ * More efficient way to determine if extent is compressed, instead of using
+ * 'extent_map_compression() != BTRFS_COMPRESS_NONE'.
+ */
+static inline bool extent_map_is_compressed(const struct extent_map *em)
+{
+ return (em->flags & (EXTENT_FLAG_COMPRESS_ZLIB |
+ EXTENT_FLAG_COMPRESS_LZO |
+ EXTENT_FLAG_COMPRESS_ZSTD)) != 0;
+}
+
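/*
 * Illustrative sketch (not part of the patch above): the set/get round
 * trip of the helpers in self-contained C. The compression type is now
 * packed into three mutually exclusive flag bits instead of a separate
 * compress_type field, which is what lets struct extent_map shrink.
 */
#include <assert.h>
#include <stdint.h>

enum { NONE, ZLIB, LZO, ZSTD };
#define F_ZLIB 0x2u
#define F_LZO  0x4u
#define F_ZSTD 0x8u

static uint32_t set_compression(uint32_t flags, int type)
{
	if (type == ZLIB)
		flags |= F_ZLIB;
	else if (type == LZO)
		flags |= F_LZO;
	else if (type == ZSTD)
		flags |= F_ZSTD;
	return flags;
}

static int get_compression(uint32_t flags)
{
	if (flags & F_ZLIB)
		return ZLIB;
	if (flags & F_LZO)
		return LZO;
	if (flags & F_ZSTD)
		return ZSTD;
	return NONE;
}

int main(void)
{
	uint32_t flags = set_compression(0, ZSTD);

	assert(get_compression(flags) == ZSTD);
	/* The "is compressed" test is a single mask check: */
	assert((flags & (F_ZLIB | F_LZO | F_ZSTD)) != 0);
	return 0;
}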
static inline int extent_map_in_tree(const struct extent_map *em)
{
return !RB_EMPTY_NODE(&em->rb_node);
}
-static inline u64 extent_map_end(struct extent_map *em)
+static inline u64 extent_map_end(const struct extent_map *em)
{
if (em->start + em->len < em->start)
return (u64)-1;
return em->start + em->len;
}
-static inline u64 extent_map_block_end(struct extent_map *em)
-{
- if (em->block_start + em->block_len < em->block_start)
- return (u64)-1;
- return em->block_start + em->block_len;
-}
-
void extent_map_tree_init(struct extent_map_tree *tree);
struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
u64 start, u64 len);
-int add_extent_mapping(struct extent_map_tree *tree,
- struct extent_map *em, int modified);
void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);
int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
u64 new_logical);
@@ -97,7 +124,7 @@ struct extent_map *alloc_extent_map(void);
void free_extent_map(struct extent_map *em);
int __init extent_map_init(void);
void __cold extent_map_exit(void);
-int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen);
+int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen);
void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em);
struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
u64 start, u64 len);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 45cae356e89b..81ac1d474bf1 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -59,7 +59,7 @@ void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_siz
goto out_unlock;
}
- ret = find_contiguous_extent_bit(&inode->file_extent_tree, 0, &start,
+ ret = find_contiguous_extent_bit(inode->file_extent_tree, 0, &start,
&end, EXTENT_DIRTY);
if (!ret && start == 0)
i_size = min(i_size, end + 1);
@@ -94,7 +94,7 @@ int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start,
if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES))
return 0;
- return set_extent_bit(&inode->file_extent_tree, start, start + len - 1,
+ return set_extent_bit(inode->file_extent_tree, start, start + len - 1,
EXTENT_DIRTY, NULL);
}
@@ -123,7 +123,7 @@ int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start,
if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES))
return 0;
- return clear_extent_bit(&inode->file_extent_tree, start,
+ return clear_extent_bit(inode->file_extent_tree, start,
start + len - 1, EXTENT_DIRTY, NULL);
}
@@ -1294,8 +1294,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
return;
}
if (compress_type != BTRFS_COMPRESS_NONE) {
- set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
- em->compress_type = compress_type;
+ extent_map_set_compression(em, compress_type);
em->block_start = bytenr;
em->block_len = em->orig_block_len;
} else {
@@ -1303,7 +1302,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
em->block_start = bytenr;
em->block_len = em->len;
if (type == BTRFS_FILE_EXTENT_PREALLOC)
- set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+ em->flags |= EXTENT_FLAG_PREALLOC;
}
} else if (type == BTRFS_FILE_EXTENT_INLINE) {
em->block_start = EXTENT_MAP_INLINE;
@@ -1315,9 +1314,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
*/
em->orig_start = EXTENT_MAP_HOLE;
em->block_len = (u64)-1;
- em->compress_type = compress_type;
- if (compress_type != BTRFS_COMPRESS_NONE)
- set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+ extent_map_set_compression(em, compress_type);
} else {
btrfs_err(fs_info,
"unknown file extent item type %d, inode %llu, offset %llu, "
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 32611a4edd6b..38dfcac47609 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -111,8 +111,8 @@ static void btrfs_drop_pages(struct btrfs_fs_info *fs_info,
* accessed as prepare_pages should have marked them accessed
* in prepare_pages via find_or_create_page()
*/
- btrfs_page_clamp_clear_checked(fs_info, pages[i], block_start,
- block_len);
+ btrfs_folio_clamp_clear_checked(fs_info, page_folio(pages[i]),
+ block_start, block_len);
unlock_page(pages[i]);
put_page(pages[i]);
}
@@ -168,9 +168,12 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
for (i = 0; i < num_pages; i++) {
struct page *p = pages[i];
- btrfs_page_clamp_set_uptodate(fs_info, p, start_pos, num_bytes);
- btrfs_page_clamp_clear_checked(fs_info, p, start_pos, num_bytes);
- btrfs_page_clamp_set_dirty(fs_info, p, start_pos, num_bytes);
+ btrfs_folio_clamp_set_uptodate(fs_info, page_folio(p),
+ start_pos, num_bytes);
+ btrfs_folio_clamp_clear_checked(fs_info, page_folio(p),
+ start_pos, num_bytes);
+ btrfs_folio_clamp_set_dirty(fs_info, page_folio(p),
+ start_pos, num_bytes);
}
/*
@@ -869,9 +872,9 @@ static int prepare_uptodate_page(struct inode *inode,
* released.
*
* The private flag check is essential for subpage as we need
- * to store extra bitmap using page->private.
+ * to store an extra bitmap using folio private.
*/
- if (page->mapping != inode->i_mapping || !PagePrivate(page)) {
+ if (page->mapping != inode->i_mapping || !folio_test_private(folio)) {
unlock_page(page);
return -EAGAIN;
}
@@ -2150,7 +2153,6 @@ out:
hole_em->block_start = EXTENT_MAP_HOLE;
hole_em->block_len = 0;
hole_em->orig_block_len = 0;
- hole_em->compress_type = BTRFS_COMPRESS_NONE;
hole_em->generation = trans->transid;
ret = btrfs_replace_extent_map_range(inode, hole_em, true);
@@ -2839,7 +2841,7 @@ static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode,
if (em->block_start == EXTENT_MAP_HOLE)
ret = RANGE_BOUNDARY_HOLE;
- else if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ else if (em->flags & EXTENT_FLAG_PREALLOC)
ret = RANGE_BOUNDARY_PREALLOC_EXTENT;
else
ret = RANGE_BOUNDARY_WRITTEN_EXTENT;
@@ -2879,8 +2881,7 @@ static int btrfs_zero_range(struct inode *inode,
* extents and holes, we drop all the existing extents and allocate a
* new prealloc extent, so that we get a larger contiguous disk extent.
*/
- if (em->start <= alloc_start &&
- test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+ if (em->start <= alloc_start && (em->flags & EXTENT_FLAG_PREALLOC)) {
const u64 em_end = em->start + em->len;
if (em_end >= offset + len) {
@@ -2915,7 +2916,7 @@ static int btrfs_zero_range(struct inode *inode,
goto out;
}
- if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+ if (em->flags & EXTENT_FLAG_PREALLOC) {
free_extent_map(em);
ret = btrfs_fallocate_update_isize(inode, offset + len,
mode);
@@ -3136,7 +3137,7 @@ static long btrfs_fallocate(struct file *file, int mode,
last_byte = ALIGN(last_byte, blocksize);
if (em->block_start == EXTENT_MAP_HOLE ||
(cur_offset >= inode->i_size &&
- !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
+ !(em->flags & EXTENT_FLAG_PREALLOC))) {
const u64 range_len = last_byte - cur_offset;
ret = add_falloc_range(&reserve_list, cur_offset, range_len);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 6f93c9a2c3e3..d372c7ce0e6b 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -439,8 +439,8 @@ static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
for (i = 0; i < io_ctl->num_pages; i++) {
if (io_ctl->pages[i]) {
- btrfs_page_clear_checked(io_ctl->fs_info,
- io_ctl->pages[i],
+ btrfs_folio_clear_checked(io_ctl->fs_info,
+ page_folio(io_ctl->pages[i]),
page_offset(io_ctl->pages[i]),
PAGE_SIZE);
unlock_page(io_ctl->pages[i]);
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index 318df6f9d9cb..f8bb73d6ab68 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -188,6 +188,7 @@ enum {
BTRFS_MOUNT_IGNOREBADROOTS = (1UL << 27),
BTRFS_MOUNT_IGNOREDATACSUMS = (1UL << 28),
BTRFS_MOUNT_NODISCARD = (1UL << 29),
+ BTRFS_MOUNT_NOSPACECACHE = (1UL << 30),
};
/*
@@ -398,7 +399,8 @@ struct btrfs_fs_info {
struct extent_io_tree excluded_extents;
/* logical->physical extent mapping */
- struct extent_map_tree mapping_tree;
+ struct rb_root_cached mapping_tree;
+ rwlock_t mapping_tree_lock;
/*
* Block reservation for extent, checksum, root tree and delayed dir
@@ -960,20 +962,6 @@ void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag,
#define btrfs_test_opt(fs_info, opt) ((fs_info)->mount_opt & \
BTRFS_MOUNT_##opt)
-#define btrfs_set_and_info(fs_info, opt, fmt, args...) \
-do { \
- if (!btrfs_test_opt(fs_info, opt)) \
- btrfs_info(fs_info, fmt, ##args); \
- btrfs_set_opt(fs_info->mount_opt, opt); \
-} while (0)
-
-#define btrfs_clear_and_info(fs_info, opt, fmt, args...) \
-do { \
- if (btrfs_test_opt(fs_info, opt)) \
- btrfs_info(fs_info, fmt, ##args); \
- btrfs_clear_opt(fs_info->mount_opt, opt); \
-} while (0)
-
static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
{
/* Do it this way so we only ever do one test_bit in the normal case. */
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index fb3c3f43c3fa..809b11472a80 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -114,6 +114,15 @@ struct data_reloc_warn {
int mirror_num;
};
+/*
+ * For the file_extent_tree, we want to hold the inode lock when we lookup and
+ * update the disk_i_size, but lockdep will complain because for our io_tree we
+ * hold the tree lock and then take the inode lock when setting delalloc. These
+ * two orderings are unrelated, so give the file_extent_tree its own lockdep
+ * class so the two locking patterns don't get mixed up.
+ */
+static struct lock_class_key file_extent_tree_class;
+
static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
@@ -447,8 +456,8 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
* range, then btrfs_mark_ordered_io_finished() will handle
* the ordered extent accounting for the range.
*/
- btrfs_page_clamp_clear_ordered(inode->root->fs_info, page,
- offset, bytes);
+ btrfs_folio_clamp_clear_ordered(inode->root->fs_info,
+ page_folio(page), offset, bytes);
put_page(page);
}
@@ -1037,7 +1046,7 @@ free_pages:
if (pages) {
for (i = 0; i < nr_pages; i++) {
WARN_ON(pages[i]->mapping);
- put_page(pages[i]);
+ btrfs_free_compr_page(pages[i]);
}
kfree(pages);
}
@@ -1052,7 +1061,7 @@ static void free_async_extent_pages(struct async_extent *async_extent)
for (i = 0; i < async_extent->nr_pages; i++) {
WARN_ON(async_extent->pages[i]->mapping);
- put_page(async_extent->pages[i]);
+ btrfs_free_compr_page(async_extent->pages[i]);
}
kfree(async_extent->pages);
async_extent->nr_pages = 0;
@@ -2793,7 +2802,7 @@ out_page:
PAGE_SIZE, !ret);
clear_page_dirty_for_io(page);
}
- btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE);
+ btrfs_folio_clear_checked(fs_info, page_folio(page), page_start, PAGE_SIZE);
unlock_page(page);
put_page(page);
kfree(fixup);
@@ -2848,7 +2857,7 @@ int btrfs_writepage_cow_fixup(struct page *page)
* page->mapping outside of the page lock.
*/
ihold(inode);
- btrfs_page_set_checked(fs_info, page, page_offset(page), PAGE_SIZE);
+ btrfs_folio_set_checked(fs_info, page_folio(page), page_offset(page), PAGE_SIZE);
get_page(page);
btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL);
fixup->page = page;
@@ -3118,7 +3127,7 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
ordered_extent->disk_num_bytes);
}
}
- unpin_extent_cache(&inode->extent_tree, ordered_extent->file_offset,
+ unpin_extent_cache(inode, ordered_extent->file_offset,
ordered_extent->num_bytes, trans->transid);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
@@ -3796,7 +3805,7 @@ cache_index:
* cache.
*
* This is required for both inode re-read from disk and delayed inode
- * in delayed_nodes_tree.
+ * in the delayed_nodes xarray.
*/
if (BTRFS_I(inode)->last_trans == btrfs_get_fs_generation(fs_info))
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
@@ -4725,7 +4734,7 @@ again:
/*
* We unlock the page after the io is completed and then re-lock it
* above. release_folio() could have come in between that and cleared
- * PagePrivate(), but left the page in the mapping. Set the page mapped
+ * folio private, but left the page in the mapping. Set the page mapped
* here to make sure it's properly set for the subpage stuff.
*/
ret = set_page_extent_mapped(page);
@@ -4767,9 +4776,10 @@ again:
memzero_page(page, (block_start - page_offset(page)) + offset,
len);
}
- btrfs_page_clear_checked(fs_info, page, block_start,
- block_end + 1 - block_start);
- btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start);
+ btrfs_folio_clear_checked(fs_info, page_folio(page), block_start,
+ block_end + 1 - block_start);
+ btrfs_folio_set_dirty(fs_info, page_folio(page), block_start,
+ block_end + 1 - block_start);
unlock_extent(io_tree, block_start, block_end, &cached_state);
if (only_release_metadata)
@@ -4889,7 +4899,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
last_byte = ALIGN(last_byte, fs_info->sectorsize);
hole_size = last_byte - cur_offset;
- if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+ if (!(em->flags & EXTENT_FLAG_PREALLOC)) {
struct extent_map *hole_em;
err = maybe_insert_hole(inode, cur_offset, hole_size);
@@ -4917,7 +4927,6 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
hole_em->block_len = 0;
hole_em->orig_block_len = 0;
hole_em->ram_bytes = hole_size;
- hole_em->compress_type = BTRFS_COMPRESS_NONE;
hole_em->generation = btrfs_get_fs_generation(fs_info);
err = btrfs_replace_extent_map_range(inode, hole_em, true);
@@ -6217,6 +6226,13 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
inode->i_generation = BTRFS_I(inode)->generation;
/*
+ * We don't have any capability xattrs set here yet, shortcut any
+ * queries for the xattrs here. If we add them later via the inode
+ * security init path or any other path this flag will be cleared.
+ */
+ set_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags);
+
+ /*
* Subvolumes don't inherit flags from their parent directory.
* Originally this was probably by accident, but we probably can't
* change it now without compatibility issues.
@@ -7258,13 +7274,11 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
em->orig_block_len = orig_block_len;
em->ram_bytes = ram_bytes;
em->generation = -1;
- set_bit(EXTENT_FLAG_PINNED, &em->flags);
- if (type == BTRFS_ORDERED_PREALLOC) {
- set_bit(EXTENT_FLAG_FILLING, &em->flags);
- } else if (type == BTRFS_ORDERED_COMPRESSED) {
- set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
- em->compress_type = compress_type;
- }
+ em->flags |= EXTENT_FLAG_PINNED;
+ if (type == BTRFS_ORDERED_PREALLOC)
+ em->flags |= EXTENT_FLAG_FILLING;
+ else if (type == BTRFS_ORDERED_COMPRESSED)
+ extent_map_set_compression(em, compress_type);
ret = btrfs_replace_extent_map_range(inode, em, true);
if (ret) {
@@ -7304,10 +7318,10 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
* just use the extent.
*
*/
- if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
+ if ((em->flags & EXTENT_FLAG_PREALLOC) ||
((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
em->block_start != EXTENT_MAP_HOLE)) {
- if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ if (em->flags & EXTENT_FLAG_PREALLOC)
type = BTRFS_ORDERED_PREALLOC;
else
type = BTRFS_ORDERED_NOCOW;
@@ -7542,7 +7556,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
* to buffered IO. Don't blame me, this is the price we pay for using
* the generic code.
*/
- if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
+ if (extent_map_is_compressed(em) ||
em->block_start == EXTENT_MAP_INLINE) {
free_extent_map(em);
/*
@@ -7638,7 +7652,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
* that, since we have locked only the parts we are performing I/O in.
*/
if ((em->block_start == EXTENT_MAP_HOLE) ||
- (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) {
+ ((em->flags & EXTENT_FLAG_PREALLOC) && !write)) {
iomap->addr = IOMAP_NULL_ADDR;
iomap->type = IOMAP_HOLE;
} else {
@@ -7851,13 +7865,14 @@ static void btrfs_readahead(struct readahead_control *rac)
static void wait_subpage_spinlock(struct page *page)
{
struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+ struct folio *folio = page_folio(page);
struct btrfs_subpage *subpage;
- if (!btrfs_is_subpage(fs_info, page))
+ if (!btrfs_is_subpage(fs_info, page->mapping))
return;
- ASSERT(PagePrivate(page) && page->private);
- subpage = (struct btrfs_subpage *)page->private;
+ ASSERT(folio_test_private(folio) && folio_get_private(folio));
+ subpage = folio_get_private(folio);
/*
* This may look insane as we just acquire the spinlock and release it,
@@ -7995,7 +8010,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
page_end);
ASSERT(range_end + 1 - cur < U32_MAX);
range_len = range_end + 1 - cur;
- if (!btrfs_page_test_ordered(fs_info, &folio->page, cur, range_len)) {
+ if (!btrfs_folio_test_ordered(fs_info, folio, cur, range_len)) {
/*
* If Ordered (Private2) is cleared, it means endio has
* already been executed for the range.
@@ -8004,7 +8019,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
*/
goto next;
}
- btrfs_page_clear_ordered(fs_info, &folio->page, cur, range_len);
+ btrfs_folio_clear_ordered(fs_info, folio, cur, range_len);
/*
* IO on this page will never be started, so we need to account
@@ -8074,7 +8089,7 @@ next:
* did something wrong.
*/
ASSERT(!folio_test_ordered(folio));
- btrfs_page_clear_checked(fs_info, &folio->page, folio_pos(folio), folio_size(folio));
+ btrfs_folio_clear_checked(fs_info, folio, folio_pos(folio), folio_size(folio));
if (!inode_evicting)
__btrfs_release_folio(folio, GFP_NOFS);
clear_page_extent_mapped(&folio->page);
@@ -8098,6 +8113,7 @@ next:
vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
{
struct page *page = vmf->page;
+ struct folio *folio = page_folio(page);
struct inode *inode = file_inode(vmf->vma->vm_file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
@@ -8114,6 +8130,8 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
u64 page_end;
u64 end;
+ ASSERT(folio_order(folio) == 0);
+
reserved_space = PAGE_SIZE;
sb_start_pagefault(inode->i_sb);
@@ -8217,9 +8235,9 @@ again:
if (zero_start != PAGE_SIZE)
memzero_page(page, zero_start, PAGE_SIZE - zero_start);
- btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE);
- btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start);
- btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start);
+ btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE);
+ btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start);
+ btrfs_folio_set_uptodate(fs_info, folio, page_start, end + 1 - page_start);
btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
@@ -8462,10 +8480,20 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
struct btrfs_inode *ei;
struct inode *inode;
+ struct extent_io_tree *file_extent_tree = NULL;
+
+ /* Self tests may pass a NULL fs_info. */
+ if (fs_info && !btrfs_fs_incompat(fs_info, NO_HOLES)) {
+ file_extent_tree = kmalloc(sizeof(struct extent_io_tree), GFP_KERNEL);
+ if (!file_extent_tree)
+ return NULL;
+ }
ei = alloc_inode_sb(sb, btrfs_inode_cachep, GFP_KERNEL);
- if (!ei)
+ if (!ei) {
+ kfree(file_extent_tree);
return NULL;
+ }
ei->root = NULL;
ei->generation = 0;
@@ -8501,10 +8529,18 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
inode = &ei->vfs_inode;
extent_map_tree_init(&ei->extent_tree);
+
+ /* This io tree sets the valid inode. */
extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO);
ei->io_tree.inode = ei;
- extent_io_tree_init(fs_info, &ei->file_extent_tree,
- IO_TREE_INODE_FILE_EXTENT);
+
+ ei->file_extent_tree = file_extent_tree;
+ if (file_extent_tree) {
+ extent_io_tree_init(fs_info, ei->file_extent_tree,
+ IO_TREE_INODE_FILE_EXTENT);
+ /* Lockdep class is set only for the file extent tree. */
+ lockdep_set_class(&ei->file_extent_tree->lock, &file_extent_tree_class);
+ }
mutex_init(&ei->log_mutex);
spin_lock_init(&ei->ordered_tree_lock);
ei->ordered_tree = RB_ROOT;
@@ -8521,12 +8557,14 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
void btrfs_test_destroy_inode(struct inode *inode)
{
btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
+ kfree(BTRFS_I(inode)->file_extent_tree);
kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
}
#endif
void btrfs_free_inode(struct inode *inode)
{
+ kfree(BTRFS_I(inode)->file_extent_tree);
kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
}
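/*
 * Illustrative sketch (not part of the patch above): the allocation
 * ordering used in btrfs_alloc_inode(), modelled in self-contained C. The
 * optional auxiliary tree is grabbed first so a failed main allocation
 * unwinds with a single free(), which also tolerates NULL when the tree
 * was never needed.
 */
#include <stdlib.h>

struct tree { int dummy; };
struct inode_like { struct tree *file_extent_tree; };	/* NULL if unused */

static struct inode_like *alloc_inode_like(int want_tree)
{
	struct tree *tree = NULL;
	struct inode_like *ei;

	if (want_tree) {
		tree = malloc(sizeof(*tree));
		if (!tree)
			return NULL;
	}
	ei = malloc(sizeof(*ei));
	if (!ei) {
		free(tree);		/* free(NULL) is a no-op */
		return NULL;
	}
	ei->file_extent_tree = tree;
	return ei;
}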
@@ -9632,7 +9670,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
em->block_len = ins.offset;
em->orig_block_len = ins.offset;
em->ram_bytes = ins.offset;
- set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+ em->flags |= EXTENT_FLAG_PREALLOC;
em->generation = trans->transid;
ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, true);
@@ -9785,7 +9823,9 @@ void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end)
page = find_get_page(inode->vfs_inode.i_mapping, index);
ASSERT(page); /* Pages should be in the extent_io_tree */
- btrfs_page_set_writeback(fs_info, page, start, len);
+ /* This is for data, which doesn't yet support larger folios. */
+ ASSERT(folio_order(page_folio(page)) == 0);
+ btrfs_folio_set_writeback(fs_info, page_folio(page), start, len);
put_page(page);
index++;
}
@@ -9994,7 +10034,7 @@ static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
if (!pages)
return -ENOMEM;
- ret = btrfs_alloc_page_array(nr_pages, pages);
+ ret = btrfs_alloc_page_array(nr_pages, pages, 0);
if (ret) {
ret = -ENOMEM;
goto out;
@@ -10113,12 +10153,12 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
encoded->len = min_t(u64, extent_map_end(em),
inode->vfs_inode.i_size) - iocb->ki_pos;
if (em->block_start == EXTENT_MAP_HOLE ||
- test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+ (em->flags & EXTENT_FLAG_PREALLOC)) {
disk_bytenr = EXTENT_MAP_HOLE;
count = min_t(u64, count, encoded->len);
encoded->len = count;
encoded->unencoded_len = count;
- } else if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+ } else if (extent_map_is_compressed(em)) {
disk_bytenr = em->block_start;
/*
* Bail if the buffer isn't large enough to return the whole
@@ -10133,7 +10173,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
encoded->unencoded_len = em->ram_bytes;
encoded->unencoded_offset = iocb->ki_pos - em->orig_start;
ret = btrfs_encoded_io_compression_from_extent(fs_info,
- em->compress_type);
+ extent_map_compression(em));
if (ret < 0)
goto out_em;
encoded->compression = ret;
@@ -10564,6 +10604,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct extent_state *cached_state = NULL;
struct extent_map *em = NULL;
+ struct btrfs_chunk_map *map = NULL;
struct btrfs_device *device = NULL;
struct btrfs_swap_info bsi = {
.lowest_ppage = (sector_t)-1ULL,
@@ -10680,7 +10721,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
ret = -EINVAL;
goto out;
}
- if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+ if (extent_map_is_compressed(em)) {
btrfs_warn(fs_info, "swapfile must not be compressed");
ret = -EINVAL;
goto out;
@@ -10703,13 +10744,13 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
goto out;
}
- em = btrfs_get_chunk_map(fs_info, logical_block_start, len);
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
+ map = btrfs_get_chunk_map(fs_info, logical_block_start, len);
+ if (IS_ERR(map)) {
+ ret = PTR_ERR(map);
goto out;
}
- if (em->map_lookup->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+ if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
btrfs_warn(fs_info,
"swapfile must have single data profile");
ret = -EINVAL;
@@ -10717,23 +10758,23 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
}
if (device == NULL) {
- device = em->map_lookup->stripes[0].dev;
+ device = map->stripes[0].dev;
ret = btrfs_add_swapfile_pin(inode, device, false);
if (ret == 1)
ret = 0;
else if (ret)
goto out;
- } else if (device != em->map_lookup->stripes[0].dev) {
+ } else if (device != map->stripes[0].dev) {
btrfs_warn(fs_info, "swapfile must be on one device");
ret = -EINVAL;
goto out;
}
- physical_block_start = (em->map_lookup->stripes[0].physical +
- (logical_block_start - em->start));
- len = min(len, em->len - (logical_block_start - em->start));
- free_extent_map(em);
- em = NULL;
+ physical_block_start = (map->stripes[0].physical +
+ (logical_block_start - map->start));
+ len = min(len, map->chunk_len - (logical_block_start - map->start));
+ btrfs_free_chunk_map(map);
+ map = NULL;
bg = btrfs_lookup_block_group(fs_info, logical_block_start);
if (!bg) {
@@ -10786,6 +10827,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
out:
if (!IS_ERR_OR_NULL(em))
free_extent_map(em);
+ if (!IS_ERR_OR_NULL(map))
+ btrfs_free_chunk_map(map);
unlock_extent(io_tree, 0, isize - 1, &cached_state);
@@ -10930,7 +10973,7 @@ static const struct address_space_operations btrfs_aops = {
.release_folio = btrfs_release_folio,
.migrate_folio = btrfs_migrate_folio,
.dirty_folio = filemap_dirty_folio,
- .error_remove_page = generic_error_remove_page,
+ .error_remove_folio = generic_error_remove_folio,
.swap_activate = btrfs_swap_activate,
.swap_deactivate = btrfs_swap_deactivate,
};
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index a1743904202b..41b479861b3c 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -4533,29 +4533,29 @@ static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool
if (ret < 0)
goto out_acct;
- file_start_write(file);
-
if (iov_iter_count(&iter) == 0) {
ret = 0;
- goto out_end_write;
+ goto out_iov;
}
pos = args.offset;
ret = rw_verify_area(WRITE, file, &pos, args.len);
if (ret < 0)
- goto out_end_write;
+ goto out_iov;
init_sync_kiocb(&kiocb, file);
ret = kiocb_set_rw_flags(&kiocb, 0);
if (ret)
- goto out_end_write;
+ goto out_iov;
kiocb.ki_pos = pos;
+ file_start_write(file);
+
ret = btrfs_do_write_iter(&kiocb, &iter, &args);
if (ret > 0)
fsnotify_modify(file);
-out_end_write:
file_end_write(file);
+out_iov:
kfree(iov);
out_acct:
if (ret > 0)
diff --git a/fs/btrfs/lru_cache.c b/fs/btrfs/lru_cache.c
index 0fe0ae54ac67..fd88af17d8d9 100644
--- a/fs/btrfs/lru_cache.c
+++ b/fs/btrfs/lru_cache.c
@@ -9,7 +9,7 @@
*
* @cache: The cache.
* @max_size: Maximum size (number of entries) for the cache.
- * Use 0 for unlimited size, it's the user's responsability to
+ * Use 0 for unlimited size, it's the user's responsibility to
* trim the cache in that case.
*/
void btrfs_lru_cache_init(struct btrfs_lru_cache *cache, unsigned int max_size)
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index d3fcfc628a4f..1131d5a29d61 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -152,7 +152,7 @@ static int copy_compressed_data_to_page(char *compressed_data,
cur_page = out_pages[*cur_out / PAGE_SIZE];
/* Allocate a new page */
if (!cur_page) {
- cur_page = alloc_page(GFP_NOFS);
+ cur_page = btrfs_alloc_compr_page();
if (!cur_page)
return -ENOMEM;
out_pages[*cur_out / PAGE_SIZE] = cur_page;
@@ -178,7 +178,7 @@ static int copy_compressed_data_to_page(char *compressed_data,
cur_page = out_pages[*cur_out / PAGE_SIZE];
/* Allocate a new page */
if (!cur_page) {
- cur_page = alloc_page(GFP_NOFS);
+ cur_page = btrfs_alloc_compr_page();
if (!cur_page)
return -ENOMEM;
out_pages[*cur_out / PAGE_SIZE] = cur_page;
diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c
index b8f9c9e56c8c..cdada4865837 100644
--- a/fs/btrfs/messages.c
+++ b/fs/btrfs/messages.c
@@ -287,7 +287,7 @@ void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info)
* panic or BUGs, depending on mount options.
*/
__cold
-void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
+void __btrfs_panic(const struct btrfs_fs_info *fs_info, const char *function,
unsigned int line, int error, const char *fmt, ...)
{
char *s_id = "<unknown>";
diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h
index 4d04c1fa5899..08a9272399d2 100644
--- a/fs/btrfs/messages.h
+++ b/fs/btrfs/messages.h
@@ -194,7 +194,7 @@ const char * __attribute_const__ btrfs_decode_error(int error);
__printf(5, 6)
__cold
-void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
+void __btrfs_panic(const struct btrfs_fs_info *fs_info, const char *function,
unsigned int line, int error, const char *fmt, ...);
/*
* If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index a82e1417c4d2..59850dc17b22 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -323,9 +323,10 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
*
* If there's no such bit, we need to skip to next range.
*/
- if (!btrfs_page_test_ordered(fs_info, page, file_offset, len))
+ if (!btrfs_folio_test_ordered(fs_info, page_folio(page),
+ file_offset, len))
return false;
- btrfs_page_clear_ordered(fs_info, page, file_offset, len);
+ btrfs_folio_clear_ordered(fs_info, page_folio(page), file_offset, len);
}
/* Now we're fine to update the accounting. */
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 567a6d3d4712..127ef8bf0ffd 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -97,13 +97,6 @@ struct btrfs_ordered_extent {
u64 bytes_left;
/*
- * the end of the ordered extent which is behind it but
- * didn't update disk_i_size. Please see the comment of
- * btrfs_ordered_update_i_size();
- */
- u64 outstanding_isize;
-
- /*
* If we get truncated we need to adjust the file extent we enter for
* this ordered extent so that we do not expose stale data.
*/
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index e46774e8f49f..63b426cc7798 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -194,7 +194,7 @@ static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info,
*
* Must be called with qgroup_lock held and @prealloc preallocated.
*
- * The control on the lifespan of @prealloc would be transfered to this
+ * The control on the lifespan of @prealloc would be transferred to this
* function, thus caller should no longer touch @prealloc.
*/
static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 3e014b9370a3..792c8e17c31d 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -964,7 +964,7 @@ static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
{
int ret;
- ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages);
+ ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages, 0);
if (ret < 0)
return ret;
/* Mapping all sectors */
@@ -979,7 +979,7 @@ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
int ret;
ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages,
- rbio->stripe_pages + data_pages);
+ rbio->stripe_pages + data_pages, 0);
if (ret < 0)
return ret;
@@ -1530,7 +1530,7 @@ static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio)
const int data_pages = rbio->nr_data * rbio->stripe_npages;
int ret;
- ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages);
+ ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages, 0);
if (ret < 0)
return ret;
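
Every btrfs_alloc_page_array() caller in this file gains a third argument of 0. The new parameter is presumably an extra-GFP-flags mask, with 0 meaning no additional flags (unchanged behavior), so that select callers elsewhere can OR in something like __GFP_NOFAIL. Under that assumption the prototype would read roughly:

	/* Hedged reconstruction; the real prototype lives in extent_io.h. */
	int btrfs_alloc_page_array(unsigned int nr_pages,
				   struct page **page_array, gfp_t extra_gfp);

	/* RAID56 stripe pages need no extra flags: */
	ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages, 0);
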
@@ -1549,7 +1549,6 @@ struct btrfs_plug_cb {
struct blk_plug_cb cb;
struct btrfs_fs_info *info;
struct list_head rbio_list;
- struct work_struct work;
};
/*
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index 45e6ff78316f..470213688872 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -164,7 +164,7 @@ struct raid56_bio_trace_info {
u8 stripe_nr;
};
-static inline int nr_data_stripes(const struct map_lookup *map)
+static inline int nr_data_stripes(const struct btrfs_chunk_map *map)
{
return map->num_stripes - btrfs_nr_parity_stripes(map->type);
}
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index f88b0c2ac3fe..ae90894dc7dc 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -141,9 +141,9 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
if (datal < block_size)
memzero_page(page, datal, block_size - datal);
- btrfs_page_set_uptodate(fs_info, page, file_offset, block_size);
- btrfs_page_clear_checked(fs_info, page, file_offset, block_size);
- btrfs_page_set_dirty(fs_info, page, file_offset, block_size);
+ btrfs_folio_set_uptodate(fs_info, page_folio(page), file_offset, block_size);
+ btrfs_folio_clear_checked(fs_info, page_folio(page), file_offset, block_size);
+ btrfs_folio_set_dirty(fs_info, page_folio(page), file_offset, block_size);
out_unlock:
if (page) {
unlock_page(page);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index f5d9e5f74a52..abe594f77f99 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2895,7 +2895,7 @@ static noinline_for_stack int prealloc_file_extent_cluster(
* will re-read the whole page anyway.
*/
if (page) {
- btrfs_subpage_clear_uptodate(fs_info, page, i_size,
+ btrfs_subpage_clear_uptodate(fs_info, page_folio(page), i_size,
round_up(i_size, PAGE_SIZE) - i_size);
unlock_page(page);
put_page(page);
@@ -2951,7 +2951,7 @@ static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inod
em->len = end + 1 - start;
em->block_len = em->len;
em->block_start = block_start;
- set_bit(EXTENT_FLAG_PINNED, &em->flags);
+ em->flags |= EXTENT_FLAG_PINNED;
lock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state);
ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, false);
@@ -3070,7 +3070,8 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
clamped_len);
goto release_page;
}
- btrfs_page_set_dirty(fs_info, page, clamped_start, clamped_len);
+ btrfs_folio_set_dirty(fs_info, page_folio(page),
+ clamped_start, clamped_len);
/*
* Set the boundary if it's inside the page.
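
The set_bit() to plain-OR switch implies extent_map::flags is no longer touched with atomic bitops, and that EXTENT_FLAG_PINNED is now a bit mask rather than a bit number. That is safe here because the map stays private to this thread until btrfs_replace_extent_map_range() publishes it under the extent lock. In general form:

	/* Old: flags is an unsigned long, constants are bit numbers. */
	set_bit(EXTENT_FLAG_PINNED, &em->flags);

	/* New: constants are masks, a plain read-modify-write suffices. */
	em->flags |= EXTENT_FLAG_PINNED;
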
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index f62a408671cb..a01807cbd4d4 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -43,7 +43,7 @@ struct scrub_ctx;
/*
* The following value only influences the performance.
*
- * This detemines how many stripes would be submitted in one go,
+ * This determines how many stripes would be submitted in one go,
* which is 512KiB (BTRFS_STRIPE_LEN * SCRUB_STRIPES_PER_GROUP).
*/
#define SCRUB_STRIPES_PER_GROUP 8
@@ -192,7 +192,6 @@ struct scrub_ctx {
int cur_stripe;
atomic_t cancel_req;
int readonly;
- int sectors_per_bio;
/* State of IO submission throttling affecting the associated device */
ktime_t throttle_deadline;
@@ -262,7 +261,7 @@ static int init_scrub_stripe(struct btrfs_fs_info *fs_info,
atomic_set(&stripe->pending_io, 0);
spin_lock_init(&stripe->write_error_lock);
- ret = btrfs_alloc_page_array(SCRUB_STRIPE_PAGES, stripe->pages);
+ ret = btrfs_alloc_page_array(SCRUB_STRIPE_PAGES, stripe->pages, 0);
if (ret < 0)
goto error;
@@ -710,7 +709,7 @@ static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr)
/* Metadata, verify the full tree block. */
if (sector->is_metadata) {
/*
- * Check if the tree block crosses the stripe boudary. If
+ * Check if the tree block crosses the stripe boundary. If
* crossed the boundary, we cannot verify it but only give a
* warning.
*
@@ -884,7 +883,7 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx,
/*
* Init needed infos for error reporting.
*
- * Although our scrub_stripe infrastucture is mostly based on btrfs_submit_bio()
+ * Although our scrub_stripe infrastructure is mostly based on btrfs_submit_bio()
* thus no need for dev/physical, error reporting still needs dev and physical.
*/
if (!bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors)) {
@@ -1280,7 +1279,7 @@ static void scrub_throttle_dev_io(struct scrub_ctx *sctx, struct btrfs_device *d
* return 0 if it is a data stripe, 1 means parity stripe.
*/
static int get_raid56_logic_offset(u64 physical, int num,
- struct map_lookup *map, u64 *offset,
+ struct btrfs_chunk_map *map, u64 *offset,
u64 *stripe_start)
{
int i;
@@ -1409,14 +1408,11 @@ search_forward:
if (ret > 0)
break;
next:
- path->slots[0]++;
- if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
- ret = btrfs_next_leaf(extent_root, path);
- if (ret) {
- /* Either no more item or fatal error */
- btrfs_release_path(path);
- return ret;
- }
+ ret = btrfs_next_item(extent_root, path);
+ if (ret) {
+ /* Either no more items or a fatal error. */
+ btrfs_release_path(path);
+ return ret;
}
}
btrfs_release_path(path);
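
The open-coded slot walk collapses into btrfs_next_item(), which advances the slot and transparently hops to the next leaf when the current one is exhausted. A behaviorally equivalent sketch of what the removed block did, and what the helper is expected to do:

	static int next_item_equivalent(struct btrfs_root *root,
					struct btrfs_path *path)
	{
		path->slots[0]++;
		if (path->slots[0] >= btrfs_header_nritems(path->nodes[0]))
			/* Returns >0 when there are no more leaves. */
			return btrfs_next_leaf(root, path);
		return 0;
	}
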
@@ -1816,7 +1812,7 @@ static int flush_scrub_stripes(struct scrub_ctx *sctx)
if (sctx->is_dev_replace) {
/*
* For dev-replace, if we know there is something wrong with
- * metadata, we should immedately abort.
+ * metadata, we should immediately abort.
*/
for (int i = 0; i < nr_stripes; i++) {
if (stripe_has_metadata_error(&sctx->stripes[i])) {
@@ -1898,7 +1894,7 @@ static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *
static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
struct btrfs_device *scrub_dev,
struct btrfs_block_group *bg,
- struct map_lookup *map,
+ struct btrfs_chunk_map *map,
u64 full_stripe_start)
{
DECLARE_COMPLETION_ONSTACK(io_done);
@@ -2067,7 +2063,7 @@ out:
*/
static int scrub_simple_mirror(struct scrub_ctx *sctx,
struct btrfs_block_group *bg,
- struct map_lookup *map,
+ struct btrfs_chunk_map *map,
u64 logical_start, u64 logical_length,
struct btrfs_device *device,
u64 physical, int mirror_num)
@@ -2128,7 +2124,7 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx,
}
/* Calculate the full stripe length for simple stripe based profiles */
-static u64 simple_stripe_full_stripe_len(const struct map_lookup *map)
+static u64 simple_stripe_full_stripe_len(const struct btrfs_chunk_map *map)
{
ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
BTRFS_BLOCK_GROUP_RAID10));
@@ -2137,7 +2133,7 @@ static u64 simple_stripe_full_stripe_len(const struct map_lookup *map)
}
/* Get the logical bytenr for the stripe */
-static u64 simple_stripe_get_logical(struct map_lookup *map,
+static u64 simple_stripe_get_logical(struct btrfs_chunk_map *map,
struct btrfs_block_group *bg,
int stripe_index)
{
@@ -2154,7 +2150,7 @@ static u64 simple_stripe_get_logical(struct map_lookup *map,
}
/* Get the mirror number for the stripe */
-static int simple_stripe_mirror_num(struct map_lookup *map, int stripe_index)
+static int simple_stripe_mirror_num(struct btrfs_chunk_map *map, int stripe_index)
{
ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
BTRFS_BLOCK_GROUP_RAID10));
@@ -2166,7 +2162,7 @@ static int simple_stripe_mirror_num(struct map_lookup *map, int stripe_index)
static int scrub_simple_stripe(struct scrub_ctx *sctx,
struct btrfs_block_group *bg,
- struct map_lookup *map,
+ struct btrfs_chunk_map *map,
struct btrfs_device *device,
int stripe_index)
{
@@ -2199,18 +2195,17 @@ static int scrub_simple_stripe(struct scrub_ctx *sctx,
static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
struct btrfs_block_group *bg,
- struct extent_map *em,
+ struct btrfs_chunk_map *map,
struct btrfs_device *scrub_dev,
int stripe_index)
{
struct btrfs_fs_info *fs_info = sctx->fs_info;
- struct map_lookup *map = em->map_lookup;
const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
const u64 chunk_logical = bg->start;
int ret;
int ret2;
u64 physical = map->stripes[stripe_index].physical;
- const u64 dev_stripe_len = btrfs_calc_stripe_length(em);
+ const u64 dev_stripe_len = btrfs_calc_stripe_length(map);
const u64 physical_end = physical + dev_stripe_len;
u64 logical;
u64 logic_end;
@@ -2373,17 +2368,12 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
u64 dev_extent_len)
{
struct btrfs_fs_info *fs_info = sctx->fs_info;
- struct extent_map_tree *map_tree = &fs_info->mapping_tree;
- struct map_lookup *map;
- struct extent_map *em;
+ struct btrfs_chunk_map *map;
int i;
int ret = 0;
- read_lock(&map_tree->lock);
- em = lookup_extent_mapping(map_tree, bg->start, bg->length);
- read_unlock(&map_tree->lock);
-
- if (!em) {
+ map = btrfs_find_chunk_map(fs_info, bg->start, bg->length);
+ if (!map) {
/*
* Might have been an unused block group deleted by the cleaner
* kthread or relocation.
@@ -2395,22 +2385,21 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
return ret;
}
- if (em->start != bg->start)
+ if (map->start != bg->start)
goto out;
- if (em->len < dev_extent_len)
+ if (map->chunk_len < dev_extent_len)
goto out;
- map = em->map_lookup;
for (i = 0; i < map->num_stripes; ++i) {
if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
map->stripes[i].physical == dev_offset) {
- ret = scrub_stripe(sctx, bg, em, scrub_dev, i);
+ ret = scrub_stripe(sctx, bg, map, scrub_dev, i);
if (ret)
goto out;
}
}
out:
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ret;
}
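
scrub_chunk() shows the chunk-mapping rework in miniature: a single btrfs_find_chunk_map() call replaces the read-locked lookup_extent_mapping() dance plus the em->map_lookup dereference, and btrfs_free_chunk_map() takes over from free_extent_map() for dropping the reference. The resulting usage pattern (the callee name is illustrative):

	struct btrfs_chunk_map *map;

	map = btrfs_find_chunk_map(fs_info, bg->start, bg->length);
	if (!map)
		return 0;	/* chunk gone, e.g. deleted by the cleaner */

	/* map->start and map->chunk_len replace em->start and em->len. */
	for (int i = 0; i < map->num_stripes; i++)
		process_stripe(&map->stripes[i]);	/* illustrative */

	btrfs_free_chunk_map(map);
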
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index 1b999c6e4193..93511d54abf8 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -64,7 +64,7 @@
* This means a slightly higher tree locking latency.
*/
-bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct page *page)
+bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space *mapping)
{
if (fs_info->sectorsize >= PAGE_SIZE)
return false;
@@ -74,8 +74,7 @@ bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct page *page)
* mapping. And if page->mapping->host is data inode, it's subpage.
* As we have ruled our sectorsize >= PAGE_SIZE case already.
*/
- if (!page->mapping || !page->mapping->host ||
- is_data_inode(page->mapping->host))
+ if (!mapping || !mapping->host || is_data_inode(mapping->host))
return true;
/*
@@ -116,7 +115,7 @@ void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sector
}
int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
- struct page *page, enum btrfs_subpage_type type)
+ struct folio *folio, enum btrfs_subpage_type type)
{
struct btrfs_subpage *subpage;
@@ -124,31 +123,30 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
* We have cases like a dummy extent buffer page, which is not mapped
* and doesn't need to be locked.
*/
- if (page->mapping)
- ASSERT(PageLocked(page));
+ if (folio->mapping)
+ ASSERT(folio_test_locked(folio));
- /* Either not subpage, or the page already has private attached */
- if (!btrfs_is_subpage(fs_info, page) || PagePrivate(page))
+ /* Either not subpage, or the folio already has private attached. */
+ if (!btrfs_is_subpage(fs_info, folio->mapping) || folio_test_private(folio))
return 0;
subpage = btrfs_alloc_subpage(fs_info, type);
if (IS_ERR(subpage))
return PTR_ERR(subpage);
- attach_page_private(page, subpage);
+ folio_attach_private(folio, subpage);
return 0;
}
-void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info,
- struct page *page)
+void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio)
{
struct btrfs_subpage *subpage;
- /* Either not subpage, or already detached */
- if (!btrfs_is_subpage(fs_info, page) || !PagePrivate(page))
+ /* Either not subpage, or the folio's private is already detached. */
+ if (!btrfs_is_subpage(fs_info, folio->mapping) || !folio_test_private(folio))
return;
- subpage = detach_page_private(page);
+ subpage = folio_detach_private(folio);
ASSERT(subpage);
btrfs_free_subpage(subpage);
}
@@ -188,77 +186,78 @@ void btrfs_free_subpage(struct btrfs_subpage *subpage)
* This is important for eb allocation, to prevent race with last eb freeing
* of the same page.
* With the eb_refs increased before the eb inserted into radix tree,
- * detach_extent_buffer_page() won't detach the page private while we're still
+ * detach_extent_buffer_page() won't detach the folio private while we're still
* allocating the extent buffer.
*/
-void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info,
- struct page *page)
+void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio)
{
struct btrfs_subpage *subpage;
- if (!btrfs_is_subpage(fs_info, page))
+ if (!btrfs_is_subpage(fs_info, folio->mapping))
return;
- ASSERT(PagePrivate(page) && page->mapping);
- lockdep_assert_held(&page->mapping->private_lock);
+ ASSERT(folio_test_private(folio) && folio->mapping);
+ lockdep_assert_held(&folio->mapping->i_private_lock);
- subpage = (struct btrfs_subpage *)page->private;
+ subpage = folio_get_private(folio);
atomic_inc(&subpage->eb_refs);
}
-void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info,
- struct page *page)
+void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio)
{
struct btrfs_subpage *subpage;
- if (!btrfs_is_subpage(fs_info, page))
+ if (!btrfs_is_subpage(fs_info, folio->mapping))
return;
- ASSERT(PagePrivate(page) && page->mapping);
- lockdep_assert_held(&page->mapping->private_lock);
+ ASSERT(folio_test_private(folio) && folio->mapping);
+ lockdep_assert_held(&folio->mapping->i_private_lock);
- subpage = (struct btrfs_subpage *)page->private;
+ subpage = folio_get_private(folio);
ASSERT(atomic_read(&subpage->eb_refs));
atomic_dec(&subpage->eb_refs);
}
static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
+ /* For subpage support, the folio must be a single page. */
+ ASSERT(folio_order(folio) == 0);
+
/* Basic checks */
- ASSERT(PagePrivate(page) && page->private);
+ ASSERT(folio_test_private(folio) && folio_get_private(folio));
ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
IS_ALIGNED(len, fs_info->sectorsize));
/*
* The range check only works for mapped page, we can still have
* unmapped page like dummy extent buffer pages.
*/
- if (page->mapping)
- ASSERT(page_offset(page) <= start &&
- start + len <= page_offset(page) + PAGE_SIZE);
+ if (folio->mapping)
+ ASSERT(folio_pos(folio) <= start &&
+ start + len <= folio_pos(folio) + PAGE_SIZE);
}
void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ struct btrfs_subpage *subpage = folio_get_private(folio);
const int nbits = len >> fs_info->sectorsize_bits;
- btrfs_subpage_assert(fs_info, page, start, len);
+ btrfs_subpage_assert(fs_info, folio, start, len);
atomic_add(nbits, &subpage->readers);
}
void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ struct btrfs_subpage *subpage = folio_get_private(folio);
const int nbits = len >> fs_info->sectorsize_bits;
bool is_data;
bool last;
- btrfs_subpage_assert(fs_info, page, start, len);
- is_data = is_data_inode(page->mapping->host);
+ btrfs_subpage_assert(fs_info, folio, start, len);
+ is_data = is_data_inode(folio->mapping->host);
ASSERT(atomic_read(&subpage->readers) >= nbits);
last = atomic_sub_and_test(nbits, &subpage->readers);
@@ -270,35 +269,35 @@ void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
* As we want the atomic_sub_and_test() to be always executed.
*/
if (is_data && last)
- unlock_page(page);
+ folio_unlock(folio);
}
-static void btrfs_subpage_clamp_range(struct page *page, u64 *start, u32 *len)
+static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len)
{
u64 orig_start = *start;
u32 orig_len = *len;
- *start = max_t(u64, page_offset(page), orig_start);
+ *start = max_t(u64, folio_pos(folio), orig_start);
/*
* For certain call sites like btrfs_drop_pages(), we may have pages
* beyond the target range. In that case, just set @len to 0, subpage
* helpers can handle @len == 0 without any problem.
*/
- if (page_offset(page) >= orig_start + orig_len)
+ if (folio_pos(folio) >= orig_start + orig_len)
*len = 0;
else
- *len = min_t(u64, page_offset(page) + PAGE_SIZE,
+ *len = min_t(u64, folio_pos(folio) + PAGE_SIZE,
orig_start + orig_len) - *start;
}
void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ struct btrfs_subpage *subpage = folio_get_private(folio);
const int nbits = (len >> fs_info->sectorsize_bits);
int ret;
- btrfs_subpage_assert(fs_info, page, start, len);
+ btrfs_subpage_assert(fs_info, folio, start, len);
ASSERT(atomic_read(&subpage->readers) == 0);
ret = atomic_add_return(nbits, &subpage->writers);
@@ -306,12 +305,12 @@ void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info,
}
bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ struct btrfs_subpage *subpage = folio_get_private(folio);
const int nbits = (len >> fs_info->sectorsize_bits);
- btrfs_subpage_assert(fs_info, page, start, len);
+ btrfs_subpage_assert(fs_info, folio, start, len);
/*
* We have call sites passing @lock_page into
@@ -328,7 +327,7 @@ bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
}
/*
- * Lock a page for delalloc page writeback.
+ * Lock a folio for delalloc page writeback.
*
* Return -EAGAIN if the page is not properly initialized.
* Return 0 with the page locked, and writer counter updated.
@@ -337,38 +336,40 @@ bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
* it's really the correct page, as the caller is using
* filemap_get_folios_contig(), which can race with page invalidating.
*/
-int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 start, u32 len)
{
- if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) {
- lock_page(page);
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) {
+ folio_lock(folio);
return 0;
}
- lock_page(page);
- if (!PagePrivate(page) || !page->private) {
- unlock_page(page);
+ folio_lock(folio);
+ if (!folio_test_private(folio) || !folio_get_private(folio)) {
+ folio_unlock(folio);
return -EAGAIN;
}
- btrfs_subpage_clamp_range(page, &start, &len);
- btrfs_subpage_start_writer(fs_info, page, start, len);
+ btrfs_subpage_clamp_range(folio, &start, &len);
+ btrfs_subpage_start_writer(fs_info, folio, start, len);
return 0;
}
-void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 start, u32 len)
{
- if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page))
- return unlock_page(page);
- btrfs_subpage_clamp_range(page, &start, &len);
- if (btrfs_subpage_end_and_test_writer(fs_info, page, start, len))
- unlock_page(page);
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) {
+ folio_unlock(folio);
+ return;
+ }
+ btrfs_subpage_clamp_range(folio, &start, &len);
+ if (btrfs_subpage_end_and_test_writer(fs_info, folio, start, len))
+ folio_unlock(folio);
}
-#define subpage_calc_start_bit(fs_info, page, name, start, len) \
+#define subpage_calc_start_bit(fs_info, folio, name, start, len) \
({ \
unsigned int start_bit; \
\
- btrfs_subpage_assert(fs_info, page, start, len); \
+ btrfs_subpage_assert(fs_info, folio, start, len); \
start_bit = offset_in_page(start) >> fs_info->sectorsize_bits; \
start_bit += fs_info->subpage_info->name##_offset; \
start_bit; \
@@ -385,46 +386,46 @@ void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info,
fs_info->subpage_info->bitmap_nr_bits)
void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
uptodate, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
if (subpage_test_bitmap_all_set(fs_info, subpage, uptodate))
- SetPageUptodate(page);
+ folio_mark_uptodate(folio);
spin_unlock_irqrestore(&subpage->lock, flags);
}
void btrfs_subpage_clear_uptodate(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
uptodate, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
- ClearPageUptodate(page);
+ folio_clear_uptodate(folio);
spin_unlock_irqrestore(&subpage->lock, flags);
}
void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
dirty, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
spin_unlock_irqrestore(&subpage->lock, flags);
- set_page_dirty(page);
+ folio_mark_dirty(folio);
}
/*
@@ -438,10 +439,10 @@ void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info,
* extra handling for tree blocks.
*/
bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
dirty, start, len);
unsigned long flags;
bool last = false;
@@ -455,101 +456,101 @@ bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
}
void btrfs_subpage_clear_dirty(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
bool last;
- last = btrfs_subpage_clear_and_test_dirty(fs_info, page, start, len);
+ last = btrfs_subpage_clear_and_test_dirty(fs_info, folio, start, len);
if (last)
- clear_page_dirty_for_io(page);
+ folio_clear_dirty_for_io(folio);
}
void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
writeback, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
- set_page_writeback(page);
+ folio_start_writeback(folio);
spin_unlock_irqrestore(&subpage->lock, flags);
}
void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
writeback, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
if (subpage_test_bitmap_all_zero(fs_info, subpage, writeback)) {
- ASSERT(PageWriteback(page));
- end_page_writeback(page);
+ ASSERT(folio_test_writeback(folio));
+ folio_end_writeback(folio);
}
spin_unlock_irqrestore(&subpage->lock, flags);
}
void btrfs_subpage_set_ordered(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
ordered, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
- SetPageOrdered(page);
+ folio_set_ordered(folio);
spin_unlock_irqrestore(&subpage->lock, flags);
}
void btrfs_subpage_clear_ordered(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
ordered, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
if (subpage_test_bitmap_all_zero(fs_info, subpage, ordered))
- ClearPageOrdered(page);
+ folio_clear_ordered(folio);
spin_unlock_irqrestore(&subpage->lock, flags);
}
void btrfs_subpage_set_checked(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
checked, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
if (subpage_test_bitmap_all_set(fs_info, subpage, checked))
- SetPageChecked(page);
+ folio_set_checked(folio);
spin_unlock_irqrestore(&subpage->lock, flags);
}
void btrfs_subpage_clear_checked(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
checked, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
- ClearPageChecked(page);
+ folio_clear_checked(folio);
spin_unlock_irqrestore(&subpage->lock, flags);
}
@@ -559,10 +560,10 @@ void btrfs_subpage_clear_checked(const struct btrfs_fs_info *fs_info,
*/
#define IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(name) \
bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len) \
+ struct folio *folio, u64 start, u32 len) \
{ \
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; \
- unsigned int start_bit = subpage_calc_start_bit(fs_info, page, \
+ struct btrfs_subpage *subpage = folio_get_private(folio); \
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, \
name, start, len); \
unsigned long flags; \
bool ret; \
@@ -584,88 +585,94 @@ IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(checked);
* in. We only test sectorsize == PAGE_SIZE cases so far, thus we can fall
* back to regular sectorsize branch.
*/
-#define IMPLEMENT_BTRFS_PAGE_OPS(name, set_page_func, clear_page_func, \
- test_page_func) \
-void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len) \
+#define IMPLEMENT_BTRFS_PAGE_OPS(name, folio_set_func, \
+ folio_clear_func, folio_test_func) \
+void btrfs_folio_set_##name(const struct btrfs_fs_info *fs_info, \
+ struct folio *folio, u64 start, u32 len) \
{ \
- if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \
- set_page_func(page); \
+ if (unlikely(!fs_info) || \
+ !btrfs_is_subpage(fs_info, folio->mapping)) { \
+ folio_set_func(folio); \
return; \
} \
- btrfs_subpage_set_##name(fs_info, page, start, len); \
+ btrfs_subpage_set_##name(fs_info, folio, start, len); \
} \
-void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len) \
+void btrfs_folio_clear_##name(const struct btrfs_fs_info *fs_info, \
+ struct folio *folio, u64 start, u32 len) \
{ \
- if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \
- clear_page_func(page); \
+ if (unlikely(!fs_info) || \
+ !btrfs_is_subpage(fs_info, folio->mapping)) { \
+ folio_clear_func(folio); \
return; \
} \
- btrfs_subpage_clear_##name(fs_info, page, start, len); \
+ btrfs_subpage_clear_##name(fs_info, folio, start, len); \
} \
-bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len) \
+bool btrfs_folio_test_##name(const struct btrfs_fs_info *fs_info, \
+ struct folio *folio, u64 start, u32 len) \
{ \
- if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) \
- return test_page_func(page); \
- return btrfs_subpage_test_##name(fs_info, page, start, len); \
+ if (unlikely(!fs_info) || \
+ !btrfs_is_subpage(fs_info, folio->mapping)) \
+ return folio_test_func(folio); \
+ return btrfs_subpage_test_##name(fs_info, folio, start, len); \
} \
-void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len) \
+void btrfs_folio_clamp_set_##name(const struct btrfs_fs_info *fs_info, \
+ struct folio *folio, u64 start, u32 len) \
{ \
- if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \
- set_page_func(page); \
+ if (unlikely(!fs_info) || \
+ !btrfs_is_subpage(fs_info, folio->mapping)) { \
+ folio_set_func(folio); \
return; \
} \
- btrfs_subpage_clamp_range(page, &start, &len); \
- btrfs_subpage_set_##name(fs_info, page, start, len); \
+ btrfs_subpage_clamp_range(folio, &start, &len); \
+ btrfs_subpage_set_##name(fs_info, folio, start, len); \
} \
-void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len) \
+void btrfs_folio_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \
+ struct folio *folio, u64 start, u32 len) \
{ \
- if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \
- clear_page_func(page); \
+ if (unlikely(!fs_info) || \
+ !btrfs_is_subpage(fs_info, folio->mapping)) { \
+ folio_clear_func(folio); \
return; \
} \
- btrfs_subpage_clamp_range(page, &start, &len); \
- btrfs_subpage_clear_##name(fs_info, page, start, len); \
+ btrfs_subpage_clamp_range(folio, &start, &len); \
+ btrfs_subpage_clear_##name(fs_info, folio, start, len); \
} \
-bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len) \
+bool btrfs_folio_clamp_test_##name(const struct btrfs_fs_info *fs_info, \
+ struct folio *folio, u64 start, u32 len) \
{ \
- if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) \
- return test_page_func(page); \
- btrfs_subpage_clamp_range(page, &start, &len); \
- return btrfs_subpage_test_##name(fs_info, page, start, len); \
-}
-IMPLEMENT_BTRFS_PAGE_OPS(uptodate, SetPageUptodate, ClearPageUptodate,
- PageUptodate);
-IMPLEMENT_BTRFS_PAGE_OPS(dirty, set_page_dirty, clear_page_dirty_for_io,
- PageDirty);
-IMPLEMENT_BTRFS_PAGE_OPS(writeback, set_page_writeback, end_page_writeback,
- PageWriteback);
-IMPLEMENT_BTRFS_PAGE_OPS(ordered, SetPageOrdered, ClearPageOrdered,
- PageOrdered);
-IMPLEMENT_BTRFS_PAGE_OPS(checked, SetPageChecked, ClearPageChecked, PageChecked);
+ if (unlikely(!fs_info) || \
+ !btrfs_is_subpage(fs_info, folio->mapping)) \
+ return folio_test_func(folio); \
+ btrfs_subpage_clamp_range(folio, &start, &len); \
+ return btrfs_subpage_test_##name(fs_info, folio, start, len); \
+}
+IMPLEMENT_BTRFS_PAGE_OPS(uptodate, folio_mark_uptodate, folio_clear_uptodate,
+ folio_test_uptodate);
+IMPLEMENT_BTRFS_PAGE_OPS(dirty, folio_mark_dirty, folio_clear_dirty_for_io,
+ folio_test_dirty);
+IMPLEMENT_BTRFS_PAGE_OPS(writeback, folio_start_writeback, folio_end_writeback,
+ folio_test_writeback);
+IMPLEMENT_BTRFS_PAGE_OPS(ordered, folio_set_ordered, folio_clear_ordered,
+ folio_test_ordered);
+IMPLEMENT_BTRFS_PAGE_OPS(checked, folio_set_checked, folio_clear_checked,
+ folio_test_checked);
/*
* Make sure not only the page dirty bit is cleared, but also subpage dirty bit
* is cleared.
*/
-void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info,
- struct page *page)
+void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, struct folio *folio)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ struct btrfs_subpage *subpage = folio_get_private(folio);
if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
return;
- ASSERT(!PageDirty(page));
- if (!btrfs_is_subpage(fs_info, page))
+ ASSERT(!folio_test_dirty(folio));
+ if (!btrfs_is_subpage(fs_info, folio->mapping))
return;
- ASSERT(PagePrivate(page) && page->private);
+ ASSERT(folio_test_private(folio) && folio_get_private(folio));
ASSERT(subpage_test_bitmap_all_zero(fs_info, subpage, dirty));
}
@@ -684,18 +691,20 @@ void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info,
* extent_write_locked_range().
* In this case, we have to call subpage helper to handle the case.
*/
-void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page,
- u64 start, u32 len)
+void btrfs_folio_unlock_writer(struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 start, u32 len)
{
struct btrfs_subpage *subpage;
- ASSERT(PageLocked(page));
+ ASSERT(folio_test_locked(folio));
/* For non-subpage case, we just unlock the page */
- if (!btrfs_is_subpage(fs_info, page))
- return unlock_page(page);
+ if (!btrfs_is_subpage(fs_info, folio->mapping)) {
+ folio_unlock(folio);
+ return;
+ }
- ASSERT(PagePrivate(page) && page->private);
- subpage = (struct btrfs_subpage *)page->private;
+ ASSERT(folio_test_private(folio) && folio_get_private(folio));
+ subpage = folio_get_private(folio);
/*
* For subpage case, there are two types of locked page. With or
@@ -704,12 +713,14 @@ void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page,
* Since we own the page lock, no one else could touch subpage::writers
* and we are safe to do several atomic operations without spinlock.
*/
- if (atomic_read(&subpage->writers) == 0)
+ if (atomic_read(&subpage->writers) == 0) {
/* No writers, locked by plain lock_page() */
- return unlock_page(page);
+ folio_unlock(folio);
+ return;
+ }
/* Have writers, use proper subpage helper to end it */
- btrfs_page_end_writer_lock(fs_info, page, start, len);
+ btrfs_folio_end_writer_lock(fs_info, folio, start, len);
}
#define GET_SUBPAGE_BITMAP(subpage, subpage_info, name, dst) \
@@ -717,7 +728,7 @@ void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page,
subpage_info->name##_offset, subpage_info->bitmap_nr_bits)
void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
struct btrfs_subpage_info *subpage_info = fs_info->subpage_info;
struct btrfs_subpage *subpage;
@@ -729,9 +740,9 @@ void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,
unsigned long checked_bitmap;
unsigned long flags;
- ASSERT(PagePrivate(page) && page->private);
+ ASSERT(folio_test_private(folio) && folio_get_private(folio));
ASSERT(subpage_info);
- subpage = (struct btrfs_subpage *)page->private;
+ subpage = folio_get_private(folio);
spin_lock_irqsave(&subpage->lock, flags);
GET_SUBPAGE_BITMAP(subpage, subpage_info, uptodate, &uptodate_bitmap);
@@ -741,10 +752,10 @@ void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,
GET_SUBPAGE_BITMAP(subpage, subpage_info, checked, &checked_bitmap);
spin_unlock_irqrestore(&subpage->lock, flags);
- dump_page(page, "btrfs subpage dump");
+ dump_page(folio_page(folio, 0), "btrfs subpage dump");
btrfs_warn(fs_info,
"start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl error=%*pbl dirty=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl",
- start, len, page_offset(page),
+ start, len, folio_pos(folio),
subpage_info->bitmap_nr_bits, &uptodate_bitmap,
subpage_info->bitmap_nr_bits, &error_bitmap,
subpage_info->bitmap_nr_bits, &dirty_bitmap,
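
For orientation, instantiating IMPLEMENT_BTRFS_PAGE_OPS(uptodate, ...) expands to wrappers of the following shape (simplified, clamp variants omitted), which is why callers never have to branch on sectorsize versus PAGE_SIZE themselves:

	void btrfs_folio_set_uptodate(const struct btrfs_fs_info *fs_info,
				      struct folio *folio, u64 start, u32 len)
	{
		if (unlikely(!fs_info) ||
		    !btrfs_is_subpage(fs_info, folio->mapping)) {
			/* Regular case: operate on the whole folio. */
			folio_mark_uptodate(folio);
			return;
		}
		/* Subpage case: flip only the bits covering [start, start + len). */
		btrfs_subpage_set_uptodate(fs_info, folio, start, len);
	}
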
diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h
index 5cbf67ccbdeb..793c2b314a58 100644
--- a/fs/btrfs/subpage.h
+++ b/fs/btrfs/subpage.h
@@ -73,71 +73,68 @@ enum btrfs_subpage_type {
BTRFS_SUBPAGE_DATA,
};
-bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct page *page);
+bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space *mapping);
void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize);
int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
- struct page *page, enum btrfs_subpage_type type);
-void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info,
- struct page *page);
+ struct folio *folio, enum btrfs_subpage_type type);
+void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio);
/* Allocate additional data where page represents more than one sector */
struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
enum btrfs_subpage_type type);
void btrfs_free_subpage(struct btrfs_subpage *subpage);
-void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info,
- struct page *page);
-void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info,
- struct page *page);
+void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio);
+void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio);
void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len);
+ struct folio *folio, u64 start, u32 len);
void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len);
+ struct folio *folio, u64 start, u32 len);
void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len);
+ struct folio *folio, u64 start, u32 len);
bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len);
-int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len);
-void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len);
+ struct folio *folio, u64 start, u32 len);
+int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 start, u32 len);
+void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 start, u32 len);
/*
* Template for subpage related operations.
*
- * btrfs_subpage_*() are for call sites where the page has subpage attached and
- * the range is ensured to be inside the page.
+ * btrfs_subpage_*() are for call sites where the folio has subpage attached and
+ * the range is ensured to be inside the folio's single page.
*
- * btrfs_page_*() are for call sites where the page can either be subpage
- * specific or regular page. The function will handle both cases.
- * But the range still needs to be inside the page.
+ * btrfs_folio_*() are for call sites where the folio can be either subpage
+ * specific or regular. The functions will handle both cases.
+ * But the range still needs to be inside one single page.
*
- * btrfs_page_clamp_*() are similar to btrfs_page_*(), except the range doesn't
+ * btrfs_folio_clamp_*() are similar to btrfs_folio_*(), except the range doesn't
* need to be inside the page. Those functions will truncate the range
* automatically.
*/
#define DECLARE_BTRFS_SUBPAGE_OPS(name) \
void btrfs_subpage_set_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len); \
+ struct folio *folio, u64 start, u32 len); \
void btrfs_subpage_clear_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len); \
+ struct folio *folio, u64 start, u32 len); \
bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len); \
-void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len); \
-void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len); \
-bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len); \
-void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len); \
-void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len); \
-bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len);
+ struct folio *folio, u64 start, u32 len); \
+void btrfs_folio_set_##name(const struct btrfs_fs_info *fs_info, \
+ struct folio *folio, u64 start, u32 len); \
+void btrfs_folio_clear_##name(const struct btrfs_fs_info *fs_info, \
+ struct folio *folio, u64 start, u32 len); \
+bool btrfs_folio_test_##name(const struct btrfs_fs_info *fs_info, \
+ struct folio *folio, u64 start, u32 len); \
+void btrfs_folio_clamp_set_##name(const struct btrfs_fs_info *fs_info, \
+ struct folio *folio, u64 start, u32 len); \
+void btrfs_folio_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \
+ struct folio *folio, u64 start, u32 len); \
+bool btrfs_folio_clamp_test_##name(const struct btrfs_fs_info *fs_info, \
+ struct folio *folio, u64 start, u32 len);
DECLARE_BTRFS_SUBPAGE_OPS(uptodate);
DECLARE_BTRFS_SUBPAGE_OPS(dirty);
@@ -146,13 +143,12 @@ DECLARE_BTRFS_SUBPAGE_OPS(ordered);
DECLARE_BTRFS_SUBPAGE_OPS(checked);
bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len);
+ struct folio *folio, u64 start, u32 len);
-void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info,
- struct page *page);
-void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page,
- u64 start, u32 len);
+void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, struct folio *folio);
+void btrfs_folio_unlock_writer(struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 start, u32 len);
void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len);
+ struct folio *folio, u64 start, u32 len);
#endif
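
A quick orientation for the three helper families declared above, assuming a folio whose mapping already has subpage metadata attached:

	/* Range guaranteed inside the folio: plain folio helper. */
	btrfs_folio_set_dirty(fs_info, folio, start, len);

	/* Range may extend past the folio: the clamp variant truncates it. */
	btrfs_folio_clamp_clear_dirty(fs_info, folio, start, len);

	/* Subpage-only path: the caller has already validated the range. */
	if (btrfs_subpage_test_dirty(fs_info, folio, start, len))
		queue_writeback();	/* illustrative */
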
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index ef256b944c72..896acfda1789 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -27,6 +27,7 @@
#include <linux/crc32c.h>
#include <linux/btrfs.h>
#include <linux/security.h>
+#include <linux/fs_parser.h>
#include "messages.h"
#include "delayed-inode.h"
#include "ctree.h"
@@ -64,19 +65,7 @@
#include <trace/events/btrfs.h>
static const struct super_operations btrfs_super_ops;
-
-/*
- * Types for mounting the default subvolume and a subvolume explicitly
- * requested by subvol=/path. That way the callchain is straightforward and we
- * don't have to play tricks with the mount options and recursive calls to
- * btrfs_mount.
- *
- * The new btrfs_root_fs_type also servers as a tag for the bdev_holder.
- */
static struct file_system_type btrfs_fs_type;
-static struct file_system_type btrfs_root_fs_type;
-
-static int btrfs_remount(struct super_block *sb, int *flags, char *data);
static void btrfs_put_super(struct super_block *sb)
{
@@ -86,8 +75,22 @@ static void btrfs_put_super(struct super_block *sb)
close_ctree(fs_info);
}
+/* Stores the information related to the mount options. */
+struct btrfs_fs_context {
+ char *subvol_name;
+ u64 subvol_objectid;
+ u64 max_inline;
+ u32 commit_interval;
+ u32 metadata_ratio;
+ u32 thread_pool_size;
+ unsigned long mount_opt;
+ unsigned long compress_type:4;
+ unsigned int compress_level;
+ refcount_t refs;
+};
+
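
btrfs_fs_context is the new home for state previously parsed straight into btrfs_fs_info->mount_opt: under the fs_context API, options accumulate in fc->fs_private during parsing and are applied to the superblock only at get_tree time. The refcount presumably exists because a subvol mount hands the context to a nested mount of the root. A hedged sketch of the wiring (the fs_context_operations hooks are the standard VFS contract; the btrfs-specific names here are assumptions):

	static const struct fs_context_operations btrfs_context_ops = {
		.parse_param	= btrfs_parse_param,	/* shown below */
		.get_tree	= btrfs_get_tree,	/* assumed name */
		.free		= btrfs_free_fs_context, /* assumed name */
	};

	static int btrfs_init_fs_context(struct fs_context *fc)
	{
		struct btrfs_fs_context *ctx;

		ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
		if (!ctx)
			return -ENOMEM;

		refcount_set(&ctx->refs, 1);
		fc->fs_private = ctx;
		fc->ops = &btrfs_context_ops;
		return 0;
	}
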
enum {
- Opt_acl, Opt_noacl,
+ Opt_acl,
Opt_clear_cache,
Opt_commit_interval,
Opt_compress,
@@ -97,27 +100,26 @@ enum {
Opt_degraded,
Opt_device,
Opt_fatal_errors,
- Opt_flushoncommit, Opt_noflushoncommit,
+ Opt_flushoncommit,
Opt_max_inline,
- Opt_barrier, Opt_nobarrier,
- Opt_datacow, Opt_nodatacow,
- Opt_datasum, Opt_nodatasum,
- Opt_defrag, Opt_nodefrag,
- Opt_discard, Opt_nodiscard,
+ Opt_barrier,
+ Opt_datacow,
+ Opt_datasum,
+ Opt_defrag,
+ Opt_discard,
Opt_discard_mode,
- Opt_norecovery,
Opt_ratio,
Opt_rescan_uuid_tree,
Opt_skip_balance,
- Opt_space_cache, Opt_no_space_cache,
+ Opt_space_cache,
Opt_space_cache_version,
- Opt_ssd, Opt_nossd,
- Opt_ssd_spread, Opt_nossd_spread,
+ Opt_ssd,
+ Opt_ssd_spread,
Opt_subvol,
Opt_subvol_empty,
Opt_subvolid,
Opt_thread_pool,
- Opt_treelog, Opt_notreelog,
+ Opt_treelog,
Opt_user_subvol_rm_allowed,
/* Rescue options */
@@ -128,14 +130,10 @@ enum {
Opt_ignoredatacsums,
Opt_rescue_all,
- /* Deprecated options */
- Opt_recovery,
- Opt_inode_cache, Opt_noinode_cache,
-
/* Debugging options */
- Opt_enospc_debug, Opt_noenospc_debug,
+ Opt_enospc_debug,
#ifdef CONFIG_BTRFS_DEBUG
- Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all,
+ Opt_fragment, Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all,
#endif
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
Opt_ref_verify,
@@ -143,785 +141,611 @@ enum {
Opt_err,
};
-static const match_table_t tokens = {
- {Opt_acl, "acl"},
- {Opt_noacl, "noacl"},
- {Opt_clear_cache, "clear_cache"},
- {Opt_commit_interval, "commit=%u"},
- {Opt_compress, "compress"},
- {Opt_compress_type, "compress=%s"},
- {Opt_compress_force, "compress-force"},
- {Opt_compress_force_type, "compress-force=%s"},
- {Opt_degraded, "degraded"},
- {Opt_device, "device=%s"},
- {Opt_fatal_errors, "fatal_errors=%s"},
- {Opt_flushoncommit, "flushoncommit"},
- {Opt_noflushoncommit, "noflushoncommit"},
- {Opt_inode_cache, "inode_cache"},
- {Opt_noinode_cache, "noinode_cache"},
- {Opt_max_inline, "max_inline=%s"},
- {Opt_barrier, "barrier"},
- {Opt_nobarrier, "nobarrier"},
- {Opt_datacow, "datacow"},
- {Opt_nodatacow, "nodatacow"},
- {Opt_datasum, "datasum"},
- {Opt_nodatasum, "nodatasum"},
- {Opt_defrag, "autodefrag"},
- {Opt_nodefrag, "noautodefrag"},
- {Opt_discard, "discard"},
- {Opt_discard_mode, "discard=%s"},
- {Opt_nodiscard, "nodiscard"},
- {Opt_norecovery, "norecovery"},
- {Opt_ratio, "metadata_ratio=%u"},
- {Opt_rescan_uuid_tree, "rescan_uuid_tree"},
- {Opt_skip_balance, "skip_balance"},
- {Opt_space_cache, "space_cache"},
- {Opt_no_space_cache, "nospace_cache"},
- {Opt_space_cache_version, "space_cache=%s"},
- {Opt_ssd, "ssd"},
- {Opt_nossd, "nossd"},
- {Opt_ssd_spread, "ssd_spread"},
- {Opt_nossd_spread, "nossd_spread"},
- {Opt_subvol, "subvol=%s"},
- {Opt_subvol_empty, "subvol="},
- {Opt_subvolid, "subvolid=%s"},
- {Opt_thread_pool, "thread_pool=%u"},
- {Opt_treelog, "treelog"},
- {Opt_notreelog, "notreelog"},
- {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
+enum {
+ Opt_fatal_errors_panic,
+ Opt_fatal_errors_bug,
+};
- /* Rescue options */
- {Opt_rescue, "rescue=%s"},
+static const struct constant_table btrfs_parameter_fatal_errors[] = {
+ { "panic", Opt_fatal_errors_panic },
+ { "bug", Opt_fatal_errors_bug },
+ {}
+};
+
+enum {
+ Opt_discard_sync,
+ Opt_discard_async,
+};
+
+static const struct constant_table btrfs_parameter_discard[] = {
+ { "sync", Opt_discard_sync },
+ { "async", Opt_discard_async },
+ {}
+};
+
+enum {
+ Opt_space_cache_v1,
+ Opt_space_cache_v2,
+};
+
+static const struct constant_table btrfs_parameter_space_cache[] = {
+ { "v1", Opt_space_cache_v1 },
+ { "v2", Opt_space_cache_v2 },
+ {}
+};
+
+enum {
+ Opt_rescue_usebackuproot,
+ Opt_rescue_nologreplay,
+ Opt_rescue_ignorebadroots,
+ Opt_rescue_ignoredatacsums,
+ Opt_rescue_parameter_all,
+};
+
+static const struct constant_table btrfs_parameter_rescue[] = {
+ { "usebackuproot", Opt_rescue_usebackuproot },
+ { "nologreplay", Opt_rescue_nologreplay },
+ { "ignorebadroots", Opt_rescue_ignorebadroots },
+ { "ibadroots", Opt_rescue_ignorebadroots },
+ { "ignoredatacsums", Opt_rescue_ignoredatacsums },
+ { "idatacsums", Opt_rescue_ignoredatacsums },
+ { "all", Opt_rescue_parameter_all },
+ {}
+};
+
+#ifdef CONFIG_BTRFS_DEBUG
+enum {
+ Opt_fragment_parameter_data,
+ Opt_fragment_parameter_metadata,
+ Opt_fragment_parameter_all,
+};
+
+static const struct constant_table btrfs_parameter_fragment[] = {
+ { "data", Opt_fragment_parameter_data },
+ { "metadata", Opt_fragment_parameter_metadata },
+ { "all", Opt_fragment_parameter_all },
+ {}
+};
+#endif
+
+static const struct fs_parameter_spec btrfs_fs_parameters[] = {
+ fsparam_flag_no("acl", Opt_acl),
+ fsparam_flag_no("autodefrag", Opt_defrag),
+ fsparam_flag_no("barrier", Opt_barrier),
+ fsparam_flag("clear_cache", Opt_clear_cache),
+ fsparam_u32("commit", Opt_commit_interval),
+ fsparam_flag("compress", Opt_compress),
+ fsparam_string("compress", Opt_compress_type),
+ fsparam_flag("compress-force", Opt_compress_force),
+ fsparam_string("compress-force", Opt_compress_force_type),
+ fsparam_flag_no("datacow", Opt_datacow),
+ fsparam_flag_no("datasum", Opt_datasum),
+ fsparam_flag("degraded", Opt_degraded),
+ fsparam_string("device", Opt_device),
+ fsparam_flag_no("discard", Opt_discard),
+ fsparam_enum("discard", Opt_discard_mode, btrfs_parameter_discard),
+ fsparam_enum("fatal_errors", Opt_fatal_errors, btrfs_parameter_fatal_errors),
+ fsparam_flag_no("flushoncommit", Opt_flushoncommit),
+ fsparam_string("max_inline", Opt_max_inline),
+ fsparam_u32("metadata_ratio", Opt_ratio),
+ fsparam_flag("rescan_uuid_tree", Opt_rescan_uuid_tree),
+ fsparam_flag("skip_balance", Opt_skip_balance),
+ fsparam_flag_no("space_cache", Opt_space_cache),
+ fsparam_enum("space_cache", Opt_space_cache_version, btrfs_parameter_space_cache),
+ fsparam_flag_no("ssd", Opt_ssd),
+ fsparam_flag_no("ssd_spread", Opt_ssd_spread),
+ fsparam_string("subvol", Opt_subvol),
+ fsparam_flag("subvol=", Opt_subvol_empty),
+ fsparam_u64("subvolid", Opt_subvolid),
+ fsparam_u32("thread_pool", Opt_thread_pool),
+ fsparam_flag_no("treelog", Opt_treelog),
+ fsparam_flag("user_subvol_rm_allowed", Opt_user_subvol_rm_allowed),
+
+ /* Rescue options. */
+ fsparam_enum("rescue", Opt_rescue, btrfs_parameter_rescue),
/* Deprecated, with alias rescue=nologreplay */
- {Opt_nologreplay, "nologreplay"},
+ __fsparam(NULL, "nologreplay", Opt_nologreplay, fs_param_deprecated, NULL),
/* Deprecated, with alias rescue=usebackuproot */
- {Opt_usebackuproot, "usebackuproot"},
+ __fsparam(NULL, "usebackuproot", Opt_usebackuproot, fs_param_deprecated, NULL),
- /* Deprecated options */
- {Opt_recovery, "recovery"},
-
- /* Debugging options */
- {Opt_enospc_debug, "enospc_debug"},
- {Opt_noenospc_debug, "noenospc_debug"},
+ /* Debugging options. */
+ fsparam_flag_no("enospc_debug", Opt_enospc_debug),
#ifdef CONFIG_BTRFS_DEBUG
- {Opt_fragment_data, "fragment=data"},
- {Opt_fragment_metadata, "fragment=metadata"},
- {Opt_fragment_all, "fragment=all"},
+ fsparam_enum("fragment", Opt_fragment, btrfs_parameter_fragment),
#endif
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
- {Opt_ref_verify, "ref_verify"},
+ fsparam_flag("ref_verify", Opt_ref_verify),
#endif
- {Opt_err, NULL},
+ {}
};
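
The table above retires match_token(): fsparam_flag_no() registers both "foo" and "nofoo" and reports which one fired via result.negated, while fsparam_enum() validates the value against a constant_table before the handler ever sees it. A plausible handler shape under the new API (the Opt_barrier case itself falls past the end of this excerpt; NOBARRIER follows the btrfs_set_opt()/btrfs_clear_opt() usage visible below):

	opt = fs_parse(fc, btrfs_fs_parameters, param, &result);
	if (opt < 0)
		return opt;	/* unknown key or malformed value */

	switch (opt) {
	case Opt_barrier:	/* covers both "barrier" and "nobarrier" */
		if (result.negated)
			btrfs_set_opt(ctx->mount_opt, NOBARRIER);
		else
			btrfs_clear_opt(ctx->mount_opt, NOBARRIER);
		break;
	...
	}
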
-static const match_table_t rescue_tokens = {
- {Opt_usebackuproot, "usebackuproot"},
- {Opt_nologreplay, "nologreplay"},
- {Opt_ignorebadroots, "ignorebadroots"},
- {Opt_ignorebadroots, "ibadroots"},
- {Opt_ignoredatacsums, "ignoredatacsums"},
- {Opt_ignoredatacsums, "idatacsums"},
- {Opt_rescue_all, "all"},
- {Opt_err, NULL},
-};
-
-static bool check_ro_option(struct btrfs_fs_info *fs_info, unsigned long opt,
- const char *opt_name)
+/* No support for restricting writes to btrfs devices yet... */
+static inline blk_mode_t btrfs_open_mode(struct fs_context *fc)
{
- if (fs_info->mount_opt & opt) {
- btrfs_err(fs_info, "%s must be used with ro mount option",
- opt_name);
- return true;
- }
- return false;
+ return sb_open_mode(fc->sb_flags) & ~BLK_OPEN_RESTRICT_WRITES;
}
-static int parse_rescue_options(struct btrfs_fs_info *info, const char *options)
+static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
- char *opts;
- char *orig;
- char *p;
- substring_t args[MAX_OPT_ARGS];
- int ret = 0;
+ struct btrfs_fs_context *ctx = fc->fs_private;
+ struct fs_parse_result result;
+ int opt;
- opts = kstrdup(options, GFP_KERNEL);
- if (!opts)
- return -ENOMEM;
- orig = opts;
+ opt = fs_parse(fc, btrfs_fs_parameters, param, &result);
+ if (opt < 0)
+ return opt;
- while ((p = strsep(&opts, ":")) != NULL) {
- int token;
+ switch (opt) {
+ case Opt_degraded:
+ btrfs_set_opt(ctx->mount_opt, DEGRADED);
+ break;
+ case Opt_subvol_empty:
+ /*
+ * This exists because we used to allow it by accident, so we're
+ * keeping it to maintain ABI. See 37becec95ac3 ("Btrfs: allow
+ * empty subvol= again").
+ */
+ break;
+ case Opt_subvol:
+ kfree(ctx->subvol_name);
+ ctx->subvol_name = kstrdup(param->string, GFP_KERNEL);
+ if (!ctx->subvol_name)
+ return -ENOMEM;
+ break;
+ case Opt_subvolid:
+ ctx->subvol_objectid = result.uint_64;
- if (!*p)
- continue;
- token = match_token(p, rescue_tokens, args);
- switch (token){
- case Opt_usebackuproot:
- btrfs_info(info,
- "trying to use backup root at mount time");
- btrfs_set_opt(info->mount_opt, USEBACKUPROOT);
- break;
- case Opt_nologreplay:
- btrfs_set_and_info(info, NOLOGREPLAY,
- "disabling log replay at mount time");
- break;
- case Opt_ignorebadroots:
- btrfs_set_and_info(info, IGNOREBADROOTS,
- "ignoring bad roots");
- break;
- case Opt_ignoredatacsums:
- btrfs_set_and_info(info, IGNOREDATACSUMS,
- "ignoring data csums");
- break;
- case Opt_rescue_all:
- btrfs_info(info, "enabling all of the rescue options");
- btrfs_set_and_info(info, IGNOREDATACSUMS,
- "ignoring data csums");
- btrfs_set_and_info(info, IGNOREBADROOTS,
- "ignoring bad roots");
- btrfs_set_and_info(info, NOLOGREPLAY,
- "disabling log replay at mount time");
- break;
- case Opt_err:
- btrfs_info(info, "unrecognized rescue option '%s'", p);
- ret = -EINVAL;
- goto out;
- default:
- break;
- }
+ /* subvolid=0 means give me the original fs_tree. */
+ if (!ctx->subvol_objectid)
+ ctx->subvol_objectid = BTRFS_FS_TREE_OBJECTID;
+ break;
+ case Opt_device: {
+ struct btrfs_device *device;
+		blk_mode_t mode = btrfs_open_mode(fc);
+
+		mutex_lock(&uuid_mutex);
+ device = btrfs_scan_one_device(param->string, mode, false);
+ mutex_unlock(&uuid_mutex);
+ if (IS_ERR(device))
+ return PTR_ERR(device);
+ break;
}
-out:
- kfree(orig);
- return ret;
-}
-
-/*
- * Regular mount options parser. Everything that is needed only when
- * reading in a new superblock is parsed here.
- * XXX JDM: This needs to be cleaned up for remount.
- */
-int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
- unsigned long new_flags)
-{
- substring_t args[MAX_OPT_ARGS];
- char *p, *num;
- int intarg;
- int ret = 0;
- char *compress_type;
- bool compress_force = false;
- enum btrfs_compression_type saved_compress_type;
- int saved_compress_level;
- bool saved_compress_force;
- int no_compress = 0;
- const bool remounting = test_bit(BTRFS_FS_STATE_REMOUNTING, &info->fs_state);
-
- if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE))
- btrfs_set_opt(info->mount_opt, FREE_SPACE_TREE);
- else if (btrfs_free_space_cache_v1_active(info)) {
- if (btrfs_is_zoned(info)) {
- btrfs_info(info,
- "zoned: clearing existing space cache");
- btrfs_set_super_cache_generation(info->super_copy, 0);
+ case Opt_datasum:
+ if (result.negated) {
+ btrfs_set_opt(ctx->mount_opt, NODATASUM);
} else {
- btrfs_set_opt(info->mount_opt, SPACE_CACHE);
+ btrfs_clear_opt(ctx->mount_opt, NODATACOW);
+ btrfs_clear_opt(ctx->mount_opt, NODATASUM);
}
- }
-
- /*
- * Even the options are empty, we still need to do extra check
- * against new flags
- */
- if (!options)
- goto check;
-
- while ((p = strsep(&options, ",")) != NULL) {
- int token;
- if (!*p)
- continue;
-
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_degraded:
- btrfs_info(info, "allowing degraded mounts");
- btrfs_set_opt(info->mount_opt, DEGRADED);
- break;
- case Opt_subvol:
- case Opt_subvol_empty:
- case Opt_subvolid:
- case Opt_device:
- /*
- * These are parsed by btrfs_parse_subvol_options or
- * btrfs_parse_device_options and can be ignored here.
- */
- break;
- case Opt_nodatasum:
- btrfs_set_and_info(info, NODATASUM,
- "setting nodatasum");
- break;
- case Opt_datasum:
- if (btrfs_test_opt(info, NODATASUM)) {
- if (btrfs_test_opt(info, NODATACOW))
- btrfs_info(info,
- "setting datasum, datacow enabled");
- else
- btrfs_info(info, "setting datasum");
- }
- btrfs_clear_opt(info->mount_opt, NODATACOW);
- btrfs_clear_opt(info->mount_opt, NODATASUM);
- break;
- case Opt_nodatacow:
- if (!btrfs_test_opt(info, NODATACOW)) {
- if (!btrfs_test_opt(info, COMPRESS) ||
- !btrfs_test_opt(info, FORCE_COMPRESS)) {
- btrfs_info(info,
- "setting nodatacow, compression disabled");
- } else {
- btrfs_info(info, "setting nodatacow");
- }
- }
- btrfs_clear_opt(info->mount_opt, COMPRESS);
- btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
- btrfs_set_opt(info->mount_opt, NODATACOW);
- btrfs_set_opt(info->mount_opt, NODATASUM);
- break;
- case Opt_datacow:
- btrfs_clear_and_info(info, NODATACOW,
- "setting datacow");
- break;
- case Opt_compress_force:
- case Opt_compress_force_type:
- compress_force = true;
- fallthrough;
- case Opt_compress:
- case Opt_compress_type:
- saved_compress_type = btrfs_test_opt(info,
- COMPRESS) ?
- info->compress_type : BTRFS_COMPRESS_NONE;
- saved_compress_force =
- btrfs_test_opt(info, FORCE_COMPRESS);
- saved_compress_level = info->compress_level;
- if (token == Opt_compress ||
- token == Opt_compress_force ||
- strncmp(args[0].from, "zlib", 4) == 0) {
- compress_type = "zlib";
-
- info->compress_type = BTRFS_COMPRESS_ZLIB;
- info->compress_level = BTRFS_ZLIB_DEFAULT_LEVEL;
- /*
- * args[0] contains uninitialized data since
- * for these tokens we don't expect any
- * parameter.
- */
- if (token != Opt_compress &&
- token != Opt_compress_force)
- info->compress_level =
- btrfs_compress_str2level(
- BTRFS_COMPRESS_ZLIB,
- args[0].from + 4);
- btrfs_set_opt(info->mount_opt, COMPRESS);
- btrfs_clear_opt(info->mount_opt, NODATACOW);
- btrfs_clear_opt(info->mount_opt, NODATASUM);
- no_compress = 0;
- } else if (strncmp(args[0].from, "lzo", 3) == 0) {
- compress_type = "lzo";
- info->compress_type = BTRFS_COMPRESS_LZO;
- info->compress_level = 0;
- btrfs_set_opt(info->mount_opt, COMPRESS);
- btrfs_clear_opt(info->mount_opt, NODATACOW);
- btrfs_clear_opt(info->mount_opt, NODATASUM);
- btrfs_set_fs_incompat(info, COMPRESS_LZO);
- no_compress = 0;
- } else if (strncmp(args[0].from, "zstd", 4) == 0) {
- compress_type = "zstd";
- info->compress_type = BTRFS_COMPRESS_ZSTD;
- info->compress_level =
- btrfs_compress_str2level(
- BTRFS_COMPRESS_ZSTD,
- args[0].from + 4);
- btrfs_set_opt(info->mount_opt, COMPRESS);
- btrfs_clear_opt(info->mount_opt, NODATACOW);
- btrfs_clear_opt(info->mount_opt, NODATASUM);
- btrfs_set_fs_incompat(info, COMPRESS_ZSTD);
- no_compress = 0;
- } else if (strncmp(args[0].from, "no", 2) == 0) {
- compress_type = "no";
- info->compress_level = 0;
- info->compress_type = 0;
- btrfs_clear_opt(info->mount_opt, COMPRESS);
- btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
- compress_force = false;
- no_compress++;
- } else {
- btrfs_err(info, "unrecognized compression value %s",
- args[0].from);
- ret = -EINVAL;
- goto out;
- }
-
- if (compress_force) {
- btrfs_set_opt(info->mount_opt, FORCE_COMPRESS);
- } else {
- /*
- * If we remount from compress-force=xxx to
- * compress=xxx, we need clear FORCE_COMPRESS
- * flag, otherwise, there is no way for users
- * to disable forcible compression separately.
- */
- btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
- }
- if (no_compress == 1) {
- btrfs_info(info, "use no compression");
- } else if ((info->compress_type != saved_compress_type) ||
- (compress_force != saved_compress_force) ||
- (info->compress_level != saved_compress_level)) {
- btrfs_info(info, "%s %s compression, level %d",
- (compress_force) ? "force" : "use",
- compress_type, info->compress_level);
- }
- compress_force = false;
- break;
- case Opt_ssd:
- btrfs_set_and_info(info, SSD,
- "enabling ssd optimizations");
- btrfs_clear_opt(info->mount_opt, NOSSD);
- break;
- case Opt_ssd_spread:
- btrfs_set_and_info(info, SSD,
- "enabling ssd optimizations");
- btrfs_set_and_info(info, SSD_SPREAD,
- "using spread ssd allocation scheme");
- btrfs_clear_opt(info->mount_opt, NOSSD);
- break;
- case Opt_nossd:
- btrfs_set_opt(info->mount_opt, NOSSD);
- btrfs_clear_and_info(info, SSD,
- "not using ssd optimizations");
- fallthrough;
- case Opt_nossd_spread:
- btrfs_clear_and_info(info, SSD_SPREAD,
- "not using spread ssd allocation scheme");
- break;
- case Opt_barrier:
- btrfs_clear_and_info(info, NOBARRIER,
- "turning on barriers");
- break;
- case Opt_nobarrier:
- btrfs_set_and_info(info, NOBARRIER,
- "turning off barriers");
- break;
- case Opt_thread_pool:
- ret = match_int(&args[0], &intarg);
- if (ret) {
- btrfs_err(info, "unrecognized thread_pool value %s",
- args[0].from);
- goto out;
- } else if (intarg == 0) {
- btrfs_err(info, "invalid value 0 for thread_pool");
- ret = -EINVAL;
- goto out;
- }
- info->thread_pool_size = intarg;
- break;
- case Opt_max_inline:
- num = match_strdup(&args[0]);
- if (num) {
- info->max_inline = memparse(num, NULL);
- kfree(num);
-
- if (info->max_inline) {
- info->max_inline = min_t(u64,
- info->max_inline,
- info->sectorsize);
- }
- btrfs_info(info, "max_inline at %llu",
- info->max_inline);
- } else {
- ret = -ENOMEM;
- goto out;
- }
- break;
- case Opt_acl:
+ break;
+ case Opt_datacow:
+ if (result.negated) {
+ btrfs_clear_opt(ctx->mount_opt, COMPRESS);
+ btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS);
+ btrfs_set_opt(ctx->mount_opt, NODATACOW);
+ btrfs_set_opt(ctx->mount_opt, NODATASUM);
+ } else {
+ btrfs_clear_opt(ctx->mount_opt, NODATACOW);
+ }
+ break;
+ case Opt_compress_force:
+ case Opt_compress_force_type:
+ btrfs_set_opt(ctx->mount_opt, FORCE_COMPRESS);
+ fallthrough;
+ case Opt_compress:
+ case Opt_compress_type:
+ if (opt == Opt_compress || opt == Opt_compress_force) {
+ ctx->compress_type = BTRFS_COMPRESS_ZLIB;
+ ctx->compress_level = BTRFS_ZLIB_DEFAULT_LEVEL;
+ btrfs_set_opt(ctx->mount_opt, COMPRESS);
+ btrfs_clear_opt(ctx->mount_opt, NODATACOW);
+ btrfs_clear_opt(ctx->mount_opt, NODATASUM);
+ } else if (strncmp(param->string, "zlib", 4) == 0) {
+ ctx->compress_type = BTRFS_COMPRESS_ZLIB;
+ ctx->compress_level =
+ btrfs_compress_str2level(BTRFS_COMPRESS_ZLIB,
+ param->string + 4);
+ btrfs_set_opt(ctx->mount_opt, COMPRESS);
+ btrfs_clear_opt(ctx->mount_opt, NODATACOW);
+ btrfs_clear_opt(ctx->mount_opt, NODATASUM);
+ } else if (strncmp(param->string, "lzo", 3) == 0) {
+ ctx->compress_type = BTRFS_COMPRESS_LZO;
+ ctx->compress_level = 0;
+ btrfs_set_opt(ctx->mount_opt, COMPRESS);
+ btrfs_clear_opt(ctx->mount_opt, NODATACOW);
+ btrfs_clear_opt(ctx->mount_opt, NODATASUM);
+ } else if (strncmp(param->string, "zstd", 4) == 0) {
+ ctx->compress_type = BTRFS_COMPRESS_ZSTD;
+ ctx->compress_level =
+ btrfs_compress_str2level(BTRFS_COMPRESS_ZSTD,
+ param->string + 4);
+ btrfs_set_opt(ctx->mount_opt, COMPRESS);
+ btrfs_clear_opt(ctx->mount_opt, NODATACOW);
+ btrfs_clear_opt(ctx->mount_opt, NODATASUM);
+ } else if (strncmp(param->string, "no", 2) == 0) {
+ ctx->compress_level = 0;
+ ctx->compress_type = 0;
+ btrfs_clear_opt(ctx->mount_opt, COMPRESS);
+ btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS);
+ } else {
+ btrfs_err(NULL, "unrecognized compression value %s",
+ param->string);
+ return -EINVAL;
+ }
+ break;
+ case Opt_ssd:
+ if (result.negated) {
+ btrfs_set_opt(ctx->mount_opt, NOSSD);
+ btrfs_clear_opt(ctx->mount_opt, SSD);
+ btrfs_clear_opt(ctx->mount_opt, SSD_SPREAD);
+ } else {
+ btrfs_set_opt(ctx->mount_opt, SSD);
+ btrfs_clear_opt(ctx->mount_opt, NOSSD);
+ }
+ break;
+ case Opt_ssd_spread:
+ if (result.negated) {
+ btrfs_clear_opt(ctx->mount_opt, SSD_SPREAD);
+ } else {
+ btrfs_set_opt(ctx->mount_opt, SSD);
+ btrfs_set_opt(ctx->mount_opt, SSD_SPREAD);
+ btrfs_clear_opt(ctx->mount_opt, NOSSD);
+ }
+ break;
+ case Opt_barrier:
+ if (result.negated)
+ btrfs_set_opt(ctx->mount_opt, NOBARRIER);
+ else
+ btrfs_clear_opt(ctx->mount_opt, NOBARRIER);
+ break;
+ case Opt_thread_pool:
+ if (result.uint_32 == 0) {
+ btrfs_err(NULL, "invalid value 0 for thread_pool");
+ return -EINVAL;
+ }
+ ctx->thread_pool_size = result.uint_32;
+ break;
+ case Opt_max_inline:
+ ctx->max_inline = memparse(param->string, NULL);
+ break;
+ case Opt_acl:
+ if (result.negated) {
+ fc->sb_flags &= ~SB_POSIXACL;
+ } else {
#ifdef CONFIG_BTRFS_FS_POSIX_ACL
- info->sb->s_flags |= SB_POSIXACL;
- break;
+ fc->sb_flags |= SB_POSIXACL;
#else
- btrfs_err(info, "support for ACL not compiled in!");
- ret = -EINVAL;
- goto out;
+ btrfs_err(NULL, "support for ACL not compiled in");
+ return -EINVAL;
#endif
- case Opt_noacl:
- info->sb->s_flags &= ~SB_POSIXACL;
- break;
- case Opt_notreelog:
- btrfs_set_and_info(info, NOTREELOG,
- "disabling tree log");
- break;
- case Opt_treelog:
- btrfs_clear_and_info(info, NOTREELOG,
- "enabling tree log");
- break;
- case Opt_norecovery:
- case Opt_nologreplay:
- btrfs_warn(info,
+ }
+ /*
+	 * The VFS limits the ability to toggle ACL on and off via remount,
+	 * even though every file system allows it; this seems to be an
+	 * oversight. Setting the mask here would make a remount fail, so
+	 * don't set it; we check it in btrfs_reconfigure() and do the
+	 * toggling ourselves.
+ */
+ if (fc->purpose != FS_CONTEXT_FOR_RECONFIGURE)
+ fc->sb_flags_mask |= SB_POSIXACL;
+ break;
+ case Opt_treelog:
+ if (result.negated)
+ btrfs_set_opt(ctx->mount_opt, NOTREELOG);
+ else
+ btrfs_clear_opt(ctx->mount_opt, NOTREELOG);
+ break;
+ case Opt_nologreplay:
+ btrfs_warn(NULL,
"'nologreplay' is deprecated, use 'rescue=nologreplay' instead");
- btrfs_set_and_info(info, NOLOGREPLAY,
- "disabling log replay at mount time");
- break;
- case Opt_flushoncommit:
- btrfs_set_and_info(info, FLUSHONCOMMIT,
- "turning on flush-on-commit");
- break;
- case Opt_noflushoncommit:
- btrfs_clear_and_info(info, FLUSHONCOMMIT,
- "turning off flush-on-commit");
- break;
- case Opt_ratio:
- ret = match_int(&args[0], &intarg);
- if (ret) {
- btrfs_err(info, "unrecognized metadata_ratio value %s",
- args[0].from);
- goto out;
- }
- info->metadata_ratio = intarg;
- btrfs_info(info, "metadata ratio %u",
- info->metadata_ratio);
- break;
- case Opt_discard:
- case Opt_discard_mode:
- if (token == Opt_discard ||
- strcmp(args[0].from, "sync") == 0) {
- btrfs_clear_opt(info->mount_opt, DISCARD_ASYNC);
- btrfs_set_and_info(info, DISCARD_SYNC,
- "turning on sync discard");
- } else if (strcmp(args[0].from, "async") == 0) {
- btrfs_clear_opt(info->mount_opt, DISCARD_SYNC);
- btrfs_set_and_info(info, DISCARD_ASYNC,
- "turning on async discard");
- } else {
- btrfs_err(info, "unrecognized discard mode value %s",
- args[0].from);
- ret = -EINVAL;
- goto out;
- }
- btrfs_clear_opt(info->mount_opt, NODISCARD);
- break;
- case Opt_nodiscard:
- btrfs_clear_and_info(info, DISCARD_SYNC,
- "turning off discard");
- btrfs_clear_and_info(info, DISCARD_ASYNC,
- "turning off async discard");
- btrfs_set_opt(info->mount_opt, NODISCARD);
- break;
- case Opt_space_cache:
- case Opt_space_cache_version:
- /*
- * We already set FREE_SPACE_TREE above because we have
- * compat_ro(FREE_SPACE_TREE) set, and we aren't going
- * to allow v1 to be set for extent tree v2, simply
- * ignore this setting if we're extent tree v2.
- */
- if (btrfs_fs_incompat(info, EXTENT_TREE_V2))
- break;
- if (token == Opt_space_cache ||
- strcmp(args[0].from, "v1") == 0) {
- btrfs_clear_opt(info->mount_opt,
- FREE_SPACE_TREE);
- btrfs_set_and_info(info, SPACE_CACHE,
- "enabling disk space caching");
- } else if (strcmp(args[0].from, "v2") == 0) {
- btrfs_clear_opt(info->mount_opt,
- SPACE_CACHE);
- btrfs_set_and_info(info, FREE_SPACE_TREE,
- "enabling free space tree");
- } else {
- btrfs_err(info, "unrecognized space_cache value %s",
- args[0].from);
- ret = -EINVAL;
- goto out;
- }
- break;
- case Opt_rescan_uuid_tree:
- btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE);
- break;
- case Opt_no_space_cache:
- /*
- * We cannot operate without the free space tree with
- * extent tree v2, ignore this option.
- */
- if (btrfs_fs_incompat(info, EXTENT_TREE_V2))
- break;
- if (btrfs_test_opt(info, SPACE_CACHE)) {
- btrfs_clear_and_info(info, SPACE_CACHE,
- "disabling disk space caching");
- }
- if (btrfs_test_opt(info, FREE_SPACE_TREE)) {
- btrfs_clear_and_info(info, FREE_SPACE_TREE,
- "disabling free space tree");
- }
- break;
- case Opt_inode_cache:
- case Opt_noinode_cache:
- btrfs_warn(info,
- "the 'inode_cache' option is deprecated and has no effect since 5.11");
+ btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY);
+ break;
+ case Opt_flushoncommit:
+ if (result.negated)
+ btrfs_clear_opt(ctx->mount_opt, FLUSHONCOMMIT);
+ else
+ btrfs_set_opt(ctx->mount_opt, FLUSHONCOMMIT);
+ break;
+ case Opt_ratio:
+ ctx->metadata_ratio = result.uint_32;
+ break;
+ case Opt_discard:
+ if (result.negated) {
+ btrfs_clear_opt(ctx->mount_opt, DISCARD_SYNC);
+ btrfs_clear_opt(ctx->mount_opt, DISCARD_ASYNC);
+ btrfs_set_opt(ctx->mount_opt, NODISCARD);
+ } else {
+ btrfs_set_opt(ctx->mount_opt, DISCARD_SYNC);
+ btrfs_clear_opt(ctx->mount_opt, DISCARD_ASYNC);
+ }
+ break;
+ case Opt_discard_mode:
+ switch (result.uint_32) {
+ case Opt_discard_sync:
+ btrfs_clear_opt(ctx->mount_opt, DISCARD_ASYNC);
+ btrfs_set_opt(ctx->mount_opt, DISCARD_SYNC);
break;
- case Opt_clear_cache:
- /*
- * We cannot clear the free space tree with extent tree
- * v2, ignore this option.
- */
- if (btrfs_fs_incompat(info, EXTENT_TREE_V2))
- break;
- btrfs_set_and_info(info, CLEAR_CACHE,
- "force clearing of disk cache");
+ case Opt_discard_async:
+ btrfs_clear_opt(ctx->mount_opt, DISCARD_SYNC);
+ btrfs_set_opt(ctx->mount_opt, DISCARD_ASYNC);
break;
- case Opt_user_subvol_rm_allowed:
- btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED);
+ default:
+ btrfs_err(NULL, "unrecognized discard mode value %s",
+ param->key);
+ return -EINVAL;
+ }
+ btrfs_clear_opt(ctx->mount_opt, NODISCARD);
+ break;
+ case Opt_space_cache:
+ if (result.negated) {
+ btrfs_set_opt(ctx->mount_opt, NOSPACECACHE);
+ btrfs_clear_opt(ctx->mount_opt, SPACE_CACHE);
+ btrfs_clear_opt(ctx->mount_opt, FREE_SPACE_TREE);
+ } else {
+ btrfs_clear_opt(ctx->mount_opt, FREE_SPACE_TREE);
+ btrfs_set_opt(ctx->mount_opt, SPACE_CACHE);
+ }
+ break;
+ case Opt_space_cache_version:
+ switch (result.uint_32) {
+ case Opt_space_cache_v1:
+ btrfs_set_opt(ctx->mount_opt, SPACE_CACHE);
+ btrfs_clear_opt(ctx->mount_opt, FREE_SPACE_TREE);
break;
- case Opt_enospc_debug:
- btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG);
+ case Opt_space_cache_v2:
+ btrfs_clear_opt(ctx->mount_opt, SPACE_CACHE);
+ btrfs_set_opt(ctx->mount_opt, FREE_SPACE_TREE);
break;
- case Opt_noenospc_debug:
- btrfs_clear_opt(info->mount_opt, ENOSPC_DEBUG);
+ default:
+ btrfs_err(NULL, "unrecognized space_cache value %s",
+ param->key);
+ return -EINVAL;
+ }
+ break;
+ case Opt_rescan_uuid_tree:
+ btrfs_set_opt(ctx->mount_opt, RESCAN_UUID_TREE);
+ break;
+ case Opt_clear_cache:
+ btrfs_set_opt(ctx->mount_opt, CLEAR_CACHE);
+ break;
+ case Opt_user_subvol_rm_allowed:
+ btrfs_set_opt(ctx->mount_opt, USER_SUBVOL_RM_ALLOWED);
+ break;
+ case Opt_enospc_debug:
+ if (result.negated)
+ btrfs_clear_opt(ctx->mount_opt, ENOSPC_DEBUG);
+ else
+ btrfs_set_opt(ctx->mount_opt, ENOSPC_DEBUG);
+ break;
+ case Opt_defrag:
+ if (result.negated)
+ btrfs_clear_opt(ctx->mount_opt, AUTO_DEFRAG);
+ else
+ btrfs_set_opt(ctx->mount_opt, AUTO_DEFRAG);
+ break;
+ case Opt_usebackuproot:
+ btrfs_warn(NULL,
+ "'usebackuproot' is deprecated, use 'rescue=usebackuproot' instead");
+ btrfs_set_opt(ctx->mount_opt, USEBACKUPROOT);
+
+ /* If we're loading the backup roots we can't trust the space cache. */
+ btrfs_set_opt(ctx->mount_opt, CLEAR_CACHE);
+ break;
+ case Opt_skip_balance:
+ btrfs_set_opt(ctx->mount_opt, SKIP_BALANCE);
+ break;
+ case Opt_fatal_errors:
+ switch (result.uint_32) {
+ case Opt_fatal_errors_panic:
+ btrfs_set_opt(ctx->mount_opt, PANIC_ON_FATAL_ERROR);
break;
- case Opt_defrag:
- btrfs_set_and_info(info, AUTO_DEFRAG,
- "enabling auto defrag");
+ case Opt_fatal_errors_bug:
+ btrfs_clear_opt(ctx->mount_opt, PANIC_ON_FATAL_ERROR);
break;
- case Opt_nodefrag:
- btrfs_clear_and_info(info, AUTO_DEFRAG,
- "disabling auto defrag");
+ default:
+ btrfs_err(NULL, "unrecognized fatal_errors value %s",
+ param->key);
+ return -EINVAL;
+ }
+ break;
+ case Opt_commit_interval:
+ ctx->commit_interval = result.uint_32;
+ if (ctx->commit_interval == 0)
+ ctx->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
+ break;
+ case Opt_rescue:
+ switch (result.uint_32) {
+ case Opt_rescue_usebackuproot:
+ btrfs_set_opt(ctx->mount_opt, USEBACKUPROOT);
break;
- case Opt_recovery:
- case Opt_usebackuproot:
- btrfs_warn(info,
- "'%s' is deprecated, use 'rescue=usebackuproot' instead",
- token == Opt_recovery ? "recovery" :
- "usebackuproot");
- btrfs_info(info,
- "trying to use backup root at mount time");
- btrfs_set_opt(info->mount_opt, USEBACKUPROOT);
+ case Opt_rescue_nologreplay:
+ btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY);
break;
- case Opt_skip_balance:
- btrfs_set_opt(info->mount_opt, SKIP_BALANCE);
+ case Opt_rescue_ignorebadroots:
+ btrfs_set_opt(ctx->mount_opt, IGNOREBADROOTS);
break;
- case Opt_fatal_errors:
- if (strcmp(args[0].from, "panic") == 0) {
- btrfs_set_opt(info->mount_opt,
- PANIC_ON_FATAL_ERROR);
- } else if (strcmp(args[0].from, "bug") == 0) {
- btrfs_clear_opt(info->mount_opt,
- PANIC_ON_FATAL_ERROR);
- } else {
- btrfs_err(info, "unrecognized fatal_errors value %s",
- args[0].from);
- ret = -EINVAL;
- goto out;
- }
+ case Opt_rescue_ignoredatacsums:
+ btrfs_set_opt(ctx->mount_opt, IGNOREDATACSUMS);
break;
- case Opt_commit_interval:
- intarg = 0;
- ret = match_int(&args[0], &intarg);
- if (ret) {
- btrfs_err(info, "unrecognized commit_interval value %s",
- args[0].from);
- ret = -EINVAL;
- goto out;
- }
- if (intarg == 0) {
- btrfs_info(info,
- "using default commit interval %us",
- BTRFS_DEFAULT_COMMIT_INTERVAL);
- intarg = BTRFS_DEFAULT_COMMIT_INTERVAL;
- } else if (intarg > 300) {
- btrfs_warn(info, "excessive commit interval %d",
- intarg);
- }
- info->commit_interval = intarg;
- break;
- case Opt_rescue:
- ret = parse_rescue_options(info, args[0].from);
- if (ret < 0) {
- btrfs_err(info, "unrecognized rescue value %s",
- args[0].from);
- goto out;
- }
+ case Opt_rescue_parameter_all:
+ btrfs_set_opt(ctx->mount_opt, IGNOREDATACSUMS);
+ btrfs_set_opt(ctx->mount_opt, IGNOREBADROOTS);
+ btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY);
break;
+ default:
+ btrfs_info(NULL, "unrecognized rescue option '%s'",
+ param->key);
+ return -EINVAL;
+ }
+ break;
#ifdef CONFIG_BTRFS_DEBUG
- case Opt_fragment_all:
- btrfs_info(info, "fragmenting all space");
- btrfs_set_opt(info->mount_opt, FRAGMENT_DATA);
- btrfs_set_opt(info->mount_opt, FRAGMENT_METADATA);
+ case Opt_fragment:
+ switch (result.uint_32) {
+ case Opt_fragment_parameter_all:
+ btrfs_set_opt(ctx->mount_opt, FRAGMENT_DATA);
+ btrfs_set_opt(ctx->mount_opt, FRAGMENT_METADATA);
break;
- case Opt_fragment_metadata:
- btrfs_info(info, "fragmenting metadata");
- btrfs_set_opt(info->mount_opt,
- FRAGMENT_METADATA);
+ case Opt_fragment_parameter_metadata:
+ btrfs_set_opt(ctx->mount_opt, FRAGMENT_METADATA);
break;
- case Opt_fragment_data:
- btrfs_info(info, "fragmenting data");
- btrfs_set_opt(info->mount_opt, FRAGMENT_DATA);
+ case Opt_fragment_parameter_data:
+ btrfs_set_opt(ctx->mount_opt, FRAGMENT_DATA);
break;
+ default:
+ btrfs_info(NULL, "unrecognized fragment option '%s'",
+ param->key);
+ return -EINVAL;
+ }
+ break;
#endif
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
- case Opt_ref_verify:
- btrfs_info(info, "doing ref verification");
- btrfs_set_opt(info->mount_opt, REF_VERIFY);
- break;
+ case Opt_ref_verify:
+ btrfs_set_opt(ctx->mount_opt, REF_VERIFY);
+ break;
#endif
- case Opt_err:
- btrfs_err(info, "unrecognized mount option '%s'", p);
- ret = -EINVAL;
- goto out;
- default:
- break;
- }
+ default:
+ btrfs_err(NULL, "unrecognized mount option '%s'", param->key);
+ return -EINVAL;
}
-check:
- /* We're read-only, don't have to check. */
- if (new_flags & SB_RDONLY)
- goto out;
- if (check_ro_option(info, BTRFS_MOUNT_NOLOGREPLAY, "nologreplay") ||
- check_ro_option(info, BTRFS_MOUNT_IGNOREBADROOTS, "ignorebadroots") ||
- check_ro_option(info, BTRFS_MOUNT_IGNOREDATACSUMS, "ignoredatacsums"))
- ret = -EINVAL;
-out:
- if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE) &&
- !btrfs_test_opt(info, FREE_SPACE_TREE) &&
- !btrfs_test_opt(info, CLEAR_CACHE)) {
- btrfs_err(info, "cannot disable free space tree");
- ret = -EINVAL;
- }
- if (btrfs_fs_compat_ro(info, BLOCK_GROUP_TREE) &&
- !btrfs_test_opt(info, FREE_SPACE_TREE)) {
- btrfs_err(info, "cannot disable free space tree with block-group-tree feature");
- ret = -EINVAL;
- }
- if (!ret)
- ret = btrfs_check_mountopts_zoned(info);
- if (!ret && !remounting) {
- if (btrfs_test_opt(info, SPACE_CACHE))
- btrfs_info(info, "disk space caching is enabled");
- if (btrfs_test_opt(info, FREE_SPACE_TREE))
- btrfs_info(info, "using free space tree");
- }
- return ret;
+ return 0;
}
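
The parsing above is entirely table-driven: fs_parse() matches the key against the fs_parameter_spec table and type-checks the value before the switch runs. A minimal sketch of the same pattern for a hypothetical filesystem — the myfs_* names and options are illustrative, not part of this patch:

#include <linux/fs_context.h>
#include <linux/fs_parser.h>

enum { Opt_verbose, Opt_level };

static const struct fs_parameter_spec myfs_fs_parameters[] = {
	/* fsparam_flag_no() accepts both "verbose" and "noverbose". */
	fsparam_flag_no("verbose", Opt_verbose),
	fsparam_u32("level", Opt_level),
	{}
};

static int myfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
	struct fs_parse_result result;
	int opt;

	/* Matches the key, validates the value type, and fills @result. */
	opt = fs_parse(fc, myfs_fs_parameters, param, &result);
	if (opt < 0)
		return opt;

	switch (opt) {
	case Opt_verbose:
		/* result.negated is true for the "noverbose" spelling. */
		break;
	case Opt_level:
		/* result.uint_32 holds the parsed integer. */
		break;
	}
	return 0;
}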
/*
- * Parse mount options that are required early in the mount process.
- *
- * All other options will be parsed on much later in the mount process and
- * only when we need to allocate a new super block.
+ * Some options only have meaning at mount time and shouldn't persist across
+ * remounts, or be displayed. Clear these at the end of mount and remount code
+ * paths.
*/
-static int btrfs_parse_device_options(const char *options, blk_mode_t flags)
+static void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info)
{
- substring_t args[MAX_OPT_ARGS];
- char *device_name, *opts, *orig, *p;
- struct btrfs_device *device = NULL;
- int error = 0;
+ btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT);
+ btrfs_clear_opt(fs_info->mount_opt, CLEAR_CACHE);
+ btrfs_clear_opt(fs_info->mount_opt, NOSPACECACHE);
+}
- lockdep_assert_held(&uuid_mutex);
+static bool check_ro_option(struct btrfs_fs_info *fs_info,
+ unsigned long mount_opt, unsigned long opt,
+ const char *opt_name)
+{
+ if (mount_opt & opt) {
+ btrfs_err(fs_info, "%s must be used with ro mount option",
+ opt_name);
+ return true;
+ }
+ return false;
+}
- if (!options)
- return 0;
+bool btrfs_check_options(struct btrfs_fs_info *info, unsigned long *mount_opt,
+ unsigned long flags)
+{
+ bool ret = true;
- /*
- * strsep changes the string, duplicate it because btrfs_parse_options
- * gets called later
- */
- opts = kstrdup(options, GFP_KERNEL);
- if (!opts)
- return -ENOMEM;
- orig = opts;
+ if (!(flags & SB_RDONLY) &&
+ (check_ro_option(info, *mount_opt, BTRFS_MOUNT_NOLOGREPLAY, "nologreplay") ||
+ check_ro_option(info, *mount_opt, BTRFS_MOUNT_IGNOREBADROOTS, "ignorebadroots") ||
+ check_ro_option(info, *mount_opt, BTRFS_MOUNT_IGNOREDATACSUMS, "ignoredatacsums")))
+ ret = false;
- while ((p = strsep(&opts, ",")) != NULL) {
- int token;
+ if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE) &&
+ !btrfs_raw_test_opt(*mount_opt, FREE_SPACE_TREE) &&
+ !btrfs_raw_test_opt(*mount_opt, CLEAR_CACHE)) {
+		btrfs_err(info, "cannot disable free space tree");
+ ret = false;
+ }
+ if (btrfs_fs_compat_ro(info, BLOCK_GROUP_TREE) &&
+ !btrfs_raw_test_opt(*mount_opt, FREE_SPACE_TREE)) {
+		btrfs_err(info, "cannot disable free space tree with block-group-tree feature");
+ ret = false;
+ }
- if (!*p)
- continue;
+ if (btrfs_check_mountopts_zoned(info, mount_opt))
+ ret = false;
- token = match_token(p, tokens, args);
- if (token == Opt_device) {
- device_name = match_strdup(&args[0]);
- if (!device_name) {
- error = -ENOMEM;
- goto out;
- }
- device = btrfs_scan_one_device(device_name, flags, false);
- kfree(device_name);
- if (IS_ERR(device)) {
- error = PTR_ERR(device);
- goto out;
- }
- }
+ if (!test_bit(BTRFS_FS_STATE_REMOUNTING, &info->fs_state)) {
+ if (btrfs_raw_test_opt(*mount_opt, SPACE_CACHE))
+ btrfs_info(info, "disk space caching is enabled");
+ if (btrfs_raw_test_opt(*mount_opt, FREE_SPACE_TREE))
+			btrfs_info(info, "using free space tree");
}
-out:
- kfree(orig);
- return error;
+ return ret;
}
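
As a concrete illustration of the ro requirement enforced above, a new-mount-API caller has to mark the superblock read-only before creating it with rescue options. A sketch under stated assumptions — the device path is illustrative and error handling is elided:

#include <unistd.h>
#include <sys/syscall.h>
#include <linux/mount.h>

static int mount_rescue_ro(void)
{
	int fd = syscall(SYS_fsopen, "btrfs", FSOPEN_CLOEXEC);

	syscall(SYS_fsconfig, fd, FSCONFIG_SET_STRING, "source", "/dev/sda3", 0);
	syscall(SYS_fsconfig, fd, FSCONFIG_SET_STRING, "rescue", "nologreplay", 0);
	/* Without this flag the create below fails with -EINVAL, because
	 * btrfs_check_options() sees a rw superblock with NOLOGREPLAY set. */
	syscall(SYS_fsconfig, fd, FSCONFIG_SET_FLAG, "ro", NULL, 0);
	return syscall(SYS_fsconfig, fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
}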
/*
- * Parse mount options that are related to subvolume id
+ * This is subtle: we only call this during open_ctree(). We need to pre-load
+ * the mount options with the on-disk settings. Before the new mount API took
+ * effect we would do this on every mount and remount; with the new mount API
+ * we only do this on the initial mount.
*
- * The value is later passed to mount_subvol()
+ * This isn't a change in behavior, because we're using the current state of
+ * the file system to set the current mount options. If you mounted with
+ * special options to disable these features and then remounted, we wouldn't
+ * revert the settings, because mounting without these features already
+ * cleared the on-disk settings, so applying this on remount is not needed.
*/
-static int btrfs_parse_subvol_options(const char *options, char **subvol_name,
- u64 *subvol_objectid)
+void btrfs_set_free_space_cache_settings(struct btrfs_fs_info *fs_info)
{
- substring_t args[MAX_OPT_ARGS];
- char *opts, *orig, *p;
- int error = 0;
- u64 subvolid;
-
- if (!options)
- return 0;
+ if (fs_info->sectorsize < PAGE_SIZE) {
+ btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE);
+ if (!btrfs_test_opt(fs_info, FREE_SPACE_TREE)) {
+ btrfs_info(fs_info,
+ "forcing free space tree for sector size %u with page size %lu",
+ fs_info->sectorsize, PAGE_SIZE);
+ btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE);
+ }
+ }
/*
- * strsep changes the string, duplicate it because
- * btrfs_parse_device_options gets called later
+	 * At this point our mount options are populated, so we only mess with
+	 * these settings if nothing is set already.
*/
- opts = kstrdup(options, GFP_KERNEL);
- if (!opts)
- return -ENOMEM;
- orig = opts;
+ if (btrfs_test_opt(fs_info, FREE_SPACE_TREE))
+ return;
- while ((p = strsep(&opts, ",")) != NULL) {
- int token;
- if (!*p)
- continue;
+ if (btrfs_is_zoned(fs_info) &&
+ btrfs_free_space_cache_v1_active(fs_info)) {
+ btrfs_info(fs_info, "zoned: clearing existing space cache");
+ btrfs_set_super_cache_generation(fs_info->super_copy, 0);
+ return;
+ }
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_subvol:
- kfree(*subvol_name);
- *subvol_name = match_strdup(&args[0]);
- if (!*subvol_name) {
- error = -ENOMEM;
- goto out;
- }
- break;
- case Opt_subvolid:
- error = match_u64(&args[0], &subvolid);
- if (error)
- goto out;
+ if (btrfs_test_opt(fs_info, SPACE_CACHE))
+ return;
- /* we want the original fs_tree */
- if (subvolid == 0)
- subvolid = BTRFS_FS_TREE_OBJECTID;
+ if (btrfs_test_opt(fs_info, NOSPACECACHE))
+ return;
- *subvol_objectid = subvolid;
- break;
- default:
- break;
- }
- }
+ /*
+ * At this point we don't have explicit options set by the user, set
+ * them ourselves based on the state of the file system.
+ */
+ if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+ btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE);
+ else if (btrfs_free_space_cache_v1_active(fs_info))
+ btrfs_set_opt(fs_info->mount_opt, SPACE_CACHE);
+}
-out:
- kfree(orig);
- return error;
+static void set_device_specific_options(struct btrfs_fs_info *fs_info)
+{
+ if (!btrfs_test_opt(fs_info, NOSSD) &&
+ !fs_info->fs_devices->rotating)
+ btrfs_set_opt(fs_info->mount_opt, SSD);
+
+ /*
+	 * For devices supporting discard, turn on discard=async automatically,
+	 * unless it's already set or disabled. This can still be turned off
+	 * with nodiscard for the same mount.
+	 *
+	 * Zoned mode piggybacks on the discard functionality for resetting a
+	 * zone. There is no reason to delay the zone reset as it is fast
+	 * enough, so do not enable async discard for zoned mode.
+ */
+ if (!(btrfs_test_opt(fs_info, DISCARD_SYNC) ||
+ btrfs_test_opt(fs_info, DISCARD_ASYNC) ||
+ btrfs_test_opt(fs_info, NODISCARD)) &&
+ fs_info->fs_devices->discardable &&
+ !btrfs_is_zoned(fs_info))
+ btrfs_set_opt(fs_info->mount_opt, DISCARD_ASYNC);
}
char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
@@ -1105,10 +929,6 @@ static int btrfs_fill_super(struct super_block *sb,
#endif
sb->s_xattr = btrfs_xattr_handlers;
sb->s_time_gran = 1;
-#ifdef CONFIG_BTRFS_FS_POSIX_ACL
- sb->s_flags |= SB_POSIXACL;
-#endif
- sb->s_flags |= SB_I_VERSION;
sb->s_iflags |= SB_I_CGROUPWB;
err = super_setup_bdi(sb);
@@ -1291,22 +1111,6 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
return 0;
}
-static int btrfs_test_super(struct super_block *s, void *data)
-{
- struct btrfs_fs_info *p = data;
- struct btrfs_fs_info *fs_info = btrfs_sb(s);
-
- return fs_info->fs_devices == p->fs_devices;
-}
-
-static int btrfs_set_super(struct super_block *s, void *data)
-{
- int err = set_anon_super(s, data);
- if (!err)
- s->s_fs_info = data;
- return err;
-}
-
/*
* subvolumes are identified by ino 256
*/
@@ -1382,200 +1186,6 @@ out:
return root;
}
-/*
- * Find a superblock for the given device / mount point.
- *
- * Note: This is based on mount_bdev from fs/super.c with a few additions
- * for multiple device setup. Make sure to keep it in sync.
- */
-static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
- int flags, const char *device_name, void *data)
-{
- struct block_device *bdev = NULL;
- struct super_block *s;
- struct btrfs_device *device = NULL;
- struct btrfs_fs_devices *fs_devices = NULL;
- struct btrfs_fs_info *fs_info = NULL;
- void *new_sec_opts = NULL;
- blk_mode_t mode = sb_open_mode(flags);
- int error = 0;
-
- if (data) {
- error = security_sb_eat_lsm_opts(data, &new_sec_opts);
- if (error)
- return ERR_PTR(error);
- }
-
- /*
- * Setup a dummy root and fs_info for test/set super. This is because
- * we don't actually fill this stuff out until open_ctree, but we need
- * then open_ctree will properly initialize the file system specific
- * settings later. btrfs_init_fs_info initializes the static elements
- * of the fs_info (locks and such) to make cleanup easier if we find a
- * superblock with our given fs_devices later on at sget() time.
- */
- fs_info = kvzalloc(sizeof(struct btrfs_fs_info), GFP_KERNEL);
- if (!fs_info) {
- error = -ENOMEM;
- goto error_sec_opts;
- }
- btrfs_init_fs_info(fs_info);
-
- fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
- fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
- if (!fs_info->super_copy || !fs_info->super_for_commit) {
- error = -ENOMEM;
- goto error_fs_info;
- }
-
- mutex_lock(&uuid_mutex);
- error = btrfs_parse_device_options(data, mode);
- if (error) {
- mutex_unlock(&uuid_mutex);
- goto error_fs_info;
- }
-
- /*
- * With 'true' passed to btrfs_scan_one_device() (mount time) we expect
- * either a valid device or an error.
- */
- device = btrfs_scan_one_device(device_name, mode, true);
- ASSERT(device != NULL);
- if (IS_ERR(device)) {
- mutex_unlock(&uuid_mutex);
- error = PTR_ERR(device);
- goto error_fs_info;
- }
-
- fs_devices = device->fs_devices;
- fs_info->fs_devices = fs_devices;
-
- error = btrfs_open_devices(fs_devices, mode, fs_type);
- mutex_unlock(&uuid_mutex);
- if (error)
- goto error_fs_info;
-
- if (!(flags & SB_RDONLY) && fs_devices->rw_devices == 0) {
- error = -EACCES;
- goto error_close_devices;
- }
-
- bdev = fs_devices->latest_dev->bdev;
- s = sget(fs_type, btrfs_test_super, btrfs_set_super, flags | SB_NOSEC,
- fs_info);
- if (IS_ERR(s)) {
- error = PTR_ERR(s);
- goto error_close_devices;
- }
-
- if (s->s_root) {
- btrfs_close_devices(fs_devices);
- btrfs_free_fs_info(fs_info);
- if ((flags ^ s->s_flags) & SB_RDONLY)
- error = -EBUSY;
- } else {
- snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
- shrinker_debugfs_rename(s->s_shrink, "sb-%s:%s", fs_type->name,
- s->s_id);
- btrfs_sb(s)->bdev_holder = fs_type;
- error = btrfs_fill_super(s, fs_devices, data);
- }
- if (!error)
- error = security_sb_set_mnt_opts(s, new_sec_opts, 0, NULL);
- security_free_mnt_opts(&new_sec_opts);
- if (error) {
- deactivate_locked_super(s);
- return ERR_PTR(error);
- }
-
- return dget(s->s_root);
-
-error_close_devices:
- btrfs_close_devices(fs_devices);
-error_fs_info:
- btrfs_free_fs_info(fs_info);
-error_sec_opts:
- security_free_mnt_opts(&new_sec_opts);
- return ERR_PTR(error);
-}
-
-/*
- * Mount function which is called by VFS layer.
- *
- * In order to allow mounting a subvolume directly, btrfs uses mount_subtree()
- * which needs vfsmount* of device's root (/). This means device's root has to
- * be mounted internally in any case.
- *
- * Operation flow:
- * 1. Parse subvol id related options for later use in mount_subvol().
- *
- * 2. Mount device's root (/) by calling vfs_kern_mount().
- *
- * NOTE: vfs_kern_mount() is used by VFS to call btrfs_mount() in the
- * first place. In order to avoid calling btrfs_mount() again, we use
- * different file_system_type which is not registered to VFS by
- * register_filesystem() (btrfs_root_fs_type). As a result,
- * btrfs_mount_root() is called. The return value will be used by
- * mount_subtree() in mount_subvol().
- *
- * 3. Call mount_subvol() to get the dentry of subvolume. Since there is
- * "btrfs subvolume set-default", mount_subvol() is called always.
- */
-static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
- const char *device_name, void *data)
-{
- struct vfsmount *mnt_root;
- struct dentry *root;
- char *subvol_name = NULL;
- u64 subvol_objectid = 0;
- int error = 0;
-
- error = btrfs_parse_subvol_options(data, &subvol_name,
- &subvol_objectid);
- if (error) {
- kfree(subvol_name);
- return ERR_PTR(error);
- }
-
- /* mount device's root (/) */
- mnt_root = vfs_kern_mount(&btrfs_root_fs_type, flags, device_name, data);
- if (PTR_ERR_OR_ZERO(mnt_root) == -EBUSY) {
- if (flags & SB_RDONLY) {
- mnt_root = vfs_kern_mount(&btrfs_root_fs_type,
- flags & ~SB_RDONLY, device_name, data);
- } else {
- mnt_root = vfs_kern_mount(&btrfs_root_fs_type,
- flags | SB_RDONLY, device_name, data);
- if (IS_ERR(mnt_root)) {
- root = ERR_CAST(mnt_root);
- kfree(subvol_name);
- goto out;
- }
-
- down_write(&mnt_root->mnt_sb->s_umount);
- error = btrfs_remount(mnt_root->mnt_sb, &flags, NULL);
- up_write(&mnt_root->mnt_sb->s_umount);
- if (error < 0) {
- root = ERR_PTR(error);
- mntput(mnt_root);
- kfree(subvol_name);
- goto out;
- }
- }
- }
- if (IS_ERR(mnt_root)) {
- root = ERR_CAST(mnt_root);
- kfree(subvol_name);
- goto out;
- }
-
- /* mount_subvol() will free subvol_name and mnt_root */
- root = mount_subvol(subvol_name, subvol_objectid, mnt_root);
-
-out:
- return root;
-}
-
static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
u32 new_pool_size, u32 old_pool_size)
{
@@ -1638,202 +1248,274 @@ static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info,
btrfs_set_free_space_cache_v1_active(fs_info, cache_opt);
}
-static int btrfs_remount(struct super_block *sb, int *flags, char *data)
+static int btrfs_remount_rw(struct btrfs_fs_info *fs_info)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(sb);
- unsigned old_flags = sb->s_flags;
- unsigned long old_opts = fs_info->mount_opt;
- unsigned long old_compress_type = fs_info->compress_type;
- u64 old_max_inline = fs_info->max_inline;
- u32 old_thread_pool_size = fs_info->thread_pool_size;
- u32 old_metadata_ratio = fs_info->metadata_ratio;
int ret;
- sync_filesystem(sb);
- set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
+ if (BTRFS_FS_ERROR(fs_info)) {
+ btrfs_err(fs_info,
+ "remounting read-write after error is not allowed");
+ return -EINVAL;
+ }
- if (data) {
- void *new_sec_opts = NULL;
+ if (fs_info->fs_devices->rw_devices == 0)
+ return -EACCES;
- ret = security_sb_eat_lsm_opts(data, &new_sec_opts);
- if (!ret)
- ret = security_sb_remount(sb, new_sec_opts);
- security_free_mnt_opts(&new_sec_opts);
- if (ret)
- goto restore;
+ if (!btrfs_check_rw_degradable(fs_info, NULL)) {
+ btrfs_warn(fs_info,
+ "too many missing devices, writable remount is not allowed");
+ return -EACCES;
+ }
+
+ if (btrfs_super_log_root(fs_info->super_copy) != 0) {
+ btrfs_warn(fs_info,
+ "mount required to replay tree-log, cannot remount read-write");
+ return -EINVAL;
}
- ret = btrfs_parse_options(fs_info, data, *flags);
+ /*
+	 * NOTE: when remounting with a change that does writes, don't put it
+	 * anywhere above this point, as it is not guaranteed to be safe to
+	 * write until we pass the above checks.
+ */
+ ret = btrfs_start_pre_rw_mount(fs_info);
if (ret)
- goto restore;
+ return ret;
- ret = btrfs_check_features(fs_info, !(*flags & SB_RDONLY));
- if (ret < 0)
- goto restore;
+ btrfs_clear_sb_rdonly(fs_info->sb);
- btrfs_remount_begin(fs_info, old_opts, *flags);
- btrfs_resize_thread_pool(fs_info,
- fs_info->thread_pool_size, old_thread_pool_size);
+ set_bit(BTRFS_FS_OPEN, &fs_info->flags);
- if ((bool)btrfs_test_opt(fs_info, FREE_SPACE_TREE) !=
- (bool)btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
- (!sb_rdonly(sb) || (*flags & SB_RDONLY))) {
- btrfs_warn(fs_info,
- "remount supports changing free space tree only from ro to rw");
- /* Make sure free space cache options match the state on disk */
- if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
- btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE);
- btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE);
- }
- if (btrfs_free_space_cache_v1_active(fs_info)) {
- btrfs_clear_opt(fs_info->mount_opt, FREE_SPACE_TREE);
- btrfs_set_opt(fs_info->mount_opt, SPACE_CACHE);
- }
- }
+ /*
+ * If we've gone from readonly -> read-write, we need to get our
+ * sync/async discard lists in the right state.
+ */
+ btrfs_discard_resume(fs_info);
- if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb))
- goto out;
+ return 0;
+}
- if (*flags & SB_RDONLY) {
- /*
- * this also happens on 'umount -rf' or on shutdown, when
- * the filesystem is busy.
- */
- cancel_work_sync(&fs_info->async_reclaim_work);
- cancel_work_sync(&fs_info->async_data_reclaim_work);
+static int btrfs_remount_ro(struct btrfs_fs_info *fs_info)
+{
+ /*
+ * This also happens on 'umount -rf' or on shutdown, when the
+ * filesystem is busy.
+ */
+ cancel_work_sync(&fs_info->async_reclaim_work);
+ cancel_work_sync(&fs_info->async_data_reclaim_work);
- btrfs_discard_cleanup(fs_info);
+ btrfs_discard_cleanup(fs_info);
- /* wait for the uuid_scan task to finish */
- down(&fs_info->uuid_tree_rescan_sem);
- /* avoid complains from lockdep et al. */
- up(&fs_info->uuid_tree_rescan_sem);
+ /* Wait for the uuid_scan task to finish */
+ down(&fs_info->uuid_tree_rescan_sem);
+	/* Avoid complaints from lockdep et al. */
+ up(&fs_info->uuid_tree_rescan_sem);
- btrfs_set_sb_rdonly(sb);
+ btrfs_set_sb_rdonly(fs_info->sb);
- /*
- * Setting SB_RDONLY will put the cleaner thread to
- * sleep at the next loop if it's already active.
- * If it's already asleep, we'll leave unused block
- * groups on disk until we're mounted read-write again
- * unless we clean them up here.
- */
- btrfs_delete_unused_bgs(fs_info);
+ /*
+ * Setting SB_RDONLY will put the cleaner thread to sleep at the next
+ * loop if it's already active. If it's already asleep, we'll leave
+ * unused block groups on disk until we're mounted read-write again
+ * unless we clean them up here.
+ */
+ btrfs_delete_unused_bgs(fs_info);
- /*
- * The cleaner task could be already running before we set the
- * flag BTRFS_FS_STATE_RO (and SB_RDONLY in the superblock).
- * We must make sure that after we finish the remount, i.e. after
- * we call btrfs_commit_super(), the cleaner can no longer start
- * a transaction - either because it was dropping a dead root,
- * running delayed iputs or deleting an unused block group (the
- * cleaner picked a block group from the list of unused block
- * groups before we were able to in the previous call to
- * btrfs_delete_unused_bgs()).
- */
- wait_on_bit(&fs_info->flags, BTRFS_FS_CLEANER_RUNNING,
- TASK_UNINTERRUPTIBLE);
+ /*
+ * The cleaner task could be already running before we set the flag
+ * BTRFS_FS_STATE_RO (and SB_RDONLY in the superblock). We must make
+ * sure that after we finish the remount, i.e. after we call
+ * btrfs_commit_super(), the cleaner can no longer start a transaction
+ * - either because it was dropping a dead root, running delayed iputs
+ * or deleting an unused block group (the cleaner picked a block
+ * group from the list of unused block groups before we were able to
+ * in the previous call to btrfs_delete_unused_bgs()).
+ */
+ wait_on_bit(&fs_info->flags, BTRFS_FS_CLEANER_RUNNING, TASK_UNINTERRUPTIBLE);
- /*
- * We've set the superblock to RO mode, so we might have made
- * the cleaner task sleep without running all pending delayed
- * iputs. Go through all the delayed iputs here, so that if an
- * unmount happens without remounting RW we don't end up at
- * finishing close_ctree() with a non-empty list of delayed
- * iputs.
- */
- btrfs_run_delayed_iputs(fs_info);
+ /*
+ * We've set the superblock to RO mode, so we might have made the
+ * cleaner task sleep without running all pending delayed iputs. Go
+ * through all the delayed iputs here, so that if an unmount happens
+ * without remounting RW we don't end up at finishing close_ctree()
+ * with a non-empty list of delayed iputs.
+ */
+ btrfs_run_delayed_iputs(fs_info);
- btrfs_dev_replace_suspend_for_unmount(fs_info);
- btrfs_scrub_cancel(fs_info);
- btrfs_pause_balance(fs_info);
+ btrfs_dev_replace_suspend_for_unmount(fs_info);
+ btrfs_scrub_cancel(fs_info);
+ btrfs_pause_balance(fs_info);
- /*
- * Pause the qgroup rescan worker if it is running. We don't want
- * it to be still running after we are in RO mode, as after that,
- * by the time we unmount, it might have left a transaction open,
- * so we would leak the transaction and/or crash.
- */
- btrfs_qgroup_wait_for_completion(fs_info, false);
+ /*
+ * Pause the qgroup rescan worker if it is running. We don't want it to
+ * be still running after we are in RO mode, as after that, by the time
+ * we unmount, it might have left a transaction open, so we would leak
+ * the transaction and/or crash.
+ */
+ btrfs_qgroup_wait_for_completion(fs_info, false);
- ret = btrfs_commit_super(fs_info);
- if (ret)
- goto restore;
- } else {
- if (BTRFS_FS_ERROR(fs_info)) {
- btrfs_err(fs_info,
- "Remounting read-write after error is not allowed");
- ret = -EINVAL;
- goto restore;
- }
- if (fs_info->fs_devices->rw_devices == 0) {
- ret = -EACCES;
- goto restore;
- }
+ return btrfs_commit_super(fs_info);
+}
- if (!btrfs_check_rw_degradable(fs_info, NULL)) {
- btrfs_warn(fs_info,
- "too many missing devices, writable remount is not allowed");
- ret = -EACCES;
- goto restore;
- }
+static void btrfs_ctx_to_info(struct btrfs_fs_info *fs_info, struct btrfs_fs_context *ctx)
+{
+ fs_info->max_inline = ctx->max_inline;
+ fs_info->commit_interval = ctx->commit_interval;
+ fs_info->metadata_ratio = ctx->metadata_ratio;
+ fs_info->thread_pool_size = ctx->thread_pool_size;
+ fs_info->mount_opt = ctx->mount_opt;
+ fs_info->compress_type = ctx->compress_type;
+ fs_info->compress_level = ctx->compress_level;
+}
- if (btrfs_super_log_root(fs_info->super_copy) != 0) {
- btrfs_warn(fs_info,
- "mount required to replay tree-log, cannot remount read-write");
- ret = -EINVAL;
- goto restore;
- }
+static void btrfs_info_to_ctx(struct btrfs_fs_info *fs_info, struct btrfs_fs_context *ctx)
+{
+ ctx->max_inline = fs_info->max_inline;
+ ctx->commit_interval = fs_info->commit_interval;
+ ctx->metadata_ratio = fs_info->metadata_ratio;
+ ctx->thread_pool_size = fs_info->thread_pool_size;
+ ctx->mount_opt = fs_info->mount_opt;
+ ctx->compress_type = fs_info->compress_type;
+ ctx->compress_level = fs_info->compress_level;
+}
- /*
- * NOTE: when remounting with a change that does writes, don't
- * put it anywhere above this point, as we are not sure to be
- * safe to write until we pass the above checks.
- */
- ret = btrfs_start_pre_rw_mount(fs_info);
- if (ret)
- goto restore;
+#define btrfs_info_if_set(fs_info, old_ctx, opt, fmt, args...) \
+do { \
+ if ((!old_ctx || !btrfs_raw_test_opt(old_ctx->mount_opt, opt)) && \
+ btrfs_raw_test_opt(fs_info->mount_opt, opt)) \
+ btrfs_info(fs_info, fmt, ##args); \
+} while (0)
+
+#define btrfs_info_if_unset(fs_info, old_ctx, opt, fmt, args...) \
+do { \
+ if ((old_ctx && btrfs_raw_test_opt(old_ctx->mount_opt, opt)) && \
+ !btrfs_raw_test_opt(fs_info->mount_opt, opt)) \
+ btrfs_info(fs_info, fmt, ##args); \
+} while (0)
+
+static void btrfs_emit_options(struct btrfs_fs_info *info,
+ struct btrfs_fs_context *old)
+{
+ btrfs_info_if_set(info, old, NODATASUM, "setting nodatasum");
+ btrfs_info_if_set(info, old, DEGRADED, "allowing degraded mounts");
+	btrfs_info_if_set(info, old, NODATACOW, "setting nodatacow");
+ btrfs_info_if_set(info, old, SSD, "enabling ssd optimizations");
+ btrfs_info_if_set(info, old, SSD_SPREAD, "using spread ssd allocation scheme");
+ btrfs_info_if_set(info, old, NOBARRIER, "turning off barriers");
+ btrfs_info_if_set(info, old, NOTREELOG, "disabling tree log");
+ btrfs_info_if_set(info, old, NOLOGREPLAY, "disabling log replay at mount time");
+ btrfs_info_if_set(info, old, FLUSHONCOMMIT, "turning on flush-on-commit");
+ btrfs_info_if_set(info, old, DISCARD_SYNC, "turning on sync discard");
+ btrfs_info_if_set(info, old, DISCARD_ASYNC, "turning on async discard");
+ btrfs_info_if_set(info, old, FREE_SPACE_TREE, "enabling free space tree");
+ btrfs_info_if_set(info, old, SPACE_CACHE, "enabling disk space caching");
+ btrfs_info_if_set(info, old, CLEAR_CACHE, "force clearing of disk cache");
+ btrfs_info_if_set(info, old, AUTO_DEFRAG, "enabling auto defrag");
+ btrfs_info_if_set(info, old, FRAGMENT_DATA, "fragmenting data");
+ btrfs_info_if_set(info, old, FRAGMENT_METADATA, "fragmenting metadata");
+ btrfs_info_if_set(info, old, REF_VERIFY, "doing ref verification");
+ btrfs_info_if_set(info, old, USEBACKUPROOT, "trying to use backup root at mount time");
+ btrfs_info_if_set(info, old, IGNOREBADROOTS, "ignoring bad roots");
+ btrfs_info_if_set(info, old, IGNOREDATACSUMS, "ignoring data csums");
+
+ btrfs_info_if_unset(info, old, NODATACOW, "setting datacow");
+ btrfs_info_if_unset(info, old, SSD, "not using ssd optimizations");
+ btrfs_info_if_unset(info, old, SSD_SPREAD, "not using spread ssd allocation scheme");
+	btrfs_info_if_unset(info, old, NOBARRIER, "turning on barriers");
+ btrfs_info_if_unset(info, old, NOTREELOG, "enabling tree log");
+ btrfs_info_if_unset(info, old, SPACE_CACHE, "disabling disk space caching");
+ btrfs_info_if_unset(info, old, FREE_SPACE_TREE, "disabling free space tree");
+ btrfs_info_if_unset(info, old, AUTO_DEFRAG, "disabling auto defrag");
+ btrfs_info_if_unset(info, old, COMPRESS, "use no compression");
+
+ /* Did the compression settings change? */
+ if (btrfs_test_opt(info, COMPRESS) &&
+ (!old ||
+ old->compress_type != info->compress_type ||
+ old->compress_level != info->compress_level ||
+ (!btrfs_raw_test_opt(old->mount_opt, FORCE_COMPRESS) &&
+ btrfs_raw_test_opt(info->mount_opt, FORCE_COMPRESS)))) {
+ const char *compress_type = btrfs_compress_type2str(info->compress_type);
+
+ btrfs_info(info, "%s %s compression, level %d",
+ btrfs_test_opt(info, FORCE_COMPRESS) ? "force" : "use",
+ compress_type, info->compress_level);
+ }
- btrfs_clear_sb_rdonly(sb);
+ if (info->max_inline != BTRFS_DEFAULT_MAX_INLINE)
+ btrfs_info(info, "max_inline set to %llu", info->max_inline);
+}
- set_bit(BTRFS_FS_OPEN, &fs_info->flags);
+static int btrfs_reconfigure(struct fs_context *fc)
+{
+ struct super_block *sb = fc->root->d_sb;
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ struct btrfs_fs_context *ctx = fc->fs_private;
+ struct btrfs_fs_context old_ctx;
+ int ret = 0;
+ bool mount_reconfigure = (fc->s_fs_info != NULL);
- /*
- * If we've gone from readonly -> read/write, we need to get
- * our sync/async discard lists in the right state.
- */
- btrfs_discard_resume(fs_info);
+ btrfs_info_to_ctx(fs_info, &old_ctx);
+
+ sync_filesystem(sb);
+ set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
+
+ if (!mount_reconfigure &&
+ !btrfs_check_options(fs_info, &ctx->mount_opt, fc->sb_flags))
+ return -EINVAL;
+
+ ret = btrfs_check_features(fs_info, !(fc->sb_flags & SB_RDONLY));
+ if (ret < 0)
+ return ret;
+
+ btrfs_ctx_to_info(fs_info, ctx);
+ btrfs_remount_begin(fs_info, old_ctx.mount_opt, fc->sb_flags);
+ btrfs_resize_thread_pool(fs_info, fs_info->thread_pool_size,
+ old_ctx.thread_pool_size);
+
+ if ((bool)btrfs_test_opt(fs_info, FREE_SPACE_TREE) !=
+ (bool)btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
+ (!sb_rdonly(sb) || (fc->sb_flags & SB_RDONLY))) {
+ btrfs_warn(fs_info,
+ "remount supports changing free space tree only from RO to RW");
+ /* Make sure free space cache options match the state on disk. */
+ if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
+ btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE);
+ btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE);
+ }
+ if (btrfs_free_space_cache_v1_active(fs_info)) {
+ btrfs_clear_opt(fs_info->mount_opt, FREE_SPACE_TREE);
+ btrfs_set_opt(fs_info->mount_opt, SPACE_CACHE);
+ }
}
-out:
+
+ ret = 0;
+ if (!sb_rdonly(sb) && (fc->sb_flags & SB_RDONLY))
+ ret = btrfs_remount_ro(fs_info);
+ else if (sb_rdonly(sb) && !(fc->sb_flags & SB_RDONLY))
+ ret = btrfs_remount_rw(fs_info);
+ if (ret)
+ goto restore;
+
/*
- * We need to set SB_I_VERSION here otherwise it'll get cleared by VFS,
- * since the absence of the flag means it can be toggled off by remount.
+	 * If we set the mask during parameter parsing, the VFS would reject
+	 * the remount. Here we can set the mask and the value will be updated
+	 * appropriately.
*/
- *flags |= SB_I_VERSION;
+ if ((fc->sb_flags & SB_POSIXACL) != (sb->s_flags & SB_POSIXACL))
+ fc->sb_flags_mask |= SB_POSIXACL;
+ btrfs_emit_options(fs_info, &old_ctx);
wake_up_process(fs_info->transaction_kthread);
- btrfs_remount_cleanup(fs_info, old_opts);
+ btrfs_remount_cleanup(fs_info, old_ctx.mount_opt);
btrfs_clear_oneshot_options(fs_info);
clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
return 0;
-
restore:
- /* We've hit an error - don't reset SB_RDONLY */
- if (sb_rdonly(sb))
- old_flags |= SB_RDONLY;
- if (!(old_flags & SB_RDONLY))
- clear_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state);
- sb->s_flags = old_flags;
- fs_info->mount_opt = old_opts;
- fs_info->compress_type = old_compress_type;
- fs_info->max_inline = old_max_inline;
- btrfs_resize_thread_pool(fs_info,
- old_thread_pool_size, fs_info->thread_pool_size);
- fs_info->metadata_ratio = old_metadata_ratio;
- btrfs_remount_cleanup(fs_info, old_opts);
+ btrfs_ctx_to_info(fs_info, &old_ctx);
+ btrfs_remount_cleanup(fs_info, old_ctx.mount_opt);
clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
-
return ret;
}
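
The same function services a remount issued through the new API. A sketch under stated assumptions — the mount point is illustrative and error handling is elided:

#include <fcntl.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/mount.h>

static int remount_ro(const char *mntpoint)
{
	int fd = syscall(SYS_fspick, AT_FDCWD, mntpoint, FSPICK_CLOEXEC);

	syscall(SYS_fsconfig, fd, FSCONFIG_SET_FLAG, "ro", NULL, 0);
	/* Ends up in btrfs_reconfigure() with SB_RDONLY set in fc->sb_flags,
	 * taking the btrfs_remount_ro() branch above. */
	return syscall(SYS_fsconfig, fd, FSCONFIG_CMD_RECONFIGURE, NULL, NULL, 0);
}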
@@ -2094,6 +1776,309 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
return 0;
}
+static int btrfs_fc_test_super(struct super_block *sb, struct fs_context *fc)
+{
+ struct btrfs_fs_info *p = fc->s_fs_info;
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+
+ return fs_info->fs_devices == p->fs_devices;
+}
+
+static int btrfs_get_tree_super(struct fs_context *fc)
+{
+ struct btrfs_fs_info *fs_info = fc->s_fs_info;
+ struct btrfs_fs_context *ctx = fc->fs_private;
+ struct btrfs_fs_devices *fs_devices = NULL;
+ struct block_device *bdev;
+ struct btrfs_device *device;
+ struct super_block *sb;
+ blk_mode_t mode = btrfs_open_mode(fc);
+ int ret;
+
+ btrfs_ctx_to_info(fs_info, ctx);
+ mutex_lock(&uuid_mutex);
+
+ /*
+ * With 'true' passed to btrfs_scan_one_device() (mount time) we expect
+ * either a valid device or an error.
+ */
+ device = btrfs_scan_one_device(fc->source, mode, true);
+ ASSERT(device != NULL);
+ if (IS_ERR(device)) {
+ mutex_unlock(&uuid_mutex);
+ return PTR_ERR(device);
+ }
+
+ fs_devices = device->fs_devices;
+ fs_info->fs_devices = fs_devices;
+
+ ret = btrfs_open_devices(fs_devices, mode, &btrfs_fs_type);
+ mutex_unlock(&uuid_mutex);
+ if (ret)
+ return ret;
+
+ if (!(fc->sb_flags & SB_RDONLY) && fs_devices->rw_devices == 0) {
+ ret = -EACCES;
+ goto error;
+ }
+
+ bdev = fs_devices->latest_dev->bdev;
+
+ /*
+ * From now on the error handling is not straightforward.
+ *
+ * If successful, this will transfer the fs_info into the super block,
+ * and fc->s_fs_info will be NULL. However if there's an existing
+ * super, we'll still have fc->s_fs_info populated. If we error
+ * completely out it'll be cleaned up when we drop the fs_context,
+ * otherwise it's tied to the lifetime of the super_block.
+ */
+ sb = sget_fc(fc, btrfs_fc_test_super, set_anon_super_fc);
+ if (IS_ERR(sb)) {
+ ret = PTR_ERR(sb);
+ goto error;
+ }
+
+ set_device_specific_options(fs_info);
+
+ if (sb->s_root) {
+ btrfs_close_devices(fs_devices);
+ if ((fc->sb_flags ^ sb->s_flags) & SB_RDONLY)
+ ret = -EBUSY;
+ } else {
+ snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev);
+ shrinker_debugfs_rename(sb->s_shrink, "sb-btrfs:%s", sb->s_id);
+ btrfs_sb(sb)->bdev_holder = &btrfs_fs_type;
+ ret = btrfs_fill_super(sb, fs_devices, NULL);
+ }
+
+ if (ret) {
+ deactivate_locked_super(sb);
+ return ret;
+ }
+
+ btrfs_clear_oneshot_options(fs_info);
+
+ fc->root = dget(sb->s_root);
+ return 0;
+
+error:
+ btrfs_close_devices(fs_devices);
+ return ret;
+}
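
A note on wiring: get_tree-style supers are plugged in via struct fs_context_operations. The btrfs hookup lives outside this hunk, so the init function and the exact get_tree dispatch below are assumptions; the sketch only shows the shape such a registration typically takes:

#include <linux/fs_context.h>
#include <linux/slab.h>

/* Names here are assumptions; the real btrfs ops table is defined later. */
static const struct fs_context_operations myfs_context_ops = {
	.parse_param	= btrfs_parse_param,
	.reconfigure	= btrfs_reconfigure,
	.get_tree	= btrfs_get_tree_super,
	/* .free would release fc->fs_private / fc->s_fs_info on error. */
};

static int myfs_init_fs_context(struct fs_context *fc)
{
	struct btrfs_fs_context *ctx;

	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
	if (!ctx)
		return -ENOMEM;

	fc->fs_private = ctx;
	fc->ops = &myfs_context_ops;
	return 0;
}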
+
+/*
+ * Ever since commit 0723a0473fb4 ("btrfs: allow mounting btrfs subvolumes
+ * with different ro/rw options") the following works:
+ *
+ * (i) mount /dev/sda3 -o subvol=foo,ro /mnt/foo
+ * (ii) mount /dev/sda3 -o subvol=bar,rw /mnt/bar
+ *
+ * which looks nice and innocent but is actually pretty intricate and deserves
+ * a long comment.
+ *
+ * On another filesystem a subvolume mount is close to something like:
+ *
+ * (iii) # create rw superblock + initial mount
+ * mount -t xfs /dev/sdb /opt/
+ *
+ * # create ro bind mount
+ * mount --bind -o ro /opt/foo /mnt/foo
+ *
+ * # unmount initial mount
+ * umount /opt
+ *
+ * Of course, there's some special subvolume sauce and there's the fact that the
+ * sb->s_root dentry is really swapped after mount_subtree(). But conceptually
+ * it's very close and will help us understand the issue.
+ *
+ * The old mount API didn't cleanly distinguish between a mount being made ro
+ * and a superblock being made ro. The only way to change the ro state of
+ * either object was by passing MS_RDONLY. If a new mount was created via
+ * mount(2) such as:
+ *
+ *      mount("/dev/sdb", "/mnt", "xfs", MS_RDONLY, NULL);
+ *
+ * the MS_RDONLY flag being specified had two effects:
+ *
+ * (1) MNT_READONLY was raised -> the resulting mount got
+ * @mnt->mnt_flags |= MNT_READONLY raised.
+ *
+ * (2) MS_RDONLY was passed to the filesystem's mount method and the filesystem
+ * made the superblock ro. Note how SB_RDONLY has the same value as
+ * MS_RDONLY and is raised whenever MS_RDONLY is passed through mount(2).
+ *
+ * Creating a subtree mount via (iii) ends up leaving a rw superblock with a
+ * subtree mounted ro.
+ *
+ * But consider the effect of the old mount API on btrfs subvolume mounting,
+ * which combines the distinct steps in (iii) into a single step.
+ *
+ * By issuing (i) both the mount and the superblock are turned ro. Now when (ii)
+ * is issued the superblock is ro and thus even if the mount created for (ii) is
+ * rw it wouldn't help. Hence, btrfs needed to transition the superblock from ro
+ * to rw for (ii) which it did using an internal remount call.
+ *
+ * IOW, subvolume mounting was inherently complicated due to the ambiguity of
+ * MS_RDONLY in mount(2). Note that mount(8) always translates "ro" to
+ * MS_RDONLY, so in both (i) and (ii) "ro" becomes MS_RDONLY when passed by
+ * mount(8) to mount(2).
+ *
+ * Enter the new mount API. The new mount API disambiguates making a mount ro
+ * and making a superblock ro.
+ *
+ * (3) To turn a mount ro the MOUNT_ATTR_RDONLY flag can be used with either
+ * fsmount() or mount_setattr(); this is a pure VFS level change for a
+ * specific mount or mount tree that is never seen by the filesystem itself.
+ *
+ * (4) To turn a superblock ro the "ro" flag must be used with
+ * fsconfig(FSCONFIG_SET_FLAG, "ro"). This option is seen by the filesystem
+ * in fc->sb_flags.
+ *
+ * This disambiguation has rather positive consequences. Mounting a subvolume
+ * ro will not also turn the superblock ro. Only the mount for the subvolume
+ * will become ro.
+ *
+ * So, if the superblock creation request comes from the new mount API the
+ * caller must have explicitly done:
+ *
+ * fsconfig(FSCONFIG_SET_FLAG, "ro")
+ * fsmount/mount_setattr(MOUNT_ATTR_RDONLY)
+ *
+ * IOW, at some point the caller must have explicitly turned the whole
+ * superblock ro and we shouldn't just undo it like we did for the old mount
+ * API. In any case, it lets us avoid the hack in the new mount API.
+ *
+ * Consequently, the remounting hack must only be used for requests originating
+ * from the old mount API and should be marked for full deprecation so it can be
+ * turned off in a couple of years.
+ *
+ * The new mount API has no reason to support this hack.
+ */
+static struct vfsmount *btrfs_reconfigure_for_mount(struct fs_context *fc)
+{
+ struct vfsmount *mnt;
+ int ret;
+ const bool ro2rw = !(fc->sb_flags & SB_RDONLY);
+
+ /*
+ * We got an EBUSY because our SB_RDONLY flag didn't match the existing
+ * super block, so invert our setting here and retry the mount so we
+ * can get our vfsmount.
+ */
+ if (ro2rw)
+ fc->sb_flags |= SB_RDONLY;
+ else
+ fc->sb_flags &= ~SB_RDONLY;
+
+ mnt = fc_mount(fc);
+ if (IS_ERR(mnt))
+ return mnt;
+
+ if (!fc->oldapi || !ro2rw)
+ return mnt;
+
+ /* We need to convert to rw, call reconfigure. */
+ fc->sb_flags &= ~SB_RDONLY;
+ down_write(&mnt->mnt_sb->s_umount);
+ ret = btrfs_reconfigure(fc);
+ up_write(&mnt->mnt_sb->s_umount);
+ if (ret) {
+ mntput(mnt);
+ return ERR_PTR(ret);
+ }
+ return mnt;
+}
+
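
To make the above concrete, here is a minimal userspace sketch of the flow the
long comment above describes: mounting a btrfs subvolume with a read-only
mount while leaving the superblock read-write. This is an illustration, not
part of the patch; it assumes glibc 2.36+ provides the fsopen()/fsconfig()/
fsmount()/move_mount() wrappers in <sys/mount.h>, and "/dev/sda3", "foo" and
"/mnt/foo" are placeholder names.

    /*
     * Sketch only. The subvolume mount is made ro purely at the VFS level
     * (MOUNT_ATTR_RDONLY); the superblock stays rw because
     * fsconfig(FSCONFIG_SET_FLAG, "ro") is never issued.
     */
    #include <fcntl.h>
    #include <stdlib.h>
    #include <sys/mount.h>

    int main(void)
    {
            int fsfd, mntfd;

            fsfd = fsopen("btrfs", FSOPEN_CLOEXEC);
            if (fsfd < 0)
                    exit(1);
            fsconfig(fsfd, FSCONFIG_SET_STRING, "source", "/dev/sda3", 0);
            fsconfig(fsfd, FSCONFIG_SET_STRING, "subvol", "foo", 0);
            fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);

            /* A mount property only, never seen by btrfs itself. */
            mntfd = fsmount(fsfd, FSMOUNT_CLOEXEC, MOUNT_ATTR_RDONLY);
            if (mntfd < 0)
                    exit(1);
            move_mount(mntfd, "", AT_FDCWD, "/mnt/foo",
                       MOVE_MOUNT_F_EMPTY_PATH);
            return 0;
    }
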
+static int btrfs_get_tree_subvol(struct fs_context *fc)
+{
+ struct btrfs_fs_info *fs_info = NULL;
+ struct btrfs_fs_context *ctx = fc->fs_private;
+ struct fs_context *dup_fc;
+ struct dentry *dentry;
+ struct vfsmount *mnt;
+
+ /*
+ * Setup a dummy root and fs_info for test/set super. This is because
+ * we don't actually fill this stuff out until open_ctree(), but sget_fc()
+ * needs a valid fs_info to test against, and open_ctree() will properly
+ * initialize the file system specific settings later. btrfs_init_fs_info()
+ * initializes the static elements
+ * of the fs_info (locks and such) to make cleanup easier if we find a
+ * superblock with our given fs_devices later on at sget() time.
+ */
+ fs_info = kvzalloc(sizeof(struct btrfs_fs_info), GFP_KERNEL);
+ if (!fs_info)
+ return -ENOMEM;
+
+ fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
+ fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
+ if (!fs_info->super_copy || !fs_info->super_for_commit) {
+ btrfs_free_fs_info(fs_info);
+ return -ENOMEM;
+ }
+ btrfs_init_fs_info(fs_info);
+
+ dup_fc = vfs_dup_fs_context(fc);
+ if (IS_ERR(dup_fc)) {
+ btrfs_free_fs_info(fs_info);
+ return PTR_ERR(dup_fc);
+ }
+
+ /*
+ * When we do the sget_fc() this gets transferred to the sb, so we only
+ * need to set it on the dup_fc, as that is the context that creates the
+ * super block.
+ */
+ dup_fc->s_fs_info = fs_info;
+
+ /*
+ * We'll do the security settings in our btrfs_get_tree_super() mount
+ * loop; they were duplicated into dup_fc, so we can drop the originals
+ * here.
+ */
+ security_free_mnt_opts(&fc->security);
+ fc->security = NULL;
+
+ mnt = fc_mount(dup_fc);
+ if (PTR_ERR_OR_ZERO(mnt) == -EBUSY)
+ mnt = btrfs_reconfigure_for_mount(dup_fc);
+ put_fs_context(dup_fc);
+ if (IS_ERR(mnt))
+ return PTR_ERR(mnt);
+
+ /*
+ * mount_subvol() takes ownership of ->subvol_name and frees it (or, if
+ * it isn't set, allocates its own buffer for the subvolume name), so
+ * drop our reference to it here.
+ */
+ dentry = mount_subvol(ctx->subvol_name, ctx->subvol_objectid, mnt);
+ ctx->subvol_name = NULL;
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+
+ fc->root = dentry;
+ return 0;
+}
+
+static int btrfs_get_tree(struct fs_context *fc)
+{
+ /*
+ * Since we use mount_subtree to mount the default/specified subvol, we
+ * have to do mounts in two steps.
+ *
+ * On the first pass through we call btrfs_get_tree_subvol(); this is
+ * just a wrapper around fc_mount() that calls back into here again, and
+ * this time we'll call btrfs_get_tree_super(). That does the open_ctree()
+ * and everything else needed to open the devices and the file system. We
+ * then return with a fully constructed vfsmount in btrfs_get_tree_subvol(),
+ * and from there we can do our mount_subvol() call, which will look up
+ * whichever subvol we're mounting and set up this fc with the
+ * appropriate dentry for the subvol.
+ */
+ if (fc->s_fs_info)
+ return btrfs_get_tree_super(fc);
+ return btrfs_get_tree_subvol(fc);
+}
+
static void btrfs_kill_super(struct super_block *sb)
{
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
@@ -2101,22 +2086,85 @@ static void btrfs_kill_super(struct super_block *sb)
btrfs_free_fs_info(fs_info);
}
-static struct file_system_type btrfs_fs_type = {
- .owner = THIS_MODULE,
- .name = "btrfs",
- .mount = btrfs_mount,
- .kill_sb = btrfs_kill_super,
- .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA,
-};
+static void btrfs_free_fs_context(struct fs_context *fc)
+{
+ struct btrfs_fs_context *ctx = fc->fs_private;
+ struct btrfs_fs_info *fs_info = fc->s_fs_info;
+
+ if (fs_info)
+ btrfs_free_fs_info(fs_info);
+
+ if (ctx && refcount_dec_and_test(&ctx->refs)) {
+ kfree(ctx->subvol_name);
+ kfree(ctx);
+ }
+}
+
+static int btrfs_dup_fs_context(struct fs_context *fc, struct fs_context *src_fc)
+{
+ struct btrfs_fs_context *ctx = src_fc->fs_private;
-static struct file_system_type btrfs_root_fs_type = {
- .owner = THIS_MODULE,
- .name = "btrfs",
- .mount = btrfs_mount_root,
- .kill_sb = btrfs_kill_super,
- .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | FS_ALLOW_IDMAP,
+ /*
+ * Give the dup a ref to our ctx, as we want to keep it around for our
+ * original fc so we can have the subvolume name or objectid.
+ *
+ * We unset ->source in the original fc because the dup needs it for
+ * mounting, and then once we free the dup it'll free ->source, so we
+ * need to make sure we're only pointing to it in one fc.
+ */
+ refcount_inc(&ctx->refs);
+ fc->fs_private = ctx;
+ fc->source = src_fc->source;
+ src_fc->source = NULL;
+ return 0;
+}
+
+static const struct fs_context_operations btrfs_fs_context_ops = {
+ .parse_param = btrfs_parse_param,
+ .reconfigure = btrfs_reconfigure,
+ .get_tree = btrfs_get_tree,
+ .dup = btrfs_dup_fs_context,
+ .free = btrfs_free_fs_context,
};
+static int btrfs_init_fs_context(struct fs_context *fc)
+{
+ struct btrfs_fs_context *ctx;
+
+ ctx = kzalloc(sizeof(struct btrfs_fs_context), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ refcount_set(&ctx->refs, 1);
+ fc->fs_private = ctx;
+ fc->ops = &btrfs_fs_context_ops;
+
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
+ btrfs_info_to_ctx(btrfs_sb(fc->root->d_sb), ctx);
+ } else {
+ ctx->thread_pool_size =
+ min_t(unsigned long, num_online_cpus() + 2, 8);
+ ctx->max_inline = BTRFS_DEFAULT_MAX_INLINE;
+ ctx->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
+ }
+
+#ifdef CONFIG_BTRFS_FS_POSIX_ACL
+ fc->sb_flags |= SB_POSIXACL;
+#endif
+ fc->sb_flags |= SB_I_VERSION;
+
+ return 0;
+}
+
+static struct file_system_type btrfs_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "btrfs",
+ .init_fs_context = btrfs_init_fs_context,
+ .parameters = btrfs_fs_parameters,
+ .kill_sb = btrfs_kill_super,
+ .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | FS_ALLOW_IDMAP,
+};
+
MODULE_ALIAS_FS("btrfs");
static int btrfs_control_open(struct inode *inode, struct file *file)
@@ -2328,7 +2376,6 @@ static const struct super_operations btrfs_super_ops = {
.destroy_inode = btrfs_destroy_inode,
.free_inode = btrfs_free_inode,
.statfs = btrfs_statfs,
- .remount_fs = btrfs_remount,
.freeze_fs = btrfs_freeze,
.unfreeze_fs = btrfs_unfreeze,
};
diff --git a/fs/btrfs/super.h b/fs/btrfs/super.h
index 8dbb909b364f..f18253ca280d 100644
--- a/fs/btrfs/super.h
+++ b/fs/btrfs/super.h
@@ -3,11 +3,12 @@
#ifndef BTRFS_SUPER_H
#define BTRFS_SUPER_H
-int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
- unsigned long new_flags);
+bool btrfs_check_options(struct btrfs_fs_info *info, unsigned long *mount_opt,
+ unsigned long flags);
int btrfs_sync_fs(struct super_block *sb, int wait);
char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
u64 subvol_objectid);
+void btrfs_set_free_space_cache_settings(struct btrfs_fs_info *fs_info);
static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
{
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index e6b51fb3ddc1..84c05246ffd8 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -1783,6 +1783,10 @@ static ssize_t btrfs_devinfo_scrub_speed_max_store(struct kobject *kobj,
unsigned long long limit;
limit = memparse(buf, &endptr);
+ /* There could be trailing '\n', also catch any typos after the value. */
+ endptr = skip_spaces(endptr);
+ if (*endptr != 0)
+ return -EINVAL;
WRITE_ONCE(device->scrub_speed_max, limit);
return len;
}
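
The added check rejects anything but trailing whitespace after the parsed
value. A rough userspace analogue of the same validation pattern, sketched
with strtoull() instead of the kernel's memparse() (so the K/M/G suffixes
that memparse() accepts are not handled here):

    #include <ctype.h>
    #include <errno.h>
    #include <stdlib.h>

    /* Return 0 and store the value, or -1 on empty input/trailing garbage. */
    static int parse_limit(const char *buf, unsigned long long *limit)
    {
            char *endptr;

            errno = 0;
            *limit = strtoull(buf, &endptr, 0);
            if (errno != 0 || endptr == buf)
                    return -1;
            /* Allow a trailing '\n' or spaces, catch typos after the value. */
            while (isspace((unsigned char)*endptr))
                    endptr++;
            if (*endptr != '\0')
                    return -1;
            return 0;
    }
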
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index ca09cf9afce8..709c6cc9706a 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -28,6 +28,7 @@ const char *test_error[] = {
[TEST_ALLOC_INODE] = "cannot allocate inode",
[TEST_ALLOC_BLOCK_GROUP] = "cannot allocate block group",
[TEST_ALLOC_EXTENT_MAP] = "cannot allocate extent map",
+ [TEST_ALLOC_CHUNK_MAP] = "cannot allocate chunk map",
};
static const struct super_operations btrfs_test_super_ops = {
@@ -102,7 +103,7 @@ struct btrfs_device *btrfs_alloc_dummy_device(struct btrfs_fs_info *fs_info)
if (!dev)
return ERR_PTR(-ENOMEM);
- extent_io_tree_init(NULL, &dev->alloc_state, 0);
+ extent_io_tree_init(fs_info, &dev->alloc_state, 0);
INIT_LIST_HEAD(&dev->dev_list);
list_add(&dev->dev_list, &fs_info->fs_devices->devices);
@@ -185,7 +186,7 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
}
spin_unlock(&fs_info->buffer_lock);
- btrfs_mapping_tree_free(&fs_info->mapping_tree);
+ btrfs_mapping_tree_free(fs_info);
list_for_each_entry_safe(dev, tmp, &fs_info->fs_devices->devices,
dev_list) {
btrfs_free_dummy_device(dev);
diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h
index 7a2d7ffbe30e..dc2f2ab15fa5 100644
--- a/fs/btrfs/tests/btrfs-tests.h
+++ b/fs/btrfs/tests/btrfs-tests.h
@@ -23,6 +23,7 @@ enum {
TEST_ALLOC_INODE,
TEST_ALLOC_BLOCK_GROUP,
TEST_ALLOC_EXTENT_MAP,
+ TEST_ALLOC_CHUNK_MAP,
};
extern const char *test_error[];
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index 1cc86af97dc6..25b3349595e0 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -652,7 +652,7 @@ static void dump_eb_and_memory_contents(struct extent_buffer *eb, void *memory,
const char *test_name)
{
for (int i = 0; i < eb->len; i++) {
- struct page *page = eb->pages[i >> PAGE_SHIFT];
+ struct page *page = folio_page(eb->folios[i >> PAGE_SHIFT], 0);
void *addr = page_address(page) + offset_in_page(i);
if (memcmp(addr, memory + i, 1) != 0) {
@@ -668,7 +668,7 @@ static int verify_eb_and_memory(struct extent_buffer *eb, void *memory,
const char *test_name)
{
for (int i = 0; i < (eb->len >> PAGE_SHIFT); i++) {
- void *eb_addr = page_address(eb->pages[i]);
+ void *eb_addr = folio_address(eb->folios[i]);
if (memcmp(memory + (i << PAGE_SHIFT), eb_addr, PAGE_SIZE) != 0) {
dump_eb_and_memory_contents(eb, memory, test_name);
diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c
index 29bdd08b241f..253cce7ffecf 100644
--- a/fs/btrfs/tests/extent-map-tests.c
+++ b/fs/btrfs/tests/extent-map-tests.c
@@ -25,7 +25,7 @@ static void free_extent_map_tree(struct extent_map_tree *em_tree)
#ifdef CONFIG_BTRFS_DEBUG
if (refcount_read(&em->refs) != 1) {
test_err(
-"em leak: em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx) refs %d",
+"em leak: em (start %llu len %llu block_start %llu block_len %llu) refs %d",
em->start, em->len, em->block_start,
em->block_len, refcount_read(&em->refs));
@@ -73,7 +73,7 @@ static int test_case_1(struct btrfs_fs_info *fs_info,
em->block_start = 0;
em->block_len = SZ_16K;
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
+ ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("cannot add extent range [0, 16K)");
@@ -94,7 +94,7 @@ static int test_case_1(struct btrfs_fs_info *fs_info,
em->block_start = SZ_32K; /* avoid merging */
em->block_len = SZ_4K;
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
+ ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("cannot add extent range [16K, 20K)");
@@ -121,9 +121,14 @@ static int test_case_1(struct btrfs_fs_info *fs_info,
test_err("case1 [%llu %llu]: ret %d", start, start + len, ret);
goto out;
}
- if (em &&
- (em->start != 0 || extent_map_end(em) != SZ_16K ||
- em->block_start != 0 || em->block_len != SZ_16K)) {
+ if (!em) {
+ test_err("case1 [%llu %llu]: no extent map returned",
+ start, start + len);
+ ret = -ENOENT;
+ goto out;
+ }
+ if (em->start != 0 || extent_map_end(em) != SZ_16K ||
+ em->block_start != 0 || em->block_len != SZ_16K) {
test_err(
"case1 [%llu %llu]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu",
start, start + len, ret, em->start, em->len,
@@ -161,7 +166,7 @@ static int test_case_2(struct btrfs_fs_info *fs_info,
em->block_start = EXTENT_MAP_INLINE;
em->block_len = (u64)-1;
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
+ ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("cannot add extent range [0, 1K)");
@@ -182,7 +187,7 @@ static int test_case_2(struct btrfs_fs_info *fs_info,
em->block_start = SZ_4K;
em->block_len = SZ_4K;
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
+ ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("cannot add extent range [4K, 8K)");
@@ -209,9 +214,13 @@ static int test_case_2(struct btrfs_fs_info *fs_info,
test_err("case2 [0 1K]: ret %d", ret);
goto out;
}
- if (em &&
- (em->start != 0 || extent_map_end(em) != SZ_1K ||
- em->block_start != EXTENT_MAP_INLINE || em->block_len != (u64)-1)) {
+ if (!em) {
+ test_err("case2 [0 1K]: no extent map returned");
+ ret = -ENOENT;
+ goto out;
+ }
+ if (em->start != 0 || extent_map_end(em) != SZ_1K ||
+ em->block_start != EXTENT_MAP_INLINE || em->block_len != (u64)-1) {
test_err(
"case2 [0 1K]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu",
ret, em->start, em->len, em->block_start,
@@ -244,7 +253,7 @@ static int __test_case_3(struct btrfs_fs_info *fs_info,
em->block_start = SZ_4K;
em->block_len = SZ_4K;
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
+ ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("cannot add extent range [4K, 8K)");
@@ -268,19 +277,24 @@ static int __test_case_3(struct btrfs_fs_info *fs_info,
ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
write_unlock(&em_tree->lock);
if (ret) {
- test_err("case3 [0x%llx 0x%llx): ret %d",
+ test_err("case3 [%llu %llu): ret %d",
start, start + len, ret);
goto out;
}
+ if (!em) {
+ test_err("case3 [%llu %llu): no extent map returned",
+ start, start + len);
+ ret = -ENOENT;
+ goto out;
+ }
/*
* Since bytes within em are contiguous, em->block_start is identical to
* em->start.
*/
- if (em &&
- (start < em->start || start + len > extent_map_end(em) ||
- em->start != em->block_start || em->len != em->block_len)) {
+ if (start < em->start || start + len > extent_map_end(em) ||
+ em->start != em->block_start || em->len != em->block_len) {
test_err(
-"case3 [0x%llx 0x%llx): ret %d em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)",
+"case3 [%llu %llu): ret %d em (start %llu len %llu block_start %llu block_len %llu)",
start, start + len, ret, em->start, em->len,
em->block_start, em->block_len);
ret = -EINVAL;
@@ -343,7 +357,7 @@ static int __test_case_4(struct btrfs_fs_info *fs_info,
em->block_start = 0;
em->block_len = SZ_8K;
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
+ ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("cannot add extent range [0, 8K)");
@@ -364,7 +378,7 @@ static int __test_case_4(struct btrfs_fs_info *fs_info,
em->block_start = SZ_16K; /* avoid merging */
em->block_len = 24 * SZ_1K;
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
+ ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("cannot add extent range [8K, 32K)");
@@ -387,14 +401,20 @@ static int __test_case_4(struct btrfs_fs_info *fs_info,
ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
write_unlock(&em_tree->lock);
if (ret) {
- test_err("case4 [0x%llx 0x%llx): ret %d",
- start, len, ret);
+ test_err("case4 [%llu %llu): ret %d",
+ start, start + len, ret);
goto out;
}
- if (em && (start < em->start || start + len > extent_map_end(em))) {
+ if (!em) {
+ test_err("case4 [%llu %llu): no extent map returned",
+ start, start + len);
+ ret = -ENOENT;
+ goto out;
+ }
+ if (start < em->start || start + len > extent_map_end(em)) {
test_err(
-"case4 [0x%llx 0x%llx): ret %d, added wrong em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)",
- start, len, ret, em->start, em->len, em->block_start,
+"case4 [%llu %llu): ret %d, added wrong em (start %llu len %llu block_start %llu block_len %llu)",
+ start, start + len, ret, em->start, em->len, em->block_start,
em->block_len);
ret = -EINVAL;
}
@@ -443,7 +463,8 @@ static int test_case_4(struct btrfs_fs_info *fs_info,
return ret;
}
-static int add_compressed_extent(struct extent_map_tree *em_tree,
+static int add_compressed_extent(struct btrfs_fs_info *fs_info,
+ struct extent_map_tree *em_tree,
u64 start, u64 len, u64 block_start)
{
struct extent_map *em;
@@ -459,9 +480,9 @@ static int add_compressed_extent(struct extent_map_tree *em_tree,
em->len = len;
em->block_start = block_start;
em->block_len = SZ_4K;
- set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+ em->flags |= EXTENT_FLAG_COMPRESS_ZLIB;
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
+ ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
write_unlock(&em_tree->lock);
free_extent_map(em);
if (ret < 0) {
@@ -567,7 +588,7 @@ static int validate_range(struct extent_map_tree *em_tree, int index)
* They'll have the EXTENT_FLAG_COMPRESSED flag set to keep the em tree from
* merging the em's.
*/
-static int test_case_5(void)
+static int test_case_5(struct btrfs_fs_info *fs_info)
{
struct extent_map_tree *em_tree;
struct inode *inode;
@@ -585,35 +606,35 @@ static int test_case_5(void)
em_tree = &BTRFS_I(inode)->extent_tree;
/* [0, 12k) */
- ret = add_compressed_extent(em_tree, 0, SZ_4K * 3, 0);
+ ret = add_compressed_extent(fs_info, em_tree, 0, SZ_4K * 3, 0);
if (ret) {
test_err("cannot add extent range [0, 12K)");
goto out;
}
/* [12k, 24k) */
- ret = add_compressed_extent(em_tree, SZ_4K * 3, SZ_4K * 3, SZ_4K);
+ ret = add_compressed_extent(fs_info, em_tree, SZ_4K * 3, SZ_4K * 3, SZ_4K);
if (ret) {
test_err("cannot add extent range [12k, 24k)");
goto out;
}
/* [24k, 36k) */
- ret = add_compressed_extent(em_tree, SZ_4K * 6, SZ_4K * 3, SZ_8K);
+ ret = add_compressed_extent(fs_info, em_tree, SZ_4K * 6, SZ_4K * 3, SZ_8K);
if (ret) {
test_err("cannot add extent range [12k, 24k)");
goto out;
}
/* [36k, 40k) */
- ret = add_compressed_extent(em_tree, SZ_32K + SZ_4K, SZ_4K, SZ_4K * 3);
+ ret = add_compressed_extent(fs_info, em_tree, SZ_32K + SZ_4K, SZ_4K, SZ_4K * 3);
if (ret) {
test_err("cannot add extent range [12k, 24k)");
goto out;
}
/* [40k, 64k) */
- ret = add_compressed_extent(em_tree, SZ_4K * 10, SZ_4K * 6, SZ_16K);
+ ret = add_compressed_extent(fs_info, em_tree, SZ_4K * 10, SZ_4K * 6, SZ_16K);
if (ret) {
test_err("cannot add extent range [12k, 24k)");
goto out;
@@ -665,11 +686,11 @@ static int test_case_6(struct btrfs_fs_info *fs_info, struct extent_map_tree *em
struct extent_map *em = NULL;
int ret;
- ret = add_compressed_extent(em_tree, 0, SZ_4K, 0);
+ ret = add_compressed_extent(fs_info, em_tree, 0, SZ_4K, 0);
if (ret)
goto out;
- ret = add_compressed_extent(em_tree, SZ_4K, SZ_4K, 0);
+ ret = add_compressed_extent(fs_info, em_tree, SZ_4K, SZ_4K, 0);
if (ret)
goto out;
@@ -713,7 +734,7 @@ out:
* true would mess up the start/end calculations and subsequent splits would be
* incorrect.
*/
-static int test_case_7(void)
+static int test_case_7(struct btrfs_fs_info *fs_info)
{
struct extent_map_tree *em_tree;
struct extent_map *em;
@@ -742,9 +763,9 @@ static int test_case_7(void)
em->len = SZ_16K;
em->block_start = 0;
em->block_len = SZ_4K;
- set_bit(EXTENT_FLAG_PINNED, &em->flags);
+ em->flags |= EXTENT_FLAG_PINNED;
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
+ ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("couldn't add extent map");
@@ -765,7 +786,7 @@ static int test_case_7(void)
em->block_start = SZ_32K;
em->block_len = SZ_16K;
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
+ ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("couldn't add extent map");
@@ -859,33 +880,21 @@ struct rmap_test_vector {
static int test_rmap_block(struct btrfs_fs_info *fs_info,
struct rmap_test_vector *test)
{
- struct extent_map *em;
- struct map_lookup *map = NULL;
+ struct btrfs_chunk_map *map;
u64 *logical = NULL;
int i, out_ndaddrs, out_stripe_len;
int ret;
- em = alloc_extent_map();
- if (!em) {
- test_std_err(TEST_ALLOC_EXTENT_MAP);
- return -ENOMEM;
- }
-
- map = kmalloc(map_lookup_size(test->num_stripes), GFP_KERNEL);
+ map = btrfs_alloc_chunk_map(test->num_stripes, GFP_KERNEL);
if (!map) {
- kfree(em);
- test_std_err(TEST_ALLOC_EXTENT_MAP);
+ test_std_err(TEST_ALLOC_CHUNK_MAP);
return -ENOMEM;
}
- set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
/* Start at 4GiB logical address */
- em->start = SZ_4G;
- em->len = test->data_stripe_size * test->num_data_stripes;
- em->block_len = em->len;
- em->orig_block_len = test->data_stripe_size;
- em->map_lookup = map;
-
+ map->start = SZ_4G;
+ map->chunk_len = test->data_stripe_size * test->num_data_stripes;
+ map->stripe_size = test->data_stripe_size;
map->num_stripes = test->num_stripes;
map->type = test->raid_type;
@@ -901,15 +910,13 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info,
map->stripes[i].physical = test->data_stripe_phys_start[i];
}
- write_lock(&fs_info->mapping_tree.lock);
- ret = add_extent_mapping(&fs_info->mapping_tree, em, 0);
- write_unlock(&fs_info->mapping_tree.lock);
+ ret = btrfs_add_chunk_map(fs_info, map);
if (ret) {
- test_err("error adding block group mapping to mapping tree");
+ test_err("error adding chunk map to mapping tree");
goto out_free;
}
- ret = btrfs_rmap_block(fs_info, em->start, btrfs_sb_offset(1),
+ ret = btrfs_rmap_block(fs_info, map->start, btrfs_sb_offset(1),
&logical, &out_ndaddrs, &out_stripe_len);
if (ret || (out_ndaddrs == 0 && test->expected_mapped_addr)) {
test_err("didn't rmap anything but expected %d",
@@ -938,14 +945,8 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info,
ret = 0;
out:
- write_lock(&fs_info->mapping_tree.lock);
- remove_extent_mapping(&fs_info->mapping_tree, em);
- write_unlock(&fs_info->mapping_tree.lock);
- /* For us */
- free_extent_map(em);
+ btrfs_remove_chunk_map(fs_info, map);
out_free:
- /* For the tree */
- free_extent_map(em);
kfree(logical);
return ret;
}
@@ -1022,13 +1023,13 @@ int btrfs_test_extent_map(void)
ret = test_case_4(fs_info, em_tree);
if (ret)
goto out;
- ret = test_case_5();
+ ret = test_case_5(fs_info);
if (ret)
goto out;
ret = test_case_6(fs_info, em_tree);
if (ret)
goto out;
- ret = test_case_7();
+ ret = test_case_7(fs_info);
if (ret)
goto out;
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 492d69d2fa73..9957de9f7806 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -211,9 +211,9 @@ static void setup_file_extents(struct btrfs_root *root, u32 sectorsize)
sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot);
}
-static unsigned long prealloc_only = 0;
-static unsigned long compressed_only = 0;
-static unsigned long vacancy_only = 0;
+static u32 prealloc_only = 0;
+static u32 compressed_only = 0;
+static u32 vacancy_only = 0;
static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
{
@@ -305,7 +305,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != 0) {
- test_err("unexpected flags set, want 0 have %lu", em->flags);
+ test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
/*
@@ -332,7 +332,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != 0) {
- test_err("unexpected flags set, want 0 have %lu", em->flags);
+ test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
offset = em->start + em->len;
@@ -355,7 +355,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != 0) {
- test_err("unexpected flags set, want 0 have %lu", em->flags);
+ test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
if (em->orig_start != em->start) {
@@ -383,7 +383,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != 0) {
- test_err("unexpected flags set, want 0 have %lu", em->flags);
+ test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
if (em->orig_start != em->start) {
@@ -412,7 +412,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != 0) {
- test_err("unexpected flags set, want 0 have %lu", em->flags);
+ test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
offset = em->start + em->len;
@@ -434,7 +434,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != 0) {
- test_err("unexpected flags set, want 0 have %lu", em->flags);
+ test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
if (em->orig_start != orig_start) {
@@ -468,7 +468,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != prealloc_only) {
- test_err("unexpected flags set, want %lu have %lu",
+ test_err("unexpected flags set, want %u have %u",
prealloc_only, em->flags);
goto out;
}
@@ -497,7 +497,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != prealloc_only) {
- test_err("unexpected flags set, want %lu have %lu",
+ test_err("unexpected flags set, want %u have %u",
prealloc_only, em->flags);
goto out;
}
@@ -527,7 +527,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != 0) {
- test_err("unexpected flags set, want 0 have %lu", em->flags);
+ test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
if (em->orig_start != orig_start) {
@@ -560,7 +560,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != prealloc_only) {
- test_err("unexpected flags set, want %lu have %lu",
+ test_err("unexpected flags set, want %u have %u",
prealloc_only, em->flags);
goto out;
}
@@ -595,7 +595,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != compressed_only) {
- test_err("unexpected flags set, want %lu have %lu",
+ test_err("unexpected flags set, want %u have %u",
compressed_only, em->flags);
goto out;
}
@@ -604,9 +604,9 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
em->start, em->orig_start);
goto out;
}
- if (em->compress_type != BTRFS_COMPRESS_ZLIB) {
+ if (extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) {
test_err("unexpected compress type, wanted %d, got %d",
- BTRFS_COMPRESS_ZLIB, em->compress_type);
+ BTRFS_COMPRESS_ZLIB, extent_map_compression(em));
goto out;
}
offset = em->start + em->len;
@@ -629,7 +629,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != compressed_only) {
- test_err("unexpected flags set, want %lu have %lu",
+ test_err("unexpected flags set, want %u have %u",
compressed_only, em->flags);
goto out;
}
@@ -638,9 +638,9 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
em->start, em->orig_start);
goto out;
}
- if (em->compress_type != BTRFS_COMPRESS_ZLIB) {
+ if (extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) {
test_err("unexpected compress type, wanted %d, got %d",
- BTRFS_COMPRESS_ZLIB, em->compress_type);
+ BTRFS_COMPRESS_ZLIB, extent_map_compression(em));
goto out;
}
disk_bytenr = em->block_start;
@@ -664,7 +664,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != 0) {
- test_err("unexpected flags set, want 0 have %lu", em->flags);
+ test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
if (em->orig_start != em->start) {
@@ -692,7 +692,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != compressed_only) {
- test_err("unexpected flags set, want %lu have %lu",
+ test_err("unexpected flags set, want %u have %u",
compressed_only, em->flags);
goto out;
}
@@ -701,9 +701,9 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
em->start, orig_start);
goto out;
}
- if (em->compress_type != BTRFS_COMPRESS_ZLIB) {
+ if (extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) {
test_err("unexpected compress type, wanted %d, got %d",
- BTRFS_COMPRESS_ZLIB, em->compress_type);
+ BTRFS_COMPRESS_ZLIB, extent_map_compression(em));
goto out;
}
offset = em->start + em->len;
@@ -726,7 +726,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != 0) {
- test_err("unexpected flags set, want 0 have %lu", em->flags);
+ test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
if (em->orig_start != em->start) {
@@ -758,7 +758,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != vacancy_only) {
- test_err("unexpected flags set, want %lu have %lu",
+ test_err("unexpected flags set, want %u have %u",
vacancy_only, em->flags);
goto out;
}
@@ -786,7 +786,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != 0) {
- test_err("unexpected flags set, want 0 have %lu", em->flags);
+ test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
if (em->orig_start != em->start) {
@@ -866,7 +866,7 @@ static int test_hole_first(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != vacancy_only) {
- test_err("wrong flags, wanted %lu, have %lu", vacancy_only,
+ test_err("wrong flags, wanted %u, have %u", vacancy_only,
em->flags);
goto out;
}
@@ -888,7 +888,7 @@ static int test_hole_first(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != 0) {
- test_err("unexpected flags set, wanted 0 got %lu",
+ test_err("unexpected flags set, wanted 0 got %u",
em->flags);
goto out;
}
@@ -1095,8 +1095,8 @@ int btrfs_test_inodes(u32 sectorsize, u32 nodesize)
test_msg("running inode tests");
- set_bit(EXTENT_FLAG_COMPRESSED, &compressed_only);
- set_bit(EXTENT_FLAG_PREALLOC, &prealloc_only);
+ compressed_only |= EXTENT_FLAG_COMPRESS_ZLIB;
+ prealloc_only |= EXTENT_FLAG_PREALLOC;
ret = test_btrfs_get_extent(sectorsize, nodesize);
if (ret)
diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h
index 3c2a02a72f64..14b9fbe82da4 100644
--- a/fs/btrfs/tree-checker.h
+++ b/fs/btrfs/tree-checker.h
@@ -22,7 +22,7 @@ struct btrfs_tree_parent_check {
/*
* Expected transid, can be 0 to skip the check, but such skip
- * should only be utlized for backref walk related code.
+ * should only be utilized for backref walk related code.
*/
u64 transid;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 7d6729d9fd2f..331fc7429952 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2575,7 +2575,6 @@ static int clean_log_buffer(struct btrfs_trans_handle *trans,
ret = btrfs_pin_reserved_extent(trans, eb);
if (ret)
return ret;
- btrfs_redirty_list_add(trans->transaction, eb);
} else {
unaccount_log_buffer(eb->fs_info, eb->start);
}
@@ -4520,7 +4519,7 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
int ret = 0;
if (inode->flags & BTRFS_INODE_NODATASUM ||
- test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
+ (em->flags & EXTENT_FLAG_PREALLOC) ||
em->block_start == EXTENT_MAP_HOLE)
return 0;
@@ -4583,7 +4582,7 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
return 0;
/* If we're compressed we have to save the entire range of csums. */
- if (em->compress_type) {
+ if (extent_map_is_compressed(em)) {
csum_offset = 0;
csum_len = max(em->block_len, em->orig_block_len);
} else {
@@ -4623,18 +4622,20 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
struct btrfs_file_extent_item fi = { 0 };
struct extent_buffer *leaf;
struct btrfs_key key;
+ enum btrfs_compression_type compress_type;
u64 extent_offset = em->start - em->orig_start;
u64 block_len;
int ret;
btrfs_set_stack_file_extent_generation(&fi, trans->transid);
- if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ if (em->flags & EXTENT_FLAG_PREALLOC)
btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_PREALLOC);
else
btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_REG);
block_len = max(em->block_len, em->orig_block_len);
- if (em->compress_type != BTRFS_COMPRESS_NONE) {
+ compress_type = extent_map_compression(em);
+ if (compress_type != BTRFS_COMPRESS_NONE) {
btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start);
btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len);
} else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
@@ -4646,7 +4647,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
btrfs_set_stack_file_extent_offset(&fi, extent_offset);
btrfs_set_stack_file_extent_num_bytes(&fi, em->len);
btrfs_set_stack_file_extent_ram_bytes(&fi, em->ram_bytes);
- btrfs_set_stack_file_extent_compression(&fi, em->compress_type);
+ btrfs_set_stack_file_extent_compression(&fi, compress_type);
ret = log_extent_csums(trans, inode, log, em, ctx);
if (ret)
@@ -4859,13 +4860,13 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
continue;
/* We log prealloc extents beyond eof later. */
- if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) &&
+ if ((em->flags & EXTENT_FLAG_PREALLOC) &&
em->start >= i_size_read(&inode->vfs_inode))
continue;
/* Need a ref to keep it from getting evicted from cache */
refcount_inc(&em->refs);
- set_bit(EXTENT_FLAG_LOGGING, &em->flags);
+ em->flags |= EXTENT_FLAG_LOGGING;
list_add_tail(&em->list, &extents);
num++;
}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f627674b37db..4c32497311d2 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -41,6 +41,17 @@
BTRFS_BLOCK_GROUP_RAID10 | \
BTRFS_BLOCK_GROUP_RAID56_MASK)
+struct btrfs_io_geometry {
+ u32 stripe_index;
+ u32 stripe_nr;
+ int mirror_num;
+ int num_stripes;
+ u64 stripe_offset;
+ u64 raid56_full_stripe_start;
+ int max_errors;
+ enum btrfs_map_op op;
+};
+
const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
[BTRFS_RAID_RAID10] = {
.sub_stripes = 2,
@@ -1742,19 +1753,18 @@ out:
static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
{
- struct extent_map_tree *em_tree;
- struct extent_map *em;
struct rb_node *n;
u64 ret = 0;
- em_tree = &fs_info->mapping_tree;
- read_lock(&em_tree->lock);
- n = rb_last(&em_tree->map.rb_root);
+ read_lock(&fs_info->mapping_tree_lock);
+ n = rb_last(&fs_info->mapping_tree.rb_root);
if (n) {
- em = rb_entry(n, struct extent_map, rb_node);
- ret = em->start + em->len;
+ struct btrfs_chunk_map *map;
+
+ map = rb_entry(n, struct btrfs_chunk_map, rb_node);
+ ret = map->start + map->chunk_len;
}
- read_unlock(&em_tree->lock);
+ read_unlock(&fs_info->mapping_tree_lock);
return ret;
}
@@ -2986,6 +2996,81 @@ static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
return ret;
}
+struct btrfs_chunk_map *btrfs_find_chunk_map_nolock(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length)
+{
+ struct rb_node *node = fs_info->mapping_tree.rb_root.rb_node;
+ struct rb_node *prev = NULL;
+ struct rb_node *orig_prev;
+ struct btrfs_chunk_map *map;
+ struct btrfs_chunk_map *prev_map = NULL;
+
+ while (node) {
+ map = rb_entry(node, struct btrfs_chunk_map, rb_node);
+ prev = node;
+ prev_map = map;
+
+ if (logical < map->start) {
+ node = node->rb_left;
+ } else if (logical >= map->start + map->chunk_len) {
+ node = node->rb_right;
+ } else {
+ refcount_inc(&map->refs);
+ return map;
+ }
+ }
+
+ if (!prev)
+ return NULL;
+
+ orig_prev = prev;
+ while (prev && logical >= prev_map->start + prev_map->chunk_len) {
+ prev = rb_next(prev);
+ prev_map = rb_entry(prev, struct btrfs_chunk_map, rb_node);
+ }
+
+ if (!prev) {
+ prev = orig_prev;
+ prev_map = rb_entry(prev, struct btrfs_chunk_map, rb_node);
+ while (prev && logical < prev_map->start) {
+ prev = rb_prev(prev);
+ prev_map = rb_entry(prev, struct btrfs_chunk_map, rb_node);
+ }
+ }
+
+ if (prev) {
+ u64 end = logical + length;
+
+ /*
+ * Caller can pass a U64_MAX length when it wants to get any
+ * chunk starting at an offset of 'logical' or higher, so deal
+ * with the wraparound by resetting the end offset to U64_MAX.
+ */
+ if (end < logical)
+ end = U64_MAX;
+
+ if (end > prev_map->start &&
+ logical < prev_map->start + prev_map->chunk_len) {
+ refcount_inc(&prev_map->refs);
+ return prev_map;
+ }
+ }
+
+ return NULL;
+}
+
+struct btrfs_chunk_map *btrfs_find_chunk_map(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length)
+{
+ struct btrfs_chunk_map *map;
+
+ read_lock(&fs_info->mapping_tree_lock);
+ map = btrfs_find_chunk_map_nolock(fs_info, logical, length);
+ read_unlock(&fs_info->mapping_tree_lock);
+
+ return map;
+}
+
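
A brief usage sketch for the lookup helpers added above. The caller below is
hypothetical and only illustrates the reference-counting contract: a
successful lookup returns the map with an extra reference that must be
dropped with btrfs_free_chunk_map().

    /* Hypothetical caller: report the chunk type covering @logical. */
    static u64 chunk_type_at(struct btrfs_fs_info *fs_info, u64 logical)
    {
            struct btrfs_chunk_map *map;
            u64 type;

            map = btrfs_find_chunk_map(fs_info, logical, 1);
            if (!map)
                    return 0;
            type = map->type;
            /* Drop the reference the lookup took for us. */
            btrfs_free_chunk_map(map);
            return type;
    }
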
/*
* Find the mapping containing the given logical extent.
*
@@ -2994,38 +3079,37 @@ static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
*
* Return: Chunk mapping or ERR_PTR.
*/
-struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
- u64 logical, u64 length)
+struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length)
{
- struct extent_map_tree *em_tree;
- struct extent_map *em;
+ struct btrfs_chunk_map *map;
- em_tree = &fs_info->mapping_tree;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, logical, length);
- read_unlock(&em_tree->lock);
+ map = btrfs_find_chunk_map(fs_info, logical, length);
- if (!em) {
+ if (unlikely(!map)) {
btrfs_crit(fs_info,
"unable to find chunk map for logical %llu length %llu",
logical, length);
return ERR_PTR(-EINVAL);
}
- if (em->start > logical || em->start + em->len <= logical) {
+ if (unlikely(map->start > logical || map->start + map->chunk_len <= logical)) {
btrfs_crit(fs_info,
"found a bad chunk map, wanted %llu-%llu, found %llu-%llu",
- logical, logical + length, em->start, em->start + em->len);
- free_extent_map(em);
+ logical, logical + length, map->start,
+ map->start + map->chunk_len);
+ btrfs_free_chunk_map(map);
return ERR_PTR(-EINVAL);
}
- /* callers are responsible for dropping em's ref. */
- return em;
+ /* Callers are responsible for dropping the reference. */
+ return map;
}
static int remove_chunk_item(struct btrfs_trans_handle *trans,
- struct map_lookup *map, u64 chunk_offset)
+ struct btrfs_chunk_map *map, u64 chunk_offset)
{
int i;
@@ -3050,23 +3134,21 @@ static int remove_chunk_item(struct btrfs_trans_handle *trans,
int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
u64 dev_extent_len = 0;
int i, ret = 0;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
- em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
- if (IS_ERR(em)) {
+ map = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
+ if (IS_ERR(map)) {
/*
* This is a logic error, but we don't want to just rely on the
* user having built with ASSERT enabled, so if ASSERT doesn't
* do anything we still error out.
*/
ASSERT(0);
- return PTR_ERR(em);
+ return PTR_ERR(map);
}
- map = em->map_lookup;
/*
* First delete the device extent items from the devices btree.
@@ -3169,7 +3251,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
goto out;
}
- trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);
+ trace_btrfs_chunk_free(fs_info, map, chunk_offset, map->chunk_len);
if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
@@ -3188,7 +3270,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
*/
btrfs_trans_release_chunk_metadata(trans);
- ret = btrfs_remove_block_group(trans, chunk_offset, em);
+ ret = btrfs_remove_block_group(trans, map);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -3200,7 +3282,7 @@ out:
trans->removing_chunk = false;
}
/* once for us */
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ret;
}
@@ -5347,24 +5429,131 @@ static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
}
}
+static void chunk_map_device_set_bits(struct btrfs_chunk_map *map, unsigned int bits)
+{
+ for (int i = 0; i < map->num_stripes; i++) {
+ struct btrfs_io_stripe *stripe = &map->stripes[i];
+ struct btrfs_device *device = stripe->dev;
+
+ set_extent_bit(&device->alloc_state, stripe->physical,
+ stripe->physical + map->stripe_size - 1,
+ bits | EXTENT_NOWAIT, NULL);
+ }
+}
+
+static void chunk_map_device_clear_bits(struct btrfs_chunk_map *map, unsigned int bits)
+{
+ for (int i = 0; i < map->num_stripes; i++) {
+ struct btrfs_io_stripe *stripe = &map->stripes[i];
+ struct btrfs_device *device = stripe->dev;
+
+ __clear_extent_bit(&device->alloc_state, stripe->physical,
+ stripe->physical + map->stripe_size - 1,
+ bits | EXTENT_NOWAIT,
+ NULL, NULL);
+ }
+}
+
+void btrfs_remove_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map)
+{
+ write_lock(&fs_info->mapping_tree_lock);
+ rb_erase_cached(&map->rb_node, &fs_info->mapping_tree);
+ RB_CLEAR_NODE(&map->rb_node);
+ chunk_map_device_clear_bits(map, CHUNK_ALLOCATED);
+ write_unlock(&fs_info->mapping_tree_lock);
+
+ /* Once for the tree reference. */
+ btrfs_free_chunk_map(map);
+}
+
+EXPORT_FOR_TESTS
+int btrfs_add_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map)
+{
+ struct rb_node **p;
+ struct rb_node *parent = NULL;
+ bool leftmost = true;
+
+ write_lock(&fs_info->mapping_tree_lock);
+ p = &fs_info->mapping_tree.rb_root.rb_node;
+ while (*p) {
+ struct btrfs_chunk_map *entry;
+
+ parent = *p;
+ entry = rb_entry(parent, struct btrfs_chunk_map, rb_node);
+
+ if (map->start < entry->start) {
+ p = &(*p)->rb_left;
+ } else if (map->start > entry->start) {
+ p = &(*p)->rb_right;
+ leftmost = false;
+ } else {
+ write_unlock(&fs_info->mapping_tree_lock);
+ return -EEXIST;
+ }
+ }
+ rb_link_node(&map->rb_node, parent, p);
+ rb_insert_color_cached(&map->rb_node, &fs_info->mapping_tree, leftmost);
+ chunk_map_device_set_bits(map, CHUNK_ALLOCATED);
+ chunk_map_device_clear_bits(map, CHUNK_TRIMMED);
+ write_unlock(&fs_info->mapping_tree_lock);
+
+ return 0;
+}
+
+EXPORT_FOR_TESTS
+struct btrfs_chunk_map *btrfs_alloc_chunk_map(int num_stripes, gfp_t gfp)
+{
+ struct btrfs_chunk_map *map;
+
+ map = kmalloc(btrfs_chunk_map_size(num_stripes), gfp);
+ if (!map)
+ return NULL;
+
+ refcount_set(&map->refs, 1);
+ RB_CLEAR_NODE(&map->rb_node);
+
+ return map;
+}
+
+struct btrfs_chunk_map *btrfs_clone_chunk_map(struct btrfs_chunk_map *map, gfp_t gfp)
+{
+ const int size = btrfs_chunk_map_size(map->num_stripes);
+ struct btrfs_chunk_map *clone;
+
+ clone = kmemdup(map, size, gfp);
+ if (!clone)
+ return NULL;
+
+ refcount_set(&clone->refs, 1);
+ RB_CLEAR_NODE(&clone->rb_node);
+
+ return clone;
+}
+
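
The allocation/insertion pairing works the same way the updated test code
earlier in this patch uses it. A condensed lifecycle sketch follows; the
function name, sizes, device pointer and single-stripe layout are
placeholders, not part of the patch.

    /* Sketch: allocate, fill and publish a single-stripe chunk map. */
    static int publish_dummy_chunk_map(struct btrfs_fs_info *fs_info,
                                       struct btrfs_device *device)
    {
            struct btrfs_chunk_map *map;
            int ret;

            map = btrfs_alloc_chunk_map(1, GFP_KERNEL); /* refs == 1, ours */
            if (!map)
                    return -ENOMEM;
            map->start = SZ_4G;             /* placeholder logical start */
            map->chunk_len = SZ_1G;         /* placeholder chunk length */
            map->stripe_size = SZ_1G;
            map->type = BTRFS_BLOCK_GROUP_DATA;
            map->num_stripes = 1;
            map->stripes[0].dev = device;   /* must be a real btrfs_device */
            map->stripes[0].physical = 0;

            ret = btrfs_add_chunk_map(fs_info, map);
            if (ret)        /* e.g. -EEXIST; we still own the map */
                    btrfs_free_chunk_map(map);
            /* On success the mapping tree took over our reference. */
            return ret;
    }
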
static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
struct alloc_chunk_ctl *ctl,
struct btrfs_device_info *devices_info)
{
struct btrfs_fs_info *info = trans->fs_info;
- struct map_lookup *map = NULL;
- struct extent_map_tree *em_tree;
+ struct btrfs_chunk_map *map;
struct btrfs_block_group *block_group;
- struct extent_map *em;
u64 start = ctl->start;
u64 type = ctl->type;
int ret;
int i;
int j;
- map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS);
+ map = btrfs_alloc_chunk_map(ctl->num_stripes, GFP_NOFS);
if (!map)
return ERR_PTR(-ENOMEM);
+
+ map->start = start;
+ map->chunk_len = ctl->chunk_size;
+ map->stripe_size = ctl->stripe_size;
+ map->type = type;
+ map->io_align = BTRFS_STRIPE_LEN;
+ map->io_width = BTRFS_STRIPE_LEN;
+ map->sub_stripes = ctl->sub_stripes;
map->num_stripes = ctl->num_stripes;
for (i = 0; i < ctl->ndevs; ++i) {
@@ -5375,41 +5564,22 @@ static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
j * ctl->stripe_size;
}
}
- map->io_align = BTRFS_STRIPE_LEN;
- map->io_width = BTRFS_STRIPE_LEN;
- map->type = type;
- map->sub_stripes = ctl->sub_stripes;
trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size);
- em = alloc_extent_map();
- if (!em) {
- kfree(map);
- return ERR_PTR(-ENOMEM);
- }
- set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
- em->map_lookup = map;
- em->start = start;
- em->len = ctl->chunk_size;
- em->block_start = 0;
- em->block_len = em->len;
- em->orig_block_len = ctl->stripe_size;
-
- em_tree = &info->mapping_tree;
- write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
+ ret = btrfs_add_chunk_map(info, map);
if (ret) {
- write_unlock(&em_tree->lock);
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ERR_PTR(ret);
}
- write_unlock(&em_tree->lock);
block_group = btrfs_make_block_group(trans, type, start, ctl->chunk_size);
- if (IS_ERR(block_group))
- goto error_del_extent;
+ if (IS_ERR(block_group)) {
+ btrfs_remove_chunk_map(info, map);
+ return block_group;
+ }
- for (i = 0; i < map->num_stripes; i++) {
+ for (int i = 0; i < map->num_stripes; i++) {
struct btrfs_device *dev = map->stripes[i].dev;
btrfs_device_set_bytes_used(dev,
@@ -5422,23 +5592,10 @@ static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
atomic64_sub(ctl->stripe_size * map->num_stripes,
&info->free_chunk_space);
- free_extent_map(em);
check_raid56_incompat_flag(info, type);
check_raid1c34_incompat_flag(info, type);
return block_group;
-
-error_del_extent:
- write_lock(&em_tree->lock);
- remove_extent_mapping(em_tree, em);
- write_unlock(&em_tree->lock);
-
- /* One for our allocation */
- free_extent_map(em);
- /* One for the tree reference */
- free_extent_map(em);
-
- return block_group;
}
struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
@@ -5514,8 +5671,7 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
struct btrfs_key key;
struct btrfs_chunk *chunk;
struct btrfs_stripe *stripe;
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
size_t item_size;
int i;
int ret;
@@ -5544,14 +5700,13 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
*/
lockdep_assert_held(&fs_info->chunk_mutex);
- em = btrfs_get_chunk_map(fs_info, bg->start, bg->length);
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
+ map = btrfs_get_chunk_map(fs_info, bg->start, bg->length);
+ if (IS_ERR(map)) {
+ ret = PTR_ERR(map);
btrfs_abort_transaction(trans, ret);
return ret;
}
- map = em->map_lookup;
item_size = btrfs_chunk_item_size(map->num_stripes);
chunk = kzalloc(item_size, GFP_NOFS);
@@ -5608,7 +5763,7 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
out:
kfree(chunk);
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ret;
}
@@ -5653,7 +5808,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
return 0;
}
-static inline int btrfs_chunk_max_errors(struct map_lookup *map)
+static inline int btrfs_chunk_max_errors(struct btrfs_chunk_map *map)
{
const int index = btrfs_bg_flags_to_raid_index(map->type);
@@ -5662,17 +5817,15 @@ static inline int btrfs_chunk_max_errors(struct map_lookup *map)
bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
int miss_ndevs = 0;
int i;
bool ret = true;
- em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
- if (IS_ERR(em))
+ map = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
+ if (IS_ERR(map))
return false;
- map = em->map_lookup;
for (i = 0; i < map->num_stripes; i++) {
if (test_bit(BTRFS_DEV_STATE_MISSING,
&map->stripes[i].dev->dev_state)) {
@@ -5693,38 +5846,37 @@ bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset)
if (miss_ndevs > btrfs_chunk_max_errors(map))
ret = false;
end:
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ret;
}
-void btrfs_mapping_tree_free(struct extent_map_tree *tree)
+void btrfs_mapping_tree_free(struct btrfs_fs_info *fs_info)
{
- struct extent_map *em;
+ write_lock(&fs_info->mapping_tree_lock);
+ while (!RB_EMPTY_ROOT(&fs_info->mapping_tree.rb_root)) {
+ struct btrfs_chunk_map *map;
+ struct rb_node *node;
- while (1) {
- write_lock(&tree->lock);
- em = lookup_extent_mapping(tree, 0, (u64)-1);
- if (em)
- remove_extent_mapping(tree, em);
- write_unlock(&tree->lock);
- if (!em)
- break;
- /* once for us */
- free_extent_map(em);
- /* once for the tree */
- free_extent_map(em);
+ node = rb_first_cached(&fs_info->mapping_tree);
+ map = rb_entry(node, struct btrfs_chunk_map, rb_node);
+ rb_erase_cached(&map->rb_node, &fs_info->mapping_tree);
+ RB_CLEAR_NODE(&map->rb_node);
+ chunk_map_device_clear_bits(map, CHUNK_ALLOCATED);
+ /* Once for the tree ref. */
+ btrfs_free_chunk_map(map);
+ cond_resched_rwlock_write(&fs_info->mapping_tree_lock);
}
+ write_unlock(&fs_info->mapping_tree_lock);
}
int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
{
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
enum btrfs_raid_types index;
int ret = 1;
- em = btrfs_get_chunk_map(fs_info, logical, len);
- if (IS_ERR(em))
+ map = btrfs_get_chunk_map(fs_info, logical, len);
+ if (IS_ERR(map))
/*
* We could return errors for these cases, but that could get
* ugly and we'd probably do the same thing which is just not do
@@ -5733,7 +5885,6 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
*/
return 1;
- map = em->map_lookup;
index = btrfs_bg_flags_to_raid_index(map->type);
/* Non-RAID56, use their ncopies from btrfs_raid_array. */
@@ -5750,53 +5901,49 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
* stripe under reconstruction.
*/
ret = map->num_stripes;
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ret;
}
unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
u64 logical)
{
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
unsigned long len = fs_info->sectorsize;
if (!btrfs_fs_incompat(fs_info, RAID56))
return len;
- em = btrfs_get_chunk_map(fs_info, logical, len);
+ map = btrfs_get_chunk_map(fs_info, logical, len);
- if (!WARN_ON(IS_ERR(em))) {
- map = em->map_lookup;
+ if (!WARN_ON(IS_ERR(map))) {
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
len = btrfs_stripe_nr_to_offset(nr_data_stripes(map));
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
}
return len;
}
int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
{
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
int ret = 0;
if (!btrfs_fs_incompat(fs_info, RAID56))
return 0;
- em = btrfs_get_chunk_map(fs_info, logical, len);
+ map = btrfs_get_chunk_map(fs_info, logical, len);
- if(!WARN_ON(IS_ERR(em))) {
- map = em->map_lookup;
+ if (!WARN_ON(IS_ERR(map))) {
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
ret = 1;
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
}
return ret;
}
static int find_live_mirror(struct btrfs_fs_info *fs_info,
- struct map_lookup *map, int first,
+ struct btrfs_chunk_map *map, int first,
int dev_replace_is_ongoing)
{
int i;
@@ -5903,8 +6050,7 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
u64 logical, u64 *length_ret,
u32 *num_stripes)
{
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
struct btrfs_discard_stripe *stripes;
u64 length = *length_ret;
u64 offset;
@@ -5922,11 +6068,9 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
int ret;
int i;
- em = btrfs_get_chunk_map(fs_info, logical, length);
- if (IS_ERR(em))
- return ERR_CAST(em);
-
- map = em->map_lookup;
+ map = btrfs_get_chunk_map(fs_info, logical, length);
+ if (IS_ERR(map))
+ return ERR_CAST(map);
/* we don't discard raid56 yet */
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
@@ -5934,8 +6078,8 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
goto out_free_map;
}
- offset = logical - em->start;
- length = min_t(u64, em->start + em->len - logical, length);
+ offset = logical - map->start;
+ length = min_t(u64, map->start + map->chunk_len - logical, length);
*length_ret = length;
/*
@@ -6032,10 +6176,10 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
}
}
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return stripes;
out_free_map:
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ERR_PTR(ret);
}
@@ -6133,17 +6277,16 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op,
bioc->replace_nr_stripes = nr_extra_stripes;
}
-static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op,
- u64 offset, u32 *stripe_nr, u64 *stripe_offset,
- u64 *full_stripe_start)
+static u64 btrfs_max_io_len(struct btrfs_chunk_map *map, u64 offset,
+ struct btrfs_io_geometry *io_geom)
{
/*
* Stripe_nr is the stripe where this block falls. stripe_offset is
* the offset of this block in its stripe.
*/
- *stripe_offset = offset & BTRFS_STRIPE_LEN_MASK;
- *stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT;
- ASSERT(*stripe_offset < U32_MAX);
+ io_geom->stripe_offset = offset & BTRFS_STRIPE_LEN_MASK;
+ io_geom->stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT;
+ ASSERT(io_geom->stripe_offset < U32_MAX);
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
unsigned long full_stripe_len =
@@ -6158,18 +6301,17 @@ static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op,
* to go rounddown(), not round_down(), as nr_data_stripes is
* not ensured to be power of 2.
*/
- *full_stripe_start =
- btrfs_stripe_nr_to_offset(
- rounddown(*stripe_nr, nr_data_stripes(map)));
+ io_geom->raid56_full_stripe_start = btrfs_stripe_nr_to_offset(
+ rounddown(io_geom->stripe_nr, nr_data_stripes(map)));
- ASSERT(*full_stripe_start + full_stripe_len > offset);
- ASSERT(*full_stripe_start <= offset);
+ ASSERT(io_geom->raid56_full_stripe_start + full_stripe_len > offset);
+ ASSERT(io_geom->raid56_full_stripe_start <= offset);
/*
* For writes to RAID56, allow to write a full stripe set, but
* no straddling of stripe sets.
*/
- if (op == BTRFS_MAP_WRITE)
- return full_stripe_len - (offset - *full_stripe_start);
+ if (io_geom->op == BTRFS_MAP_WRITE)
+ return full_stripe_len - (offset - io_geom->raid56_full_stripe_start);
}
/*
@@ -6177,26 +6319,180 @@ static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op,
* a single disk).
*/
if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK)
- return BTRFS_STRIPE_LEN - *stripe_offset;
+ return BTRFS_STRIPE_LEN - io_geom->stripe_offset;
return U64_MAX;
}
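
The mask/shift pair above splits a chunk-relative offset into a stripe number and an intra-stripe offset; for RAID56 the stripe number is additionally rounded down to a multiple of the data-stripe count to find the full stripe start. A compilable sketch, assuming BTRFS_STRIPE_LEN_SHIFT is 16 (64K stripes):

#include <stdio.h>
#include <stdint.h>

#define BTRFS_STRIPE_LEN_SHIFT 16
#define BTRFS_STRIPE_LEN_MASK ((1u << BTRFS_STRIPE_LEN_SHIFT) - 1)

int main(void)
{
        uint64_t offset = 5 * 65536 + 4096;     /* offset into the chunk */
        int data_stripes = 3;                   /* e.g. 4-disk RAID5 */

        uint32_t stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT;
        uint32_t stripe_offset = offset & BTRFS_STRIPE_LEN_MASK;
        /* rounddown(), not round_down(): data_stripes need not be a power of 2 */
        uint64_t full_stripe_start =
                (uint64_t)(stripe_nr - stripe_nr % data_stripes)
                << BTRFS_STRIPE_LEN_SHIFT;

        printf("stripe_nr=%u stripe_offset=%u full_stripe_start=%llu\n",
               stripe_nr, stripe_offset,
               (unsigned long long)full_stripe_start);
        return 0;
}
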
-static int set_io_stripe(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
- u64 logical, u64 *length, struct btrfs_io_stripe *dst,
- struct map_lookup *map, u32 stripe_index,
- u64 stripe_offset, u64 stripe_nr)
+static int set_io_stripe(struct btrfs_fs_info *fs_info, u64 logical,
+ u64 *length, struct btrfs_io_stripe *dst,
+ struct btrfs_chunk_map *map,
+ struct btrfs_io_geometry *io_geom)
{
- dst->dev = map->stripes[stripe_index].dev;
+ dst->dev = map->stripes[io_geom->stripe_index].dev;
- if (op == BTRFS_MAP_READ && btrfs_need_stripe_tree_update(fs_info, map->type))
+ if (io_geom->op == BTRFS_MAP_READ &&
+ btrfs_need_stripe_tree_update(fs_info, map->type))
return btrfs_get_raid_extent_offset(fs_info, logical, length,
- map->type, stripe_index, dst);
+ map->type,
+ io_geom->stripe_index, dst);
- dst->physical = map->stripes[stripe_index].physical +
- stripe_offset + btrfs_stripe_nr_to_offset(stripe_nr);
+ dst->physical = map->stripes[io_geom->stripe_index].physical +
+ io_geom->stripe_offset +
+ btrfs_stripe_nr_to_offset(io_geom->stripe_nr);
return 0;
}
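
Outside the stripe-tree lookup case, the physical target above is simply the stripe's base on the chosen device, plus the intra-stripe offset, plus stripe_nr whole stripes. A sketch with an invented per-device base address:

#include <stdio.h>
#include <stdint.h>

#define BTRFS_STRIPE_LEN_SHIFT 16

/* Hypothetical stand-in for map->stripes[i].physical. */
static uint64_t stripe_physical(int stripe_index)
{
        return 1048576 + (uint64_t)stripe_index * (256 * 65536);
}

int main(void)
{
        uint32_t stripe_nr = 5;         /* stripes to walk on this device */
        uint64_t stripe_offset = 4096;  /* offset within the stripe */
        int stripe_index = 2;           /* which device slot in the chunk */

        uint64_t physical = stripe_physical(stripe_index) + stripe_offset +
                            ((uint64_t)stripe_nr << BTRFS_STRIPE_LEN_SHIFT);

        printf("physical=%llu\n", (unsigned long long)physical);
        return 0;
}
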
+static bool is_single_device_io(struct btrfs_fs_info *fs_info,
+ const struct btrfs_io_stripe *smap,
+ const struct btrfs_chunk_map *map,
+ int num_alloc_stripes,
+ enum btrfs_map_op op, int mirror_num)
+{
+ if (!smap)
+ return false;
+
+ if (num_alloc_stripes != 1)
+ return false;
+
+ if (btrfs_need_stripe_tree_update(fs_info, map->type) && op != BTRFS_MAP_READ)
+ return false;
+
+ if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1)
+ return false;
+
+ return true;
+}
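
The helper gathers into one place the four conditions that previously guarded the on-stack fast path inline in btrfs_map_block(). A userspace restatement of just the decision logic (the flag inputs are invented for the demo):

#include <stdbool.h>
#include <stdio.h>

static bool single_device_io(bool have_smap, int num_alloc_stripes,
                             bool needs_stripe_tree_update, bool is_read,
                             bool is_raid56, int mirror_num)
{
        if (!have_smap)
                return false;
        if (num_alloc_stripes != 1)
                return false;
        if (needs_stripe_tree_update && !is_read)
                return false;
        if (is_raid56 && mirror_num > 1)
                return false;
        return true;
}

int main(void)
{
        /* Plain read of a single stripe: can use the on-stack fast path. */
        printf("%d\n", single_device_io(true, 1, false, true, false, 1));
        /* RAID56 rebuild read (mirror_num > 1) must take the full path. */
        printf("%d\n", single_device_io(true, 1, false, true, true, 2));
        return 0;
}
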
+
+static void map_blocks_raid0(const struct btrfs_chunk_map *map,
+ struct btrfs_io_geometry *io_geom)
+{
+ io_geom->stripe_index = io_geom->stripe_nr % map->num_stripes;
+ io_geom->stripe_nr /= map->num_stripes;
+ if (io_geom->op == BTRFS_MAP_READ)
+ io_geom->mirror_num = 1;
+}
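
RAID0 mapping is a plain modulo rotation: logical stripe k lands on device k % num_stripes, as stripe k / num_stripes on that device. A quick sketch of the rotation:

#include <stdio.h>

int main(void)
{
        int num_stripes = 3;    /* a 3-disk RAID0 chunk */

        for (unsigned stripe_nr = 0; stripe_nr < 6; stripe_nr++)
                printf("stripe %u -> device %u, stripe %u on that device\n",
                       stripe_nr, stripe_nr % num_stripes,
                       stripe_nr / num_stripes);
        return 0;
}
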
+
+static void map_blocks_raid1(struct btrfs_fs_info *fs_info,
+ struct btrfs_chunk_map *map,
+ struct btrfs_io_geometry *io_geom,
+ bool dev_replace_is_ongoing)
+{
+ if (io_geom->op != BTRFS_MAP_READ) {
+ io_geom->num_stripes = map->num_stripes;
+ return;
+ }
+
+ if (io_geom->mirror_num) {
+ io_geom->stripe_index = io_geom->mirror_num - 1;
+ return;
+ }
+
+ io_geom->stripe_index = find_live_mirror(fs_info, map, 0,
+ dev_replace_is_ongoing);
+ io_geom->mirror_num = io_geom->stripe_index + 1;
+}
+
+static void map_blocks_dup(const struct btrfs_chunk_map *map,
+ struct btrfs_io_geometry *io_geom)
+{
+ if (io_geom->op != BTRFS_MAP_READ) {
+ io_geom->num_stripes = map->num_stripes;
+ return;
+ }
+
+ if (io_geom->mirror_num) {
+ io_geom->stripe_index = io_geom->mirror_num - 1;
+ return;
+ }
+
+ io_geom->mirror_num = 1;
+}
+
+static void map_blocks_raid10(struct btrfs_fs_info *fs_info,
+ struct btrfs_chunk_map *map,
+ struct btrfs_io_geometry *io_geom,
+ bool dev_replace_is_ongoing)
+{
+ u32 factor = map->num_stripes / map->sub_stripes;
+ int old_stripe_index;
+
+ io_geom->stripe_index = (io_geom->stripe_nr % factor) * map->sub_stripes;
+ io_geom->stripe_nr /= factor;
+
+ if (io_geom->op != BTRFS_MAP_READ) {
+ io_geom->num_stripes = map->sub_stripes;
+ return;
+ }
+
+ if (io_geom->mirror_num) {
+ io_geom->stripe_index += io_geom->mirror_num - 1;
+ return;
+ }
+
+ old_stripe_index = io_geom->stripe_index;
+ io_geom->stripe_index = find_live_mirror(fs_info, map,
+ io_geom->stripe_index,
+ dev_replace_is_ongoing);
+ io_geom->mirror_num = io_geom->stripe_index - old_stripe_index + 1;
+}
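
For RAID10 the rotation runs over mirror groups rather than devices: factor counts the groups, and stripe_index points at the first device of the chosen group, with the mirrors occupying the following sub_stripes slots. A sketch for the classic 4-disk layout:

#include <stdio.h>

int main(void)
{
        int num_stripes = 4, sub_stripes = 2;   /* classic 4-disk RAID10 */
        int factor = num_stripes / sub_stripes; /* mirror groups: 2 */

        for (unsigned stripe_nr = 0; stripe_nr < 4; stripe_nr++)
                printf("stripe %u -> first mirror at device %u, stripe %u\n",
                       stripe_nr, (stripe_nr % factor) * sub_stripes,
                       stripe_nr / factor);
        return 0;
}
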
+
+static void map_blocks_raid56_write(struct btrfs_chunk_map *map,
+ struct btrfs_io_geometry *io_geom,
+ u64 logical, u64 *length)
+{
+ int data_stripes = nr_data_stripes(map);
+
+ /*
+ * Needs full stripe mapping.
+ *
+ * Push stripe_nr back to the start of the full stripe. For those cases
+ * needing a full stripe, @stripe_nr is the full stripe number.
+ *
+ * Originally we go raid56_full_stripe_start / full_stripe_len, but
+ * that can be expensive. Here we just divide @stripe_nr by
+ * @data_stripes.
+ */
+ io_geom->stripe_nr /= data_stripes;
+
+ /* RAID[56] write or recovery. Return all stripes. */
+ io_geom->num_stripes = map->num_stripes;
+ io_geom->max_errors = btrfs_chunk_max_errors(map);
+
+ /* Return the length to the full stripe end. */
+ *length = min(logical + *length,
+ io_geom->raid56_full_stripe_start + map->start +
+ btrfs_stripe_nr_to_offset(data_stripes)) -
+ logical;
+ io_geom->stripe_index = 0;
+ io_geom->stripe_offset = 0;
+}
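
The length clamp above keeps a RAID56 write inside the current full stripe set: the end point is the smaller of the caller's end and the absolute end of the full stripe. A numeric sketch, with 64K stripes and example positions:

#include <stdio.h>
#include <stdint.h>

static uint64_t min_u64(uint64_t a, uint64_t b) { return a < b ? a : b; }

int main(void)
{
        uint64_t chunk_start = 1 << 20;         /* map->start */
        uint64_t full_stripe_start = 3 * 65536; /* chunk-relative */
        int data_stripes = 3;
        uint64_t logical = chunk_start + full_stripe_start + 4096;
        uint64_t length = 10 * 65536;           /* caller asked for this much */

        /* Clamp to the end of the current full stripe set. */
        length = min_u64(logical + length,
                         full_stripe_start + chunk_start +
                         (uint64_t)data_stripes * 65536) - logical;
        printf("clamped length=%llu\n", (unsigned long long)length);
        return 0;
}
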
+
+static void map_blocks_raid56_read(struct btrfs_chunk_map *map,
+ struct btrfs_io_geometry *io_geom)
+{
+ int data_stripes = nr_data_stripes(map);
+
+ ASSERT(io_geom->mirror_num <= 1);
+ /* Just grab the data stripe directly. */
+ io_geom->stripe_index = io_geom->stripe_nr % data_stripes;
+ io_geom->stripe_nr /= data_stripes;
+
+ /* We distribute the parity blocks across stripes. */
+ io_geom->stripe_index =
+ (io_geom->stripe_nr + io_geom->stripe_index) % map->num_stripes;
+
+ if (io_geom->op == BTRFS_MAP_READ && io_geom->mirror_num < 1)
+ io_geom->mirror_num = 1;
+}
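
The second modulo implements the parity rotation: because parity shifts by one slot per full stripe, the same data slot lands on a different device as the full-stripe number grows. A sketch of the resulting layout for 4-disk RAID5:

#include <stdio.h>

int main(void)
{
        int num_stripes = 4, data_stripes = 3;  /* 4-disk RAID5 */

        for (unsigned nr = 0; nr < 6; nr++) {
                unsigned data_index = nr % data_stripes;
                unsigned full_stripe = nr / data_stripes;
                unsigned dev = (full_stripe + data_index) % num_stripes;

                printf("stripe %u -> data slot %u on device %u\n",
                       nr, data_index, dev);
        }
        return 0;
}
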
+
+static void map_blocks_single(const struct btrfs_chunk_map *map,
+ struct btrfs_io_geometry *io_geom)
+{
+ io_geom->stripe_index = io_geom->stripe_nr % map->num_stripes;
+ io_geom->stripe_nr /= map->num_stripes;
+ io_geom->mirror_num = io_geom->stripe_index + 1;
+}
+
/*
* Map one logical range to one or more physical ranges.
*
@@ -6237,43 +6533,37 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
struct btrfs_io_context **bioc_ret,
struct btrfs_io_stripe *smap, int *mirror_num_ret)
{
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
+ struct btrfs_io_geometry io_geom = { 0 };
u64 map_offset;
- u64 stripe_offset;
- u32 stripe_nr;
- u32 stripe_index;
- int data_stripes;
int i;
int ret = 0;
- int mirror_num = (mirror_num_ret ? *mirror_num_ret : 0);
- int num_stripes;
int num_copies;
- int max_errors = 0;
struct btrfs_io_context *bioc = NULL;
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
int dev_replace_is_ongoing = 0;
u16 num_alloc_stripes;
- u64 raid56_full_stripe_start = (u64)-1;
u64 max_len;
ASSERT(bioc_ret);
+ io_geom.mirror_num = (mirror_num_ret ? *mirror_num_ret : 0);
+ io_geom.num_stripes = 1;
+ io_geom.stripe_index = 0;
+ io_geom.op = op;
+
num_copies = btrfs_num_copies(fs_info, logical, fs_info->sectorsize);
- if (mirror_num > num_copies)
+ if (io_geom.mirror_num > num_copies)
return -EINVAL;
- em = btrfs_get_chunk_map(fs_info, logical, *length);
- if (IS_ERR(em))
- return PTR_ERR(em);
+ map = btrfs_get_chunk_map(fs_info, logical, *length);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
- map = em->map_lookup;
- data_stripes = nr_data_stripes(map);
-
- map_offset = logical - em->start;
- max_len = btrfs_max_io_len(map, op, map_offset, &stripe_nr,
- &stripe_offset, &raid56_full_stripe_start);
- *length = min_t(u64, em->len - map_offset, max_len);
+ map_offset = logical - map->start;
+ io_geom.raid56_full_stripe_start = (u64)-1;
+ max_len = btrfs_max_io_len(map, map_offset, &io_geom);
+ *length = min_t(u64, map->chunk_len - map_offset, max_len);
down_read(&dev_replace->rwsem);
dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
@@ -6284,107 +6574,46 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
if (!dev_replace_is_ongoing)
up_read(&dev_replace->rwsem);
- num_stripes = 1;
- stripe_index = 0;
- if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
- stripe_index = stripe_nr % map->num_stripes;
- stripe_nr /= map->num_stripes;
- if (op == BTRFS_MAP_READ)
- mirror_num = 1;
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
- if (op != BTRFS_MAP_READ) {
- num_stripes = map->num_stripes;
- } else if (mirror_num) {
- stripe_index = mirror_num - 1;
- } else {
- stripe_index = find_live_mirror(fs_info, map, 0,
- dev_replace_is_ongoing);
- mirror_num = stripe_index + 1;
- }
-
- } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
- if (op != BTRFS_MAP_READ) {
- num_stripes = map->num_stripes;
- } else if (mirror_num) {
- stripe_index = mirror_num - 1;
- } else {
- mirror_num = 1;
- }
-
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
- u32 factor = map->num_stripes / map->sub_stripes;
-
- stripe_index = (stripe_nr % factor) * map->sub_stripes;
- stripe_nr /= factor;
-
- if (op != BTRFS_MAP_READ)
- num_stripes = map->sub_stripes;
- else if (mirror_num)
- stripe_index += mirror_num - 1;
- else {
- int old_stripe_index = stripe_index;
- stripe_index = find_live_mirror(fs_info, map,
- stripe_index,
- dev_replace_is_ongoing);
- mirror_num = stripe_index - old_stripe_index + 1;
- }
-
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- if (op != BTRFS_MAP_READ || mirror_num > 1) {
- /*
- * Needs full stripe mapping.
- *
- * Push stripe_nr back to the start of the full stripe
- * For those cases needing a full stripe, @stripe_nr
- * is the full stripe number.
- *
- * Originally we go raid56_full_stripe_start / full_stripe_len,
- * but that can be expensive. Here we just divide
- * @stripe_nr with @data_stripes.
- */
- stripe_nr /= data_stripes;
-
- /* RAID[56] write or recovery. Return all stripes */
- num_stripes = map->num_stripes;
- max_errors = btrfs_chunk_max_errors(map);
-
- /* Return the length to the full stripe end */
- *length = min(logical + *length,
- raid56_full_stripe_start + em->start +
- btrfs_stripe_nr_to_offset(data_stripes)) -
- logical;
- stripe_index = 0;
- stripe_offset = 0;
- } else {
- ASSERT(mirror_num <= 1);
- /* Just grab the data stripe directly. */
- stripe_index = stripe_nr % data_stripes;
- stripe_nr /= data_stripes;
-
- /* We distribute the parity blocks across stripes */
- stripe_index = (stripe_nr + stripe_index) % map->num_stripes;
- if (op == BTRFS_MAP_READ && mirror_num < 1)
- mirror_num = 1;
- }
- } else {
+ switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+ case BTRFS_BLOCK_GROUP_RAID0:
+ map_blocks_raid0(map, &io_geom);
+ break;
+ case BTRFS_BLOCK_GROUP_RAID1:
+ case BTRFS_BLOCK_GROUP_RAID1C3:
+ case BTRFS_BLOCK_GROUP_RAID1C4:
+ map_blocks_raid1(fs_info, map, &io_geom, dev_replace_is_ongoing);
+ break;
+ case BTRFS_BLOCK_GROUP_DUP:
+ map_blocks_dup(map, &io_geom);
+ break;
+ case BTRFS_BLOCK_GROUP_RAID10:
+ map_blocks_raid10(fs_info, map, &io_geom, dev_replace_is_ongoing);
+ break;
+ case BTRFS_BLOCK_GROUP_RAID5:
+ case BTRFS_BLOCK_GROUP_RAID6:
+ if (op != BTRFS_MAP_READ || io_geom.mirror_num > 1)
+ map_blocks_raid56_write(map, &io_geom, logical, length);
+ else
+ map_blocks_raid56_read(map, &io_geom);
+ break;
+ default:
/*
* After this, stripe_nr is the number of stripes on this
* device we have to walk to find the data, and stripe_index is
* the number of our device in the stripe array
*/
- stripe_index = stripe_nr % map->num_stripes;
- stripe_nr /= map->num_stripes;
- mirror_num = stripe_index + 1;
+ map_blocks_single(map, &io_geom);
+ break;
}
- if (stripe_index >= map->num_stripes) {
+ if (io_geom.stripe_index >= map->num_stripes) {
btrfs_crit(fs_info,
"stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u",
- stripe_index, map->num_stripes);
+ io_geom.stripe_index, map->num_stripes);
ret = -EINVAL;
goto out;
}
- num_alloc_stripes = num_stripes;
+ num_alloc_stripes = io_geom.num_stripes;
if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
op != BTRFS_MAP_READ)
/*
@@ -6401,14 +6630,11 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
* physical block information on the stack instead of allocating an
* I/O context structure.
*/
- if (smap && num_alloc_stripes == 1 &&
- !(btrfs_need_stripe_tree_update(fs_info, map->type) &&
- op != BTRFS_MAP_READ) &&
- !((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1)) {
- ret = set_io_stripe(fs_info, op, logical, length, smap, map,
- stripe_index, stripe_offset, stripe_nr);
+ if (is_single_device_io(fs_info, smap, map, num_alloc_stripes, op,
+ io_geom.mirror_num)) {
+ ret = set_io_stripe(fs_info, logical, length, smap, map, &io_geom);
if (mirror_num_ret)
- *mirror_num_ret = mirror_num;
+ *mirror_num_ret = io_geom.mirror_num;
*bioc_ret = NULL;
goto out;
}
@@ -6428,7 +6654,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
* It's still mostly the same as other profiles, just with extra rotation.
*/
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK &&
- (op != BTRFS_MAP_READ || mirror_num > 1)) {
+ (op != BTRFS_MAP_READ || io_geom.mirror_num > 1)) {
/*
* For RAID56 @stripe_nr is already the number of full stripes
* before us, which is also the rotation value (needs to modulo
@@ -6437,28 +6663,31 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
* In this case, we just add @stripe_nr with @i, then do the
* modulo, to reduce one modulo call.
*/
- bioc->full_stripe_logical = em->start +
- btrfs_stripe_nr_to_offset(stripe_nr * data_stripes);
- for (int i = 0; i < num_stripes; i++) {
- ret = set_io_stripe(fs_info, op, logical, length,
- &bioc->stripes[i], map,
- (i + stripe_nr) % num_stripes,
- stripe_offset, stripe_nr);
- if (ret < 0)
- break;
+ bioc->full_stripe_logical = map->start +
+ btrfs_stripe_nr_to_offset(io_geom.stripe_nr *
+ nr_data_stripes(map));
+ for (int i = 0; i < io_geom.num_stripes; i++) {
+ struct btrfs_io_stripe *dst = &bioc->stripes[i];
+ u32 stripe_index;
+
+ stripe_index = (i + io_geom.stripe_nr) % io_geom.num_stripes;
+ dst->dev = map->stripes[stripe_index].dev;
+ dst->physical =
+ map->stripes[stripe_index].physical +
+ io_geom.stripe_offset +
+ btrfs_stripe_nr_to_offset(io_geom.stripe_nr);
}
} else {
/*
* For all other non-RAID56 profiles, just copy the target
* stripe into the bioc.
*/
- for (i = 0; i < num_stripes; i++) {
- ret = set_io_stripe(fs_info, op, logical, length,
- &bioc->stripes[i], map, stripe_index,
- stripe_offset, stripe_nr);
+ for (i = 0; i < io_geom.num_stripes; i++) {
+ ret = set_io_stripe(fs_info, logical, length,
+ &bioc->stripes[i], map, &io_geom);
if (ret < 0)
break;
- stripe_index++;
+ io_geom.stripe_index++;
}
}
@@ -6469,18 +6698,18 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
}
if (op != BTRFS_MAP_READ)
- max_errors = btrfs_chunk_max_errors(map);
+ io_geom.max_errors = btrfs_chunk_max_errors(map);
if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
op != BTRFS_MAP_READ) {
handle_ops_on_dev_replace(op, bioc, dev_replace, logical,
- &num_stripes, &max_errors);
+ &io_geom.num_stripes, &io_geom.max_errors);
}
*bioc_ret = bioc;
- bioc->num_stripes = num_stripes;
- bioc->max_errors = max_errors;
- bioc->mirror_num = mirror_num;
+ bioc->num_stripes = io_geom.num_stripes;
+ bioc->max_errors = io_geom.max_errors;
+ bioc->mirror_num = io_geom.mirror_num;
out:
if (dev_replace_is_ongoing) {
@@ -6488,7 +6717,7 @@ out:
/* Unlock and let waiting writers proceed */
up_read(&dev_replace->rwsem);
}
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ret;
}
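
All the per-profile state that btrfs_map_block() used to keep in local variables now travels in one btrfs_io_geometry. The sketch below is assembled only from the fields this diff touches, so the names match but the real declaration in fs/btrfs/volumes.h may order or type them differently:

#include <stdint.h>

typedef uint64_t u64;
typedef uint32_t u32;

enum btrfs_map_op { BTRFS_MAP_READ, BTRFS_MAP_WRITE };

struct btrfs_io_geometry {
        enum btrfs_map_op op;           /* read or write */
        int mirror_num;                 /* which copy to read, 0 = pick one */
        u64 stripe_offset;              /* offset within the stripe */
        u32 stripe_nr;                  /* stripe number within the chunk */
        u32 stripe_index;               /* device slot within the chunk map */
        int num_stripes;                /* stripes this I/O touches */
        int max_errors;                 /* tolerated device failures */
        u64 raid56_full_stripe_start;   /* chunk-relative, or (u64)-1 */
};

int main(void)
{
        struct btrfs_io_geometry io_geom = { .op = BTRFS_MAP_READ,
                                             .num_stripes = 1 };
        return io_geom.num_stripes - 1; /* exit 0 */
}
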
@@ -6660,12 +6889,11 @@ static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
devid, uuid);
}
-u64 btrfs_calc_stripe_length(const struct extent_map *em)
+u64 btrfs_calc_stripe_length(const struct btrfs_chunk_map *map)
{
- const struct map_lookup *map = em->map_lookup;
const int data_stripes = calc_data_stripes(map->type, map->num_stripes);
- return div_u64(em->len, data_stripes);
+ return div_u64(map->chunk_len, data_stripes);
}
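
A sketch of the stripe-length computation above: the chunk length divided by its data stripes, where the parity count follows the RAID5/RAID6 convention (one or two stripes):

#include <stdio.h>
#include <stdint.h>

static uint64_t calc_stripe_length(uint64_t chunk_len, int num_stripes,
                                   int nr_parity)
{
        return chunk_len / (num_stripes - nr_parity);
}

int main(void)
{
        /* A 3 GiB RAID5 chunk on 4 disks: each device holds 1 GiB. */
        printf("%llu\n", (unsigned long long)
               calc_stripe_length(3ULL << 30, 4, 1));
        return 0;
}
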
#if BITS_PER_LONG == 32
@@ -6734,9 +6962,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
{
BTRFS_DEV_LOOKUP_ARGS(args);
struct btrfs_fs_info *fs_info = leaf->fs_info;
- struct extent_map_tree *map_tree = &fs_info->mapping_tree;
- struct map_lookup *map;
- struct extent_map *em;
+ struct btrfs_chunk_map *map;
u64 logical;
u64 length;
u64 devid;
@@ -6770,35 +6996,22 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
return ret;
}
- read_lock(&map_tree->lock);
- em = lookup_extent_mapping(map_tree, logical, 1);
- read_unlock(&map_tree->lock);
+ map = btrfs_find_chunk_map(fs_info, logical, 1);
/* already mapped? */
- if (em && em->start <= logical && em->start + em->len > logical) {
- free_extent_map(em);
+ if (map && map->start <= logical && map->start + map->chunk_len > logical) {
+ btrfs_free_chunk_map(map);
return 0;
- } else if (em) {
- free_extent_map(em);
+ } else if (map) {
+ btrfs_free_chunk_map(map);
}
- em = alloc_extent_map();
- if (!em)
- return -ENOMEM;
- map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
- if (!map) {
- free_extent_map(em);
+ map = btrfs_alloc_chunk_map(num_stripes, GFP_NOFS);
+ if (!map)
return -ENOMEM;
- }
-
- set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
- em->map_lookup = map;
- em->start = logical;
- em->len = length;
- em->orig_start = 0;
- em->block_start = 0;
- em->block_len = em->len;
+ map->start = logical;
+ map->chunk_len = length;
map->num_stripes = num_stripes;
map->io_width = btrfs_chunk_io_width(leaf, chunk);
map->io_align = btrfs_chunk_io_align(leaf, chunk);
@@ -6813,7 +7026,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
*/
map->sub_stripes = btrfs_raid_array[index].sub_stripes;
map->verified_stripes = 0;
- em->orig_block_len = btrfs_calc_stripe_length(em);
+ map->stripe_size = btrfs_calc_stripe_length(map);
for (i = 0; i < num_stripes; i++) {
map->stripes[i].physical =
btrfs_stripe_offset_nr(leaf, chunk, i);
@@ -6829,7 +7042,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
devid, uuid);
if (IS_ERR(map->stripes[i].dev)) {
ret = PTR_ERR(map->stripes[i].dev);
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ret;
}
}
@@ -6838,15 +7051,12 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
&(map->stripes[i].dev->dev_state));
}
- write_lock(&map_tree->lock);
- ret = add_extent_mapping(map_tree, em, 0);
- write_unlock(&map_tree->lock);
+ ret = btrfs_add_chunk_map(fs_info, map);
if (ret < 0) {
btrfs_err(fs_info,
"failed to add chunk map, start=%llu len=%llu: %d",
- em->start, em->len, ret);
+ map->start, map->chunk_len, ret);
}
- free_extent_map(em);
return ret;
}
@@ -7156,26 +7366,21 @@ out_short_read:
bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
struct btrfs_device *failing_dev)
{
- struct extent_map_tree *map_tree = &fs_info->mapping_tree;
- struct extent_map *em;
- u64 next_start = 0;
+ struct btrfs_chunk_map *map;
+ u64 next_start;
bool ret = true;
- read_lock(&map_tree->lock);
- em = lookup_extent_mapping(map_tree, 0, (u64)-1);
- read_unlock(&map_tree->lock);
+ map = btrfs_find_chunk_map(fs_info, 0, U64_MAX);
/* No chunk at all? Return false anyway */
- if (!em) {
+ if (!map) {
ret = false;
goto out;
}
- while (em) {
- struct map_lookup *map;
+ while (map) {
int missing = 0;
int max_tolerated;
int i;
- map = em->map_lookup;
max_tolerated =
btrfs_get_num_tolerated_disk_barrier_failures(
map->type);
@@ -7193,18 +7398,15 @@ bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
if (!failing_dev)
btrfs_warn(fs_info,
"chunk %llu missing %d devices, max tolerance is %d for writable mount",
- em->start, missing, max_tolerated);
- free_extent_map(em);
+ map->start, missing, max_tolerated);
+ btrfs_free_chunk_map(map);
ret = false;
goto out;
}
- next_start = extent_map_end(em);
- free_extent_map(em);
+ next_start = map->start + map->chunk_len;
+ btrfs_free_chunk_map(map);
- read_lock(&map_tree->lock);
- em = lookup_extent_mapping(map_tree, next_start,
- (u64)(-1) - next_start);
- read_unlock(&map_tree->lock);
+ map = btrfs_find_chunk_map(fs_info, next_start, U64_MAX - next_start);
}
out:
return ret;
@@ -7697,20 +7899,15 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
u64 physical_offset, u64 physical_len)
{
struct btrfs_dev_lookup_args args = { .devid = devid };
- struct extent_map_tree *em_tree = &fs_info->mapping_tree;
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
struct btrfs_device *dev;
u64 stripe_len;
bool found = false;
int ret = 0;
int i;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, chunk_offset, 1);
- read_unlock(&em_tree->lock);
-
- if (!em) {
+ map = btrfs_find_chunk_map(fs_info, chunk_offset, 1);
+ if (!map) {
btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu doesn't have corresponding chunk",
physical_offset, devid);
@@ -7718,12 +7915,11 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
goto out;
}
- map = em->map_lookup;
- stripe_len = btrfs_calc_stripe_length(em);
+ stripe_len = btrfs_calc_stripe_length(map);
if (physical_len != stripe_len) {
btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu",
- physical_offset, devid, em->start, physical_len,
+ physical_offset, devid, map->start, physical_len,
stripe_len);
ret = -EUCLEAN;
goto out;
@@ -7746,7 +7942,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
if (map->verified_stripes >= map->num_stripes) {
btrfs_err(fs_info,
"too many dev extents for chunk %llu found",
- em->start);
+ map->start);
ret = -EUCLEAN;
goto out;
}
@@ -7792,32 +7988,30 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
}
out:
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ret;
}
static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
{
- struct extent_map_tree *em_tree = &fs_info->mapping_tree;
- struct extent_map *em;
struct rb_node *node;
int ret = 0;
- read_lock(&em_tree->lock);
- for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
- em = rb_entry(node, struct extent_map, rb_node);
- if (em->map_lookup->num_stripes !=
- em->map_lookup->verified_stripes) {
+ read_lock(&fs_info->mapping_tree_lock);
+ for (node = rb_first_cached(&fs_info->mapping_tree); node; node = rb_next(node)) {
+ struct btrfs_chunk_map *map;
+
+ map = rb_entry(node, struct btrfs_chunk_map, rb_node);
+ if (map->num_stripes != map->verified_stripes) {
btrfs_err(fs_info,
"chunk %llu has missing dev extent, have %d expect %d",
- em->start, em->map_lookup->verified_stripes,
- em->map_lookup->num_stripes);
+ map->start, map->verified_stripes, map->num_stripes);
ret = -EUCLEAN;
goto out;
}
}
out:
- read_unlock(&em_tree->lock);
+ read_unlock(&fs_info->mapping_tree_lock);
return ret;
}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 9cc374864a79..53f87f398da7 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -426,7 +426,8 @@ struct btrfs_discard_stripe {
struct btrfs_io_context {
refcount_t refs;
struct btrfs_fs_info *fs_info;
- u64 map_type; /* get from map_lookup->type */
+ /* Taken from struct btrfs_chunk_map::type. */
+ u64 map_type;
struct bio *orig_bio;
atomic_t error;
u16 max_errors;
@@ -529,18 +530,32 @@ struct btrfs_raid_attr {
extern const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES];
-struct map_lookup {
+struct btrfs_chunk_map {
+ struct rb_node rb_node;
+ /* For mount time dev extent verification. */
+ int verified_stripes;
+ refcount_t refs;
+ u64 start;
+ u64 chunk_len;
+ u64 stripe_size;
u64 type;
int io_align;
int io_width;
int num_stripes;
int sub_stripes;
- int verified_stripes; /* For mount time dev extent verification */
struct btrfs_io_stripe stripes[];
};
-#define map_lookup_size(n) (sizeof(struct map_lookup) + \
- (sizeof(struct btrfs_io_stripe) * (n)))
+#define btrfs_chunk_map_size(n) (sizeof(struct btrfs_chunk_map) + \
+ (sizeof(struct btrfs_io_stripe) * (n)))
+
+static inline void btrfs_free_chunk_map(struct btrfs_chunk_map *map)
+{
+ if (map && refcount_dec_and_test(&map->refs)) {
+ ASSERT(RB_EMPTY_NODE(&map->rb_node));
+ kfree(map);
+ }
+}
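
A minimal userspace model of the put/free pattern above: the last reference dropper frees the map. The kernel version uses refcount_t and asserts the map is already off the rb-tree before kfree():

#include <stdio.h>
#include <stdlib.h>

struct chunk_map {
        int refs;
};

static struct chunk_map *map_get(struct chunk_map *map)
{
        map->refs++;
        return map;
}

static void map_put(struct chunk_map *map)
{
        if (map && --map->refs == 0) {
                printf("freeing map\n");
                free(map);
        }
}

int main(void)
{
        struct chunk_map *map = calloc(1, sizeof(*map));

        map->refs = 1;  /* reference held by the lookup tree */
        map_get(map);   /* reference held by an I/O in flight */
        map_put(map);   /* I/O completes */
        map_put(map);   /* map removed from the tree: freed */
        return 0;
}
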
struct btrfs_balance_args;
struct btrfs_balance_progress;
@@ -598,7 +613,7 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
}
/*
- * Do the type safe converstion from stripe_nr to offset inside the chunk.
+ * Do the type safe conversion from stripe_nr to offset inside the chunk.
*
* @stripe_nr is u32, with left shift it can overflow u32 for chunks larger
* than 4G. This does the proper type cast to avoid overflow.
@@ -624,7 +639,7 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info);
int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info);
struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
u64 type);
-void btrfs_mapping_tree_free(struct extent_map_tree *tree);
+void btrfs_mapping_tree_free(struct btrfs_fs_info *fs_info);
int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
blk_mode_t flags, void *holder);
struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
@@ -680,13 +695,25 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info,
u64 logical, u64 len);
unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
u64 logical);
-u64 btrfs_calc_stripe_length(const struct extent_map *em);
+u64 btrfs_calc_stripe_length(const struct btrfs_chunk_map *map);
int btrfs_nr_parity_stripes(u64 type);
int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
struct btrfs_block_group *bg);
int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset);
-struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
- u64 logical, u64 length);
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+struct btrfs_chunk_map *btrfs_alloc_chunk_map(int num_stripes, gfp_t gfp);
+int btrfs_add_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map);
+#endif
+
+struct btrfs_chunk_map *btrfs_clone_chunk_map(struct btrfs_chunk_map *map, gfp_t gfp);
+struct btrfs_chunk_map *btrfs_find_chunk_map(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length);
+struct btrfs_chunk_map *btrfs_find_chunk_map_nolock(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length);
+struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length);
+void btrfs_remove_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map);
void btrfs_release_disk_super(struct btrfs_super_block *super);
static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 3cf236fb40a4..6287763fdccc 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -382,6 +382,53 @@ static int btrfs_xattr_handler_set(const struct xattr_handler *handler,
return btrfs_setxattr_trans(inode, name, buffer, size, flags);
}
+static int btrfs_xattr_handler_get_security(const struct xattr_handler *handler,
+ struct dentry *unused,
+ struct inode *inode,
+ const char *name, void *buffer,
+ size_t size)
+{
+ int ret;
+ bool is_cap = false;
+
+ name = xattr_full_name(handler, name);
+
+ /*
+ * The VFS doesn't cache security.capability lookups, so it calls into
+ * us constantly to see if there's a capability xattr. Cache the result
+ * here to avoid wasting time doing lookups for xattrs we know don't
+ * exist.
+ */
+ if (strcmp(name, XATTR_NAME_CAPS) == 0) {
+ is_cap = true;
+ if (test_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags))
+ return -ENODATA;
+ }
+
+ ret = btrfs_getxattr(inode, name, buffer, size);
+ if (ret == -ENODATA && is_cap)
+ set_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags);
+ return ret;
+}
+
+static int btrfs_xattr_handler_set_security(const struct xattr_handler *handler,
+ struct mnt_idmap *idmap,
+ struct dentry *unused,
+ struct inode *inode,
+ const char *name,
+ const void *buffer,
+ size_t size, int flags)
+{
+ if (btrfs_root_readonly(BTRFS_I(inode)->root))
+ return -EROFS;
+
+ name = xattr_full_name(handler, name);
+ if (strcmp(name, XATTR_NAME_CAPS) == 0)
+ clear_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags);
+
+ return btrfs_setxattr_trans(inode, name, buffer, size, flags);
+}
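
The pair of handlers above implements a per-inode negative cache: a runtime flag remembers that security.capability returned -ENODATA, and any set of that xattr (here or in btrfs_initxattrs() below) clears the flag again. A userspace model of just that caching behavior, with -61 standing in for Linux's ENODATA:

#include <stdbool.h>
#include <stdio.h>

#define ENODATA_ERR (-61)

struct demo_inode {
        bool no_cap_xattr;      /* stands in for BTRFS_INODE_NO_CAP_XATTR */
        bool has_caps;
};

static int get_caps(struct demo_inode *inode)
{
        if (inode->no_cap_xattr)
                return ENODATA_ERR;     /* answered from the cache */
        if (!inode->has_caps) {
                inode->no_cap_xattr = true;     /* remember the miss */
                return ENODATA_ERR;
        }
        return 0;
}

static void set_caps(struct demo_inode *inode)
{
        inode->no_cap_xattr = false;    /* invalidate the negative cache */
        inode->has_caps = true;
}

int main(void)
{
        struct demo_inode inode = { 0 };

        printf("first lookup: %d\n", get_caps(&inode)); /* miss, now cached */
        printf("cached lookup: %d\n", get_caps(&inode));
        set_caps(&inode);
        printf("after setxattr: %d\n", get_caps(&inode));
        return 0;
}
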
+
static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler,
struct mnt_idmap *idmap,
struct dentry *unused, struct inode *inode,
@@ -420,8 +467,8 @@ static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler,
static const struct xattr_handler btrfs_security_xattr_handler = {
.prefix = XATTR_SECURITY_PREFIX,
- .get = btrfs_xattr_handler_get,
- .set = btrfs_xattr_handler_set,
+ .get = btrfs_xattr_handler_get_security,
+ .set = btrfs_xattr_handler_set_security,
};
static const struct xattr_handler btrfs_trusted_xattr_handler = {
@@ -473,6 +520,10 @@ static int btrfs_initxattrs(struct inode *inode,
}
strcpy(name, XATTR_SECURITY_PREFIX);
strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name);
+
+ if (strcmp(name, XATTR_NAME_CAPS) == 0)
+ clear_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags);
+
err = btrfs_setxattr(trans, inode, name, xattr->value,
xattr->value_len, 0);
kfree(name);
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 6c231a116a29..36cf1f0e338e 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -121,7 +121,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
workspace->strm.total_in = 0;
workspace->strm.total_out = 0;
- out_page = alloc_page(GFP_NOFS);
+ out_page = btrfs_alloc_compr_page();
if (out_page == NULL) {
ret = -ENOMEM;
goto out;
@@ -200,7 +200,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
ret = -E2BIG;
goto out;
}
- out_page = alloc_page(GFP_NOFS);
+ out_page = btrfs_alloc_compr_page();
if (out_page == NULL) {
ret = -ENOMEM;
goto out;
@@ -236,7 +236,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
ret = -E2BIG;
goto out;
}
- out_page = alloc_page(GFP_NOFS);
+ out_page = btrfs_alloc_compr_page();
if (out_page == NULL) {
ret = -ENOMEM;
goto out;
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 188378ca19c7..12066afc235c 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -781,7 +781,7 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
* Check mount options here, because we might change fs_info->zoned
* from fs_info->zone_size.
*/
- ret = btrfs_check_mountopts_zoned(fs_info);
+ ret = btrfs_check_mountopts_zoned(fs_info, &fs_info->mount_opt);
if (ret)
return ret;
@@ -789,7 +789,7 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
return 0;
}
-int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info)
+int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info, unsigned long *mount_opt)
{
if (!btrfs_is_zoned(info))
return 0;
@@ -798,18 +798,21 @@ int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info)
* Space cache writing is not COWed. Disable that to avoid write errors
* in sequential zones.
*/
- if (btrfs_test_opt(info, SPACE_CACHE)) {
+ if (btrfs_raw_test_opt(*mount_opt, SPACE_CACHE)) {
btrfs_err(info, "zoned: space cache v1 is not supported");
return -EINVAL;
}
- if (btrfs_test_opt(info, NODATACOW)) {
+ if (btrfs_raw_test_opt(*mount_opt, NODATACOW)) {
btrfs_err(info, "zoned: NODATACOW not supported");
return -EINVAL;
}
- btrfs_clear_and_info(info, DISCARD_ASYNC,
- "zoned: async discard ignored and disabled for zoned mode");
+ if (btrfs_raw_test_opt(*mount_opt, DISCARD_ASYNC)) {
+ btrfs_info(info,
+ "zoned: async discard ignored and disabled for zoned mode");
+ btrfs_clear_opt(*mount_opt, DISCARD_ASYNC);
+ }
return 0;
}
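
Taking the option mask as an explicit parameter, rather than reading fs_info->mount_opt, presumably lets callers validate a prospective set of options (for example on remount) before committing them. A sketch of the test-and-clear pattern on a caller-supplied mask; the bit values here are invented for the demo:

#include <stdio.h>

enum {
        OPT_SPACE_CACHE   = 1UL << 0,
        OPT_NODATACOW     = 1UL << 1,
        OPT_DISCARD_ASYNC = 1UL << 2,
};

static int check_mountopts_zoned(unsigned long *mount_opt)
{
        if (*mount_opt & OPT_SPACE_CACHE)
                return -1;      /* space cache v1 unsupported on zoned */
        if (*mount_opt & OPT_NODATACOW)
                return -1;      /* NODATACOW unsupported on zoned */
        if (*mount_opt & OPT_DISCARD_ASYNC) {
                printf("async discard disabled for zoned mode\n");
                *mount_opt &= ~OPT_DISCARD_ASYNC;
        }
        return 0;
}

int main(void)
{
        unsigned long opts = OPT_DISCARD_ASYNC;

        printf("ret=%d opts=%#lx\n", check_mountopts_zoned(&opts), opts);
        return 0;
}
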
@@ -1290,7 +1293,7 @@ struct zone_info {
static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
struct zone_info *info, unsigned long *active,
- struct map_lookup *map)
+ struct btrfs_chunk_map *map)
{
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
struct btrfs_device *device = map->stripes[zone_idx].dev;
@@ -1393,7 +1396,7 @@ static int btrfs_load_block_group_single(struct btrfs_block_group *bg,
}
static int btrfs_load_block_group_dup(struct btrfs_block_group *bg,
- struct map_lookup *map,
+ struct btrfs_chunk_map *map,
struct zone_info *zone_info,
unsigned long *active)
{
@@ -1435,7 +1438,7 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg,
}
static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg,
- struct map_lookup *map,
+ struct btrfs_chunk_map *map,
struct zone_info *zone_info,
unsigned long *active)
{
@@ -1483,7 +1486,7 @@ static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg,
}
static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
- struct map_lookup *map,
+ struct btrfs_chunk_map *map,
struct zone_info *zone_info,
unsigned long *active)
{
@@ -1515,7 +1518,7 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
}
static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
- struct map_lookup *map,
+ struct btrfs_chunk_map *map,
struct zone_info *zone_info,
unsigned long *active)
{
@@ -1552,9 +1555,7 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
- struct extent_map_tree *em_tree = &fs_info->mapping_tree;
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
u64 logical = cache->start;
u64 length = cache->length;
struct zone_info *zone_info = NULL;
@@ -1575,17 +1576,11 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
return -EIO;
}
- /* Get the chunk mapping */
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, logical, length);
- read_unlock(&em_tree->lock);
-
- if (!em)
+ map = btrfs_find_chunk_map(fs_info, logical, length);
+ if (!map)
return -EINVAL;
- map = em->map_lookup;
-
- cache->physical_map = kmemdup(map, map_lookup_size(map->num_stripes), GFP_NOFS);
+ cache->physical_map = btrfs_clone_chunk_map(map, GFP_NOFS);
if (!cache->physical_map) {
ret = -ENOMEM;
goto out;
@@ -1687,12 +1682,11 @@ out:
spin_unlock(&fs_info->zone_active_bgs_lock);
}
} else {
- kfree(cache->physical_map);
+ btrfs_free_chunk_map(cache->physical_map);
cache->physical_map = NULL;
}
bitmap_free(active);
kfree(zone_info);
- free_extent_map(em);
return ret;
}
@@ -1715,22 +1709,6 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache)
cache->zone_unusable = unusable;
}
-void btrfs_redirty_list_add(struct btrfs_transaction *trans,
- struct extent_buffer *eb)
-{
- if (!btrfs_is_zoned(eb->fs_info) ||
- btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN))
- return;
-
- ASSERT(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
-
- memzero_extent_buffer(eb, 0, eb->len);
- set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags);
- set_extent_buffer_dirty(eb);
- set_extent_bit(&trans->dirty_pages, eb->start, eb->start + eb->len - 1,
- EXTENT_DIRTY, NULL);
-}
-
bool btrfs_use_zone_append(struct btrfs_bio *bbio)
{
u64 start = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT);
@@ -2082,7 +2060,7 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
bool btrfs_zone_activate(struct btrfs_block_group *block_group)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
struct btrfs_device *device;
u64 physical;
const bool is_data = (block_group->flags & BTRFS_BLOCK_GROUP_DATA);
@@ -2194,7 +2172,7 @@ static void wait_eb_writebacks(struct btrfs_block_group *block_group)
static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
const bool is_metadata = (block_group->flags &
(BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM));
int ret = 0;
@@ -2643,7 +2621,7 @@ void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info)
/* Release reservation for currently active block groups. */
spin_lock(&fs_info->zone_active_bgs_lock);
list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) {
- struct map_lookup *map = block_group->physical_map;
+ struct btrfs_chunk_map *map = block_group->physical_map;
if (!(block_group->flags &
(BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)))
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index b9cec523b778..f24a5ffb7807 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -45,7 +45,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache);
void btrfs_destroy_dev_zone_info(struct btrfs_device *device);
struct btrfs_zoned_device_info *btrfs_clone_dev_zone_info(struct btrfs_device *orig_dev);
int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info);
-int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info);
+int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info, unsigned long *mount_opt);
int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
u64 *bytenr_ret);
int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
@@ -59,8 +59,6 @@ int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size);
int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new);
void btrfs_calc_zone_unusable(struct btrfs_block_group *cache);
-void btrfs_redirty_list_add(struct btrfs_transaction *trans,
- struct extent_buffer *eb);
bool btrfs_use_zone_append(struct btrfs_bio *bbio);
void btrfs_record_physical_zoned(struct btrfs_bio *bbio);
int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
@@ -123,7 +121,8 @@ static inline int btrfs_check_zoned_mode(const struct btrfs_fs_info *fs_info)
return -EOPNOTSUPP;
}
-static inline int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info)
+static inline int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info,
+ unsigned long *mount_opt)
{
return 0;
}
@@ -180,9 +179,6 @@ static inline int btrfs_load_block_group_zone_info(
static inline void btrfs_calc_zone_unusable(struct btrfs_block_group *cache) { }
-static inline void btrfs_redirty_list_add(struct btrfs_transaction *trans,
- struct extent_buffer *eb) { }
-
static inline bool btrfs_use_zone_append(struct btrfs_bio *bbio)
{
return false;
@@ -323,7 +319,7 @@ static inline bool btrfs_check_device_zone_type(const struct btrfs_fs_info *fs_i
(bdev_zone_sectors(bdev) << SECTOR_SHIFT);
}
- /* Do not allow Host Manged zoned device */
+ /* Do not allow Host Managed zoned device. */
return bdev_zoned_model(bdev) != BLK_ZONED_HM;
}
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index 5511766485cd..0d66db8bc1d4 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -410,9 +410,8 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
workspace->in_buf.pos = 0;
workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE);
-
/* Allocate and map in the output buffer */
- out_page = alloc_page(GFP_NOFS);
+ out_page = btrfs_alloc_compr_page();
if (out_page == NULL) {
ret = -ENOMEM;
goto out;
@@ -457,7 +456,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
ret = -E2BIG;
goto out;
}
- out_page = alloc_page(GFP_NOFS);
+ out_page = btrfs_alloc_compr_page();
if (out_page == NULL) {
ret = -ENOMEM;
goto out;
@@ -514,7 +513,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
ret = -E2BIG;
goto out;
}
- out_page = alloc_page(GFP_NOFS);
+ out_page = btrfs_alloc_compr_page();
if (out_page == NULL) {
ret = -ENOMEM;
goto out;
diff --git a/fs/buffer.c b/fs/buffer.c
index 967f34b70aa8..d3bcf601d3e5 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -180,11 +180,11 @@ EXPORT_SYMBOL(end_buffer_write_sync);
* Various filesystems appear to want __find_get_block to be non-blocking.
* But it's the page lock which protects the buffers. To get around this,
* we get exclusion from try_to_free_buffers with the blockdev mapping's
- * private_lock.
+ * i_private_lock.
*
- * Hack idea: for the blockdev mapping, private_lock contention
+ * Hack idea: for the blockdev mapping, i_private_lock contention
* may be quite high. This code could TryLock the page, and if that
- * succeeds, there is no need to take private_lock.
+ * succeeds, there is no need to take i_private_lock.
*/
static struct buffer_head *
__find_get_block_slow(struct block_device *bdev, sector_t block)
@@ -199,12 +199,12 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
int all_mapped = 1;
static DEFINE_RATELIMIT_STATE(last_warned, HZ, 1);
- index = block >> (PAGE_SHIFT - bd_inode->i_blkbits);
+ index = ((loff_t)block << bd_inode->i_blkbits) / PAGE_SIZE;
folio = __filemap_get_folio(bd_mapping, index, FGP_ACCESSED, 0);
if (IS_ERR(folio))
goto out;
- spin_lock(&bd_mapping->private_lock);
+ spin_lock(&bd_mapping->i_private_lock);
head = folio_buffers(folio);
if (!head)
goto out_unlock;
@@ -236,7 +236,7 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
1 << bd_inode->i_blkbits);
}
out_unlock:
- spin_unlock(&bd_mapping->private_lock);
+ spin_unlock(&bd_mapping->i_private_lock);
folio_put(folio);
out:
return ret;
@@ -372,10 +372,10 @@ static void end_buffer_async_read_io(struct buffer_head *bh, int uptodate)
}
/*
- * Completion handler for block_write_full_page() - pages which are unlocked
- * during I/O, and which have PageWriteback cleared upon I/O completion.
+ * Completion handler for block_write_full_folio() - folios which are unlocked
+ * during I/O, and which have the writeback flag cleared upon I/O completion.
*/
-void end_buffer_async_write(struct buffer_head *bh, int uptodate)
+static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
{
unsigned long flags;
struct buffer_head *first;
@@ -415,7 +415,6 @@ still_busy:
spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
return;
}
-EXPORT_SYMBOL(end_buffer_async_write);
/*
* If a page's buffers are under async readin (end_buffer_async_read
@@ -467,25 +466,25 @@ EXPORT_SYMBOL(mark_buffer_async_write);
*
* The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
* inode_has_buffers() and invalidate_inode_buffers() are provided for the
- * management of a list of dependent buffers at ->i_mapping->private_list.
+ * management of a list of dependent buffers at ->i_mapping->i_private_list.
*
* Locking is a little subtle: try_to_free_buffers() will remove buffers
* from their controlling inode's queue when they are being freed. But
* try_to_free_buffers() will be operating against the *blockdev* mapping
* at the time, not against the S_ISREG file which depends on those buffers.
- * So the locking for private_list is via the private_lock in the address_space
+ * So the locking for i_private_list is via the i_private_lock in the address_space
* which backs the buffers. Which is different from the address_space
* against which the buffers are listed. So for a particular address_space,
- * mapping->private_lock does *not* protect mapping->private_list! In fact,
- * mapping->private_list will always be protected by the backing blockdev's
- * ->private_lock.
+ * mapping->i_private_lock does *not* protect mapping->i_private_list! In fact,
+ * mapping->i_private_list will always be protected by the backing blockdev's
+ * ->i_private_lock.
*
* Which introduces a requirement: all buffers on an address_space's
- * ->private_list must be from the same address_space: the blockdev's.
+ * ->i_private_list must be from the same address_space: the blockdev's.
*
- * address_spaces which do not place buffers at ->private_list via these
- * utility functions are free to use private_lock and private_list for
- * whatever they want. The only requirement is that list_empty(private_list)
+ * address_spaces which do not place buffers at ->i_private_list via these
+ * utility functions are free to use i_private_lock and i_private_list for
+ * whatever they want. The only requirement is that list_empty(i_private_list)
* be true at clear_inode() time.
*
* FIXME: clear_inode should not call invalidate_inode_buffers(). The
@@ -508,7 +507,7 @@ EXPORT_SYMBOL(mark_buffer_async_write);
*/
/*
- * The buffer's backing address_space's private_lock must be held
+ * The buffer's backing address_space's i_private_lock must be held
*/
static void __remove_assoc_queue(struct buffer_head *bh)
{
@@ -519,7 +518,7 @@ static void __remove_assoc_queue(struct buffer_head *bh)
int inode_has_buffers(struct inode *inode)
{
- return !list_empty(&inode->i_data.private_list);
+ return !list_empty(&inode->i_data.i_private_list);
}
/*
@@ -561,7 +560,7 @@ repeat:
* sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
* @mapping: the mapping which wants those buffers written
*
- * Starts I/O against the buffers at mapping->private_list, and waits upon
+ * Starts I/O against the buffers at mapping->i_private_list, and waits upon
* that I/O.
*
* Basically, this is a convenience function for fsync().
@@ -570,13 +569,13 @@ repeat:
*/
int sync_mapping_buffers(struct address_space *mapping)
{
- struct address_space *buffer_mapping = mapping->private_data;
+ struct address_space *buffer_mapping = mapping->i_private_data;
- if (buffer_mapping == NULL || list_empty(&mapping->private_list))
+ if (buffer_mapping == NULL || list_empty(&mapping->i_private_list))
return 0;
- return fsync_buffers_list(&buffer_mapping->private_lock,
- &mapping->private_list);
+ return fsync_buffers_list(&buffer_mapping->i_private_lock,
+ &mapping->i_private_list);
}
EXPORT_SYMBOL(sync_mapping_buffers);
@@ -673,17 +672,17 @@ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
struct address_space *buffer_mapping = bh->b_folio->mapping;
mark_buffer_dirty(bh);
- if (!mapping->private_data) {
- mapping->private_data = buffer_mapping;
+ if (!mapping->i_private_data) {
+ mapping->i_private_data = buffer_mapping;
} else {
- BUG_ON(mapping->private_data != buffer_mapping);
+ BUG_ON(mapping->i_private_data != buffer_mapping);
}
if (!bh->b_assoc_map) {
- spin_lock(&buffer_mapping->private_lock);
+ spin_lock(&buffer_mapping->i_private_lock);
list_move_tail(&bh->b_assoc_buffers,
- &mapping->private_list);
+ &mapping->i_private_list);
bh->b_assoc_map = mapping;
- spin_unlock(&buffer_mapping->private_lock);
+ spin_unlock(&buffer_mapping->i_private_lock);
}
}
EXPORT_SYMBOL(mark_buffer_dirty_inode);
@@ -706,7 +705,7 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode);
* bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
* page on the dirty page list.
*
- * We use private_lock to lock against try_to_free_buffers while using the
+ * We use i_private_lock to lock against try_to_free_buffers while using the
* page's buffer list. Also use this to protect against clean buffers being
* added to the page after it was set dirty.
*
@@ -718,7 +717,7 @@ bool block_dirty_folio(struct address_space *mapping, struct folio *folio)
struct buffer_head *head;
bool newly_dirty;
- spin_lock(&mapping->private_lock);
+ spin_lock(&mapping->i_private_lock);
head = folio_buffers(folio);
if (head) {
struct buffer_head *bh = head;
@@ -734,7 +733,7 @@ bool block_dirty_folio(struct address_space *mapping, struct folio *folio)
*/
folio_memcg_lock(folio);
newly_dirty = !folio_test_set_dirty(folio);
- spin_unlock(&mapping->private_lock);
+ spin_unlock(&mapping->i_private_lock);
if (newly_dirty)
__folio_mark_dirty(folio, mapping, 1);
@@ -827,7 +826,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
smp_mb();
if (buffer_dirty(bh)) {
list_add(&bh->b_assoc_buffers,
- &mapping->private_list);
+ &mapping->i_private_list);
bh->b_assoc_map = mapping;
}
spin_unlock(lock);
@@ -851,7 +850,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
* probably unmounting the fs, but that doesn't mean we have already
* done a sync(). Just drop the buffers from the inode list.
*
- * NOTE: we take the inode's blockdev's mapping's private_lock. Which
+ * NOTE: we take the inode's blockdev's mapping's i_private_lock. Which
* assumes that all the buffers are against the blockdev. Not true
* for reiserfs.
*/
@@ -859,13 +858,13 @@ void invalidate_inode_buffers(struct inode *inode)
{
if (inode_has_buffers(inode)) {
struct address_space *mapping = &inode->i_data;
- struct list_head *list = &mapping->private_list;
- struct address_space *buffer_mapping = mapping->private_data;
+ struct list_head *list = &mapping->i_private_list;
+ struct address_space *buffer_mapping = mapping->i_private_data;
- spin_lock(&buffer_mapping->private_lock);
+ spin_lock(&buffer_mapping->i_private_lock);
while (!list_empty(list))
__remove_assoc_queue(BH_ENTRY(list->next));
- spin_unlock(&buffer_mapping->private_lock);
+ spin_unlock(&buffer_mapping->i_private_lock);
}
}
EXPORT_SYMBOL(invalidate_inode_buffers);
@@ -882,10 +881,10 @@ int remove_inode_buffers(struct inode *inode)
if (inode_has_buffers(inode)) {
struct address_space *mapping = &inode->i_data;
- struct list_head *list = &mapping->private_list;
- struct address_space *buffer_mapping = mapping->private_data;
+ struct list_head *list = &mapping->i_private_list;
+ struct address_space *buffer_mapping = mapping->i_private_data;
- spin_lock(&buffer_mapping->private_lock);
+ spin_lock(&buffer_mapping->i_private_lock);
while (!list_empty(list)) {
struct buffer_head *bh = BH_ENTRY(list->next);
if (buffer_dirty(bh)) {
@@ -894,7 +893,7 @@ int remove_inode_buffers(struct inode *inode)
}
__remove_assoc_queue(bh);
}
- spin_unlock(&buffer_mapping->private_lock);
+ spin_unlock(&buffer_mapping->i_private_lock);
}
return ret;
}
@@ -995,11 +994,12 @@ static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size)
* Initialise the state of a blockdev folio's buffers.
*/
static sector_t folio_init_buffers(struct folio *folio,
- struct block_device *bdev, sector_t block, int size)
+ struct block_device *bdev, unsigned size)
{
struct buffer_head *head = folio_buffers(folio);
struct buffer_head *bh = head;
bool uptodate = folio_test_uptodate(folio);
+ sector_t block = div_u64(folio_pos(folio), size);
sector_t end_block = blkdev_max_block(bdev, size);
do {
@@ -1024,86 +1024,88 @@ static sector_t folio_init_buffers(struct folio *folio,
}
/*
- * Create the page-cache page that contains the requested block.
+ * Create the page-cache folio that contains the requested block.
*
* This is used purely for blockdev mappings.
+ *
+ * Returns false if we have a failure which cannot be cured by retrying
+ * without sleeping. Returns true if we succeeded, or the caller should retry.
*/
-static int
-grow_dev_page(struct block_device *bdev, sector_t block,
- pgoff_t index, int size, int sizebits, gfp_t gfp)
+static bool grow_dev_folio(struct block_device *bdev, sector_t block,
+ pgoff_t index, unsigned size, gfp_t gfp)
{
struct inode *inode = bdev->bd_inode;
struct folio *folio;
struct buffer_head *bh;
- sector_t end_block;
- int ret = 0;
+ sector_t end_block = 0;
folio = __filemap_get_folio(inode->i_mapping, index,
FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp);
if (IS_ERR(folio))
- return PTR_ERR(folio);
+ return false;
bh = folio_buffers(folio);
if (bh) {
if (bh->b_size == size) {
- end_block = folio_init_buffers(folio, bdev,
- (sector_t)index << sizebits, size);
- goto done;
+ end_block = folio_init_buffers(folio, bdev, size);
+ goto unlock;
+ }
+
+ /*
+ * Retrying may succeed; for example the folio may finish
+ * writeback, or buffers may be cleaned. This should not
+ * happen very often; maybe we have old buffers attached to
+ * this blockdev's page cache and we're trying to change
+ * the block size?
+ */
+ if (!try_to_free_buffers(folio)) {
+ end_block = ~0ULL;
+ goto unlock;
}
- if (!try_to_free_buffers(folio))
- goto failed;
}
- ret = -ENOMEM;
bh = folio_alloc_buffers(folio, size, gfp | __GFP_ACCOUNT);
if (!bh)
- goto failed;
+ goto unlock;
/*
* Link the folio to the buffers and initialise them. Take the
* lock to be atomic wrt __find_get_block(), which does not
* run under the folio lock.
*/
- spin_lock(&inode->i_mapping->private_lock);
+ spin_lock(&inode->i_mapping->i_private_lock);
link_dev_buffers(folio, bh);
- end_block = folio_init_buffers(folio, bdev,
- (sector_t)index << sizebits, size);
- spin_unlock(&inode->i_mapping->private_lock);
-done:
- ret = (block < end_block) ? 1 : -ENXIO;
-failed:
+ end_block = folio_init_buffers(folio, bdev, size);
+ spin_unlock(&inode->i_mapping->i_private_lock);
+unlock:
folio_unlock(folio);
folio_put(folio);
- return ret;
+ return block < end_block;
}
/*
- * Create buffers for the specified block device block's page. If
- * that page was dirty, the buffers are set dirty also.
+ * Create buffers for the specified block device block's folio. If
+ * that folio was dirty, the buffers are set dirty also. Returns false
+ * if we've hit a permanent error.
*/
-static int
-grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp)
+static bool grow_buffers(struct block_device *bdev, sector_t block,
+ unsigned size, gfp_t gfp)
{
- pgoff_t index;
- int sizebits;
-
- sizebits = PAGE_SHIFT - __ffs(size);
- index = block >> sizebits;
+ loff_t pos;
/*
- * Check for a block which wants to lie outside our maximum possible
- * pagecache index. (this comparison is done using sector_t types).
+ * Check for a block which lies outside our maximum possible
+ * pagecache index.
*/
- if (unlikely(index != block >> sizebits)) {
- printk(KERN_ERR "%s: requested out-of-range block %llu for "
- "device %pg\n",
+ if (check_mul_overflow(block, (sector_t)size, &pos) || pos > MAX_LFS_FILESIZE) {
+ printk(KERN_ERR "%s: requested out-of-range block %llu for device %pg\n",
__func__, (unsigned long long)block,
bdev);
- return -EIO;
+ return false;
}
- /* Create a page with the proper size buffers.. */
- return grow_dev_page(bdev, block, index, size, sizebits, gfp);
+ /* Create a folio with the proper size buffers */
+ return grow_dev_folio(bdev, block, pos / PAGE_SIZE, size, gfp);
}
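
The rewritten range check trades the old shift-and-compare for an explicit multiply-overflow test on the byte position. A userspace model using the same compiler builtin that check_mul_overflow() wraps; the MAX_LFS_FILESIZE value here assumes a 64-bit build:

#include <stdio.h>
#include <stdint.h>

#define MAX_LFS_FILESIZE 0x7fffffffffffffffLL

static int block_in_range(uint64_t block, unsigned size)
{
        uint64_t pos;

        /* Reject blocks whose byte position overflows u64 ... */
        if (__builtin_mul_overflow(block, (uint64_t)size, &pos))
                return 0;
        /* ... or lies beyond the maximum page-cache index. */
        if (pos > MAX_LFS_FILESIZE)
                return 0;
        return 1;
}

int main(void)
{
        printf("%d\n", block_in_range(1000000, 4096));          /* in range */
        printf("%d\n", block_in_range(UINT64_MAX / 2, 4096));   /* overflow */
        return 0;
}
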
static struct buffer_head *
@@ -1124,14 +1126,12 @@ __getblk_slow(struct block_device *bdev, sector_t block,
for (;;) {
struct buffer_head *bh;
- int ret;
bh = __find_get_block(bdev, block, size);
if (bh)
return bh;
- ret = grow_buffers(bdev, block, size, gfp);
- if (ret < 0)
+ if (!grow_buffers(bdev, block, size, gfp))
return NULL;
}
}
@@ -1168,7 +1168,7 @@ __getblk_slow(struct block_device *bdev, sector_t block,
* and then attach the address_space's inode to its superblock's dirty
* inode list.
*
- * mark_buffer_dirty() is atomic. It takes bh->b_folio->mapping->private_lock,
+ * mark_buffer_dirty() is atomic. It takes bh->b_folio->mapping->i_private_lock,
* i_pages lock and mapping->host->i_lock.
*/
void mark_buffer_dirty(struct buffer_head *bh)
@@ -1246,10 +1246,10 @@ void __bforget(struct buffer_head *bh)
if (bh->b_assoc_map) {
struct address_space *buffer_mapping = bh->b_folio->mapping;
- spin_lock(&buffer_mapping->private_lock);
+ spin_lock(&buffer_mapping->i_private_lock);
list_del_init(&bh->b_assoc_buffers);
bh->b_assoc_map = NULL;
- spin_unlock(&buffer_mapping->private_lock);
+ spin_unlock(&buffer_mapping->i_private_lock);
}
__brelse(bh);
}
@@ -1638,7 +1638,7 @@ EXPORT_SYMBOL(block_invalidate_folio);
/*
* We attach and possibly dirty the buffers atomically wrt
- * block_dirty_folio() via private_lock. try_to_free_buffers
+ * block_dirty_folio() via i_private_lock. try_to_free_buffers
* is already excluded via the folio lock.
*/
struct buffer_head *create_empty_buffers(struct folio *folio,
@@ -1656,7 +1656,7 @@ struct buffer_head *create_empty_buffers(struct folio *folio,
} while (bh);
tail->b_this_page = head;
- spin_lock(&folio->mapping->private_lock);
+ spin_lock(&folio->mapping->i_private_lock);
if (folio_test_uptodate(folio) || folio_test_dirty(folio)) {
bh = head;
do {
@@ -1668,7 +1668,7 @@ struct buffer_head *create_empty_buffers(struct folio *folio,
} while (bh != head);
}
folio_attach_private(folio, head);
- spin_unlock(&folio->mapping->private_lock);
+ spin_unlock(&folio->mapping->i_private_lock);
return head;
}
@@ -1699,13 +1699,13 @@ void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len)
struct inode *bd_inode = bdev->bd_inode;
struct address_space *bd_mapping = bd_inode->i_mapping;
struct folio_batch fbatch;
- pgoff_t index = block >> (PAGE_SHIFT - bd_inode->i_blkbits);
+ pgoff_t index = ((loff_t)block << bd_inode->i_blkbits) / PAGE_SIZE;
pgoff_t end;
int i, count;
struct buffer_head *bh;
struct buffer_head *head;
- end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits);
+ end = ((loff_t)(block + len - 1) << bd_inode->i_blkbits) / PAGE_SIZE;
folio_batch_init(&fbatch);
while (filemap_get_folios(bd_mapping, &index, end, &fbatch)) {
count = folio_batch_count(&fbatch);
@@ -1715,7 +1715,7 @@ void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len)
if (!folio_buffers(folio))
continue;
/*
- * We use folio lock instead of bd_mapping->private_lock
+ * We use folio lock instead of bd_mapping->i_private_lock
* to pin buffers here since we can afford to sleep and
* it scales better than a global spinlock.
*/
@@ -1748,19 +1748,6 @@ unlock_page:
}
EXPORT_SYMBOL(clean_bdev_aliases);
-/*
- * Size is a power-of-two in the range 512..PAGE_SIZE,
- * and the case we care about most is PAGE_SIZE.
- *
- * So this *could* possibly be written with those
- * constraints in mind (relevant mostly if some
- * architecture has a slow bit-scan instruction)
- */
-static inline int block_size_bits(unsigned int blocksize)
-{
- return ilog2(blocksize);
-}
-
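block_size_bits() can go because the conversions below stop shifting by PAGE_SHIFT - bbits and instead divide the folio's byte position by the block size, which stays correct once a folio may span more than one page. A hedged before/after sketch of the arithmetic (standalone C, PAGE_SHIFT assumed 12):

    #include <stdint.h>

    #define PAGE_SHIFT 12   /* assumed for the sketch */

    /* Before: valid only while blocksize <= PAGE_SIZE and a folio is one page. */
    static uint64_t first_block_old(uint64_t folio_index, unsigned int blocksize)
    {
        unsigned int bbits = 63 - __builtin_clzll(blocksize);   /* ilog2() */

        return folio_index << (PAGE_SHIFT - bbits);
    }

    /* After: derive the block from the folio's byte offset; folio-size agnostic. */
    static uint64_t first_block_new(uint64_t folio_pos, unsigned int blocksize)
    {
        return folio_pos / blocksize;   /* div_u64() in the kernel proper */
    }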
static struct buffer_head *folio_create_buffers(struct folio *folio,
struct inode *inode,
unsigned int b_state)
@@ -1790,30 +1777,29 @@ static struct buffer_head *folio_create_buffers(struct folio *folio,
*/
/*
- * While block_write_full_page is writing back the dirty buffers under
+ * While block_write_full_folio is writing back the dirty buffers under
* the page lock, whoever dirtied the buffers may decide to clean them
* again at any time. We handle that by only looking at the buffer
* state inside lock_buffer().
*
- * If block_write_full_page() is called for regular writeback
+ * If block_write_full_folio() is called for regular writeback
* (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
* locked buffer. This only can happen if someone has written the buffer
* directly, with submit_bh(). At the address_space level PageWriteback
* prevents this contention from occurring.
*
- * If block_write_full_page() is called with wbc->sync_mode ==
+ * If block_write_full_folio() is called with wbc->sync_mode ==
* WB_SYNC_ALL, the writes are posted using REQ_SYNC; this
* causes the writes to be flagged as synchronous writes.
*/
int __block_write_full_folio(struct inode *inode, struct folio *folio,
- get_block_t *get_block, struct writeback_control *wbc,
- bh_end_io_t *handler)
+ get_block_t *get_block, struct writeback_control *wbc)
{
int err;
sector_t block;
sector_t last_block;
struct buffer_head *bh, *head;
- unsigned int blocksize, bbits;
+ size_t blocksize;
int nr_underway = 0;
blk_opf_t write_flags = wbc_to_write_flags(wbc);
@@ -1832,10 +1818,9 @@ int __block_write_full_folio(struct inode *inode, struct folio *folio,
bh = head;
blocksize = bh->b_size;
- bbits = block_size_bits(blocksize);
- block = (sector_t)folio->index << (PAGE_SHIFT - bbits);
- last_block = (i_size_read(inode) - 1) >> bbits;
+ block = div_u64(folio_pos(folio), blocksize);
+ last_block = div_u64(i_size_read(inode) - 1, blocksize);
/*
* Get all the dirty buffers mapped to disk addresses and
@@ -1849,7 +1834,7 @@ int __block_write_full_folio(struct inode *inode, struct folio *folio,
* truncate in progress.
*/
/*
- * The buffer was zeroed by block_write_full_page()
+ * The buffer was zeroed by block_write_full_folio()
*/
clear_buffer_dirty(bh);
set_buffer_uptodate(bh);
@@ -1887,7 +1872,8 @@ int __block_write_full_folio(struct inode *inode, struct folio *folio,
continue;
}
if (test_clear_buffer_dirty(bh)) {
- mark_buffer_async_write_endio(bh, handler);
+ mark_buffer_async_write_endio(bh,
+ end_buffer_async_write);
} else {
unlock_buffer(bh);
}
@@ -1940,7 +1926,8 @@ recover:
if (buffer_mapped(bh) && buffer_dirty(bh) &&
!buffer_delay(bh)) {
lock_buffer(bh);
- mark_buffer_async_write_endio(bh, handler);
+ mark_buffer_async_write_endio(bh,
+ end_buffer_async_write);
} else {
/*
* The buffer may have been set dirty during
@@ -2014,7 +2001,7 @@ static int
iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
const struct iomap *iomap)
{
- loff_t offset = block << inode->i_blkbits;
+ loff_t offset = (loff_t)block << inode->i_blkbits;
bh->b_bdev = iomap->bdev;
@@ -2081,27 +2068,24 @@ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len,
get_block_t *get_block, const struct iomap *iomap)
{
- unsigned from = pos & (PAGE_SIZE - 1);
- unsigned to = from + len;
+ size_t from = offset_in_folio(folio, pos);
+ size_t to = from + len;
struct inode *inode = folio->mapping->host;
- unsigned block_start, block_end;
+ size_t block_start, block_end;
sector_t block;
int err = 0;
- unsigned blocksize, bbits;
+ size_t blocksize;
struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
BUG_ON(!folio_test_locked(folio));
- BUG_ON(from > PAGE_SIZE);
- BUG_ON(to > PAGE_SIZE);
+ BUG_ON(to > folio_size(folio));
BUG_ON(from > to);
head = folio_create_buffers(folio, inode, 0);
blocksize = head->b_size;
- bbits = block_size_bits(blocksize);
+ block = div_u64(folio_pos(folio), blocksize);
- block = (sector_t)folio->index << (PAGE_SHIFT - bbits);
-
- for(bh = head, block_start = 0; bh != head || !block_start;
+ for (bh = head, block_start = 0; bh != head || !block_start;
block++, block_start=block_end, bh = bh->b_this_page) {
block_end = block_start + blocksize;
if (block_end <= from || block_start >= to) {
@@ -2364,7 +2348,7 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block)
struct inode *inode = folio->mapping->host;
sector_t iblock, lblock;
struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
- unsigned int blocksize, bbits;
+ size_t blocksize;
int nr, i;
int fully_mapped = 1;
bool page_error = false;
@@ -2378,10 +2362,9 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block)
head = folio_create_buffers(folio, inode, 0);
blocksize = head->b_size;
- bbits = block_size_bits(blocksize);
- iblock = (sector_t)folio->index << (PAGE_SHIFT - bbits);
- lblock = (limit+blocksize-1) >> bbits;
+ iblock = div_u64(folio_pos(folio), blocksize);
+ lblock = div_u64(limit + blocksize - 1, blocksize);
bh = head;
nr = 0;
i = 0;
@@ -2666,8 +2649,8 @@ int block_truncate_page(struct address_space *mapping,
return 0;
length = blocksize - length;
- iblock = (sector_t)index << (PAGE_SHIFT - inode->i_blkbits);
-
+ iblock = ((loff_t)index * PAGE_SIZE) >> inode->i_blkbits;
+
folio = filemap_grab_folio(mapping, index);
if (IS_ERR(folio))
return PTR_ERR(folio);
@@ -2720,17 +2703,15 @@ EXPORT_SYMBOL(block_truncate_page);
/*
* The generic ->writepage function for buffer-backed address_spaces
*/
-int block_write_full_page(struct page *page, get_block_t *get_block,
- struct writeback_control *wbc)
+int block_write_full_folio(struct folio *folio, struct writeback_control *wbc,
+ void *get_block)
{
- struct folio *folio = page_folio(page);
struct inode * const inode = folio->mapping->host;
loff_t i_size = i_size_read(inode);
/* Is the folio fully inside i_size? */
if (folio_pos(folio) + folio_size(folio) <= i_size)
- return __block_write_full_folio(inode, folio, get_block, wbc,
- end_buffer_async_write);
+ return __block_write_full_folio(inode, folio, get_block, wbc);
/* Is the folio fully outside i_size? (truncate in progress) */
if (folio_pos(folio) >= i_size) {
@@ -2747,10 +2728,8 @@ int block_write_full_page(struct page *page, get_block_t *get_block,
*/
folio_zero_segment(folio, offset_in_folio(folio, i_size),
folio_size(folio));
- return __block_write_full_folio(inode, folio, get_block, wbc,
- end_buffer_async_write);
+ return __block_write_full_folio(inode, folio, get_block, wbc);
}
-EXPORT_SYMBOL(block_write_full_page);
sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
get_block_t *get_block)
@@ -2883,7 +2862,7 @@ EXPORT_SYMBOL(sync_dirty_buffer);
* are unused, and releases them if so.
*
* Exclusion against try_to_free_buffers may be obtained by either
- * locking the folio or by holding its mapping's private_lock.
+ * locking the folio or by holding its mapping's i_private_lock.
*
* If the folio is dirty but all the buffers are clean then we need to
* be sure to mark the folio clean as well. This is because the folio
@@ -2894,7 +2873,7 @@ EXPORT_SYMBOL(sync_dirty_buffer);
* The same applies to regular filesystem folios: if all the buffers are
* clean then we set the folio clean and proceed. To do that, we require
* total exclusion from block_dirty_folio(). That is obtained with
- * private_lock.
+ * i_private_lock.
*
* try_to_free_buffers() is non-blocking.
*/
@@ -2946,7 +2925,7 @@ bool try_to_free_buffers(struct folio *folio)
goto out;
}
- spin_lock(&mapping->private_lock);
+ spin_lock(&mapping->i_private_lock);
ret = drop_buffers(folio, &buffers_to_free);
/*
@@ -2959,13 +2938,13 @@ bool try_to_free_buffers(struct folio *folio)
* the folio's buffers clean. We discover that here and clean
* the folio also.
*
- * private_lock must be held over this entire operation in order
+ * i_private_lock must be held over this entire operation in order
* to synchronise against block_dirty_folio and prevent the
* dirty bit from being lost.
*/
if (ret)
folio_cancel_dirty(folio);
- spin_unlock(&mapping->private_lock);
+ spin_unlock(&mapping->i_private_lock);
out:
if (buffers_to_free) {
struct buffer_head *bh = buffers_to_free;
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
index aa4efcabb5e3..3f24905f4066 100644
--- a/fs/cachefiles/daemon.c
+++ b/fs/cachefiles/daemon.c
@@ -77,6 +77,7 @@ static const struct cachefiles_daemon_cmd cachefiles_daemon_cmds[] = {
{ "tag", cachefiles_daemon_tag },
#ifdef CONFIG_CACHEFILES_ONDEMAND
{ "copen", cachefiles_ondemand_copen },
+ { "restore", cachefiles_ondemand_restore },
#endif
{ "", NULL }
};
@@ -355,14 +356,24 @@ static __poll_t cachefiles_daemon_poll(struct file *file,
struct poll_table_struct *poll)
{
struct cachefiles_cache *cache = file->private_data;
+ XA_STATE(xas, &cache->reqs, 0);
+ struct cachefiles_req *req;
__poll_t mask;
poll_wait(file, &cache->daemon_pollwq, poll);
mask = 0;
if (cachefiles_in_ondemand_mode(cache)) {
- if (!xa_empty(&cache->reqs))
- mask |= EPOLLIN;
+ if (!xa_empty(&cache->reqs)) {
+ rcu_read_lock();
+ xas_for_each_marked(&xas, req, ULONG_MAX, CACHEFILES_REQ_NEW) {
+ if (!cachefiles_ondemand_is_reopening_read(req)) {
+ mask |= EPOLLIN;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ }
} else {
if (test_bit(CACHEFILES_STATE_CHANGED, &cache->flags))
mask |= EPOLLIN;
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 40052bdb3365..35ba2117a6f6 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -31,6 +31,11 @@ struct cachefiles_object *cachefiles_alloc_object(struct fscache_cookie *cookie)
if (!object)
return NULL;
+ if (cachefiles_ondemand_init_obj_info(object, volume)) {
+ kmem_cache_free(cachefiles_object_jar, object);
+ return NULL;
+ }
+
refcount_set(&object->ref, 1);
spin_lock_init(&object->lock);
@@ -88,7 +93,7 @@ void cachefiles_put_object(struct cachefiles_object *object,
ASSERTCMP(object->file, ==, NULL);
kfree(object->d_name);
-
+ cachefiles_ondemand_deinit_obj_info(object);
cache = object->volume->cache->cache;
fscache_put_cookie(object->cookie, fscache_cookie_put_object);
object->cookie = NULL;
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index 2ad58c465208..4a87c9d714a9 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -44,6 +44,19 @@ struct cachefiles_volume {
struct dentry *fanout[256]; /* Fanout subdirs */
};
+enum cachefiles_object_state {
+ CACHEFILES_ONDEMAND_OBJSTATE_CLOSE, /* Anonymous fd closed by daemon or initial state */
+ CACHEFILES_ONDEMAND_OBJSTATE_OPEN, /* Anonymous fd associated with object is available */
+ CACHEFILES_ONDEMAND_OBJSTATE_REOPENING, /* Object that was closed and is being reopened. */
+};
+
+struct cachefiles_ondemand_info {
+ struct work_struct ondemand_work;
+ int ondemand_id;
+ enum cachefiles_object_state state;
+ struct cachefiles_object *object;
+};
+
/*
* Backing file state.
*/
@@ -61,7 +74,7 @@ struct cachefiles_object {
unsigned long flags;
#define CACHEFILES_OBJECT_USING_TMPFILE 0 /* Have an unlinked tmpfile */
#ifdef CONFIG_CACHEFILES_ONDEMAND
- int ondemand_id;
+ struct cachefiles_ondemand_info *ondemand;
#endif
};
@@ -290,12 +303,42 @@ extern ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache,
extern int cachefiles_ondemand_copen(struct cachefiles_cache *cache,
char *args);
+extern int cachefiles_ondemand_restore(struct cachefiles_cache *cache,
+ char *args);
+
extern int cachefiles_ondemand_init_object(struct cachefiles_object *object);
extern void cachefiles_ondemand_clean_object(struct cachefiles_object *object);
extern int cachefiles_ondemand_read(struct cachefiles_object *object,
loff_t pos, size_t len);
+extern int cachefiles_ondemand_init_obj_info(struct cachefiles_object *obj,
+ struct cachefiles_volume *volume);
+extern void cachefiles_ondemand_deinit_obj_info(struct cachefiles_object *obj);
+
+#define CACHEFILES_OBJECT_STATE_FUNCS(_state, _STATE) \
+static inline bool \
+cachefiles_ondemand_object_is_##_state(const struct cachefiles_object *object) \
+{ \
+ return object->ondemand->state == CACHEFILES_ONDEMAND_OBJSTATE_##_STATE; \
+} \
+ \
+static inline void \
+cachefiles_ondemand_set_object_##_state(struct cachefiles_object *object) \
+{ \
+ object->ondemand->state = CACHEFILES_ONDEMAND_OBJSTATE_##_STATE; \
+}
+
+CACHEFILES_OBJECT_STATE_FUNCS(open, OPEN);
+CACHEFILES_OBJECT_STATE_FUNCS(close, CLOSE);
+CACHEFILES_OBJECT_STATE_FUNCS(reopening, REOPENING);
+
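For reference, the first invocation above expands to roughly the following pair of helpers (hand-expanded here for illustration; not part of the patch):

    static inline bool
    cachefiles_ondemand_object_is_open(const struct cachefiles_object *object)
    {
        return object->ondemand->state == CACHEFILES_ONDEMAND_OBJSTATE_OPEN;
    }

    static inline void
    cachefiles_ondemand_set_object_open(struct cachefiles_object *object)
    {
        object->ondemand->state = CACHEFILES_ONDEMAND_OBJSTATE_OPEN;
    }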
+static inline bool cachefiles_ondemand_is_reopening_read(struct cachefiles_req *req)
+{
+ return cachefiles_ondemand_object_is_reopening(req->object) &&
+ req->msg.opcode == CACHEFILES_OP_READ;
+}
+
#else
static inline ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache,
char __user *_buffer, size_t buflen)
@@ -317,6 +360,20 @@ static inline int cachefiles_ondemand_read(struct cachefiles_object *object,
{
return -EOPNOTSUPP;
}
+
+static inline int cachefiles_ondemand_init_obj_info(struct cachefiles_object *obj,
+ struct cachefiles_volume *volume)
+{
+ return 0;
+}
+static inline void cachefiles_ondemand_deinit_obj_info(struct cachefiles_object *obj)
+{
+}
+
+static inline bool cachefiles_ondemand_is_reopening_read(struct cachefiles_req *req)
+{
+ return false;
+}
#endif
/*
diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c
index 009d23cd435b..5857241c5918 100644
--- a/fs/cachefiles/io.c
+++ b/fs/cachefiles/io.c
@@ -259,7 +259,8 @@ static void cachefiles_write_complete(struct kiocb *iocb, long ret)
_enter("%ld", ret);
- kiocb_end_write(iocb);
+ if (ki->was_async)
+ kiocb_end_write(iocb);
if (ret < 0)
trace_cachefiles_io_error(object, inode, ret,
@@ -319,8 +320,6 @@ int __cachefiles_write(struct cachefiles_object *object,
ki->iocb.ki_complete = cachefiles_write_complete;
atomic_long_add(ki->b_writing, &cache->b_writing);
- kiocb_start_write(&ki->iocb);
-
get_file(ki->iocb.ki_filp);
cachefiles_grab_object(object, cachefiles_obj_get_ioreq);
diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c
index 0254ed39f68c..b8fbbb1961bb 100644
--- a/fs/cachefiles/ondemand.c
+++ b/fs/cachefiles/ondemand.c
@@ -9,21 +9,19 @@ static int cachefiles_ondemand_fd_release(struct inode *inode,
{
struct cachefiles_object *object = file->private_data;
struct cachefiles_cache *cache = object->volume->cache;
- int object_id = object->ondemand_id;
+ struct cachefiles_ondemand_info *info = object->ondemand;
+ int object_id = info->ondemand_id;
struct cachefiles_req *req;
XA_STATE(xas, &cache->reqs, 0);
xa_lock(&cache->reqs);
- object->ondemand_id = CACHEFILES_ONDEMAND_ID_CLOSED;
+ info->ondemand_id = CACHEFILES_ONDEMAND_ID_CLOSED;
+ cachefiles_ondemand_set_object_close(object);
- /*
- * Flush all pending READ requests since their completion depends on
- * anon_fd.
- */
- xas_for_each(&xas, req, ULONG_MAX) {
+ /* Only flush requests marked CACHEFILES_REQ_NEW to avoid racing with daemon_read */
+ xas_for_each_marked(&xas, req, ULONG_MAX, CACHEFILES_REQ_NEW) {
if (req->msg.object_id == object_id &&
- req->msg.opcode == CACHEFILES_OP_READ) {
- req->error = -EIO;
+ req->msg.opcode == CACHEFILES_OP_CLOSE) {
complete(&req->done);
xas_store(&xas, NULL);
}
@@ -176,11 +174,37 @@ int cachefiles_ondemand_copen(struct cachefiles_cache *cache, char *args)
set_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags);
trace_cachefiles_ondemand_copen(req->object, id, size);
+ cachefiles_ondemand_set_object_open(req->object);
+ wake_up_all(&cache->daemon_pollwq);
+
out:
complete(&req->done);
return ret;
}
+int cachefiles_ondemand_restore(struct cachefiles_cache *cache, char *args)
+{
+ struct cachefiles_req *req;
+
+ XA_STATE(xas, &cache->reqs, 0);
+
+ if (!test_bit(CACHEFILES_ONDEMAND_MODE, &cache->flags))
+ return -EOPNOTSUPP;
+
+ /*
+ * Reset the requests to CACHEFILES_REQ_NEW state, so that requests
+ * that were only partially processed before the user daemon crashed
+ * can be reprocessed after recovery.
+ */
+ xas_lock(&xas);
+ xas_for_each(&xas, req, ULONG_MAX)
+ xas_set_mark(&xas, CACHEFILES_REQ_NEW);
+ xas_unlock(&xas);
+
+ wake_up_all(&cache->daemon_pollwq);
+ return 0;
+}
+
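A restarted daemon would presumably issue the new command over the usual control device before resuming reads; a sketch of such a recovery sequence (the device path and the bind handshake are assumptions, error handling trimmed):

    #include <fcntl.h>
    #include <string.h>
    #include <unistd.h>

    /* Hypothetical recovery sequence for a restarted cachefiles daemon. */
    static int cachefiles_daemon_recover(void)
    {
        int fd = open("/dev/cachefiles", O_RDWR);

        if (fd < 0)
            return -1;
        /* ... repeat the usual "dir", "tag", "bind ondemand" handshake ... */
        if (write(fd, "restore", strlen("restore")) < 0) {
            close(fd);
            return -1;
        }
        return fd;      /* resume reading requests from fd */
    }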
static int cachefiles_ondemand_get_fd(struct cachefiles_req *req)
{
struct cachefiles_object *object;
@@ -218,8 +242,7 @@ static int cachefiles_ondemand_get_fd(struct cachefiles_req *req)
load = (void *)req->msg.data;
load->fd = fd;
- req->msg.object_id = object_id;
- object->ondemand_id = object_id;
+ object->ondemand->ondemand_id = object_id;
cachefiles_get_unbind_pincount(cache);
trace_cachefiles_ondemand_open(object, &req->msg, load);
@@ -234,6 +257,43 @@ err:
return ret;
}
+static void ondemand_object_worker(struct work_struct *work)
+{
+ struct cachefiles_ondemand_info *info =
+ container_of(work, struct cachefiles_ondemand_info, ondemand_work);
+
+ cachefiles_ondemand_init_object(info->object);
+}
+
+/*
+ * If there are any inflight or subsequent READ requests on the
+ * closed object, reopen it.
+ * Skip read requests whose related object is reopening.
+ */
+static struct cachefiles_req *cachefiles_ondemand_select_req(struct xa_state *xas,
+ unsigned long xa_max)
+{
+ struct cachefiles_req *req;
+ struct cachefiles_object *object;
+ struct cachefiles_ondemand_info *info;
+
+ xas_for_each_marked(xas, req, xa_max, CACHEFILES_REQ_NEW) {
+ if (req->msg.opcode != CACHEFILES_OP_READ)
+ return req;
+ object = req->object;
+ info = object->ondemand;
+ if (cachefiles_ondemand_object_is_close(object)) {
+ cachefiles_ondemand_set_object_reopening(object);
+ queue_work(fscache_wq, &info->ondemand_work);
+ continue;
+ }
+ if (cachefiles_ondemand_object_is_reopening(object))
+ continue;
+ return req;
+ }
+ return NULL;
+}
+
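The selector builds on the xarray's marked iteration; stripped of the cachefiles specifics, the underlying pattern looks like this (XA_MARK_0 standing in for CACHEFILES_REQ_NEW; caller assumed to hold the xa_lock or the RCU read lock):

    #include <linux/xarray.h>

    /* Return the first entry carrying the mark, or NULL; the caller must
     * hold the xa_lock (or the RCU read lock) across the walk. */
    static void *first_marked(struct xarray *xa, unsigned long max)
    {
        XA_STATE(xas, xa, 0);
        void *entry;

        xas_for_each_marked(&xas, entry, max, XA_MARK_0)
            return entry;
        return NULL;
    }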
ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache,
char __user *_buffer, size_t buflen)
{
@@ -244,16 +304,16 @@ ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache,
int ret = 0;
XA_STATE(xas, &cache->reqs, cache->req_id_next);
+ xa_lock(&cache->reqs);
/*
* Cyclically search for a request that has not ever been processed,
* to prevent requests from being processed repeatedly, and make
* request distribution fair.
*/
- xa_lock(&cache->reqs);
- req = xas_find_marked(&xas, UINT_MAX, CACHEFILES_REQ_NEW);
+ req = cachefiles_ondemand_select_req(&xas, ULONG_MAX);
if (!req && cache->req_id_next > 0) {
xas_set(&xas, 0);
- req = xas_find_marked(&xas, cache->req_id_next - 1, CACHEFILES_REQ_NEW);
+ req = cachefiles_ondemand_select_req(&xas, cache->req_id_next - 1);
}
if (!req) {
xa_unlock(&cache->reqs);
@@ -273,14 +333,18 @@ ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache,
xa_unlock(&cache->reqs);
id = xas.xa_index;
- msg->msg_id = id;
if (msg->opcode == CACHEFILES_OP_OPEN) {
ret = cachefiles_ondemand_get_fd(req);
- if (ret)
+ if (ret) {
+ cachefiles_ondemand_set_object_close(req->object);
goto error;
+ }
}
+ msg->msg_id = id;
+ msg->object_id = req->object->ondemand->ondemand_id;
+
if (copy_to_user(_buffer, msg, n) != 0) {
ret = -EFAULT;
goto err_put_fd;
@@ -313,19 +377,23 @@ static int cachefiles_ondemand_send_req(struct cachefiles_object *object,
void *private)
{
struct cachefiles_cache *cache = object->volume->cache;
- struct cachefiles_req *req;
+ struct cachefiles_req *req = NULL;
XA_STATE(xas, &cache->reqs, 0);
int ret;
if (!test_bit(CACHEFILES_ONDEMAND_MODE, &cache->flags))
return 0;
- if (test_bit(CACHEFILES_DEAD, &cache->flags))
- return -EIO;
+ if (test_bit(CACHEFILES_DEAD, &cache->flags)) {
+ ret = -EIO;
+ goto out;
+ }
req = kzalloc(sizeof(*req) + data_len, GFP_KERNEL);
- if (!req)
- return -ENOMEM;
+ if (!req) {
+ ret = -ENOMEM;
+ goto out;
+ }
req->object = object;
init_completion(&req->done);
@@ -363,8 +431,9 @@ static int cachefiles_ondemand_send_req(struct cachefiles_object *object,
/* coupled with the barrier in cachefiles_flush_reqs() */
smp_mb();
- if (opcode != CACHEFILES_OP_OPEN && object->ondemand_id <= 0) {
- WARN_ON_ONCE(object->ondemand_id == 0);
+ if (opcode == CACHEFILES_OP_CLOSE &&
+ !cachefiles_ondemand_object_is_open(object)) {
+ WARN_ON_ONCE(object->ondemand->ondemand_id == 0);
xas_unlock(&xas);
ret = -EIO;
goto out;
@@ -387,7 +456,15 @@ static int cachefiles_ondemand_send_req(struct cachefiles_object *object,
wake_up_all(&cache->daemon_pollwq);
wait_for_completion(&req->done);
ret = req->error;
+ kfree(req);
+ return ret;
out:
+ /* Reset the object to the close state on the error path.
+ * If an error occurs after the anonymous fd has been created,
+ * cachefiles_ondemand_fd_release() will set the object to close.
+ */
+ if (opcode == CACHEFILES_OP_OPEN)
+ cachefiles_ondemand_set_object_close(object);
kfree(req);
return ret;
}
@@ -430,18 +507,10 @@ static int cachefiles_ondemand_init_close_req(struct cachefiles_req *req,
void *private)
{
struct cachefiles_object *object = req->object;
- int object_id = object->ondemand_id;
- /*
- * It's possible that object id is still 0 if the cookie looking up
- * phase failed before OPEN request has ever been sent. Also avoid
- * sending CLOSE request for CACHEFILES_ONDEMAND_ID_CLOSED, which means
- * anon_fd has already been closed.
- */
- if (object_id <= 0)
+ if (!cachefiles_ondemand_object_is_open(object))
return -ENOENT;
- req->msg.object_id = object_id;
trace_cachefiles_ondemand_close(object, &req->msg);
return 0;
}
@@ -457,16 +526,7 @@ static int cachefiles_ondemand_init_read_req(struct cachefiles_req *req,
struct cachefiles_object *object = req->object;
struct cachefiles_read *load = (void *)req->msg.data;
struct cachefiles_read_ctx *read_ctx = private;
- int object_id = object->ondemand_id;
-
- /* Stop enqueuing requests when daemon has closed anon_fd. */
- if (object_id <= 0) {
- WARN_ON_ONCE(object_id == 0);
- pr_info_once("READ: anonymous fd closed prematurely.\n");
- return -EIO;
- }
- req->msg.object_id = object_id;
load->off = read_ctx->off;
load->len = read_ctx->len;
trace_cachefiles_ondemand_read(object, &req->msg, load);
@@ -485,7 +545,7 @@ int cachefiles_ondemand_init_object(struct cachefiles_object *object)
* creating a new tmpfile as the cache file. Reuse the previously
* allocated object ID if any.
*/
- if (object->ondemand_id > 0)
+ if (cachefiles_ondemand_object_is_open(object))
return 0;
volume_key_size = volume->key[0] + 1;
@@ -503,6 +563,28 @@ void cachefiles_ondemand_clean_object(struct cachefiles_object *object)
cachefiles_ondemand_init_close_req, NULL);
}
+int cachefiles_ondemand_init_obj_info(struct cachefiles_object *object,
+ struct cachefiles_volume *volume)
+{
+ if (!cachefiles_in_ondemand_mode(volume->cache))
+ return 0;
+
+ object->ondemand = kzalloc(sizeof(struct cachefiles_ondemand_info),
+ GFP_KERNEL);
+ if (!object->ondemand)
+ return -ENOMEM;
+
+ object->ondemand->object = object;
+ INIT_WORK(&object->ondemand->ondemand_work, ondemand_object_worker);
+ return 0;
+}
+
+void cachefiles_ondemand_deinit_obj_info(struct cachefiles_object *object)
+{
+ kfree(object->ondemand);
+ object->ondemand = NULL;
+}
+
int cachefiles_ondemand_read(struct cachefiles_object *object,
loff_t pos, size_t len)
{
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 85be3bf18cdf..13af429ab030 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -907,8 +907,8 @@ static void writepages_finish(struct ceph_osd_request *req)
doutc(cl, "unlocking %p\n", page);
if (remove_page)
- generic_error_remove_page(inode->i_mapping,
- page);
+ generic_error_remove_folio(inode->i_mapping,
+ page_folio(page));
unlock_page(page);
}
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 3b5aae29e944..d380d9dad0e0 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -12,6 +12,7 @@
#include <linux/falloc.h>
#include <linux/iversion.h>
#include <linux/ktime.h>
+#include <linux/splice.h>
#include "super.h"
#include "mds_client.h"
@@ -3010,8 +3011,8 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
* {read,write}_iter, which will get caps again.
*/
put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got);
- ret = do_splice_direct(src_file, &src_off, dst_file,
- &dst_off, src_objlen, flags);
+ ret = splice_file_range(src_file, &src_off, dst_file, &dst_off,
+ src_objlen);
/* Abort on short copies or on error */
if (ret < (long)src_objlen) {
doutc(cl, "Failed partial copy (%zd)\n", ret);
@@ -3065,8 +3066,8 @@ out_caps:
*/
if (len && (len < src_ci->i_layout.object_size)) {
doutc(cl, "Final partial copy of %zu bytes\n", len);
- bytes = do_splice_direct(src_file, &src_off, dst_file,
- &dst_off, len, flags);
+ bytes = splice_file_range(src_file, &src_off, dst_file,
+ &dst_off, len);
if (bytes > 0)
ret += bytes;
else
@@ -3089,8 +3090,8 @@ static ssize_t ceph_copy_file_range(struct file *src_file, loff_t src_off,
len, flags);
if (ret == -EOPNOTSUPP || ret == -EXDEV)
- ret = generic_copy_file_range(src_file, src_off, dst_file,
- dst_off, len, flags);
+ ret = splice_copy_file_range(src_file, src_off, dst_file,
+ dst_off, len);
return ret;
}
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 16acc58311ea..148856a582a9 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -79,14 +79,12 @@ coda_file_write_iter(struct kiocb *iocb, struct iov_iter *to)
if (ret)
goto finish_write;
- file_start_write(host_file);
inode_lock(coda_inode);
ret = vfs_iter_write(cfi->cfi_container, to, &iocb->ki_pos, 0);
coda_inode->i_size = file_inode(host_file)->i_size;
coda_inode->i_blocks = (coda_inode->i_size + 511) >> 9;
inode_set_mtime_to_ts(coda_inode, inode_set_ctime_current(coda_inode));
inode_unlock(coda_inode);
- file_end_write(host_file);
finish_write:
venus_access_intent(coda_inode->i_sb, coda_i2f(coda_inode),
diff --git a/fs/dax.c b/fs/dax.c
index 3380b43cb6bb..423fc1607dfa 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1128,7 +1128,7 @@ static int dax_iomap_copy_around(loff_t pos, uint64_t length, size_t align_size,
/* zero the edges if srcmap is a HOLE or IOMAP_UNWRITTEN */
bool zero_edge = srcmap->flags & IOMAP_F_SHARED ||
srcmap->type == IOMAP_UNWRITTEN;
- void *saddr = 0;
+ void *saddr = NULL;
int ret = 0;
if (!zero_edge) {
diff --git a/fs/dcache.c b/fs/dcache.c
index c82ae731df9a..2ba37643b9c5 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -428,7 +428,8 @@ static void d_lru_add(struct dentry *dentry)
this_cpu_inc(nr_dentry_unused);
if (d_is_negative(dentry))
this_cpu_inc(nr_dentry_negative);
- WARN_ON_ONCE(!list_lru_add(&dentry->d_sb->s_dentry_lru, &dentry->d_lru));
+ WARN_ON_ONCE(!list_lru_add_obj(
+ &dentry->d_sb->s_dentry_lru, &dentry->d_lru));
}
static void d_lru_del(struct dentry *dentry)
@@ -438,7 +439,8 @@ static void d_lru_del(struct dentry *dentry)
this_cpu_dec(nr_dentry_unused);
if (d_is_negative(dentry))
this_cpu_dec(nr_dentry_negative);
- WARN_ON_ONCE(!list_lru_del(&dentry->d_sb->s_dentry_lru, &dentry->d_lru));
+ WARN_ON_ONCE(!list_lru_del_obj(
+ &dentry->d_sb->s_dentry_lru, &dentry->d_lru));
}
static void d_shrink_del(struct dentry *dentry)
@@ -1240,7 +1242,7 @@ static enum lru_status dentry_lru_isolate(struct list_head *item,
*
* This is guaranteed by the fact that all LRU management
* functions are intermediated by the LRU API calls like
- * list_lru_add and list_lru_del. List movement in this file
+ * list_lru_add_obj and list_lru_del_obj. List movement in this file
* only ever occur through this functions or through callbacks
* like this one, that are called from the LRU API.
*
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 6d7c1a49581f..c6f4a9a98b85 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -1100,17 +1100,35 @@ static ssize_t read_file_blob(struct file *file, char __user *user_buf,
return r;
}
+static ssize_t write_file_blob(struct file *file, const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct debugfs_blob_wrapper *blob = file->private_data;
+ struct dentry *dentry = F_DENTRY(file);
+ ssize_t r;
+
+ r = debugfs_file_get(dentry);
+ if (unlikely(r))
+ return r;
+ r = simple_write_to_buffer(blob->data, blob->size, ppos, user_buf,
+ count);
+
+ debugfs_file_put(dentry);
+ return r;
+}
+
static const struct file_operations fops_blob = {
.read = read_file_blob,
+ .write = write_file_blob,
.open = simple_open,
.llseek = default_llseek,
};
/**
- * debugfs_create_blob - create a debugfs file that is used to read a binary blob
+ * debugfs_create_blob - create a debugfs file that is used to read and write
+ * a binary blob
* @name: a pointer to a string containing the name of the file to create.
- * @mode: the read permission that the file should have (other permissions are
- * masked out)
+ * @mode: the permission that the file should have
* @parent: a pointer to the parent dentry for this file. This should be a
* directory dentry if set. If this parameter is %NULL, then the
* file will be created in the root of the debugfs filesystem.
@@ -1119,7 +1137,7 @@ static const struct file_operations fops_blob = {
*
* This function creates a file in debugfs with the given name that exports
* @blob->data as a binary blob. If the @mode variable is so set it can be
- * read from. Writing is not supported.
+ * read from and written to.
*
* This function will return a pointer to a dentry if it succeeds. This
* pointer must be passed to the debugfs_remove() function when the file is
@@ -1134,7 +1152,7 @@ struct dentry *debugfs_create_blob(const char *name, umode_t mode,
struct dentry *parent,
struct debugfs_blob_wrapper *blob)
{
- return debugfs_create_file_unsafe(name, mode & 0444, parent, blob, &fops_blob);
+ return debugfs_create_file_unsafe(name, mode & 0644, parent, blob, &fops_blob);
}
EXPORT_SYMBOL_GPL(debugfs_create_blob);
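With the blob now writable, a module could expose a small mutable buffer like so (a sketch; all names here are invented for illustration):

    #include <linux/debugfs.h>
    #include <linux/module.h>

    static char example_buf[64] = "hello";
    static struct debugfs_blob_wrapper example_blob = {
        .data = example_buf,
        .size = sizeof(example_buf),
    };
    static struct dentry *example_dent;

    static int __init example_init(void)
    {
        /* 0644: world-readable, root-writable, within the new 0644 mask. */
        example_dent = debugfs_create_blob("example_blob", 0644, NULL,
                                           &example_blob);
        return 0;
    }

    static void __exit example_exit(void)
    {
        debugfs_remove(example_dent);
    }

    module_init(example_init);
    module_exit(example_exit);
    MODULE_LICENSE("GPL");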
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 20533266ade6..60456263a338 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1114,7 +1114,7 @@ ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
loff_t offset = iocb->ki_pos;
const loff_t end = offset + count;
struct dio *dio;
- struct dio_submit sdio = { 0, };
+ struct dio_submit sdio = { NULL, };
struct buffer_head map_bh = { 0, };
struct blk_plug plug;
unsigned long align = offset | iov_iter_alignment(iter);
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 42f332f46359..4fa11d9ddbb6 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -443,14 +443,14 @@ static int table_seq_show(struct seq_file *seq, void *iter_ptr)
break;
case 3:
if (ri->header) {
- seq_puts(seq, "version rsb 1.1 lvb 1.1 lkb 1.1\n");
+ seq_puts(seq, "rsb ptr nodeid first_lkid flags !root_list_empty !recover_list_empty recover_locks_count len\n");
ri->header = 0;
}
print_format3(ri->rsb, seq);
break;
case 4:
if (ri->header) {
- seq_puts(seq, "version 4 rsb 2\n");
+ seq_puts(seq, "rsb ptr nodeid master_nodeid dir_nodeid our_nodeid toss_time flags len str|hex name\n");
ri->header = 0;
}
print_format4(ri->rsb, seq);
@@ -748,7 +748,7 @@ static int table_open4(struct inode *inode, struct file *file)
struct seq_file *seq;
int ret;
- ret = seq_open(file, &format5_seq_ops);
+ ret = seq_open(file, &format4_seq_ops);
if (ret)
return ret;
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 67f8dd8a05ef..6296c62c10fa 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1817,8 +1817,8 @@ static int dlm_tcp_bind(struct socket *sock)
memcpy(&src_addr, &dlm_local_addr[0], sizeof(src_addr));
make_sockaddr(&src_addr, 0, &addr_len);
- result = sock->ops->bind(sock, (struct sockaddr *)&src_addr,
- addr_len);
+ result = kernel_bind(sock, (struct sockaddr *)&src_addr,
+ addr_len);
if (result < 0) {
/* This *may* not indicate a critical error */
log_print("could not bind for connect: %d", result);
@@ -1830,7 +1830,7 @@ static int dlm_tcp_bind(struct socket *sock)
static int dlm_tcp_connect(struct connection *con, struct socket *sock,
struct sockaddr *addr, int addr_len)
{
- return sock->ops->connect(sock, addr, addr_len, O_NONBLOCK);
+ return kernel_connect(sock, addr, addr_len, O_NONBLOCK);
}
static int dlm_tcp_listen_validate(void)
@@ -1862,8 +1862,8 @@ static int dlm_tcp_listen_bind(struct socket *sock)
/* Bind to our port */
make_sockaddr(&dlm_local_addr[0], dlm_config.ci_tcp_port, &addr_len);
- return sock->ops->bind(sock, (struct sockaddr *)&dlm_local_addr[0],
- addr_len);
+ return kernel_bind(sock, (struct sockaddr *)&dlm_local_addr[0],
+ addr_len);
}
static const struct dlm_proto_ops dlm_tcp_ops = {
@@ -1888,12 +1888,12 @@ static int dlm_sctp_connect(struct connection *con, struct socket *sock,
int ret;
/*
- * Make sock->ops->connect() function return in specified time,
+ * Make kernel_connect() function return in specified time,
* since O_NONBLOCK argument in connect() function does not work here,
* then, we should restore the default value of this attribute.
*/
sock_set_sndtimeo(sock->sk, 5);
- ret = sock->ops->connect(sock, addr, addr_len, 0);
+ ret = kernel_connect(sock, addr, addr_len, 0);
sock_set_sndtimeo(sock->sk, 0);
return ret;
}
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index e6b4c1a21446..d814c5121367 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -140,11 +140,12 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
op->info.optype = DLM_PLOCK_OP_LOCK;
op->info.pid = fl->fl_pid;
op->info.ex = (fl->fl_type == F_WRLCK);
- op->info.wait = IS_SETLKW(cmd);
+ op->info.wait = !!(fl->fl_flags & FL_SLEEP);
op->info.fsid = ls->ls_global_id;
op->info.number = number;
op->info.start = fl->fl_start;
op->info.end = fl->fl_end;
+ op->info.owner = (__u64)(long)fl->fl_owner;
/* async handling */
if (fl->fl_lmops && fl->fl_lmops->lm_grant) {
op_data = kzalloc(sizeof(*op_data), GFP_NOFS);
@@ -154,9 +155,6 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
goto out;
}
- /* fl_owner is lockd which doesn't distinguish
- processes on the nfs client */
- op->info.owner = (__u64) fl->fl_pid;
op_data->callback = fl->fl_lmops->lm_grant;
locks_init_lock(&op_data->flc);
locks_copy_lock(&op_data->flc, fl);
@@ -168,8 +166,6 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
send_op(op);
rv = FILE_LOCK_DEFERRED;
goto out;
- } else {
- op->info.owner = (__u64)(long) fl->fl_owner;
}
send_op(op);
@@ -326,10 +322,7 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
op->info.number = number;
op->info.start = fl->fl_start;
op->info.end = fl->fl_end;
- if (fl->fl_lmops && fl->fl_lmops->lm_grant)
- op->info.owner = (__u64) fl->fl_pid;
- else
- op->info.owner = (__u64)(long) fl->fl_owner;
+ op->info.owner = (__u64)(long)fl->fl_owner;
if (fl->fl_flags & FL_CLOSE) {
op->info.flags |= DLM_PLOCK_FL_CLOSE;
@@ -389,7 +382,7 @@ int dlm_posix_cancel(dlm_lockspace_t *lockspace, u64 number, struct file *file,
info.number = number;
info.start = fl->fl_start;
info.end = fl->fl_end;
- info.owner = (__u64)fl->fl_pid;
+ info.owner = (__u64)(long)fl->fl_owner;
rv = do_lock_cancel(&info);
switch (rv) {
@@ -450,10 +443,7 @@ int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file,
op->info.number = number;
op->info.start = fl->fl_start;
op->info.end = fl->fl_end;
- if (fl->fl_lmops && fl->fl_lmops->lm_grant)
- op->info.owner = (__u64) fl->fl_pid;
- else
- op->info.owner = (__u64)(long) fl->fl_owner;
+ op->info.owner = (__u64)(long)fl->fl_owner;
send_op(op);
wait_event(recv_wq, (op->done != 0));
diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c
index 91290fe4a70b..586446e02ef7 100644
--- a/fs/efivarfs/inode.c
+++ b/fs/efivarfs/inode.c
@@ -77,6 +77,7 @@ bool efivarfs_valid_name(const char *str, int len)
static int efivarfs_create(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, umode_t mode, bool excl)
{
+ struct efivarfs_fs_info *info = dir->i_sb->s_fs_info;
struct inode *inode = NULL;
struct efivar_entry *var;
int namelen, i = 0, err = 0;
@@ -118,7 +119,7 @@ static int efivarfs_create(struct mnt_idmap *idmap, struct inode *dir,
inode->i_private = var;
kmemleak_ignore(var);
- err = efivar_entry_add(var, &efivarfs_list);
+ err = efivar_entry_add(var, &info->efivarfs_list);
if (err)
goto out;
diff --git a/fs/efivarfs/internal.h b/fs/efivarfs/internal.h
index c66647f5c0bd..169252e6dc46 100644
--- a/fs/efivarfs/internal.h
+++ b/fs/efivarfs/internal.h
@@ -16,6 +16,9 @@ struct efivarfs_mount_opts {
struct efivarfs_fs_info {
struct efivarfs_mount_opts mount_opts;
+ struct list_head efivarfs_list;
+ struct super_block *sb;
+ struct notifier_block nb;
};
struct efi_variable {
@@ -33,7 +36,8 @@ struct efivar_entry {
struct kobject kobj;
};
-int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *),
+int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *,
+ struct list_head *),
void *data, bool duplicates, struct list_head *head);
int efivar_entry_add(struct efivar_entry *entry, struct list_head *head);
@@ -64,6 +68,4 @@ extern struct inode *efivarfs_get_inode(struct super_block *sb,
const struct inode *dir, int mode, dev_t dev,
bool is_removable);
-extern struct list_head efivarfs_list;
-
#endif /* EFIVAR_FS_INTERNAL_H */
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index 77240953a92e..6038dd39367a 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -15,10 +15,29 @@
#include <linux/slab.h>
#include <linux/magic.h>
#include <linux/statfs.h>
+#include <linux/notifier.h>
+#include <linux/printk.h>
#include "internal.h"
-LIST_HEAD(efivarfs_list);
+static int efivarfs_ops_notifier(struct notifier_block *nb, unsigned long event,
+ void *data)
+{
+ struct efivarfs_fs_info *sfi = container_of(nb, struct efivarfs_fs_info, nb);
+
+ switch (event) {
+ case EFIVAR_OPS_RDONLY:
+ sfi->sb->s_flags |= SB_RDONLY;
+ break;
+ case EFIVAR_OPS_RDWR:
+ sfi->sb->s_flags &= ~SB_RDONLY;
+ break;
+ default:
+ return NOTIFY_DONE;
+ }
+
+ return NOTIFY_OK;
+}
static void efivarfs_evict_inode(struct inode *inode)
{
@@ -166,7 +185,8 @@ static struct dentry *efivarfs_alloc_dentry(struct dentry *parent, char *name)
}
static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor,
- unsigned long name_size, void *data)
+ unsigned long name_size, void *data,
+ struct list_head *list)
{
struct super_block *sb = (struct super_block *)data;
struct efivar_entry *entry;
@@ -221,7 +241,7 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor,
}
__efivar_entry_get(entry, NULL, &size, NULL);
- __efivar_entry_add(entry, &efivarfs_list);
+ __efivar_entry_add(entry, list);
/* copied by the above to local storage in the dentry. */
kfree(name);
@@ -291,13 +311,11 @@ static int efivarfs_parse_param(struct fs_context *fc, struct fs_parameter *para
static int efivarfs_fill_super(struct super_block *sb, struct fs_context *fc)
{
+ struct efivarfs_fs_info *sfi = sb->s_fs_info;
struct inode *inode = NULL;
struct dentry *root;
int err;
- if (!efivar_is_available())
- return -EOPNOTSUPP;
-
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_blocksize = PAGE_SIZE;
sb->s_blocksize_bits = PAGE_SHIFT;
@@ -319,11 +337,16 @@ static int efivarfs_fill_super(struct super_block *sb, struct fs_context *fc)
if (!root)
return -ENOMEM;
- INIT_LIST_HEAD(&efivarfs_list);
+ sfi->sb = sb;
+ sfi->nb.notifier_call = efivarfs_ops_notifier;
+ err = blocking_notifier_chain_register(&efivar_ops_nh, &sfi->nb);
+ if (err)
+ return err;
- err = efivar_init(efivarfs_callback, (void *)sb, true, &efivarfs_list);
+ err = efivar_init(efivarfs_callback, (void *)sb, true,
+ &sfi->efivarfs_list);
if (err)
- efivar_entry_iter(efivarfs_destroy, &efivarfs_list, NULL);
+ efivar_entry_iter(efivarfs_destroy, &sfi->efivarfs_list, NULL);
return err;
}
@@ -333,19 +356,35 @@ static int efivarfs_get_tree(struct fs_context *fc)
return get_tree_single(fc, efivarfs_fill_super);
}
+static int efivarfs_reconfigure(struct fs_context *fc)
+{
+ if (!efivar_supports_writes() && !(fc->sb_flags & SB_RDONLY)) {
+ pr_err("Firmware does not support SetVariableRT. Cannot remount with rw\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static const struct fs_context_operations efivarfs_context_ops = {
.get_tree = efivarfs_get_tree,
.parse_param = efivarfs_parse_param,
+ .reconfigure = efivarfs_reconfigure,
};
static int efivarfs_init_fs_context(struct fs_context *fc)
{
struct efivarfs_fs_info *sfi;
+ if (!efivar_is_available())
+ return -EOPNOTSUPP;
+
sfi = kzalloc(sizeof(*sfi), GFP_KERNEL);
if (!sfi)
return -ENOMEM;
+ INIT_LIST_HEAD(&sfi->efivarfs_list);
+
sfi->mount_opts.uid = GLOBAL_ROOT_UID;
sfi->mount_opts.gid = GLOBAL_ROOT_GID;
@@ -356,13 +395,14 @@ static int efivarfs_init_fs_context(struct fs_context *fc)
static void efivarfs_kill_sb(struct super_block *sb)
{
- kill_litter_super(sb);
+ struct efivarfs_fs_info *sfi = sb->s_fs_info;
- if (!efivar_is_available())
- return;
+ blocking_notifier_chain_unregister(&efivar_ops_nh, &sfi->nb);
+ kill_litter_super(sb);
/* Remove all entries and destroy */
- efivar_entry_iter(efivarfs_destroy, &efivarfs_list, NULL);
+ efivar_entry_iter(efivarfs_destroy, &sfi->efivarfs_list, NULL);
+ kfree(sfi);
}
static struct file_system_type efivarfs_type = {
diff --git a/fs/efivarfs/vars.c b/fs/efivarfs/vars.c
index 9e4f47808bd5..114ff0fd4e55 100644
--- a/fs/efivarfs/vars.c
+++ b/fs/efivarfs/vars.c
@@ -369,7 +369,8 @@ static void dup_variable_bug(efi_char16_t *str16, efi_guid_t *vendor_guid,
*
* Returns 0 on success, or a kernel error code on failure.
*/
-int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *),
+int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *,
+ struct list_head *),
void *data, bool duplicates, struct list_head *head)
{
unsigned long variable_name_size = 1024;
@@ -420,7 +421,7 @@ int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *),
status = EFI_NOT_FOUND;
} else {
err = func(variable_name, vendor_guid,
- variable_name_size, data);
+ variable_name_size, data, head);
if (err)
status = EFI_NOT_FOUND;
}
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 33a918f9566c..ad8186d47ba7 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -43,7 +43,17 @@ struct eventfd_ctx {
int id;
};
-__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, __poll_t mask)
+/**
+ * eventfd_signal_mask - Increment the event counter
+ * @ctx: [in] Pointer to the eventfd context.
+ * @mask: [in] poll mask
+ *
+ * This function is supposed to be called by the kernel in paths that do not
+ * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX
+ * value, and we signal this as an overflow condition by returning an EPOLLERR
+ * to poll(2).
+ */
+void eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask)
{
unsigned long flags;
@@ -56,45 +66,23 @@ __u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, __poll_t mask)
* safe context.
*/
if (WARN_ON_ONCE(current->in_eventfd))
- return 0;
+ return;
spin_lock_irqsave(&ctx->wqh.lock, flags);
current->in_eventfd = 1;
- if (ULLONG_MAX - ctx->count < n)
- n = ULLONG_MAX - ctx->count;
- ctx->count += n;
+ if (ctx->count < ULLONG_MAX)
+ ctx->count++;
if (waitqueue_active(&ctx->wqh))
wake_up_locked_poll(&ctx->wqh, EPOLLIN | mask);
current->in_eventfd = 0;
spin_unlock_irqrestore(&ctx->wqh.lock, flags);
-
- return n;
-}
-
-/**
- * eventfd_signal - Adds @n to the eventfd counter.
- * @ctx: [in] Pointer to the eventfd context.
- * @n: [in] Value of the counter to be added to the eventfd internal counter.
- * The value cannot be negative.
- *
- * This function is supposed to be called by the kernel in paths that do not
- * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX
- * value, and we signal this as overflow condition by returning a EPOLLERR
- * to poll(2).
- *
- * Returns the amount by which the counter was incremented. This will be less
- * than @n if the counter has overflowed.
- */
-__u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
-{
- return eventfd_signal_mask(ctx, n, 0);
}
-EXPORT_SYMBOL_GPL(eventfd_signal);
+EXPORT_SYMBOL_GPL(eventfd_signal_mask);
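Callers that previously passed a count would migrate along these lines (hedged: the real tree may keep a one-argument eventfd_signal() wrapper in the header, which this hunk does not show):

    #include <linux/eventfd.h>

    /* Hypothetical caller updated for the new API: the count argument is
     * gone, the increment is always 1, and only a poll mask is passed. */
    static void notify_consumer(struct eventfd_ctx *ctx)
    {
        eventfd_signal_mask(ctx, 0);    /* was: eventfd_signal(ctx, 1); */
    }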
static void eventfd_free_ctx(struct eventfd_ctx *ctx)
{
if (ctx->id >= 0)
- ida_simple_remove(&eventfd_ida, ctx->id);
+ ida_free(&eventfd_ida, ctx->id);
kfree(ctx);
}
@@ -407,7 +395,7 @@ static int do_eventfd(unsigned int count, int flags)
init_waitqueue_head(&ctx->wqh);
ctx->count = count;
ctx->flags = flags;
- ctx->id = ida_simple_get(&eventfd_ida, 0, 0, GFP_KERNEL);
+ ctx->id = ida_alloc(&eventfd_ida, GFP_KERNEL);
flags &= EFD_SHARED_FCNTL_FLAGS;
flags |= O_RDWR;
diff --git a/fs/exec.c b/fs/exec.c
index 4aa19b24f281..ee43597cb453 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1578,11 +1578,10 @@ static void check_unsafe_exec(struct linux_binprm *bprm)
* will be able to manipulate the current directory, etc.
* It would be nice to force an unshare instead...
*/
- t = p;
n_fs = 1;
spin_lock(&p->fs->lock);
rcu_read_lock();
- while_each_thread(p, t) {
+ for_other_threads(p, t) {
if (t->fs == p->fs)
n_fs++;
}
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 464faf6c217e..5a4272b2c6b0 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -969,7 +969,7 @@ const struct address_space_operations ext2_aops = {
.writepages = ext2_writepages,
.migrate_folio = buffer_migrate_folio,
.is_partially_uptodate = block_is_partially_uptodate,
- .error_remove_page = generic_error_remove_page,
+ .error_remove_folio = generic_error_remove_folio,
};
static const struct address_space_operations ext2_dax_aops = {
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 9a84a5f9fef4..d5bd1e3a5d36 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -502,9 +502,8 @@ static int ext4_read_inline_folio(struct inode *inode, struct folio *folio)
BUG_ON(len > PAGE_SIZE);
kaddr = kmap_local_folio(folio, 0);
ret = ext4_read_inline_data(inode, kaddr, len, &iloc);
- flush_dcache_folio(folio);
+ kaddr = folio_zero_tail(folio, len, kaddr + len);
kunmap_local(kaddr);
- folio_zero_segment(folio, len, folio_size(folio));
folio_mark_uptodate(folio);
brelse(iloc.bh);
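folio_zero_tail() folds the old folio_zero_segment() + flush_dcache_folio() pair into the section where the kmap is already live. A lowmem-only sketch of what it does with the arguments used here (the real helper also remaps page by page for highmem folios):

    #include <linux/highmem.h>
    #include <linux/mm.h>
    #include <linux/string.h>

    /* Lowmem-only approximation: @kaddr points at byte @offset inside an
     * existing mapping of @folio; zero through the end and flush, then hand
     * back an address for the caller's kunmap_local(). */
    static void *folio_zero_tail_sketch(struct folio *folio, size_t offset,
                                        void *kaddr)
    {
        memset(kaddr, 0, folio_size(folio) - offset);
        flush_dcache_folio(folio);
        return kaddr;
    }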
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 61277f7f8722..83ee4e0f46f4 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1261,7 +1261,7 @@ static int write_end_fn(handle_t *handle, struct inode *inode,
* We need to pick up the new inode size which generic_commit_write gave us
* `file' can be NULL - eg, when called from page_symlink().
*
- * ext4 never places buffers on inode->i_mapping->private_list. metadata
+ * ext4 never places buffers on inode->i_mapping->i_private_list. metadata
* buffers are managed internally.
*/
static int ext4_write_end(struct file *file,
@@ -3213,7 +3213,7 @@ static bool ext4_inode_datasync_dirty(struct inode *inode)
}
/* Any metadata buffers to write? */
- if (!list_empty(&inode->i_mapping->private_list))
+ if (!list_empty(&inode->i_mapping->i_private_list))
return true;
return inode->i_state & I_DIRTY_DATASYNC;
}
@@ -3564,7 +3564,7 @@ static const struct address_space_operations ext4_aops = {
.direct_IO = noop_direct_IO,
.migrate_folio = buffer_migrate_folio,
.is_partially_uptodate = block_is_partially_uptodate,
- .error_remove_page = generic_error_remove_page,
+ .error_remove_folio = generic_error_remove_folio,
.swap_activate = ext4_iomap_swap_activate,
};
@@ -3581,7 +3581,7 @@ static const struct address_space_operations ext4_journalled_aops = {
.direct_IO = noop_direct_IO,
.migrate_folio = buffer_migrate_folio_norefs,
.is_partially_uptodate = block_is_partially_uptodate,
- .error_remove_page = generic_error_remove_page,
+ .error_remove_folio = generic_error_remove_folio,
.swap_activate = ext4_iomap_swap_activate,
};
@@ -3598,7 +3598,7 @@ static const struct address_space_operations ext4_da_aops = {
.direct_IO = noop_direct_IO,
.migrate_folio = buffer_migrate_folio,
.is_partially_uptodate = block_is_partially_uptodate,
- .error_remove_page = generic_error_remove_page,
+ .error_remove_folio = generic_error_remove_folio,
.swap_activate = ext4_iomap_swap_activate,
};
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 4f931f80cb34..aa6be510eb8f 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -819,11 +819,11 @@ int ext4_force_shutdown(struct super_block *sb, u32 flags)
switch (flags) {
case EXT4_GOING_FLAGS_DEFAULT:
- ret = freeze_bdev(sb->s_bdev);
+ ret = bdev_freeze(sb->s_bdev);
if (ret)
return ret;
set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags);
- thaw_bdev(sb->s_bdev);
+ bdev_thaw(sb->s_bdev);
break;
case EXT4_GOING_FLAGS_LOGFLUSH:
set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags);
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index dfdd7e5cf038..312bc6813357 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -444,7 +444,7 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio,
folio_clear_error(folio);
/*
- * Comments copied from block_write_full_page:
+ * Comments copied from block_write_full_folio:
*
* The folio straddles i_size. It must be zeroed out on each and every
* writepage invocation because it may be mmapped. "A file is mapped
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index c5fcf377ab1f..0980845c8b8f 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -5864,11 +5864,9 @@ static struct bdev_handle *ext4_get_journal_blkdev(struct super_block *sb,
struct ext4_super_block *es;
int errno;
- /* see get_tree_bdev why this is needed and safe */
- up_write(&sb->s_umount);
- bdev_handle = bdev_open_by_dev(j_dev, BLK_OPEN_READ | BLK_OPEN_WRITE,
- sb, &fs_holder_ops);
- down_write(&sb->s_umount);
+ bdev_handle = bdev_open_by_dev(j_dev,
+ BLK_OPEN_READ | BLK_OPEN_WRITE | BLK_OPEN_RESTRICT_WRITES,
+ sb, &fs_holder_ops);
if (IS_ERR(bdev_handle)) {
ext4_msg(sb, KERN_ERR,
"failed to open journal device unknown-block(%u,%u) %ld",
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index 36e5dab6baae..6b2af514660d 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -1944,7 +1944,7 @@ void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino)
continue;
}
- generic_error_remove_page(mapping, &folio->page);
+ generic_error_remove_folio(mapping, folio);
folio_unlock(folio);
}
folio_batch_release(&fbatch);
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index e50363583f01..4580dfefd5e9 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -2239,11 +2239,11 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg)
switch (in) {
case F2FS_GOING_DOWN_FULLSYNC:
- ret = freeze_bdev(sb->s_bdev);
+ ret = bdev_freeze(sb->s_bdev);
if (ret)
goto out;
f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN);
- thaw_bdev(sb->s_bdev);
+ bdev_thaw(sb->s_bdev);
break;
case F2FS_GOING_DOWN_METASYNC:
/* do checkpoint only */
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 560bfcad1af2..a9eb3891f417 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -600,7 +600,7 @@ make_now:
#ifdef CONFIG_F2FS_FS_COMPRESSION
inode->i_mapping->a_ops = &f2fs_compress_aops;
/*
- * generic_error_remove_page only truncates pages of regular
+ * generic_error_remove_folio only truncates pages of regular
* inode
*/
inode->i_mode |= S_IFREG;
diff --git a/fs/file.c b/fs/file.c
index 5fb0b146e79e..3b683b9101d8 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -629,19 +629,23 @@ void fd_install(unsigned int fd, struct file *file)
EXPORT_SYMBOL(fd_install);
/**
- * pick_file - return file associatd with fd
+ * file_close_fd_locked - return file associated with fd
* @files: file struct to retrieve file from
* @fd: file descriptor to retrieve file for
*
+ * Doesn't take a separate reference count.
+ *
* Context: files_lock must be held.
*
* Returns: The file associated with @fd (NULL if @fd is not open)
*/
-static struct file *pick_file(struct files_struct *files, unsigned fd)
+struct file *file_close_fd_locked(struct files_struct *files, unsigned fd)
{
struct fdtable *fdt = files_fdtable(files);
struct file *file;
+ lockdep_assert_held(&files->file_lock);
+
if (fd >= fdt->max_fds)
return NULL;
@@ -660,7 +664,7 @@ int close_fd(unsigned fd)
struct file *file;
spin_lock(&files->file_lock);
- file = pick_file(files, fd);
+ file = file_close_fd_locked(files, fd);
spin_unlock(&files->file_lock);
if (!file)
return -EBADF;
@@ -707,7 +711,7 @@ static inline void __range_close(struct files_struct *files, unsigned int fd,
max_fd = min(max_fd, n);
for (; fd <= max_fd; fd++) {
- file = pick_file(files, fd);
+ file = file_close_fd_locked(files, fd);
if (file) {
spin_unlock(&files->file_lock);
filp_close(file, files);
@@ -795,26 +799,21 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
return 0;
}
-/*
- * See close_fd_get_file() below, this variant assumes current->files->file_lock
- * is held.
- */
-struct file *__close_fd_get_file(unsigned int fd)
-{
- return pick_file(current->files, fd);
-}
-
-/*
- * variant of close_fd that gets a ref on the file for later fput.
- * The caller must ensure that filp_close() called on the file.
+/**
+ * file_close_fd - return file associated with fd
+ * @fd: file descriptor to retrieve file for
+ *
+ * Doesn't take a separate reference count.
+ *
+ * Returns: The file associated with @fd (NULL if @fd is not open)
*/
-struct file *close_fd_get_file(unsigned int fd)
+struct file *file_close_fd(unsigned int fd)
{
struct files_struct *files = current->files;
struct file *file;
spin_lock(&files->file_lock);
- file = pick_file(files, fd);
+ file = file_close_fd_locked(files, fd);
spin_unlock(&files->file_lock);
return file;
@@ -959,31 +958,45 @@ static inline struct file *__fget_files_rcu(struct files_struct *files,
struct file *file;
struct fdtable *fdt = rcu_dereference_raw(files->fdt);
struct file __rcu **fdentry;
+ unsigned long nospec_mask;
- if (unlikely(fd >= fdt->max_fds))
- return NULL;
-
- fdentry = fdt->fd + array_index_nospec(fd, fdt->max_fds);
+ /* Mask is zero for invalid fds, ~0 for valid ones */
+ nospec_mask = array_index_mask_nospec(fd, fdt->max_fds);
/*
- * Ok, we have a file pointer. However, because we do
- * this all locklessly under RCU, we may be racing with
- * that file being closed.
- *
- * Such a race can take two forms:
- *
- * (a) the file ref already went down to zero and the
- * file hasn't been reused yet or the file count
- * isn't zero but the file has already been reused.
+ * fdentry points to the 'fd' offset, or fdt->fd[0].
+ * Loading from fdt->fd[0] is always safe, because the
+ * array always exists.
*/
- file = __get_file_rcu(fdentry);
+ fdentry = fdt->fd + (fd & nospec_mask);
+
+ /* Do the load, then mask any invalid result */
+ file = rcu_dereference_raw(*fdentry);
+ file = (void *)(nospec_mask & (unsigned long)file);
if (unlikely(!file))
return NULL;
- if (unlikely(IS_ERR(file)))
+ /*
+ * Ok, we have a file pointer that was valid at
+ * some point, but it might have become stale since.
+ *
+ * We need to confirm it by incrementing the refcount
+ * and then check the lookup again.
+ *
+ * atomic_long_inc_not_zero() gives us a full memory
+ * barrier. We only really need an 'acquire' one to
+ * protect the loads below, but we don't have that.
+ */
+ if (unlikely(!atomic_long_inc_not_zero(&file->f_count)))
continue;
/*
+ * Such a race can take two forms:
+ *
+ * (a) the file ref already went down to zero and the
+ * file hasn't been reused yet or the file count
+ * isn't zero but the file has already been reused.
+ *
* (b) the file table entry has changed under us.
* Note that we don't need to re-check the 'fdt->fd'
* pointer having changed, because it always goes
@@ -991,7 +1004,8 @@ static inline struct file *__fget_files_rcu(struct files_struct *files,
*
* If so, we need to put our ref and try again.
*/
- if (unlikely(rcu_dereference_raw(files->fdt) != fdt)) {
+ if (unlikely(file != rcu_dereference_raw(*fdentry)) ||
+ unlikely(rcu_dereference_raw(files->fdt) != fdt)) {
fput(file);
continue;
}
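The lookup is deliberately branchless: array_index_mask_nospec() evaluates to ~0UL for an in-range fd and 0 otherwise, so a bad fd is clamped to slot 0 and the loaded pointer is masked to NULL instead of being reached through a mispredicted branch. A userspace sketch of the same trick (relies on arithmetic right shift of a negative value, as the kernel's generic helper does):

    #include <stdint.h>
    #include <stdio.h>

    /* Stand-in for array_index_mask_nospec(): ~0UL when idx < size, 0UL
     * otherwise, computed without a data-dependent branch (assumes idx and
     * size both fit in a signed 64-bit value). */
    static unsigned long index_mask(unsigned long idx, unsigned long size)
    {
        return ~(long)(idx | (size - 1UL - idx)) >> 63;
    }

    int main(void)
    {
        void *table[4] = { &table, &table, &table, &table };
        unsigned long fd = 7;   /* out of range on purpose */
        unsigned long mask = index_mask(fd, 4);

        /* Clamp the index so the load stays in bounds, then null out the
         * result if the index was invalid, as __fget_files_rcu() does. */
        void *entry = table[fd & mask];

        entry = (void *)(mask & (unsigned long)entry);
        printf("%p\n", entry);  /* NULL for the out-of-range fd */
        return 0;
    }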
@@ -1128,13 +1142,13 @@ static unsigned long __fget_light(unsigned int fd, fmode_t mask)
* atomic_read_acquire() pairs with atomic_dec_and_test() in
* put_files_struct().
*/
- if (atomic_read_acquire(&files->count) == 1) {
+ if (likely(atomic_read_acquire(&files->count) == 1)) {
file = files_lookup_fd_raw(files, fd);
if (!file || unlikely(file->f_mode & mask))
return 0;
return (unsigned long)file;
} else {
- file = __fget(fd, mask);
+ file = __fget_files(files, fd, mask);
if (!file)
return 0;
return FDPUT_FPUT | (unsigned long)file;
@@ -1282,7 +1296,7 @@ out_unlock:
}
/**
- * __receive_fd() - Install received file into file descriptor table
+ * receive_fd() - Install received file into file descriptor table
* @file: struct file that was received from another process
* @ufd: __user pointer to write new fd number to
* @o_flags: the O_* flags to apply to the new fd entry
@@ -1296,7 +1310,7 @@ out_unlock:
*
* Returns newly installed fd or -ve on error.
*/
-int __receive_fd(struct file *file, int __user *ufd, unsigned int o_flags)
+int receive_fd(struct file *file, int __user *ufd, unsigned int o_flags)
{
int new_fd;
int error;
@@ -1321,6 +1335,7 @@ int __receive_fd(struct file *file, int __user *ufd, unsigned int o_flags)
__receive_sock(file);
return new_fd;
}
+EXPORT_SYMBOL_GPL(receive_fd);
int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags)
{
@@ -1336,12 +1351,6 @@ int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags)
return new_fd;
}
-int receive_fd(struct file *file, unsigned int o_flags)
-{
- return __receive_fd(file, NULL, o_flags);
-}
-EXPORT_SYMBOL_GPL(receive_fd);
-
static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags)
{
int err = -EBADF;
diff --git a/fs/file_table.c b/fs/file_table.c
index de4a2915bfd4..3ba764d73fc9 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -75,18 +75,6 @@ static inline void file_free(struct file *f)
}
}
-void release_empty_file(struct file *f)
-{
- WARN_ON_ONCE(f->f_mode & (FMODE_BACKING | FMODE_OPENED));
- if (atomic_long_dec_and_test(&f->f_count)) {
- security_file_free(f);
- put_cred(f->f_cred);
- if (likely(!(f->f_mode & FMODE_NOACCOUNT)))
- percpu_counter_dec(&nr_files);
- kmem_cache_free(filp_cachep, f);
- }
-}
-
/*
* Return the total number of open files in the system
*/
@@ -419,7 +407,7 @@ static void delayed_fput(struct work_struct *unused)
static void ____fput(struct callback_head *work)
{
- __fput(container_of(work, struct file, f_rcuhead));
+ __fput(container_of(work, struct file, f_task_work));
}
/*
@@ -445,9 +433,13 @@ void fput(struct file *file)
if (atomic_long_dec_and_test(&file->f_count)) {
struct task_struct *task = current;
+ if (unlikely(!(file->f_mode & (FMODE_BACKING | FMODE_OPENED)))) {
+ file_free(file);
+ return;
+ }
if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
- init_task_work(&file->f_rcuhead, ____fput);
- if (!task_work_add(task, &file->f_rcuhead, TWA_RESUME))
+ init_task_work(&file->f_task_work, ____fput);
+ if (!task_work_add(task, &file->f_task_work, TWA_RESUME))
return;
/*
* After this task has run exit_task_work(),
diff --git a/fs/freevxfs/vxfs_bmap.c b/fs/freevxfs/vxfs_bmap.c
index de2a5bccb930..26d367e3668d 100644
--- a/fs/freevxfs/vxfs_bmap.c
+++ b/fs/freevxfs/vxfs_bmap.c
@@ -29,7 +29,7 @@ vxfs_typdump(struct vxfs_typed *typ)
/**
* vxfs_bmap_ext4 - do bmap for ext4 extents
* @ip: pointer to the inode we do bmap for
- * @iblock: logical block.
+ * @bn: logical block.
*
* Description:
* vxfs_bmap_ext4 performs the bmap operation for inodes with
@@ -97,7 +97,7 @@ fail_buf:
* vxfs_bmap_indir reads a &struct vxfs_typed at @indir
* and performs the type-defined action.
*
- * Return Value:
+ * Returns:
* The physical block number on success, else Zero.
*
* Note:
@@ -179,7 +179,7 @@ out:
* Description:
* Performs the bmap operation for typed extents.
*
- * Return Value:
+ * Returns:
* The physical block number on success, else Zero.
*/
static daddr_t
@@ -243,7 +243,7 @@ vxfs_bmap_typed(struct inode *ip, long iblock)
* vxfs_bmap1 performs a logical to physical block mapping
* for vxfs-internal purposes.
*
- * Return Value:
+ * Returns:
* The physical block number on success, else Zero.
*/
daddr_t
diff --git a/fs/freevxfs/vxfs_immed.c b/fs/freevxfs/vxfs_immed.c
index 9b49ec36e667..ed51fcd34757 100644
--- a/fs/freevxfs/vxfs_immed.c
+++ b/fs/freevxfs/vxfs_immed.c
@@ -15,7 +15,7 @@
/**
* vxfs_immed_read_folio - read part of an immed inode into pagecache
- * @file: file context (unused)
+ * @fp: file context (unused)
* @folio: folio to fill in.
*
* Description:
diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c
index f04ba2ed1e1a..1b0bca8b4cc6 100644
--- a/fs/freevxfs/vxfs_lookup.c
+++ b/fs/freevxfs/vxfs_lookup.c
@@ -177,8 +177,7 @@ vxfs_lookup(struct inode *dip, struct dentry *dp, unsigned int flags)
/**
* vxfs_readdir - read a directory
* @fp: the directory to read
- * @retp: return buffer
- * @filler: filldir callback
+ * @ctx: dir_context for filldir/readdir
*
* Description:
* vxfs_readdir fills @ctx with directory entries from @fp
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index a660f1f21540..148a71b8b4d0 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -19,6 +19,7 @@
#include <linux/uio.h>
#include <linux/fs.h>
#include <linux/filelock.h>
+#include <linux/splice.h>
static int fuse_send_open(struct fuse_mount *fm, u64 nodeid,
unsigned int open_flags, int opcode,
@@ -3195,8 +3196,8 @@ static ssize_t fuse_copy_file_range(struct file *src_file, loff_t src_off,
len, flags);
if (ret == -EOPNOTSUPP || ret == -EXDEV)
- ret = generic_copy_file_range(src_file, src_off, dst_file,
- dst_off, len, flags);
+ ret = splice_copy_file_range(src_file, src_off, dst_file,
+ dst_off, len);
return ret;
}
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 9611bfceda4b..974aca9c8ea8 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -82,11 +82,11 @@ static int gfs2_get_block_noalloc(struct inode *inode, sector_t lblock,
}
/**
- * gfs2_write_jdata_folio - gfs2 jdata-specific version of block_write_full_page
+ * gfs2_write_jdata_folio - gfs2 jdata-specific version of block_write_full_folio
* @folio: The folio to write
* @wbc: The writeback control
*
- * This is the same as calling block_write_full_page, but it also
+ * This is the same as calling block_write_full_folio, but it also
* writes pages outside of i_size
*/
static int gfs2_write_jdata_folio(struct folio *folio,
@@ -108,7 +108,7 @@ static int gfs2_write_jdata_folio(struct folio *folio,
folio_size(folio));
return __block_write_full_folio(inode, folio, gfs2_get_block_noalloc,
- wbc, end_buffer_async_write);
+ wbc);
}
/**
@@ -403,18 +403,18 @@ static int gfs2_jdata_writepages(struct address_space *mapping,
}
/**
- * stuffed_readpage - Fill in a Linux folio with stuffed file data
+ * stuffed_read_folio - Fill in a Linux folio with stuffed file data
* @ip: the inode
* @folio: the folio
*
* Returns: errno
*/
-static int stuffed_readpage(struct gfs2_inode *ip, struct folio *folio)
+static int stuffed_read_folio(struct gfs2_inode *ip, struct folio *folio)
{
- struct buffer_head *dibh;
- size_t i_size = i_size_read(&ip->i_inode);
- void *data;
- int error;
+ struct buffer_head *dibh = NULL;
+ size_t dsize = i_size_read(&ip->i_inode);
+ void *from = NULL;
+ int error = 0;
/*
* Due to the order of unstuffing files and ->fault(), we can be
@@ -422,22 +422,20 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct folio *folio)
* so we need to supply one here. It doesn't happen often.
*/
if (unlikely(folio->index)) {
- folio_zero_range(folio, 0, folio_size(folio));
- folio_mark_uptodate(folio);
- return 0;
+ dsize = 0;
+ } else {
+ error = gfs2_meta_inode_buffer(ip, &dibh);
+ if (error)
+ goto out;
+ from = dibh->b_data + sizeof(struct gfs2_dinode);
}
- error = gfs2_meta_inode_buffer(ip, &dibh);
- if (error)
- return error;
-
- data = dibh->b_data + sizeof(struct gfs2_dinode);
- memcpy_to_folio(folio, 0, data, i_size);
- folio_zero_range(folio, i_size, folio_size(folio) - i_size);
+ folio_fill_tail(folio, 0, from, dsize);
brelse(dibh);
- folio_mark_uptodate(folio);
+out:
+ folio_end_read(folio, error == 0);
- return 0;
+ return error;
}
/**
@@ -456,13 +454,12 @@ static int gfs2_read_folio(struct file *file, struct folio *folio)
(i_blocksize(inode) == PAGE_SIZE && !folio_buffers(folio))) {
error = iomap_read_folio(folio, &gfs2_iomap_ops);
} else if (gfs2_is_stuffed(ip)) {
- error = stuffed_readpage(ip, folio);
- folio_unlock(folio);
+ error = stuffed_read_folio(ip, folio);
} else {
error = mpage_read_folio(folio, gfs2_block_map);
}
- if (unlikely(gfs2_withdrawn(sdp)))
+ if (gfs2_withdrawing_or_withdrawn(sdp))
return -EIO;
return error;
@@ -748,7 +745,7 @@ static const struct address_space_operations gfs2_aops = {
.bmap = gfs2_bmap,
.migrate_folio = filemap_migrate_folio,
.is_partially_uptodate = iomap_is_partially_uptodate,
- .error_remove_page = generic_error_remove_page,
+ .error_remove_folio = generic_error_remove_folio,
};
static const struct address_space_operations gfs2_jdata_aops = {
@@ -761,7 +758,7 @@ static const struct address_space_operations gfs2_jdata_aops = {
.invalidate_folio = gfs2_invalidate_folio,
.release_folio = gfs2_release_folio,
.is_partially_uptodate = block_is_partially_uptodate,
- .error_remove_page = generic_error_remove_page,
+ .error_remove_folio = generic_error_remove_folio,
};
void gfs2_set_aops(struct inode *inode)
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index 2e215e8c3c88..177f1f41f225 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -32,21 +32,25 @@
static int gfs2_drevalidate(struct dentry *dentry, unsigned int flags)
{
- struct dentry *parent;
+ struct dentry *parent = NULL;
struct gfs2_sbd *sdp;
struct gfs2_inode *dip;
- struct inode *inode;
+ struct inode *dinode, *inode;
struct gfs2_holder d_gh;
struct gfs2_inode *ip = NULL;
int error, valid = 0;
int had_lock = 0;
- if (flags & LOOKUP_RCU)
- return -ECHILD;
-
- parent = dget_parent(dentry);
- sdp = GFS2_SB(d_inode(parent));
- dip = GFS2_I(d_inode(parent));
+ if (flags & LOOKUP_RCU) {
+ dinode = d_inode_rcu(READ_ONCE(dentry->d_parent));
+ if (!dinode)
+ return -ECHILD;
+ } else {
+ parent = dget_parent(dentry);
+ dinode = d_inode(parent);
+ }
+ sdp = GFS2_SB(dinode);
+ dip = GFS2_I(dinode);
inode = d_inode(dentry);
if (inode) {
@@ -62,7 +66,8 @@ static int gfs2_drevalidate(struct dentry *dentry, unsigned int flags)
had_lock = (gfs2_glock_is_locked_by_me(dip->i_gl) != NULL);
if (!had_lock) {
- error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
+ error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED,
+ flags & LOOKUP_RCU ? GL_NOBLOCK : 0, &d_gh);
if (error)
goto out;
}
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index cf40895233f5..d418d8b5367f 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -138,8 +138,6 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
return ERR_PTR(-ESTALE);
inode = gfs2_lookup_by_inum(sdp, inum->no_addr, inum->no_formal_ino,
GFS2_BLKST_DINODE);
- if (IS_ERR(inode))
- return ERR_CAST(inode);
return d_obtain_alias(inode);
}
@@ -192,5 +190,6 @@ const struct export_operations gfs2_export_ops = {
.fh_to_parent = gfs2_fh_to_parent,
.get_name = gfs2_get_name,
.get_parent = gfs2_get_parent,
+ .flags = EXPORT_OP_ASYNC_LOCK,
};
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 4b66efc1a82a..992ca4effb50 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -1442,7 +1442,7 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
if (!(fl->fl_flags & FL_POSIX))
return -ENOLCK;
- if (unlikely(gfs2_withdrawn(sdp))) {
+ if (gfs2_withdrawing_or_withdrawn(sdp)) {
if (fl->fl_type == F_UNLCK)
locks_lock_file_wait(file, fl);
return -EIO;
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index d6bf1f8c25dc..34540f9d011c 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -156,7 +156,7 @@ static bool glock_blocked_by_withdraw(struct gfs2_glock *gl)
{
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
- if (likely(!gfs2_withdrawn(sdp)))
+ if (!gfs2_withdrawing_or_withdrawn(sdp))
return false;
if (gl->gl_ops->go_flags & GLOF_NONDISK)
return false;
@@ -278,7 +278,7 @@ static void __gfs2_glock_put(struct gfs2_glock *gl)
GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
if (mapping) {
truncate_inode_pages_final(mapping);
- if (!gfs2_withdrawn(sdp))
+ if (!gfs2_withdrawing_or_withdrawn(sdp))
GLOCK_BUG_ON(gl, !mapping_empty(mapping));
}
trace_gfs2_glock_put(gl);
@@ -517,6 +517,23 @@ static inline struct gfs2_holder *find_first_waiter(const struct gfs2_glock *gl)
}
/**
+ * find_last_waiter - find the last gh that's waiting for the glock
+ * @gl: the glock
+ *
+ * This is also a fast way of finding out if there are any waiters.
+ */
+
+static inline struct gfs2_holder *find_last_waiter(const struct gfs2_glock *gl)
+{
+ struct gfs2_holder *gh;
+
+ if (list_empty(&gl->gl_holders))
+ return NULL;
+ gh = list_last_entry(&gl->gl_holders, struct gfs2_holder, gh_list);
+ return test_bit(HIF_HOLDER, &gh->gh_iflags) ? NULL : gh;
+}
+
+/**
* state_change - record that the glock is now in a different state
* @gl: the glock
* @new_state: the new state
@@ -757,7 +774,7 @@ skip_inval:
* gfs2_gl_hash_clear calls clear_glock) and recovery is complete
* then it's okay to tell dlm to unlock it.
*/
- if (unlikely(sdp->sd_log_error && !gfs2_withdrawn(sdp)))
+ if (unlikely(sdp->sd_log_error) && !gfs2_withdrawing_or_withdrawn(sdp))
gfs2_withdraw_delayed(sdp);
if (glock_blocked_by_withdraw(gl) &&
(target != LM_ST_UNLOCKED ||
@@ -794,7 +811,7 @@ skip_inval:
gfs2_glock_queue_work(gl, 0);
} else if (ret) {
fs_err(sdp, "lm_lock ret %d\n", ret);
- GLOCK_BUG_ON(gl, !gfs2_withdrawn(sdp));
+ GLOCK_BUG_ON(gl, !gfs2_withdrawing_or_withdrawn(sdp));
}
} else { /* lock_nolock */
finish_xmote(gl, target);
@@ -1213,7 +1230,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
mapping->host = s->s_bdev->bd_inode;
mapping->flags = 0;
mapping_set_gfp_mask(mapping, GFP_NOFS);
- mapping->private_data = NULL;
+ mapping->i_private_data = NULL;
mapping->writeback_index = 0;
}
@@ -1555,11 +1572,30 @@ trap_recursive:
int gfs2_glock_nq(struct gfs2_holder *gh)
{
struct gfs2_glock *gl = gh->gh_gl;
- int error = 0;
+ int error;
if (glock_blocked_by_withdraw(gl) && !(gh->gh_flags & LM_FLAG_NOEXP))
return -EIO;
+ if (gh->gh_flags & GL_NOBLOCK) {
+ struct gfs2_holder *current_gh;
+
+ error = -ECHILD;
+ spin_lock(&gl->gl_lockref.lock);
+ if (find_last_waiter(gl))
+ goto unlock;
+ current_gh = find_first_holder(gl);
+ if (!may_grant(gl, current_gh, gh))
+ goto unlock;
+ set_bit(HIF_HOLDER, &gh->gh_iflags);
+ list_add_tail(&gh->gh_list, &gl->gl_holders);
+ trace_gfs2_promote(gh);
+ error = 0;
+unlock:
+ spin_unlock(&gl->gl_lockref.lock);
+ return error;
+ }
+
if (test_bit(GLF_LRU, &gl->gl_flags))
gfs2_glock_remove_from_lru(gl);
@@ -1575,6 +1611,7 @@ int gfs2_glock_nq(struct gfs2_holder *gh)
run_queue(gl, 1);
spin_unlock(&gl->gl_lockref.lock);
+ error = 0;
if (!(gh->gh_flags & GL_ASYNC))
error = gfs2_glock_wait(gh);
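
For context, GL_NOBLOCK is what the RCU-walk conversions elsewhere in this series rely on; a hypothetical caller fragment (identifiers assumed) showing the intended shape:

/* Under LOOKUP_RCU we must not sleep: GL_NOBLOCK either grants the
 * glock immediately or fails with -ECHILD, after which the VFS
 * retries the lookup in ref-walk mode where blocking is allowed. */
error = gfs2_glock_nq_init(gl, LM_ST_SHARED, GL_NOBLOCK, &gh);
if (error)
	return error;	/* typically -ECHILD: fall back to ref-walk */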
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 61197598abfd..0114f3e0ebe0 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -84,6 +84,7 @@ enum {
#define GL_SKIP 0x0100
#define GL_NOPID 0x0200
#define GL_NOCACHE 0x0400
+#define GL_NOBLOCK 0x0800
/*
* lm_async_cb return flags
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index b41c78bd2cc0..45653cbc8a87 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -174,7 +174,7 @@ static int gfs2_rgrp_metasync(struct gfs2_glock *gl)
filemap_fdatawrite_range(metamapping, start, end);
error = filemap_fdatawait_range(metamapping, start, end);
- WARN_ON_ONCE(error && !gfs2_withdrawn(sdp));
+ WARN_ON_ONCE(error && !gfs2_withdrawing_or_withdrawn(sdp));
mapping_set_error(metamapping, error);
if (error)
gfs2_io_error(sdp);
@@ -494,7 +494,7 @@ int gfs2_inode_refresh(struct gfs2_inode *ip)
/**
* inode_go_instantiate - read in an inode if necessary
- * @gh: The glock holder
+ * @gl: The glock
*
* Returns: errno
*/
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 1b95db2c3aac..6bfc9383b7b8 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -1882,10 +1882,10 @@ int gfs2_permission(struct mnt_idmap *idmap, struct inode *inode,
WARN_ON_ONCE(!may_not_block);
return -ECHILD;
}
- if (gfs2_glock_is_locked_by_me(gl) == NULL) {
- if (may_not_block)
- return -ECHILD;
- error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
+ if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) {
+ int noblock = may_not_block ? GL_NOBLOCK : 0;
+ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED,
+ LM_FLAG_ANY | noblock, &i_gh);
if (error)
return error;
}
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 59ab18c79889..d1ac5d0679ea 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -1122,7 +1122,7 @@ static void gdlm_recover_prep(void *arg)
struct gfs2_sbd *sdp = arg;
struct lm_lockstruct *ls = &sdp->sd_lockstruct;
- if (gfs2_withdrawn(sdp)) {
+ if (gfs2_withdrawing_or_withdrawn(sdp)) {
fs_err(sdp, "recover_prep ignored due to withdraw.\n");
return;
}
@@ -1148,7 +1148,7 @@ static void gdlm_recover_slot(void *arg, struct dlm_slot *slot)
struct lm_lockstruct *ls = &sdp->sd_lockstruct;
int jid = slot->slot - 1;
- if (gfs2_withdrawn(sdp)) {
+ if (gfs2_withdrawing_or_withdrawn(sdp)) {
fs_err(sdp, "recover_slot jid %d ignored due to withdraw.\n",
jid);
return;
@@ -1177,7 +1177,7 @@ static void gdlm_recover_done(void *arg, struct dlm_slot *slots, int num_slots,
struct gfs2_sbd *sdp = arg;
struct lm_lockstruct *ls = &sdp->sd_lockstruct;
- if (gfs2_withdrawn(sdp)) {
+ if (gfs2_withdrawing_or_withdrawn(sdp)) {
fs_err(sdp, "recover_done ignored due to withdraw.\n");
return;
}
@@ -1208,7 +1208,7 @@ static void gdlm_recovery_result(struct gfs2_sbd *sdp, unsigned int jid,
{
struct lm_lockstruct *ls = &sdp->sd_lockstruct;
- if (gfs2_withdrawn(sdp)) {
+ if (gfs2_withdrawing_or_withdrawn(sdp)) {
fs_err(sdp, "recovery_result jid %d ignored due to withdraw.\n",
jid);
return;
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index e5271ae87d1c..8cddf955ebc0 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -126,7 +126,7 @@ __acquires(&sdp->sd_ail_lock)
}
}
- if (gfs2_withdrawn(sdp)) {
+ if (gfs2_withdrawing_or_withdrawn(sdp)) {
gfs2_remove_from_ail(bd);
continue;
}
@@ -352,14 +352,15 @@ static int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr,
* @sdp: The superblock
* @max_revokes: If non-zero, add revokes where appropriate
*
- * Tries to empty the ail1 lists, starting with the oldest first
+ * Tries to empty the ail1 lists, starting with the oldest first.
+ * Returns %true if the ail1 list is now empty.
*/
-static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int max_revokes)
+static bool gfs2_ail1_empty(struct gfs2_sbd *sdp, int max_revokes)
{
struct gfs2_trans *tr, *s;
int oldest_tr = 1;
- int ret;
+ bool empty;
spin_lock(&sdp->sd_ail_lock);
list_for_each_entry_safe_reverse(tr, s, &sdp->sd_ail1_list, tr_list) {
@@ -369,15 +370,10 @@ static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int max_revokes)
oldest_tr = 0;
}
gfs2_log_update_flush_tail(sdp);
- ret = list_empty(&sdp->sd_ail1_list);
+ empty = list_empty(&sdp->sd_ail1_list);
spin_unlock(&sdp->sd_ail_lock);
- if (test_bit(SDF_WITHDRAWING, &sdp->sd_flags)) {
- gfs2_lm(sdp, "fatal: I/O error(s)\n");
- gfs2_withdraw(sdp);
- }
-
- return ret;
+ return empty;
}
static void gfs2_ail1_wait(struct gfs2_sbd *sdp)
@@ -814,6 +810,9 @@ void gfs2_flush_revokes(struct gfs2_sbd *sdp)
gfs2_log_lock(sdp);
gfs2_ail1_empty(sdp, max_revokes);
gfs2_log_unlock(sdp);
+
+ if (gfs2_withdrawing(sdp))
+ gfs2_withdraw(sdp);
}
/**
@@ -841,7 +840,7 @@ void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd,
struct super_block *sb = sdp->sd_vfs;
u64 dblock;
- if (gfs2_withdrawn(sdp))
+ if (gfs2_withdrawing_or_withdrawn(sdp))
return;
page = mempool_alloc(gfs2_page_pool, GFP_NOIO);
@@ -914,8 +913,9 @@ void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd,
static void log_write_header(struct gfs2_sbd *sdp, u32 flags)
{
blk_opf_t op_flags = REQ_PREFLUSH | REQ_FUA | REQ_META | REQ_SYNC;
+ struct super_block *sb = sdp->sd_vfs;
- gfs2_assert_withdraw(sdp, !test_bit(SDF_FROZEN, &sdp->sd_flags));
+ gfs2_assert_withdraw(sdp, sb->s_writers.frozen != SB_FREEZE_COMPLETE);
if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) {
gfs2_ordered_wait(sdp);
@@ -974,8 +974,9 @@ void gfs2_ail_drain(struct gfs2_sbd *sdp)
static void empty_ail1_list(struct gfs2_sbd *sdp)
{
unsigned long start = jiffies;
+ bool empty = false;
- for (;;) {
+ while (!empty) {
if (time_after(jiffies, start + (HZ * 600))) {
fs_err(sdp, "Error: In %s for 10 minutes! t=%d\n",
__func__, current->journal_info ? 1 : 0);
@@ -984,9 +985,14 @@ static void empty_ail1_list(struct gfs2_sbd *sdp)
}
gfs2_ail1_start(sdp);
gfs2_ail1_wait(sdp);
- if (gfs2_ail1_empty(sdp, 0))
- return;
+ empty = gfs2_ail1_empty(sdp, 0);
+
+ if (gfs2_withdrawing_or_withdrawn(sdp))
+ break;
}
+
+ if (gfs2_withdrawing(sdp))
+ gfs2_withdraw(sdp);
}
/**
@@ -1047,7 +1053,8 @@ repeat:
* Do this check while holding the log_flush_lock to prevent new
* buffers from being added to the ail via gfs2_pin()
*/
- if (gfs2_withdrawn(sdp) || !test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
+ if (gfs2_withdrawing_or_withdrawn(sdp) ||
+ !test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
goto out;
/* Log might have been flushed while we waited for the flush lock */
@@ -1096,13 +1103,13 @@ repeat:
goto out_withdraw;
gfs2_ordered_write(sdp);
- if (gfs2_withdrawn(sdp))
+ if (gfs2_withdrawing_or_withdrawn(sdp))
goto out_withdraw;
lops_before_commit(sdp, tr);
- if (gfs2_withdrawn(sdp))
+ if (gfs2_withdrawing_or_withdrawn(sdp))
goto out_withdraw;
gfs2_log_submit_bio(&sdp->sd_jdesc->jd_log_bio, REQ_OP_WRITE);
- if (gfs2_withdrawn(sdp))
+ if (gfs2_withdrawing_or_withdrawn(sdp))
goto out_withdraw;
if (sdp->sd_log_head != sdp->sd_log_flush_head) {
@@ -1110,7 +1117,7 @@ repeat:
} else if (sdp->sd_log_tail != sdp->sd_log_flush_tail && !sdp->sd_log_idle) {
log_write_header(sdp, flags);
}
- if (gfs2_withdrawn(sdp))
+ if (gfs2_withdrawing_or_withdrawn(sdp))
goto out_withdraw;
lops_after_commit(sdp, tr);
@@ -1128,7 +1135,7 @@ repeat:
if (!(flags & GFS2_LOG_HEAD_FLUSH_NORMAL)) {
if (!sdp->sd_log_idle) {
empty_ail1_list(sdp);
- if (gfs2_withdrawn(sdp))
+ if (gfs2_withdrawing_or_withdrawn(sdp))
goto out_withdraw;
log_write_header(sdp, flags);
}
@@ -1297,8 +1304,9 @@ int gfs2_logd(void *data)
struct gfs2_sbd *sdp = data;
unsigned long t = 1;
+ set_freezable();
while (!kthread_should_stop()) {
- if (gfs2_withdrawn(sdp))
+ if (gfs2_withdrawing_or_withdrawn(sdp))
break;
/* Check for errors writing to the journal */
@@ -1330,18 +1338,19 @@ int gfs2_logd(void *data)
t = gfs2_tune_get(sdp, gt_logd_secs) * HZ;
- try_to_freeze();
-
- t = wait_event_interruptible_timeout(sdp->sd_logd_waitq,
+ t = wait_event_freezable_timeout(sdp->sd_logd_waitq,
test_bit(SDF_FORCE_AIL_FLUSH, &sdp->sd_flags) ||
gfs2_ail_flush_reqd(sdp) ||
gfs2_jrnl_flush_reqd(sdp) ||
sdp->sd_log_error ||
- gfs2_withdrawn(sdp) ||
+ gfs2_withdrawing_or_withdrawn(sdp) ||
kthread_should_stop(),
t);
}
+ if (gfs2_withdrawing(sdp))
+ gfs2_withdraw(sdp);
+
return 0;
}
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 483f69807062..314ec2a70167 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -391,22 +391,15 @@ static void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page)
* Simply unlock the pages in the bio. The main thread will wait on them and
* process them in order as necessary.
*/
-
static void gfs2_end_log_read(struct bio *bio)
{
- struct page *page;
- struct bio_vec *bvec;
- struct bvec_iter_all iter_all;
+ int error = blk_status_to_errno(bio->bi_status);
+ struct folio_iter fi;
- bio_for_each_segment_all(bvec, bio, iter_all) {
- page = bvec->bv_page;
- if (bio->bi_status) {
- int err = blk_status_to_errno(bio->bi_status);
-
- SetPageError(page);
- mapping_set_error(page->mapping, err);
- }
- unlock_page(page);
+ bio_for_each_folio_all(fi, bio) {
+ /* We're abusing wb_err to get the error to gfs2_find_jhead */
+ filemap_set_wb_err(fi.folio->mapping, error);
+ folio_end_read(fi.folio, !error);
}
bio_put(bio);
@@ -475,7 +468,7 @@ static void gfs2_jhead_process_page(struct gfs2_jdesc *jd, unsigned long index,
folio = filemap_get_folio(jd->jd_inode->i_mapping, index);
folio_wait_locked(folio);
- if (folio_test_error(folio))
+ if (!folio_test_uptodate(folio))
*done = true;
if (!*done)
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 25ceb0805df2..f814054c8cd0 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -252,7 +252,8 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
struct buffer_head *bh, *bhs[2];
int num = 0;
- if (unlikely(gfs2_withdrawn(sdp)) && !gfs2_withdraw_in_prog(sdp)) {
+ if (gfs2_withdrawing_or_withdrawn(sdp) &&
+ !gfs2_withdraw_in_prog(sdp)) {
*bhp = NULL;
return -EIO;
}
@@ -310,7 +311,8 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh)
{
- if (unlikely(gfs2_withdrawn(sdp)) && !gfs2_withdraw_in_prog(sdp))
+ if (gfs2_withdrawing_or_withdrawn(sdp) &&
+ !gfs2_withdraw_in_prog(sdp))
return -EIO;
wait_on_buffer(bh);
@@ -321,7 +323,8 @@ int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh)
gfs2_io_error_bh_wd(sdp, bh);
return -EIO;
}
- if (unlikely(gfs2_withdrawn(sdp)) && !gfs2_withdraw_in_prog(sdp))
+ if (gfs2_withdrawing_or_withdrawn(sdp) &&
+ !gfs2_withdraw_in_prog(sdp))
return -EIO;
return 0;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index b108c5d26839..1281e60be639 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -117,7 +117,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
mapping->host = sb->s_bdev->bd_inode;
mapping->flags = 0;
mapping_set_gfp_mask(mapping, GFP_NOFS);
- mapping->private_data = NULL;
+ mapping->i_private_data = NULL;
mapping->writeback_index = 0;
spin_lock_init(&sdp->sd_log_lock);
@@ -1073,7 +1073,7 @@ hostdata_error:
void gfs2_lm_unmount(struct gfs2_sbd *sdp)
{
const struct lm_lockops *lm = sdp->sd_lockstruct.ls_ops;
- if (likely(!gfs2_withdrawn(sdp)) && lm->lm_unmount)
+ if (!gfs2_withdrawing_or_withdrawn(sdp) && lm->lm_unmount)
lm->lm_unmount(sdp);
}
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 95dae7838b4e..aa9cf0102848 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -128,7 +128,7 @@ static void gfs2_qd_dispose(struct gfs2_quota_data *qd)
hlist_bl_del_rcu(&qd->qd_hlist);
spin_unlock_bucket(qd->qd_hash);
- if (!gfs2_withdrawn(sdp)) {
+ if (!gfs2_withdrawing_or_withdrawn(sdp)) {
gfs2_assert_warn(sdp, !qd->qd_change);
gfs2_assert_warn(sdp, !qd->qd_slot_ref);
gfs2_assert_warn(sdp, !qd->qd_bh_count);
@@ -271,7 +271,7 @@ static struct gfs2_quota_data *gfs2_qd_search_bucket(unsigned int hash,
if (qd->qd_sbd != sdp)
continue;
if (lockref_get_not_dead(&qd->qd_lockref)) {
- list_lru_del(&gfs2_qd_lru, &qd->qd_lru);
+ list_lru_del_obj(&gfs2_qd_lru, &qd->qd_lru);
return qd;
}
}
@@ -344,7 +344,7 @@ static void qd_put(struct gfs2_quota_data *qd)
}
qd->qd_lockref.count = 0;
- list_lru_add(&gfs2_qd_lru, &qd->qd_lru);
+ list_lru_add_obj(&gfs2_qd_lru, &qd->qd_lru);
spin_unlock(&qd->qd_lockref.lock);
}
@@ -1505,7 +1505,8 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
LIST_HEAD(dispose);
int count;
- BUG_ON(test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags));
+ BUG_ON(!test_bit(SDF_NORECOVERY, &sdp->sd_flags) &&
+ test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags));
spin_lock(&qd_lock);
list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) {
@@ -1517,7 +1518,7 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
lockref_mark_dead(&qd->qd_lockref);
spin_unlock(&qd->qd_lockref.lock);
- list_lru_del(&gfs2_qd_lru, &qd->qd_lru);
+ list_lru_del_obj(&gfs2_qd_lru, &qd->qd_lru);
list_add(&qd->qd_lru, &dispose);
}
spin_unlock(&qd_lock);
@@ -1539,7 +1540,7 @@ static void quotad_error(struct gfs2_sbd *sdp, const char *msg, int error)
{
if (error == 0 || error == -EROFS)
return;
- if (!gfs2_withdrawn(sdp)) {
+ if (!gfs2_withdrawing_or_withdrawn(sdp)) {
if (!cmpxchg(&sdp->sd_log_error, 0, error))
fs_err(sdp, "gfs2_quotad: %s error %d\n", msg, error);
wake_up(&sdp->sd_logd_waitq);
@@ -1582,8 +1583,9 @@ int gfs2_quotad(void *data)
unsigned long quotad_timeo = 0;
unsigned long t = 0;
+ set_freezable();
while (!kthread_should_stop()) {
- if (gfs2_withdrawn(sdp))
+ if (gfs2_withdrawing_or_withdrawn(sdp))
break;
/* Update the master statfs file */
@@ -1601,13 +1603,11 @@ int gfs2_quotad(void *data)
quotad_check_timeo(sdp, "sync", gfs2_quota_sync, t,
&quotad_timeo, &tune->gt_quota_quantum);
- try_to_freeze();
-
t = min(quotad_timeo, statfs_timeo);
- t = wait_event_interruptible_timeout(sdp->sd_quota_wait,
+ t = wait_event_freezable_timeout(sdp->sd_quota_wait,
sdp->sd_statfs_force_sync ||
- gfs2_withdrawn(sdp) ||
+ gfs2_withdrawing_or_withdrawn(sdp) ||
kthread_should_stop(),
t);
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 5aae02669a40..f4fe7039f725 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -411,7 +411,7 @@ void gfs2_recover_func(struct work_struct *work)
int error = 0;
int jlocked = 0;
- if (gfs2_withdrawn(sdp)) {
+ if (gfs2_withdrawing_or_withdrawn(sdp)) {
fs_err(sdp, "jid=%u: Recovery not attempted due to withdraw.\n",
jd->jd_jid);
goto fail;
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index c2060203b98a..26d6c1eea559 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -159,13 +159,13 @@ static inline u8 gfs2_testbit(const struct gfs2_rbm *rbm, bool use_clone)
}
/**
- * gfs2_bit_search
+ * gfs2_bit_search - search bitmap for a state
* @ptr: Pointer to bitmap data
* @mask: Mask to use (normally 0x55555.... but adjusted for search start)
* @state: The state we are searching for
*
- * We xor the bitmap data with a patter which is the bitwise opposite
- * of what we are looking for, this gives rise to a pattern of ones
+ * We xor the bitmap data with a pattern which is the bitwise opposite
+ * of what we are looking for. This gives rise to a pattern of ones
* wherever there is a match. Since we have two bits per entry, we
* take this pattern, shift it down by one place and then and it with
* the original. All the even bit positions (0,2,4, etc) then represent
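
A toy userspace restatement of that XOR trick (a sketch assuming 2-bit entries packed LSB-first; the real helper also folds in the search-start mask):

#include <stdio.h>

/* Find entries equal to 'state' in a word of 2-bit entries: XOR with
 * the complementary pattern turns matches into 0b11, then ANDing the
 * word with itself shifted right by one leaves a 1 on the even bit of
 * every matching entry. */
static unsigned long long bit_search(unsigned long long bits,
				     unsigned char state)
{
	unsigned long long pattern = 0;

	for (int i = 0; i < 64; i += 2)
		pattern |= (unsigned long long)((~state) & 3) << i;

	bits ^= pattern;		/* matches become 0b11 */
	bits &= bits >> 1;		/* even bits flag matches */
	return bits & 0x5555555555555555ULL;
}

int main(void)
{
	/* entries (LSB first): 0, 1, 2, 1 */
	printf("%#llx\n", bit_search(0x64, 1)); /* 0x44: entries 1 and 3 */
	return 0;
}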
@@ -1188,7 +1188,7 @@ static void rgrp_set_bitmap_flags(struct gfs2_rgrpd *rgd)
/**
* gfs2_rgrp_go_instantiate - Read in a RG's header and bitmaps
- * @gh: the glock holder representing the rgrpd to read in
+ * @gl: the glock representing the rgrpd to read in
*
* Read in all of a Resource Group's header and bitmap blocks.
* Caller must eventually call gfs2_rgrp_brelse() to free the bitmaps.
@@ -1967,7 +1967,7 @@ static bool gfs2_rgrp_congested(const struct gfs2_rgrpd *rgd, int loops)
}
/**
- * gfs2_rgrp_used_recently
+ * gfs2_rgrp_used_recently - test if an rgrp has been used recently
* @rs: The block reservation with the rgrp to test
* @msecs: The time limit in milliseconds
*
@@ -2306,7 +2306,7 @@ void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_rgrpd *rgd,
(unsigned long long)rgd->rd_addr, rgd->rd_flags,
rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes,
rgd->rd_requested, rgd->rd_reserved, rgd->rd_extfail_pt);
- if (rgd->rd_sbd->sd_args.ar_rgrplvb) {
+ if (rgd->rd_sbd->sd_args.ar_rgrplvb && rgd->rd_rgl) {
struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl;
gfs2_print_dbg(seq, "%s L: f:%02x b:%u i:%u\n", fs_id_buf,
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index d21c04a22d73..e5f79466340d 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -134,7 +134,7 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
int error;
j_gl->gl_ops->go_inval(j_gl, DIO_METADATA);
- if (gfs2_withdrawn(sdp))
+ if (gfs2_withdrawing_or_withdrawn(sdp))
return -EIO;
error = gfs2_find_jhead(sdp->sd_jdesc, &head, false);
@@ -153,7 +153,7 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
gfs2_log_pointers_init(sdp, head.lh_blkno);
error = gfs2_quota_init(sdp);
- if (!error && gfs2_withdrawn(sdp))
+ if (!error && gfs2_withdrawing_or_withdrawn(sdp))
error = -EIO;
if (!error)
set_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
@@ -499,7 +499,7 @@ static void gfs2_dirty_inode(struct inode *inode, int flags)
return;
}
- if (unlikely(gfs2_withdrawn(sdp)))
+ if (gfs2_withdrawing_or_withdrawn(sdp))
return;
if (!gfs2_glock_is_locked_by_me(ip->i_gl)) {
ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
@@ -605,7 +605,7 @@ restart:
if (!sb_rdonly(sb))
gfs2_make_fs_ro(sdp);
else {
- if (gfs2_withdrawn(sdp))
+ if (gfs2_withdrawing_or_withdrawn(sdp))
gfs2_destroy_threads(sdp);
gfs2_quota_cleanup(sdp);
@@ -673,28 +673,6 @@ static int gfs2_sync_fs(struct super_block *sb, int wait)
return sdp->sd_log_error;
}
-static int gfs2_freeze_locally(struct gfs2_sbd *sdp)
-{
- struct super_block *sb = sdp->sd_vfs;
- int error;
-
- error = freeze_super(sb, FREEZE_HOLDER_USERSPACE);
- if (error)
- return error;
-
- if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
- gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_FREEZE |
- GFS2_LFC_FREEZE_GO_SYNC);
- if (gfs2_withdrawn(sdp)) {
- error = thaw_super(sb, FREEZE_HOLDER_USERSPACE);
- if (error)
- return error;
- return -EIO;
- }
- }
- return 0;
-}
-
static int gfs2_do_thaw(struct gfs2_sbd *sdp)
{
struct super_block *sb = sdp->sd_vfs;
@@ -724,7 +702,7 @@ void gfs2_freeze_func(struct work_struct *work)
if (test_bit(SDF_FROZEN, &sdp->sd_flags))
goto freeze_failed;
- error = gfs2_freeze_locally(sdp);
+ error = freeze_super(sb, FREEZE_HOLDER_USERSPACE);
if (error)
goto freeze_failed;
@@ -759,12 +737,13 @@ static int gfs2_freeze_super(struct super_block *sb, enum freeze_holder who)
if (!mutex_trylock(&sdp->sd_freeze_mutex))
return -EBUSY;
- error = -EBUSY;
- if (test_bit(SDF_FROZEN, &sdp->sd_flags))
- goto out;
+ if (test_bit(SDF_FROZEN, &sdp->sd_flags)) {
+ mutex_unlock(&sdp->sd_freeze_mutex);
+ return -EBUSY;
+ }
for (;;) {
- error = gfs2_freeze_locally(sdp);
+ error = freeze_super(sb, FREEZE_HOLDER_USERSPACE);
if (error) {
fs_info(sdp, "GFS2: couldn't freeze filesystem: %d\n",
error);
@@ -772,8 +751,11 @@ static int gfs2_freeze_super(struct super_block *sb, enum freeze_holder who)
}
error = gfs2_lock_fs_check_clean(sdp);
- if (!error)
- break; /* success */
+ if (!error) {
+ set_bit(SDF_FREEZE_INITIATOR, &sdp->sd_flags);
+ set_bit(SDF_FROZEN, &sdp->sd_flags);
+ break;
+ }
error = gfs2_do_thaw(sdp);
if (error)
@@ -793,14 +775,23 @@ static int gfs2_freeze_super(struct super_block *sb, enum freeze_holder who)
}
out:
- if (!error) {
- set_bit(SDF_FREEZE_INITIATOR, &sdp->sd_flags);
- set_bit(SDF_FROZEN, &sdp->sd_flags);
- }
mutex_unlock(&sdp->sd_freeze_mutex);
return error;
}
+static int gfs2_freeze_fs(struct super_block *sb)
+{
+ struct gfs2_sbd *sdp = sb->s_fs_info;
+
+ if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
+ gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_FREEZE |
+ GFS2_LFC_FREEZE_GO_SYNC);
+ if (gfs2_withdrawing_or_withdrawn(sdp))
+ return -EIO;
+ }
+ return 0;
+}
+
/**
* gfs2_thaw_super - reallow writes to the filesystem
* @sb: the VFS structure for the filesystem
@@ -814,10 +805,12 @@ static int gfs2_thaw_super(struct super_block *sb, enum freeze_holder who)
if (!mutex_trylock(&sdp->sd_freeze_mutex))
return -EBUSY;
- error = -EINVAL;
- if (!test_bit(SDF_FREEZE_INITIATOR, &sdp->sd_flags))
- goto out;
+ if (!test_bit(SDF_FREEZE_INITIATOR, &sdp->sd_flags)) {
+ mutex_unlock(&sdp->sd_freeze_mutex);
+ return -EINVAL;
+ }
+ atomic_inc(&sb->s_active);
gfs2_freeze_unlock(&sdp->sd_freeze_gh);
error = gfs2_do_thaw(sdp);
@@ -826,8 +819,8 @@ static int gfs2_thaw_super(struct super_block *sb, enum freeze_holder who)
clear_bit(SDF_FREEZE_INITIATOR, &sdp->sd_flags);
clear_bit(SDF_FROZEN, &sdp->sd_flags);
}
-out:
mutex_unlock(&sdp->sd_freeze_mutex);
+ deactivate_super(sb);
return error;
}
@@ -1065,16 +1058,6 @@ static int gfs2_drop_inode(struct inode *inode)
return generic_drop_inode(inode);
}
-static int is_ancestor(const struct dentry *d1, const struct dentry *d2)
-{
- do {
- if (d1 == d2)
- return 1;
- d1 = d1->d_parent;
- } while (!IS_ROOT(d1));
- return 0;
-}
-
/**
* gfs2_show_options - Show mount options for /proc/mounts
* @s: seq_file structure
@@ -1096,7 +1079,7 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
statfs_slow = sdp->sd_tune.gt_statfs_slow;
spin_unlock(&sdp->sd_tune.gt_spin);
- if (is_ancestor(root, sdp->sd_master_dir))
+ if (is_subdir(root, sdp->sd_master_dir))
seq_puts(s, ",meta");
if (args->ar_lockproto[0])
seq_show_option(s, "lockproto", args->ar_lockproto);
@@ -1607,6 +1590,7 @@ const struct super_operations gfs2_super_ops = {
.put_super = gfs2_put_super,
.sync_fs = gfs2_sync_fs,
.freeze_super = gfs2_freeze_super,
+ .freeze_fs = gfs2_freeze_fs,
.thaw_super = gfs2_thaw_super,
.statfs = gfs2_statfs,
.drop_inode = gfs2_drop_inode,
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 60a0206890c5..250f340cb44d 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -193,7 +193,7 @@ static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
static ssize_t withdraw_show(struct gfs2_sbd *sdp, char *buf)
{
- unsigned int b = gfs2_withdrawn(sdp);
+ unsigned int b = gfs2_withdrawing_or_withdrawn(sdp);
return snprintf(buf, PAGE_SIZE, "%u\n", b);
}
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 7e835be7032d..192213c7359a 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -268,7 +268,7 @@ void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh)
(unsigned long long)bd->bd_bh->b_blocknr);
BUG();
}
- if (unlikely(gfs2_withdrawn(sdp))) {
+ if (gfs2_withdrawing_or_withdrawn(sdp)) {
fs_info(sdp, "GFS2:adding buf while withdrawn! 0x%llx\n",
(unsigned long long)bd->bd_bh->b_blocknr);
goto out_unlock;
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index da29fafb6272..f52141ce9485 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -372,7 +372,7 @@ void gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion,
const char *function, char *file, unsigned int line,
bool delayed)
{
- if (gfs2_withdrawn(sdp))
+ if (gfs2_withdrawing_or_withdrawn(sdp))
return;
fs_err(sdp,
@@ -548,7 +548,7 @@ void gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh,
const char *function, char *file, unsigned int line,
bool withdraw)
{
- if (gfs2_withdrawn(sdp))
+ if (gfs2_withdrawing_or_withdrawn(sdp))
return;
fs_err(sdp, "fatal: I/O error\n"
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
index 11c9d59b6889..ba071998461f 100644
--- a/fs/gfs2/util.h
+++ b/fs/gfs2/util.h
@@ -198,13 +198,14 @@ static inline void gfs2_withdraw_delayed(struct gfs2_sbd *sdp)
}
/**
- * gfs2_withdrawn - test whether the file system is withdrawing or withdrawn
+ * gfs2_withdrawing_or_withdrawn - test whether the file system is withdrawing
+ * or withdrawn
* @sdp: the superblock
*/
-static inline bool gfs2_withdrawn(struct gfs2_sbd *sdp)
+static inline bool gfs2_withdrawing_or_withdrawn(struct gfs2_sbd *sdp)
{
- return test_bit(SDF_WITHDRAWN, &sdp->sd_flags) ||
- test_bit(SDF_WITHDRAWING, &sdp->sd_flags);
+ return unlikely(test_bit(SDF_WITHDRAWN, &sdp->sd_flags) ||
+ test_bit(SDF_WITHDRAWING, &sdp->sd_flags));
}
/**
@@ -213,13 +214,13 @@ static inline bool gfs2_withdrawn(struct gfs2_sbd *sdp)
*/
static inline bool gfs2_withdrawing(struct gfs2_sbd *sdp)
{
- return test_bit(SDF_WITHDRAWING, &sdp->sd_flags) &&
- !test_bit(SDF_WITHDRAWN, &sdp->sd_flags);
+ return unlikely(test_bit(SDF_WITHDRAWING, &sdp->sd_flags) &&
+ !test_bit(SDF_WITHDRAWN, &sdp->sd_flags));
}
static inline bool gfs2_withdraw_in_prog(struct gfs2_sbd *sdp)
{
- return test_bit(SDF_WITHDRAW_IN_PROG, &sdp->sd_flags);
+ return unlikely(test_bit(SDF_WITHDRAW_IN_PROG, &sdp->sd_flags));
}
#define gfs2_tune_get(sdp, field) \
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index a7bc4690a780..8c34798a0715 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -29,11 +29,6 @@ static const struct inode_operations hfs_file_inode_operations;
#define HFS_VALID_MODE_BITS (S_IFREG | S_IFDIR | S_IRWXUGO)
-static int hfs_writepage(struct page *page, struct writeback_control *wbc)
-{
- return block_write_full_page(page, hfs_get_block, wbc);
-}
-
static int hfs_read_folio(struct file *file, struct folio *folio)
{
return block_read_full_folio(folio, hfs_get_block);
@@ -162,9 +157,10 @@ const struct address_space_operations hfs_btree_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
.read_folio = hfs_read_folio,
- .writepage = hfs_writepage,
+ .writepages = hfs_writepages,
.write_begin = hfs_write_begin,
.write_end = generic_write_end,
+ .migrate_folio = buffer_migrate_folio,
.bmap = hfs_bmap,
.release_folio = hfs_release_folio,
};
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 702a0663b1d8..3d326926c195 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -28,11 +28,6 @@ static int hfsplus_read_folio(struct file *file, struct folio *folio)
return block_read_full_folio(folio, hfsplus_get_block);
}
-static int hfsplus_writepage(struct page *page, struct writeback_control *wbc)
-{
- return block_write_full_page(page, hfsplus_get_block, wbc);
-}
-
static void hfsplus_write_failed(struct address_space *mapping, loff_t to)
{
struct inode *inode = mapping->host;
@@ -159,9 +154,10 @@ const struct address_space_operations hfsplus_btree_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
.read_folio = hfsplus_read_folio,
- .writepage = hfsplus_writepage,
+ .writepages = hfsplus_writepages,
.write_begin = hfsplus_write_begin,
.write_end = generic_write_end,
+ .migrate_folio = buffer_migrate_folio,
.bmap = hfsplus_bmap,
.release_folio = hfsplus_release_folio,
};
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index 0b791adf02e5..b0cb70400996 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -30,8 +30,7 @@ struct hfsplus_wd {
* @sector: block to read or write, for blocks of HFSPLUS_SECTOR_SIZE bytes
* @buf: buffer for I/O
* @data: output pointer for location of requested data
- * @op: direction of I/O
- * @op_flags: request op flags
+ * @opf: request op flags
*
* The unit of I/O is hfsplus_min_io_size(sb), which may be bigger than
* HFSPLUS_SECTOR_SIZE, and @buf must be sized accordingly. On reads
@@ -43,6 +42,8 @@ struct hfsplus_wd {
* that starts at the rounded-down address. As long as the data was
* read using hfsplus_submit_bio() and the same buffer is used things
* will work correctly.
+ *
+ * Returns: %0 on success else -errno code
*/
int hfsplus_submit_bio(struct super_block *sb, sector_t sector,
void *buf, void **data, blk_opf_t opf)
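
The rounding that description refers to is simple alignment arithmetic; a sketch under the assumption that HFSPLUS_SECTOR_SIZE is 512 bytes and io_size is the hfsplus_min_io_size(sb) value (a power of two):

#include <stdint.h>

/* Compute the io-unit-aligned byte address to submit and the offset
 * of the requested sector's data within the caller's buffer. */
static void round_for_io(uint64_t sector, unsigned int io_size,
			 uint64_t *aligned_byte, unsigned int *buf_off)
{
	uint64_t start = sector << 9;	/* 512-byte sectors */

	*buf_off = start & (io_size - 1);
	*aligned_byte = start - *buf_off;
}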
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index f757d4f7ad98..ea5b8e57d904 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -686,7 +686,7 @@ static void hugetlbfs_evict_inode(struct inode *inode)
* at inode creation time. If this is a device special inode,
* i_mapping may not point to the original address space.
*/
- resv_map = (struct resv_map *)(&inode->i_data)->private_data;
+ resv_map = (struct resv_map *)(&inode->i_data)->i_private_data;
/* Only regular and link inodes have associated reserve maps */
if (resv_map)
resv_map_release(&resv_map->refs);
@@ -1000,7 +1000,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
&hugetlbfs_i_mmap_rwsem_key);
inode->i_mapping->a_ops = &hugetlbfs_aops;
simple_inode_init_ts(inode);
- inode->i_mapping->private_data = resv_map;
+ inode->i_mapping->i_private_data = resv_map;
info->seals = F_SEAL_SEAL;
switch (mode & S_IFMT) {
default:
@@ -1129,8 +1129,8 @@ static int hugetlbfs_migrate_folio(struct address_space *mapping,
#define hugetlbfs_migrate_folio NULL
#endif
-static int hugetlbfs_error_remove_page(struct address_space *mapping,
- struct page *page)
+static int hugetlbfs_error_remove_folio(struct address_space *mapping,
+ struct folio *folio)
{
return 0;
}
@@ -1277,7 +1277,7 @@ static const struct address_space_operations hugetlbfs_aops = {
.write_end = hugetlbfs_write_end,
.dirty_folio = noop_dirty_folio,
.migrate_folio = hugetlbfs_migrate_folio,
- .error_remove_page = hugetlbfs_error_remove_page,
+ .error_remove_folio = hugetlbfs_error_remove_folio,
};
diff --git a/fs/inode.c b/fs/inode.c
index f238d987dec9..99d8754a74a3 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -209,7 +209,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
atomic_set(&mapping->nr_thps, 0);
#endif
mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
- mapping->private_data = NULL;
+ mapping->i_private_data = NULL;
mapping->writeback_index = 0;
init_rwsem(&mapping->invalidate_lock);
lockdep_set_class_and_name(&mapping->invalidate_lock,
@@ -398,8 +398,8 @@ static void __address_space_init_once(struct address_space *mapping)
{
xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT);
init_rwsem(&mapping->i_mmap_rwsem);
- INIT_LIST_HEAD(&mapping->private_list);
- spin_lock_init(&mapping->private_lock);
+ INIT_LIST_HEAD(&mapping->i_private_list);
+ spin_lock_init(&mapping->i_private_lock);
mapping->i_mmap = RB_ROOT_CACHED;
}
@@ -464,7 +464,7 @@ static void __inode_add_lru(struct inode *inode, bool rotate)
if (!mapping_shrinkable(&inode->i_data))
return;
- if (list_lru_add(&inode->i_sb->s_inode_lru, &inode->i_lru))
+ if (list_lru_add_obj(&inode->i_sb->s_inode_lru, &inode->i_lru))
this_cpu_inc(nr_unused);
else if (rotate)
inode->i_state |= I_REFERENCED;
@@ -482,7 +482,7 @@ void inode_add_lru(struct inode *inode)
static void inode_lru_list_del(struct inode *inode)
{
- if (list_lru_del(&inode->i_sb->s_inode_lru, &inode->i_lru))
+ if (list_lru_del_obj(&inode->i_sb->s_inode_lru, &inode->i_lru))
this_cpu_dec(nr_unused);
}
@@ -620,7 +620,7 @@ void clear_inode(struct inode *inode)
* nor even WARN_ON(!mapping_empty).
*/
xa_unlock_irq(&inode->i_data.i_pages);
- BUG_ON(!list_empty(&inode->i_data.private_list));
+ BUG_ON(!list_empty(&inode->i_data.i_private_list));
BUG_ON(!(inode->i_state & I_FREEING));
BUG_ON(inode->i_state & I_CLEAR);
BUG_ON(!list_empty(&inode->i_wb_list));
@@ -1836,37 +1836,37 @@ EXPORT_SYMBOL(bmap);
* earlier than or equal to either the ctime or mtime,
* or if at least a day has passed since the last atime update.
*/
-static int relatime_need_update(struct vfsmount *mnt, struct inode *inode,
+static bool relatime_need_update(struct vfsmount *mnt, struct inode *inode,
struct timespec64 now)
{
struct timespec64 atime, mtime, ctime;
if (!(mnt->mnt_flags & MNT_RELATIME))
- return 1;
+ return true;
/*
* Is mtime younger than or equal to atime? If yes, update atime:
*/
atime = inode_get_atime(inode);
mtime = inode_get_mtime(inode);
if (timespec64_compare(&mtime, &atime) >= 0)
- return 1;
+ return true;
/*
* Is ctime younger than or equal to atime? If yes, update atime:
*/
ctime = inode_get_ctime(inode);
if (timespec64_compare(&ctime, &atime) >= 0)
- return 1;
+ return true;
/*
* Is the previous atime value older than a day? If yes,
* update atime:
*/
if ((long)(now.tv_sec - atime.tv_sec) >= 24*60*60)
- return 1;
+ return true;
/*
* Good, we can skip the atime update:
*/
- return 0;
+ return false;
}
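
The relatime policy implemented above reads more plainly as a standalone predicate; a minimal sketch comparing whole seconds only (the kernel compares full timespec64 values):

#include <stdbool.h>
#include <time.h>

static bool need_atime_update(time_t atime, time_t mtime,
			      time_t inode_ctime, time_t now)
{
	if (mtime >= atime)			/* mtime caught up */
		return true;
	if (inode_ctime >= atime)		/* ctime caught up */
		return true;
	return now - atime >= 24 * 60 * 60;	/* a day stale */
}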
/**
@@ -2404,7 +2404,7 @@ EXPORT_SYMBOL(inode_init_owner);
* the vfsmount must be passed through @idmap. This function will then take
* care to map the inode according to @idmap before checking permissions.
* On non-idmapped mounts or if permission checking is to be performed on the
- * raw inode simply passs @nop_mnt_idmap.
+ * raw inode simply pass @nop_mnt_idmap.
*/
bool inode_owner_or_capable(struct mnt_idmap *idmap,
const struct inode *inode)
diff --git a/fs/internal.h b/fs/internal.h
index 58e43341aebf..bf2ee2e0d45d 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -83,6 +83,8 @@ int path_mount(const char *dev_name, struct path *path,
const char *type_page, unsigned long flags, void *data_page);
int path_umount(struct path *path, int flags);
+int show_path(struct seq_file *m, struct dentry *root);
+
/*
* fs_struct.c
*/
@@ -94,7 +96,6 @@ extern void chroot_fs_refs(const struct path *, const struct path *);
struct file *alloc_empty_file(int flags, const struct cred *cred);
struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred);
struct file *alloc_empty_backing_file(int flags, const struct cred *cred);
-void release_empty_file(struct file *f);
static inline void file_put_write_access(struct file *file)
{
@@ -180,7 +181,7 @@ extern struct file *do_file_open_root(const struct path *,
const char *, const struct open_flags *);
extern struct open_how build_open_how(int flags, umode_t mode);
extern int build_open_flags(const struct open_how *how, struct open_flags *op);
-extern struct file *__close_fd_get_file(unsigned int fd);
+struct file *file_close_fd_locked(struct files_struct *files, unsigned fd);
long do_sys_ftruncate(unsigned int fd, loff_t length, int small);
int chmod_common(const struct path *path, umode_t mode);
@@ -243,10 +244,10 @@ int do_statx(int dfd, struct filename *filename, unsigned int flags,
/*
* fs/splice.c:
*/
-long splice_file_to_pipe(struct file *in,
- struct pipe_inode_info *opipe,
- loff_t *offset,
- size_t len, unsigned int flags);
+ssize_t splice_file_to_pipe(struct file *in,
+ struct pipe_inode_info *opipe,
+ loff_t *offset,
+ size_t len, unsigned int flags);
/*
* fs/xattr.c:
diff --git a/fs/ioctl.c b/fs/ioctl.c
index f5fd99d6b0d4..76cf22ac97d7 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -920,8 +920,7 @@ COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd,
if (!f.file)
return -EBADF;
- /* RED-PEN how should LSM module know it's handling 32bit? */
- error = security_file_ioctl(f.file, cmd, arg);
+ error = security_file_ioctl_compat(f.file, cmd, arg);
if (error)
goto out;
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index f72df2babe56..093c4515b22a 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -305,28 +305,18 @@ static int iomap_read_inline_data(const struct iomap_iter *iter,
{
const struct iomap *iomap = iomap_iter_srcmap(iter);
size_t size = i_size_read(iter->inode) - iomap->offset;
- size_t poff = offset_in_page(iomap->offset);
size_t offset = offset_in_folio(folio, iomap->offset);
- void *addr;
if (folio_test_uptodate(folio))
return 0;
- if (WARN_ON_ONCE(size > PAGE_SIZE - poff))
- return -EIO;
- if (WARN_ON_ONCE(size > PAGE_SIZE -
- offset_in_page(iomap->inline_data)))
- return -EIO;
if (WARN_ON_ONCE(size > iomap->length))
return -EIO;
if (offset > 0)
ifs_alloc(iter->inode, folio, iter->flags);
- addr = kmap_local_folio(folio, offset);
- memcpy(addr, iomap->inline_data, size);
- memset(addr + size, 0, PAGE_SIZE - poff - size);
- kunmap_local(addr);
- iomap_set_range_uptodate(folio, offset, PAGE_SIZE - poff);
+ folio_fill_tail(folio, offset, iomap->inline_data, size);
+ iomap_set_range_uptodate(folio, offset, folio_size(folio) - offset);
return 0;
}
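
A userspace analogue of the folio_fill_tail() call above (assumed semantics: copy the source bytes, then zero everything to the end of the buffer):

#include <stddef.h>
#include <string.h>

static void fill_tail(char *buf, size_t bufsize, size_t offset,
		      const void *from, size_t len)
{
	if (len)
		memcpy(buf + offset, from, len);	/* the data */
	memset(buf + offset + len, 0,
	       bufsize - offset - len);			/* zeroed tail */
}

This replaces the previous open-coded memcpy()/memset() pair and, unlike the PAGE_SIZE-based bounds checks it removes, also works for folios larger than a page.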
diff --git a/fs/jffs2/debug.c b/fs/jffs2/debug.c
index 9d26b1b9fc01..0925caab23c4 100644
--- a/fs/jffs2/debug.c
+++ b/fs/jffs2/debug.c
@@ -157,7 +157,7 @@ __jffs2_dbg_prewrite_paranoia_check(struct jffs2_sb_info *c,
kfree(buf);
}
-void __jffs2_dbg_superblock_counts(struct jffs2_sb_info *c)
+static void __jffs2_dbg_superblock_counts(struct jffs2_sb_info *c)
{
struct jffs2_eraseblock *jeb;
uint32_t free = 0, dirty = 0, used = 0, wasted = 0,
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index 11c77757ead9..8eec84c651bf 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -63,10 +63,10 @@
*/
static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
int nblocks);
-static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval);
-static int dbBackSplit(dmtree_t * tp, int leafno);
-static int dbJoin(dmtree_t * tp, int leafno, int newval);
-static void dbAdjTree(dmtree_t * tp, int leafno, int newval);
+static void dbSplit(dmtree_t *tp, int leafno, int splitsz, int newval, bool is_ctl);
+static int dbBackSplit(dmtree_t *tp, int leafno, bool is_ctl);
+static int dbJoin(dmtree_t *tp, int leafno, int newval, bool is_ctl);
+static void dbAdjTree(dmtree_t *tp, int leafno, int newval, bool is_ctl);
static int dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc,
int level);
static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results);
@@ -2103,7 +2103,7 @@ static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno,
* system.
*/
if (dp->tree.stree[word] == NOFREE)
- dbBackSplit((dmtree_t *) & dp->tree, word);
+ dbBackSplit((dmtree_t *)&dp->tree, word, false);
dbAllocBits(bmp, dp, blkno, nblocks);
}
@@ -2189,7 +2189,7 @@ static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
* the binary system of the leaves if need be.
*/
dbSplit(tp, word, BUDMIN,
- dbMaxBud((u8 *) & dp->wmap[word]));
+ dbMaxBud((u8 *)&dp->wmap[word]), false);
word += 1;
} else {
@@ -2229,7 +2229,7 @@ static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
* system of the leaves to reflect the current
* allocation (size).
*/
- dbSplit(tp, word, size, NOFREE);
+ dbSplit(tp, word, size, NOFREE, false);
/* get the number of dmap words handled */
nw = BUDSIZE(size, BUDMIN);
@@ -2336,7 +2336,7 @@ static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
/* update the leaf for this dmap word.
*/
rc = dbJoin(tp, word,
- dbMaxBud((u8 *) & dp->wmap[word]));
+ dbMaxBud((u8 *)&dp->wmap[word]), false);
if (rc)
return rc;
@@ -2369,7 +2369,7 @@ static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
/* update the leaf.
*/
- rc = dbJoin(tp, word, size);
+ rc = dbJoin(tp, word, size, false);
if (rc)
return rc;
@@ -2521,16 +2521,16 @@ dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level)
* that it is at the front of a binary buddy system.
*/
if (oldval == NOFREE) {
- rc = dbBackSplit((dmtree_t *) dcp, leafno);
+ rc = dbBackSplit((dmtree_t *)dcp, leafno, true);
if (rc) {
release_metapage(mp);
return rc;
}
oldval = dcp->stree[ti];
}
- dbSplit((dmtree_t *) dcp, leafno, dcp->budmin, newval);
+ dbSplit((dmtree_t *) dcp, leafno, dcp->budmin, newval, true);
} else {
- rc = dbJoin((dmtree_t *) dcp, leafno, newval);
+ rc = dbJoin((dmtree_t *) dcp, leafno, newval, true);
if (rc) {
release_metapage(mp);
return rc;
@@ -2561,7 +2561,7 @@ dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level)
*/
if (alloc) {
dbJoin((dmtree_t *) dcp, leafno,
- oldval);
+ oldval, true);
} else {
/* the dbJoin() above might have
* caused a larger binary buddy system
@@ -2571,9 +2571,9 @@ dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level)
*/
if (dcp->stree[ti] == NOFREE)
dbBackSplit((dmtree_t *)
- dcp, leafno);
+ dcp, leafno, true);
dbSplit((dmtree_t *) dcp, leafno,
- dcp->budmin, oldval);
+ dcp->budmin, oldval, true);
}
/* release the buffer and return the error.
@@ -2621,7 +2621,7 @@ dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level)
*
* serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
*/
-static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval)
+static void dbSplit(dmtree_t *tp, int leafno, int splitsz, int newval, bool is_ctl)
{
int budsz;
int cursz;
@@ -2643,7 +2643,7 @@ static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval)
while (cursz >= splitsz) {
/* update the buddy's leaf with its new value.
*/
- dbAdjTree(tp, leafno ^ budsz, cursz);
+ dbAdjTree(tp, leafno ^ budsz, cursz, is_ctl);
/* on to the next size and buddy.
*/
@@ -2655,7 +2655,7 @@ static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval)
/* adjust the dmap tree to reflect the specified leaf's new
* value.
*/
- dbAdjTree(tp, leafno, newval);
+ dbAdjTree(tp, leafno, newval, is_ctl);
}
@@ -2686,7 +2686,7 @@ static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval)
*
* serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
*/
-static int dbBackSplit(dmtree_t * tp, int leafno)
+static int dbBackSplit(dmtree_t *tp, int leafno, bool is_ctl)
{
int budsz, bud, w, bsz, size;
int cursz;
@@ -2737,7 +2737,7 @@ static int dbBackSplit(dmtree_t * tp, int leafno)
* system in two.
*/
cursz = leaf[bud] - 1;
- dbSplit(tp, bud, cursz, cursz);
+ dbSplit(tp, bud, cursz, cursz, is_ctl);
break;
}
}
@@ -2763,9 +2763,11 @@ static int dbBackSplit(dmtree_t * tp, int leafno)
* leafno - the number of the leaf to be updated.
* newval - the new value for the leaf.
*
- * RETURN VALUES: none
+ * RETURN VALUES:
+ * 0 - success
+ * -EIO - I/O error
*/
-static int dbJoin(dmtree_t * tp, int leafno, int newval)
+static int dbJoin(dmtree_t *tp, int leafno, int newval, bool is_ctl)
{
int budsz, buddy;
s8 *leaf;
@@ -2790,6 +2792,10 @@ static int dbJoin(dmtree_t * tp, int leafno, int newval)
* get the buddy size (number of words covered) of
* the new value.
*/
+
+ if ((newval - tp->dmt_budmin) > BUDMIN)
+ return -EIO;
+
budsz = BUDSIZE(newval, tp->dmt_budmin);
/* try to join.
@@ -2820,12 +2826,12 @@ static int dbJoin(dmtree_t * tp, int leafno, int newval)
if (leafno < buddy) {
/* leafno is the left buddy.
*/
- dbAdjTree(tp, buddy, NOFREE);
+ dbAdjTree(tp, buddy, NOFREE, is_ctl);
} else {
/* buddy is the left buddy and becomes
* leafno.
*/
- dbAdjTree(tp, leafno, NOFREE);
+ dbAdjTree(tp, leafno, NOFREE, is_ctl);
leafno = buddy;
}
@@ -2838,7 +2844,7 @@ static int dbJoin(dmtree_t * tp, int leafno, int newval)
/* update the leaf value.
*/
- dbAdjTree(tp, leafno, newval);
+ dbAdjTree(tp, leafno, newval, is_ctl);
return 0;
}
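
The buddy pairing dbJoin() walks is worth seeing in isolation; a toy sketch (indices chosen arbitrarily):

#include <stdio.h>

int main(void)
{
	/* A leaf's buddy at size 'budsz' is its index with the size
	 * bit flipped; the lower-numbered (left) buddy survives a
	 * join, exactly as in dbJoin() above. */
	for (int budsz = 1; budsz <= 4; budsz <<= 1) {
		int leafno = 5 & ~(budsz - 1);	/* align to buddy size */
		printf("size %d: leaf %d <-> buddy %d\n",
		       budsz, leafno, leafno ^ budsz);
	}
	return 0;
}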
@@ -2859,15 +2865,20 @@ static int dbJoin(dmtree_t * tp, int leafno, int newval)
*
* RETURN VALUES: none
*/
-static void dbAdjTree(dmtree_t * tp, int leafno, int newval)
+static void dbAdjTree(dmtree_t *tp, int leafno, int newval, bool is_ctl)
{
int lp, pp, k;
- int max;
+ int max, size;
+
+ size = is_ctl ? CTLTREESIZE : TREESIZE;
/* pick up the index of the leaf for this leafno.
*/
lp = leafno + le32_to_cpu(tp->dmt_leafidx);
+ if (WARN_ON_ONCE(lp >= size || lp < 0))
+ return;
+
/* is the current value the same as the old value ? if so,
* there is nothing to do.
*/
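
The dmap hunks above thread a new is_ctl flag down through dbSplit()/dbBackSplit()/dbJoin() so that dbAdjTree() can pick the right tree size (CTLTREESIZE for control pages, TREESIZE for regular dmaps) and refuse a leaf index that corrupted on-disk metadata would otherwise use to write out of bounds. A minimal standalone sketch of that guard; the sizes here are illustrative stand-ins, not the real constants from jfs_dmap.h:

    #include <stdbool.h>
    #include <stdio.h>

    #define TREESIZE     100        /* illustrative, not the on-disk value */
    #define CTLTREESIZE  400        /* illustrative, not the on-disk value */

    /* model of the bounds check dbAdjTree() gains above */
    static bool leaf_index_ok(int leafno, int leafidx, bool is_ctl)
    {
            int size = is_ctl ? CTLTREESIZE : TREESIZE;
            int lp = leafno + leafidx;      /* index into the stree array */

            return lp >= 0 && lp < size;
    }

    int main(void)
    {
            printf("%d\n", leaf_index_ok(10, 380, false)); /* 0: rejected */
            printf("%d\n", leaf_index_ok(10, 380, true));  /* 1: fits ctl tree */
            return 0;
    }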
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index 92b7c533407c..031d8f570f58 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -633,6 +633,11 @@ int dtSearch(struct inode *ip, struct component_name * key, ino_t * data,
for (base = 0, lim = p->header.nextindex; lim; lim >>= 1) {
index = base + (lim >> 1);
+ if (stbl[index] < 0) {
+ rc = -EIO;
+ goto out;
+ }
+
if (p->header.flag & BT_LEAF) {
/* uppercase leaf name to compare */
cmp =
@@ -1970,7 +1975,7 @@ static int dtSplitRoot(tid_t tid,
do {
f = &rp->slot[fsi];
fsi = f->next;
- } while (fsi != -1);
+ } while (fsi >= 0);
f->next = n;
}
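
Both jfs_dtree hunks distrust values read from disk: dtSearch() now rejects a negative sort-table entry before using it as a slot index, and dtSplitRoot() treats any negative next index as end-of-list instead of looping until it hits exactly -1. A simplified, runnable sketch of the dtSearch() pattern, with names and layout condensed and -5/-2 standing in for -EIO/-ENOENT:

    #include <stdio.h>

    /* binary search through a sorted index table (stbl) whose entries
     * come from disk and must be validated before use */
    static int lookup(const signed char *stbl, int nslots, int n,
                      const int *slot_keys, int want)
    {
            int base = 0, lim, index, cmp;

            for (lim = n; lim; lim >>= 1) {
                    index = base + (lim >> 1);
                    if (stbl[index] < 0 || stbl[index] >= nslots)
                            return -5;      /* corrupted index: -EIO */
                    cmp = want - slot_keys[stbl[index]];
                    if (cmp == 0)
                            return stbl[index];
                    if (cmp > 0) {
                            base = index + 1;
                            lim--;
                    }
            }
            return -2;                      /* not found: -ENOENT */
    }

    int main(void)
    {
            signed char stbl[] = { 0, 2, 1 };       /* key order -> slots */
            int keys[] = { 10, 30, 20 };

            printf("%d\n", lookup(stbl, 3, 3, keys, 20));   /* slot 2 */
            printf("%d\n", lookup(stbl, 3, 3, keys, 25));   /* -2 */
            return 0;
    }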
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index a037ee59e398..2ec35889ad24 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -2179,6 +2179,9 @@ static int diNewExt(struct inomap * imap, struct iag * iagp, int extno)
/* get the ag and iag numbers for this iag.
*/
agno = BLKTOAG(le64_to_cpu(iagp->agstart), sbi);
+ if (agno >= MAXAG || agno < 0)
+ return -EIO;
+
iagno = le32_to_cpu(iagp->iagnum);
/* check if this is the last free extent within the
diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c
index 415eb65a36ff..9b5c6a20b30c 100644
--- a/fs/jfs/jfs_mount.c
+++ b/fs/jfs/jfs_mount.c
@@ -172,15 +172,15 @@ int jfs_mount(struct super_block *sb)
}
jfs_info("jfs_mount: ipimap:0x%p", ipimap);
- /* map further access of per fileset inodes by the fileset inode */
- sbi->ipimap = ipimap;
-
/* initialize fileset inode allocation map */
if ((rc = diMount(ipimap))) {
jfs_err("jfs_mount: diMount failed w/rc = %d", rc);
goto err_ipimap;
}
+ /* map further access of per fileset inodes by the fileset inode */
+ sbi->ipimap = ipimap;
+
return rc;
/*
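
The jfs_mount() hunk is an ordering fix: sbi->ipimap is published only after diMount() succeeds, so a failed mount can no longer leave a pointer to a half-initialized inode map behind for later code to trip over. A small self-contained sketch of the init-then-publish rule, with types and names invented for illustration:

    #include <errno.h>
    #include <stdlib.h>

    struct imap { int ready; };
    struct sb_info { struct imap *ipimap; };

    static int init_imap(struct imap *m) { m->ready = 1; return 0; }

    static int mount_imap(struct sb_info *sbi)
    {
            struct imap *ipimap = calloc(1, sizeof(*ipimap));
            int rc;

            if (!ipimap)
                    return -ENOMEM;
            rc = init_imap(ipimap);         /* may fail on real metadata */
            if (rc) {
                    free(ipimap);
                    return rc;
            }
            sbi->ipimap = ipimap;           /* publish only when fully valid */
            return 0;
    }

    int main(void)
    {
            struct sb_info sbi = { 0 };
            return mount_imap(&sbi);
    }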
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index dccc8b3f1045..be17e3c43582 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -2702,6 +2702,7 @@ int jfs_lazycommit(void *arg)
unsigned long flags;
struct jfs_sb_info *sbi;
+ set_freezable();
do {
LAZY_LOCK(flags);
jfs_commit_thread_waking = 0; /* OK to wake another thread */
@@ -2884,6 +2885,7 @@ int jfs_sync(void *arg)
struct jfs_inode_info *jfs_ip;
tid_t tid;
+ set_freezable();
do {
/*
* write each inode on the anonymous inode list
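
Kernel threads are not freezable by default, so before these two lines jfs_lazycommit and jfs_sync could keep running, and keep the disk busy, while the rest of the system suspended. The shape of a freezer-aware kthread, sketched from the kernel's kthread/freezer API; my_wq, have_work() and do_work() are placeholders and this is kernel code, not runnable standalone:

    #include <linux/kthread.h>
    #include <linux/freezer.h>
    #include <linux/wait.h>

    static DECLARE_WAIT_QUEUE_HEAD(my_wq);
    static bool have_work(void);    /* placeholder */
    static void do_work(void);      /* placeholder */

    static int my_daemon(void *arg)
    {
            set_freezable();        /* opt in to the system freezer */

            while (!kthread_should_stop()) {
                    /* sleeps must be freezable, or suspend stalls on us */
                    wait_event_freezable(my_wq,
                                         have_work() || kthread_should_stop());
                    if (have_work())
                            do_work();
            }
            return 0;
    }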
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index f8af6c3ae336..73f37f298087 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -17,6 +17,7 @@
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/highuid.h>
+#include <linux/mpage.h>
#include <linux/vfs.h>
#include <linux/writeback.h>
@@ -397,9 +398,10 @@ static int minix_get_block(struct inode *inode, sector_t block,
return V2_minix_get_block(inode, block, bh_result, create);
}
-static int minix_writepage(struct page *page, struct writeback_control *wbc)
+static int minix_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
{
- return block_write_full_page(page, minix_get_block, wbc);
+ return mpage_writepages(mapping, wbc, minix_get_block);
}
static int minix_read_folio(struct file *file, struct folio *folio)
@@ -444,9 +446,10 @@ static const struct address_space_operations minix_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
.read_folio = minix_read_folio,
- .writepage = minix_writepage,
+ .writepages = minix_writepages,
.write_begin = minix_write_begin,
.write_end = generic_write_end,
+ .migrate_folio = buffer_migrate_folio,
.bmap = minix_bmap,
.direct_IO = noop_direct_IO
};
diff --git a/fs/mnt_idmapping.c b/fs/mnt_idmapping.c
index 57d1dedf3f8f..64c5205e2b5e 100644
--- a/fs/mnt_idmapping.c
+++ b/fs/mnt_idmapping.c
@@ -9,8 +9,16 @@
#include "internal.h"
+/*
+ * Outside of this file vfs{g,u}id_t are always created from k{g,u}id_t,
+ * never from raw values. These are just internal helpers.
+ */
+#define VFSUIDT_INIT_RAW(val) (vfsuid_t){ val }
+#define VFSGIDT_INIT_RAW(val) (vfsgid_t){ val }
+
struct mnt_idmap {
- struct user_namespace *owner;
+ struct uid_gid_map uid_map;
+ struct uid_gid_map gid_map;
refcount_t count;
};
@@ -20,25 +28,11 @@ struct mnt_idmap {
* mapped to {g,u}id 1, [...], {g,u}id 1000 to {g,u}id 1000, [...].
*/
struct mnt_idmap nop_mnt_idmap = {
- .owner = &init_user_ns,
.count = REFCOUNT_INIT(1),
};
EXPORT_SYMBOL_GPL(nop_mnt_idmap);
/**
- * check_fsmapping - check whether an mount idmapping is allowed
- * @idmap: idmap of the relevent mount
- * @sb: super block of the filesystem
- *
- * Return: true if @idmap is allowed, false if not.
- */
-bool check_fsmapping(const struct mnt_idmap *idmap,
- const struct super_block *sb)
-{
- return idmap->owner != sb->s_user_ns;
-}
-
-/**
* initial_idmapping - check whether this is the initial mapping
* @ns: idmapping to check
*
@@ -53,26 +47,6 @@ static inline bool initial_idmapping(const struct user_namespace *ns)
}
/**
- * no_idmapping - check whether we can skip remapping a kuid/gid
- * @mnt_userns: the mount's idmapping
- * @fs_userns: the filesystem's idmapping
- *
- * This function can be used to check whether a remapping between two
- * idmappings is required.
- * An idmapped mount is a mount that has an idmapping attached to it that
- * is different from the filsystem's idmapping and the initial idmapping.
- * If the initial mapping is used or the idmapping of the mount and the
- * filesystem are identical no remapping is required.
- *
- * Return: true if remapping can be skipped, false if not.
- */
-static inline bool no_idmapping(const struct user_namespace *mnt_userns,
- const struct user_namespace *fs_userns)
-{
- return initial_idmapping(mnt_userns) || mnt_userns == fs_userns;
-}
-
-/**
* make_vfsuid - map a filesystem kuid according to an idmapping
* @idmap: the mount's idmapping
* @fs_userns: the filesystem's idmapping
@@ -81,8 +55,8 @@ static inline bool no_idmapping(const struct user_namespace *mnt_userns,
* Take a @kuid and remap it from @fs_userns into @idmap. Use this
* function when preparing a @kuid to be reported to userspace.
*
- * If no_idmapping() determines that this is not an idmapped mount we can
- * simply return @kuid unchanged.
+ * If @idmap is @nop_mnt_idmap this is not an idmapped mount and we
+ * can simply return @kuid unchanged.
* If initial_idmapping() tells us that the filesystem is not mounted with an
* idmapping we know the value of @kuid won't change when calling
* from_kuid() so we can simply retrieve the value via __kuid_val()
@@ -94,13 +68,12 @@ static inline bool no_idmapping(const struct user_namespace *mnt_userns,
*/
vfsuid_t make_vfsuid(struct mnt_idmap *idmap,
- struct user_namespace *fs_userns,
- kuid_t kuid)
+ struct user_namespace *fs_userns,
+ kuid_t kuid)
{
uid_t uid;
- struct user_namespace *mnt_userns = idmap->owner;
- if (no_idmapping(mnt_userns, fs_userns))
+ if (idmap == &nop_mnt_idmap)
return VFSUIDT_INIT(kuid);
if (initial_idmapping(fs_userns))
uid = __kuid_val(kuid);
@@ -108,7 +81,7 @@ vfsuid_t make_vfsuid(struct mnt_idmap *idmap,
uid = from_kuid(fs_userns, kuid);
if (uid == (uid_t)-1)
return INVALID_VFSUID;
- return VFSUIDT_INIT(make_kuid(mnt_userns, uid));
+ return VFSUIDT_INIT_RAW(map_id_down(&idmap->uid_map, uid));
}
EXPORT_SYMBOL_GPL(make_vfsuid);
@@ -121,8 +94,8 @@ EXPORT_SYMBOL_GPL(make_vfsuid);
* Take a @kgid and remap it from @fs_userns into @idmap. Use this
* function when preparing a @kgid to be reported to userspace.
*
- * If no_idmapping() determines that this is not an idmapped mount we can
- * simply return @kgid unchanged.
+ * If @idmap is @nop_mnt_idmap this is not an idmapped mount and we
+ * can simply return @kgid unchanged.
* If initial_idmapping() tells us that the filesystem is not mounted with an
* idmapping we know the value of @kgid won't change when calling
* from_kgid() so we can simply retrieve the value via __kgid_val()
@@ -136,9 +109,8 @@ vfsgid_t make_vfsgid(struct mnt_idmap *idmap,
struct user_namespace *fs_userns, kgid_t kgid)
{
gid_t gid;
- struct user_namespace *mnt_userns = idmap->owner;
- if (no_idmapping(mnt_userns, fs_userns))
+ if (idmap == &nop_mnt_idmap)
return VFSGIDT_INIT(kgid);
if (initial_idmapping(fs_userns))
gid = __kgid_val(kgid);
@@ -146,7 +118,7 @@ vfsgid_t make_vfsgid(struct mnt_idmap *idmap,
gid = from_kgid(fs_userns, kgid);
if (gid == (gid_t)-1)
return INVALID_VFSGID;
- return VFSGIDT_INIT(make_kgid(mnt_userns, gid));
+ return VFSGIDT_INIT_RAW(map_id_down(&idmap->gid_map, gid));
}
EXPORT_SYMBOL_GPL(make_vfsgid);
@@ -165,11 +137,10 @@ kuid_t from_vfsuid(struct mnt_idmap *idmap,
struct user_namespace *fs_userns, vfsuid_t vfsuid)
{
uid_t uid;
- struct user_namespace *mnt_userns = idmap->owner;
- if (no_idmapping(mnt_userns, fs_userns))
+ if (idmap == &nop_mnt_idmap)
return AS_KUIDT(vfsuid);
- uid = from_kuid(mnt_userns, AS_KUIDT(vfsuid));
+ uid = map_id_up(&idmap->uid_map, __vfsuid_val(vfsuid));
if (uid == (uid_t)-1)
return INVALID_UID;
if (initial_idmapping(fs_userns))
@@ -193,11 +164,10 @@ kgid_t from_vfsgid(struct mnt_idmap *idmap,
struct user_namespace *fs_userns, vfsgid_t vfsgid)
{
gid_t gid;
- struct user_namespace *mnt_userns = idmap->owner;
- if (no_idmapping(mnt_userns, fs_userns))
+ if (idmap == &nop_mnt_idmap)
return AS_KGIDT(vfsgid);
- gid = from_kgid(mnt_userns, AS_KGIDT(vfsgid));
+ gid = map_id_up(&idmap->gid_map, __vfsgid_val(vfsgid));
if (gid == (gid_t)-1)
return INVALID_GID;
if (initial_idmapping(fs_userns))
@@ -228,16 +198,91 @@ int vfsgid_in_group_p(vfsgid_t vfsgid)
#endif
EXPORT_SYMBOL_GPL(vfsgid_in_group_p);
+static int copy_mnt_idmap(struct uid_gid_map *map_from,
+ struct uid_gid_map *map_to)
+{
+ struct uid_gid_extent *forward, *reverse;
+ u32 nr_extents = READ_ONCE(map_from->nr_extents);
+ /* Pairs with smp_wmb() when writing the idmapping. */
+ smp_rmb();
+
+ /*
+ * Don't copy anything if no idmapping has been written yet
+ * (nr_extents == 0). Since we read @nr_extents locklessly, a
+ * concurrent writer could leave us with inconsistent data, so
+ * just don't do anything at all in that case.
+ */
+ if (nr_extents == 0)
+ return 0;
+
+ /*
+ * Here we know that nr_extents is greater than zero which means
+ * a map has been written. Since idmappings can't be changed
+ * once they have been written we know that we can safely copy
+ * from @map_from into @map_to.
+ */
+
+ if (nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) {
+ *map_to = *map_from;
+ return 0;
+ }
+
+ forward = kmemdup(map_from->forward,
+ nr_extents * sizeof(struct uid_gid_extent),
+ GFP_KERNEL_ACCOUNT);
+ if (!forward)
+ return -ENOMEM;
+
+ reverse = kmemdup(map_from->reverse,
+ nr_extents * sizeof(struct uid_gid_extent),
+ GFP_KERNEL_ACCOUNT);
+ if (!reverse) {
+ kfree(forward);
+ return -ENOMEM;
+ }
+
+ /*
+ * The idmapping isn't exposed anywhere so we don't need to care
+ * about ordering between extent pointers and @nr_extents
+ * initialization.
+ */
+ map_to->forward = forward;
+ map_to->reverse = reverse;
+ map_to->nr_extents = nr_extents;
+ return 0;
+}
+
+static void free_mnt_idmap(struct mnt_idmap *idmap)
+{
+ if (idmap->uid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
+ kfree(idmap->uid_map.forward);
+ kfree(idmap->uid_map.reverse);
+ }
+ if (idmap->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
+ kfree(idmap->gid_map.forward);
+ kfree(idmap->gid_map.reverse);
+ }
+ kfree(idmap);
+}
+
struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns)
{
struct mnt_idmap *idmap;
+ int ret;
idmap = kzalloc(sizeof(struct mnt_idmap), GFP_KERNEL_ACCOUNT);
if (!idmap)
return ERR_PTR(-ENOMEM);
- idmap->owner = get_user_ns(mnt_userns);
refcount_set(&idmap->count, 1);
+ ret = copy_mnt_idmap(&mnt_userns->uid_map, &idmap->uid_map);
+ if (!ret)
+ ret = copy_mnt_idmap(&mnt_userns->gid_map, &idmap->gid_map);
+ if (ret) {
+ free_mnt_idmap(idmap);
+ idmap = ERR_PTR(ret);
+ }
return idmap;
}
@@ -267,9 +312,7 @@ EXPORT_SYMBOL_GPL(mnt_idmap_get);
*/
void mnt_idmap_put(struct mnt_idmap *idmap)
{
- if (idmap != &nop_mnt_idmap && refcount_dec_and_test(&idmap->count)) {
- put_user_ns(idmap->owner);
- kfree(idmap);
- }
+ if (idmap != &nop_mnt_idmap && refcount_dec_and_test(&idmap->count))
+ free_mnt_idmap(idmap);
}
EXPORT_SYMBOL_GPL(mnt_idmap_put);
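
The net effect of the mnt_idmapping.c rework: an idmapped mount no longer pins its owning user namespace; it carries private copies of the uid and gid extent tables and resolves IDs directly with map_id_down()/map_id_up(). A simplified userspace model of the forward (down) lookup over such an extent table; the kernel additionally keeps a reverse table and binary-searches large maps:

    #include <stdint.h>
    #include <stdio.h>

    struct extent { uint32_t first, lower_first, count; };

    static uint32_t map_id_down(const struct extent *e, unsigned int n,
                                uint32_t id)
    {
            for (unsigned int i = 0; i < n; i++)
                    if (id >= e[i].first && id - e[i].first < e[i].count)
                            return e[i].lower_first + (id - e[i].first);
            return (uint32_t)-1;    /* unmapped */
    }

    int main(void)
    {
            struct extent map[] = { { 0, 100000, 65536 } };

            printf("%u\n", map_id_down(map, 1, 1000));  /* 101000 */
            printf("%u\n", map_id_down(map, 1, 70000)); /* 4294967295: unmapped */
            return 0;
    }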
diff --git a/fs/mount.h b/fs/mount.h
index 130c07c2f8d2..4a42fc68f4cc 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -8,19 +8,13 @@
struct mnt_namespace {
struct ns_common ns;
struct mount * root;
- /*
- * Traversal and modification of .list is protected by either
- * - taking namespace_sem for write, OR
- * - taking namespace_sem for read AND taking .ns_lock.
- */
- struct list_head list;
- spinlock_t ns_lock;
+ struct rb_root mounts; /* Protected by namespace_sem */
struct user_namespace *user_ns;
struct ucounts *ucounts;
u64 seq; /* Sequence number to prevent loops */
wait_queue_head_t poll;
u64 event;
- unsigned int mounts; /* # of mounts in the namespace */
+ unsigned int nr_mounts; /* # of mounts in the namespace */
unsigned int pending_mounts;
} __randomize_layout;
@@ -55,7 +49,10 @@ struct mount {
struct list_head mnt_child; /* and going through their mnt_child */
struct list_head mnt_instance; /* mount instance on sb->s_mounts */
const char *mnt_devname; /* Name of device e.g. /dev/dsk/hda1 */
- struct list_head mnt_list;
+ union {
+ struct rb_node mnt_node; /* Under ns->mounts */
+ struct list_head mnt_list;
+ };
struct list_head mnt_expire; /* link in fs-specific expiry list */
struct list_head mnt_share; /* circular list of shared mounts */
struct list_head mnt_slave_list;/* list of slave mounts */
@@ -72,7 +69,8 @@ struct mount {
struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks;
__u32 mnt_fsnotify_mask;
#endif
- int mnt_id; /* mount identifier */
+ int mnt_id; /* mount identifier, reused */
+ u64 mnt_id_unique; /* mount ID unique until reboot */
int mnt_group_id; /* peer group identifier */
int mnt_expiry_mark; /* true if marked for expiry */
struct hlist_head mnt_pins;
@@ -127,7 +125,6 @@ struct proc_mounts {
struct mnt_namespace *ns;
struct path root;
int (*show)(struct seq_file *, struct vfsmount *);
- struct mount cursor;
};
extern const struct seq_operations mounts_op;
@@ -146,4 +143,12 @@ static inline bool is_anon_ns(struct mnt_namespace *ns)
return ns->seq == 0;
}
+static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list)
+{
+ WARN_ON(!(mnt->mnt.mnt_flags & MNT_ONRB));
+ mnt->mnt.mnt_flags &= ~MNT_ONRB;
+ rb_erase(&mnt->mnt_node, &mnt->mnt_ns->mounts);
+ list_add_tail(&mnt->mnt_list, dt_list);
+}
+
extern void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor);
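
Because a mount is either attached to a namespace (on the rbtree) or sitting on a temporary umount/move list, the two link types can share storage; the MNT_ONRB flag records which union member is live, and move_from_ns() above is the sanctioned rbtree-to-list transition. A standalone sketch of that tagged-union discipline, with stand-in node types in place of the kernel's rb_node/list_head:

    struct rb_node_s { struct rb_node_s *left, *right; };     /* stand-in */
    struct list_head_s { struct list_head_s *next, *prev; };  /* stand-in */

    #define ITEM_ON_TREE 0x1

    struct item {
            unsigned int flags;
            union {
                    struct rb_node_s tree_node;     /* live if ITEM_ON_TREE */
                    struct list_head_s list_node;   /* live otherwise */
            };
    };

    /* every transition flips the flag together with the storage swap,
     * mirroring what move_from_ns() does above */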
diff --git a/fs/mpage.c b/fs/mpage.c
index ffb064ed9d04..738882e0766d 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -166,7 +166,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
sector_t block_in_file;
sector_t last_block;
sector_t last_block_in_file;
- sector_t blocks[MAX_BUF_PER_PAGE];
+ sector_t first_block;
unsigned page_block;
unsigned first_hole = blocks_per_page;
struct block_device *bdev = NULL;
@@ -205,6 +205,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
unsigned map_offset = block_in_file - args->first_logical_block;
unsigned last = nblocks - map_offset;
+ first_block = map_bh->b_blocknr + map_offset;
for (relative_block = 0; ; relative_block++) {
if (relative_block == last) {
clear_buffer_mapped(map_bh);
@@ -212,8 +213,6 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
}
if (page_block == blocks_per_page)
break;
- blocks[page_block] = map_bh->b_blocknr + map_offset +
- relative_block;
page_block++;
block_in_file++;
}
@@ -259,7 +258,9 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
goto confused; /* hole -> non-hole */
/* Contiguous blocks? */
- if (page_block && blocks[page_block-1] != map_bh->b_blocknr-1)
+ if (!page_block)
+ first_block = map_bh->b_blocknr;
+ else if (first_block + page_block != map_bh->b_blocknr)
goto confused;
nblocks = map_bh->b_size >> blkbits;
for (relative_block = 0; ; relative_block++) {
@@ -268,7 +269,6 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
break;
} else if (page_block == blocks_per_page)
break;
- blocks[page_block] = map_bh->b_blocknr+relative_block;
page_block++;
block_in_file++;
}
@@ -289,7 +289,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
/*
* This folio will go to BIO. Do we need to send this BIO off first?
*/
- if (args->bio && (args->last_block_in_bio != blocks[0] - 1))
+ if (args->bio && (args->last_block_in_bio != first_block - 1))
args->bio = mpage_bio_submit_read(args->bio);
alloc_new:
@@ -298,7 +298,7 @@ alloc_new:
gfp);
if (args->bio == NULL)
goto confused;
- args->bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9);
+ args->bio->bi_iter.bi_sector = first_block << (blkbits - 9);
}
length = first_hole << blkbits;
@@ -313,7 +313,7 @@ alloc_new:
(first_hole != blocks_per_page))
args->bio = mpage_bio_submit_read(args->bio);
else
- args->last_block_in_bio = blocks[blocks_per_page - 1];
+ args->last_block_in_bio = first_block + blocks_per_page - 1;
out:
return args->bio;
@@ -430,13 +430,13 @@ struct mpage_data {
* We have our BIO, so we can now mark the buffers clean. Make
* sure to only clean buffers which we know we'll be writing.
*/
-static void clean_buffers(struct page *page, unsigned first_unmapped)
+static void clean_buffers(struct folio *folio, unsigned first_unmapped)
{
unsigned buffer_counter = 0;
- struct buffer_head *bh, *head;
- if (!page_has_buffers(page))
+ struct buffer_head *bh, *head = folio_buffers(folio);
+
+ if (!head)
return;
- head = page_buffers(page);
bh = head;
do {
@@ -451,18 +451,8 @@ static void clean_buffers(struct page *page, unsigned first_unmapped)
* read_folio would fail to serialize with the bh and it would read from
* disk before we reach the platter.
*/
- if (buffer_heads_over_limit && PageUptodate(page))
- try_to_free_buffers(page_folio(page));
-}
-
-/*
- * For situations where we want to clean all buffers attached to a page.
- * We don't need to calculate how many buffers are attached to the page,
- * we just need to specify a number larger than the maximum number of buffers.
- */
-void clean_page_buffers(struct page *page)
-{
- clean_buffers(page, ~0U);
+ if (buffer_heads_over_limit && folio_test_uptodate(folio))
+ try_to_free_buffers(folio);
}
static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc,
@@ -476,7 +466,7 @@ static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc,
const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
sector_t last_block;
sector_t block_in_file;
- sector_t blocks[MAX_BUF_PER_PAGE];
+ sector_t first_block;
unsigned page_block;
unsigned first_unmapped = blocks_per_page;
struct block_device *bdev = NULL;
@@ -514,10 +504,12 @@ static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc,
if (!buffer_dirty(bh) || !buffer_uptodate(bh))
goto confused;
if (page_block) {
- if (bh->b_blocknr != blocks[page_block-1] + 1)
+ if (bh->b_blocknr != first_block + page_block)
goto confused;
+ } else {
+ first_block = bh->b_blocknr;
}
- blocks[page_block++] = bh->b_blocknr;
+ page_block++;
boundary = buffer_boundary(bh);
if (boundary) {
boundary_block = bh->b_blocknr;
@@ -566,10 +558,12 @@ static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc,
boundary_bdev = map_bh.b_bdev;
}
if (page_block) {
- if (map_bh.b_blocknr != blocks[page_block-1] + 1)
+ if (map_bh.b_blocknr != first_block + page_block)
goto confused;
+ } else {
+ first_block = map_bh.b_blocknr;
}
- blocks[page_block++] = map_bh.b_blocknr;
+ page_block++;
boundary = buffer_boundary(&map_bh);
bdev = map_bh.b_bdev;
if (block_in_file == last_block)
@@ -601,7 +595,7 @@ page_is_mapped:
/*
* This page will go to BIO. Do we need to send this BIO off first?
*/
- if (bio && mpd->last_block_in_bio != blocks[0] - 1)
+ if (bio && mpd->last_block_in_bio != first_block - 1)
bio = mpage_bio_submit_write(bio);
alloc_new:
@@ -609,7 +603,7 @@ alloc_new:
bio = bio_alloc(bdev, BIO_MAX_VECS,
REQ_OP_WRITE | wbc_to_write_flags(wbc),
GFP_NOFS);
- bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9);
+ bio->bi_iter.bi_sector = first_block << (blkbits - 9);
wbc_init_bio(wbc, bio);
}
@@ -625,7 +619,7 @@ alloc_new:
goto alloc_new;
}
- clean_buffers(&folio->page, first_unmapped);
+ clean_buffers(folio, first_unmapped);
BUG_ON(folio_test_writeback(folio));
folio_start_writeback(folio);
@@ -637,7 +631,7 @@ alloc_new:
boundary_block, 1 << blkbits);
}
} else {
- mpd->last_block_in_bio = blocks[blocks_per_page - 1];
+ mpd->last_block_in_bio = first_block + blocks_per_page - 1;
}
goto out;
@@ -648,7 +642,7 @@ confused:
/*
* The caller has a ref on the inode, so *mapping is stable
*/
- ret = block_write_full_page(&folio->page, mpd->get_block, wbc);
+ ret = block_write_full_folio(folio, wbc, mpd->get_block);
mapping_set_error(mapping, ret);
out:
mpd->bio = bio;
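
The mpage rework drops the per-page blocks[] array: once do_mpage_readpage() and __mpage_writepage() insist the run is contiguous, blocks[k] is always first_block + k, so the old neighbour check blocks[page_block-1] + 1 == b_blocknr is equivalent to first_block + page_block == b_blocknr and only the first block needs remembering. A quick check of that equivalence:

    #include <assert.h>

    int main(void)
    {
            unsigned long long first_block = 12345, blocks[8];
            unsigned int k;

            for (k = 0; k < 8; k++)
                    blocks[k] = first_block + k;    /* contiguous run */

            for (k = 1; k < 8; k++) {
                    assert(blocks[k - 1] + 1 == blocks[k]); /* old check */
                    assert(first_block + k == blocks[k]);   /* new check */
            }
            return 0;
    }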
diff --git a/fs/namei.c b/fs/namei.c
index 71c13b2990b4..faae721e4d63 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -289,7 +289,7 @@ EXPORT_SYMBOL(putname);
* the vfsmount must be passed through @idmap. This function will then take
* care to map the inode according to @idmap before checking permissions.
* On non-idmapped mounts or if permission checking is to be performed on the
- * raw inode simply passs @nop_mnt_idmap.
+ * raw inode simply pass @nop_mnt_idmap.
*/
static int check_acl(struct mnt_idmap *idmap,
struct inode *inode, int mask)
@@ -334,7 +334,7 @@ static int check_acl(struct mnt_idmap *idmap,
* the vfsmount must be passed through @idmap. This function will then take
* care to map the inode according to @idmap before checking permissions.
* On non-idmapped mounts or if permission checking is to be performed on the
- * raw inode simply passs @nop_mnt_idmap.
+ * raw inode simply pass @nop_mnt_idmap.
*/
static int acl_permission_check(struct mnt_idmap *idmap,
struct inode *inode, int mask)
@@ -395,7 +395,7 @@ static int acl_permission_check(struct mnt_idmap *idmap,
* the vfsmount must be passed through @idmap. This function will then take
* care to map the inode according to @idmap before checking permissions.
* On non-idmapped mounts or if permission checking is to be performed on the
- * raw inode simply passs @nop_mnt_idmap.
+ * raw inode simply pass @nop_mnt_idmap.
*/
int generic_permission(struct mnt_idmap *idmap, struct inode *inode,
int mask)
@@ -2467,7 +2467,7 @@ static int handle_lookup_down(struct nameidata *nd)
return PTR_ERR(step_into(nd, WALK_NOFOLLOW, nd->path.dentry));
}
-/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
+/* Returns 0 and nd will be valid on success; Returns error, otherwise. */
static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path)
{
const char *s = path_init(nd, flags);
@@ -2522,7 +2522,7 @@ int filename_lookup(int dfd, struct filename *name, unsigned flags,
return retval;
}
-/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
+/* Returns 0 and nd will be valid on success; Returns error, otherwise. */
static int path_parentat(struct nameidata *nd, unsigned flags,
struct path *parent)
{
@@ -3158,7 +3158,7 @@ static inline umode_t vfs_prepare_mode(struct mnt_idmap *idmap,
* the vfsmount must be passed through @idmap. This function will then take
* care to map the inode according to @idmap before checking permissions.
* On non-idmapped mounts or if permission checking is to be performed on the
- * raw inode simply passs @nop_mnt_idmap.
+ * raw inode simply pass @nop_mnt_idmap.
*/
int vfs_create(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, umode_t mode, bool want_excl)
@@ -3646,7 +3646,7 @@ static int do_open(struct nameidata *nd,
* the vfsmount must be passed through @idmap. This function will then take
* care to map the inode according to @idmap before checking permissions.
* On non-idmapped mounts or if permission checking is to be performed on the
- * raw inode simply passs @nop_mnt_idmap.
+ * raw inode simply pass @nop_mnt_idmap.
*/
static int vfs_tmpfile(struct mnt_idmap *idmap,
const struct path *parentpath,
@@ -3785,10 +3785,7 @@ static struct file *path_openat(struct nameidata *nd,
WARN_ON(1);
error = -EINVAL;
}
- if (unlikely(file->f_mode & FMODE_OPENED))
- fput(file);
- else
- release_empty_file(file);
+ fput(file);
if (error == -EOPENSTALE) {
if (flags & LOOKUP_RCU)
error = -ECHILD;
@@ -3954,7 +3951,7 @@ EXPORT_SYMBOL(user_path_create);
* the vfsmount must be passed through @idmap. This function will then take
* care to map the inode according to @idmap before checking permissions.
* On non-idmapped mounts or if permission checking is to be performed on the
- * raw inode simply passs @nop_mnt_idmap.
+ * raw inode simply pass @nop_mnt_idmap.
*/
int vfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, umode_t mode, dev_t dev)
@@ -4080,7 +4077,7 @@ SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, d
* the vfsmount must be passed through @idmap. This function will then take
* care to map the inode according to @idmap before checking permissions.
* On non-idmapped mounts or if permission checking is to be performed on the
- * raw inode simply passs @nop_mnt_idmap.
+ * raw inode simply pass @nop_mnt_idmap.
*/
int vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, umode_t mode)
@@ -4161,7 +4158,7 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
* the vfsmount must be passed through @idmap. This function will then take
* care to map the inode according to @idmap before checking permissions.
* On non-idmapped mounts or if permission checking is to be performed on the
- * raw inode simply passs @nop_mnt_idmap.
+ * raw inode simply pass @nop_mnt_idmap.
*/
int vfs_rmdir(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry)
@@ -4290,7 +4287,7 @@ SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
* the vfsmount must be passed through @idmap. This function will then take
* care to map the inode according to @idmap before checking permissions.
* On non-idmapped mounts or if permission checking is to be performed on the
- * raw inode simply passs @nop_mnt_idmap.
+ * raw inode simply pass @nop_mnt_idmap.
*/
int vfs_unlink(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, struct inode **delegated_inode)
@@ -4443,7 +4440,7 @@ SYSCALL_DEFINE1(unlink, const char __user *, pathname)
* the vfsmount must be passed through @idmap. This function will then take
* care to map the inode according to @idmap before checking permissions.
* On non-idmapped mounts or if permission checking is to be performed on the
- * raw inode simply passs @nop_mnt_idmap.
+ * raw inode simply pass @nop_mnt_idmap.
*/
int vfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, const char *oldname)
@@ -4535,7 +4532,7 @@ SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newn
* the vfsmount must be passed through @idmap. This function will then take
* care to map the inode according to @idmap before checking permissions.
* On non-idmapped mounts or if permission checking is to be performed on the
- * raw inode simply passs @nop_mnt_idmap.
+ * raw inode simply pass @nop_mnt_idmap.
*/
int vfs_link(struct dentry *old_dentry, struct mnt_idmap *idmap,
struct inode *dir, struct dentry *new_dentry,
diff --git a/fs/namespace.c b/fs/namespace.c
index fbf0e596fcd3..95b2fff91f67 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -32,6 +32,7 @@
#include <linux/fs_context.h>
#include <linux/shmem_fs.h>
#include <linux/mnt_idmapping.h>
+#include <linux/nospec.h>
#include "pnode.h"
#include "internal.h"
@@ -68,6 +69,9 @@ static u64 event;
static DEFINE_IDA(mnt_id_ida);
static DEFINE_IDA(mnt_group_ida);
+/* Don't allow confusion with old 32bit mount ID */
+static atomic64_t mnt_id_ctr = ATOMIC64_INIT(1ULL << 32);
+
static struct hlist_head *mount_hashtable __ro_after_init;
static struct hlist_head *mountpoint_hashtable __ro_after_init;
static struct kmem_cache *mnt_cache __ro_after_init;
@@ -131,6 +135,7 @@ static int mnt_alloc_id(struct mount *mnt)
if (res < 0)
return res;
mnt->mnt_id = res;
+ mnt->mnt_id_unique = atomic64_inc_return(&mnt_id_ctr);
return 0;
}
@@ -730,21 +735,6 @@ struct vfsmount *lookup_mnt(const struct path *path)
return m;
}
-static inline void lock_ns_list(struct mnt_namespace *ns)
-{
- spin_lock(&ns->ns_lock);
-}
-
-static inline void unlock_ns_list(struct mnt_namespace *ns)
-{
- spin_unlock(&ns->ns_lock);
-}
-
-static inline bool mnt_is_cursor(struct mount *mnt)
-{
- return mnt->mnt.mnt_flags & MNT_CURSOR;
-}
-
/*
* __is_local_mountpoint - Test to see if dentry is a mountpoint in the
* current mount namespace.
@@ -763,19 +753,15 @@ static inline bool mnt_is_cursor(struct mount *mnt)
bool __is_local_mountpoint(struct dentry *dentry)
{
struct mnt_namespace *ns = current->nsproxy->mnt_ns;
- struct mount *mnt;
+ struct mount *mnt, *n;
bool is_covered = false;
down_read(&namespace_sem);
- lock_ns_list(ns);
- list_for_each_entry(mnt, &ns->list, mnt_list) {
- if (mnt_is_cursor(mnt))
- continue;
+ rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) {
is_covered = (mnt->mnt_mountpoint == dentry);
if (is_covered)
break;
}
- unlock_ns_list(ns);
up_read(&namespace_sem);
return is_covered;
@@ -1022,6 +1008,30 @@ void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct m
mnt_add_count(old_parent, -1);
}
+static inline struct mount *node_to_mount(struct rb_node *node)
+{
+ return node ? rb_entry(node, struct mount, mnt_node) : NULL;
+}
+
+static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt)
+{
+ struct rb_node **link = &ns->mounts.rb_node;
+ struct rb_node *parent = NULL;
+
+ WARN_ON(mnt->mnt.mnt_flags & MNT_ONRB);
+ mnt->mnt_ns = ns;
+ while (*link) {
+ parent = *link;
+ if (mnt->mnt_id_unique < node_to_mount(parent)->mnt_id_unique)
+ link = &parent->rb_left;
+ else
+ link = &parent->rb_right;
+ }
+ rb_link_node(&mnt->mnt_node, parent, link);
+ rb_insert_color(&mnt->mnt_node, &ns->mounts);
+ mnt->mnt.mnt_flags |= MNT_ONRB;
+}
+
/*
* vfsmount lock must be held for write
*/
@@ -1035,12 +1045,13 @@ static void commit_tree(struct mount *mnt)
BUG_ON(parent == mnt);
list_add_tail(&head, &mnt->mnt_list);
- list_for_each_entry(m, &head, mnt_list)
- m->mnt_ns = n;
+ while (!list_empty(&head)) {
+ m = list_first_entry(&head, typeof(*m), mnt_list);
+ list_del(&m->mnt_list);
- list_splice(&head, n->list.prev);
-
- n->mounts += n->pending_mounts;
+ mnt_add_to_ns(n, m);
+ }
+ n->nr_mounts += n->pending_mounts;
n->pending_mounts = 0;
__attach_mnt(mnt, parent);
@@ -1188,7 +1199,7 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
}
mnt->mnt.mnt_flags = old->mnt.mnt_flags;
- mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL);
+ mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL|MNT_ONRB);
atomic_inc(&sb->s_active);
mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt));
@@ -1413,65 +1424,57 @@ struct vfsmount *mnt_clone_internal(const struct path *path)
return &p->mnt;
}
-#ifdef CONFIG_PROC_FS
-static struct mount *mnt_list_next(struct mnt_namespace *ns,
- struct list_head *p)
+/*
+ * Returns the mount which either has the specified mnt_id, or has the
+ * smallest id greater than the specified one.
+ */
+static struct mount *mnt_find_id_at(struct mnt_namespace *ns, u64 mnt_id)
{
- struct mount *mnt, *ret = NULL;
+ struct rb_node *node = ns->mounts.rb_node;
+ struct mount *ret = NULL;
- lock_ns_list(ns);
- list_for_each_continue(p, &ns->list) {
- mnt = list_entry(p, typeof(*mnt), mnt_list);
- if (!mnt_is_cursor(mnt)) {
- ret = mnt;
- break;
+ while (node) {
+ struct mount *m = node_to_mount(node);
+
+ if (mnt_id <= m->mnt_id_unique) {
+ ret = node_to_mount(node);
+ if (mnt_id == m->mnt_id_unique)
+ break;
+ node = node->rb_left;
+ } else {
+ node = node->rb_right;
}
}
- unlock_ns_list(ns);
-
return ret;
}
+#ifdef CONFIG_PROC_FS
+
/* iterator; we want it to have access to namespace_sem, thus here... */
static void *m_start(struct seq_file *m, loff_t *pos)
{
struct proc_mounts *p = m->private;
- struct list_head *prev;
down_read(&namespace_sem);
- if (!*pos) {
- prev = &p->ns->list;
- } else {
- prev = &p->cursor.mnt_list;
-
- /* Read after we'd reached the end? */
- if (list_empty(prev))
- return NULL;
- }
- return mnt_list_next(p->ns, prev);
+ return mnt_find_id_at(p->ns, *pos);
}
static void *m_next(struct seq_file *m, void *v, loff_t *pos)
{
- struct proc_mounts *p = m->private;
- struct mount *mnt = v;
+ struct mount *next = NULL, *mnt = v;
+ struct rb_node *node = rb_next(&mnt->mnt_node);
++*pos;
- return mnt_list_next(p->ns, &mnt->mnt_list);
+ if (node) {
+ next = node_to_mount(node);
+ *pos = next->mnt_id_unique;
+ }
+ return next;
}
static void m_stop(struct seq_file *m, void *v)
{
- struct proc_mounts *p = m->private;
- struct mount *mnt = v;
-
- lock_ns_list(p->ns);
- if (mnt)
- list_move_tail(&p->cursor.mnt_list, &mnt->mnt_list);
- else
- list_del_init(&p->cursor.mnt_list);
- unlock_ns_list(p->ns);
up_read(&namespace_sem);
}
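
With every mount carrying a never-reused mnt_id_unique and the namespace keeping its mounts in an rbtree ordered by that id, the /proc/mounts iterator no longer needs cursor entries spliced into a list: the seq_file position is simply an id, and restarting is "find the smallest id >= pos". A userspace model of that resumable iteration, with an array standing in for the rbtree:

    #include <stdio.h>

    static const unsigned long long ids[] = { 3, 7, 9, 42 };
    #define N (sizeof(ids) / sizeof(ids[0]))

    /* the job of mnt_find_id_at(): smallest id >= pos, or none */
    static int find_at(unsigned long long pos)
    {
            for (unsigned int i = 0; i < N; i++)
                    if (ids[i] >= pos)
                            return (int)i;
            return -1;
    }

    int main(void)
    {
            unsigned long long pos = 0;
            int i;

            while ((i = find_at(pos)) >= 0) {
                    printf("mount id %llu\n", ids[i]);
                    pos = ids[i] + 1;   /* resume point; no cursor object */
            }
            return 0;
    }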
@@ -1489,14 +1492,6 @@ const struct seq_operations mounts_op = {
.show = m_show,
};
-void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor)
-{
- down_read(&namespace_sem);
- lock_ns_list(ns);
- list_del(&cursor->mnt_list);
- unlock_ns_list(ns);
- up_read(&namespace_sem);
-}
#endif /* CONFIG_PROC_FS */
/**
@@ -1638,7 +1633,10 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
/* Gather the mounts to umount */
for (p = mnt; p; p = next_mnt(p, mnt)) {
p->mnt.mnt_flags |= MNT_UMOUNT;
- list_move(&p->mnt_list, &tmp_list);
+ if (p->mnt.mnt_flags & MNT_ONRB)
+ move_from_ns(p, &tmp_list);
+ else
+ list_move(&p->mnt_list, &tmp_list);
}
/* Hide the mounts from mnt_mounts */
@@ -1658,7 +1656,7 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
list_del_init(&p->mnt_list);
ns = p->mnt_ns;
if (ns) {
- ns->mounts--;
+ ns->nr_mounts--;
__touch_mnt_namespace(ns);
}
p->mnt_ns = NULL;
@@ -1784,14 +1782,16 @@ static int do_umount(struct mount *mnt, int flags)
event++;
if (flags & MNT_DETACH) {
- if (!list_empty(&mnt->mnt_list))
+ if (mnt->mnt.mnt_flags & MNT_ONRB ||
+ !list_empty(&mnt->mnt_list))
umount_tree(mnt, UMOUNT_PROPAGATE);
retval = 0;
} else {
shrink_submounts(mnt);
retval = -EBUSY;
if (!propagate_mount_busy(mnt, 2)) {
- if (!list_empty(&mnt->mnt_list))
+ if (mnt->mnt.mnt_flags & MNT_ONRB ||
+ !list_empty(&mnt->mnt_list))
umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
retval = 0;
}
@@ -2209,9 +2209,9 @@ int count_mounts(struct mnt_namespace *ns, struct mount *mnt)
unsigned int mounts = 0;
struct mount *p;
- if (ns->mounts >= max)
+ if (ns->nr_mounts >= max)
return -ENOSPC;
- max -= ns->mounts;
+ max -= ns->nr_mounts;
if (ns->pending_mounts >= max)
return -ENOSPC;
max -= ns->pending_mounts;
@@ -2355,8 +2355,12 @@ static int attach_recursive_mnt(struct mount *source_mnt,
touch_mnt_namespace(source_mnt->mnt_ns);
} else {
if (source_mnt->mnt_ns) {
+ LIST_HEAD(head);
+
/* move from anon - the caller will destroy */
- list_del_init(&source_mnt->mnt_ns->list);
+ for (p = source_mnt; p; p = next_mnt(p, source_mnt))
+ move_from_ns(p, &head);
+ list_del_init(&head);
}
if (beneath)
mnt_set_mountpoint_beneath(source_mnt, top_mnt, smp);
@@ -2667,11 +2671,10 @@ static struct file *open_detached_copy(struct path *path, bool recursive)
lock_mount_hash();
for (p = mnt; p; p = next_mnt(p, mnt)) {
- p->mnt_ns = ns;
- ns->mounts++;
+ mnt_add_to_ns(ns, p);
+ ns->nr_mounts++;
}
ns->root = mnt;
- list_add_tail(&ns->list, &mnt->mnt_list);
mntget(&mnt->mnt);
unlock_mount_hash();
namespace_unlock();
@@ -2875,7 +2878,12 @@ static int do_remount(struct path *path, int ms_flags, int sb_flags,
if (IS_ERR(fc))
return PTR_ERR(fc);
+ /*
+ * Indicate to the filesystem that the remount request is coming
+ * from the legacy mount system call.
+ */
fc->oldapi = true;
+
err = parse_monolithic_mount_data(fc, data);
if (!err) {
down_write(&sb->s_umount);
@@ -3026,6 +3034,7 @@ static inline bool path_overmounted(const struct path *path)
* can_move_mount_beneath - check that we can mount beneath the top mount
* @from: mount to mount beneath
* @to: mount under which to mount
+ * @mp: mountpoint of @to
*
* - Make sure that @to->dentry is actually the root of a mount under
* which we can mount another mount.
@@ -3324,6 +3333,12 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags,
if (IS_ERR(fc))
return PTR_ERR(fc);
+ /*
+ * Indicate to the filesystem that the mount request is coming
+ * from the legacy mount system call.
+ */
+ fc->oldapi = true;
+
if (subtype)
err = vfs_parse_fs_string(fc, "subtype",
subtype, strlen(subtype));
@@ -3734,9 +3749,8 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a
if (!anon)
new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
refcount_set(&new_ns->ns.count, 1);
- INIT_LIST_HEAD(&new_ns->list);
+ new_ns->mounts = RB_ROOT;
init_waitqueue_head(&new_ns->poll);
- spin_lock_init(&new_ns->ns_lock);
new_ns->user_ns = get_user_ns(user_ns);
new_ns->ucounts = ucounts;
return new_ns;
@@ -3783,7 +3797,6 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
unlock_mount_hash();
}
new_ns->root = new;
- list_add_tail(&new_ns->list, &new->mnt_list);
/*
* Second pass: switch the tsk->fs->* elements and mark new vfsmounts
@@ -3793,8 +3806,8 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
p = old;
q = new;
while (p) {
- q->mnt_ns = new_ns;
- new_ns->mounts++;
+ mnt_add_to_ns(new_ns, q);
+ new_ns->nr_mounts++;
if (new_fs) {
if (&p->mnt == new_fs->root.mnt) {
new_fs->root.mnt = mntget(&q->mnt);
@@ -3836,10 +3849,9 @@ struct dentry *mount_subtree(struct vfsmount *m, const char *name)
mntput(m);
return ERR_CAST(ns);
}
- mnt->mnt_ns = ns;
ns->root = mnt;
- ns->mounts++;
- list_add(&mnt->mnt_list, &ns->list);
+ ns->nr_mounts++;
+ mnt_add_to_ns(ns, mnt);
err = vfs_path_lookup(m->mnt_root, m,
name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);
@@ -4017,10 +4029,9 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
goto err_path;
}
mnt = real_mount(newmount.mnt);
- mnt->mnt_ns = ns;
ns->root = mnt;
- ns->mounts = 1;
- list_add(&mnt->mnt_list, &ns->list);
+ ns->nr_mounts = 1;
+ mnt_add_to_ns(ns, mnt);
mntget(newmount.mnt);
/* Attach to an apparent O_PATH fd with a note that we need to unmount
@@ -4288,7 +4299,7 @@ static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
* Creating an idmapped mount with the filesystem wide idmapping
* doesn't make sense so block that. We don't allow mushy semantics.
*/
- if (!check_fsmapping(kattr->mnt_idmap, m->mnt_sb))
+ if (kattr->mnt_userns == m->mnt_sb->s_user_ns)
return -EINVAL;
/*
@@ -4676,6 +4687,438 @@ SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
return err;
}
+int show_path(struct seq_file *m, struct dentry *root)
+{
+ if (root->d_sb->s_op->show_path)
+ return root->d_sb->s_op->show_path(m, root);
+
+ seq_dentry(m, root, " \t\n\\");
+ return 0;
+}
+
+static struct vfsmount *lookup_mnt_in_ns(u64 id, struct mnt_namespace *ns)
+{
+ struct mount *mnt = mnt_find_id_at(ns, id);
+
+ if (!mnt || mnt->mnt_id_unique != id)
+ return NULL;
+
+ return &mnt->mnt;
+}
+
+struct kstatmount {
+ struct statmount __user *buf;
+ size_t bufsize;
+ struct vfsmount *mnt;
+ u64 mask;
+ struct path root;
+ struct statmount sm;
+ struct seq_file seq;
+};
+
+static u64 mnt_to_attr_flags(struct vfsmount *mnt)
+{
+ unsigned int mnt_flags = READ_ONCE(mnt->mnt_flags);
+ u64 attr_flags = 0;
+
+ if (mnt_flags & MNT_READONLY)
+ attr_flags |= MOUNT_ATTR_RDONLY;
+ if (mnt_flags & MNT_NOSUID)
+ attr_flags |= MOUNT_ATTR_NOSUID;
+ if (mnt_flags & MNT_NODEV)
+ attr_flags |= MOUNT_ATTR_NODEV;
+ if (mnt_flags & MNT_NOEXEC)
+ attr_flags |= MOUNT_ATTR_NOEXEC;
+ if (mnt_flags & MNT_NODIRATIME)
+ attr_flags |= MOUNT_ATTR_NODIRATIME;
+ if (mnt_flags & MNT_NOSYMFOLLOW)
+ attr_flags |= MOUNT_ATTR_NOSYMFOLLOW;
+
+ if (mnt_flags & MNT_NOATIME)
+ attr_flags |= MOUNT_ATTR_NOATIME;
+ else if (mnt_flags & MNT_RELATIME)
+ attr_flags |= MOUNT_ATTR_RELATIME;
+ else
+ attr_flags |= MOUNT_ATTR_STRICTATIME;
+
+ if (is_idmapped_mnt(mnt))
+ attr_flags |= MOUNT_ATTR_IDMAP;
+
+ return attr_flags;
+}
+
+static u64 mnt_to_propagation_flags(struct mount *m)
+{
+ u64 propagation = 0;
+
+ if (IS_MNT_SHARED(m))
+ propagation |= MS_SHARED;
+ if (IS_MNT_SLAVE(m))
+ propagation |= MS_SLAVE;
+ if (IS_MNT_UNBINDABLE(m))
+ propagation |= MS_UNBINDABLE;
+ if (!propagation)
+ propagation |= MS_PRIVATE;
+
+ return propagation;
+}
+
+static void statmount_sb_basic(struct kstatmount *s)
+{
+ struct super_block *sb = s->mnt->mnt_sb;
+
+ s->sm.mask |= STATMOUNT_SB_BASIC;
+ s->sm.sb_dev_major = MAJOR(sb->s_dev);
+ s->sm.sb_dev_minor = MINOR(sb->s_dev);
+ s->sm.sb_magic = sb->s_magic;
+ s->sm.sb_flags = sb->s_flags & (SB_RDONLY|SB_SYNCHRONOUS|SB_DIRSYNC|SB_LAZYTIME);
+}
+
+static void statmount_mnt_basic(struct kstatmount *s)
+{
+ struct mount *m = real_mount(s->mnt);
+
+ s->sm.mask |= STATMOUNT_MNT_BASIC;
+ s->sm.mnt_id = m->mnt_id_unique;
+ s->sm.mnt_parent_id = m->mnt_parent->mnt_id_unique;
+ s->sm.mnt_id_old = m->mnt_id;
+ s->sm.mnt_parent_id_old = m->mnt_parent->mnt_id;
+ s->sm.mnt_attr = mnt_to_attr_flags(&m->mnt);
+ s->sm.mnt_propagation = mnt_to_propagation_flags(m);
+ s->sm.mnt_peer_group = IS_MNT_SHARED(m) ? m->mnt_group_id : 0;
+ s->sm.mnt_master = IS_MNT_SLAVE(m) ? m->mnt_master->mnt_group_id : 0;
+}
+
+static void statmount_propagate_from(struct kstatmount *s)
+{
+ struct mount *m = real_mount(s->mnt);
+
+ s->sm.mask |= STATMOUNT_PROPAGATE_FROM;
+ if (IS_MNT_SLAVE(m))
+ s->sm.propagate_from = get_dominating_id(m, &current->fs->root);
+}
+
+static int statmount_mnt_root(struct kstatmount *s, struct seq_file *seq)
+{
+ int ret;
+ size_t start = seq->count;
+
+ ret = show_path(seq, s->mnt->mnt_root);
+ if (ret)
+ return ret;
+
+ if (unlikely(seq_has_overflowed(seq)))
+ return -EAGAIN;
+
+ /*
+	 * Unescape the result. It would be better if the supplied string were not
+ * escaped in the first place, but that's a pretty invasive change.
+ */
+ seq->buf[seq->count] = '\0';
+ seq->count = start;
+ seq_commit(seq, string_unescape_inplace(seq->buf + start, UNESCAPE_OCTAL));
+ return 0;
+}
+
+static int statmount_mnt_point(struct kstatmount *s, struct seq_file *seq)
+{
+ struct vfsmount *mnt = s->mnt;
+ struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
+ int err;
+
+ err = seq_path_root(seq, &mnt_path, &s->root, "");
+ return err == SEQ_SKIP ? 0 : err;
+}
+
+static int statmount_fs_type(struct kstatmount *s, struct seq_file *seq)
+{
+ struct super_block *sb = s->mnt->mnt_sb;
+
+ seq_puts(seq, sb->s_type->name);
+ return 0;
+}
+
+static int statmount_string(struct kstatmount *s, u64 flag)
+{
+ int ret;
+ size_t kbufsize;
+ struct seq_file *seq = &s->seq;
+ struct statmount *sm = &s->sm;
+
+ switch (flag) {
+ case STATMOUNT_FS_TYPE:
+ sm->fs_type = seq->count;
+ ret = statmount_fs_type(s, seq);
+ break;
+ case STATMOUNT_MNT_ROOT:
+ sm->mnt_root = seq->count;
+ ret = statmount_mnt_root(s, seq);
+ break;
+ case STATMOUNT_MNT_POINT:
+ sm->mnt_point = seq->count;
+ ret = statmount_mnt_point(s, seq);
+ break;
+ default:
+ WARN_ON_ONCE(true);
+ return -EINVAL;
+ }
+
+ if (unlikely(check_add_overflow(sizeof(*sm), seq->count, &kbufsize)))
+ return -EOVERFLOW;
+ if (kbufsize >= s->bufsize)
+ return -EOVERFLOW;
+
+ /* signal a retry */
+ if (unlikely(seq_has_overflowed(seq)))
+ return -EAGAIN;
+
+ if (ret)
+ return ret;
+
+ seq->buf[seq->count++] = '\0';
+ sm->mask |= flag;
+ return 0;
+}
+
+static int copy_statmount_to_user(struct kstatmount *s)
+{
+ struct statmount *sm = &s->sm;
+ struct seq_file *seq = &s->seq;
+ char __user *str = ((char __user *)s->buf) + sizeof(*sm);
+ size_t copysize = min_t(size_t, s->bufsize, sizeof(*sm));
+
+ if (seq->count && copy_to_user(str, seq->buf, seq->count))
+ return -EFAULT;
+
+ /* Return the number of bytes copied to the buffer */
+ sm->size = copysize + seq->count;
+ if (copy_to_user(s->buf, sm, copysize))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int do_statmount(struct kstatmount *s)
+{
+ struct mount *m = real_mount(s->mnt);
+ int err;
+
+ /*
+ * Don't trigger audit denials. We just want to determine what
+ * mounts to show users.
+ */
+ if (!is_path_reachable(m, m->mnt.mnt_root, &s->root) &&
+ !ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN))
+ return -EPERM;
+
+ err = security_sb_statfs(s->mnt->mnt_root);
+ if (err)
+ return err;
+
+ if (s->mask & STATMOUNT_SB_BASIC)
+ statmount_sb_basic(s);
+
+ if (s->mask & STATMOUNT_MNT_BASIC)
+ statmount_mnt_basic(s);
+
+ if (s->mask & STATMOUNT_PROPAGATE_FROM)
+ statmount_propagate_from(s);
+
+ if (s->mask & STATMOUNT_FS_TYPE)
+ err = statmount_string(s, STATMOUNT_FS_TYPE);
+
+ if (!err && s->mask & STATMOUNT_MNT_ROOT)
+ err = statmount_string(s, STATMOUNT_MNT_ROOT);
+
+ if (!err && s->mask & STATMOUNT_MNT_POINT)
+ err = statmount_string(s, STATMOUNT_MNT_POINT);
+
+ if (err)
+ return err;
+
+ return 0;
+}
+
+static inline bool retry_statmount(const long ret, size_t *seq_size)
+{
+ if (likely(ret != -EAGAIN))
+ return false;
+ if (unlikely(check_mul_overflow(*seq_size, 2, seq_size)))
+ return false;
+ if (unlikely(*seq_size > MAX_RW_COUNT))
+ return false;
+ return true;
+}
+
+static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq,
+ struct statmount __user *buf, size_t bufsize,
+ size_t seq_size)
+{
+ if (!access_ok(buf, bufsize))
+ return -EFAULT;
+
+ memset(ks, 0, sizeof(*ks));
+ ks->mask = kreq->param;
+ ks->buf = buf;
+ ks->bufsize = bufsize;
+ ks->seq.size = seq_size;
+ ks->seq.buf = kvmalloc(seq_size, GFP_KERNEL_ACCOUNT);
+ if (!ks->seq.buf)
+ return -ENOMEM;
+ return 0;
+}
+
+static int copy_mnt_id_req(const struct mnt_id_req __user *req,
+ struct mnt_id_req *kreq)
+{
+ int ret;
+ size_t usize;
+
+ BUILD_BUG_ON(sizeof(struct mnt_id_req) != MNT_ID_REQ_SIZE_VER0);
+
+ ret = get_user(usize, &req->size);
+ if (ret)
+ return -EFAULT;
+ if (unlikely(usize > PAGE_SIZE))
+ return -E2BIG;
+ if (unlikely(usize < MNT_ID_REQ_SIZE_VER0))
+ return -EINVAL;
+ memset(kreq, 0, sizeof(*kreq));
+ ret = copy_struct_from_user(kreq, sizeof(*kreq), req, usize);
+ if (ret)
+ return ret;
+ if (kreq->spare != 0)
+ return -EINVAL;
+ return 0;
+}
+
+SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req,
+ struct statmount __user *, buf, size_t, bufsize,
+ unsigned int, flags)
+{
+ struct vfsmount *mnt;
+ struct mnt_id_req kreq;
+ struct kstatmount ks;
+ /* We currently support retrieval of 3 strings. */
+ size_t seq_size = 3 * PATH_MAX;
+ int ret;
+
+ if (flags)
+ return -EINVAL;
+
+ ret = copy_mnt_id_req(req, &kreq);
+ if (ret)
+ return ret;
+
+retry:
+ ret = prepare_kstatmount(&ks, &kreq, buf, bufsize, seq_size);
+ if (ret)
+ return ret;
+
+ down_read(&namespace_sem);
+ mnt = lookup_mnt_in_ns(kreq.mnt_id, current->nsproxy->mnt_ns);
+ if (!mnt) {
+ up_read(&namespace_sem);
+ kvfree(ks.seq.buf);
+ return -ENOENT;
+ }
+
+ ks.mnt = mnt;
+ get_fs_root(current->fs, &ks.root);
+ ret = do_statmount(&ks);
+ path_put(&ks.root);
+ up_read(&namespace_sem);
+
+ if (!ret)
+ ret = copy_statmount_to_user(&ks);
+ kvfree(ks.seq.buf);
+ if (retry_statmount(ret, &seq_size))
+ goto retry;
+ return ret;
+}
+
+static struct mount *listmnt_next(struct mount *curr)
+{
+ return node_to_mount(rb_next(&curr->mnt_node));
+}
+
+static ssize_t do_listmount(struct mount *first, struct path *orig, u64 mnt_id,
+ u64 __user *buf, size_t bufsize,
+ const struct path *root)
+{
+ struct mount *r;
+ ssize_t ctr;
+ int err;
+
+ /*
+ * Don't trigger audit denials. We just want to determine what
+ * mounts to show users.
+ */
+ if (!is_path_reachable(real_mount(orig->mnt), orig->dentry, root) &&
+ !ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN))
+ return -EPERM;
+
+ err = security_sb_statfs(orig->dentry);
+ if (err)
+ return err;
+
+ for (ctr = 0, r = first; r && ctr < bufsize; r = listmnt_next(r)) {
+ if (r->mnt_id_unique == mnt_id)
+ continue;
+ if (!is_path_reachable(r, r->mnt.mnt_root, orig))
+ continue;
+ ctr = array_index_nospec(ctr, bufsize);
+ if (put_user(r->mnt_id_unique, buf + ctr))
+ return -EFAULT;
+ if (check_add_overflow(ctr, 1, &ctr))
+ return -ERANGE;
+ }
+ return ctr;
+}
+
+SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
+ u64 __user *, buf, size_t, bufsize, unsigned int, flags)
+{
+ struct mnt_namespace *ns = current->nsproxy->mnt_ns;
+ struct mnt_id_req kreq;
+ struct mount *first;
+ struct path root, orig;
+ u64 mnt_id, last_mnt_id;
+ ssize_t ret;
+
+ if (flags)
+ return -EINVAL;
+
+ ret = copy_mnt_id_req(req, &kreq);
+ if (ret)
+ return ret;
+ mnt_id = kreq.mnt_id;
+ last_mnt_id = kreq.param;
+
+ down_read(&namespace_sem);
+ get_fs_root(current->fs, &root);
+ if (mnt_id == LSMT_ROOT) {
+ orig = root;
+ } else {
+ ret = -ENOENT;
+ orig.mnt = lookup_mnt_in_ns(mnt_id, ns);
+ if (!orig.mnt)
+ goto err;
+ orig.dentry = orig.mnt->mnt_root;
+ }
+ if (!last_mnt_id)
+ first = node_to_mount(rb_first(&ns->mounts));
+ else
+ first = mnt_find_id_at(ns, last_mnt_id + 1);
+
+ ret = do_listmount(first, &orig, mnt_id, buf, bufsize, &root);
+err:
+ path_put(&root);
+ up_read(&namespace_sem);
+ return ret;
+}
+
+
static void __init init_mount_tree(void)
{
struct vfsmount *mnt;
@@ -4691,10 +5134,9 @@ static void __init init_mount_tree(void)
if (IS_ERR(ns))
panic("Can't allocate initial namespace");
m = real_mount(mnt);
- m->mnt_ns = ns;
ns->root = m;
- ns->mounts = 1;
- list_add(&m->mnt_list, &ns->list);
+ ns->nr_mounts = 1;
+ mnt_add_to_ns(ns, m);
init_task.nsproxy->mnt_ns = ns;
get_mnt_ns(ns);
@@ -4821,18 +5263,14 @@ static bool mnt_already_visible(struct mnt_namespace *ns,
int *new_mnt_flags)
{
int new_flags = *new_mnt_flags;
- struct mount *mnt;
+ struct mount *mnt, *n;
bool visible = false;
down_read(&namespace_sem);
- lock_ns_list(ns);
- list_for_each_entry(mnt, &ns->list, mnt_list) {
+ rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) {
struct mount *child;
int mnt_flags;
- if (mnt_is_cursor(mnt))
- continue;
-
if (mnt->mnt.mnt_sb->s_type != sb->s_type)
continue;
@@ -4880,7 +5318,6 @@ static bool mnt_already_visible(struct mnt_namespace *ns,
next: ;
}
found:
- unlock_ns_list(ns);
up_read(&namespace_sem);
return visible;
}
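
The two new syscalls are easiest to read from the caller's side. A hedged userspace sketch: the struct layouts and constants below are transcribed from the uapi additions that accompany this series, the syscall numbers are assumed to be exposed by the kernel headers as __NR_statmount/__NR_listmount, and error handling is trimmed down to the -EOVERFLOW case the kernel code above produces (a real caller would grow the buffer and retry):

    #include <stdint.h>
    #include <stdio.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    struct mnt_id_req {             /* MNT_ID_REQ_SIZE_VER0 == 24 */
            uint32_t size, spare;
            uint64_t mnt_id, param;
    };

    struct statmount {              /* fixed header of the returned blob */
            uint32_t size, __spare1;
            uint64_t mask;
            uint32_t sb_dev_major, sb_dev_minor;
            uint64_t sb_magic;
            uint32_t sb_flags, fs_type;     /* fs_type: offset into str[] */
            uint64_t mnt_id, mnt_parent_id;
            uint32_t mnt_id_old, mnt_parent_id_old;
            uint64_t mnt_attr, mnt_propagation, mnt_peer_group, mnt_master;
            uint64_t propagate_from;
            uint32_t mnt_root, mnt_point;   /* offsets into str[] */
            uint64_t __spare2[50];
            char str[];
    };

    #define STATMOUNT_MNT_BASIC 0x2ULL
    #define STATMOUNT_MNT_POINT 0x10ULL
    #define STATMOUNT_FS_TYPE   0x20ULL
    #define LSMT_ROOT 0xffffffffffffffffULL /* root of this mount ns */

    int main(void)
    {
            struct mnt_id_req req = {
                    .size = sizeof(req),
                    .mnt_id = LSMT_ROOT,    /* list everything below root */
                    .param = 0,             /* resume point: 0 = from start */
            };
            uint64_t ids[64];
            long n = syscall(__NR_listmount, &req, ids, 64, 0);

            for (long i = 0; i < n; i++) {
                    _Alignas(uint64_t) char buf[4096];
                    struct statmount *sm = (struct statmount *)buf;
                    struct mnt_id_req sreq = {
                            .size = sizeof(sreq),
                            .mnt_id = ids[i],
                            .param = STATMOUNT_MNT_BASIC |
                                     STATMOUNT_FS_TYPE | STATMOUNT_MNT_POINT,
                    };

                    /* -EOVERFLOW here means: grow buf and retry */
                    if (syscall(__NR_statmount, &sreq, sm, sizeof(buf), 0))
                            continue;
                    printf("%llu %s on %s\n",
                           (unsigned long long)sm->mnt_id,
                           sm->str + sm->fs_type, sm->str + sm->mnt_point);
            }
            return n < 0;
    }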
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 3f9768810427..e8cccb94b927 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -567,7 +567,7 @@ const struct address_space_operations nfs_file_aops = {
.migrate_folio = nfs_migrate_folio,
.launder_folio = nfs_launder_folio,
.is_dirty_writeback = nfs_check_dirty_writeback,
- .error_remove_page = generic_error_remove_page,
+ .error_remove_folio = generic_error_remove_folio,
.swap_activate = nfs_swap_activate,
.swap_deactivate = nfs_swap_deactivate,
.swap_rw = nfs_swap_rw,
diff --git a/fs/nfs/nfs42xattr.c b/fs/nfs/nfs42xattr.c
index 2ad66a8922f4..49aaf28a6950 100644
--- a/fs/nfs/nfs42xattr.c
+++ b/fs/nfs/nfs42xattr.c
@@ -132,7 +132,7 @@ nfs4_xattr_entry_lru_add(struct nfs4_xattr_entry *entry)
lru = (entry->flags & NFS4_XATTR_ENTRY_EXTVAL) ?
&nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru;
- return list_lru_add(lru, &entry->lru);
+ return list_lru_add_obj(lru, &entry->lru);
}
static bool
@@ -143,7 +143,7 @@ nfs4_xattr_entry_lru_del(struct nfs4_xattr_entry *entry)
lru = (entry->flags & NFS4_XATTR_ENTRY_EXTVAL) ?
&nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru;
- return list_lru_del(lru, &entry->lru);
+ return list_lru_del_obj(lru, &entry->lru);
}
/*
@@ -349,7 +349,7 @@ nfs4_xattr_cache_unlink(struct inode *inode)
oldcache = nfsi->xattr_cache;
if (oldcache != NULL) {
- list_lru_del(&nfs4_xattr_cache_lru, &oldcache->lru);
+ list_lru_del_obj(&nfs4_xattr_cache_lru, &oldcache->lru);
oldcache->inode = NULL;
}
nfsi->xattr_cache = NULL;
@@ -474,7 +474,7 @@ nfs4_xattr_get_cache(struct inode *inode, int add)
kref_get(&cache->ref);
nfsi->xattr_cache = cache;
cache->inode = inode;
- list_lru_add(&nfs4_xattr_cache_lru, &cache->lru);
+ list_lru_add_obj(&nfs4_xattr_cache_lru, &cache->lru);
}
spin_unlock(&inode->i_lock);
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 02788c3c85e5..e238abc78a13 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -10,6 +10,7 @@
#include <linux/mount.h>
#include <linux/nfs_fs.h>
#include <linux/nfs_ssc.h>
+#include <linux/splice.h>
#include "delegation.h"
#include "internal.h"
#include "iostat.h"
@@ -195,8 +196,8 @@ static ssize_t nfs4_copy_file_range(struct file *file_in, loff_t pos_in,
ret = __nfs4_copy_file_range(file_in, pos_in, file_out, pos_out, count,
flags);
if (ret == -EOPNOTSUPP || ret == -EXDEV)
- ret = generic_copy_file_range(file_in, pos_in, file_out,
- pos_out, count, flags);
+ ret = splice_copy_file_range(file_in, pos_in, file_out,
+ pos_out, count);
return ret;
}
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index b664caea8b4e..7248705faef4 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -192,13 +192,13 @@ static struct nfs_page *nfs_folio_find_private_request(struct folio *folio)
if (!folio_test_private(folio))
return NULL;
- spin_lock(&mapping->private_lock);
+ spin_lock(&mapping->i_private_lock);
req = nfs_folio_private_request(folio);
if (req) {
WARN_ON_ONCE(req->wb_head != req);
kref_get(&req->wb_kref);
}
- spin_unlock(&mapping->private_lock);
+ spin_unlock(&mapping->i_private_lock);
return req;
}
@@ -769,13 +769,13 @@ static void nfs_inode_add_request(struct nfs_page *req)
* Swap-space should not get truncated. Hence no need to plug the race
* with invalidate/truncate.
*/
- spin_lock(&mapping->private_lock);
+ spin_lock(&mapping->i_private_lock);
if (likely(!folio_test_swapcache(folio))) {
set_bit(PG_MAPPED, &req->wb_flags);
folio_set_private(folio);
folio->private = req;
}
- spin_unlock(&mapping->private_lock);
+ spin_unlock(&mapping->i_private_lock);
atomic_long_inc(&nfsi->nrequests);
/* this a head request for a page group - mark it as having an
* extra reference so sub groups can follow suit.
@@ -796,13 +796,13 @@ static void nfs_inode_remove_request(struct nfs_page *req)
struct folio *folio = nfs_page_to_folio(req->wb_head);
struct address_space *mapping = folio_file_mapping(folio);
- spin_lock(&mapping->private_lock);
+ spin_lock(&mapping->i_private_lock);
if (likely(folio && !folio_test_swapcache(folio))) {
folio->private = NULL;
folio_clear_private(folio);
clear_bit(PG_MAPPED, &req->wb_head->wb_flags);
}
- spin_unlock(&mapping->private_lock);
+ spin_unlock(&mapping->i_private_lock);
}
if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) {
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index 66dd5059f1bb..9cb7f0c33df5 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -322,7 +322,7 @@ nfsd_file_check_writeback(struct nfsd_file *nf)
static bool nfsd_file_lru_add(struct nfsd_file *nf)
{
set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags);
- if (list_lru_add(&nfsd_file_lru, &nf->nf_lru)) {
+ if (list_lru_add_obj(&nfsd_file_lru, &nf->nf_lru)) {
trace_nfsd_file_lru_add(nf);
return true;
}
@@ -331,7 +331,7 @@ static bool nfsd_file_lru_add(struct nfsd_file *nf)
static bool nfsd_file_lru_remove(struct nfsd_file *nf)
{
- if (list_lru_del(&nfsd_file_lru, &nf->nf_lru)) {
+ if (list_lru_del_obj(&nfsd_file_lru, &nf->nf_lru)) {
trace_nfsd_file_lru_del(nf);
return true;
}
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index cc63fd52a493..6e7e37192461 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1039,7 +1039,10 @@ __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
ssize_t host_err;
trace_nfsd_read_splice(rqstp, fhp, offset, *count);
- host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor);
+ host_err = rw_verify_area(READ, file, &offset, *count);
+ if (!host_err)
+ host_err = splice_direct_to_actor(file, &sd,
+ nfsd_direct_splice_actor);
return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err);
}
@@ -1176,9 +1179,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
since = READ_ONCE(file->f_wb_err);
if (verf)
nfsd_copy_write_verifier(verf, nn);
- file_start_write(file);
host_err = vfs_iter_write(file, &iter, &pos, flags);
- file_end_write(file);
if (host_err < 0) {
commit_reset_write_verifier(nn, rqstp, host_err);
goto out_nfserr;
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 5710833ac1cc..0131d83b912d 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -64,8 +64,8 @@ nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr)
set_buffer_mapped(bh);
set_buffer_uptodate(bh);
- unlock_page(bh->b_page);
- put_page(bh->b_page);
+ folio_unlock(bh->b_folio);
+ folio_put(bh->b_folio);
return bh;
}
@@ -75,7 +75,7 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
{
struct buffer_head *bh;
struct inode *inode = btnc->host;
- struct page *page;
+ struct folio *folio;
int err;
bh = nilfs_grab_buffer(inode, btnc, blocknr, BIT(BH_NILFS_Node));
@@ -83,7 +83,7 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
return -ENOMEM;
err = -EEXIST; /* internal code */
- page = bh->b_page;
+ folio = bh->b_folio;
if (buffer_uptodate(bh) || buffer_dirty(bh))
goto found;
@@ -130,8 +130,8 @@ found:
*pbh = bh;
out_locked:
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
return err;
}
@@ -145,19 +145,19 @@ out_locked:
void nilfs_btnode_delete(struct buffer_head *bh)
{
struct address_space *mapping;
- struct page *page = bh->b_page;
- pgoff_t index = page_index(page);
+ struct folio *folio = bh->b_folio;
+ pgoff_t index = folio->index;
int still_dirty;
- get_page(page);
- lock_page(page);
- wait_on_page_writeback(page);
+ folio_get(folio);
+ folio_lock(folio);
+ folio_wait_writeback(folio);
nilfs_forget_buffer(bh);
- still_dirty = PageDirty(page);
- mapping = page->mapping;
- unlock_page(page);
- put_page(page);
+ still_dirty = folio_test_dirty(folio);
+ mapping = folio->mapping;
+ folio_unlock(folio);
+ folio_put(folio);
if (!still_dirty && mapping)
invalidate_inode_pages2_range(mapping, index, index);
@@ -185,23 +185,23 @@ int nilfs_btnode_prepare_change_key(struct address_space *btnc,
ctxt->newbh = NULL;
if (inode->i_blkbits == PAGE_SHIFT) {
- struct page *opage = obh->b_page;
- lock_page(opage);
+ struct folio *ofolio = obh->b_folio;
+ folio_lock(ofolio);
retry:
/* BUG_ON(oldkey != obh->b_folio->index); */
- if (unlikely(oldkey != opage->index))
- NILFS_PAGE_BUG(opage,
+ if (unlikely(oldkey != ofolio->index))
+ NILFS_FOLIO_BUG(ofolio,
"invalid oldkey %lld (newkey=%lld)",
(unsigned long long)oldkey,
(unsigned long long)newkey);
xa_lock_irq(&btnc->i_pages);
- err = __xa_insert(&btnc->i_pages, newkey, opage, GFP_NOFS);
+ err = __xa_insert(&btnc->i_pages, newkey, ofolio, GFP_NOFS);
xa_unlock_irq(&btnc->i_pages);
/*
- * Note: page->index will not change to newkey until
+ * Note: folio->index will not change to newkey until
* nilfs_btnode_commit_change_key() will be called.
- * To protect the page in intermediate state, the page lock
+ * To protect the folio in intermediate state, the folio lock
* is held.
*/
if (!err)
@@ -213,7 +213,7 @@ retry:
if (!err)
goto retry;
/* fallback to copy mode */
- unlock_page(opage);
+ folio_unlock(ofolio);
}
nbh = nilfs_btnode_create_block(btnc, newkey);
@@ -225,7 +225,7 @@ retry:
return 0;
failed_unlock:
- unlock_page(obh->b_page);
+ folio_unlock(obh->b_folio);
return err;
}
@@ -238,15 +238,15 @@ void nilfs_btnode_commit_change_key(struct address_space *btnc,
{
struct buffer_head *obh = ctxt->bh, *nbh = ctxt->newbh;
__u64 oldkey = ctxt->oldkey, newkey = ctxt->newkey;
- struct page *opage;
+ struct folio *ofolio;
if (oldkey == newkey)
return;
if (nbh == NULL) { /* blocksize == pagesize */
- opage = obh->b_page;
- if (unlikely(oldkey != opage->index))
- NILFS_PAGE_BUG(opage,
+ ofolio = obh->b_folio;
+ if (unlikely(oldkey != ofolio->index))
+ NILFS_FOLIO_BUG(ofolio,
"invalid oldkey %lld (newkey=%lld)",
(unsigned long long)oldkey,
(unsigned long long)newkey);
@@ -257,8 +257,8 @@ void nilfs_btnode_commit_change_key(struct address_space *btnc,
__xa_set_mark(&btnc->i_pages, newkey, PAGECACHE_TAG_DIRTY);
xa_unlock_irq(&btnc->i_pages);
- opage->index = obh->b_blocknr = newkey;
- unlock_page(opage);
+ ofolio->index = obh->b_blocknr = newkey;
+ folio_unlock(ofolio);
} else {
nilfs_copy_buffer(nbh, obh);
mark_buffer_dirty(nbh);
@@ -284,7 +284,7 @@ void nilfs_btnode_abort_change_key(struct address_space *btnc,
if (nbh == NULL) { /* blocksize == pagesize */
xa_erase_irq(&btnc->i_pages, newkey);
- unlock_page(ctxt->bh->b_page);
+ folio_unlock(ctxt->bh->b_folio);
} else {
/*
* When canceling a buffer that a prepare operation has
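The btnode conversion is representative of the whole nilfs2 series: buffer heads already carry b_folio, so page lock, reference, and writeback-wait calls translate one-for-one to the folio API. A minimal sketch of the pairing (illustrative helper name):

    #include <linux/buffer_head.h>

    /* Sketch: pin and lock the folio backing a buffer head; bh->b_folio
     * aliases the old bh->b_page for the head page of the folio. */
    static void my_with_bh_folio(struct buffer_head *bh)
    {
            struct folio *folio = bh->b_folio;

            folio_get(folio);
            folio_lock(folio);
            folio_wait_writeback(folio);
            /* ... operate on the buffer ... */
            folio_unlock(folio);
            folio_put(folio);
    }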
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index 9ebefb3acb0e..39136637f715 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -552,11 +552,29 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop,
}
/**
- * nilfs_cpfile_get_cpinfo -
- * @cpfile:
- * @cno:
- * @ci:
- * @nci:
+ * nilfs_cpfile_get_cpinfo - get information on checkpoints
+ * @cpfile: checkpoint file inode
+ * @cnop: place to pass a starting checkpoint number and receive a
+ * checkpoint number to continue the search
+ * @mode: mode of checkpoints that the caller wants to retrieve
+ * @buf: buffer for storing checkpoints' information
+ * @cisz: byte size of one checkpoint info item in array
+ * @nci: number of checkpoint info items to retrieve
+ *
+ * nilfs_cpfile_get_cpinfo() searches for checkpoints in @mode state
+ * starting from the checkpoint number stored in @cnop, and stores
+ * information about found checkpoints in @buf.
+ * The buffer pointed to by @buf must be large enough to store information
+ * for @nci checkpoints. If information for at least one checkpoint is
+ * successfully retrieved, @cnop is updated to point to the checkpoint
+ * number to continue searching.
+ *
+ * Return: Count of checkpoint info items stored in the output buffer on
+ * success, or the following negative error code on failure.
+ * * %-EINVAL - Invalid checkpoint mode.
+ * * %-ENOMEM - Insufficient memory available.
+ * * %-EIO - I/O error (including metadata corruption).
+ * * %-ENOENT - Invalid checkpoint number specified.
*/
ssize_t nilfs_cpfile_get_cpinfo(struct inode *cpfile, __u64 *cnop, int mode,
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index de2073c47651..bc846b904b68 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -64,12 +64,6 @@ static inline unsigned int nilfs_chunk_size(struct inode *inode)
return inode->i_sb->s_blocksize;
}
-static inline void nilfs_put_page(struct page *page)
-{
- kunmap(page);
- put_page(page);
-}
-
/*
* Return the offset into page `page_nr' of the last valid
* byte in that page, plus one.
@@ -84,48 +78,46 @@ static unsigned int nilfs_last_byte(struct inode *inode, unsigned long page_nr)
return last_byte;
}
-static int nilfs_prepare_chunk(struct page *page, unsigned int from,
+static int nilfs_prepare_chunk(struct folio *folio, unsigned int from,
unsigned int to)
{
- loff_t pos = page_offset(page) + from;
+ loff_t pos = folio_pos(folio) + from;
- return __block_write_begin(page, pos, to - from, nilfs_get_block);
+ return __block_write_begin(&folio->page, pos, to - from, nilfs_get_block);
}
-static void nilfs_commit_chunk(struct page *page,
- struct address_space *mapping,
- unsigned int from, unsigned int to)
+static void nilfs_commit_chunk(struct folio *folio,
+ struct address_space *mapping, size_t from, size_t to)
{
struct inode *dir = mapping->host;
- loff_t pos = page_offset(page) + from;
- unsigned int len = to - from;
- unsigned int nr_dirty, copied;
+ loff_t pos = folio_pos(folio) + from;
+ size_t copied, len = to - from;
+ unsigned int nr_dirty;
int err;
- nr_dirty = nilfs_page_count_clean_buffers(page, from, to);
- copied = block_write_end(NULL, mapping, pos, len, len, page, NULL);
+ nr_dirty = nilfs_page_count_clean_buffers(&folio->page, from, to);
+ copied = block_write_end(NULL, mapping, pos, len, len, &folio->page, NULL);
if (pos + copied > dir->i_size)
i_size_write(dir, pos + copied);
if (IS_DIRSYNC(dir))
nilfs_set_transaction_flag(NILFS_TI_SYNC);
err = nilfs_set_file_dirty(dir, nr_dirty);
WARN_ON(err); /* do not happen */
- unlock_page(page);
+ folio_unlock(folio);
}
-static bool nilfs_check_page(struct page *page)
+static bool nilfs_check_folio(struct folio *folio, char *kaddr)
{
- struct inode *dir = page->mapping->host;
+ struct inode *dir = folio->mapping->host;
struct super_block *sb = dir->i_sb;
unsigned int chunk_size = nilfs_chunk_size(dir);
- char *kaddr = page_address(page);
- unsigned int offs, rec_len;
- unsigned int limit = PAGE_SIZE;
+ size_t offs, rec_len;
+ size_t limit = folio_size(folio);
struct nilfs_dir_entry *p;
char *error;
- if ((dir->i_size >> PAGE_SHIFT) == page->index) {
- limit = dir->i_size & ~PAGE_MASK;
+ if (dir->i_size < folio_pos(folio) + limit) {
+ limit = dir->i_size - folio_pos(folio);
if (limit & (chunk_size - 1))
goto Ebadsize;
if (!limit)
@@ -147,7 +139,7 @@ static bool nilfs_check_page(struct page *page)
if (offs != limit)
goto Eend;
out:
- SetPageChecked(page);
+ folio_set_checked(folio);
return true;
/* Too bad, we had an error */
@@ -170,8 +162,8 @@ Espan:
error = "directory entry across blocks";
bad_entry:
nilfs_error(sb,
- "bad entry in directory #%lu: %s - offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
- dir->i_ino, error, (page->index << PAGE_SHIFT) + offs,
+ "bad entry in directory #%lu: %s - offset=%lu, inode=%lu, rec_len=%zd, name_len=%d",
+ dir->i_ino, error, (folio->index << PAGE_SHIFT) + offs,
(unsigned long)le64_to_cpu(p->inode),
rec_len, p->name_len);
goto fail;
@@ -179,29 +171,34 @@ Eend:
p = (struct nilfs_dir_entry *)(kaddr + offs);
nilfs_error(sb,
"entry in directory #%lu spans the page boundary offset=%lu, inode=%lu",
- dir->i_ino, (page->index << PAGE_SHIFT) + offs,
+ dir->i_ino, (folio->index << PAGE_SHIFT) + offs,
(unsigned long)le64_to_cpu(p->inode));
fail:
- SetPageError(page);
+ folio_set_error(folio);
return false;
}
-static struct page *nilfs_get_page(struct inode *dir, unsigned long n)
+static void *nilfs_get_folio(struct inode *dir, unsigned long n,
+ struct folio **foliop)
{
struct address_space *mapping = dir->i_mapping;
- struct page *page = read_mapping_page(mapping, n, NULL);
+ struct folio *folio = read_mapping_folio(mapping, n, NULL);
+ void *kaddr;
- if (!IS_ERR(page)) {
- kmap(page);
- if (unlikely(!PageChecked(page))) {
- if (!nilfs_check_page(page))
- goto fail;
- }
+ if (IS_ERR(folio))
+ return folio;
+
+ kaddr = kmap_local_folio(folio, 0);
+ if (unlikely(!folio_test_checked(folio))) {
+ if (!nilfs_check_folio(folio, kaddr))
+ goto fail;
}
- return page;
+
+ *foliop = folio;
+ return kaddr;
fail:
- nilfs_put_page(page);
+ folio_release_kmap(folio, kaddr);
return ERR_PTR(-EIO);
}
@@ -275,21 +272,21 @@ static int nilfs_readdir(struct file *file, struct dir_context *ctx)
for ( ; n < npages; n++, offset = 0) {
char *kaddr, *limit;
struct nilfs_dir_entry *de;
- struct page *page = nilfs_get_page(inode, n);
+ struct folio *folio;
- if (IS_ERR(page)) {
+ kaddr = nilfs_get_folio(inode, n, &folio);
+ if (IS_ERR(kaddr)) {
nilfs_error(sb, "bad page in #%lu", inode->i_ino);
ctx->pos += PAGE_SIZE - offset;
return -EIO;
}
- kaddr = page_address(page);
de = (struct nilfs_dir_entry *)(kaddr + offset);
limit = kaddr + nilfs_last_byte(inode, n) -
NILFS_DIR_REC_LEN(1);
for ( ; (char *)de <= limit; de = nilfs_next_entry(de)) {
if (de->rec_len == 0) {
nilfs_error(sb, "zero-length directory entry");
- nilfs_put_page(page);
+ folio_release_kmap(folio, kaddr);
return -EIO;
}
if (de->inode) {
@@ -302,72 +299,67 @@ static int nilfs_readdir(struct file *file, struct dir_context *ctx)
if (!dir_emit(ctx, de->name, de->name_len,
le64_to_cpu(de->inode), t)) {
- nilfs_put_page(page);
+ folio_release_kmap(folio, kaddr);
return 0;
}
}
ctx->pos += nilfs_rec_len_from_disk(de->rec_len);
}
- nilfs_put_page(page);
+ folio_release_kmap(folio, kaddr);
}
return 0;
}
/*
- * nilfs_find_entry()
+ * nilfs_find_entry()
+ *
+ * Finds an entry in the specified directory with the wanted name. It
+ * returns the folio in which the entry was found, and the entry itself.
+ * The folio is mapped and unlocked. When the caller is finished with
+ * the entry, it should call folio_release_kmap().
*
- * finds an entry in the specified directory with the wanted name. It
- * returns the page in which the entry was found, and the entry itself
- * (as a parameter - res_dir). Page is returned mapped and unlocked.
- * Entry is guaranteed to be valid.
+ * On failure, returns NULL and the caller should ignore foliop.
*/
-struct nilfs_dir_entry *
-nilfs_find_entry(struct inode *dir, const struct qstr *qstr,
- struct page **res_page)
+struct nilfs_dir_entry *nilfs_find_entry(struct inode *dir,
+ const struct qstr *qstr, struct folio **foliop)
{
const unsigned char *name = qstr->name;
int namelen = qstr->len;
unsigned int reclen = NILFS_DIR_REC_LEN(namelen);
unsigned long start, n;
unsigned long npages = dir_pages(dir);
- struct page *page = NULL;
struct nilfs_inode_info *ei = NILFS_I(dir);
struct nilfs_dir_entry *de;
if (npages == 0)
goto out;
- /* OFFSET_CACHE */
- *res_page = NULL;
-
start = ei->i_dir_start_lookup;
if (start >= npages)
start = 0;
n = start;
do {
- char *kaddr;
+ char *kaddr = nilfs_get_folio(dir, n, foliop);
- page = nilfs_get_page(dir, n);
- if (!IS_ERR(page)) {
- kaddr = page_address(page);
+ if (!IS_ERR(kaddr)) {
de = (struct nilfs_dir_entry *)kaddr;
kaddr += nilfs_last_byte(dir, n) - reclen;
while ((char *) de <= kaddr) {
if (de->rec_len == 0) {
nilfs_error(dir->i_sb,
"zero-length directory entry");
- nilfs_put_page(page);
+ folio_release_kmap(*foliop, kaddr);
goto out;
}
if (nilfs_match(namelen, name, de))
goto found;
de = nilfs_next_entry(de);
}
- nilfs_put_page(page);
+ folio_release_kmap(*foliop, kaddr);
}
if (++n >= npages)
n = 0;
- /* next page is past the blocks we've got */
+ /* next folio is past the blocks we've got */
if (unlikely(n > (dir->i_blocks >> (PAGE_SHIFT - 9)))) {
nilfs_error(dir->i_sb,
"dir %lu size %lld exceeds block count %llu",
@@ -380,55 +372,47 @@ out:
return NULL;
found:
- *res_page = page;
ei->i_dir_start_lookup = n;
return de;
}
-struct nilfs_dir_entry *nilfs_dotdot(struct inode *dir, struct page **p)
+struct nilfs_dir_entry *nilfs_dotdot(struct inode *dir, struct folio **foliop)
{
- struct page *page = nilfs_get_page(dir, 0);
- struct nilfs_dir_entry *de = NULL;
+ struct nilfs_dir_entry *de = nilfs_get_folio(dir, 0, foliop);
- if (!IS_ERR(page)) {
- de = nilfs_next_entry(
- (struct nilfs_dir_entry *)page_address(page));
- *p = page;
- }
- return de;
+ if (IS_ERR(de))
+ return NULL;
+ return nilfs_next_entry(de);
}
ino_t nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr)
{
ino_t res = 0;
struct nilfs_dir_entry *de;
- struct page *page;
+ struct folio *folio;
- de = nilfs_find_entry(dir, qstr, &page);
+ de = nilfs_find_entry(dir, qstr, &folio);
if (de) {
res = le64_to_cpu(de->inode);
- kunmap(page);
- put_page(page);
+ folio_release_kmap(folio, de);
}
return res;
}
-/* Releases the page */
void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de,
- struct page *page, struct inode *inode)
+ struct folio *folio, struct inode *inode)
{
- unsigned int from = (char *)de - (char *)page_address(page);
- unsigned int to = from + nilfs_rec_len_from_disk(de->rec_len);
- struct address_space *mapping = page->mapping;
+ size_t from = offset_in_folio(folio, de);
+ size_t to = from + nilfs_rec_len_from_disk(de->rec_len);
+ struct address_space *mapping = folio->mapping;
int err;
- lock_page(page);
- err = nilfs_prepare_chunk(page, from, to);
+ folio_lock(folio);
+ err = nilfs_prepare_chunk(folio, from, to);
BUG_ON(err);
de->inode = cpu_to_le64(inode->i_ino);
nilfs_set_de_type(de, inode);
- nilfs_commit_chunk(page, mapping, from, to);
- nilfs_put_page(page);
+ nilfs_commit_chunk(folio, mapping, from, to);
inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
}
@@ -443,31 +427,28 @@ int nilfs_add_link(struct dentry *dentry, struct inode *inode)
unsigned int chunk_size = nilfs_chunk_size(dir);
unsigned int reclen = NILFS_DIR_REC_LEN(namelen);
unsigned short rec_len, name_len;
- struct page *page = NULL;
+ struct folio *folio = NULL;
struct nilfs_dir_entry *de;
unsigned long npages = dir_pages(dir);
unsigned long n;
- char *kaddr;
- unsigned int from, to;
+ size_t from, to;
int err;
/*
* We take care of directory expansion in the same loop.
- * This code plays outside i_size, so it locks the page
+ * This code plays outside i_size, so it locks the folio
* to protect that region.
*/
for (n = 0; n <= npages; n++) {
+ char *kaddr = nilfs_get_folio(dir, n, &folio);
char *dir_end;
- page = nilfs_get_page(dir, n);
- err = PTR_ERR(page);
- if (IS_ERR(page))
- goto out;
- lock_page(page);
- kaddr = page_address(page);
+ if (IS_ERR(kaddr))
+ return PTR_ERR(kaddr);
+ folio_lock(folio);
dir_end = kaddr + nilfs_last_byte(dir, n);
de = (struct nilfs_dir_entry *)kaddr;
- kaddr += PAGE_SIZE - reclen;
+ kaddr += folio_size(folio) - reclen;
while ((char *)de <= kaddr) {
if ((char *)de == dir_end) {
/* We hit i_size */
@@ -494,16 +475,16 @@ int nilfs_add_link(struct dentry *dentry, struct inode *inode)
goto got_it;
de = (struct nilfs_dir_entry *)((char *)de + rec_len);
}
- unlock_page(page);
- nilfs_put_page(page);
+ folio_unlock(folio);
+ folio_release_kmap(folio, kaddr);
}
BUG();
return -EINVAL;
got_it:
- from = (char *)de - (char *)page_address(page);
+ from = offset_in_folio(folio, de);
to = from + rec_len;
- err = nilfs_prepare_chunk(page, from, to);
+ err = nilfs_prepare_chunk(folio, from, to);
if (err)
goto out_unlock;
if (de->inode) {
@@ -518,29 +499,28 @@ got_it:
memcpy(de->name, name, namelen);
de->inode = cpu_to_le64(inode->i_ino);
nilfs_set_de_type(de, inode);
- nilfs_commit_chunk(page, page->mapping, from, to);
+ nilfs_commit_chunk(folio, folio->mapping, from, to);
inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
nilfs_mark_inode_dirty(dir);
/* OFFSET_CACHE */
out_put:
- nilfs_put_page(page);
-out:
+ folio_release_kmap(folio, de);
return err;
out_unlock:
- unlock_page(page);
+ folio_unlock(folio);
goto out_put;
}
/*
* nilfs_delete_entry deletes a directory entry by merging it with the
- * previous entry. Page is up-to-date. Releases the page.
+ * previous entry. Folio is up-to-date.
*/
-int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page)
+int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct folio *folio)
{
- struct address_space *mapping = page->mapping;
+ struct address_space *mapping = folio->mapping;
struct inode *inode = mapping->host;
- char *kaddr = page_address(page);
- unsigned int from, to;
+ char *kaddr = (char *)((unsigned long)dir & ~(folio_size(folio) - 1));
+ size_t from, to;
struct nilfs_dir_entry *de, *pde = NULL;
int err;
@@ -559,17 +539,16 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page)
de = nilfs_next_entry(de);
}
if (pde)
- from = (char *)pde - (char *)page_address(page);
- lock_page(page);
- err = nilfs_prepare_chunk(page, from, to);
+ from = (char *)pde - kaddr;
+ folio_lock(folio);
+ err = nilfs_prepare_chunk(folio, from, to);
BUG_ON(err);
if (pde)
pde->rec_len = nilfs_rec_len_to_disk(to - from);
dir->inode = 0;
- nilfs_commit_chunk(page, mapping, from, to);
+ nilfs_commit_chunk(folio, mapping, from, to);
inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
out:
- nilfs_put_page(page);
return err;
}
@@ -579,21 +558,21 @@ out:
int nilfs_make_empty(struct inode *inode, struct inode *parent)
{
struct address_space *mapping = inode->i_mapping;
- struct page *page = grab_cache_page(mapping, 0);
+ struct folio *folio = filemap_grab_folio(mapping, 0);
unsigned int chunk_size = nilfs_chunk_size(inode);
struct nilfs_dir_entry *de;
int err;
void *kaddr;
- if (!page)
- return -ENOMEM;
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
- err = nilfs_prepare_chunk(page, 0, chunk_size);
+ err = nilfs_prepare_chunk(folio, 0, chunk_size);
if (unlikely(err)) {
- unlock_page(page);
+ folio_unlock(folio);
goto fail;
}
- kaddr = kmap_atomic(page);
+ kaddr = kmap_local_folio(folio, 0);
memset(kaddr, 0, chunk_size);
de = (struct nilfs_dir_entry *)kaddr;
de->name_len = 1;
@@ -608,10 +587,10 @@ int nilfs_make_empty(struct inode *inode, struct inode *parent)
de->inode = cpu_to_le64(parent->i_ino);
memcpy(de->name, "..\0", 4);
nilfs_set_de_type(de, inode);
- kunmap_atomic(kaddr);
- nilfs_commit_chunk(page, mapping, 0, chunk_size);
+ kunmap_local(kaddr);
+ nilfs_commit_chunk(folio, mapping, 0, chunk_size);
fail:
- put_page(page);
+ folio_put(folio);
return err;
}
@@ -620,18 +599,17 @@ fail:
*/
int nilfs_empty_dir(struct inode *inode)
{
- struct page *page = NULL;
+ struct folio *folio = NULL;
+ char *kaddr;
unsigned long i, npages = dir_pages(inode);
for (i = 0; i < npages; i++) {
- char *kaddr;
struct nilfs_dir_entry *de;
- page = nilfs_get_page(inode, i);
- if (IS_ERR(page))
+ kaddr = nilfs_get_folio(inode, i, &folio);
+ if (IS_ERR(kaddr))
continue;
- kaddr = page_address(page);
de = (struct nilfs_dir_entry *)kaddr;
kaddr += nilfs_last_byte(inode, i) - NILFS_DIR_REC_LEN(1);
@@ -657,12 +635,12 @@ int nilfs_empty_dir(struct inode *inode)
}
de = nilfs_next_entry(de);
}
- nilfs_put_page(page);
+ folio_release_kmap(folio, kaddr);
}
return 1;
not_empty:
- nilfs_put_page(page);
+ folio_release_kmap(folio, kaddr);
return 0;
}
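After this rewrite, nilfs_get_folio() returns the kmapped address and passes the folio back through an out parameter, and all users unwind with folio_release_kmap(), which pairs kunmap_local() with folio_put(). A sketch of the new calling convention (my_scan_dir_block is illustrative; nilfs_get_folio() is the static helper introduced above):

    /* Sketch: errors travel in the returned address, not the folio. */
    static int my_scan_dir_block(struct inode *dir, unsigned long n)
    {
            struct folio *folio;
            char *kaddr = nilfs_get_folio(dir, n, &folio);

            if (IS_ERR(kaddr))
                    return PTR_ERR(kaddr);
            /* ... walk entries between kaddr and the block's last byte ... */
            folio_release_kmap(folio, kaddr);
            return 0;
    }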
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 740ce26d1e76..bec33b89a075 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -45,34 +45,36 @@ int nilfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
static vm_fault_t nilfs_page_mkwrite(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
- struct page *page = vmf->page;
+ struct folio *folio = page_folio(vmf->page);
struct inode *inode = file_inode(vma->vm_file);
struct nilfs_transaction_info ti;
+ struct buffer_head *bh, *head;
int ret = 0;
if (unlikely(nilfs_near_disk_full(inode->i_sb->s_fs_info)))
return VM_FAULT_SIGBUS; /* -ENOSPC */
sb_start_pagefault(inode->i_sb);
- lock_page(page);
- if (page->mapping != inode->i_mapping ||
- page_offset(page) >= i_size_read(inode) || !PageUptodate(page)) {
- unlock_page(page);
+ folio_lock(folio);
+ if (folio->mapping != inode->i_mapping ||
+ folio_pos(folio) >= i_size_read(inode) ||
+ !folio_test_uptodate(folio)) {
+ folio_unlock(folio);
ret = -EFAULT; /* make the VM retry the fault */
goto out;
}
/*
- * check to see if the page is mapped already (no holes)
+ * check to see if the folio is mapped already (no holes)
*/
- if (PageMappedToDisk(page))
+ if (folio_test_mappedtodisk(folio))
goto mapped;
- if (page_has_buffers(page)) {
- struct buffer_head *bh, *head;
+ head = folio_buffers(folio);
+ if (head) {
int fully_mapped = 1;
- bh = head = page_buffers(page);
+ bh = head;
do {
if (!buffer_mapped(bh)) {
fully_mapped = 0;
@@ -81,11 +83,11 @@ static vm_fault_t nilfs_page_mkwrite(struct vm_fault *vmf)
} while (bh = bh->b_this_page, bh != head);
if (fully_mapped) {
- SetPageMappedToDisk(page);
+ folio_set_mappedtodisk(folio);
goto mapped;
}
}
- unlock_page(page);
+ folio_unlock(folio);
/*
* fill hole blocks
@@ -105,7 +107,7 @@ static vm_fault_t nilfs_page_mkwrite(struct vm_fault *vmf)
nilfs_transaction_commit(inode->i_sb);
mapped:
- wait_for_stable_page(page);
+ folio_wait_stable(folio);
out:
sb_end_pagefault(inode->i_sb);
return vmf_fs_error(ret);
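The mkwrite hunk shows the standard replacement for page_has_buffers()/page_buffers(): folio_buffers() returns the head buffer or NULL, and the buffer walk itself is unchanged. A condensed sketch of the check performed above:

    #include <linux/buffer_head.h>

    /* Sketch: true only when the folio has buffers and all are mapped. */
    static bool my_folio_fully_mapped(struct folio *folio)
    {
            struct buffer_head *bh, *head = folio_buffers(folio);

            if (!head)
                    return false;
            bh = head;
            do {
                    if (!buffer_mapped(bh))
                            return false;
            } while (bh = bh->b_this_page, bh != head);
            return true;
    }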
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 8beb2730929d..bf9a11d58817 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -98,8 +98,8 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff,
*out_bh = bh;
failed:
- unlock_page(bh->b_page);
- put_page(bh->b_page);
+ folio_unlock(bh->b_folio);
+ folio_put(bh->b_folio);
if (unlikely(err))
brelse(bh);
return err;
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index f861f3a0bf5c..9c334c722fc1 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -175,7 +175,8 @@ static int nilfs_writepages(struct address_space *mapping,
static int nilfs_writepage(struct page *page, struct writeback_control *wbc)
{
- struct inode *inode = page->mapping->host;
+ struct folio *folio = page_folio(page);
+ struct inode *inode = folio->mapping->host;
int err;
if (sb_rdonly(inode->i_sb)) {
@@ -185,13 +186,13 @@ static int nilfs_writepage(struct page *page, struct writeback_control *wbc)
* have dirty pages that try to be flushed in background.
* So, here we simply discard this dirty page.
*/
- nilfs_clear_dirty_page(page, false);
- unlock_page(page);
+ nilfs_clear_folio_dirty(folio, false);
+ folio_unlock(folio);
return -EROFS;
}
- redirty_page_for_writepage(wbc, page);
- unlock_page(page);
+ folio_redirty_for_writepage(wbc, folio);
+ folio_unlock(folio);
if (wbc->sync_mode == WB_SYNC_ALL) {
err = nilfs_construct_segment(inode->i_sb);
@@ -214,7 +215,7 @@ static bool nilfs_dirty_folio(struct address_space *mapping,
/*
* The page may not be locked, eg if called from try_to_unmap_one()
*/
- spin_lock(&mapping->private_lock);
+ spin_lock(&mapping->i_private_lock);
head = folio_buffers(folio);
if (head) {
struct buffer_head *bh = head;
@@ -230,7 +231,7 @@ static bool nilfs_dirty_folio(struct address_space *mapping,
} else if (ret) {
nr_dirty = 1 << (folio_shift(folio) - inode->i_blkbits);
}
- spin_unlock(&mapping->private_lock);
+ spin_unlock(&mapping->i_private_lock);
if (nr_dirty)
nilfs_set_file_dirty(inode, nr_dirty);
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 40ffade49f38..cfb6aca5ec38 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -872,16 +872,14 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
nsegs = argv[4].v_nmembs;
if (argv[4].v_size != argsz[4])
goto out;
- if (nsegs > UINT_MAX / sizeof(__u64))
- goto out;
/*
* argv[4] points to segment numbers this ioctl cleans. We
- * use kmalloc() for its buffer because memory used for the
- * segment numbers is enough small.
+ * use kmalloc() for its buffer because the memory used for the
+ * segment numbers is small enough.
*/
- kbufs[4] = memdup_user((void __user *)(unsigned long)argv[4].v_base,
- nsegs * sizeof(__u64));
+ kbufs[4] = memdup_array_user((void __user *)(unsigned long)argv[4].v_base,
+ nsegs, sizeof(__u64));
if (IS_ERR(kbufs[4])) {
ret = PTR_ERR(kbufs[4]);
goto out;
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index c97c77a39668..e45c01a559c0 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -97,8 +97,8 @@ static int nilfs_mdt_create_block(struct inode *inode, unsigned long block,
}
failed_bh:
- unlock_page(bh->b_page);
- put_page(bh->b_page);
+ folio_unlock(bh->b_folio);
+ folio_put(bh->b_folio);
brelse(bh);
failed_unlock:
@@ -158,8 +158,8 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff, blk_opf_t opf,
*out_bh = bh;
failed_bh:
- unlock_page(bh->b_page);
- put_page(bh->b_page);
+ folio_unlock(bh->b_folio);
+ folio_put(bh->b_folio);
brelse(bh);
failed:
return ret;
@@ -399,7 +399,8 @@ int nilfs_mdt_fetch_dirty(struct inode *inode)
static int
nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
{
- struct inode *inode = page->mapping->host;
+ struct folio *folio = page_folio(page);
+ struct inode *inode = folio->mapping->host;
struct super_block *sb;
int err = 0;
@@ -407,16 +408,16 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
/*
* It means that filesystem was remounted in read-only
* mode because of error or metadata corruption. But we
- * have dirty pages that try to be flushed in background.
- * So, here we simply discard this dirty page.
+ * have dirty folios that try to be flushed in the background.
+ * So, here we simply discard this dirty folio.
*/
- nilfs_clear_dirty_page(page, false);
- unlock_page(page);
+ nilfs_clear_folio_dirty(folio, false);
+ folio_unlock(folio);
return -EROFS;
}
- redirty_page_for_writepage(wbc, page);
- unlock_page(page);
+ folio_redirty_for_writepage(wbc, folio);
+ folio_unlock(folio);
if (!inode)
return 0;
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 2a4e7f4a8102..959bd9fb3d81 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -260,11 +260,11 @@ static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry)
{
struct inode *inode;
struct nilfs_dir_entry *de;
- struct page *page;
+ struct folio *folio;
int err;
err = -ENOENT;
- de = nilfs_find_entry(dir, &dentry->d_name, &page);
+ de = nilfs_find_entry(dir, &dentry->d_name, &folio);
if (!de)
goto out;
@@ -279,7 +279,8 @@ static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry)
inode->i_ino, inode->i_nlink);
set_nlink(inode, 1);
}
- err = nilfs_delete_entry(de, page);
+ err = nilfs_delete_entry(de, folio);
+ folio_release_kmap(folio, de);
if (err)
goto out;
@@ -347,9 +348,9 @@ static int nilfs_rename(struct mnt_idmap *idmap,
{
struct inode *old_inode = d_inode(old_dentry);
struct inode *new_inode = d_inode(new_dentry);
- struct page *dir_page = NULL;
+ struct folio *dir_folio = NULL;
struct nilfs_dir_entry *dir_de = NULL;
- struct page *old_page;
+ struct folio *old_folio;
struct nilfs_dir_entry *old_de;
struct nilfs_transaction_info ti;
int err;
@@ -362,19 +363,19 @@ static int nilfs_rename(struct mnt_idmap *idmap,
return err;
err = -ENOENT;
- old_de = nilfs_find_entry(old_dir, &old_dentry->d_name, &old_page);
+ old_de = nilfs_find_entry(old_dir, &old_dentry->d_name, &old_folio);
if (!old_de)
goto out;
if (S_ISDIR(old_inode->i_mode)) {
err = -EIO;
- dir_de = nilfs_dotdot(old_inode, &dir_page);
+ dir_de = nilfs_dotdot(old_inode, &dir_folio);
if (!dir_de)
goto out_old;
}
if (new_inode) {
- struct page *new_page;
+ struct folio *new_folio;
struct nilfs_dir_entry *new_de;
err = -ENOTEMPTY;
@@ -382,10 +383,11 @@ static int nilfs_rename(struct mnt_idmap *idmap,
goto out_dir;
err = -ENOENT;
- new_de = nilfs_find_entry(new_dir, &new_dentry->d_name, &new_page);
+ new_de = nilfs_find_entry(new_dir, &new_dentry->d_name, &new_folio);
if (!new_de)
goto out_dir;
- nilfs_set_link(new_dir, new_de, new_page, old_inode);
+ nilfs_set_link(new_dir, new_de, new_folio, old_inode);
+ folio_release_kmap(new_folio, new_de);
nilfs_mark_inode_dirty(new_dir);
inode_set_ctime_current(new_inode);
if (dir_de)
@@ -408,12 +410,15 @@ static int nilfs_rename(struct mnt_idmap *idmap,
*/
inode_set_ctime_current(old_inode);
- nilfs_delete_entry(old_de, old_page);
+ nilfs_delete_entry(old_de, old_folio);
if (dir_de) {
- nilfs_set_link(old_inode, dir_de, dir_page, new_dir);
+ nilfs_set_link(old_inode, dir_de, dir_folio, new_dir);
+ folio_release_kmap(dir_folio, dir_de);
drop_nlink(old_dir);
}
+ folio_release_kmap(old_folio, old_de);
+
nilfs_mark_inode_dirty(old_dir);
nilfs_mark_inode_dirty(old_inode);
@@ -421,13 +426,10 @@ static int nilfs_rename(struct mnt_idmap *idmap,
return err;
out_dir:
- if (dir_de) {
- kunmap(dir_page);
- put_page(dir_page);
- }
+ if (dir_de)
+ folio_release_kmap(dir_folio, dir_de);
out_old:
- kunmap(old_page);
- put_page(old_page);
+ folio_release_kmap(old_folio, old_de);
out:
nilfs_transaction_abort(old_dir->i_sb);
return err;
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 8046490cd7fe..98cffaf0ac12 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -226,16 +226,16 @@ static inline __u32 nilfs_mask_flags(umode_t mode, __u32 flags)
}
/* dir.c */
-extern int nilfs_add_link(struct dentry *, struct inode *);
-extern ino_t nilfs_inode_by_name(struct inode *, const struct qstr *);
-extern int nilfs_make_empty(struct inode *, struct inode *);
-extern struct nilfs_dir_entry *
-nilfs_find_entry(struct inode *, const struct qstr *, struct page **);
-extern int nilfs_delete_entry(struct nilfs_dir_entry *, struct page *);
-extern int nilfs_empty_dir(struct inode *);
-extern struct nilfs_dir_entry *nilfs_dotdot(struct inode *, struct page **);
-extern void nilfs_set_link(struct inode *, struct nilfs_dir_entry *,
- struct page *, struct inode *);
+int nilfs_add_link(struct dentry *, struct inode *);
+ino_t nilfs_inode_by_name(struct inode *, const struct qstr *);
+int nilfs_make_empty(struct inode *, struct inode *);
+struct nilfs_dir_entry *nilfs_find_entry(struct inode *, const struct qstr *,
+ struct folio **);
+int nilfs_delete_entry(struct nilfs_dir_entry *, struct folio *);
+int nilfs_empty_dir(struct inode *);
+struct nilfs_dir_entry *nilfs_dotdot(struct inode *, struct folio **);
+void nilfs_set_link(struct inode *, struct nilfs_dir_entry *,
+ struct folio *, struct inode *);
/* file.c */
extern int nilfs_sync_file(struct file *, loff_t, loff_t, int);
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 06b04758f289..5c2eba1987bd 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -73,7 +73,7 @@ struct buffer_head *nilfs_grab_buffer(struct inode *inode,
*/
void nilfs_forget_buffer(struct buffer_head *bh)
{
- struct page *page = bh->b_page;
+ struct folio *folio = bh->b_folio;
const unsigned long clear_bits =
(BIT(BH_Uptodate) | BIT(BH_Dirty) | BIT(BH_Mapped) |
BIT(BH_Async_Write) | BIT(BH_NILFS_Volatile) |
@@ -81,12 +81,12 @@ void nilfs_forget_buffer(struct buffer_head *bh)
lock_buffer(bh);
set_mask_bits(&bh->b_state, clear_bits, 0);
- if (nilfs_page_buffers_clean(page))
- __nilfs_clear_page_dirty(page);
+ if (nilfs_folio_buffers_clean(folio))
+ __nilfs_clear_folio_dirty(folio);
bh->b_blocknr = -1;
- ClearPageUptodate(page);
- ClearPageMappedToDisk(page);
+ folio_clear_uptodate(folio);
+ folio_clear_mappedtodisk(folio);
unlock_buffer(bh);
brelse(bh);
}
@@ -131,48 +131,49 @@ void nilfs_copy_buffer(struct buffer_head *dbh, struct buffer_head *sbh)
}
/**
- * nilfs_page_buffers_clean - check if a page has dirty buffers or not.
- * @page: page to be checked
+ * nilfs_folio_buffers_clean - Check if a folio has dirty buffers or not.
+ * @folio: Folio to be checked.
*
- * nilfs_page_buffers_clean() returns zero if the page has dirty buffers.
- * Otherwise, it returns non-zero value.
+ * nilfs_folio_buffers_clean() returns false if the folio has dirty buffers.
+ * Otherwise, it returns true.
*/
-int nilfs_page_buffers_clean(struct page *page)
+bool nilfs_folio_buffers_clean(struct folio *folio)
{
struct buffer_head *bh, *head;
- bh = head = page_buffers(page);
+ bh = head = folio_buffers(folio);
do {
if (buffer_dirty(bh))
- return 0;
+ return false;
bh = bh->b_this_page;
} while (bh != head);
- return 1;
+ return true;
}
-void nilfs_page_bug(struct page *page)
+void nilfs_folio_bug(struct folio *folio)
{
+ struct buffer_head *bh, *head;
struct address_space *m;
unsigned long ino;
- if (unlikely(!page)) {
- printk(KERN_CRIT "NILFS_PAGE_BUG(NULL)\n");
+ if (unlikely(!folio)) {
+ printk(KERN_CRIT "NILFS_FOLIO_BUG(NULL)\n");
return;
}
- m = page->mapping;
+ m = folio->mapping;
ino = m ? m->host->i_ino : 0;
- printk(KERN_CRIT "NILFS_PAGE_BUG(%p): cnt=%d index#=%llu flags=0x%lx "
+ printk(KERN_CRIT "NILFS_FOLIO_BUG(%p): cnt=%d index#=%llu flags=0x%lx "
"mapping=%p ino=%lu\n",
- page, page_ref_count(page),
- (unsigned long long)page->index, page->flags, m, ino);
+ folio, folio_ref_count(folio),
+ (unsigned long long)folio->index, folio->flags, m, ino);
- if (page_has_buffers(page)) {
- struct buffer_head *bh, *head;
+ head = folio_buffers(folio);
+ if (head) {
int i = 0;
- bh = head = page_buffers(page);
+ bh = head;
do {
printk(KERN_CRIT
" BH[%d] %p: cnt=%d block#=%llu state=0x%lx\n",
@@ -258,7 +259,7 @@ repeat:
folio_lock(folio);
if (unlikely(!folio_test_dirty(folio)))
- NILFS_PAGE_BUG(&folio->page, "inconsistent dirty state");
+ NILFS_FOLIO_BUG(folio, "inconsistent dirty state");
dfolio = filemap_grab_folio(dmap, folio->index);
if (unlikely(IS_ERR(dfolio))) {
@@ -268,7 +269,7 @@ repeat:
break;
}
if (unlikely(!folio_buffers(folio)))
- NILFS_PAGE_BUG(&folio->page,
+ NILFS_FOLIO_BUG(folio,
"found empty page in dat page cache");
nilfs_copy_folio(dfolio, folio, true);
@@ -379,7 +380,7 @@ void nilfs_clear_dirty_pages(struct address_space *mapping, bool silent)
* was acquired. Skip processing in that case.
*/
if (likely(folio->mapping == mapping))
- nilfs_clear_dirty_page(&folio->page, silent);
+ nilfs_clear_folio_dirty(folio, silent);
folio_unlock(folio);
}
@@ -389,32 +390,33 @@ void nilfs_clear_dirty_pages(struct address_space *mapping, bool silent)
}
/**
- * nilfs_clear_dirty_page - discard dirty page
- * @page: dirty page that will be discarded
+ * nilfs_clear_folio_dirty - discard dirty folio
+ * @folio: dirty folio that will be discarded
* @silent: suppress [true] or print [false] warning messages
*/
-void nilfs_clear_dirty_page(struct page *page, bool silent)
+void nilfs_clear_folio_dirty(struct folio *folio, bool silent)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
struct super_block *sb = inode->i_sb;
+ struct buffer_head *bh, *head;
- BUG_ON(!PageLocked(page));
+ BUG_ON(!folio_test_locked(folio));
if (!silent)
nilfs_warn(sb, "discard dirty page: offset=%lld, ino=%lu",
- page_offset(page), inode->i_ino);
+ folio_pos(folio), inode->i_ino);
- ClearPageUptodate(page);
- ClearPageMappedToDisk(page);
+ folio_clear_uptodate(folio);
+ folio_clear_mappedtodisk(folio);
- if (page_has_buffers(page)) {
- struct buffer_head *bh, *head;
+ head = folio_buffers(folio);
+ if (head) {
const unsigned long clear_bits =
(BIT(BH_Uptodate) | BIT(BH_Dirty) | BIT(BH_Mapped) |
BIT(BH_Async_Write) | BIT(BH_NILFS_Volatile) |
BIT(BH_NILFS_Checked) | BIT(BH_NILFS_Redirected));
- bh = head = page_buffers(page);
+ bh = head;
do {
lock_buffer(bh);
if (!silent)
@@ -427,7 +429,7 @@ void nilfs_clear_dirty_page(struct page *page, bool silent)
} while (bh = bh->b_this_page, bh != head);
}
- __nilfs_clear_page_dirty(page);
+ __nilfs_clear_folio_dirty(folio);
}
unsigned int nilfs_page_count_clean_buffers(struct page *page,
@@ -457,22 +459,23 @@ unsigned int nilfs_page_count_clean_buffers(struct page *page,
* 2) Some B-tree operations like insertion or deletion may dispose buffers
* in dirty state, and this needs to cancel the dirty state of their pages.
*/
-int __nilfs_clear_page_dirty(struct page *page)
+void __nilfs_clear_folio_dirty(struct folio *folio)
{
- struct address_space *mapping = page->mapping;
+ struct address_space *mapping = folio->mapping;
if (mapping) {
xa_lock_irq(&mapping->i_pages);
- if (test_bit(PG_dirty, &page->flags)) {
- __xa_clear_mark(&mapping->i_pages, page_index(page),
+ if (folio_test_dirty(folio)) {
+ __xa_clear_mark(&mapping->i_pages, folio->index,
PAGECACHE_TAG_DIRTY);
xa_unlock_irq(&mapping->i_pages);
- return clear_page_dirty_for_io(page);
+ folio_clear_dirty_for_io(folio);
+ return;
}
xa_unlock_irq(&mapping->i_pages);
- return 0;
+ return;
}
- return TestClearPageDirty(page);
+ folio_clear_dirty(folio);
}
/**
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
index d249ea1cefff..7e1a2c455a10 100644
--- a/fs/nilfs2/page.h
+++ b/fs/nilfs2/page.h
@@ -30,18 +30,18 @@ BUFFER_FNS(NILFS_Checked, nilfs_checked) /* buffer is verified */
BUFFER_FNS(NILFS_Redirected, nilfs_redirected) /* redirected to a copy */
-int __nilfs_clear_page_dirty(struct page *);
+void __nilfs_clear_folio_dirty(struct folio *);
struct buffer_head *nilfs_grab_buffer(struct inode *, struct address_space *,
unsigned long, unsigned long);
void nilfs_forget_buffer(struct buffer_head *);
void nilfs_copy_buffer(struct buffer_head *, struct buffer_head *);
-int nilfs_page_buffers_clean(struct page *);
-void nilfs_page_bug(struct page *);
+bool nilfs_folio_buffers_clean(struct folio *);
+void nilfs_folio_bug(struct folio *);
int nilfs_copy_dirty_pages(struct address_space *, struct address_space *);
void nilfs_copy_back_pages(struct address_space *, struct address_space *);
-void nilfs_clear_dirty_page(struct page *, bool);
+void nilfs_clear_folio_dirty(struct folio *, bool);
void nilfs_clear_dirty_pages(struct address_space *, bool);
unsigned int nilfs_page_count_clean_buffers(struct page *, unsigned int,
unsigned int);
@@ -49,7 +49,7 @@ unsigned long nilfs_find_uncommitted_extent(struct inode *inode,
sector_t start_blk,
sector_t *blkoff);
-#define NILFS_PAGE_BUG(page, m, a...) \
- do { nilfs_page_bug(page); BUG(); } while (0)
+#define NILFS_FOLIO_BUG(folio, m, a...) \
+ do { nilfs_folio_bug(folio); BUG(); } while (0)
#endif /* _NILFS_PAGE_H */
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 55e31cc903d1..2590a0860eab 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1665,39 +1665,39 @@ static int nilfs_segctor_assign(struct nilfs_sc_info *sci, int mode)
return 0;
}
-static void nilfs_begin_page_io(struct page *page)
+static void nilfs_begin_folio_io(struct folio *folio)
{
- if (!page || PageWriteback(page))
+ if (!folio || folio_test_writeback(folio))
/*
* For split b-tree node pages, this function may be called
* twice. We ignore the 2nd or later calls by this check.
*/
return;
- lock_page(page);
- clear_page_dirty_for_io(page);
- set_page_writeback(page);
- unlock_page(page);
+ folio_lock(folio);
+ folio_clear_dirty_for_io(folio);
+ folio_start_writeback(folio);
+ folio_unlock(folio);
}
static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci)
{
struct nilfs_segment_buffer *segbuf;
- struct page *bd_page = NULL, *fs_page = NULL;
+ struct folio *bd_folio = NULL, *fs_folio = NULL;
list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
struct buffer_head *bh;
list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
b_assoc_buffers) {
- if (bh->b_page != bd_page) {
- if (bd_page) {
- lock_page(bd_page);
- clear_page_dirty_for_io(bd_page);
- set_page_writeback(bd_page);
- unlock_page(bd_page);
+ if (bh->b_folio != bd_folio) {
+ if (bd_folio) {
+ folio_lock(bd_folio);
+ folio_clear_dirty_for_io(bd_folio);
+ folio_start_writeback(bd_folio);
+ folio_unlock(bd_folio);
}
- bd_page = bh->b_page;
+ bd_folio = bh->b_folio;
}
}
@@ -1705,28 +1705,28 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci)
b_assoc_buffers) {
set_buffer_async_write(bh);
if (bh == segbuf->sb_super_root) {
- if (bh->b_page != bd_page) {
- lock_page(bd_page);
- clear_page_dirty_for_io(bd_page);
- set_page_writeback(bd_page);
- unlock_page(bd_page);
- bd_page = bh->b_page;
+ if (bh->b_folio != bd_folio) {
+ folio_lock(bd_folio);
+ folio_clear_dirty_for_io(bd_folio);
+ folio_start_writeback(bd_folio);
+ folio_unlock(bd_folio);
+ bd_folio = bh->b_folio;
}
break;
}
- if (bh->b_page != fs_page) {
- nilfs_begin_page_io(fs_page);
- fs_page = bh->b_page;
+ if (bh->b_folio != fs_folio) {
+ nilfs_begin_folio_io(fs_folio);
+ fs_folio = bh->b_folio;
}
}
}
- if (bd_page) {
- lock_page(bd_page);
- clear_page_dirty_for_io(bd_page);
- set_page_writeback(bd_page);
- unlock_page(bd_page);
+ if (bd_folio) {
+ folio_lock(bd_folio);
+ folio_clear_dirty_for_io(bd_folio);
+ folio_start_writeback(bd_folio);
+ folio_unlock(bd_folio);
}
- nilfs_begin_page_io(fs_page);
+ nilfs_begin_folio_io(fs_folio);
}
static int nilfs_segctor_write(struct nilfs_sc_info *sci,
@@ -1739,17 +1739,18 @@ static int nilfs_segctor_write(struct nilfs_sc_info *sci,
return ret;
}
-static void nilfs_end_page_io(struct page *page, int err)
+static void nilfs_end_folio_io(struct folio *folio, int err)
{
- if (!page)
+ if (!folio)
return;
- if (buffer_nilfs_node(page_buffers(page)) && !PageWriteback(page)) {
+ if (buffer_nilfs_node(folio_buffers(folio)) &&
+ !folio_test_writeback(folio)) {
/*
* For b-tree node pages, this function may be called twice
* or more because they might be split in a segment.
*/
- if (PageDirty(page)) {
+ if (folio_test_dirty(folio)) {
/*
* For pages holding split b-tree node buffers, dirty
* flag on the buffers may be cleared discretely.
@@ -1757,30 +1758,30 @@ static void nilfs_end_page_io(struct page *page, int err)
* remaining buffers, and it must be cancelled if
* all the buffers get cleaned later.
*/
- lock_page(page);
- if (nilfs_page_buffers_clean(page))
- __nilfs_clear_page_dirty(page);
- unlock_page(page);
+ folio_lock(folio);
+ if (nilfs_folio_buffers_clean(folio))
+ __nilfs_clear_folio_dirty(folio);
+ folio_unlock(folio);
}
return;
}
if (!err) {
- if (!nilfs_page_buffers_clean(page))
- __set_page_dirty_nobuffers(page);
- ClearPageError(page);
+ if (!nilfs_folio_buffers_clean(folio))
+ filemap_dirty_folio(folio->mapping, folio);
+ folio_clear_error(folio);
} else {
- __set_page_dirty_nobuffers(page);
- SetPageError(page);
+ filemap_dirty_folio(folio->mapping, folio);
+ folio_set_error(folio);
}
- end_page_writeback(page);
+ folio_end_writeback(folio);
}
static void nilfs_abort_logs(struct list_head *logs, int err)
{
struct nilfs_segment_buffer *segbuf;
- struct page *bd_page = NULL, *fs_page = NULL;
+ struct folio *bd_folio = NULL, *fs_folio = NULL;
struct buffer_head *bh;
if (list_empty(logs))
@@ -1790,10 +1791,10 @@ static void nilfs_abort_logs(struct list_head *logs, int err)
list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
b_assoc_buffers) {
clear_buffer_uptodate(bh);
- if (bh->b_page != bd_page) {
- if (bd_page)
- end_page_writeback(bd_page);
- bd_page = bh->b_page;
+ if (bh->b_folio != bd_folio) {
+ if (bd_folio)
+ folio_end_writeback(bd_folio);
+ bd_folio = bh->b_folio;
}
}
@@ -1802,22 +1803,22 @@ static void nilfs_abort_logs(struct list_head *logs, int err)
clear_buffer_async_write(bh);
if (bh == segbuf->sb_super_root) {
clear_buffer_uptodate(bh);
- if (bh->b_page != bd_page) {
- end_page_writeback(bd_page);
- bd_page = bh->b_page;
+ if (bh->b_folio != bd_folio) {
+ folio_end_writeback(bd_folio);
+ bd_folio = bh->b_folio;
}
break;
}
- if (bh->b_page != fs_page) {
- nilfs_end_page_io(fs_page, err);
- fs_page = bh->b_page;
+ if (bh->b_folio != fs_folio) {
+ nilfs_end_folio_io(fs_folio, err);
+ fs_folio = bh->b_folio;
}
}
}
- if (bd_page)
- end_page_writeback(bd_page);
+ if (bd_folio)
+ folio_end_writeback(bd_folio);
- nilfs_end_page_io(fs_page, err);
+ nilfs_end_folio_io(fs_folio, err);
}
static void nilfs_segctor_abort_construction(struct nilfs_sc_info *sci,
@@ -1859,7 +1860,7 @@ static void nilfs_set_next_segment(struct the_nilfs *nilfs,
static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
{
struct nilfs_segment_buffer *segbuf;
- struct page *bd_page = NULL, *fs_page = NULL;
+ struct folio *bd_folio = NULL, *fs_folio = NULL;
struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
int update_sr = false;
@@ -1870,21 +1871,21 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
b_assoc_buffers) {
set_buffer_uptodate(bh);
clear_buffer_dirty(bh);
- if (bh->b_page != bd_page) {
- if (bd_page)
- end_page_writeback(bd_page);
- bd_page = bh->b_page;
+ if (bh->b_folio != bd_folio) {
+ if (bd_folio)
+ folio_end_writeback(bd_folio);
+ bd_folio = bh->b_folio;
}
}
/*
- * We assume that the buffers which belong to the same page
+ * We assume that the buffers which belong to the same folio
* continue over the buffer list.
- * Under this assumption, the last BHs of pages is
- * identifiable by the discontinuity of bh->b_page
- * (page != fs_page).
+ * Under this assumption, the last BHs of folios are
+ * identifiable by the discontinuity of bh->b_folio
+ * (folio != fs_folio).
*
* For B-tree node blocks, however, this assumption is not
- * guaranteed. The cleanup code of B-tree node pages needs
+ * guaranteed. The cleanup code of B-tree node folios needs
* special care.
*/
list_for_each_entry(bh, &segbuf->sb_payload_buffers,
@@ -1897,16 +1898,16 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
set_mask_bits(&bh->b_state, clear_bits, set_bits);
if (bh == segbuf->sb_super_root) {
- if (bh->b_page != bd_page) {
- end_page_writeback(bd_page);
- bd_page = bh->b_page;
+ if (bh->b_folio != bd_folio) {
+ folio_end_writeback(bd_folio);
+ bd_folio = bh->b_folio;
}
update_sr = true;
break;
}
- if (bh->b_page != fs_page) {
- nilfs_end_page_io(fs_page, 0);
- fs_page = bh->b_page;
+ if (bh->b_folio != fs_folio) {
+ nilfs_end_folio_io(fs_folio, 0);
+ fs_folio = bh->b_folio;
}
}
@@ -1920,13 +1921,13 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
}
}
/*
- * Since pages may continue over multiple segment buffers,
- * end of the last page must be checked outside of the loop.
+ * Since folios may continue over multiple segment buffers,
+ * the end of the last folio must be checked outside of the loop.
*/
- if (bd_page)
- end_page_writeback(bd_page);
+ if (bd_folio)
+ folio_end_writeback(bd_folio);
- nilfs_end_page_io(fs_page, 0);
+ nilfs_end_folio_io(fs_folio, 0);
nilfs_drop_collected_inodes(&sci->sc_dirty_files);
@@ -2587,6 +2588,7 @@ static int nilfs_segctor_thread(void *arg)
"segctord starting. Construction interval = %lu seconds, CP frequency < %lu seconds",
sci->sc_interval / HZ, sci->sc_mjcp_freq / HZ);
+ set_freezable();
spin_lock(&sci->sc_state_lock);
loop:
for (;;) {
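The lone set_freezable() call marks segctord as a freezable kthread: kernel threads start with PF_NOFREEZE, so without it the freezer skips the thread during system suspend. A minimal sketch of the idiom, assuming the helpers from linux/freezer.h (my_worker is illustrative):

    #include <linux/freezer.h>
    #include <linux/kthread.h>

    /* Sketch: a kthread that participates in system suspend. */
    static int my_worker(void *arg)
    {
            set_freezable();
            while (!kthread_should_stop()) {
                    try_to_freeze();  /* park here while the system is frozen */
                    /* ... do one unit of work, then sleep ... */
            }
            return 0;
    }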
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 58ca7c936393..0a8119456c21 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -471,10 +471,15 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum,
kunmap_atomic(kaddr);
return;
}
- WARN_ON(nilfs_segment_usage_error(su));
- WARN_ON(!nilfs_segment_usage_dirty(su));
+ if (unlikely(nilfs_segment_usage_error(su)))
+ nilfs_warn(sufile->i_sb, "free segment %llu marked in error",
+ (unsigned long long)segnum);
sudirty = nilfs_segment_usage_dirty(su);
+ if (unlikely(!sudirty))
+ nilfs_warn(sufile->i_sb, "free unallocated segment %llu",
+ (unsigned long long)segnum);
+
nilfs_segment_usage_set_clean(su);
kunmap_atomic(kaddr);
mark_buffer_dirty(su_bh);
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index a5d1fa4e7552..df8674173b22 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1314,15 +1314,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags,
return ERR_CAST(s);
if (!s->s_root) {
- /*
- * We drop s_umount here because we need to open the bdev and
- * bdev->open_mutex ranks above s_umount (blkdev_put() ->
- * __invalidate_device()). It is safe because we have active sb
- * reference and SB_BORN is not set yet.
- */
- up_write(&s->s_umount);
err = setup_bdev_super(s, flags, NULL);
- down_write(&s->s_umount);
if (!err)
err = nilfs_fill_super(s, data,
flags & SB_SILENT ? 1 : 0);
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 9dac7f6e72d2..1e4def21811e 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -29,12 +29,6 @@ static unsigned int fanotify_hash_path(const struct path *path)
hash_ptr(path->mnt, FANOTIFY_EVENT_HASH_BITS);
}
-static inline bool fanotify_fsid_equal(__kernel_fsid_t *fsid1,
- __kernel_fsid_t *fsid2)
-{
- return fsid1->val[0] == fsid2->val[0] && fsid1->val[1] == fsid2->val[1];
-}
-
static unsigned int fanotify_hash_fsid(__kernel_fsid_t *fsid)
{
return hash_32(fsid->val[0], FANOTIFY_EVENT_HASH_BITS) ^
@@ -838,9 +832,8 @@ out:
}
/*
- * Get cached fsid of the filesystem containing the object from any connector.
- * All connectors are supposed to have the same fsid, but we do not verify that
- * here.
+ * Get cached fsid of the filesystem containing the object from any mark.
+ * All marks are supposed to have the same fsid, but we do not verify that here.
*/
static __kernel_fsid_t fanotify_get_fsid(struct fsnotify_iter_info *iter_info)
{
@@ -849,18 +842,11 @@ static __kernel_fsid_t fanotify_get_fsid(struct fsnotify_iter_info *iter_info)
__kernel_fsid_t fsid = {};
fsnotify_foreach_iter_mark_type(iter_info, mark, type) {
- struct fsnotify_mark_connector *conn;
-
- conn = READ_ONCE(mark->connector);
- /* Mark is just getting destroyed or created? */
- if (!conn)
+ if (!(mark->flags & FSNOTIFY_MARK_FLAG_HAS_FSID))
continue;
- if (!(conn->flags & FSNOTIFY_CONN_FLAG_HAS_FSID))
- continue;
- /* Pairs with smp_wmb() in fsnotify_add_mark_list() */
- smp_rmb();
- fsid = conn->fsid;
- if (WARN_ON_ONCE(!fsid.val[0] && !fsid.val[1]))
+ fsid = FANOTIFY_MARK(mark)->fsid;
+ if (!(mark->flags & FSNOTIFY_MARK_FLAG_WEAK_FSID) &&
+ WARN_ON_ONCE(!fsid.val[0] && !fsid.val[1]))
continue;
return fsid;
}
@@ -942,12 +928,8 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask,
return 0;
}
- if (FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS)) {
+ if (FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS))
fsid = fanotify_get_fsid(iter_info);
- /* Racing with mark destruction or creation? */
- if (!fsid.val[0] && !fsid.val[1])
- return 0;
- }
event = fanotify_alloc_event(group, mask, data, data_type, dir,
file_name, &fsid, match_mask);
@@ -1068,7 +1050,7 @@ static void fanotify_freeing_mark(struct fsnotify_mark *mark,
static void fanotify_free_mark(struct fsnotify_mark *fsn_mark)
{
- kmem_cache_free(fanotify_mark_cache, fsn_mark);
+ kmem_cache_free(fanotify_mark_cache, FANOTIFY_MARK(fsn_mark));
}
const struct fsnotify_ops fanotify_fsnotify_ops = {
diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
index 6936671e148d..e5ab33cae6a7 100644
--- a/fs/notify/fanotify/fanotify.h
+++ b/fs/notify/fanotify/fanotify.h
@@ -489,6 +489,22 @@ static inline unsigned int fanotify_event_hash_bucket(
return event->hash & FANOTIFY_HTABLE_MASK;
}
+struct fanotify_mark {
+ struct fsnotify_mark fsn_mark;
+ __kernel_fsid_t fsid;
+};
+
+static inline struct fanotify_mark *FANOTIFY_MARK(struct fsnotify_mark *mark)
+{
+ return container_of(mark, struct fanotify_mark, fsn_mark);
+}
+
+static inline bool fanotify_fsid_equal(__kernel_fsid_t *fsid1,
+ __kernel_fsid_t *fsid2)
+{
+ return fsid1->val[0] == fsid2->val[0] && fsid1->val[1] == fsid2->val[1];
+}
+
static inline unsigned int fanotify_mark_user_flags(struct fsnotify_mark *mark)
{
unsigned int mflags = 0;
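fanotify now embeds the generic fsnotify_mark in its own fanotify_mark so the fsid can live with the mark rather than the connector; FANOTIFY_MARK() is the usual container_of() downcast, and (as the fanotify_user.c hunk below shows) the slab cache must then be created for the containing type. A sketch of the embedding pattern with illustrative names:

    #include <linux/fsnotify_backend.h>
    #include <linux/slab.h>

    /* Sketch: extend a generic mark with subsystem-private state. */
    struct my_mark {
            struct fsnotify_mark fsn_mark;  /* generic part */
            __kernel_fsid_t fsid;           /* private extension */
    };

    static inline struct my_mark *to_my_mark(struct fsnotify_mark *mark)
    {
            return container_of(mark, struct my_mark, fsn_mark);
    }

    /* The cache is sized for the containing struct, not fsnotify_mark: */
    static struct kmem_cache *my_mark_cache;

    static int __init my_mark_init(void)
    {
            my_mark_cache = KMEM_CACHE(my_mark, SLAB_PANIC | SLAB_ACCOUNT);
            return 0;
    }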
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 4d765c72496f..f83e7cc5ccf2 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -23,7 +23,7 @@
#include <asm/ioctls.h>
-#include "../../mount.h"
+#include "../fsnotify.h"
#include "../fdinfo.h"
#include "fanotify.h"
@@ -1192,13 +1192,71 @@ static bool fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
return recalc;
}
+struct fan_fsid {
+ struct super_block *sb;
+ __kernel_fsid_t id;
+ bool weak;
+};
+
+static int fanotify_set_mark_fsid(struct fsnotify_group *group,
+ struct fsnotify_mark *mark,
+ struct fan_fsid *fsid)
+{
+ struct fsnotify_mark_connector *conn;
+ struct fsnotify_mark *old;
+ struct super_block *old_sb = NULL;
+
+ FANOTIFY_MARK(mark)->fsid = fsid->id;
+ mark->flags |= FSNOTIFY_MARK_FLAG_HAS_FSID;
+ if (fsid->weak)
+ mark->flags |= FSNOTIFY_MARK_FLAG_WEAK_FSID;
+
+ /* First mark added will determine if group is single or multi fsid */
+ if (list_empty(&group->marks_list))
+ return 0;
+
+ /* Find sb of an existing mark */
+ list_for_each_entry(old, &group->marks_list, g_list) {
+ conn = READ_ONCE(old->connector);
+ if (!conn)
+ continue;
+ old_sb = fsnotify_connector_sb(conn);
+ if (old_sb)
+ break;
+ }
+
+ /* Only detached marks left? */
+ if (!old_sb)
+ return 0;
+
+ /* Do not allow mixing of marks with weak and strong fsid */
+ if ((mark->flags ^ old->flags) & FSNOTIFY_MARK_FLAG_WEAK_FSID)
+ return -EXDEV;
+
+ /* Allow mixing of marks with strong fsid from different fs */
+ if (!fsid->weak)
+ return 0;
+
+ /* Do not allow mixing marks with weak fsid from different fs */
+ if (old_sb != fsid->sb)
+ return -EXDEV;
+
+ /* Do not allow mixing marks from different btrfs sub-volumes */
+ if (!fanotify_fsid_equal(&FANOTIFY_MARK(old)->fsid,
+ &FANOTIFY_MARK(mark)->fsid))
+ return -EXDEV;
+
+ return 0;
+}
+
static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
fsnotify_connp_t *connp,
unsigned int obj_type,
unsigned int fan_flags,
- __kernel_fsid_t *fsid)
+ struct fan_fsid *fsid)
{
struct ucounts *ucounts = group->fanotify_data.ucounts;
+ struct fanotify_mark *fan_mark;
struct fsnotify_mark *mark;
int ret;
@@ -1211,24 +1269,34 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
!inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_FANOTIFY_MARKS))
return ERR_PTR(-ENOSPC);
- mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
- if (!mark) {
+ fan_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
+ if (!fan_mark) {
ret = -ENOMEM;
goto out_dec_ucounts;
}
+ mark = &fan_mark->fsn_mark;
fsnotify_init_mark(mark, group);
if (fan_flags & FAN_MARK_EVICTABLE)
mark->flags |= FSNOTIFY_MARK_FLAG_NO_IREF;
- ret = fsnotify_add_mark_locked(mark, connp, obj_type, 0, fsid);
- if (ret) {
- fsnotify_put_mark(mark);
- goto out_dec_ucounts;
+ /* Cache fsid of filesystem containing the marked object */
+ if (fsid) {
+ ret = fanotify_set_mark_fsid(group, mark, fsid);
+ if (ret)
+ goto out_put_mark;
+ } else {
+ fan_mark->fsid.val[0] = fan_mark->fsid.val[1] = 0;
}
+ ret = fsnotify_add_mark_locked(mark, connp, obj_type, 0);
+ if (ret)
+ goto out_put_mark;
+
return mark;
+out_put_mark:
+ fsnotify_put_mark(mark);
out_dec_ucounts:
if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS))
dec_ucount(ucounts, UCOUNT_FANOTIFY_MARKS);
@@ -1279,7 +1347,7 @@ static int fanotify_may_update_existing_mark(struct fsnotify_mark *fsn_mark,
static int fanotify_add_mark(struct fsnotify_group *group,
fsnotify_connp_t *connp, unsigned int obj_type,
__u32 mask, unsigned int fan_flags,
- __kernel_fsid_t *fsid)
+ struct fan_fsid *fsid)
{
struct fsnotify_mark *fsn_mark;
bool recalc;
@@ -1327,7 +1395,7 @@ out:
static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
struct vfsmount *mnt, __u32 mask,
- unsigned int flags, __kernel_fsid_t *fsid)
+ unsigned int flags, struct fan_fsid *fsid)
{
return fanotify_add_mark(group, &real_mount(mnt)->mnt_fsnotify_marks,
FSNOTIFY_OBJ_TYPE_VFSMOUNT, mask, flags, fsid);
@@ -1335,7 +1403,7 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
static int fanotify_add_sb_mark(struct fsnotify_group *group,
struct super_block *sb, __u32 mask,
- unsigned int flags, __kernel_fsid_t *fsid)
+ unsigned int flags, struct fan_fsid *fsid)
{
return fanotify_add_mark(group, &sb->s_fsnotify_marks,
FSNOTIFY_OBJ_TYPE_SB, mask, flags, fsid);
@@ -1343,7 +1411,7 @@ static int fanotify_add_sb_mark(struct fsnotify_group *group,
static int fanotify_add_inode_mark(struct fsnotify_group *group,
struct inode *inode, __u32 mask,
- unsigned int flags, __kernel_fsid_t *fsid)
+ unsigned int flags, struct fan_fsid *fsid)
{
pr_debug("%s: group=%p inode=%p\n", __func__, group, inode);
@@ -1554,20 +1622,25 @@ out_destroy_group:
return fd;
}
-static int fanotify_test_fsid(struct dentry *dentry, __kernel_fsid_t *fsid)
+static int fanotify_test_fsid(struct dentry *dentry, unsigned int flags,
+ struct fan_fsid *fsid)
{
+ unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
__kernel_fsid_t root_fsid;
int err;
/*
* Make sure dentry is not of a filesystem with zero fsid (e.g. fuse).
*/
- err = vfs_get_fsid(dentry, fsid);
+ err = vfs_get_fsid(dentry, &fsid->id);
if (err)
return err;
- if (!fsid->val[0] && !fsid->val[1])
- return -ENODEV;
+ fsid->sb = dentry->d_sb;
+ if (!fsid->id.val[0] && !fsid->id.val[1]) {
+ err = -ENODEV;
+ goto weak;
+ }
/*
* Make sure dentry is not of a filesystem subvolume (e.g. btrfs)
@@ -1577,11 +1650,18 @@ static int fanotify_test_fsid(struct dentry *dentry, __kernel_fsid_t *fsid)
if (err)
return err;
- if (root_fsid.val[0] != fsid->val[0] ||
- root_fsid.val[1] != fsid->val[1])
- return -EXDEV;
+ if (!fanotify_fsid_equal(&root_fsid, &fsid->id)) {
+ err = -EXDEV;
+ goto weak;
+ }
+ fsid->weak = false;
return 0;
+
+weak:
+ /* Allow weak fsid when marking inodes */
+ fsid->weak = true;
+ return (mark_type == FAN_MARK_INODE) ? 0 : err;
}
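
The weak-fsid fallback above changes what userspace sees when marking objects whose fsid is zero (e.g. fuse) or differs from the sb root's (e.g. a btrfs subvolume). A hedged sketch of the expected behavior, using hypothetical paths:

#include <fcntl.h>
#include <sys/fanotify.h>

/* Assumes /mnt/subvol is a btrfs subvolume whose fsid differs from the
 * filesystem root's, so fanotify_test_fsid() classifies it as weak. */
static int mark_subvolume(void)
{
	int fd = fanotify_init(FAN_CLASS_NOTIF | FAN_REPORT_FID, O_RDONLY);

	/* Weak fsid is now accepted for inode marks... */
	fanotify_mark(fd, FAN_MARK_ADD, FAN_OPEN, AT_FDCWD, "/mnt/subvol/file");

	/* ...but mount (and filesystem) marks still require a strong fsid,
	 * so this keeps failing with -EXDEV. */
	return fanotify_mark(fd, FAN_MARK_ADD | FAN_MARK_MOUNT, FAN_OPEN,
			     AT_FDCWD, "/mnt/subvol");
}
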
/* Check if filesystem can encode a unique fid */
@@ -1665,7 +1745,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
struct fsnotify_group *group;
struct fd f;
struct path path;
- __kernel_fsid_t __fsid, *fsid = NULL;
+ struct fan_fsid __fsid, *fsid = NULL;
u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS;
unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
unsigned int mark_cmd = flags & FANOTIFY_MARK_CMD_BITS;
@@ -1817,7 +1897,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
}
if (fid_mode) {
- ret = fanotify_test_fsid(path.dentry, &__fsid);
+ ret = fanotify_test_fsid(path.dentry, flags, &__fsid);
if (ret)
goto path_put_and_out;
@@ -1935,7 +2015,7 @@ static int __init fanotify_user_setup(void)
BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 12);
BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 11);
- fanotify_mark_cache = KMEM_CACHE(fsnotify_mark,
+ fanotify_mark_cache = KMEM_CACHE(fanotify_mark,
SLAB_PANIC|SLAB_ACCOUNT);
fanotify_fid_event_cachep = KMEM_CACHE(fanotify_fid_event,
SLAB_PANIC);
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index c74ef947447d..d6944ff86ffa 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -537,8 +537,7 @@ int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b)
}
static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
- unsigned int obj_type,
- __kernel_fsid_t *fsid)
+ unsigned int obj_type)
{
struct fsnotify_mark_connector *conn;
@@ -550,14 +549,7 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
conn->flags = 0;
conn->type = obj_type;
conn->obj = connp;
- /* Cache fsid of filesystem containing the object */
- if (fsid) {
- conn->fsid = *fsid;
- conn->flags = FSNOTIFY_CONN_FLAG_HAS_FSID;
- } else {
- conn->fsid.val[0] = conn->fsid.val[1] = 0;
- conn->flags = 0;
- }
+ conn->flags = 0;
fsnotify_get_sb_connectors(conn);
/*
@@ -608,8 +600,7 @@ out:
*/
static int fsnotify_add_mark_list(struct fsnotify_mark *mark,
fsnotify_connp_t *connp,
- unsigned int obj_type,
- int add_flags, __kernel_fsid_t *fsid)
+ unsigned int obj_type, int add_flags)
{
struct fsnotify_mark *lmark, *last = NULL;
struct fsnotify_mark_connector *conn;
@@ -619,41 +610,15 @@ static int fsnotify_add_mark_list(struct fsnotify_mark *mark,
if (WARN_ON(!fsnotify_valid_obj_type(obj_type)))
return -EINVAL;
- /* Backend is expected to check for zero fsid (e.g. tmpfs) */
- if (fsid && WARN_ON_ONCE(!fsid->val[0] && !fsid->val[1]))
- return -ENODEV;
-
restart:
spin_lock(&mark->lock);
conn = fsnotify_grab_connector(connp);
if (!conn) {
spin_unlock(&mark->lock);
- err = fsnotify_attach_connector_to_object(connp, obj_type,
- fsid);
+ err = fsnotify_attach_connector_to_object(connp, obj_type);
if (err)
return err;
goto restart;
- } else if (fsid && !(conn->flags & FSNOTIFY_CONN_FLAG_HAS_FSID)) {
- conn->fsid = *fsid;
- /* Pairs with smp_rmb() in fanotify_get_fsid() */
- smp_wmb();
- conn->flags |= FSNOTIFY_CONN_FLAG_HAS_FSID;
- } else if (fsid && (conn->flags & FSNOTIFY_CONN_FLAG_HAS_FSID) &&
- (fsid->val[0] != conn->fsid.val[0] ||
- fsid->val[1] != conn->fsid.val[1])) {
- /*
- * Backend is expected to check for non uniform fsid
- * (e.g. btrfs), but maybe we missed something?
- * Only allow setting conn->fsid once to non zero fsid.
- * inotify and non-fid fanotify groups do not set nor test
- * conn->fsid.
- */
- pr_warn_ratelimited("%s: fsid mismatch on object of type %u: "
- "%x.%x != %x.%x\n", __func__, conn->type,
- fsid->val[0], fsid->val[1],
- conn->fsid.val[0], conn->fsid.val[1]);
- err = -EXDEV;
- goto out_err;
}
/* is mark the first mark? */
@@ -703,7 +668,7 @@ out_err:
*/
int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
fsnotify_connp_t *connp, unsigned int obj_type,
- int add_flags, __kernel_fsid_t *fsid)
+ int add_flags)
{
struct fsnotify_group *group = mark->group;
int ret = 0;
@@ -723,7 +688,7 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
fsnotify_get_mark(mark); /* for g_list */
spin_unlock(&mark->lock);
- ret = fsnotify_add_mark_list(mark, connp, obj_type, add_flags, fsid);
+ ret = fsnotify_add_mark_list(mark, connp, obj_type, add_flags);
if (ret)
goto err;
@@ -742,14 +707,13 @@ err:
}
int fsnotify_add_mark(struct fsnotify_mark *mark, fsnotify_connp_t *connp,
- unsigned int obj_type, int add_flags,
- __kernel_fsid_t *fsid)
+ unsigned int obj_type, int add_flags)
{
int ret;
struct fsnotify_group *group = mark->group;
fsnotify_group_lock(group);
- ret = fsnotify_add_mark_locked(mark, connp, obj_type, add_flags, fsid);
+ ret = fsnotify_add_mark_locked(mark, connp, obj_type, add_flags);
fsnotify_group_unlock(group);
return ret;
}
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index 71e31e789b29..2d01517a2d59 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -1304,7 +1304,7 @@ done:
* page cleaned. The VM has already locked the page and marked it clean.
*
* For non-resident attributes, ntfs_writepage() writes the @page by calling
- * the ntfs version of the generic block_write_full_page() function,
+ * the ntfs version of the generic block_write_full_folio() function,
* ntfs_write_block(), which in turn if necessary creates and writes the
* buffers associated with the page asynchronously.
*
@@ -1314,7 +1314,7 @@ done:
* vfs inode dirty code path for the inode the mft record belongs to or via the
* vm page dirty code path for the page the mft record is in.
*
- * Based on ntfs_read_folio() and fs/buffer.c::block_write_full_page().
+ * Based on ntfs_read_folio() and fs/buffer.c::block_write_full_folio().
*
* Return 0 on success and -errno on error.
*/
@@ -1644,7 +1644,7 @@ const struct address_space_operations ntfs_normal_aops = {
.bmap = ntfs_bmap,
.migrate_folio = buffer_migrate_folio,
.is_partially_uptodate = block_is_partially_uptodate,
- .error_remove_page = generic_error_remove_page,
+ .error_remove_folio = generic_error_remove_folio,
};
/*
@@ -1658,7 +1658,7 @@ const struct address_space_operations ntfs_compressed_aops = {
#endif /* NTFS_RW */
.migrate_folio = buffer_migrate_folio,
.is_partially_uptodate = block_is_partially_uptodate,
- .error_remove_page = generic_error_remove_page,
+ .error_remove_folio = generic_error_remove_folio,
};
/*
@@ -1673,7 +1673,7 @@ const struct address_space_operations ntfs_mst_aops = {
#endif /* NTFS_RW */
.migrate_folio = buffer_migrate_folio,
.is_partially_uptodate = block_is_partially_uptodate,
- .error_remove_page = generic_error_remove_page,
+ .error_remove_folio = generic_error_remove_folio,
};
#ifdef NTFS_RW
@@ -1690,7 +1690,7 @@ const struct address_space_operations ntfs_mst_aops = {
*
* If the page does not have buffers, we create them and set them uptodate.
* The page may not be locked which is why we need to handle the buffers under
- * the mapping->private_lock. Once the buffers are marked dirty we no longer
+ * the mapping->i_private_lock. Once the buffers are marked dirty we no longer
* need the lock since try_to_free_buffers() does not free dirty buffers.
*/
void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) {
@@ -1702,11 +1702,11 @@ void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) {
BUG_ON(!PageUptodate(page));
end = ofs + ni->itype.index.block_size;
bh_size = VFS_I(ni)->i_sb->s_blocksize;
- spin_lock(&mapping->private_lock);
+ spin_lock(&mapping->i_private_lock);
if (unlikely(!page_has_buffers(page))) {
- spin_unlock(&mapping->private_lock);
+ spin_unlock(&mapping->i_private_lock);
bh = head = alloc_page_buffers(page, bh_size, true);
- spin_lock(&mapping->private_lock);
+ spin_lock(&mapping->i_private_lock);
if (likely(!page_has_buffers(page))) {
struct buffer_head *tail;
@@ -1730,7 +1730,7 @@ void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) {
break;
set_buffer_dirty(bh);
} while ((bh = bh->b_this_page) != head);
- spin_unlock(&mapping->private_lock);
+ spin_unlock(&mapping->i_private_lock);
filemap_dirty_folio(mapping, page_folio(page));
if (unlikely(buffers_to_free)) {
do {
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index 4596c90e7b7c..629723a8d712 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -1462,7 +1462,8 @@ static int ntfs_dir_open(struct inode *vi, struct file *filp)
/**
* ntfs_dir_fsync - sync a directory to disk
* @filp: directory to be synced
- * @dentry: dentry describing the directory to sync
+ * @start: offset in bytes of the beginning of the data range to sync
+ * @end: offset in bytes of the end of the data range (inclusive)
* @datasync: if non-zero only flush user data and not metadata
*
* Data integrity sync of a directory to disk. Used for fsync, fdatasync, and
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 91b32b2377ac..ea9127ba3208 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -6934,7 +6934,7 @@ static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
* nonzero data on subsequent file extends.
*
* We need to call this before i_size is updated on the inode because
- * otherwise block_write_full_page() will skip writeout of pages past
+ * otherwise block_write_full_folio() will skip writeout of pages past
* i_size.
*/
int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index ba790219d528..b82185075de7 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -389,21 +389,18 @@ out_unlock:
/* Note: Because we don't support holes, our allocation has
* already happened (allocation writes zeros to the file data)
* so we don't have to worry about ordered writes in
- * ocfs2_writepage.
+ * ocfs2_writepages.
*
- * ->writepage is called during the process of invalidating the page cache
+ * ->writepages is called during the process of invalidating the page cache
* during blocked lock processing. It can't block on any cluster locks
 * during block mapping. It relies on the fact that the block
* mapping can't have disappeared under the dirty pages that it is
* being asked to write back.
*/
-static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
+static int ocfs2_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
{
- trace_ocfs2_writepage(
- (unsigned long long)OCFS2_I(page->mapping->host)->ip_blkno,
- page->index);
-
- return block_write_full_page(page, ocfs2_get_block, wbc);
+ return mpage_writepages(mapping, wbc, ocfs2_get_block);
}
/* Taken from ext3. We don't necessarily need the full blown
@@ -2471,7 +2468,7 @@ const struct address_space_operations ocfs2_aops = {
.dirty_folio = block_dirty_folio,
.read_folio = ocfs2_read_folio,
.readahead = ocfs2_readahead,
- .writepage = ocfs2_writepage,
+ .writepages = ocfs2_writepages,
.write_begin = ocfs2_write_begin,
.write_end = ocfs2_write_end,
.bmap = ocfs2_bmap,
@@ -2480,5 +2477,5 @@ const struct address_space_operations ocfs2_aops = {
.release_folio = ocfs2_release_folio,
.migrate_folio = buffer_migrate_folio,
.is_partially_uptodate = block_is_partially_uptodate,
- .error_remove_page = generic_error_remove_page,
+ .error_remove_folio = generic_error_remove_folio,
};
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index eaa8c80ace3c..b8b6a191b5cb 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -280,4 +280,5 @@ const struct export_operations ocfs2_export_ops = {
.fh_to_dentry = ocfs2_fh_to_dentry,
.fh_to_parent = ocfs2_fh_to_parent,
.get_parent = ocfs2_get_parent,
+ .flags = EXPORT_OP_ASYNC_LOCK,
};
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 94e2a1244442..8b6d15010703 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -818,7 +818,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
/*
* fs-writeback will release the dirty pages without page lock
* whose offset are over inode size, the release happens at
- * block_write_full_page().
+ * block_write_full_folio().
*/
i_size_write(inode, abs_to);
inode->i_blocks = ocfs2_inode_sector_count(inode);
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index ac4fd1d5b128..9898c11bdfa1 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -1157,8 +1157,6 @@ DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_get_block_end);
DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_readpage);
-DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_writepage);
-
DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_bmap);
TRACE_EVENT(ocfs2_try_to_write_inline_data,
diff --git a/fs/open.c b/fs/open.c
index 3494a9cd8046..a84d21e55c39 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -304,6 +304,10 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
if (ret)
return ret;
+ ret = fsnotify_file_area_perm(file, MAY_WRITE, &offset, len);
+ if (ret)
+ return ret;
+
if (S_ISFIFO(inode->i_mode))
return -ESPIPE;
@@ -442,7 +446,8 @@ static const struct cred *access_override_creds(void)
* 'get_current_cred()' function), that will clear the
* non_rcu field, because now that other user may be
* expecting RCU freeing. But normal thread-synchronous
- * cred accesses will keep things non-RCY.
+ * cred accesses will keep things non-racy to avoid RCU
+ * freeing.
*/
override_cred->non_rcu = 1;
@@ -1177,44 +1182,6 @@ struct file *kernel_file_open(const struct path *path, int flags,
}
EXPORT_SYMBOL_GPL(kernel_file_open);
-/**
- * backing_file_open - open a backing file for kernel internal use
- * @user_path: path that the user reuqested to open
- * @flags: open flags
- * @real_path: path of the backing file
- * @cred: credentials for open
- *
- * Open a backing file for a stackable filesystem (e.g., overlayfs).
- * @user_path may be on the stackable filesystem and @real_path on the
- * underlying filesystem. In this case, we want to be able to return the
- * @user_path of the stackable filesystem. This is done by embedding the
- * returned file into a container structure that also stores the stacked
- * file's path, which can be retrieved using backing_file_user_path().
- */
-struct file *backing_file_open(const struct path *user_path, int flags,
- const struct path *real_path,
- const struct cred *cred)
-{
- struct file *f;
- int error;
-
- f = alloc_empty_backing_file(flags, cred);
- if (IS_ERR(f))
- return f;
-
- path_get(user_path);
- *backing_file_user_path(f) = *user_path;
- f->f_path = *real_path;
- error = do_dentry_open(f, d_inode(real_path->dentry), NULL);
- if (error) {
- fput(f);
- f = ERR_PTR(error);
- }
-
- return f;
-}
-EXPORT_SYMBOL_GPL(backing_file_open);
-
#define WILL_CREATE(flags) (flags & (O_CREAT | __O_TMPFILE))
#define O_PATH_FLAGS (O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC)
@@ -1574,7 +1541,7 @@ SYSCALL_DEFINE1(close, unsigned int, fd)
int retval;
struct file *file;
- file = close_fd_get_file(fd);
+ file = file_close_fd(fd);
if (!file)
return -EBADF;
diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig
index fec5020c3495..2ac67e04a6fb 100644
--- a/fs/overlayfs/Kconfig
+++ b/fs/overlayfs/Kconfig
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
config OVERLAY_FS
tristate "Overlay filesystem support"
+ select FS_STACK
select EXPORTFS
help
An overlay filesystem combines two filesystems - an 'upper' filesystem
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 8bea66c97316..45cadc3aed85 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -230,6 +230,19 @@ static int ovl_copy_fileattr(struct inode *inode, const struct path *old,
return ovl_real_fileattr_set(new, &newfa);
}
+static int ovl_verify_area(loff_t pos, loff_t pos2, loff_t len, loff_t totlen)
+{
+ loff_t tmp;
+
+ if (WARN_ON_ONCE(pos != pos2))
+ return -EIO;
+ if (WARN_ON_ONCE(pos < 0 || len < 0 || totlen < 0))
+ return -EIO;
+ if (WARN_ON_ONCE(check_add_overflow(pos, len, &tmp)))
+ return -EIO;
+ return 0;
+}
+
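
ovl_verify_area() is a pure sanity check: copy-up advances the source and destination offsets in lockstep, and pos + len must not wrap. check_add_overflow() maps to the compiler builtin, so the same wrap test can be illustrated in userspace (illustrative only):

#include <stdbool.h>
#include <stdio.h>

/* Userspace stand-in for the kernel's check_add_overflow(). */
static bool add_overflows(long long pos, long long len, long long *sum)
{
	return __builtin_add_overflow(pos, len, sum);
}

int main(void)
{
	long long tmp;

	printf("%d\n", add_overflows(1LL << 62, 1LL << 62, &tmp)); /* 1: wraps, the -EIO case */
	printf("%d\n", add_overflows(4096, 65536, &tmp));          /* 0: fine */
	return 0;
}
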
static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry,
struct file *new_file, loff_t len)
{
@@ -244,13 +257,20 @@ static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry,
int error = 0;
ovl_path_lowerdata(dentry, &datapath);
- if (WARN_ON(datapath.dentry == NULL))
+ if (WARN_ON_ONCE(datapath.dentry == NULL) ||
+ WARN_ON_ONCE(len < 0))
return -EIO;
old_file = ovl_path_open(&datapath, O_LARGEFILE | O_RDONLY);
if (IS_ERR(old_file))
return PTR_ERR(old_file);
+ error = rw_verify_area(READ, old_file, &old_pos, len);
+ if (!error)
+ error = rw_verify_area(WRITE, new_file, &new_pos, len);
+ if (error)
+ goto out_fput;
+
/* Try to use clone_file_range to clone up within the same fs */
ovl_start_write(dentry);
cloned = do_clone_file_range(old_file, 0, new_file, 0, len, 0);
@@ -265,7 +285,7 @@ static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry,
while (len) {
size_t this_len = OVL_COPY_UP_CHUNK_SIZE;
- long bytes;
+ ssize_t bytes;
if (len < this_len)
this_len = len;
@@ -309,11 +329,13 @@ static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry,
}
}
- ovl_start_write(dentry);
+ error = ovl_verify_area(old_pos, new_pos, this_len, len);
+ if (error)
+ break;
+
bytes = do_splice_direct(old_file, &old_pos,
new_file, &new_pos,
this_len, SPLICE_F_MOVE);
- ovl_end_write(dentry);
if (bytes <= 0) {
error = bytes;
break;
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 131621daeb13..05536964d37f 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -9,25 +9,11 @@
#include <linux/xattr.h>
#include <linux/uio.h>
#include <linux/uaccess.h>
-#include <linux/splice.h>
#include <linux/security.h>
-#include <linux/mm.h>
#include <linux/fs.h>
+#include <linux/backing-file.h>
#include "overlayfs.h"
-#include "../internal.h" /* for sb_init_dio_done_wq */
-
-struct ovl_aio_req {
- struct kiocb iocb;
- refcount_t ref;
- struct kiocb *orig_iocb;
- /* used for aio completion */
- struct work_struct work;
- long res;
-};
-
-static struct kmem_cache *ovl_aio_request_cachep;
-
static char ovl_whatisit(struct inode *inode, struct inode *realinode)
{
if (realinode != ovl_inode_upper(inode))
@@ -274,83 +260,16 @@ static void ovl_file_accessed(struct file *file)
touch_atime(&file->f_path);
}
-#define OVL_IOCB_MASK \
- (IOCB_NOWAIT | IOCB_HIPRI | IOCB_DSYNC | IOCB_SYNC | IOCB_APPEND)
-
-static rwf_t iocb_to_rw_flags(int flags)
-{
- return (__force rwf_t)(flags & OVL_IOCB_MASK);
-}
-
-static inline void ovl_aio_put(struct ovl_aio_req *aio_req)
-{
- if (refcount_dec_and_test(&aio_req->ref)) {
- fput(aio_req->iocb.ki_filp);
- kmem_cache_free(ovl_aio_request_cachep, aio_req);
- }
-}
-
-static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req)
-{
- struct kiocb *iocb = &aio_req->iocb;
- struct kiocb *orig_iocb = aio_req->orig_iocb;
-
- if (iocb->ki_flags & IOCB_WRITE) {
- kiocb_end_write(iocb);
- ovl_file_modified(orig_iocb->ki_filp);
- }
-
- orig_iocb->ki_pos = iocb->ki_pos;
- ovl_aio_put(aio_req);
-}
-
-static void ovl_aio_rw_complete(struct kiocb *iocb, long res)
-{
- struct ovl_aio_req *aio_req = container_of(iocb,
- struct ovl_aio_req, iocb);
- struct kiocb *orig_iocb = aio_req->orig_iocb;
-
- ovl_aio_cleanup_handler(aio_req);
- orig_iocb->ki_complete(orig_iocb, res);
-}
-
-static void ovl_aio_complete_work(struct work_struct *work)
-{
- struct ovl_aio_req *aio_req = container_of(work,
- struct ovl_aio_req, work);
-
- ovl_aio_rw_complete(&aio_req->iocb, aio_req->res);
-}
-
-static void ovl_aio_queue_completion(struct kiocb *iocb, long res)
-{
- struct ovl_aio_req *aio_req = container_of(iocb,
- struct ovl_aio_req, iocb);
- struct kiocb *orig_iocb = aio_req->orig_iocb;
-
- /*
- * Punt to a work queue to serialize updates of mtime/size.
- */
- aio_req->res = res;
- INIT_WORK(&aio_req->work, ovl_aio_complete_work);
- queue_work(file_inode(orig_iocb->ki_filp)->i_sb->s_dio_done_wq,
- &aio_req->work);
-}
-
-static int ovl_init_aio_done_wq(struct super_block *sb)
-{
- if (sb->s_dio_done_wq)
- return 0;
-
- return sb_init_dio_done_wq(sb);
-}
-
static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
struct file *file = iocb->ki_filp;
struct fd real;
- const struct cred *old_cred;
ssize_t ret;
+ struct backing_file_ctx ctx = {
+ .cred = ovl_creds(file_inode(file)->i_sb),
+ .user_file = file,
+ .accessed = ovl_file_accessed,
+ };
if (!iov_iter_count(iter))
return 0;
@@ -359,37 +278,8 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter)
if (ret)
return ret;
- ret = -EINVAL;
- if (iocb->ki_flags & IOCB_DIRECT &&
- !(real.file->f_mode & FMODE_CAN_ODIRECT))
- goto out_fdput;
-
- old_cred = ovl_override_creds(file_inode(file)->i_sb);
- if (is_sync_kiocb(iocb)) {
- rwf_t rwf = iocb_to_rw_flags(iocb->ki_flags);
-
- ret = vfs_iter_read(real.file, iter, &iocb->ki_pos, rwf);
- } else {
- struct ovl_aio_req *aio_req;
-
- ret = -ENOMEM;
- aio_req = kmem_cache_zalloc(ovl_aio_request_cachep, GFP_KERNEL);
- if (!aio_req)
- goto out;
-
- aio_req->orig_iocb = iocb;
- kiocb_clone(&aio_req->iocb, iocb, get_file(real.file));
- aio_req->iocb.ki_complete = ovl_aio_rw_complete;
- refcount_set(&aio_req->ref, 2);
- ret = vfs_iocb_iter_read(real.file, &aio_req->iocb, iter);
- ovl_aio_put(aio_req);
- if (ret != -EIOCBQUEUED)
- ovl_aio_cleanup_handler(aio_req);
- }
-out:
- revert_creds(old_cred);
- ovl_file_accessed(file);
-out_fdput:
+ ret = backing_file_read_iter(real.file, iter, iocb, iocb->ki_flags,
+ &ctx);
fdput(real);
return ret;
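
The new backing-file helpers absorb the credential override, the FMODE_CAN_ODIRECT check and the aio request bookkeeping that were open-coded here. From the initializers in this file, the context struct is presumably along these lines (the real definition is in include/linux/backing-file.h):

/* Assumed shape, inferred from the .cred/.user_file/.accessed/.end_write users. */
struct backing_file_ctx {
	const struct cred *cred;	  /* creds to override around real-file I/O */
	struct file *user_file;		  /* the overlay file, for notifications */
	void (*accessed)(struct file *);  /* e.g. ovl_file_accessed() after reads */
	void (*end_write)(struct file *); /* e.g. ovl_file_modified() after writes */
};
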
@@ -400,9 +290,13 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
struct fd real;
- const struct cred *old_cred;
ssize_t ret;
int ifl = iocb->ki_flags;
+ struct backing_file_ctx ctx = {
+ .cred = ovl_creds(inode->i_sb),
+ .user_file = file,
+ .end_write = ovl_file_modified,
+ };
if (!iov_iter_count(iter))
return 0;
@@ -410,19 +304,11 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
inode_lock(inode);
/* Update mode */
ovl_copyattr(inode);
- ret = file_remove_privs(file);
- if (ret)
- goto out_unlock;
ret = ovl_real_fdget(file, &real);
if (ret)
goto out_unlock;
- ret = -EINVAL;
- if (iocb->ki_flags & IOCB_DIRECT &&
- !(real.file->f_mode & FMODE_CAN_ODIRECT))
- goto out_fdput;
-
if (!ovl_should_sync(OVL_FS(inode->i_sb)))
ifl &= ~(IOCB_DSYNC | IOCB_SYNC);
@@ -431,42 +317,7 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
* this property in case it is set by the issuer.
*/
ifl &= ~IOCB_DIO_CALLER_COMP;
-
- old_cred = ovl_override_creds(file_inode(file)->i_sb);
- if (is_sync_kiocb(iocb)) {
- rwf_t rwf = iocb_to_rw_flags(ifl);
-
- file_start_write(real.file);
- ret = vfs_iter_write(real.file, iter, &iocb->ki_pos, rwf);
- file_end_write(real.file);
- /* Update size */
- ovl_file_modified(file);
- } else {
- struct ovl_aio_req *aio_req;
-
- ret = ovl_init_aio_done_wq(inode->i_sb);
- if (ret)
- goto out;
-
- ret = -ENOMEM;
- aio_req = kmem_cache_zalloc(ovl_aio_request_cachep, GFP_KERNEL);
- if (!aio_req)
- goto out;
-
- aio_req->orig_iocb = iocb;
- kiocb_clone(&aio_req->iocb, iocb, get_file(real.file));
- aio_req->iocb.ki_flags = ifl;
- aio_req->iocb.ki_complete = ovl_aio_queue_completion;
- refcount_set(&aio_req->ref, 2);
- kiocb_start_write(&aio_req->iocb);
- ret = vfs_iocb_iter_write(real.file, &aio_req->iocb, iter);
- ovl_aio_put(aio_req);
- if (ret != -EIOCBQUEUED)
- ovl_aio_cleanup_handler(aio_req);
- }
-out:
- revert_creds(old_cred);
-out_fdput:
+ ret = backing_file_write_iter(real.file, iter, iocb, ifl, &ctx);
fdput(real);
out_unlock:
@@ -479,20 +330,21 @@ static ssize_t ovl_splice_read(struct file *in, loff_t *ppos,
struct pipe_inode_info *pipe, size_t len,
unsigned int flags)
{
- const struct cred *old_cred;
struct fd real;
ssize_t ret;
+ struct backing_file_ctx ctx = {
+ .cred = ovl_creds(file_inode(in)->i_sb),
+ .user_file = in,
+ .accessed = ovl_file_accessed,
+ };
ret = ovl_real_fdget(in, &real);
if (ret)
return ret;
- old_cred = ovl_override_creds(file_inode(in)->i_sb);
- ret = vfs_splice_read(real.file, ppos, pipe, len, flags);
- revert_creds(old_cred);
- ovl_file_accessed(in);
-
+ ret = backing_file_splice_read(real.file, ppos, pipe, len, flags, &ctx);
fdput(real);
+
return ret;
}
@@ -508,30 +360,23 @@ static ssize_t ovl_splice_write(struct pipe_inode_info *pipe, struct file *out,
loff_t *ppos, size_t len, unsigned int flags)
{
struct fd real;
- const struct cred *old_cred;
struct inode *inode = file_inode(out);
ssize_t ret;
+ struct backing_file_ctx ctx = {
+ .cred = ovl_creds(inode->i_sb),
+ .user_file = out,
+ .end_write = ovl_file_modified,
+ };
inode_lock(inode);
/* Update mode */
ovl_copyattr(inode);
- ret = file_remove_privs(out);
- if (ret)
- goto out_unlock;
ret = ovl_real_fdget(out, &real);
if (ret)
goto out_unlock;
- old_cred = ovl_override_creds(inode->i_sb);
- file_start_write(real.file);
-
- ret = iter_file_splice_write(pipe, real.file, ppos, len, flags);
-
- file_end_write(real.file);
- /* Update size */
- ovl_file_modified(out);
- revert_creds(old_cred);
+ ret = backing_file_splice_write(pipe, real.file, ppos, len, flags, &ctx);
fdput(real);
out_unlock:
@@ -569,23 +414,13 @@ static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync)
static int ovl_mmap(struct file *file, struct vm_area_struct *vma)
{
struct file *realfile = file->private_data;
- const struct cred *old_cred;
- int ret;
-
- if (!realfile->f_op->mmap)
- return -ENODEV;
+ struct backing_file_ctx ctx = {
+ .cred = ovl_creds(file_inode(file)->i_sb),
+ .user_file = file,
+ .accessed = ovl_file_accessed,
+ };
- if (WARN_ON(file != vma->vm_file))
- return -EIO;
-
- vma_set_file(vma, realfile);
-
- old_cred = ovl_override_creds(file_inode(file)->i_sb);
- ret = call_mmap(vma->vm_file, vma);
- revert_creds(old_cred);
- ovl_file_accessed(file);
-
- return ret;
+ return backing_file_mmap(realfile, vma, &ctx);
}
static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
@@ -778,19 +613,3 @@ const struct file_operations ovl_file_operations = {
.copy_file_range = ovl_copy_file_range,
.remap_file_range = ovl_remap_file_range,
};
-
-int __init ovl_aio_request_cache_init(void)
-{
- ovl_aio_request_cachep = kmem_cache_create("ovl_aio_req",
- sizeof(struct ovl_aio_req),
- 0, SLAB_HWCACHE_ALIGN, NULL);
- if (!ovl_aio_request_cachep)
- return -ENOMEM;
-
- return 0;
-}
-
-void ovl_aio_request_cache_destroy(void)
-{
- kmem_cache_destroy(ovl_aio_request_cachep);
-}
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 05c3dd597fa8..5ba11eb43767 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -425,6 +425,12 @@ int ovl_want_write(struct dentry *dentry);
void ovl_drop_write(struct dentry *dentry);
struct dentry *ovl_workdir(struct dentry *dentry);
const struct cred *ovl_override_creds(struct super_block *sb);
+
+static inline const struct cred *ovl_creds(struct super_block *sb)
+{
+ return OVL_FS(sb)->creator_cred;
+}
+
int ovl_can_decode_fh(struct super_block *sb);
struct dentry *ovl_indexdir(struct super_block *sb);
bool ovl_index_all(struct super_block *sb);
@@ -837,8 +843,6 @@ struct dentry *ovl_create_temp(struct ovl_fs *ofs, struct dentry *workdir,
/* file.c */
extern const struct file_operations ovl_file_operations;
-int __init ovl_aio_request_cache_init(void);
-void ovl_aio_request_cache_destroy(void);
int ovl_real_fileattr_get(const struct path *realpath, struct fileattr *fa);
int ovl_real_fileattr_set(const struct path *realpath, struct fileattr *fa);
int ovl_fileattr_get(struct dentry *dentry, struct fileattr *fa);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index a0967bb25003..7131e3f5b9f5 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -1454,6 +1454,7 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc)
* lead to unexpected results.
*/
sb->s_iflags |= SB_I_NOUMASK;
+ sb->s_iflags |= SB_I_EVM_UNSUPPORTED;
err = -ENOMEM;
root_dentry = ovl_get_root(sb, ctx->upper.dentry, oe);
@@ -1501,14 +1502,10 @@ static int __init ovl_init(void)
if (ovl_inode_cachep == NULL)
return -ENOMEM;
- err = ovl_aio_request_cache_init();
- if (!err) {
- err = register_filesystem(&ovl_fs_type);
- if (!err)
- return 0;
+ err = register_filesystem(&ovl_fs_type);
+ if (!err)
+ return 0;
- ovl_aio_request_cache_destroy();
- }
kmem_cache_destroy(ovl_inode_cachep);
return err;
@@ -1524,7 +1521,6 @@ static void __exit ovl_exit(void)
*/
rcu_barrier();
kmem_cache_destroy(ovl_inode_cachep);
- ovl_aio_request_cache_destroy();
}
module_init(ovl_init);
diff --git a/fs/pipe.c b/fs/pipe.c
index 804a7d789452..8d9286a1f2e8 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -446,6 +446,18 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
bool was_empty = false;
bool wake_next_writer = false;
+ /*
+ * Reject writing to watch queue pipes before the point where we lock
+ * the pipe.
+ * Otherwise, lockdep would be unhappy if the caller already has another
+ * pipe locked.
+ * If we had to support locking a normal pipe and a notification pipe at
+ * the same time, we could set up lockdep annotations for that, but
+ * since we don't actually need that, it's simpler to just bail here.
+ */
+ if (pipe_has_watch_queue(pipe))
+ return -EXDEV;
+
/* Null write succeeds. */
if (unlikely(total_len == 0))
return 0;
@@ -458,11 +470,6 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
goto out;
}
- if (pipe_has_watch_queue(pipe)) {
- ret = -EXDEV;
- goto out;
- }
-
/*
* If it wasn't empty we try to merge new data into
* the last buffer.
@@ -1317,6 +1324,11 @@ int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots)
pipe->tail = tail;
pipe->head = head;
+ if (!pipe_has_watch_queue(pipe)) {
+ pipe->max_usage = nr_slots;
+ pipe->nr_accounted = nr_slots;
+ }
+
spin_unlock_irq(&pipe->rd_wait.lock);
/* This might have made more room for writers */
@@ -1368,8 +1380,6 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned int arg)
if (ret < 0)
goto out_revert_acct;
- pipe->max_usage = nr_slots;
- pipe->nr_accounted = nr_slots;
return pipe->max_usage * PAGE_SIZE;
out_revert_acct:
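
pipe_resize_ring() now updates max_usage and nr_accounted itself, under rd_wait.lock and only for pipes without a watch queue, so the "more room for writers" wakeup that follows already sees the new limit. The userspace resize path is unchanged; for reference:

#define _GNU_SOURCE
#include <fcntl.h>

/* Grow a pipe to ~1 MiB; the kernel rounds the size up to a power-of-two
 * number of pages and returns the resulting capacity in bytes. */
static int grow_pipe(int pipefd)
{
	return fcntl(pipefd, F_SETPIPE_SZ, 1 << 20);
}
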
diff --git a/fs/pnode.c b/fs/pnode.c
index e4d0340393d5..a799e0315cc9 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -468,7 +468,7 @@ static void umount_one(struct mount *mnt, struct list_head *to_umount)
mnt->mnt.mnt_flags |= MNT_UMOUNT;
list_del_init(&mnt->mnt_child);
list_del_init(&mnt->mnt_umounting);
- list_move_tail(&mnt->mnt_list, to_umount);
+ move_from_ns(mnt, to_umount);
}
/*
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index a05fe94970ce..e1af20893ebe 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -600,7 +600,7 @@ EXPORT_SYMBOL(__posix_acl_chmod);
* the vfsmount must be passed through @idmap. This function will then
* take care to map the inode according to @idmap before checking
* permissions. On non-idmapped mounts or if permission checking is to be
- * performed on the raw inode simply passs @nop_mnt_idmap.
+ * performed on the raw inode simply pass @nop_mnt_idmap.
*/
int
posix_acl_chmod(struct mnt_idmap *idmap, struct dentry *dentry,
@@ -700,7 +700,7 @@ EXPORT_SYMBOL_GPL(posix_acl_create);
* the vfsmount must be passed through @idmap. This function will then
* take care to map the inode according to @idmap before checking
* permissions. On non-idmapped mounts or if permission checking is to be
- * performed on the raw inode simply passs @nop_mnt_idmap.
+ * performed on the raw inode simply pass @nop_mnt_idmap.
*
* Called from set_acl inode operations.
*/
diff --git a/fs/proc/base.c b/fs/proc/base.c
index dd31e3b6bf77..98a031ac2648 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -97,6 +97,7 @@
#include <linux/resctrl.h>
#include <linux/cn_proc.h>
#include <linux/ksm.h>
+#include <uapi/linux/lsm.h>
#include <trace/events/oom.h>
#include "internal.h"
#include "fd.h"
@@ -146,10 +147,10 @@ struct pid_entry {
NOD(NAME, (S_IFREG|(MODE)), \
NULL, &proc_single_file_operations, \
{ .proc_show = show } )
-#define ATTR(LSM, NAME, MODE) \
+#define ATTR(LSMID, NAME, MODE) \
NOD(NAME, (S_IFREG|(MODE)), \
NULL, &proc_pid_attr_operations, \
- { .lsm = LSM })
+ { .lsmid = LSMID })
/*
* Count the number of hardlinks for the pid_entry table, excluding the .
@@ -2726,7 +2727,7 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
if (!task)
return -ESRCH;
- length = security_getprocattr(task, PROC_I(inode)->op.lsm,
+ length = security_getprocattr(task, PROC_I(inode)->op.lsmid,
file->f_path.dentry->d_name.name,
&p);
put_task_struct(task);
@@ -2784,7 +2785,7 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
if (rv < 0)
goto out_free;
- rv = security_setprocattr(PROC_I(inode)->op.lsm,
+ rv = security_setprocattr(PROC_I(inode)->op.lsmid,
file->f_path.dentry->d_name.name, page,
count);
mutex_unlock(&current->signal->cred_guard_mutex);
@@ -2833,27 +2834,27 @@ static const struct inode_operations proc_##LSM##_attr_dir_inode_ops = { \
#ifdef CONFIG_SECURITY_SMACK
static const struct pid_entry smack_attr_dir_stuff[] = {
- ATTR("smack", "current", 0666),
+ ATTR(LSM_ID_SMACK, "current", 0666),
};
LSM_DIR_OPS(smack);
#endif
#ifdef CONFIG_SECURITY_APPARMOR
static const struct pid_entry apparmor_attr_dir_stuff[] = {
- ATTR("apparmor", "current", 0666),
- ATTR("apparmor", "prev", 0444),
- ATTR("apparmor", "exec", 0666),
+ ATTR(LSM_ID_APPARMOR, "current", 0666),
+ ATTR(LSM_ID_APPARMOR, "prev", 0444),
+ ATTR(LSM_ID_APPARMOR, "exec", 0666),
};
LSM_DIR_OPS(apparmor);
#endif
static const struct pid_entry attr_dir_stuff[] = {
- ATTR(NULL, "current", 0666),
- ATTR(NULL, "prev", 0444),
- ATTR(NULL, "exec", 0666),
- ATTR(NULL, "fscreate", 0666),
- ATTR(NULL, "keycreate", 0666),
- ATTR(NULL, "sockcreate", 0666),
+ ATTR(LSM_ID_UNDEF, "current", 0666),
+ ATTR(LSM_ID_UNDEF, "prev", 0444),
+ ATTR(LSM_ID_UNDEF, "exec", 0666),
+ ATTR(LSM_ID_UNDEF, "fscreate", 0666),
+ ATTR(LSM_ID_UNDEF, "keycreate", 0666),
+ ATTR(LSM_ID_UNDEF, "sockcreate", 0666),
#ifdef CONFIG_SECURITY_SMACK
DIR("smack", 0555,
proc_smack_attr_dir_inode_ops, proc_smack_attr_dir_ops),
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 9a8f32f21ff5..a71ac5379584 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -92,7 +92,7 @@ union proc_op {
int (*proc_show)(struct seq_file *m,
struct pid_namespace *ns, struct pid *pid,
struct task_struct *task);
- const char *lsm;
+ int lsmid;
};
struct proc_inode {
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 435b61054b5b..62b16f42d5d2 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -273,7 +273,8 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
const char *name = NULL;
if (file) {
- struct inode *inode = file_inode(vma->vm_file);
+ const struct inode *inode = file_user_inode(vma->vm_file);
+
dev = inode->i_sb->s_dev;
ino = inode->i_ino;
pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT;
@@ -865,7 +866,8 @@ static int show_smap(struct seq_file *m, void *v)
__show_smap(m, &mss, false);
seq_printf(m, "THPeligible: %8u\n",
- hugepage_vma_check(vma, vma->vm_flags, true, false, true));
+ !!thp_vma_allowable_orders(vma, vma->vm_flags, true, false,
+ true, THP_ORDERS_ALL));
if (arch_pkeys_enabled())
seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma));
@@ -1761,7 +1763,7 @@ static int pagemap_release(struct inode *inode, struct file *file)
#define PM_SCAN_CATEGORIES (PAGE_IS_WPALLOWED | PAGE_IS_WRITTEN | \
PAGE_IS_FILE | PAGE_IS_PRESENT | \
PAGE_IS_SWAPPED | PAGE_IS_PFNZERO | \
- PAGE_IS_HUGE)
+ PAGE_IS_HUGE | PAGE_IS_SOFT_DIRTY)
#define PM_SCAN_FLAGS (PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC)
struct pagemap_scan_private {
@@ -1793,6 +1795,8 @@ static unsigned long pagemap_page_category(struct pagemap_scan_private *p,
if (is_zero_pfn(pte_pfn(pte)))
categories |= PAGE_IS_PFNZERO;
+ if (pte_soft_dirty(pte))
+ categories |= PAGE_IS_SOFT_DIRTY;
} else if (is_swap_pte(pte)) {
swp_entry_t swp;
@@ -1806,6 +1810,8 @@ static unsigned long pagemap_page_category(struct pagemap_scan_private *p,
!PageAnon(pfn_swap_entry_to_page(swp)))
categories |= PAGE_IS_FILE;
}
+ if (pte_swp_soft_dirty(pte))
+ categories |= PAGE_IS_SOFT_DIRTY;
}
return categories;
@@ -1853,12 +1859,16 @@ static unsigned long pagemap_thp_category(struct pagemap_scan_private *p,
if (is_zero_pfn(pmd_pfn(pmd)))
categories |= PAGE_IS_PFNZERO;
+ if (pmd_soft_dirty(pmd))
+ categories |= PAGE_IS_SOFT_DIRTY;
} else if (is_swap_pmd(pmd)) {
swp_entry_t swp;
categories |= PAGE_IS_SWAPPED;
if (!pmd_swp_uffd_wp(pmd))
categories |= PAGE_IS_WRITTEN;
+ if (pmd_swp_soft_dirty(pmd))
+ categories |= PAGE_IS_SOFT_DIRTY;
if (p->masks_of_interest & PAGE_IS_FILE) {
swp = pmd_to_swp_entry(pmd);
@@ -1905,10 +1915,14 @@ static unsigned long pagemap_hugetlb_category(pte_t pte)
categories |= PAGE_IS_FILE;
if (is_zero_pfn(pte_pfn(pte)))
categories |= PAGE_IS_PFNZERO;
+ if (pte_soft_dirty(pte))
+ categories |= PAGE_IS_SOFT_DIRTY;
} else if (is_swap_pte(pte)) {
categories |= PAGE_IS_SWAPPED;
if (!pte_swp_uffd_wp_any(pte))
categories |= PAGE_IS_WRITTEN;
+ if (pte_swp_soft_dirty(pte))
+ categories |= PAGE_IS_SOFT_DIRTY;
}
return categories;
@@ -2007,6 +2021,9 @@ static int pagemap_scan_test_walk(unsigned long start, unsigned long end,
if (wp_allowed)
vma_category |= PAGE_IS_WPALLOWED;
+ if (vma->vm_flags & VM_SOFTDIRTY)
+ vma_category |= PAGE_IS_SOFT_DIRTY;
+
if (!pagemap_scan_is_interesting_vma(vma_category, p))
return 1;
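
With PAGE_IS_SOFT_DIRTY reported by all four walkers (pte, thp, hugetlb, plus the vma-level VM_SOFTDIRTY pass), PAGEMAP_SCAN can now filter on soft-dirty state. A hedged userspace sketch; field names follow uapi/linux/fs.h as of this series, but treat the details as an assumption:

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

/* Report sub-ranges of [start, end) whose pages are soft-dirty. */
static long scan_soft_dirty(int pagemap_fd, uint64_t start, uint64_t end,
			    struct page_region *vec, uint64_t vec_len)
{
	struct pm_scan_arg arg = {
		.size = sizeof(arg),
		.start = start,
		.end = end,
		.vec = (uintptr_t)vec,
		.vec_len = vec_len,
		.category_mask = PAGE_IS_SOFT_DIRTY,
		.return_mask = PAGE_IS_SOFT_DIRTY,
	};

	return ioctl(pagemap_fd, PAGEMAP_SCAN, &arg);
}
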
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 250eb5bf7b52..0a808951b7d3 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -142,13 +142,9 @@ static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt)
seq_printf(m, "%i %i %u:%u ", r->mnt_id, r->mnt_parent->mnt_id,
MAJOR(sb->s_dev), MINOR(sb->s_dev));
- if (sb->s_op->show_path) {
- err = sb->s_op->show_path(m, mnt->mnt_root);
- if (err)
- goto out;
- } else {
- seq_dentry(m, mnt->mnt_root, " \t\n\\");
- }
+ err = show_path(m, mnt->mnt_root);
+ if (err)
+ goto out;
seq_putc(m, ' ');
/* mountpoints outside of chroot jail will give SEQ_SKIP on this */
@@ -283,8 +279,6 @@ static int mounts_open_common(struct inode *inode, struct file *file,
p->ns = ns;
p->root = root;
p->show = show;
- INIT_LIST_HEAD(&p->cursor.mnt_list);
- p->cursor.mnt.mnt_flags = MNT_CURSOR;
return 0;
@@ -301,7 +295,6 @@ static int mounts_release(struct inode *inode, struct file *file)
struct seq_file *m = file->private_data;
struct proc_mounts *p = m->private;
path_put(&p->root);
- mnt_cursor_del(p->ns, &p->cursor);
put_mnt_ns(p->ns);
return seq_release_private(inode, file);
}
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 58b5de081b57..44ff2813ae51 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -1787,7 +1787,7 @@ EXPORT_SYMBOL(dquot_alloc_inode);
/*
* Convert in-memory reserved quotas to real consumed quotas
*/
-int dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
+void dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
{
struct dquot **dquots;
int cnt, index;
@@ -1797,7 +1797,7 @@ int dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
*inode_reserved_space(inode) -= number;
__inode_add_bytes(inode, number);
spin_unlock(&inode->i_lock);
- return 0;
+ return;
}
dquots = i_dquot(inode);
@@ -1822,7 +1822,7 @@ int dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
spin_unlock(&inode->i_lock);
mark_all_dquot_dirty(dquots);
srcu_read_unlock(&dquot_srcu, index);
- return 0;
+ return;
}
EXPORT_SYMBOL(dquot_claim_space_nodirty);
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index efb1b4c1a0a4..7a6d980e614d 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -70,7 +70,7 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
/* make various checks */
order = get_order(newsize);
- if (unlikely(order > MAX_ORDER))
+ if (unlikely(order > MAX_PAGE_ORDER))
return -EFBIG;
ret = inode_newsize_ok(inode, newsize);
diff --git a/fs/read_write.c b/fs/read_write.c
index 4771701c896b..d4c036e82b6c 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -354,6 +354,9 @@ out_putf:
int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
{
+ int mask = read_write == READ ? MAY_READ : MAY_WRITE;
+ int ret;
+
if (unlikely((ssize_t) count < 0))
return -EINVAL;
@@ -371,8 +374,11 @@ int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t
}
}
- return security_file_permission(file,
- read_write == READ ? MAY_READ : MAY_WRITE);
+ ret = security_file_permission(file, mask);
+ if (ret)
+ return ret;
+
+ return fsnotify_file_area_perm(file, mask, ppos, count);
}
EXPORT_SYMBOL(rw_verify_area);
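
Since rw_verify_area() is the common gate for the read/write paths, hooking it here funnels every range-checked read and write through the fsnotify permission-event machinery, matching the hooks added to vfs_fallocate() and remap_verify_area() elsewhere in this series. The signature assumed by these call sites:

/* Assumed prototype, per the call sites in this series. */
int fsnotify_file_area_perm(struct file *file, int perm_mask,
			    const loff_t *ppos, size_t count);
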
@@ -773,12 +779,14 @@ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
return ret;
}
-static ssize_t do_iter_read(struct file *file, struct iov_iter *iter,
- loff_t *pos, rwf_t flags)
+ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb,
+ struct iov_iter *iter)
{
size_t tot_len;
ssize_t ret = 0;
+ if (!file->f_op->read_iter)
+ return -EINVAL;
if (!(file->f_mode & FMODE_READ))
return -EBADF;
if (!(file->f_mode & FMODE_CAN_READ))
@@ -787,22 +795,20 @@ static ssize_t do_iter_read(struct file *file, struct iov_iter *iter,
tot_len = iov_iter_count(iter);
if (!tot_len)
goto out;
- ret = rw_verify_area(READ, file, pos, tot_len);
+ ret = rw_verify_area(READ, file, &iocb->ki_pos, tot_len);
if (ret < 0)
return ret;
- if (file->f_op->read_iter)
- ret = do_iter_readv_writev(file, iter, pos, READ, flags);
- else
- ret = do_loop_readv_writev(file, iter, pos, READ, flags);
+ ret = call_read_iter(file, iocb, iter);
out:
if (ret >= 0)
fsnotify_access(file);
return ret;
}
+EXPORT_SYMBOL(vfs_iocb_iter_read);
-ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb,
- struct iov_iter *iter)
+ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos,
+ rwf_t flags)
{
size_t tot_len;
ssize_t ret = 0;
@@ -817,33 +823,30 @@ ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb,
tot_len = iov_iter_count(iter);
if (!tot_len)
goto out;
- ret = rw_verify_area(READ, file, &iocb->ki_pos, tot_len);
+ ret = rw_verify_area(READ, file, ppos, tot_len);
if (ret < 0)
return ret;
- ret = call_read_iter(file, iocb, iter);
+ ret = do_iter_readv_writev(file, iter, ppos, READ, flags);
out:
if (ret >= 0)
fsnotify_access(file);
return ret;
}
-EXPORT_SYMBOL(vfs_iocb_iter_read);
-
-ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos,
- rwf_t flags)
-{
- if (!file->f_op->read_iter)
- return -EINVAL;
- return do_iter_read(file, iter, ppos, flags);
-}
EXPORT_SYMBOL(vfs_iter_read);
-static ssize_t do_iter_write(struct file *file, struct iov_iter *iter,
- loff_t *pos, rwf_t flags)
+/*
+ * Caller is responsible for calling kiocb_end_write() on completion
+ * if async iocb was queued.
+ */
+ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb,
+ struct iov_iter *iter)
{
size_t tot_len;
ssize_t ret = 0;
+ if (!file->f_op->write_iter)
+ return -EINVAL;
if (!(file->f_mode & FMODE_WRITE))
return -EBADF;
if (!(file->f_mode & FMODE_CAN_WRITE))
@@ -852,88 +855,127 @@ static ssize_t do_iter_write(struct file *file, struct iov_iter *iter,
tot_len = iov_iter_count(iter);
if (!tot_len)
return 0;
- ret = rw_verify_area(WRITE, file, pos, tot_len);
+ ret = rw_verify_area(WRITE, file, &iocb->ki_pos, tot_len);
if (ret < 0)
return ret;
- if (file->f_op->write_iter)
- ret = do_iter_readv_writev(file, iter, pos, WRITE, flags);
- else
- ret = do_loop_readv_writev(file, iter, pos, WRITE, flags);
+ kiocb_start_write(iocb);
+ ret = call_write_iter(file, iocb, iter);
+ if (ret != -EIOCBQUEUED)
+ kiocb_end_write(iocb);
if (ret > 0)
fsnotify_modify(file);
+
return ret;
}
+EXPORT_SYMBOL(vfs_iocb_iter_write);
-ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb,
- struct iov_iter *iter)
+ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos,
+ rwf_t flags)
{
size_t tot_len;
- ssize_t ret = 0;
+ ssize_t ret;
- if (!file->f_op->write_iter)
- return -EINVAL;
if (!(file->f_mode & FMODE_WRITE))
return -EBADF;
if (!(file->f_mode & FMODE_CAN_WRITE))
return -EINVAL;
+ if (!file->f_op->write_iter)
+ return -EINVAL;
tot_len = iov_iter_count(iter);
if (!tot_len)
return 0;
- ret = rw_verify_area(WRITE, file, &iocb->ki_pos, tot_len);
+
+ ret = rw_verify_area(WRITE, file, ppos, tot_len);
if (ret < 0)
return ret;
- ret = call_write_iter(file, iocb, iter);
+ file_start_write(file);
+ ret = do_iter_readv_writev(file, iter, ppos, WRITE, flags);
if (ret > 0)
fsnotify_modify(file);
+ file_end_write(file);
return ret;
}
-EXPORT_SYMBOL(vfs_iocb_iter_write);
-
-ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos,
- rwf_t flags)
-{
- if (!file->f_op->write_iter)
- return -EINVAL;
- return do_iter_write(file, iter, ppos, flags);
-}
EXPORT_SYMBOL(vfs_iter_write);
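
Per the comment on vfs_iocb_iter_write() above, the helper now brackets the write with kiocb_start_write()/kiocb_end_write() on the synchronous path; only a queued async iocb (-EIOCBQUEUED) leaves ending write protection to the caller. A hypothetical completion handler honoring that contract:

/* Hypothetical ki_complete for an iocb submitted via vfs_iocb_iter_write().
 * Submission already did kiocb_start_write(); -EIOCBQUEUED transferred the
 * matching kiocb_end_write() to this completion. */
static void my_aio_write_complete(struct kiocb *iocb, long res)
{
	kiocb_end_write(iocb);
	/* ...propagate res to the originating request... */
}
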
static ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
- unsigned long vlen, loff_t *pos, rwf_t flags)
+ unsigned long vlen, loff_t *pos, rwf_t flags)
{
struct iovec iovstack[UIO_FASTIOV];
struct iovec *iov = iovstack;
struct iov_iter iter;
- ssize_t ret;
+ size_t tot_len;
+ ssize_t ret = 0;
- ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
- if (ret >= 0) {
- ret = do_iter_read(file, &iter, pos, flags);
- kfree(iov);
- }
+ if (!(file->f_mode & FMODE_READ))
+ return -EBADF;
+ if (!(file->f_mode & FMODE_CAN_READ))
+ return -EINVAL;
+
+ ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov,
+ &iter);
+ if (ret < 0)
+ return ret;
+
+ tot_len = iov_iter_count(&iter);
+ if (!tot_len)
+ goto out;
+ ret = rw_verify_area(READ, file, pos, tot_len);
+ if (ret < 0)
+ goto out;
+
+ if (file->f_op->read_iter)
+ ret = do_iter_readv_writev(file, &iter, pos, READ, flags);
+ else
+ ret = do_loop_readv_writev(file, &iter, pos, READ, flags);
+out:
+ if (ret >= 0)
+ fsnotify_access(file);
+ kfree(iov);
return ret;
}
static ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
- unsigned long vlen, loff_t *pos, rwf_t flags)
+ unsigned long vlen, loff_t *pos, rwf_t flags)
{
struct iovec iovstack[UIO_FASTIOV];
struct iovec *iov = iovstack;
struct iov_iter iter;
- ssize_t ret;
+ size_t tot_len;
+ ssize_t ret = 0;
- ret = import_iovec(ITER_SOURCE, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
- if (ret >= 0) {
- file_start_write(file);
- ret = do_iter_write(file, &iter, pos, flags);
- file_end_write(file);
- kfree(iov);
- }
+ if (!(file->f_mode & FMODE_WRITE))
+ return -EBADF;
+ if (!(file->f_mode & FMODE_CAN_WRITE))
+ return -EINVAL;
+
+ ret = import_iovec(ITER_SOURCE, vec, vlen, ARRAY_SIZE(iovstack), &iov,
+ &iter);
+ if (ret < 0)
+ return ret;
+
+ tot_len = iov_iter_count(&iter);
+ if (!tot_len)
+ goto out;
+
+ ret = rw_verify_area(WRITE, file, pos, tot_len);
+ if (ret < 0)
+ goto out;
+
+ file_start_write(file);
+ if (file->f_op->write_iter)
+ ret = do_iter_readv_writev(file, &iter, pos, WRITE, flags);
+ else
+ ret = do_loop_readv_writev(file, &iter, pos, WRITE, flags);
+ if (ret > 0)
+ fsnotify_modify(file);
+ file_end_write(file);
+out:
+ kfree(iov);
return ret;
}
@@ -1178,7 +1220,7 @@ COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd,
#endif /* CONFIG_COMPAT */
static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
- size_t count, loff_t max)
+ size_t count, loff_t max)
{
struct fd in, out;
struct inode *in_inode, *out_inode;
@@ -1250,10 +1292,8 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
retval = rw_verify_area(WRITE, out.file, &out_pos, count);
if (retval < 0)
goto fput_out;
- file_start_write(out.file);
retval = do_splice_direct(in.file, &pos, out.file, &out_pos,
count, fl);
- file_end_write(out.file);
} else {
if (out.file->f_flags & O_NONBLOCK)
fl |= SPLICE_F_NONBLOCK;
@@ -1362,38 +1402,6 @@ COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
}
#endif
-/**
- * generic_copy_file_range - copy data between two files
- * @file_in: file structure to read from
- * @pos_in: file offset to read from
- * @file_out: file structure to write data to
- * @pos_out: file offset to write data to
- * @len: amount of data to copy
- * @flags: copy flags
- *
- * This is a generic filesystem helper to copy data from one file to another.
- * It has no constraints on the source or destination file owners - the files
- * can belong to different superblocks and different filesystem types. Short
- * copies are allowed.
- *
- * This should be called from the @file_out filesystem, as per the
- * ->copy_file_range() method.
- *
- * Returns the number of bytes copied or a negative error indicating the
- * failure.
- */
-
-ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- size_t len, unsigned int flags)
-{
- lockdep_assert(sb_write_started(file_inode(file_out)->i_sb));
-
- return do_splice_direct(file_in, &pos_in, file_out, &pos_out,
- len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
-}
-EXPORT_SYMBOL(generic_copy_file_range);
-
/*
* Performs necessary checks before doing a file copy
*
@@ -1478,6 +1486,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
{
ssize_t ret;
bool splice = flags & COPY_FILE_SPLICE;
+ bool samesb = file_inode(file_in)->i_sb == file_inode(file_out)->i_sb;
if (flags & ~COPY_FILE_SPLICE)
return -EINVAL;
@@ -1509,19 +1518,24 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
ret = file_out->f_op->copy_file_range(file_in, pos_in,
file_out, pos_out,
len, flags);
- goto done;
- }
-
- if (!splice && file_in->f_op->remap_file_range &&
- file_inode(file_in)->i_sb == file_inode(file_out)->i_sb) {
+ } else if (!splice && file_in->f_op->remap_file_range && samesb) {
ret = file_in->f_op->remap_file_range(file_in, pos_in,
file_out, pos_out,
min_t(loff_t, MAX_RW_COUNT, len),
REMAP_FILE_CAN_SHORTEN);
- if (ret > 0)
- goto done;
+ /* fallback to splice */
+ if (ret <= 0)
+ splice = true;
+ } else if (samesb) {
+ /* Fallback to splice for same sb copy for backward compat */
+ splice = true;
}
+ file_end_write(file_out);
+
+ if (!splice)
+ goto done;
+
/*
* We can get here for same sb copy of filesystems that do not implement
* ->copy_file_range() in case filesystem does not support clone or in
@@ -1533,11 +1547,16 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
* and which filesystems do not, that will allow userspace tools to
 * make consistent decisions w.r.t. using copy_file_range().
*
- * We also get here if caller (e.g. nfsd) requested COPY_FILE_SPLICE.
+ * We also get here if the caller (e.g. nfsd) requested COPY_FILE_SPLICE
+ * for a server-side copy between any two sb.
+ *
+ * In any case, we call do_splice_direct() and not splice_file_range(),
+ * without file_start_write() held, to avoid possible deadlocks related
+ * to splicing from the input file while file_start_write() is held on
+ * the output file on a different sb.
*/
- ret = generic_copy_file_range(file_in, pos_in, file_out, pos_out, len,
- flags);
-
+ ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
+ min_t(size_t, len, MAX_RW_COUNT), 0);
done:
if (ret > 0) {
fsnotify_access(file_in);
@@ -1549,8 +1568,6 @@ done:
inc_syscr(current);
inc_syscw(current);
- file_end_write(file_out);
-
return ret;
}
EXPORT_SYMBOL(vfs_copy_file_range);
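
With generic_copy_file_range() gone, same-sb copies fall back to an in-kernel splice and cross-sb copies rely on ->copy_file_range() (nfsd can still force splice with COPY_FILE_SPLICE). Nothing changes for userspace; a minimal short-copy-tolerant loop for reference (hypothetical fds):

#define _GNU_SOURCE
#include <unistd.h>

/* Copy len bytes between two open fds, retrying short copies. */
static int copy_all(int fd_in, int fd_out, size_t len)
{
	while (len > 0) {
		ssize_t n = copy_file_range(fd_in, NULL, fd_out, NULL, len, 0);
		if (n <= 0)
			return n ? -1 : 0;	/* error, or EOF reached early */
		len -= n;
	}
	return 0;
}
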
diff --git a/fs/readdir.c b/fs/readdir.c
index c8c46e294431..278bc0254732 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -96,6 +96,10 @@ int iterate_dir(struct file *file, struct dir_context *ctx)
if (res)
goto out;
+ res = fsnotify_file_perm(file, MAY_READ);
+ if (res)
+ goto out;
+
res = down_read_killable(&inode->i_rwsem);
if (res)
goto out;
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index 2138ee7d271d..5faf702f8d15 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -1407,7 +1407,7 @@ void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th,
INITIALIZE_PATH(path);
int item_len = 0;
int tb_init = 0;
- struct cpu_key cpu_key;
+ struct cpu_key cpu_key = {};
int retval;
int quota_cut_bytes = 0;
diff --git a/fs/remap_range.c b/fs/remap_range.c
index 87ae4f0dc3aa..f8c1120b8311 100644
--- a/fs/remap_range.c
+++ b/fs/remap_range.c
@@ -102,7 +102,9 @@ static int generic_remap_checks(struct file *file_in, loff_t pos_in,
static int remap_verify_area(struct file *file, loff_t pos, loff_t len,
bool write)
{
+ int mask = write ? MAY_WRITE : MAY_READ;
loff_t tmp;
+ int ret;
if (unlikely(pos < 0 || len < 0))
return -EINVAL;
@@ -110,7 +112,11 @@ static int remap_verify_area(struct file *file, loff_t pos, loff_t len,
if (unlikely(check_add_overflow(pos, len, &tmp)))
return -EINVAL;
- return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
+ ret = security_file_permission(file, mask);
+ if (ret)
+ return ret;
+
+ return fsnotify_file_area_perm(file, mask, &pos, len);
}
/*
@@ -385,14 +391,6 @@ loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
if (!file_in->f_op->remap_file_range)
return -EOPNOTSUPP;
- ret = remap_verify_area(file_in, pos_in, len, false);
- if (ret)
- return ret;
-
- ret = remap_verify_area(file_out, pos_out, len, true);
- if (ret)
- return ret;
-
ret = file_in->f_op->remap_file_range(file_in, pos_in,
file_out, pos_out, len, remap_flags);
if (ret < 0)
@@ -410,6 +408,14 @@ loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in,
{
loff_t ret;
+ ret = remap_verify_area(file_in, pos_in, len, false);
+ if (ret)
+ return ret;
+
+ ret = remap_verify_area(file_out, pos_out, len, true);
+ if (ret)
+ return ret;
+
file_start_write(file_out);
ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len,
remap_flags);
@@ -420,7 +426,7 @@ loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in,
EXPORT_SYMBOL(vfs_clone_file_range);
/* Check whether we are allowed to dedupe the destination file */
-static bool allow_file_dedupe(struct file *file)
+static bool may_dedupe_file(struct file *file)
{
struct mnt_idmap *idmap = file_mnt_idmap(file);
struct inode *inode = file_inode(file);
@@ -445,24 +451,29 @@ loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP |
REMAP_FILE_CAN_SHORTEN));
- ret = mnt_want_write_file(dst_file);
- if (ret)
- return ret;
-
/*
* This is redundant if called from vfs_dedupe_file_range(), but other
 * callers need it and it's not performance sensitive...
*/
ret = remap_verify_area(src_file, src_pos, len, false);
if (ret)
- goto out_drop_write;
+ return ret;
ret = remap_verify_area(dst_file, dst_pos, len, true);
if (ret)
- goto out_drop_write;
+ return ret;
+
+ /*
+ * This needs to be called after remap_verify_area() because of
+ * sb_start_write() and before may_dedupe_file() because the mount's
+ * MAY_WRITE needs to be checked with mnt_get_write_access_file() held.
+ */
+ ret = mnt_want_write_file(dst_file);
+ if (ret)
+ return ret;
ret = -EPERM;
- if (!allow_file_dedupe(dst_file))
+ if (!may_dedupe_file(dst_file))
goto out_drop_write;
ret = -EXDEV;
diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c
index 2131638f26d0..99b0ade833aa 100644
--- a/fs/smb/client/cifsfs.c
+++ b/fs/smb/client/cifsfs.c
@@ -25,6 +25,7 @@
#include <linux/freezer.h>
#include <linux/namei.h>
#include <linux/random.h>
+#include <linux/splice.h>
#include <linux/uuid.h>
#include <linux/xattr.h>
#include <uapi/linux/magic.h>
@@ -1506,8 +1507,8 @@ static ssize_t cifs_copy_file_range(struct file *src_file, loff_t off,
free_xid(xid);
if (rc == -EOPNOTSUPP || rc == -EXDEV)
- rc = generic_copy_file_range(src_file, off, dst_file,
- destoff, len, flags);
+ rc = splice_copy_file_range(src_file, off, dst_file,
+ destoff, len);
return rc;
}
diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c
index 32a8525415d9..4e84e88b47e3 100644
--- a/fs/smb/client/file.c
+++ b/fs/smb/client/file.c
@@ -2706,8 +2706,7 @@ static void cifs_extend_writeback(struct address_space *mapping,
*/
if (!folio_clear_dirty_for_io(folio))
WARN_ON(1);
- if (folio_start_writeback(folio))
- WARN_ON(1);
+ folio_start_writeback(folio);
*_count -= folio_nr_pages(folio);
folio_unlock(folio);
@@ -2742,8 +2741,7 @@ static ssize_t cifs_write_back_from_locked_folio(struct address_space *mapping,
int rc;
/* The folio should be locked, dirty and not undergoing writeback. */
- if (folio_start_writeback(folio))
- WARN_ON(1);
+ folio_start_writeback(folio);
count -= folio_nr_pages(folio);
len = folio_size(folio);
diff --git a/fs/splice.c b/fs/splice.c
index d983d375ff11..218e24b1ac40 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -201,7 +201,8 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
unsigned int tail = pipe->tail;
unsigned int head = pipe->head;
unsigned int mask = pipe->ring_size - 1;
- int ret = 0, page_nr = 0;
+ ssize_t ret = 0;
+ int page_nr = 0;
if (!spd_pages)
return 0;
@@ -673,10 +674,13 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
.u.file = out,
};
int nbufs = pipe->max_usage;
- struct bio_vec *array = kcalloc(nbufs, sizeof(struct bio_vec),
- GFP_KERNEL);
+ struct bio_vec *array;
ssize_t ret;
+ if (!out->f_op->write_iter)
+ return -EINVAL;
+
+ array = kcalloc(nbufs, sizeof(struct bio_vec), GFP_KERNEL);
if (unlikely(!array))
return -ENOMEM;
@@ -684,6 +688,7 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
splice_from_pipe_begin(&sd);
while (sd.total_len) {
+ struct kiocb kiocb;
struct iov_iter from;
unsigned int head, tail, mask;
size_t left;
@@ -733,7 +738,10 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
}
iov_iter_bvec(&from, ITER_SOURCE, array, n, sd.total_len - left);
- ret = vfs_iter_write(out, &from, &sd.pos, 0);
+ init_sync_kiocb(&kiocb, out);
+ kiocb.ki_pos = sd.pos;
+ ret = call_write_iter(out, &kiocb, &from);
+ sd.pos = kiocb.ki_pos;
if (ret <= 0)
break;
@@ -925,8 +933,8 @@ static int warn_unsupported(struct file *file, const char *op)
/*
* Attempt to initiate a splice from pipe to file.
*/
-static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
- loff_t *ppos, size_t len, unsigned int flags)
+static ssize_t do_splice_from(struct pipe_inode_info *pipe, struct file *out,
+ loff_t *ppos, size_t len, unsigned int flags)
{
if (unlikely(!out->f_op->splice_write))
return warn_unsupported(out, "write");
@@ -944,27 +952,15 @@ static void do_splice_eof(struct splice_desc *sd)
sd->splice_eof(sd);
}
-/**
- * vfs_splice_read - Read data from a file and splice it into a pipe
- * @in: File to splice from
- * @ppos: Input file offset
- * @pipe: Pipe to splice to
- * @len: Number of bytes to splice
- * @flags: Splice modifier flags (SPLICE_F_*)
- *
- * Splice the requested amount of data from the input file to the pipe. This
- * is synchronous as the caller must hold the pipe lock across the entire
- * operation.
- *
- * If successful, it returns the amount of data spliced, 0 if it hit the EOF or
- * a hole and a negative error code otherwise.
+/*
+ * Callers have already called rw_verify_area() on the entire range.
+ * No need to call it again for sub-ranges.
*/
-long vfs_splice_read(struct file *in, loff_t *ppos,
- struct pipe_inode_info *pipe, size_t len,
- unsigned int flags)
+static ssize_t do_splice_read(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe, size_t len,
+ unsigned int flags)
{
unsigned int p_space;
- int ret;
if (unlikely(!(in->f_mode & FMODE_READ)))
return -EBADF;
@@ -975,10 +971,6 @@ long vfs_splice_read(struct file *in, loff_t *ppos,
p_space = pipe->max_usage - pipe_occupancy(pipe->head, pipe->tail);
len = min_t(size_t, len, p_space << PAGE_SHIFT);
- ret = rw_verify_area(READ, in, ppos, len);
- if (unlikely(ret < 0))
- return ret;
-
if (unlikely(len > MAX_RW_COUNT))
len = MAX_RW_COUNT;
@@ -992,6 +984,34 @@ long vfs_splice_read(struct file *in, loff_t *ppos,
return copy_splice_read(in, ppos, pipe, len, flags);
return in->f_op->splice_read(in, ppos, pipe, len, flags);
}
+
+/**
+ * vfs_splice_read - Read data from a file and splice it into a pipe
+ * @in: File to splice from
+ * @ppos: Input file offset
+ * @pipe: Pipe to splice to
+ * @len: Number of bytes to splice
+ * @flags: Splice modifier flags (SPLICE_F_*)
+ *
+ * Splice the requested amount of data from the input file to the pipe. This
+ * is synchronous as the caller must hold the pipe lock across the entire
+ * operation.
+ *
+ * If successful, it returns the amount of data spliced, 0 if it hit the EOF or
+ * a hole and a negative error code otherwise.
+ */
+ssize_t vfs_splice_read(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe, size_t len,
+ unsigned int flags)
+{
+ ssize_t ret;
+
+ ret = rw_verify_area(READ, in, ppos, len);
+ if (unlikely(ret < 0))
+ return ret;
+
+ return do_splice_read(in, ppos, pipe, len, flags);
+}
EXPORT_SYMBOL_GPL(vfs_splice_read);
/**
@@ -1011,7 +1031,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
splice_direct_actor *actor)
{
struct pipe_inode_info *pipe;
- long ret, bytes;
+ ssize_t ret, bytes;
size_t len;
int i, flags, more;
@@ -1066,7 +1086,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
size_t read_len;
loff_t pos = sd->pos, prev_pos = pos;
- ret = vfs_splice_read(in, &pos, pipe, len, flags);
+ ret = do_splice_read(in, &pos, pipe, len, flags);
if (unlikely(ret <= 0))
goto read_failure;
@@ -1138,9 +1158,20 @@ static int direct_splice_actor(struct pipe_inode_info *pipe,
struct splice_desc *sd)
{
struct file *file = sd->u.file;
+ long ret;
+
+ file_start_write(file);
+ ret = do_splice_from(pipe, file, sd->opos, sd->total_len, sd->flags);
+ file_end_write(file);
+ return ret;
+}
+
+static int splice_file_range_actor(struct pipe_inode_info *pipe,
+ struct splice_desc *sd)
+{
+ struct file *file = sd->u.file;
- return do_splice_from(pipe, file, sd->opos, sd->total_len,
- sd->flags);
+ return do_splice_from(pipe, file, sd->opos, sd->total_len, sd->flags);
}
static void direct_file_splice_eof(struct splice_desc *sd)
@@ -1151,24 +1182,10 @@ static void direct_file_splice_eof(struct splice_desc *sd)
file->f_op->splice_eof(file);
}
-/**
- * do_splice_direct - splices data directly between two files
- * @in: file to splice from
- * @ppos: input file offset
- * @out: file to splice to
- * @opos: output file offset
- * @len: number of bytes to splice
- * @flags: splice modifier flags
- *
- * Description:
- * For use by do_sendfile(). splice can easily emulate sendfile, but
- * doing it in the application would incur an extra system call
- * (splice in + splice out, as compared to just sendfile()). So this helper
- * can splice directly through a process-private pipe.
- *
- */
-long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
- loff_t *opos, size_t len, unsigned int flags)
+static ssize_t do_splice_direct_actor(struct file *in, loff_t *ppos,
+ struct file *out, loff_t *opos,
+ size_t len, unsigned int flags,
+ splice_direct_actor *actor)
{
struct splice_desc sd = {
.len = len,
@@ -1179,7 +1196,7 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
.splice_eof = direct_file_splice_eof,
.opos = opos,
};
- long ret;
+ ssize_t ret;
if (unlikely(!(out->f_mode & FMODE_WRITE)))
return -EBADF;
@@ -1187,18 +1204,63 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
if (unlikely(out->f_flags & O_APPEND))
return -EINVAL;
- ret = rw_verify_area(WRITE, out, opos, len);
- if (unlikely(ret < 0))
- return ret;
-
- ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
+ ret = splice_direct_to_actor(in, &sd, actor);
if (ret > 0)
*ppos = sd.pos;
return ret;
}
+/**
+ * do_splice_direct - splices data directly between two files
+ * @in: file to splice from
+ * @ppos: input file offset
+ * @out: file to splice to
+ * @opos: output file offset
+ * @len: number of bytes to splice
+ * @flags: splice modifier flags
+ *
+ * Description:
+ * For use by do_sendfile(). splice can easily emulate sendfile, but
+ * doing it in the application would incur an extra system call
+ * (splice in + splice out, as compared to just sendfile()). So this helper
+ * can splice directly through a process-private pipe.
+ *
+ * Callers have already called rw_verify_area() on the entire range.
+ */
+ssize_t do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
+ loff_t *opos, size_t len, unsigned int flags)
+{
+ return do_splice_direct_actor(in, ppos, out, opos, len, flags,
+ direct_splice_actor);
+}
EXPORT_SYMBOL(do_splice_direct);
+/**
+ * splice_file_range - splices data between two files for copy_file_range()
+ * @in: file to splice from
+ * @ppos: input file offset
+ * @out: file to splice to
+ * @opos: output file offset
+ * @len: number of bytes to splice
+ *
+ * Description:
+ * For use by ->copy_file_range() methods.
+ * Like do_splice_direct(), but vfs_copy_file_range() already holds
+ * file_start_write() on the @out file.
+ *
+ * Callers have already called rw_verify_area() on the entire range.
+ */
+ssize_t splice_file_range(struct file *in, loff_t *ppos, struct file *out,
+ loff_t *opos, size_t len)
+{
+ lockdep_assert(file_write_started(out));
+
+ return do_splice_direct_actor(in, ppos, out, opos,
+ min_t(size_t, len, MAX_RW_COUNT),
+ 0, splice_file_range_actor);
+}
+EXPORT_SYMBOL(splice_file_range);
+
static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags)
{
for (;;) {
@@ -1220,17 +1282,17 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
struct pipe_inode_info *opipe,
size_t len, unsigned int flags);
-long splice_file_to_pipe(struct file *in,
- struct pipe_inode_info *opipe,
- loff_t *offset,
- size_t len, unsigned int flags)
+ssize_t splice_file_to_pipe(struct file *in,
+ struct pipe_inode_info *opipe,
+ loff_t *offset,
+ size_t len, unsigned int flags)
{
- long ret;
+ ssize_t ret;
pipe_lock(opipe);
ret = wait_for_space(opipe, flags);
if (!ret)
- ret = vfs_splice_read(in, offset, opipe, len, flags);
+ ret = do_splice_read(in, offset, opipe, len, flags);
pipe_unlock(opipe);
if (ret > 0)
wakeup_pipe_readers(opipe);
@@ -1240,13 +1302,13 @@ long splice_file_to_pipe(struct file *in,
/*
* Determine where to splice to/from.
*/
-long do_splice(struct file *in, loff_t *off_in, struct file *out,
- loff_t *off_out, size_t len, unsigned int flags)
+ssize_t do_splice(struct file *in, loff_t *off_in, struct file *out,
+ loff_t *off_out, size_t len, unsigned int flags)
{
struct pipe_inode_info *ipipe;
struct pipe_inode_info *opipe;
loff_t offset;
- long ret;
+ ssize_t ret;
if (unlikely(!(in->f_mode & FMODE_READ) ||
!(out->f_mode & FMODE_WRITE)))
@@ -1307,6 +1369,10 @@ long do_splice(struct file *in, loff_t *off_in, struct file *out,
offset = in->f_pos;
}
+ ret = rw_verify_area(READ, in, &offset, len);
+ if (unlikely(ret < 0))
+ return ret;
+
if (out->f_flags & O_NONBLOCK)
flags |= SPLICE_F_NONBLOCK;
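
A minimal userspace sketch (illustrative only) of the splice(2) flow
that do_splice() serves; with the hunk above, the read-side
rw_verify_area() check now runs before the pipe is touched. File names
are placeholders and error handling is minimal.

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        int in = open("src.bin", O_RDONLY);
        int out = open("dst.bin", O_WRONLY | O_CREAT | O_TRUNC, 0644);
        int pfd[2];
        ssize_t n;

        if (in < 0 || out < 0 || pipe(pfd))
            return 1;

        /* file -> pipe, then drain pipe -> file, until EOF */
        while ((n = splice(in, NULL, pfd[1], NULL, 65536, 0)) > 0) {
            while (n > 0) {
                ssize_t m = splice(pfd[0], NULL, out, NULL, n, 0);

                if (m <= 0) {
                    perror("splice");
                    return 1;
                }
                n -= m;
            }
        }
        return 0;
    }
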
@@ -1333,14 +1399,14 @@ long do_splice(struct file *in, loff_t *off_in, struct file *out,
return ret;
}
-static long __do_splice(struct file *in, loff_t __user *off_in,
- struct file *out, loff_t __user *off_out,
- size_t len, unsigned int flags)
+static ssize_t __do_splice(struct file *in, loff_t __user *off_in,
+ struct file *out, loff_t __user *off_out,
+ size_t len, unsigned int flags)
{
struct pipe_inode_info *ipipe;
struct pipe_inode_info *opipe;
loff_t offset, *__off_in = NULL, *__off_out = NULL;
- long ret;
+ ssize_t ret;
ipipe = get_pipe_info(in, true);
opipe = get_pipe_info(out, true);
@@ -1379,16 +1445,16 @@ static long __do_splice(struct file *in, loff_t __user *off_in,
return ret;
}
-static int iter_to_pipe(struct iov_iter *from,
- struct pipe_inode_info *pipe,
- unsigned flags)
+static ssize_t iter_to_pipe(struct iov_iter *from,
+ struct pipe_inode_info *pipe,
+ unsigned int flags)
{
struct pipe_buffer buf = {
.ops = &user_page_pipe_buf_ops,
.flags = flags
};
size_t total = 0;
- int ret = 0;
+ ssize_t ret = 0;
while (iov_iter_count(from)) {
struct page *pages[16];
@@ -1437,8 +1503,8 @@ static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
* For lack of a better implementation, implement vmsplice() to userspace
* as a simple copy of the pipes pages to the user iov.
*/
-static long vmsplice_to_user(struct file *file, struct iov_iter *iter,
- unsigned int flags)
+static ssize_t vmsplice_to_user(struct file *file, struct iov_iter *iter,
+ unsigned int flags)
{
struct pipe_inode_info *pipe = get_pipe_info(file, true);
struct splice_desc sd = {
@@ -1446,7 +1512,7 @@ static long vmsplice_to_user(struct file *file, struct iov_iter *iter,
.flags = flags,
.u.data = iter
};
- long ret = 0;
+ ssize_t ret = 0;
if (!pipe)
return -EBADF;
@@ -1470,11 +1536,11 @@ static long vmsplice_to_user(struct file *file, struct iov_iter *iter,
* as splice-from-memory, where the regular splice is splice-from-file (or
* to file). In both cases the output is a pipe, naturally.
*/
-static long vmsplice_to_pipe(struct file *file, struct iov_iter *iter,
- unsigned int flags)
+static ssize_t vmsplice_to_pipe(struct file *file, struct iov_iter *iter,
+ unsigned int flags)
{
struct pipe_inode_info *pipe;
- long ret = 0;
+ ssize_t ret = 0;
unsigned buf_flag = 0;
if (flags & SPLICE_F_GIFT)
@@ -1570,7 +1636,7 @@ SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
size_t, len, unsigned int, flags)
{
struct fd in, out;
- long error;
+ ssize_t error;
if (unlikely(!len))
return 0;
@@ -1584,7 +1650,7 @@ SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
out = fdget(fd_out);
if (out.file) {
error = __do_splice(in.file, off_in, out.file, off_out,
- len, flags);
+ len, flags);
fdput(out);
}
fdput(in);
@@ -1807,15 +1873,15 @@ retry:
/*
* Link contents of ipipe to opipe.
*/
-static int link_pipe(struct pipe_inode_info *ipipe,
- struct pipe_inode_info *opipe,
- size_t len, unsigned int flags)
+static ssize_t link_pipe(struct pipe_inode_info *ipipe,
+ struct pipe_inode_info *opipe,
+ size_t len, unsigned int flags)
{
struct pipe_buffer *ibuf, *obuf;
unsigned int i_head, o_head;
unsigned int i_tail, o_tail;
unsigned int i_mask, o_mask;
- int ret = 0;
+ ssize_t ret = 0;
/*
* Potential ABBA deadlock, work around it by ordering lock
@@ -1898,11 +1964,12 @@ static int link_pipe(struct pipe_inode_info *ipipe,
* The 'flags' used are the SPLICE_F_* variants, currently the only
* applicable one is SPLICE_F_NONBLOCK.
*/
-long do_tee(struct file *in, struct file *out, size_t len, unsigned int flags)
+ssize_t do_tee(struct file *in, struct file *out, size_t len,
+ unsigned int flags)
{
struct pipe_inode_info *ipipe = get_pipe_info(in, true);
struct pipe_inode_info *opipe = get_pipe_info(out, true);
- int ret = -EINVAL;
+ ssize_t ret = -EINVAL;
if (unlikely(!(in->f_mode & FMODE_READ) ||
!(out->f_mode & FMODE_WRITE)))
@@ -1939,7 +2006,7 @@ long do_tee(struct file *in, struct file *out, size_t len, unsigned int flags)
SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
{
struct fd in, out;
- int error;
+ ssize_t error;
if (unlikely(flags & ~SPLICE_F_ALL))
return -EINVAL;
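
And a hedged userspace sketch of tee(2), which do_tee() above now
services with ssize_t-clean types. It is modeled on the tee(2) man page
example: stdin and stdout must both be pipes, and "log.bin" is a
placeholder.

    #define _GNU_SOURCE
    #include <errno.h>
    #include <fcntl.h>
    #include <limits.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        int fd = open("log.bin", O_WRONLY | O_CREAT | O_TRUNC, 0644);
        ssize_t n;

        if (fd < 0)
            return 1;

        for (;;) {
            /* Duplicate stdin's pipe contents to stdout without consuming. */
            n = tee(STDIN_FILENO, STDOUT_FILENO, INT_MAX, SPLICE_F_NONBLOCK);
            if (n < 0) {
                if (errno == EAGAIN)
                    continue;
                perror("tee");
                return 1;
            }
            if (n == 0)
                break;
            /* Now consume the same bytes by splicing them to the file. */
            while (n > 0) {
                ssize_t m = splice(STDIN_FILENO, NULL, fd, NULL, n,
                                   SPLICE_F_MOVE);
                if (m <= 0)
                    return 1;
                n -= m;
            }
        }
        return 0;
    }
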
diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c
index 8ba8c4c50770..e8df6430444b 100644
--- a/fs/squashfs/file.c
+++ b/fs/squashfs/file.c
@@ -544,7 +544,8 @@ static void squashfs_readahead(struct readahead_control *ractl)
struct squashfs_page_actor *actor;
unsigned int nr_pages = 0;
struct page **pages;
- int i, file_end = i_size_read(inode) >> msblk->block_log;
+ int i;
+ loff_t file_end = i_size_read(inode) >> msblk->block_log;
unsigned int max_pages = 1UL << shift;
readahead_expand(ractl, start, (len | mask) + 1);
diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c
index f1ccad519e28..763a3f7a75f6 100644
--- a/fs/squashfs/file_direct.c
+++ b/fs/squashfs/file_direct.c
@@ -26,10 +26,10 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize,
struct inode *inode = target_page->mapping->host;
struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
- int file_end = (i_size_read(inode) - 1) >> PAGE_SHIFT;
+ loff_t file_end = (i_size_read(inode) - 1) >> PAGE_SHIFT;
int mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1;
- int start_index = target_page->index & ~mask;
- int end_index = start_index | mask;
+ loff_t start_index = target_page->index & ~mask;
+ loff_t end_index = start_index | mask;
int i, n, pages, bytes, res = -ENOMEM;
struct page **page;
struct squashfs_page_actor *actor;
diff --git a/fs/stat.c b/fs/stat.c
index f721d26ec3f7..77cdc69eb422 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -41,7 +41,7 @@
* the vfsmount must be passed through @idmap. This function will then
* take care to map the inode according to @idmap before filling in the
uid and gid fields. On non-idmapped mounts or if permission checking is to be
- * performed on the raw inode simply passs @nop_mnt_idmap.
+ * performed on the raw inode simply pass @nop_mnt_idmap.
*/
void generic_fillattr(struct mnt_idmap *idmap, u32 request_mask,
struct inode *inode, struct kstat *stat)
@@ -247,8 +247,13 @@ retry:
error = vfs_getattr(&path, stat, request_mask, flags);
- stat->mnt_id = real_mount(path.mnt)->mnt_id;
- stat->result_mask |= STATX_MNT_ID;
+ if (request_mask & STATX_MNT_ID_UNIQUE) {
+ stat->mnt_id = real_mount(path.mnt)->mnt_id_unique;
+ stat->result_mask |= STATX_MNT_ID_UNIQUE;
+ } else {
+ stat->mnt_id = real_mount(path.mnt)->mnt_id;
+ stat->result_mask |= STATX_MNT_ID;
+ }
if (path.mnt->mnt_root == path.dentry)
stat->attributes |= STATX_ATTR_MOUNT_ROOT;
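
A hedged userspace sketch of the STATX_MNT_ID_UNIQUE request handled
above. The constant only appears in uapi headers from Linux 6.8 on, so
a fallback definition is provided; the path is a placeholder.

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/stat.h>

    #ifndef STATX_MNT_ID_UNIQUE
    #define STATX_MNT_ID_UNIQUE 0x00004000U /* per uapi linux/stat.h */
    #endif

    int main(void)
    {
        struct statx stx;

        if (statx(AT_FDCWD, "/", 0, STATX_MNT_ID_UNIQUE, &stx))
            return 1;

        if (stx.stx_mask & STATX_MNT_ID_UNIQUE)
            printf("unique mnt_id: %llu\n",
                   (unsigned long long)stx.stx_mnt_id);
        else if (stx.stx_mask & STATX_MNT_ID)
            /* older kernel: it fell back to the legacy id */
            printf("legacy mnt_id: %llu\n",
                   (unsigned long long)stx.stx_mnt_id);
        return 0;
    }
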
diff --git a/fs/super.c b/fs/super.c
index 076392396e72..e35936000408 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -81,16 +81,13 @@ static inline void super_unlock_shared(struct super_block *sb)
super_unlock(sb, false);
}
-static inline bool wait_born(struct super_block *sb)
+static bool super_flags(const struct super_block *sb, unsigned int flags)
{
- unsigned int flags;
-
/*
* Pairs with smp_store_release() in super_wake() and ensures
- * that we see SB_BORN or SB_DYING after we're woken.
+ * that we see @flags after we're woken.
*/
- flags = smp_load_acquire(&sb->s_flags);
- return flags & (SB_BORN | SB_DYING);
+ return smp_load_acquire(&sb->s_flags) & flags;
}
/**
@@ -105,15 +102,21 @@ static inline bool wait_born(struct super_block *sb)
*
* The caller must have acquired a temporary reference on @sb->s_count.
*
- * Return: This returns true if SB_BORN was set, false if SB_DYING was
- * set. The function acquires s_umount and returns with it held.
+ * Return: The function returns true if SB_BORN was set, in which
+ * case s_umount is held on return. It returns false if SB_DYING
+ * was set, in which case s_umount is not held.
*/
static __must_check bool super_lock(struct super_block *sb, bool excl)
{
-
lockdep_assert_not_held(&sb->s_umount);
-relock:
+ /* wait until the superblock is ready or dying */
+ wait_var_event(&sb->s_flags, super_flags(sb, SB_BORN | SB_DYING));
+
+ /* Don't pointlessly acquire s_umount. */
+ if (super_flags(sb, SB_DYING))
+ return false;
+
__super_lock(sb, excl);
/*
@@ -121,32 +124,22 @@ relock:
* @sb->s_root is NULL and @sb->s_active is 0. No one needs to
* grab a reference to this. Tell them so.
*/
- if (sb->s_flags & SB_DYING)
+ if (sb->s_flags & SB_DYING) {
+ super_unlock(sb, excl);
return false;
+ }
- /* Has called ->get_tree() successfully. */
- if (sb->s_flags & SB_BORN)
- return true;
-
- super_unlock(sb, excl);
-
- /* wait until the superblock is ready or dying */
- wait_var_event(&sb->s_flags, wait_born(sb));
-
- /*
- * Neither SB_BORN nor SB_DYING are ever unset so we never loop.
- * Just reacquire @sb->s_umount for the caller.
- */
- goto relock;
+ WARN_ON_ONCE(!(sb->s_flags & SB_BORN));
+ return true;
}
-/* wait and acquire read-side of @sb->s_umount */
+/* wait and try to acquire read-side of @sb->s_umount */
static inline bool super_lock_shared(struct super_block *sb)
{
return super_lock(sb, false);
}
-/* wait and acquire write-side of @sb->s_umount */
+/* wait and try to acquire write-side of @sb->s_umount */
static inline bool super_lock_excl(struct super_block *sb)
{
return super_lock(sb, true);
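
A condensed sketch of the wake-side pairing that super_flags() and
super_lock() rely on, adapted from super_wake() (which is outside this
hunk); treat it as illustrative rather than the exact helper body.

    static void publish_flag(struct super_block *sb, unsigned int flag)
    {
        /* Pairs with smp_load_acquire() in super_flags() above. */
        smp_store_release(&sb->s_flags, sb->s_flags | flag);
        smp_mb();   /* order the store before the waitqueue check */
        wake_up_var(&sb->s_flags);
    }
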
@@ -323,7 +316,7 @@ static void destroy_unused_super(struct super_block *s)
static struct super_block *alloc_super(struct file_system_type *type, int flags,
struct user_namespace *user_ns)
{
- struct super_block *s = kzalloc(sizeof(struct super_block), GFP_USER);
+ struct super_block *s = kzalloc(sizeof(struct super_block), GFP_KERNEL);
static const struct super_operations default_op;
int i;
@@ -521,48 +514,7 @@ void deactivate_super(struct super_block *s)
EXPORT_SYMBOL(deactivate_super);
/**
- * grab_super - acquire an active reference
- * @s: reference we are trying to make active
- *
- * Tries to acquire an active reference. grab_super() is used when we
- * had just found a superblock in super_blocks or fs_type->fs_supers
- * and want to turn it into a full-blown active reference. grab_super()
- * is called with sb_lock held and drops it. Returns 1 in case of
- * success, 0 if we had failed (superblock contents was already dead or
- * dying when grab_super() had been called). Note that this is only
- * called for superblocks not in rundown mode (== ones still on ->fs_supers
- * of their type), so increment of ->s_count is OK here.
- */
-static int grab_super(struct super_block *s) __releases(sb_lock)
-{
- bool born;
-
- s->s_count++;
- spin_unlock(&sb_lock);
- born = super_lock_excl(s);
- if (born && atomic_inc_not_zero(&s->s_active)) {
- put_super(s);
- return 1;
- }
- super_unlock_excl(s);
- put_super(s);
- return 0;
-}
-
-static inline bool wait_dead(struct super_block *sb)
-{
- unsigned int flags;
-
- /*
- * Pairs with memory barrier in super_wake() and ensures
- * that we see SB_DEAD after we're woken.
- */
- flags = smp_load_acquire(&sb->s_flags);
- return flags & SB_DEAD;
-}
-
-/**
- * grab_super_dead - acquire an active reference to a superblock
+ * grab_super - acquire an active reference to a superblock
* @sb: superblock to acquire
*
* Acquire a temporary reference on a superblock and try to trade it for
@@ -573,17 +525,21 @@ static inline bool wait_dead(struct super_block *sb)
* Return: This returns true if an active reference could be acquired,
* false if not.
*/
-static bool grab_super_dead(struct super_block *sb)
+static bool grab_super(struct super_block *sb)
{
+ bool locked;
sb->s_count++;
- if (grab_super(sb)) {
- put_super(sb);
- lockdep_assert_held(&sb->s_umount);
- return true;
+ spin_unlock(&sb_lock);
+ locked = super_lock_excl(sb);
+ if (locked) {
+ if (atomic_inc_not_zero(&sb->s_active)) {
+ put_super(sb);
+ return true;
+ }
+ super_unlock_excl(sb);
}
- wait_var_event(&sb->s_flags, wait_dead(sb));
- lockdep_assert_not_held(&sb->s_umount);
+ wait_var_event(&sb->s_flags, super_flags(sb, SB_DEAD));
put_super(sb);
return false;
}
@@ -834,7 +790,7 @@ share_extant_sb:
warnfc(fc, "reusing existing filesystem in another namespace not allowed");
return ERR_PTR(-EBUSY);
}
- if (!grab_super_dead(old))
+ if (!grab_super(old))
goto retry;
destroy_unused_super(s);
return old;
@@ -878,7 +834,7 @@ retry:
destroy_unused_super(s);
return ERR_PTR(-EBUSY);
}
- if (!grab_super_dead(old))
+ if (!grab_super(old))
goto retry;
destroy_unused_super(s);
return old;
@@ -930,8 +886,7 @@ static void __iterate_supers(void (*f)(struct super_block *))
spin_lock(&sb_lock);
list_for_each_entry(sb, &super_blocks, s_list) {
- /* Pairs with memory marrier in super_wake(). */
- if (smp_load_acquire(&sb->s_flags) & SB_DYING)
+ if (super_flags(sb, SB_DYING))
continue;
sb->s_count++;
spin_unlock(&sb_lock);
@@ -961,15 +916,17 @@ void iterate_supers(void (*f)(struct super_block *, void *), void *arg)
spin_lock(&sb_lock);
list_for_each_entry(sb, &super_blocks, s_list) {
- bool born;
+ bool locked;
sb->s_count++;
spin_unlock(&sb_lock);
- born = super_lock_shared(sb);
- if (born && sb->s_root)
- f(sb, arg);
- super_unlock_shared(sb);
+ locked = super_lock_shared(sb);
+ if (locked) {
+ if (sb->s_root)
+ f(sb, arg);
+ super_unlock_shared(sb);
+ }
spin_lock(&sb_lock);
if (p)
@@ -997,15 +954,17 @@ void iterate_supers_type(struct file_system_type *type,
spin_lock(&sb_lock);
hlist_for_each_entry(sb, &type->fs_supers, s_instances) {
- bool born;
+ bool locked;
sb->s_count++;
spin_unlock(&sb_lock);
- born = super_lock_shared(sb);
- if (born && sb->s_root)
- f(sb, arg);
- super_unlock_shared(sb);
+ locked = super_lock_shared(sb);
+ if (locked) {
+ if (sb->s_root)
+ f(sb, arg);
+ super_unlock_shared(sb);
+ }
spin_lock(&sb_lock);
if (p)
@@ -1019,34 +978,6 @@ void iterate_supers_type(struct file_system_type *type,
EXPORT_SYMBOL(iterate_supers_type);
-/**
- * get_active_super - get an active reference to the superblock of a device
- * @bdev: device to get the superblock for
- *
- * Scans the superblock list and finds the superblock of the file system
- * mounted on the device given. Returns the superblock with an active
- * reference or %NULL if none was found.
- */
-struct super_block *get_active_super(struct block_device *bdev)
-{
- struct super_block *sb;
-
- if (!bdev)
- return NULL;
-
- spin_lock(&sb_lock);
- list_for_each_entry(sb, &super_blocks, s_list) {
- if (sb->s_bdev == bdev) {
- if (!grab_super(sb))
- return NULL;
- super_unlock_excl(sb);
- return sb;
- }
- }
- spin_unlock(&sb_lock);
- return NULL;
-}
-
struct super_block *user_get_super(dev_t dev, bool excl)
{
struct super_block *sb;
@@ -1054,15 +985,17 @@ struct super_block *user_get_super(dev_t dev, bool excl)
spin_lock(&sb_lock);
list_for_each_entry(sb, &super_blocks, s_list) {
if (sb->s_dev == dev) {
- bool born;
+ bool locked;
sb->s_count++;
spin_unlock(&sb_lock);
/* still alive? */
- born = super_lock(sb, excl);
- if (born && sb->s_root)
- return sb;
- super_unlock(sb, excl);
+ locked = super_lock(sb, excl);
+ if (locked) {
+ if (sb->s_root)
+ return sb;
+ super_unlock(sb, excl);
+ }
/* nope, got unmounted */
spin_lock(&sb_lock);
__put_super(sb);
@@ -1173,9 +1106,9 @@ cancel_readonly:
static void do_emergency_remount_callback(struct super_block *sb)
{
- bool born = super_lock_excl(sb);
+ bool locked = super_lock_excl(sb);
- if (born && sb->s_root && sb->s_bdev && !sb_rdonly(sb)) {
+ if (locked && sb->s_root && sb->s_bdev && !sb_rdonly(sb)) {
struct fs_context *fc;
fc = fs_context_for_reconfigure(sb->s_root,
@@ -1186,7 +1119,8 @@ static void do_emergency_remount_callback(struct super_block *sb)
put_fs_context(fc);
}
}
- super_unlock_excl(sb);
+ if (locked)
+ super_unlock_excl(sb);
}
static void do_emergency_remount(struct work_struct *work)
@@ -1209,16 +1143,17 @@ void emergency_remount(void)
static void do_thaw_all_callback(struct super_block *sb)
{
- bool born = super_lock_excl(sb);
+ bool locked = super_lock_excl(sb);
- if (born && sb->s_root) {
+ if (locked && sb->s_root) {
if (IS_ENABLED(CONFIG_BLOCK))
- while (sb->s_bdev && !thaw_bdev(sb->s_bdev))
+ while (sb->s_bdev && !bdev_thaw(sb->s_bdev))
pr_warn("Emergency Thaw on %pg\n", sb->s_bdev);
thaw_super_locked(sb, FREEZE_HOLDER_USERSPACE);
- } else {
- super_unlock_excl(sb);
+ return;
}
+ if (locked)
+ super_unlock_excl(sb);
}
static void do_thaw_all(struct work_struct *work)
@@ -1428,11 +1363,11 @@ EXPORT_SYMBOL(sget_dev);
*
* The function must be called with bdev->bd_holder_lock and releases it.
*/
-static struct super_block *bdev_super_lock_shared(struct block_device *bdev)
+static struct super_block *bdev_super_lock(struct block_device *bdev, bool excl)
__releases(&bdev->bd_holder_lock)
{
struct super_block *sb = bdev->bd_holder;
- bool born;
+ bool locked;
lockdep_assert_held(&bdev->bd_holder_lock);
lockdep_assert_not_held(&sb->s_umount);
@@ -1442,19 +1377,25 @@ static struct super_block *bdev_super_lock_shared(struct block_device *bdev)
spin_lock(&sb_lock);
sb->s_count++;
spin_unlock(&sb_lock);
+
mutex_unlock(&bdev->bd_holder_lock);
- born = super_lock_shared(sb);
- if (!born || !sb->s_root || !(sb->s_flags & SB_ACTIVE)) {
- super_unlock_shared(sb);
- put_super(sb);
- return NULL;
- }
+ locked = super_lock(sb, excl);
+
/*
- * The superblock is active and we hold s_umount, we can drop our
- * temporary reference now.
- */
+ * If the superblock wasn't already SB_DYING then we hold
+ * s_umount and can safely drop our temporary reference.
+ */
put_super(sb);
+
+ if (!locked)
+ return NULL;
+
+ if (!sb->s_root || !(sb->s_flags & SB_ACTIVE)) {
+ super_unlock(sb, excl);
+ return NULL;
+ }
+
return sb;
}
@@ -1462,7 +1403,7 @@ static void fs_bdev_mark_dead(struct block_device *bdev, bool surprise)
{
struct super_block *sb;
- sb = bdev_super_lock_shared(bdev);
+ sb = bdev_super_lock(bdev, false);
if (!sb)
return;
@@ -1480,16 +1421,110 @@ static void fs_bdev_sync(struct block_device *bdev)
{
struct super_block *sb;
- sb = bdev_super_lock_shared(bdev);
+ sb = bdev_super_lock(bdev, false);
if (!sb)
return;
+
sync_filesystem(sb);
super_unlock_shared(sb);
}
+static struct super_block *get_bdev_super(struct block_device *bdev)
+{
+ bool active = false;
+ struct super_block *sb;
+
+ sb = bdev_super_lock(bdev, true);
+ if (sb) {
+ active = atomic_inc_not_zero(&sb->s_active);
+ super_unlock_excl(sb);
+ }
+ if (!active)
+ return NULL;
+ return sb;
+}
+
+/**
+ * fs_bdev_freeze - freeze owning filesystem of block device
+ * @bdev: block device
+ *
+ * Freeze the filesystem that owns this block device if it is still
+ * active.
+ *
+ * A filesystem that owns multiple block devices may be frozen from each
+ * block device and won't be unfrozen until all block devices are
+ * unfrozen. Each block device can only freeze the filesystem once as we
+ * nest freezes for block devices in the block layer.
+ *
+ * Return: If the freeze was successful zero is returned. If the freeze
+ * failed a negative error code is returned.
+ */
+static int fs_bdev_freeze(struct block_device *bdev)
+{
+ struct super_block *sb;
+ int error = 0;
+
+ lockdep_assert_held(&bdev->bd_fsfreeze_mutex);
+
+ sb = get_bdev_super(bdev);
+ if (!sb)
+ return -EINVAL;
+
+ if (sb->s_op->freeze_super)
+ error = sb->s_op->freeze_super(sb,
+ FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE);
+ else
+ error = freeze_super(sb,
+ FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE);
+ if (!error)
+ error = sync_blockdev(bdev);
+ deactivate_super(sb);
+ return error;
+}
+
+/**
+ * fs_bdev_thaw - thaw owning filesystem of block device
+ * @bdev: block device
+ *
+ * Thaw the filesystem that owns this block device.
+ *
+ * A filesystem that owns multiple block devices may be frozen from each
+ * block device and won't be unfrozen until all block devices are
+ * unfrozen. Each block device can only freeze the filesystem once as we
+ * nest freezes for block devices in the block layer.
+ *
+ * Return: If the thaw was successful zero is returned. If the thaw
+ * failed a negative error code is returned. If this function
+ * returns zero it doesn't mean that the filesystem is unfrozen,
+ * as it may have been frozen multiple times (the kernel may hold
+ * a freeze, or it may be frozen from other block devices).
+ */
+static int fs_bdev_thaw(struct block_device *bdev)
+{
+ struct super_block *sb;
+ int error;
+
+ lockdep_assert_held(&bdev->bd_fsfreeze_mutex);
+
+ sb = get_bdev_super(bdev);
+ if (WARN_ON_ONCE(!sb))
+ return -EINVAL;
+
+ if (sb->s_op->thaw_super)
+ error = sb->s_op->thaw_super(sb,
+ FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE);
+ else
+ error = thaw_super(sb,
+ FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE);
+ deactivate_super(sb);
+ return error;
+}
+
const struct blk_holder_ops fs_holder_ops = {
.mark_dead = fs_bdev_mark_dead,
.sync = fs_bdev_sync,
+ .freeze = fs_bdev_freeze,
+ .thaw = fs_bdev_thaw,
};
EXPORT_SYMBOL_GPL(fs_holder_ops);
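
A hedged userspace sketch of the FIFREEZE/FITHAW ioctl pair, which
lands in freeze_super()/thaw_super() as FREEZE_HOLDER_USERSPACE and is
the non-nesting counterpart of the fs_bdev_freeze()/fs_bdev_thaw()
holder ops above. Root is required and "/mnt/test" is a placeholder.

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/fs.h>   /* FIFREEZE, FITHAW */

    int main(void)
    {
        int fd = open("/mnt/test", O_RDONLY);

        if (fd < 0)
            return 1;

        /* A second userspace freeze (no FREEZE_MAY_NEST) gets -EBUSY. */
        if (ioctl(fd, FIFREEZE, 0))
            perror("FIFREEZE");
        /* ... consistent on-disk image here (snapshot window) ... */
        if (ioctl(fd, FITHAW, 0))
            perror("FITHAW");
        return 0;
    }
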
@@ -1519,15 +1554,10 @@ int setup_bdev_super(struct super_block *sb, int sb_flags,
}
/*
- * Until SB_BORN flag is set, there can be no active superblock
- * references and thus no filesystem freezing. get_active_super() will
- * just loop waiting for SB_BORN so even freeze_bdev() cannot proceed.
- *
- * It is enough to check bdev was not frozen before we set s_bdev.
+ * It is enough to check bdev was not frozen before we set
+ * s_bdev as freezing will wait until SB_BORN is set.
*/
- mutex_lock(&bdev->bd_fsfreeze_mutex);
- if (bdev->bd_fsfreeze_count > 0) {
- mutex_unlock(&bdev->bd_fsfreeze_mutex);
+ if (atomic_read(&bdev->bd_fsfreeze_count) > 0) {
if (fc)
warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev);
bdev_release(bdev_handle);
@@ -1540,7 +1570,6 @@ int setup_bdev_super(struct super_block *sb, int sb_flags,
if (bdev_stable_writes(bdev))
sb->s_iflags |= SB_I_STABLE_WRITES;
spin_unlock(&sb_lock);
- mutex_unlock(&bdev->bd_fsfreeze_mutex);
snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev);
shrinker_debugfs_rename(sb->s_shrink, "sb-%s:%s", sb->s_type->name,
@@ -1585,15 +1614,7 @@ int get_tree_bdev(struct fs_context *fc,
return -EBUSY;
}
} else {
- /*
- * We drop s_umount here because we need to open the bdev and
- * bdev->open_mutex ranks above s_umount (blkdev_put() ->
- * bdev_mark_dead()). It is safe because we have active sb
- * reference and SB_BORN is not set yet.
- */
- super_unlock_excl(s);
error = setup_bdev_super(s, fc->sb_flags, fc);
- __super_lock_excl(s);
if (!error)
error = fill_super(s, fc);
if (error) {
@@ -1637,15 +1658,7 @@ struct dentry *mount_bdev(struct file_system_type *fs_type,
return ERR_PTR(-EBUSY);
}
} else {
- /*
- * We drop s_umount here because we need to open the bdev and
- * bdev->open_mutex ranks above s_umount (blkdev_put() ->
- * bdev_mark_dead()). It is safe because we have active sb
- * reference and SB_BORN is not set yet.
- */
- super_unlock_excl(s);
error = setup_bdev_super(s, flags, NULL);
- __super_lock_excl(s);
if (!error)
error = fill_super(s, data, flags & SB_SILENT ? 1 : 0);
if (error) {
@@ -1914,6 +1927,47 @@ static int wait_for_partially_frozen(struct super_block *sb)
return ret;
}
+#define FREEZE_HOLDERS (FREEZE_HOLDER_KERNEL | FREEZE_HOLDER_USERSPACE)
+#define FREEZE_FLAGS (FREEZE_HOLDERS | FREEZE_MAY_NEST)
+
+static inline int freeze_inc(struct super_block *sb, enum freeze_holder who)
+{
+ WARN_ON_ONCE((who & ~FREEZE_FLAGS));
+ WARN_ON_ONCE(hweight32(who & FREEZE_HOLDERS) > 1);
+
+ if (who & FREEZE_HOLDER_KERNEL)
+ ++sb->s_writers.freeze_kcount;
+ if (who & FREEZE_HOLDER_USERSPACE)
+ ++sb->s_writers.freeze_ucount;
+ return sb->s_writers.freeze_kcount + sb->s_writers.freeze_ucount;
+}
+
+static inline int freeze_dec(struct super_block *sb, enum freeze_holder who)
+{
+ WARN_ON_ONCE((who & ~FREEZE_FLAGS));
+ WARN_ON_ONCE(hweight32(who & FREEZE_HOLDERS) > 1);
+
+ if ((who & FREEZE_HOLDER_KERNEL) && sb->s_writers.freeze_kcount)
+ --sb->s_writers.freeze_kcount;
+ if ((who & FREEZE_HOLDER_USERSPACE) && sb->s_writers.freeze_ucount)
+ --sb->s_writers.freeze_ucount;
+ return sb->s_writers.freeze_kcount + sb->s_writers.freeze_ucount;
+}
+
+static inline bool may_freeze(struct super_block *sb, enum freeze_holder who)
+{
+ WARN_ON_ONCE((who & ~FREEZE_FLAGS));
+ WARN_ON_ONCE(hweight32(who & FREEZE_HOLDERS) > 1);
+
+ if (who & FREEZE_HOLDER_KERNEL)
+ return (who & FREEZE_MAY_NEST) ||
+ sb->s_writers.freeze_kcount == 0;
+ if (who & FREEZE_HOLDER_USERSPACE)
+ return (who & FREEZE_MAY_NEST) ||
+ sb->s_writers.freeze_ucount == 0;
+ return false;
+}
+
/**
* freeze_super - lock the filesystem and force it into a consistent state
* @sb: the super to lock
@@ -1926,6 +1980,7 @@ static int wait_for_partially_frozen(struct super_block *sb)
* @who should be:
* * %FREEZE_HOLDER_USERSPACE if userspace wants to freeze the fs;
* * %FREEZE_HOLDER_KERNEL if the kernel wants to freeze the fs.
+ * * %FREEZE_MAY_NEST whether nesting freeze and thaw requests is allowed.
*
* The @who argument distinguishes between the kernel and userspace trying to
* freeze the filesystem. Although there cannot be multiple kernel freezes or
@@ -1933,6 +1988,13 @@ static int wait_for_partially_frozen(struct super_block *sb)
* userspace can both hold a filesystem frozen. The filesystem remains frozen
* until there are no kernel or userspace freezes in effect.
*
+ * A filesystem may hold multiple devices and thus a filesystem may be
+ * frozen through the block layer via multiple block devices. In this
+ * case the request is marked as being allowed to nest by passing
+ * FREEZE_MAY_NEST. The filesystem remains frozen until all block
+ * devices are unfrozen. If multiple freezes are attempted without
+ * FREEZE_MAY_NEST -EBUSY will be returned.
+ *
* During this function, sb->s_writers.frozen goes through these values:
*
* SB_UNFROZEN: File system is normal, all writes progress as usual.
@@ -1957,31 +2019,29 @@ static int wait_for_partially_frozen(struct super_block *sb)
* mostly auxiliary for filesystems to verify they do not modify frozen fs.
*
* sb->s_writers.frozen is protected by sb->s_umount.
+ *
+ * Return: If the freeze was successful zero is returned. If the freeze
+ * failed a negative error code is returned.
*/
int freeze_super(struct super_block *sb, enum freeze_holder who)
{
int ret;
+ if (!super_lock_excl(sb)) {
+ WARN_ON_ONCE("Dying superblock while freezing!");
+ return -EINVAL;
+ }
atomic_inc(&sb->s_active);
- if (!super_lock_excl(sb))
- WARN(1, "Dying superblock while freezing!");
retry:
if (sb->s_writers.frozen == SB_FREEZE_COMPLETE) {
- if (sb->s_writers.freeze_holders & who) {
- deactivate_locked_super(sb);
- return -EBUSY;
- }
-
- WARN_ON(sb->s_writers.freeze_holders == 0);
-
- /*
- * Someone else already holds this type of freeze; share the
- * freeze and assign the active ref to the freeze.
- */
- sb->s_writers.freeze_holders |= who;
- super_unlock_excl(sb);
- return 0;
+ if (may_freeze(sb, who))
+ ret = !!WARN_ON_ONCE(freeze_inc(sb, who) == 1);
+ else
+ ret = -EBUSY;
+ /* All freezers share a single active reference. */
+ deactivate_locked_super(sb);
+ return ret;
}
if (sb->s_writers.frozen != SB_UNFROZEN) {
@@ -1994,14 +2054,9 @@ retry:
goto retry;
}
- if (!(sb->s_flags & SB_BORN)) {
- super_unlock_excl(sb);
- return 0; /* sic - it's "nothing to do" */
- }
-
if (sb_rdonly(sb)) {
/* Nothing to do really... */
- sb->s_writers.freeze_holders |= who;
+ WARN_ON_ONCE(freeze_inc(sb, who) > 1);
sb->s_writers.frozen = SB_FREEZE_COMPLETE;
wake_up_var(&sb->s_writers.frozen);
super_unlock_excl(sb);
@@ -2012,8 +2067,7 @@ retry:
/* Release s_umount to preserve sb_start_write -> s_umount ordering */
super_unlock_excl(sb);
sb_wait_write(sb, SB_FREEZE_WRITE);
- if (!super_lock_excl(sb))
- WARN(1, "Dying superblock while freezing!");
+ __super_lock_excl(sb);
/* Now we go and block page faults... */
sb->s_writers.frozen = SB_FREEZE_PAGEFAULT;
@@ -2049,7 +2103,7 @@ retry:
* For debugging purposes so that fs can warn if it sees write activity
* when frozen is set to SB_FREEZE_COMPLETE, and for thaw_super().
*/
- sb->s_writers.freeze_holders |= who;
+ WARN_ON_ONCE(freeze_inc(sb, who) > 1);
sb->s_writers.frozen = SB_FREEZE_COMPLETE;
wake_up_var(&sb->s_writers.frozen);
lockdep_sb_freeze_release(sb);
@@ -2066,34 +2120,22 @@ EXPORT_SYMBOL(freeze_super);
*/
static int thaw_super_locked(struct super_block *sb, enum freeze_holder who)
{
- int error;
+ int error = -EINVAL;
- if (sb->s_writers.frozen == SB_FREEZE_COMPLETE) {
- if (!(sb->s_writers.freeze_holders & who)) {
- super_unlock_excl(sb);
- return -EINVAL;
- }
+ if (sb->s_writers.frozen != SB_FREEZE_COMPLETE)
+ goto out_unlock;
- /*
- * Freeze is shared with someone else. Release our hold and
- * drop the active ref that freeze_super assigned to the
- * freezer.
- */
- if (sb->s_writers.freeze_holders & ~who) {
- sb->s_writers.freeze_holders &= ~who;
- deactivate_locked_super(sb);
- return 0;
- }
- } else {
- super_unlock_excl(sb);
- return -EINVAL;
- }
+ /*
+ * All freezers share a single active reference.
+ * So just unlock in case there are any left.
+ */
+ if (freeze_dec(sb, who))
+ goto out_unlock;
if (sb_rdonly(sb)) {
- sb->s_writers.freeze_holders &= ~who;
sb->s_writers.frozen = SB_UNFROZEN;
wake_up_var(&sb->s_writers.frozen);
- goto out;
+ goto out_deactivate;
}
lockdep_sb_freeze_acquire(sb);
@@ -2101,20 +2143,23 @@ static int thaw_super_locked(struct super_block *sb, enum freeze_holder who)
if (sb->s_op->unfreeze_fs) {
error = sb->s_op->unfreeze_fs(sb);
if (error) {
- printk(KERN_ERR "VFS:Filesystem thaw failed\n");
+ pr_err("VFS: Filesystem thaw failed\n");
+ freeze_inc(sb, who);
lockdep_sb_freeze_release(sb);
- super_unlock_excl(sb);
- return error;
+ goto out_unlock;
}
}
- sb->s_writers.freeze_holders &= ~who;
sb->s_writers.frozen = SB_UNFROZEN;
wake_up_var(&sb->s_writers.frozen);
sb_freeze_unlock(sb, SB_FREEZE_FS);
-out:
+out_deactivate:
deactivate_locked_super(sb);
return 0;
+
+out_unlock:
+ super_unlock_excl(sb);
+ return error;
}
/**
@@ -2128,11 +2173,18 @@ out:
* @who should be:
* * %FREEZE_HOLDER_USERSPACE if userspace wants to thaw the fs;
* * %FREEZE_HOLDER_KERNEL if the kernel wants to thaw the fs.
+ * * %FREEZE_MAY_NEST whether nesting freeze and thaw requests is allowed
+ *
+ * A filesystem may hold multiple devices and thus a filesystem may
+ * have been frozen through the block layer via multiple block devices.
+ * The filesystem remains frozen until all block devices are unfrozen.
*/
int thaw_super(struct super_block *sb, enum freeze_holder who)
{
- if (!super_lock_excl(sb))
- WARN(1, "Dying superblock while thawing!");
+ if (!super_lock_excl(sb)) {
+ WARN_ON_ONCE("Dying superblock while thawing!");
+ return -EINVAL;
+ }
return thaw_super_locked(sb, who);
}
EXPORT_SYMBOL(thaw_super);
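
A hedged kernel-side sketch of the nestable freeze semantics documented
above; "sb" is assumed to be a superblock the caller holds active, and
quiesce_fs() is an illustrative name, not code from this series.

    static int quiesce_fs(struct super_block *sb)
    {
        int error;

        error = freeze_super(sb, FREEZE_HOLDER_KERNEL | FREEZE_MAY_NEST);
        if (error)
            return error;

        /* Writers are blocked; work against a stable on-disk image. */

        /* The fs only unfreezes once every holder has dropped its count. */
        return thaw_super(sb, FREEZE_HOLDER_KERNEL | FREEZE_MAY_NEST);
    }
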
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index 725981474e5f..410ab2a44d2f 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -8,6 +8,7 @@
#include <linux/buffer_head.h>
#include <linux/mount.h>
+#include <linux/mpage.h>
#include <linux/string.h>
#include "sysv.h"
@@ -456,9 +457,10 @@ int sysv_getattr(struct mnt_idmap *idmap, const struct path *path,
return 0;
}
-static int sysv_writepage(struct page *page, struct writeback_control *wbc)
+static int sysv_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
{
- return block_write_full_page(page,get_block,wbc);
+ return mpage_writepages(mapping, wbc, get_block);
}
static int sysv_read_folio(struct file *file, struct folio *folio)
@@ -503,8 +505,9 @@ const struct address_space_operations sysv_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
.read_folio = sysv_read_folio,
- .writepage = sysv_writepage,
+ .writepages = sysv_writepages,
.write_begin = sysv_write_begin,
.write_end = generic_write_end,
+ .migrate_folio = buffer_migrate_folio,
.bmap = sysv_bmap
};
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index ebce93b08281..a7bb2e63cdde 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -35,6 +35,7 @@
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/buffer_head.h>
+#include <linux/mpage.h>
#include <linux/writeback.h>
#include <linux/iversion.h>
@@ -390,7 +391,7 @@ out:
/**
* ufs_getfrag_block() - `get_block_t' function, interface between UFS and
- * read_folio, writepage and so on
+ * read_folio, writepages and so on
*/
static int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create)
@@ -467,9 +468,10 @@ done:
return 0;
}
-static int ufs_writepage(struct page *page, struct writeback_control *wbc)
+static int ufs_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
{
- return block_write_full_page(page,ufs_getfrag_block,wbc);
+ return mpage_writepages(mapping, wbc, ufs_getfrag_block);
}
static int ufs_read_folio(struct file *file, struct folio *folio)
@@ -528,9 +530,10 @@ const struct address_space_operations ufs_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
.read_folio = ufs_read_folio,
- .writepage = ufs_writepage,
+ .writepages = ufs_writepages,
.write_begin = ufs_write_begin,
.write_end = ufs_write_end,
+ .migrate_folio = buffer_migrate_folio,
.bmap = ufs_bmap
};
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index e8af40b05549..6e2a4d6a0d8f 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -2005,6 +2005,75 @@ static inline unsigned int uffd_ctx_features(__u64 user_features)
return (unsigned int)user_features | UFFD_FEATURE_INITIALIZED;
}
+static int userfaultfd_move(struct userfaultfd_ctx *ctx,
+ unsigned long arg)
+{
+ __s64 ret;
+ struct uffdio_move uffdio_move;
+ struct uffdio_move __user *user_uffdio_move;
+ struct userfaultfd_wake_range range;
+ struct mm_struct *mm = ctx->mm;
+
+ user_uffdio_move = (struct uffdio_move __user *) arg;
+
+ if (atomic_read(&ctx->mmap_changing))
+ return -EAGAIN;
+
+ if (copy_from_user(&uffdio_move, user_uffdio_move,
+ /* don't copy the last field "move" */
+ sizeof(uffdio_move)-sizeof(__s64)))
+ return -EFAULT;
+
+ /* Do not allow cross-mm moves. */
+ if (mm != current->mm)
+ return -EINVAL;
+
+ ret = validate_range(mm, uffdio_move.dst, uffdio_move.len);
+ if (ret)
+ return ret;
+
+ ret = validate_range(mm, uffdio_move.src, uffdio_move.len);
+ if (ret)
+ return ret;
+
+ if (uffdio_move.mode & ~(UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES|
+ UFFDIO_MOVE_MODE_DONTWAKE))
+ return -EINVAL;
+
+ if (mmget_not_zero(mm)) {
+ mmap_read_lock(mm);
+
+ /* Re-check after taking mmap_lock */
+ if (likely(!atomic_read(&ctx->mmap_changing)))
+ ret = move_pages(ctx, mm, uffdio_move.dst, uffdio_move.src,
+ uffdio_move.len, uffdio_move.mode);
+ else
+ ret = -EINVAL;
+
+ mmap_read_unlock(mm);
+ mmput(mm);
+ } else {
+ return -ESRCH;
+ }
+
+ if (unlikely(put_user(ret, &user_uffdio_move->move)))
+ return -EFAULT;
+ if (ret < 0)
+ goto out;
+
+ /* len == 0 would wake all */
+ VM_WARN_ON(!ret);
+ range.len = ret;
+ if (!(uffdio_move.mode & UFFDIO_MOVE_MODE_DONTWAKE)) {
+ range.start = uffdio_move.dst;
+ wake_userfault(ctx, &range);
+ }
+ ret = range.len == uffdio_move.len ? 0 : -EAGAIN;
+
+out:
+ return ret;
+}
+
/*
* userland asks for a certain API version and we return which bits
* and ioctl commands are implemented in this kernel for such API
@@ -2097,6 +2166,9 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd,
case UFFDIO_ZEROPAGE:
ret = userfaultfd_zeropage(ctx, arg);
break;
+ case UFFDIO_MOVE:
+ ret = userfaultfd_move(ctx, arg);
+ break;
case UFFDIO_WRITEPROTECT:
ret = userfaultfd_writeprotect(ctx, arg);
break;
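
A hedged userspace sketch of the new UFFDIO_MOVE ioctl wired up above.
It assumes Linux 6.8+ uapi headers and a userfaultfd that has already
registered both ranges; move_range() is an illustrative name and all
setup and error handling is elided.

    #include <sys/ioctl.h>
    #include <linux/userfaultfd.h>

    static long move_range(int uffd, unsigned long dst, unsigned long src,
                           unsigned long len)
    {
        struct uffdio_move mv = {
            .dst  = dst,    /* must lie in a registered VMA */
            .src  = src,    /* same mm only; cross-mm is -EINVAL */
            .len  = len,    /* page-aligned */
            .mode = 0,      /* or UFFDIO_MOVE_MODE_DONTWAKE */
        };

        if (ioctl(uffd, UFFDIO_MOVE, &mv))
            return -1;
        return mv.move;     /* bytes moved on success */
    }
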
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 7762c01a85cf..fbe3cdc79036 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -145,6 +145,7 @@ ifeq ($(CONFIG_XFS_ONLINE_SCRUB),y)
xfs-y += $(addprefix scrub/, \
trace.o \
+ agb_bitmap.o \
agheader.o \
alloc.o \
attr.o \
@@ -175,14 +176,32 @@ xfs-$(CONFIG_XFS_RT) += $(addprefix scrub/, \
rtsummary.o \
)
-xfs-$(CONFIG_XFS_QUOTA) += scrub/quota.o
+xfs-$(CONFIG_XFS_QUOTA) += $(addprefix scrub/, \
+ dqiterate.o \
+ quota.o \
+ )
# online repair
ifeq ($(CONFIG_XFS_ONLINE_REPAIR),y)
xfs-y += $(addprefix scrub/, \
agheader_repair.o \
+ alloc_repair.o \
+ bmap_repair.o \
+ cow_repair.o \
+ ialloc_repair.o \
+ inode_repair.o \
+ newbt.o \
reap.o \
+ refcount_repair.o \
repair.o \
)
+
+xfs-$(CONFIG_XFS_RT) += $(addprefix scrub/, \
+ rtbitmap_repair.o \
+ )
+
+xfs-$(CONFIG_XFS_QUOTA) += $(addprefix scrub/, \
+ quota_repair.o \
+ )
endif
endif
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index f9f4d694640d..39d9525270b7 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -332,6 +332,31 @@ xfs_agino_range(
return __xfs_agino_range(mp, xfs_ag_block_count(mp, agno), first, last);
}
+/*
+ * Free the perags within the specified AG range; this is only used to free
+ * unused perags on the error handling path.
+ */
+void
+xfs_free_unused_perag_range(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agstart,
+ xfs_agnumber_t agend)
+{
+ struct xfs_perag *pag;
+ xfs_agnumber_t index;
+
+ for (index = agstart; index < agend; index++) {
+ spin_lock(&mp->m_perag_lock);
+ pag = radix_tree_delete(&mp->m_perag_tree, index);
+ spin_unlock(&mp->m_perag_lock);
+ if (!pag)
+ break;
+ xfs_buf_hash_destroy(pag);
+ xfs_defer_drain_free(&pag->pag_intents_drain);
+ kmem_free(pag);
+ }
+}
+
int
xfs_initialize_perag(
struct xfs_mount *mp,
@@ -424,19 +449,14 @@ xfs_initialize_perag(
out_remove_pag:
xfs_defer_drain_free(&pag->pag_intents_drain);
+ spin_lock(&mp->m_perag_lock);
radix_tree_delete(&mp->m_perag_tree, index);
+ spin_unlock(&mp->m_perag_lock);
out_free_pag:
kmem_free(pag);
out_unwind_new_pags:
/* unwind any prior newly initialized pags */
- for (index = first_initialised; index < agcount; index++) {
- pag = radix_tree_delete(&mp->m_perag_tree, index);
- if (!pag)
- break;
- xfs_buf_hash_destroy(pag);
- xfs_defer_drain_free(&pag->pag_intents_drain);
- kmem_free(pag);
- }
+ xfs_free_unused_perag_range(mp, first_initialised, agcount);
return error;
}
@@ -984,7 +1004,7 @@ xfs_ag_shrink_space(
if (err2 != -ENOSPC)
goto resv_err;
- err2 = __xfs_free_extent_later(*tpp, args.fsbno, delta, NULL,
+ err2 = xfs_free_extent_later(*tpp, args.fsbno, delta, NULL,
XFS_AG_RESV_NONE, true);
if (err2)
goto resv_err;
diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
index 2e0aef87d633..4b343c4fac28 100644
--- a/fs/xfs/libxfs/xfs_ag.h
+++ b/fs/xfs/libxfs/xfs_ag.h
@@ -80,6 +80,16 @@ struct xfs_perag {
*/
uint16_t pag_checked;
uint16_t pag_sick;
+
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+ /*
+ * Alternate btree heights so that online repair won't trip the write
+ * verifiers while rebuilding the AG btrees.
+ */
+ uint8_t pagf_repair_levels[XFS_BTNUM_AGF];
+ uint8_t pagf_repair_refcount_level;
+#endif
+
spinlock_t pag_state_lock;
spinlock_t pagb_lock; /* lock for pagb_tree */
@@ -133,6 +143,8 @@ __XFS_AG_OPSTATE(prefers_metadata, PREFERS_METADATA)
__XFS_AG_OPSTATE(allows_inodes, ALLOWS_INODES)
__XFS_AG_OPSTATE(agfl_needs_reset, AGFL_NEEDS_RESET)
+void xfs_free_unused_perag_range(struct xfs_mount *mp, xfs_agnumber_t agstart,
+ xfs_agnumber_t agend);
int xfs_initialize_perag(struct xfs_mount *mp, xfs_agnumber_t agcount,
xfs_rfsblock_t dcount, xfs_agnumber_t *maxagi);
int xfs_initialize_perag_data(struct xfs_mount *mp, xfs_agnumber_t agno);
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
index 7fd1fea95552..da1057bd0e60 100644
--- a/fs/xfs/libxfs/xfs_ag_resv.c
+++ b/fs/xfs/libxfs/xfs_ag_resv.c
@@ -411,6 +411,8 @@ xfs_ag_resv_free_extent(
fallthrough;
case XFS_AG_RESV_NONE:
xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (int64_t)len);
+ fallthrough;
+ case XFS_AG_RESV_IGNORE:
return;
}
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 100ab5931b31..3bd0a33fee0a 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -246,11 +246,9 @@ xfs_alloc_btrec_to_irec(
/* Simple checks for free space records. */
xfs_failaddr_t
xfs_alloc_check_irec(
- struct xfs_btree_cur *cur,
- const struct xfs_alloc_rec_incore *irec)
+ struct xfs_perag *pag,
+ const struct xfs_alloc_rec_incore *irec)
{
- struct xfs_perag *pag = cur->bc_ag.pag;
-
if (irec->ar_blockcount == 0)
return __this_address;
@@ -299,7 +297,7 @@ xfs_alloc_get_rec(
return error;
xfs_alloc_btrec_to_irec(rec, &irec);
- fa = xfs_alloc_check_irec(cur, &irec);
+ fa = xfs_alloc_check_irec(cur->bc_ag.pag, &irec);
if (fa)
return xfs_alloc_complain_bad_rec(cur, fa, &irec);
@@ -2514,7 +2512,7 @@ xfs_defer_agfl_block(
trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1);
xfs_extent_free_get_group(mp, xefi);
- xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &xefi->xefi_list);
+ xfs_defer_add(tp, &xefi->xefi_list, &xfs_agfl_free_defer_type);
return 0;
}
@@ -2522,14 +2520,15 @@ xfs_defer_agfl_block(
* Add the extent to the list of extents to be free at transaction end.
* The list is maintained sorted (by block number).
*/
-int
-__xfs_free_extent_later(
+static int
+xfs_defer_extent_free(
struct xfs_trans *tp,
xfs_fsblock_t bno,
xfs_filblks_t len,
const struct xfs_owner_info *oinfo,
enum xfs_ag_resv_type type,
- bool skip_discard)
+ bool skip_discard,
+ struct xfs_defer_pending **dfpp)
{
struct xfs_extent_free_item *xefi;
struct xfs_mount *mp = tp->t_mountp;
@@ -2577,10 +2576,105 @@ __xfs_free_extent_later(
XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len);
xfs_extent_free_get_group(mp, xefi);
- xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &xefi->xefi_list);
+ *dfpp = xfs_defer_add(tp, &xefi->xefi_list, &xfs_extent_free_defer_type);
return 0;
}
+int
+xfs_free_extent_later(
+ struct xfs_trans *tp,
+ xfs_fsblock_t bno,
+ xfs_filblks_t len,
+ const struct xfs_owner_info *oinfo,
+ enum xfs_ag_resv_type type,
+ bool skip_discard)
+{
+ struct xfs_defer_pending *dontcare = NULL;
+
+ return xfs_defer_extent_free(tp, bno, len, oinfo, type, skip_discard,
+ &dontcare);
+}
+
+/*
+ * Set up automatic freeing of unwritten space in the filesystem.
+ *
+ * This function attaches a paused deferred extent free item to the
+ * transaction. Pausing means that the EFI will be logged in the next
+ * transaction commit, but the pending EFI will not be finished until the
+ * pending item is unpaused.
+ *
+ * If the system goes down after the EFI has been persisted to the log but
+ * before the pending item is unpaused, log recovery will find the EFI, fail to
+ * find the EFD, and free the space.
+ *
+ * If the pending item is unpaused, the next transaction commit will log an EFD
+ * without freeing the space.
+ *
+ * Caller must ensure that the tp, fsbno, len, oinfo, and resv fields of the
+ * @args structure are set to the relevant values.
+ */
+int
+xfs_alloc_schedule_autoreap(
+ const struct xfs_alloc_arg *args,
+ bool skip_discard,
+ struct xfs_alloc_autoreap *aarp)
+{
+ int error;
+
+ error = xfs_defer_extent_free(args->tp, args->fsbno, args->len,
+ &args->oinfo, args->resv, skip_discard, &aarp->dfp);
+ if (error)
+ return error;
+
+ xfs_defer_item_pause(args->tp, aarp->dfp);
+ return 0;
+}
+
+/*
+ * Cancel automatic freeing of unwritten space in the filesystem.
+ *
+ * Earlier, we created a paused deferred extent free item and attached it to
+ * this transaction so that we could automatically roll back a new space
+ * allocation if the system went down. Now we want to cancel the paused work
+ * item by marking the EFI stale so we don't actually free the space, unpausing
+ * the pending item and logging an EFD.
+ *
+ * The caller generally should have already mapped the space into the ondisk
+ * filesystem. If the reserved space was partially used, the caller must call
+ * xfs_free_extent_later to create a new EFI to free the unused space.
+ */
+void
+xfs_alloc_cancel_autoreap(
+ struct xfs_trans *tp,
+ struct xfs_alloc_autoreap *aarp)
+{
+ struct xfs_defer_pending *dfp = aarp->dfp;
+ struct xfs_extent_free_item *xefi;
+
+ if (!dfp)
+ return;
+
+ list_for_each_entry(xefi, &dfp->dfp_work, xefi_list)
+ xefi->xefi_flags |= XFS_EFI_CANCELLED;
+
+ xfs_defer_item_unpause(tp, dfp);
+}
+
+/*
+ * Commit automatic freeing of unwritten space in the filesystem.
+ *
+ * This unpauses an earlier _schedule_autoreap and commits to freeing the
+ * allocated space. Call this if none of the reserved space was used.
+ */
+void
+xfs_alloc_commit_autoreap(
+ struct xfs_trans *tp,
+ struct xfs_alloc_autoreap *aarp)
+{
+ if (aarp->dfp)
+ xfs_defer_item_unpause(tp, aarp->dfp);
+}
+
#ifdef DEBUG
/*
* Check if an AGF has a free extent record whose length is equal to
@@ -3848,7 +3942,7 @@ xfs_alloc_query_range_helper(
xfs_failaddr_t fa;
xfs_alloc_btrec_to_irec(rec, &irec);
- fa = xfs_alloc_check_irec(cur, &irec);
+ fa = xfs_alloc_check_irec(cur->bc_ag.pag, &irec);
if (fa)
return xfs_alloc_complain_bad_rec(cur, fa, &irec);
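
A hedged sketch of the autoreap lifecycle the comments above describe;
not code from this series, and write_new_btree_blocks() is a
hypothetical stand-in for whatever consumes the allocation.

    static int use_new_space(struct xfs_trans *tp, struct xfs_alloc_arg *args)
    {
        struct xfs_alloc_autoreap aarp = { };
        int error;

        /* args->tp/fsbno/len/oinfo/resv were set by the allocation. */
        error = xfs_alloc_schedule_autoreap(args, false, &aarp);
        if (error)
            return error;

        error = write_new_btree_blocks(tp, args);   /* hypothetical */
        if (error) {
            /* Space unused: unpause the EFI and let it free the extent. */
            xfs_alloc_commit_autoreap(tp, &aarp);
            return error;
        }

        /* Space now referenced on disk: stale the EFI, log an EFD. */
        xfs_alloc_cancel_autoreap(tp, &aarp);
        return 0;
    }
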
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index 6bb8d295c321..0b956f8b9d5a 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -185,7 +185,7 @@ xfs_alloc_get_rec(
union xfs_btree_rec;
void xfs_alloc_btrec_to_irec(const union xfs_btree_rec *rec,
struct xfs_alloc_rec_incore *irec);
-xfs_failaddr_t xfs_alloc_check_irec(struct xfs_btree_cur *cur,
+xfs_failaddr_t xfs_alloc_check_irec(struct xfs_perag *pag,
const struct xfs_alloc_rec_incore *irec);
int xfs_read_agf(struct xfs_perag *pag, struct xfs_trans *tp, int flags,
@@ -231,7 +231,7 @@ xfs_buf_to_agfl_bno(
return bp->b_addr;
}
-int __xfs_free_extent_later(struct xfs_trans *tp, xfs_fsblock_t bno,
+int xfs_free_extent_later(struct xfs_trans *tp, xfs_fsblock_t bno,
xfs_filblks_t len, const struct xfs_owner_info *oinfo,
enum xfs_ag_resv_type type, bool skip_discard);
@@ -255,18 +255,18 @@ void xfs_extent_free_get_group(struct xfs_mount *mp,
#define XFS_EFI_SKIP_DISCARD (1U << 0) /* don't issue discard */
#define XFS_EFI_ATTR_FORK (1U << 1) /* freeing attr fork block */
#define XFS_EFI_BMBT_BLOCK (1U << 2) /* freeing bmap btree block */
+#define XFS_EFI_CANCELLED (1U << 3) /* don't actually free the space */
-static inline int
-xfs_free_extent_later(
- struct xfs_trans *tp,
- xfs_fsblock_t bno,
- xfs_filblks_t len,
- const struct xfs_owner_info *oinfo,
- enum xfs_ag_resv_type type)
-{
- return __xfs_free_extent_later(tp, bno, len, oinfo, type, false);
-}
+struct xfs_alloc_autoreap {
+ struct xfs_defer_pending *dfp;
+};
+int xfs_alloc_schedule_autoreap(const struct xfs_alloc_arg *args,
+ bool skip_discard, struct xfs_alloc_autoreap *aarp);
+void xfs_alloc_cancel_autoreap(struct xfs_trans *tp,
+ struct xfs_alloc_autoreap *aarp);
+void xfs_alloc_commit_autoreap(struct xfs_trans *tp,
+ struct xfs_alloc_autoreap *aarp);
extern struct kmem_cache *xfs_extfree_item_cache;
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index c65228efed4a..a7032bf0cd37 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -323,7 +323,18 @@ xfs_allocbt_verify(
if (bp->b_ops->magic[0] == cpu_to_be32(XFS_ABTC_MAGIC))
btnum = XFS_BTNUM_CNTi;
if (pag && xfs_perag_initialised_agf(pag)) {
- if (level >= pag->pagf_levels[btnum])
+ unsigned int maxlevel = pag->pagf_levels[btnum];
+
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+ /*
+ * Online repair could be rewriting the free space btrees, so
+ * we'll validate against the larger of either tree while this
+ * is going on.
+ */
+ maxlevel = max_t(unsigned int, maxlevel,
+ pag->pagf_repair_levels[btnum]);
+#endif
+ if (level >= maxlevel)
return __this_address;
} else if (level >= mp->m_alloc_maxlevels)
return __this_address;
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index e28d93d232de..9976a00a73f9 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -862,8 +862,11 @@ xfs_attr_lookup(
if (!xfs_inode_hasattr(dp))
return -ENOATTR;
- if (dp->i_af.if_format == XFS_DINODE_FMT_LOCAL)
- return xfs_attr_sf_findname(args, NULL, NULL);
+ if (dp->i_af.if_format == XFS_DINODE_FMT_LOCAL) {
+ if (xfs_attr_sf_findname(args))
+ return -EEXIST;
+ return -ENOATTR;
+ }
if (xfs_attr_is_leaf(dp)) {
error = xfs_attr_leaf_hasname(args, &bp);
@@ -880,11 +883,10 @@ xfs_attr_lookup(
return error;
}
-static int
-xfs_attr_intent_init(
+static void
+xfs_attr_defer_add(
struct xfs_da_args *args,
- unsigned int op_flags, /* op flag (set or remove) */
- struct xfs_attr_intent **attr) /* new xfs_attr_intent */
+ unsigned int op_flags)
{
struct xfs_attr_intent *new;
@@ -893,66 +895,22 @@ xfs_attr_intent_init(
new->xattri_op_flags = op_flags;
new->xattri_da_args = args;
- *attr = new;
- return 0;
-}
-
-/* Sets an attribute for an inode as a deferred operation */
-static int
-xfs_attr_defer_add(
- struct xfs_da_args *args)
-{
- struct xfs_attr_intent *new;
- int error = 0;
-
- error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_SET, &new);
- if (error)
- return error;
+ switch (op_flags) {
+ case XFS_ATTRI_OP_FLAGS_SET:
+ new->xattri_dela_state = xfs_attr_init_add_state(args);
+ break;
+ case XFS_ATTRI_OP_FLAGS_REPLACE:
+ new->xattri_dela_state = xfs_attr_init_replace_state(args);
+ break;
+ case XFS_ATTRI_OP_FLAGS_REMOVE:
+ new->xattri_dela_state = xfs_attr_init_remove_state(args);
+ break;
+ default:
+ ASSERT(0);
+ }
- new->xattri_dela_state = xfs_attr_init_add_state(args);
- xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list);
+ xfs_defer_add(args->trans, &new->xattri_list, &xfs_attr_defer_type);
trace_xfs_attr_defer_add(new->xattri_dela_state, args->dp);
-
- return 0;
-}
-
-/* Sets an attribute for an inode as a deferred operation */
-static int
-xfs_attr_defer_replace(
- struct xfs_da_args *args)
-{
- struct xfs_attr_intent *new;
- int error = 0;
-
- error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_REPLACE, &new);
- if (error)
- return error;
-
- new->xattri_dela_state = xfs_attr_init_replace_state(args);
- xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list);
- trace_xfs_attr_defer_replace(new->xattri_dela_state, args->dp);
-
- return 0;
-}
-
-/* Removes an attribute for an inode as a deferred operation */
-static int
-xfs_attr_defer_remove(
- struct xfs_da_args *args)
-{
-
- struct xfs_attr_intent *new;
- int error;
-
- error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_REMOVE, &new);
- if (error)
- return error;
-
- new->xattri_dela_state = xfs_attr_init_remove_state(args);
- xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list);
- trace_xfs_attr_defer_remove(new->xattri_dela_state, args->dp);
-
- return 0;
}
/*
@@ -1038,16 +996,16 @@ xfs_attr_set(
error = xfs_attr_lookup(args);
switch (error) {
case -EEXIST:
- /* if no value, we are performing a remove operation */
if (!args->value) {
- error = xfs_attr_defer_remove(args);
+ /* if no value, we are performing a remove operation */
+ xfs_attr_defer_add(args, XFS_ATTRI_OP_FLAGS_REMOVE);
break;
}
+
/* Pure create fails if the attr already exists */
if (args->attr_flags & XATTR_CREATE)
goto out_trans_cancel;
-
- error = xfs_attr_defer_replace(args);
+ xfs_attr_defer_add(args, XFS_ATTRI_OP_FLAGS_REPLACE);
break;
case -ENOATTR:
/* Can't remove what isn't there. */
@@ -1057,14 +1015,11 @@ xfs_attr_set(
/* Pure replace fails if no existing attr to replace. */
if (args->attr_flags & XATTR_REPLACE)
goto out_trans_cancel;
-
- error = xfs_attr_defer_add(args);
+ xfs_attr_defer_add(args, XFS_ATTRI_OP_FLAGS_SET);
break;
default:
goto out_trans_cancel;
}
- if (error)
- goto out_trans_cancel;
/*
* If this is a synchronous mount, make sure that the
@@ -1097,10 +1052,9 @@ out_trans_cancel:
static inline int xfs_attr_sf_totsize(struct xfs_inode *dp)
{
- struct xfs_attr_shortform *sf;
+ struct xfs_attr_sf_hdr *sf = dp->i_af.if_data;
- sf = (struct xfs_attr_shortform *)dp->i_af.if_u1.if_data;
- return be16_to_cpu(sf->hdr.totsize);
+ return be16_to_cpu(sf->totsize);
}
/*
@@ -1112,19 +1066,13 @@ xfs_attr_shortform_addname(
struct xfs_da_args *args)
{
int newsize, forkoff;
- int error;
trace_xfs_attr_sf_addname(args);
- error = xfs_attr_shortform_lookup(args);
- switch (error) {
- case -ENOATTR:
- if (args->op_flags & XFS_DA_OP_REPLACE)
- return error;
- break;
- case -EEXIST:
- if (!(args->op_flags & XFS_DA_OP_REPLACE))
- return error;
+ if (xfs_attr_sf_findname(args)) {
+ int error;
+
+ ASSERT(args->op_flags & XFS_DA_OP_REPLACE);
error = xfs_attr_sf_removename(args);
if (error)
@@ -1137,11 +1085,8 @@ xfs_attr_shortform_addname(
* around.
*/
args->op_flags &= ~XFS_DA_OP_REPLACE;
- break;
- case 0:
- break;
- default:
- return error;
+ } else {
+ ASSERT(!(args->op_flags & XFS_DA_OP_REPLACE));
}
if (args->namelen >= XFS_ATTR_SF_ENTSIZE_MAX ||
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 2580ae47209a..6374bf107242 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -690,56 +690,32 @@ xfs_attr_shortform_create(
ASSERT(ifp->if_bytes == 0);
if (ifp->if_format == XFS_DINODE_FMT_EXTENTS)
ifp->if_format = XFS_DINODE_FMT_LOCAL;
- xfs_idata_realloc(dp, sizeof(*hdr), XFS_ATTR_FORK);
- hdr = (struct xfs_attr_sf_hdr *)ifp->if_u1.if_data;
+
+ hdr = xfs_idata_realloc(dp, sizeof(*hdr), XFS_ATTR_FORK);
memset(hdr, 0, sizeof(*hdr));
hdr->totsize = cpu_to_be16(sizeof(*hdr));
xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA);
}
/*
- * Return -EEXIST if attr is found, or -ENOATTR if not
- * args: args containing attribute name and namelen
- * sfep: If not null, pointer will be set to the last attr entry found on
- * -EEXIST. On -ENOATTR pointer is left at the last entry in the list
- * basep: If not null, pointer is set to the byte offset of the entry in the
- * list on -EEXIST. On -ENOATTR, pointer is left at the byte offset of
- * the last entry in the list
+ * Return the entry if the attr in args is found, or NULL if not.
*/
-int
+struct xfs_attr_sf_entry *
xfs_attr_sf_findname(
- struct xfs_da_args *args,
- struct xfs_attr_sf_entry **sfep,
- unsigned int *basep)
+ struct xfs_da_args *args)
{
- struct xfs_attr_shortform *sf;
- struct xfs_attr_sf_entry *sfe;
- unsigned int base = sizeof(struct xfs_attr_sf_hdr);
- int size = 0;
- int end;
- int i;
+ struct xfs_attr_sf_hdr *sf = args->dp->i_af.if_data;
+ struct xfs_attr_sf_entry *sfe;
- sf = (struct xfs_attr_shortform *)args->dp->i_af.if_u1.if_data;
- sfe = &sf->list[0];
- end = sf->hdr.count;
- for (i = 0; i < end; sfe = xfs_attr_sf_nextentry(sfe),
- base += size, i++) {
- size = xfs_attr_sf_entsize(sfe);
- if (!xfs_attr_match(args, sfe->namelen, sfe->nameval,
- sfe->flags))
- continue;
- break;
+ for (sfe = xfs_attr_sf_firstentry(sf);
+ sfe < xfs_attr_sf_endptr(sf);
+ sfe = xfs_attr_sf_nextentry(sfe)) {
+ if (xfs_attr_match(args, sfe->namelen, sfe->nameval,
+ sfe->flags))
+ return sfe;
}
- if (sfep != NULL)
- *sfep = sfe;
-
- if (basep != NULL)
- *basep = base;
-
- if (i == end)
- return -ENOATTR;
- return -EEXIST;
+ return NULL;
}
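The rewritten lookup needs nothing but the header's totsize to bound the walk. A self-contained userspace model of the same loop (the struct and helper names here are invented, and the kernel's xfs_attr_match() also checks the namespace flags, which this omits):

	#include <stddef.h>
	#include <stdint.h>
	#include <string.h>
	#include <arpa/inet.h>		/* ntohs: totsize is big-endian */

	struct sf_hdr {
		uint16_t	totsize;	/* bytes used, incl. header */
		uint8_t		count;		/* number of entries */
		uint8_t		padding;
	};

	struct sf_ent {
		uint8_t		namelen;
		uint8_t		valuelen;
		uint8_t		flags;
		uint8_t		nameval[];	/* name then value, no NULs */
	};

	static size_t ent_size(const struct sf_ent *e)
	{
		return sizeof(*e) + e->namelen + e->valuelen;
	}

	/* Walk [hdr + 1, hdr + totsize), as the new findname loop does. */
	static struct sf_ent *sf_find(struct sf_hdr *hdr,
				      const char *name, size_t namelen)
	{
		char *end = (char *)hdr + ntohs(hdr->totsize);
		struct sf_ent *e = (struct sf_ent *)(hdr + 1);

		while ((char *)e < end) {
			if (e->namelen == namelen &&
			    !memcmp(e->nameval, name, namelen))
				return e;
			e = (struct sf_ent *)((char *)e + ent_size(e));
		}
		return NULL;
	}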
/*
@@ -751,38 +727,31 @@ xfs_attr_shortform_add(
struct xfs_da_args *args,
int forkoff)
{
- struct xfs_attr_shortform *sf;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_ifork *ifp = &dp->i_af;
+ struct xfs_attr_sf_hdr *sf = ifp->if_data;
struct xfs_attr_sf_entry *sfe;
- int offset, size;
- struct xfs_mount *mp;
- struct xfs_inode *dp;
- struct xfs_ifork *ifp;
+ int size;
trace_xfs_attr_sf_add(args);
- dp = args->dp;
- mp = dp->i_mount;
dp->i_forkoff = forkoff;
- ifp = &dp->i_af;
ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
- sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data;
- if (xfs_attr_sf_findname(args, &sfe, NULL) == -EEXIST)
- ASSERT(0);
+ ASSERT(!xfs_attr_sf_findname(args));
- offset = (char *)sfe - (char *)sf;
size = xfs_attr_sf_entsize_byname(args->namelen, args->valuelen);
- xfs_idata_realloc(dp, size, XFS_ATTR_FORK);
- sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data;
- sfe = (struct xfs_attr_sf_entry *)((char *)sf + offset);
+ sf = xfs_idata_realloc(dp, size, XFS_ATTR_FORK);
+ sfe = xfs_attr_sf_endptr(sf);
sfe->namelen = args->namelen;
sfe->valuelen = args->valuelen;
sfe->flags = args->attr_filter;
memcpy(sfe->nameval, args->name, args->namelen);
memcpy(&sfe->nameval[args->namelen], args->value, args->valuelen);
- sf->hdr.count++;
- be16_add_cpu(&sf->hdr.totsize, size);
+ sf->count++;
+ be16_add_cpu(&sf->totsize, size);
xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA);
xfs_sbversion_add_attr2(mp, args->trans);
@@ -811,48 +780,43 @@ int
xfs_attr_sf_removename(
struct xfs_da_args *args)
{
- struct xfs_attr_shortform *sf;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_attr_sf_hdr *sf = dp->i_af.if_data;
struct xfs_attr_sf_entry *sfe;
- int size = 0, end, totsize;
- unsigned int base;
- struct xfs_mount *mp;
- struct xfs_inode *dp;
- int error;
+ uint16_t totsize = be16_to_cpu(sf->totsize);
+ void *next, *end;
+ int size = 0;
trace_xfs_attr_sf_remove(args);
- dp = args->dp;
- mp = dp->i_mount;
- sf = (struct xfs_attr_shortform *)dp->i_af.if_u1.if_data;
-
- error = xfs_attr_sf_findname(args, &sfe, &base);
-
- /*
- * If we are recovering an operation, finding nothing to
- * remove is not an error - it just means there was nothing
- * to clean up.
- */
- if (error == -ENOATTR && (args->op_flags & XFS_DA_OP_RECOVERY))
- return 0;
- if (error != -EEXIST)
- return error;
- size = xfs_attr_sf_entsize(sfe);
+ sfe = xfs_attr_sf_findname(args);
+ if (!sfe) {
+ /*
+ * If we are recovering an operation, finding nothing to remove
+ * is not an error, it just means there was nothing to clean up.
+ */
+ if (args->op_flags & XFS_DA_OP_RECOVERY)
+ return 0;
+ return -ENOATTR;
+ }
/*
* Fix up the attribute fork data, covering the hole
*/
- end = base + size;
- totsize = be16_to_cpu(sf->hdr.totsize);
- if (end != totsize)
- memmove(&((char *)sf)[base], &((char *)sf)[end], totsize - end);
- sf->hdr.count--;
- be16_add_cpu(&sf->hdr.totsize, -size);
+ size = xfs_attr_sf_entsize(sfe);
+ next = xfs_attr_sf_nextentry(sfe);
+ end = xfs_attr_sf_endptr(sf);
+ if (next < end)
+ memmove(sfe, next, end - next);
+ sf->count--;
+ totsize -= size;
+ sf->totsize = cpu_to_be16(totsize);
/*
* Fix up the start offset of the attribute fork
*/
- totsize -= size;
- if (totsize == sizeof(xfs_attr_sf_hdr_t) && xfs_has_attr2(mp) &&
+ if (totsize == sizeof(struct xfs_attr_sf_hdr) && xfs_has_attr2(mp) &&
(dp->i_df.if_format != XFS_DINODE_FMT_BTREE) &&
!(args->op_flags & (XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE))) {
xfs_attr_fork_remove(dp, args->trans);
@@ -860,7 +824,7 @@ xfs_attr_sf_removename(
xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
dp->i_forkoff = xfs_attr_shortform_bytesfit(dp, totsize);
ASSERT(dp->i_forkoff);
- ASSERT(totsize > sizeof(xfs_attr_sf_hdr_t) ||
+ ASSERT(totsize > sizeof(struct xfs_attr_sf_hdr) ||
(args->op_flags & XFS_DA_OP_ADDNAME) ||
!xfs_has_attr2(mp) ||
dp->i_df.if_format == XFS_DINODE_FMT_BTREE);
@@ -874,33 +838,6 @@ xfs_attr_sf_removename(
}
/*
- * Look up a name in a shortform attribute list structure.
- */
-/*ARGSUSED*/
-int
-xfs_attr_shortform_lookup(xfs_da_args_t *args)
-{
- struct xfs_attr_shortform *sf;
- struct xfs_attr_sf_entry *sfe;
- int i;
- struct xfs_ifork *ifp;
-
- trace_xfs_attr_sf_lookup(args);
-
- ifp = &args->dp->i_af;
- ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
- sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data;
- sfe = &sf->list[0];
- for (i = 0; i < sf->hdr.count;
- sfe = xfs_attr_sf_nextentry(sfe), i++) {
- if (xfs_attr_match(args, sfe->namelen, sfe->nameval,
- sfe->flags))
- return -EEXIST;
- }
- return -ENOATTR;
-}
-
-/*
* Retrieve the attribute value and length.
*
* If args->valuelen is zero, only the length needs to be returned. Unlike a
@@ -909,23 +846,19 @@ xfs_attr_shortform_lookup(xfs_da_args_t *args)
*/
int
xfs_attr_shortform_getvalue(
- struct xfs_da_args *args)
+ struct xfs_da_args *args)
{
- struct xfs_attr_shortform *sf;
- struct xfs_attr_sf_entry *sfe;
- int i;
+ struct xfs_attr_sf_entry *sfe;
ASSERT(args->dp->i_af.if_format == XFS_DINODE_FMT_LOCAL);
- sf = (struct xfs_attr_shortform *)args->dp->i_af.if_u1.if_data;
- sfe = &sf->list[0];
- for (i = 0; i < sf->hdr.count;
- sfe = xfs_attr_sf_nextentry(sfe), i++) {
- if (xfs_attr_match(args, sfe->namelen, sfe->nameval,
- sfe->flags))
- return xfs_attr_copy_value(args,
- &sfe->nameval[args->namelen], sfe->valuelen);
- }
- return -ENOATTR;
+
+ trace_xfs_attr_sf_lookup(args);
+
+ sfe = xfs_attr_sf_findname(args);
+ if (!sfe)
+ return -ENOATTR;
+ return xfs_attr_copy_value(args, &sfe->nameval[args->namelen],
+ sfe->valuelen);
}
/* Convert from using the shortform to the leaf format. */
@@ -933,26 +866,23 @@ int
xfs_attr_shortform_to_leaf(
struct xfs_da_args *args)
{
- struct xfs_inode *dp;
- struct xfs_attr_shortform *sf;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_ifork *ifp = &dp->i_af;
+ struct xfs_attr_sf_hdr *sf = ifp->if_data;
struct xfs_attr_sf_entry *sfe;
+ int size = be16_to_cpu(sf->totsize);
struct xfs_da_args nargs;
char *tmpbuffer;
- int error, i, size;
+ int error, i;
xfs_dablk_t blkno;
struct xfs_buf *bp;
- struct xfs_ifork *ifp;
trace_xfs_attr_sf_to_leaf(args);
- dp = args->dp;
- ifp = &dp->i_af;
- sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data;
- size = be16_to_cpu(sf->hdr.totsize);
tmpbuffer = kmem_alloc(size, 0);
ASSERT(tmpbuffer != NULL);
- memcpy(tmpbuffer, ifp->if_u1.if_data, size);
- sf = (struct xfs_attr_shortform *)tmpbuffer;
+ memcpy(tmpbuffer, ifp->if_data, size);
+ sf = (struct xfs_attr_sf_hdr *)tmpbuffer;
xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
xfs_bmap_local_to_extents_empty(args->trans, dp, XFS_ATTR_FORK);
@@ -975,8 +905,8 @@ xfs_attr_shortform_to_leaf(
nargs.trans = args->trans;
nargs.op_flags = XFS_DA_OP_OKNOENT;
- sfe = &sf->list[0];
- for (i = 0; i < sf->hdr.count; i++) {
+ sfe = xfs_attr_sf_firstentry(sf);
+ for (i = 0; i < sf->count; i++) {
nargs.name = sfe->nameval;
nargs.namelen = sfe->namelen;
nargs.value = &sfe->nameval[nargs.namelen];
@@ -1040,23 +970,16 @@ xfs_attr_shortform_allfit(
return xfs_attr_shortform_bytesfit(dp, bytes);
}
-/* Verify the consistency of an inline attribute fork. */
+/* Verify the consistency of a raw inline attribute fork. */
xfs_failaddr_t
xfs_attr_shortform_verify(
- struct xfs_inode *ip)
+ struct xfs_attr_sf_hdr *sfp,
+ size_t size)
{
- struct xfs_attr_shortform *sfp;
- struct xfs_attr_sf_entry *sfep;
+ struct xfs_attr_sf_entry *sfep = xfs_attr_sf_firstentry(sfp);
struct xfs_attr_sf_entry *next_sfep;
char *endp;
- struct xfs_ifork *ifp;
int i;
- int64_t size;
-
- ASSERT(ip->i_af.if_format == XFS_DINODE_FMT_LOCAL);
- ifp = xfs_ifork_ptr(ip, XFS_ATTR_FORK);
- sfp = (struct xfs_attr_shortform *)ifp->if_u1.if_data;
- size = ifp->if_bytes;
/*
* Give up if the attribute is way too short.
@@ -1067,8 +990,7 @@ xfs_attr_shortform_verify(
endp = (char *)sfp + size;
/* Check all reported entries */
- sfep = &sfp->list[0];
- for (i = 0; i < sfp->hdr.count; i++) {
+ for (i = 0; i < sfp->count; i++) {
/*
* struct xfs_attr_sf_entry has a variable length.
* Check the fixed-offset parts of the structure are
@@ -1244,14 +1166,10 @@ xfs_attr3_leaf_to_node(
if (error)
goto out;
- /* copy leaf to new buffer, update identifiers */
- xfs_trans_buf_set_type(args->trans, bp2, XFS_BLFT_ATTR_LEAF_BUF);
- bp2->b_ops = bp1->b_ops;
- memcpy(bp2->b_addr, bp1->b_addr, args->geo->blksize);
- if (xfs_has_crc(mp)) {
- struct xfs_da3_blkinfo *hdr3 = bp2->b_addr;
- hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp2));
- }
+ /*
+ * Copy leaf to new buffer and log it.
+ */
+ xfs_da_buf_copy(bp2, bp1, args->geo->blksize);
xfs_trans_log_buf(args->trans, bp2, 0, args->geo->blksize - 1);
/*
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h
index 368f4d9fa1d5..9b9948639c0f 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.h
+++ b/fs/xfs/libxfs/xfs_attr_leaf.h
@@ -47,16 +47,14 @@ struct xfs_attr3_icleaf_hdr {
*/
void xfs_attr_shortform_create(struct xfs_da_args *args);
void xfs_attr_shortform_add(struct xfs_da_args *args, int forkoff);
-int xfs_attr_shortform_lookup(struct xfs_da_args *args);
int xfs_attr_shortform_getvalue(struct xfs_da_args *args);
int xfs_attr_shortform_to_leaf(struct xfs_da_args *args);
int xfs_attr_sf_removename(struct xfs_da_args *args);
-int xfs_attr_sf_findname(struct xfs_da_args *args,
- struct xfs_attr_sf_entry **sfep,
- unsigned int *basep);
+struct xfs_attr_sf_entry *xfs_attr_sf_findname(struct xfs_da_args *args);
int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp);
int xfs_attr_shortform_bytesfit(struct xfs_inode *dp, int bytes);
-xfs_failaddr_t xfs_attr_shortform_verify(struct xfs_inode *ip);
+xfs_failaddr_t xfs_attr_shortform_verify(struct xfs_attr_sf_hdr *sfp,
+ size_t size);
void xfs_attr_fork_remove(struct xfs_inode *ip, struct xfs_trans *tp);
/*
diff --git a/fs/xfs/libxfs/xfs_attr_sf.h b/fs/xfs/libxfs/xfs_attr_sf.h
index 37578b369d9b..bc4422223024 100644
--- a/fs/xfs/libxfs/xfs_attr_sf.h
+++ b/fs/xfs/libxfs/xfs_attr_sf.h
@@ -7,14 +7,6 @@
#define __XFS_ATTR_SF_H__
/*
- * Attribute storage when stored inside the inode.
- *
- * Small attribute lists are packed as tightly as possible so as
- * to fit into the literal area of the inode.
- */
-typedef struct xfs_attr_sf_hdr xfs_attr_sf_hdr_t;
-
-/*
* We generate this then sort it, attr_list() must return things in hash-order.
*/
typedef struct xfs_attr_sf_sort {
@@ -41,11 +33,25 @@ static inline int xfs_attr_sf_entsize(struct xfs_attr_sf_entry *sfep)
return struct_size(sfep, nameval, sfep->namelen + sfep->valuelen);
}
-/* next entry in struct */
+/* first entry in the SF attr fork */
+static inline struct xfs_attr_sf_entry *
+xfs_attr_sf_firstentry(struct xfs_attr_sf_hdr *hdr)
+{
+ return (struct xfs_attr_sf_entry *)(hdr + 1);
+}
+
+/* next entry after sfep */
static inline struct xfs_attr_sf_entry *
xfs_attr_sf_nextentry(struct xfs_attr_sf_entry *sfep)
{
return (void *)sfep + xfs_attr_sf_entsize(sfep);
}
+/* pointer to the space after the last entry, e.g. for adding a new one */
+static inline struct xfs_attr_sf_entry *
+xfs_attr_sf_endptr(struct xfs_attr_sf_hdr *sf)
+{
+ return (void *)sf + be16_to_cpu(sf->totsize);
+}
+
#endif /* __XFS_ATTR_SF_H__ */
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index be62acffad6c..98aaca933bdd 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -575,7 +575,7 @@ xfs_bmap_btree_to_extents(
xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork);
error = xfs_free_extent_later(cur->bc_tp, cbno, 1, &oinfo,
- XFS_AG_RESV_NONE);
+ XFS_AG_RESV_NONE, false);
if (error)
return error;
@@ -747,7 +747,7 @@ xfs_bmap_local_to_extents_empty(
ASSERT(ifp->if_nextents == 0);
xfs_bmap_forkoff_reset(ip, whichfork);
- ifp->if_u1.if_root = NULL;
+ ifp->if_data = NULL;
ifp->if_height = 0;
ifp->if_format = XFS_DINODE_FMT_EXTENTS;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
@@ -832,7 +832,7 @@ xfs_bmap_local_to_extents(
xfs_bmap_local_to_extents_empty(tp, ip, whichfork);
flags |= XFS_ILOG_CORE;
- ifp->if_u1.if_root = NULL;
+ ifp->if_data = NULL;
ifp->if_height = 0;
rec.br_startoff = 0;
@@ -3044,7 +3044,8 @@ xfs_bmap_extsize_align(
#define XFS_ALLOC_GAP_UNITS 4
-void
+/* returns true if ap->blkno was modified */
+bool
xfs_bmap_adjacent(
struct xfs_bmalloca *ap) /* bmap alloc argument struct */
{
@@ -3079,13 +3080,14 @@ xfs_bmap_adjacent(
if (adjust &&
ISVALID(ap->blkno + adjust, ap->prev.br_startblock))
ap->blkno += adjust;
+ return true;
}
/*
* If not at eof, then compare the two neighbor blocks.
* Figure out whether either one gives us a good starting point,
* and pick the better one.
*/
- else if (!ap->eof) {
+ if (!ap->eof) {
xfs_fsblock_t gotbno; /* right side block number */
xfs_fsblock_t gotdiff=0; /* right side difference */
xfs_fsblock_t prevbno; /* left side block number */
@@ -3165,14 +3167,21 @@ xfs_bmap_adjacent(
* If both valid, pick the better one, else the only good
* one, else ap->blkno is already set (to 0 or the inode block).
*/
- if (prevbno != NULLFSBLOCK && gotbno != NULLFSBLOCK)
+ if (prevbno != NULLFSBLOCK && gotbno != NULLFSBLOCK) {
ap->blkno = prevdiff <= gotdiff ? prevbno : gotbno;
- else if (prevbno != NULLFSBLOCK)
+ return true;
+ }
+ if (prevbno != NULLFSBLOCK) {
ap->blkno = prevbno;
- else if (gotbno != NULLFSBLOCK)
+ return true;
+ }
+ if (gotbno != NULLFSBLOCK) {
ap->blkno = gotbno;
+ return true;
+ }
}
#undef ISVALID
+ return false;
}
int
@@ -3263,11 +3272,14 @@ xfs_bmap_btalloc_select_lengths(
}
/* Update all inode and quota accounting for the allocation we just did. */
-static void
-xfs_bmap_btalloc_accounting(
- struct xfs_bmalloca *ap,
- struct xfs_alloc_arg *args)
+void
+xfs_bmap_alloc_account(
+ struct xfs_bmalloca *ap)
{
+ bool isrt = XFS_IS_REALTIME_INODE(ap->ip) &&
+ (ap->flags & XFS_BMAPI_ATTRFORK);
+ uint fld;
+
if (ap->flags & XFS_BMAPI_COWFORK) {
/*
* COW fork blocks are in-core only and thus are treated as
@@ -3279,7 +3291,7 @@ xfs_bmap_btalloc_accounting(
* yet.
*/
if (ap->wasdel) {
- xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)args->len);
+ xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)ap->length);
return;
}
@@ -3291,22 +3303,25 @@ xfs_bmap_btalloc_accounting(
* This essentially transfers the transaction quota reservation
* to that of a delalloc extent.
*/
- ap->ip->i_delayed_blks += args->len;
- xfs_trans_mod_dquot_byino(ap->tp, ap->ip, XFS_TRANS_DQ_RES_BLKS,
- -(long)args->len);
+ ap->ip->i_delayed_blks += ap->length;
+ xfs_trans_mod_dquot_byino(ap->tp, ap->ip, isrt ?
+ XFS_TRANS_DQ_RES_RTBLKS : XFS_TRANS_DQ_RES_BLKS,
+ -(long)ap->length);
return;
}
/* data/attr fork only */
- ap->ip->i_nblocks += args->len;
+ ap->ip->i_nblocks += ap->length;
xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
if (ap->wasdel) {
- ap->ip->i_delayed_blks -= args->len;
- xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)args->len);
+ ap->ip->i_delayed_blks -= ap->length;
+ xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)ap->length);
+ fld = isrt ? XFS_TRANS_DQ_DELRTBCOUNT : XFS_TRANS_DQ_DELBCOUNT;
+ } else {
+ fld = isrt ? XFS_TRANS_DQ_RTBCOUNT : XFS_TRANS_DQ_BCOUNT;
}
- xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
- ap->wasdel ? XFS_TRANS_DQ_DELBCOUNT : XFS_TRANS_DQ_BCOUNT,
- args->len);
+
+ xfs_trans_mod_dquot_byino(ap->tp, ap->ip, fld, ap->length);
}
static int
@@ -3380,7 +3395,7 @@ xfs_bmap_process_allocated_extent(
ap->offset = orig_offset;
else if (ap->offset + ap->length < orig_offset + orig_length)
ap->offset = orig_offset + orig_length - ap->length;
- xfs_bmap_btalloc_accounting(ap, args);
+ xfs_bmap_alloc_account(ap);
}
#ifdef DEBUG
@@ -5010,7 +5025,6 @@ xfs_bmap_del_extent_real(
xfs_fileoff_t del_endoff; /* first offset past del */
int do_fx; /* free extent at end of routine */
int error; /* error return value */
- int flags = 0;/* inode logging flags */
struct xfs_bmbt_irec got; /* current extent entry */
xfs_fileoff_t got_endoff; /* first offset past got */
int i; /* temp state */
@@ -5023,6 +5037,8 @@ xfs_bmap_del_extent_real(
uint32_t state = xfs_bmap_fork_to_state(whichfork);
struct xfs_bmbt_irec old;
+ *logflagsp = 0;
+
mp = ip->i_mount;
XFS_STATS_INC(mp, xs_del_exlist);
@@ -5035,7 +5051,6 @@ xfs_bmap_del_extent_real(
ASSERT(got_endoff >= del_endoff);
ASSERT(!isnullstartblock(got.br_startblock));
qfield = 0;
- error = 0;
/*
* If it's the case where the directory code is running with no block
@@ -5051,13 +5066,13 @@ xfs_bmap_del_extent_real(
del->br_startoff > got.br_startoff && del_endoff < got_endoff)
return -ENOSPC;
- flags = XFS_ILOG_CORE;
+ *logflagsp = XFS_ILOG_CORE;
if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) {
if (!(bflags & XFS_BMAPI_REMAP)) {
error = xfs_rtfree_blocks(tp, del->br_startblock,
del->br_blockcount);
if (error)
- goto done;
+ return error;
}
do_fx = 0;
@@ -5072,11 +5087,9 @@ xfs_bmap_del_extent_real(
if (cur) {
error = xfs_bmbt_lookup_eq(cur, &got, &i);
if (error)
- goto done;
- if (XFS_IS_CORRUPT(mp, i != 1)) {
- error = -EFSCORRUPTED;
- goto done;
- }
+ return error;
+ if (XFS_IS_CORRUPT(mp, i != 1))
+ return -EFSCORRUPTED;
}
if (got.br_startoff == del->br_startoff)
@@ -5093,17 +5106,15 @@ xfs_bmap_del_extent_real(
xfs_iext_prev(ifp, icur);
ifp->if_nextents--;
- flags |= XFS_ILOG_CORE;
+ *logflagsp |= XFS_ILOG_CORE;
if (!cur) {
- flags |= xfs_ilog_fext(whichfork);
+ *logflagsp |= xfs_ilog_fext(whichfork);
break;
}
if ((error = xfs_btree_delete(cur, &i)))
- goto done;
- if (XFS_IS_CORRUPT(mp, i != 1)) {
- error = -EFSCORRUPTED;
- goto done;
- }
+ return error;
+ if (XFS_IS_CORRUPT(mp, i != 1))
+ return -EFSCORRUPTED;
break;
case BMAP_LEFT_FILLING:
/*
@@ -5114,12 +5125,12 @@ xfs_bmap_del_extent_real(
got.br_blockcount -= del->br_blockcount;
xfs_iext_update_extent(ip, state, icur, &got);
if (!cur) {
- flags |= xfs_ilog_fext(whichfork);
+ *logflagsp |= xfs_ilog_fext(whichfork);
break;
}
error = xfs_bmbt_update(cur, &got);
if (error)
- goto done;
+ return error;
break;
case BMAP_RIGHT_FILLING:
/*
@@ -5128,12 +5139,12 @@ xfs_bmap_del_extent_real(
got.br_blockcount -= del->br_blockcount;
xfs_iext_update_extent(ip, state, icur, &got);
if (!cur) {
- flags |= xfs_ilog_fext(whichfork);
+ *logflagsp |= xfs_ilog_fext(whichfork);
break;
}
error = xfs_bmbt_update(cur, &got);
if (error)
- goto done;
+ return error;
break;
case 0:
/*
@@ -5150,18 +5161,18 @@ xfs_bmap_del_extent_real(
new.br_state = got.br_state;
new.br_startblock = del_endblock;
- flags |= XFS_ILOG_CORE;
+ *logflagsp |= XFS_ILOG_CORE;
if (cur) {
error = xfs_bmbt_update(cur, &got);
if (error)
- goto done;
+ return error;
error = xfs_btree_increment(cur, 0, &i);
if (error)
- goto done;
+ return error;
cur->bc_rec.b = new;
error = xfs_btree_insert(cur, &i);
if (error && error != -ENOSPC)
- goto done;
+ return error;
/*
* If get no-space back from btree insert, it tried a
* split, and we have a zero block reservation. Fix up
@@ -5174,33 +5185,28 @@ xfs_bmap_del_extent_real(
*/
error = xfs_bmbt_lookup_eq(cur, &got, &i);
if (error)
- goto done;
- if (XFS_IS_CORRUPT(mp, i != 1)) {
- error = -EFSCORRUPTED;
- goto done;
- }
+ return error;
+ if (XFS_IS_CORRUPT(mp, i != 1))
+ return -EFSCORRUPTED;
/*
* Update the btree record back
* to the original value.
*/
error = xfs_bmbt_update(cur, &old);
if (error)
- goto done;
+ return error;
/*
* Reset the extent record back
* to the original value.
*/
xfs_iext_update_extent(ip, state, icur, &old);
- flags = 0;
- error = -ENOSPC;
- goto done;
- }
- if (XFS_IS_CORRUPT(mp, i != 1)) {
- error = -EFSCORRUPTED;
- goto done;
+ *logflagsp = 0;
+ return -ENOSPC;
}
+ if (XFS_IS_CORRUPT(mp, i != 1))
+ return -EFSCORRUPTED;
} else
- flags |= xfs_ilog_fext(whichfork);
+ *logflagsp |= xfs_ilog_fext(whichfork);
ifp->if_nextents++;
xfs_iext_next(ifp, icur);
@@ -5218,13 +5224,13 @@ xfs_bmap_del_extent_real(
if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) {
xfs_refcount_decrease_extent(tp, del);
} else {
- error = __xfs_free_extent_later(tp, del->br_startblock,
+ error = xfs_free_extent_later(tp, del->br_startblock,
del->br_blockcount, NULL,
XFS_AG_RESV_NONE,
((bflags & XFS_BMAPI_NODISCARD) ||
del->br_state == XFS_EXT_UNWRITTEN));
if (error)
- goto done;
+ return error;
}
}
@@ -5239,9 +5245,7 @@ xfs_bmap_del_extent_real(
if (qfield && !(bflags & XFS_BMAPI_REMAP))
xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks);
-done:
- *logflagsp = flags;
- return error;
+ return 0;
}
/*
@@ -5250,7 +5254,7 @@ done:
* that value. If not all extents in the block range can be removed then
* *done is set.
*/
-int /* error */
+static int
__xfs_bunmapi(
struct xfs_trans *tp, /* transaction pointer */
struct xfs_inode *ip, /* incore inode */
@@ -6102,7 +6106,7 @@ __xfs_bmap_add(
bi->bi_bmap = *bmap;
xfs_bmap_update_get_group(tp->t_mountp, bi);
- xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_BMAP, &bi->bi_list);
+ xfs_defer_add(tp, &bi->bi_list, &xfs_bmap_update_defer_type);
return 0;
}
@@ -6179,19 +6183,18 @@ xfs_bmap_finish_one(
return error;
}
-/* Check that an inode's extent does not have invalid flags or bad ranges. */
+/* Check that an extent does not have invalid flags or bad ranges. */
xfs_failaddr_t
-xfs_bmap_validate_extent(
- struct xfs_inode *ip,
+xfs_bmap_validate_extent_raw(
+ struct xfs_mount *mp,
+ bool rtfile,
int whichfork,
struct xfs_bmbt_irec *irec)
{
- struct xfs_mount *mp = ip->i_mount;
-
if (!xfs_verify_fileext(mp, irec->br_startoff, irec->br_blockcount))
return __this_address;
- if (XFS_IS_REALTIME_INODE(ip) && whichfork == XFS_DATA_FORK) {
+ if (rtfile && whichfork == XFS_DATA_FORK) {
if (!xfs_verify_rtbext(mp, irec->br_startblock,
irec->br_blockcount))
return __this_address;
@@ -6221,3 +6224,53 @@ xfs_bmap_intent_destroy_cache(void)
kmem_cache_destroy(xfs_bmap_intent_cache);
xfs_bmap_intent_cache = NULL;
}
+
+/* Check that an inode's extent does not have invalid flags or bad ranges. */
+xfs_failaddr_t
+xfs_bmap_validate_extent(
+ struct xfs_inode *ip,
+ int whichfork,
+ struct xfs_bmbt_irec *irec)
+{
+ return xfs_bmap_validate_extent_raw(ip->i_mount,
+ XFS_IS_REALTIME_INODE(ip), whichfork, irec);
+}
+
+/*
+ * Used in xfs_itruncate_extents(). This is the maximum number of extents
+ * freed from a file in a single transaction.
+ */
+#define XFS_ITRUNC_MAX_EXTENTS 2
+
+/*
+ * Unmap every extent in part of an inode's fork. We don't do any higher level
+ * invalidation work at all.
+ */
+int
+xfs_bunmapi_range(
+ struct xfs_trans **tpp,
+ struct xfs_inode *ip,
+ uint32_t flags,
+ xfs_fileoff_t startoff,
+ xfs_fileoff_t endoff)
+{
+ xfs_filblks_t unmap_len = endoff - startoff + 1;
+ int error = 0;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+ while (unmap_len > 0) {
+ ASSERT((*tpp)->t_highest_agno == NULLAGNUMBER);
+ error = __xfs_bunmapi(*tpp, ip, startoff, &unmap_len, flags,
+ XFS_ITRUNC_MAX_EXTENTS);
+ if (error)
+ goto out;
+
+ /* free the just unmapped extents */
+ error = xfs_defer_finish(tpp);
+ if (error)
+ goto out;
+ }
+out:
+ return error;
+}
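A minimal usage sketch, assuming a caller that wants every mapping in the data fork gone and that XFS_MAX_FILEOFF is available as the customary "end of fork" bound:

	/*
	 * Sketch only: unmap the whole data fork. xfs_bunmapi_range() rolls
	 * the transaction through xfs_defer_finish(), so on return *tpp may
	 * refer to a different transaction than the one passed in.
	 */
	error = xfs_bunmapi_range(&tp, ip, 0, 0, XFS_MAX_FILEOFF);
	if (error)
		goto out_cancel;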
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index e33470e39728..f6b73f1bad5f 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -116,6 +116,8 @@ static inline int xfs_bmapi_whichfork(uint32_t bmapi_flags)
return XFS_DATA_FORK;
}
+void xfs_bmap_alloc_account(struct xfs_bmalloca *ap);
+
/*
* Special values for xfs_bmbt_irec_t br_startblock field.
*/
@@ -190,9 +192,6 @@ int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno,
int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags,
xfs_extlen_t total, struct xfs_bmbt_irec *mval, int *nmap);
-int __xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
- xfs_fileoff_t bno, xfs_filblks_t *rlen, uint32_t flags,
- xfs_extnum_t nexts);
int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags,
xfs_extnum_t nexts, int *done);
@@ -263,6 +262,8 @@ static inline uint32_t xfs_bmap_fork_to_state(int whichfork)
}
}
+xfs_failaddr_t xfs_bmap_validate_extent_raw(struct xfs_mount *mp, bool rtfile,
+ int whichfork, struct xfs_bmbt_irec *irec);
xfs_failaddr_t xfs_bmap_validate_extent(struct xfs_inode *ip, int whichfork,
struct xfs_bmbt_irec *irec);
int xfs_bmap_complain_bad_rec(struct xfs_inode *ip, int whichfork,
@@ -271,6 +272,8 @@ int xfs_bmap_complain_bad_rec(struct xfs_inode *ip, int whichfork,
int xfs_bmapi_remap(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_fileoff_t bno, xfs_filblks_t len, xfs_fsblock_t startblock,
uint32_t flags);
+int xfs_bunmapi_range(struct xfs_trans **tpp, struct xfs_inode *ip,
+ uint32_t flags, xfs_fileoff_t startoff, xfs_fileoff_t endoff);
extern struct kmem_cache *xfs_bmap_intent_cache;
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index bf3f1b36fdd2..71f2d50f7823 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -15,6 +15,7 @@
#include "xfs_trans.h"
#include "xfs_alloc.h"
#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
#include "xfs_bmap_btree.h"
#include "xfs_bmap.h"
#include "xfs_error.h"
@@ -272,7 +273,7 @@ xfs_bmbt_free_block(
xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_ino.whichfork);
error = xfs_free_extent_later(cur->bc_tp, fsbno, 1, &oinfo,
- XFS_AG_RESV_NONE);
+ XFS_AG_RESV_NONE, false);
if (error)
return error;
@@ -288,10 +289,7 @@ xfs_bmbt_get_minrecs(
int level)
{
if (level == cur->bc_nlevels - 1) {
- struct xfs_ifork *ifp;
-
- ifp = xfs_ifork_ptr(cur->bc_ino.ip,
- cur->bc_ino.whichfork);
+ struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur);
return xfs_bmbt_maxrecs(cur->bc_mp,
ifp->if_broot_bytes, level == 0) / 2;
@@ -306,10 +304,7 @@ xfs_bmbt_get_maxrecs(
int level)
{
if (level == cur->bc_nlevels - 1) {
- struct xfs_ifork *ifp;
-
- ifp = xfs_ifork_ptr(cur->bc_ino.ip,
- cur->bc_ino.whichfork);
+ struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur);
return xfs_bmbt_maxrecs(cur->bc_mp,
ifp->if_broot_bytes, level == 0);
@@ -543,23 +538,19 @@ static const struct xfs_btree_ops xfs_bmbt_ops = {
.keys_contiguous = xfs_bmbt_keys_contiguous,
};
-/*
- * Allocate a new bmap btree cursor.
- */
-struct xfs_btree_cur * /* new bmap btree cursor */
-xfs_bmbt_init_cursor(
- struct xfs_mount *mp, /* file system mount point */
- struct xfs_trans *tp, /* transaction pointer */
- struct xfs_inode *ip, /* inode owning the btree */
- int whichfork) /* data or attr fork */
+static struct xfs_btree_cur *
+xfs_bmbt_init_common(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ int whichfork)
{
- struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_btree_cur *cur;
+
ASSERT(whichfork != XFS_COW_FORK);
cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_BMAP,
mp->m_bm_maxlevels[whichfork], xfs_bmbt_cur_cache);
- cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1;
cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_bmbt_2);
cur->bc_ops = &xfs_bmbt_ops;
@@ -567,10 +558,30 @@ xfs_bmbt_init_cursor(
if (xfs_has_crc(mp))
cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
- cur->bc_ino.forksize = xfs_inode_fork_size(ip, whichfork);
cur->bc_ino.ip = ip;
cur->bc_ino.allocated = 0;
cur->bc_ino.flags = 0;
+
+ return cur;
+}
+
+/*
+ * Allocate a new bmap btree cursor.
+ */
+struct xfs_btree_cur *
+xfs_bmbt_init_cursor(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ int whichfork)
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ struct xfs_btree_cur *cur;
+
+ cur = xfs_bmbt_init_common(mp, tp, ip, whichfork);
+
+ cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1;
+ cur->bc_ino.forksize = xfs_inode_fork_size(ip, whichfork);
cur->bc_ino.whichfork = whichfork;
return cur;
@@ -588,6 +599,76 @@ xfs_bmbt_block_maxrecs(
}
/*
+ * Allocate a new bmap btree cursor for reloading an inode block mapping data
+ * structure. Note that callers can use the staged cursor to reload extents
+ * format inode forks if they rebuild the iext tree and commit the staged
+ * cursor immediately.
+ */
+struct xfs_btree_cur *
+xfs_bmbt_stage_cursor(
+ struct xfs_mount *mp,
+ struct xfs_inode *ip,
+ struct xbtree_ifakeroot *ifake)
+{
+ struct xfs_btree_cur *cur;
+ struct xfs_btree_ops *ops;
+
+ /* data fork always has larger maxheight */
+ cur = xfs_bmbt_init_common(mp, NULL, ip, XFS_DATA_FORK);
+ cur->bc_nlevels = ifake->if_levels;
+ cur->bc_ino.forksize = ifake->if_fork_size;
+
+ /* Don't let anyone think we're attached to the real fork yet. */
+ cur->bc_ino.whichfork = -1;
+ xfs_btree_stage_ifakeroot(cur, ifake, &ops);
+ ops->update_cursor = NULL;
+ return cur;
+}
+
+/*
+ * Swap in the new inode fork root. Once we pass this point the newly rebuilt
+ * mappings are in place and we have to kill off any old btree blocks.
+ */
+void
+xfs_bmbt_commit_staged_btree(
+ struct xfs_btree_cur *cur,
+ struct xfs_trans *tp,
+ int whichfork)
+{
+ struct xbtree_ifakeroot *ifake = cur->bc_ino.ifake;
+ struct xfs_ifork *ifp;
+ static const short brootflag[2] = {XFS_ILOG_DBROOT, XFS_ILOG_ABROOT};
+ static const short extflag[2] = {XFS_ILOG_DEXT, XFS_ILOG_AEXT};
+ int flags = XFS_ILOG_CORE;
+
+ ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+ ASSERT(whichfork != XFS_COW_FORK);
+
+ /*
+ * Free any resources hanging off the real fork, then shallow-copy the
+ * staging fork's contents into the real fork to transfer everything
+ * we just built.
+ */
+ ifp = xfs_ifork_ptr(cur->bc_ino.ip, whichfork);
+ xfs_idestroy_fork(ifp);
+ memcpy(ifp, ifake->if_fork, sizeof(struct xfs_ifork));
+
+ switch (ifp->if_format) {
+ case XFS_DINODE_FMT_EXTENTS:
+ flags |= extflag[whichfork];
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ flags |= brootflag[whichfork];
+ break;
+ default:
+ ASSERT(0);
+ break;
+ }
+ xfs_trans_log_inode(tp, cur->bc_ino.ip, flags);
+ xfs_btree_commit_ifakeroot(cur, tp, whichfork, &xfs_bmbt_ops);
+}
+
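The two staging entry points slot into the usual fake-root bulk-load lifecycle. A hedged sketch, assuming an xbtree_ifakeroot (ifake) and an xfs_btree_bload control structure (bbl) already prepared by a hypothetical repair caller:

	struct xfs_btree_cur	*cur;
	int			error;

	cur = xfs_bmbt_stage_cursor(mp, ip, &ifake);

	/* Size the new btree, then format and write all of its blocks. */
	error = xfs_btree_bload_compute_geometry(cur, &bbl, nr_records);
	if (!error)
		error = xfs_btree_bload(cur, &bbl, priv);

	/* Swap the staged root into the real inode fork and log it. */
	if (!error)
		xfs_bmbt_commit_staged_btree(cur, tp, XFS_DATA_FORK);

	xfs_btree_del_cursor(cur, error);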
+/*
* Calculate number of records in a bmap btree block.
*/
int
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h
index 3e7a40a83835..151b8491f60e 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.h
+++ b/fs/xfs/libxfs/xfs_bmap_btree.h
@@ -11,6 +11,7 @@ struct xfs_btree_block;
struct xfs_mount;
struct xfs_inode;
struct xfs_trans;
+struct xbtree_ifakeroot;
/*
* Btree block header size depends on a superblock flag.
@@ -106,6 +107,10 @@ extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip,
extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
struct xfs_trans *, struct xfs_inode *, int);
+struct xfs_btree_cur *xfs_bmbt_stage_cursor(struct xfs_mount *mp,
+ struct xfs_inode *ip, struct xbtree_ifakeroot *ifake);
+void xfs_bmbt_commit_staged_btree(struct xfs_btree_cur *cur,
+ struct xfs_trans *tp, int whichfork);
extern unsigned long long xfs_bmbt_calc_size(struct xfs_mount *mp,
unsigned long long len);
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 6a6503ab0cd7..ea8d3659df20 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -1330,7 +1330,7 @@ xfs_btree_get_buf_block(
* Read in the buffer at the given ptr and return the buffer and
* the block pointer within the buffer.
*/
-STATIC int
+int
xfs_btree_read_buf_block(
struct xfs_btree_cur *cur,
const union xfs_btree_ptr *ptr,
@@ -5212,3 +5212,29 @@ xfs_btree_destroy_cur_caches(void)
xfs_rmapbt_destroy_cur_cache();
xfs_refcountbt_destroy_cur_cache();
}
+
+/* Move the btree cursor before the first record. */
+int
+xfs_btree_goto_left_edge(
+ struct xfs_btree_cur *cur)
+{
+ int stat = 0;
+ int error;
+
+ memset(&cur->bc_rec, 0, sizeof(cur->bc_rec));
+ error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, &stat);
+ if (error)
+ return error;
+ if (!stat)
+ return 0;
+
+ error = xfs_btree_decrement(cur, 0, &stat);
+ if (error)
+ return error;
+ if (stat != 0) {
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+
+ return 0;
+}
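A sketch of the walk this helper enables: park the cursor before the first record, then step right until the tree is exhausted:

	error = xfs_btree_goto_left_edge(cur);
	if (error)
		return error;

	for (;;) {
		int stat;

		error = xfs_btree_increment(cur, 0, &stat);
		if (error)
			return error;
		if (!stat)
			break;		/* walked off the right edge */

		/* ...process the record now under the cursor... */
	}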
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 4d68a58be160..d906324e25c8 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -700,6 +700,9 @@ void xfs_btree_set_ptr_null(struct xfs_btree_cur *cur,
int xfs_btree_get_buf_block(struct xfs_btree_cur *cur,
const union xfs_btree_ptr *ptr, struct xfs_btree_block **block,
struct xfs_buf **bpp);
+int xfs_btree_read_buf_block(struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr, int flags,
+ struct xfs_btree_block **block, struct xfs_buf **bpp);
void xfs_btree_set_sibling(struct xfs_btree_cur *cur,
struct xfs_btree_block *block, const union xfs_btree_ptr *ptr,
int lr);
@@ -735,4 +738,6 @@ xfs_btree_alloc_cursor(
int __init xfs_btree_init_cur_caches(void);
void xfs_btree_destroy_cur_caches(void);
+int xfs_btree_goto_left_edge(struct xfs_btree_cur *cur);
+
#endif /* __XFS_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c
index dd75e208b543..e276eba87cb1 100644
--- a/fs/xfs/libxfs/xfs_btree_staging.c
+++ b/fs/xfs/libxfs/xfs_btree_staging.c
@@ -333,20 +333,41 @@ xfs_btree_commit_ifakeroot(
/*
* Put a btree block that we're loading onto the ordered list and release it.
* The btree blocks will be written to disk when bulk loading is finished.
+ * If we reach the dirty buffer threshold, flush them to disk before
+ * continuing.
*/
-static void
+static int
xfs_btree_bload_drop_buf(
- struct list_head *buffers_list,
- struct xfs_buf **bpp)
+ struct xfs_btree_bload *bbl,
+ struct list_head *buffers_list,
+ struct xfs_buf **bpp)
{
- if (*bpp == NULL)
- return;
+ struct xfs_buf *bp = *bpp;
+ int error;
+
+ if (!bp)
+ return 0;
- if (!xfs_buf_delwri_queue(*bpp, buffers_list))
- ASSERT(0);
+ /*
+ * Mark this buffer XBF_DONE (i.e. uptodate) so that a subsequent
+ * xfs_buf_read will not pointlessly reread the contents from the disk.
+ */
+ bp->b_flags |= XBF_DONE;
- xfs_buf_relse(*bpp);
+ xfs_buf_delwri_queue_here(bp, buffers_list);
+ xfs_buf_relse(bp);
*bpp = NULL;
+ bbl->nr_dirty++;
+
+ if (!bbl->max_dirty || bbl->nr_dirty < bbl->max_dirty)
+ return 0;
+
+ error = xfs_buf_delwri_submit(buffers_list);
+ if (error)
+ return error;
+
+ bbl->nr_dirty = 0;
+ return 0;
}
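The throttle itself is simple enough to model in userspace; the names below are invented and mirror only the nr_dirty/max_dirty bookkeeping above:

	#include <stddef.h>

	struct wbatch {
		size_t	nr_dirty;	/* buffers queued since last flush */
		size_t	max_dirty;	/* threshold; 0 = never flush early */
	};

	/* Queue one buffer; flush the whole batch once at the threshold. */
	static int queue_one(struct wbatch *b,
			     int (*flush_all)(struct wbatch *))
	{
		b->nr_dirty++;
		if (!b->max_dirty || b->nr_dirty < b->max_dirty)
			return 0;

		b->nr_dirty = 0;
		return flush_all(b);
	}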
/*
@@ -384,7 +405,7 @@ xfs_btree_bload_prep_block(
ASSERT(*bpp == NULL);
/* Allocate a new incore btree root block. */
- new_size = bbl->iroot_size(cur, nr_this_block, priv);
+ new_size = bbl->iroot_size(cur, level, nr_this_block, priv);
ifp->if_broot = kmem_zalloc(new_size, 0);
ifp->if_broot_bytes = (int)new_size;
@@ -418,7 +439,10 @@ xfs_btree_bload_prep_block(
*/
if (*blockp)
xfs_btree_set_sibling(cur, *blockp, &new_ptr, XFS_BB_RIGHTSIB);
- xfs_btree_bload_drop_buf(buffers_list, bpp);
+
+ ret = xfs_btree_bload_drop_buf(bbl, buffers_list, bpp);
+ if (ret)
+ return ret;
/* Initialize the new btree block. */
xfs_btree_init_block_cur(cur, new_bp, level, nr_this_block);
@@ -436,22 +460,19 @@ STATIC int
xfs_btree_bload_leaf(
struct xfs_btree_cur *cur,
unsigned int recs_this_block,
- xfs_btree_bload_get_record_fn get_record,
+ xfs_btree_bload_get_records_fn get_records,
struct xfs_btree_block *block,
void *priv)
{
- unsigned int j;
+ unsigned int j = 1;
int ret;
/* Fill the leaf block with records. */
- for (j = 1; j <= recs_this_block; j++) {
- union xfs_btree_rec *block_rec;
-
- ret = get_record(cur, priv);
- if (ret)
+ while (j <= recs_this_block) {
+ ret = get_records(cur, j, block, recs_this_block - j + 1, priv);
+ if (ret < 0)
return ret;
- block_rec = xfs_btree_rec_addr(cur, j, block);
- cur->bc_ops->init_rec_from_cur(cur, block_rec);
+ j += ret;
}
return 0;
@@ -485,7 +506,12 @@ xfs_btree_bload_node(
ASSERT(!xfs_btree_ptr_is_null(cur, child_ptr));
- ret = xfs_btree_get_buf_block(cur, child_ptr, &child_block,
+ /*
+ * Read the lower-level block in case the buffer for it has
+ * been reclaimed. LRU refs will be set on the block, which is
+ * desirable if the new btree commits.
+ */
+ ret = xfs_btree_read_buf_block(cur, child_ptr, 0, &child_block,
&child_bp);
if (ret)
return ret;
@@ -570,7 +596,14 @@ xfs_btree_bload_level_geometry(
unsigned int desired_npb;
unsigned int maxnr;
- maxnr = cur->bc_ops->get_maxrecs(cur, level);
+ /*
+ * Compute the absolute maximum number of records that we can store in
+ * the ondisk block or inode root.
+ */
+ if (cur->bc_ops->get_dmaxrecs)
+ maxnr = cur->bc_ops->get_dmaxrecs(cur, level);
+ else
+ maxnr = cur->bc_ops->get_maxrecs(cur, level);
/*
* Compute the number of blocks we need to fill each block with the
@@ -764,6 +797,7 @@ xfs_btree_bload(
cur->bc_nlevels = bbl->btree_height;
xfs_btree_set_ptr_null(cur, &child_ptr);
xfs_btree_set_ptr_null(cur, &ptr);
+ bbl->nr_dirty = 0;
xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
&avg_per_block, &blocks, &blocks_with_extra);
@@ -789,7 +823,7 @@ xfs_btree_bload(
trace_xfs_btree_bload_block(cur, level, i, blocks, &ptr,
nr_this_block);
- ret = xfs_btree_bload_leaf(cur, nr_this_block, bbl->get_record,
+ ret = xfs_btree_bload_leaf(cur, nr_this_block, bbl->get_records,
block, priv);
if (ret)
goto out;
@@ -802,7 +836,10 @@ xfs_btree_bload(
xfs_btree_copy_ptrs(cur, &child_ptr, &ptr, 1);
}
total_blocks += blocks;
- xfs_btree_bload_drop_buf(&buffers_list, &bp);
+
+ ret = xfs_btree_bload_drop_buf(bbl, &buffers_list, &bp);
+ if (ret)
+ goto out;
/* Populate the internal btree nodes. */
for (level = 1; level < cur->bc_nlevels; level++) {
@@ -844,7 +881,11 @@ xfs_btree_bload(
xfs_btree_copy_ptrs(cur, &first_ptr, &ptr, 1);
}
total_blocks += blocks;
- xfs_btree_bload_drop_buf(&buffers_list, &bp);
+
+ ret = xfs_btree_bload_drop_buf(bbl, &buffers_list, &bp);
+ if (ret)
+ goto out;
+
xfs_btree_copy_ptrs(cur, &child_ptr, &first_ptr, 1);
}
diff --git a/fs/xfs/libxfs/xfs_btree_staging.h b/fs/xfs/libxfs/xfs_btree_staging.h
index f0d2976050ae..055ea43b1e18 100644
--- a/fs/xfs/libxfs/xfs_btree_staging.h
+++ b/fs/xfs/libxfs/xfs_btree_staging.h
@@ -37,12 +37,6 @@ struct xbtree_ifakeroot {
/* Number of bytes available for this fork in the inode. */
unsigned int if_fork_size;
-
- /* Fork format. */
- unsigned int if_format;
-
- /* Number of records. */
- unsigned int if_extents;
};
/* Cursor interactions with fake roots for inode-rooted btrees. */
@@ -53,19 +47,24 @@ void xfs_btree_commit_ifakeroot(struct xfs_btree_cur *cur, struct xfs_trans *tp,
int whichfork, const struct xfs_btree_ops *ops);
/* Bulk loading of staged btrees. */
-typedef int (*xfs_btree_bload_get_record_fn)(struct xfs_btree_cur *cur, void *priv);
+typedef int (*xfs_btree_bload_get_records_fn)(struct xfs_btree_cur *cur,
+ unsigned int idx, struct xfs_btree_block *block,
+ unsigned int nr_wanted, void *priv);
typedef int (*xfs_btree_bload_claim_block_fn)(struct xfs_btree_cur *cur,
union xfs_btree_ptr *ptr, void *priv);
typedef size_t (*xfs_btree_bload_iroot_size_fn)(struct xfs_btree_cur *cur,
- unsigned int nr_this_level, void *priv);
+ unsigned int level, unsigned int nr_this_level, void *priv);
struct xfs_btree_bload {
/*
- * This function will be called nr_records times to load records into
- * the btree. The function does this by setting the cursor's bc_rec
- * field in in-core format. Records must be returned in sort order.
+ * This function will be called to load @nr_wanted records into the
+ * btree. The implementation does this by setting the cursor's bc_rec
+ * field in in-core format and using init_rec_from_cur to set the
+ * records in the btree block. Records must be returned in sort order.
+ * The function must return the number of records loaded or the usual
+ * negative errno.
*/
- xfs_btree_bload_get_record_fn get_record;
+ xfs_btree_bload_get_records_fn get_records;
/*
* This function will be called nr_blocks times to obtain a pointer
@@ -113,6 +112,16 @@ struct xfs_btree_bload {
* height of the new btree.
*/
unsigned int btree_height;
+
+ /*
+ * Flush the new btree block buffer list to disk after this many blocks
+ * have been formatted. Zero prohibits writing any buffers until all
+ * blocks have been formatted.
+ */
+ uint16_t max_dirty;
+
+ /* Number of dirty buffers. */
+ uint16_t nr_dirty;
};
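A hedged sketch of a get_records implementation matching the typedef above, assuming the staged records live in a flat, sorted in-memory array reached through @priv (all example_* names are invented):

	struct example_records {
		union xfs_btree_irec	*recs;	/* sorted staging records */
		unsigned int		nr;	/* total record count */
		unsigned int		next;	/* next record to load */
	};

	STATIC int
	example_get_records(
		struct xfs_btree_cur	*cur,
		unsigned int		idx,
		struct xfs_btree_block	*block,
		unsigned int		nr_wanted,
		void			*priv)
	{
		struct example_records	*er = priv;
		union xfs_btree_rec	*block_rec;
		unsigned int		loaded;

		for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
			if (er->next >= er->nr)
				break;

			/* Stage in bc_rec, then format into the block. */
			cur->bc_rec = er->recs[er->next++];
			block_rec = xfs_btree_rec_addr(cur, idx, block);
			cur->bc_ops->init_rec_from_cur(cur, block_rec);
		}

		return loaded;
	}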
int xfs_btree_bload_compute_geometry(struct xfs_btree_cur *cur,
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index e576560b46e9..5457188bb4de 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -421,6 +421,25 @@ xfs_da3_node_read_mapped(
return xfs_da3_node_set_type(tp, *bpp);
}
+/*
+ * Copy src directory/attr leaf/node buffer to the dst.
+ * For v5 file systems make sure the right blkno is stamped in.
+ */
+void
+xfs_da_buf_copy(
+ struct xfs_buf *dst,
+ struct xfs_buf *src,
+ size_t size)
+{
+ struct xfs_da3_blkinfo *da3 = dst->b_addr;
+
+ memcpy(dst->b_addr, src->b_addr, size);
+ dst->b_ops = src->b_ops;
+ xfs_trans_buf_copy_type(dst, src);
+ if (xfs_has_crc(dst->b_mount))
+ da3->blkno = cpu_to_be64(xfs_buf_daddr(dst));
+}
+
/*========================================================================
* Routines used for growing the Btree.
*========================================================================*/
@@ -690,12 +709,6 @@ xfs_da3_root_split(
btree = icnodehdr.btree;
size = (int)((char *)&btree[icnodehdr.count] - (char *)oldroot);
level = icnodehdr.level;
-
- /*
- * we are about to copy oldroot to bp, so set up the type
- * of bp while we know exactly what it will be.
- */
- xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF);
} else {
struct xfs_dir3_icleaf_hdr leafhdr;
@@ -707,31 +720,17 @@ xfs_da3_root_split(
size = (int)((char *)&leafhdr.ents[leafhdr.count] -
(char *)leaf);
level = 0;
-
- /*
- * we are about to copy oldroot to bp, so set up the type
- * of bp while we know exactly what it will be.
- */
- xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAFN_BUF);
}
/*
- * we can copy most of the information in the node from one block to
- * another, but for CRC enabled headers we have to make sure that the
- * block specific identifiers are kept intact. We update the buffer
- * directly for this.
+ * Copy old root to new buffer and log it.
*/
- memcpy(node, oldroot, size);
- if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) ||
- oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
- struct xfs_da3_intnode *node3 = (struct xfs_da3_intnode *)node;
-
- node3->hdr.info.blkno = cpu_to_be64(xfs_buf_daddr(bp));
- }
+ xfs_da_buf_copy(bp, blk1->bp, size);
xfs_trans_log_buf(tp, bp, 0, size - 1);
- bp->b_ops = blk1->bp->b_ops;
- xfs_trans_buf_copy_type(bp, blk1->bp);
+ /*
+ * Update blk1 to point to new buffer.
+ */
blk1->bp = bp;
blk1->blkno = blkno;
@@ -1220,21 +1219,14 @@ xfs_da3_root_join(
xfs_da_blkinfo_onlychild_validate(bp->b_addr, oldroothdr.level);
/*
- * This could be copying a leaf back into the root block in the case of
- * there only being a single leaf block left in the tree. Hence we have
- * to update the b_ops pointer as well to match the buffer type change
- * that could occur. For dir3 blocks we also need to update the block
- * number in the buffer header.
+ * Copy child to root buffer and log it.
*/
- memcpy(root_blk->bp->b_addr, bp->b_addr, args->geo->blksize);
- root_blk->bp->b_ops = bp->b_ops;
- xfs_trans_buf_copy_type(root_blk->bp, bp);
- if (oldroothdr.magic == XFS_DA3_NODE_MAGIC) {
- struct xfs_da3_blkinfo *da3 = root_blk->bp->b_addr;
- da3->blkno = cpu_to_be64(xfs_buf_daddr(root_blk->bp));
- }
+ xfs_da_buf_copy(root_blk->bp, bp, args->geo->blksize);
xfs_trans_log_buf(args->trans, root_blk->bp, 0,
args->geo->blksize - 1);
+ /*
+ * Now we can drop the child buffer.
+ */
error = xfs_da_shrink_inode(args, child, bp);
return error;
}
@@ -2317,9 +2309,10 @@ xfs_da3_swap_lastblock(
/*
* Copy the last block into the dead buffer and log it.
*/
- memcpy(dead_buf->b_addr, last_buf->b_addr, args->geo->blksize);
+ xfs_da_buf_copy(dead_buf, last_buf, args->geo->blksize);
xfs_trans_log_buf(tp, dead_buf, 0, args->geo->blksize - 1);
dead_info = dead_buf->b_addr;
+
/*
* Get values from the moved block.
*/
diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h
index ffa3df5b2893..706baf36e175 100644
--- a/fs/xfs/libxfs/xfs_da_btree.h
+++ b/fs/xfs/libxfs/xfs_da_btree.h
@@ -219,6 +219,8 @@ int xfs_da_reada_buf(struct xfs_inode *dp, xfs_dablk_t bno,
const struct xfs_buf_ops *ops);
int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
struct xfs_buf *dead_buf);
+void xfs_da_buf_copy(struct xfs_buf *dst, struct xfs_buf *src,
+ size_t size);
uint xfs_da_hashname(const uint8_t *name_string, int name_length);
enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args,
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index f9015f88eca7..24f9d1461f9a 100644
--- a/fs/xfs/libxfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -578,20 +578,25 @@ xfs_dir2_block_leaf_p(struct xfs_dir2_block_tail *btp)
#define XFS_ATTR_LEAF_MAPSIZE 3 /* how many freespace slots */
/*
- * Entries are packed toward the top as tight as possible.
- */
-struct xfs_attr_shortform {
- struct xfs_attr_sf_hdr { /* constant-structure header block */
- __be16 totsize; /* total bytes in shortform list */
- __u8 count; /* count of active entries */
- __u8 padding;
- } hdr;
- struct xfs_attr_sf_entry {
- uint8_t namelen; /* actual length of name (no NULL) */
- uint8_t valuelen; /* actual length of value (no NULL) */
- uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */
- uint8_t nameval[]; /* name & value bytes concatenated */
- } list[]; /* variable sized array */
+ * Attribute storage when stored inside the inode.
+ *
+ * Small attribute lists are packed as tightly as possible so as to fit into the
+ * literal area of the inode.
+ *
+ * These "shortform" attribute forks consist of a single xfs_attr_sf_hdr header
+ * followed by zero or more xfs_attr_sf_entry structures.
+ */
+struct xfs_attr_sf_hdr { /* constant-structure header block */
+ __be16 totsize; /* total bytes in shortform list */
+ __u8 count; /* count of active entries */
+ __u8 padding;
+};
+
+struct xfs_attr_sf_entry {
+ __u8 namelen; /* actual length of name (no NULL) */
+ __u8 valuelen; /* actual length of value (no NULL) */
+ __u8 flags; /* flags bits (XFS_ATTR_*) */
+ __u8 nameval[]; /* name & value bytes concatenated */
};
typedef struct xfs_attr_leaf_map { /* RLE map of free bytes */
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index f71679ce23b9..66a17910d021 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -26,6 +26,7 @@
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_attr.h"
+#include "xfs_trans_priv.h"
static struct kmem_cache *xfs_defer_pending_cache;
@@ -181,16 +182,89 @@ static struct kmem_cache *xfs_defer_pending_cache;
* Note that the continuation requested between t2 and t3 is likely to
* reoccur.
*/
+STATIC struct xfs_log_item *
+xfs_defer_barrier_create_intent(
+ struct xfs_trans *tp,
+ struct list_head *items,
+ unsigned int count,
+ bool sort)
+{
+ return NULL;
+}
-static const struct xfs_defer_op_type *defer_op_types[] = {
- [XFS_DEFER_OPS_TYPE_BMAP] = &xfs_bmap_update_defer_type,
- [XFS_DEFER_OPS_TYPE_REFCOUNT] = &xfs_refcount_update_defer_type,
- [XFS_DEFER_OPS_TYPE_RMAP] = &xfs_rmap_update_defer_type,
- [XFS_DEFER_OPS_TYPE_FREE] = &xfs_extent_free_defer_type,
- [XFS_DEFER_OPS_TYPE_AGFL_FREE] = &xfs_agfl_free_defer_type,
- [XFS_DEFER_OPS_TYPE_ATTR] = &xfs_attr_defer_type,
+STATIC void
+xfs_defer_barrier_abort_intent(
+ struct xfs_log_item *intent)
+{
+ /* empty */
+}
+
+STATIC struct xfs_log_item *
+xfs_defer_barrier_create_done(
+ struct xfs_trans *tp,
+ struct xfs_log_item *intent,
+ unsigned int count)
+{
+ return NULL;
+}
+
+STATIC int
+xfs_defer_barrier_finish_item(
+ struct xfs_trans *tp,
+ struct xfs_log_item *done,
+ struct list_head *item,
+ struct xfs_btree_cur **state)
+{
+ ASSERT(0);
+ return -EFSCORRUPTED;
+}
+
+STATIC void
+xfs_defer_barrier_cancel_item(
+ struct list_head *item)
+{
+ ASSERT(0);
+}
+
+static const struct xfs_defer_op_type xfs_barrier_defer_type = {
+ .max_items = 1,
+ .create_intent = xfs_defer_barrier_create_intent,
+ .abort_intent = xfs_defer_barrier_abort_intent,
+ .create_done = xfs_defer_barrier_create_done,
+ .finish_item = xfs_defer_barrier_finish_item,
+ .cancel_item = xfs_defer_barrier_cancel_item,
};
+/* Create a log intent done item for a log intent item. */
+static inline void
+xfs_defer_create_done(
+ struct xfs_trans *tp,
+ struct xfs_defer_pending *dfp)
+{
+ struct xfs_log_item *lip;
+
+ /* If there is no log intent item, there can be no log done item. */
+ if (!dfp->dfp_intent)
+ return;
+
+ /*
+ * Mark the transaction dirty, even on error. This ensures the
+ * transaction is aborted, which:
+ *
+ * 1.) releases the log intent item and frees the log done item
+ * 2.) shuts down the filesystem
+ */
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ lip = dfp->dfp_ops->create_done(tp, dfp->dfp_intent, dfp->dfp_count);
+ if (!lip)
+ return;
+
+ tp->t_flags |= XFS_TRANS_HAS_INTENT_DONE;
+ xfs_trans_add_item(tp, lip);
+ set_bit(XFS_LI_DIRTY, &lip->li_flags);
+ dfp->dfp_done = lip;
+}
+
/*
* Ensure there's a log intent item associated with this deferred work item if
* the operation must be restarted on crash. Returns 1 if there's a log item;
@@ -202,18 +276,21 @@ xfs_defer_create_intent(
struct xfs_defer_pending *dfp,
bool sort)
{
- const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type];
struct xfs_log_item *lip;
if (dfp->dfp_intent)
return 1;
- lip = ops->create_intent(tp, &dfp->dfp_work, dfp->dfp_count, sort);
+ lip = dfp->dfp_ops->create_intent(tp, &dfp->dfp_work, dfp->dfp_count,
+ sort);
if (!lip)
return 0;
if (IS_ERR(lip))
return PTR_ERR(lip);
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ xfs_trans_add_item(tp, lip);
+ set_bit(XFS_LI_DIRTY, &lip->li_flags);
dfp->dfp_intent = lip;
return 1;
}
@@ -245,23 +322,50 @@ xfs_defer_create_intents(
return ret;
}
-STATIC void
+static inline void
xfs_defer_pending_abort(
struct xfs_mount *mp,
+ struct xfs_defer_pending *dfp)
+{
+ trace_xfs_defer_pending_abort(mp, dfp);
+
+ if (dfp->dfp_intent && !dfp->dfp_done) {
+ dfp->dfp_ops->abort_intent(dfp->dfp_intent);
+ dfp->dfp_intent = NULL;
+ }
+}
+
+static inline void
+xfs_defer_pending_cancel_work(
+ struct xfs_mount *mp,
+ struct xfs_defer_pending *dfp)
+{
+ struct list_head *pwi;
+ struct list_head *n;
+
+ trace_xfs_defer_cancel_list(mp, dfp);
+
+ list_del(&dfp->dfp_list);
+ list_for_each_safe(pwi, n, &dfp->dfp_work) {
+ list_del(pwi);
+ dfp->dfp_count--;
+ trace_xfs_defer_cancel_item(mp, dfp, pwi);
+ dfp->dfp_ops->cancel_item(pwi);
+ }
+ ASSERT(dfp->dfp_count == 0);
+ kmem_cache_free(xfs_defer_pending_cache, dfp);
+}
+
+STATIC void
+xfs_defer_pending_abort_list(
+ struct xfs_mount *mp,
struct list_head *dop_list)
{
struct xfs_defer_pending *dfp;
- const struct xfs_defer_op_type *ops;
/* Abort intent items that don't have a done item. */
- list_for_each_entry(dfp, dop_list, dfp_list) {
- ops = defer_op_types[dfp->dfp_type];
- trace_xfs_defer_pending_abort(mp, dfp);
- if (dfp->dfp_intent && !dfp->dfp_done) {
- ops->abort_intent(dfp->dfp_intent);
- dfp->dfp_intent = NULL;
- }
- }
+ list_for_each_entry(dfp, dop_list, dfp_list)
+ xfs_defer_pending_abort(mp, dfp);
}
/* Abort all the intents that were committed. */
@@ -271,7 +375,7 @@ xfs_defer_trans_abort(
struct list_head *dop_pending)
{
trace_xfs_defer_trans_abort(tp, _RET_IP_);
- xfs_defer_pending_abort(tp->t_mountp, dop_pending);
+ xfs_defer_pending_abort_list(tp->t_mountp, dop_pending);
}
/*
@@ -389,27 +493,31 @@ xfs_defer_cancel_list(
{
struct xfs_defer_pending *dfp;
struct xfs_defer_pending *pli;
- struct list_head *pwi;
- struct list_head *n;
- const struct xfs_defer_op_type *ops;
/*
* Free the pending items. Caller should already have arranged
* for the intent items to be released.
*/
- list_for_each_entry_safe(dfp, pli, dop_list, dfp_list) {
- ops = defer_op_types[dfp->dfp_type];
- trace_xfs_defer_cancel_list(mp, dfp);
- list_del(&dfp->dfp_list);
- list_for_each_safe(pwi, n, &dfp->dfp_work) {
- list_del(pwi);
- dfp->dfp_count--;
- trace_xfs_defer_cancel_item(mp, dfp, pwi);
- ops->cancel_item(pwi);
- }
- ASSERT(dfp->dfp_count == 0);
- kmem_cache_free(xfs_defer_pending_cache, dfp);
+ list_for_each_entry_safe(dfp, pli, dop_list, dfp_list)
+ xfs_defer_pending_cancel_work(mp, dfp);
+}
+
+static inline void
+xfs_defer_relog_intent(
+ struct xfs_trans *tp,
+ struct xfs_defer_pending *dfp)
+{
+ struct xfs_log_item *lip;
+
+ xfs_defer_create_done(tp, dfp);
+
+ lip = dfp->dfp_ops->relog_intent(tp, dfp->dfp_intent, dfp->dfp_done);
+ if (lip) {
+ xfs_trans_add_item(tp, lip);
+ set_bit(XFS_LI_DIRTY, &lip->li_flags);
}
+ dfp->dfp_done = NULL;
+ dfp->dfp_intent = lip;
}
/*
@@ -417,7 +525,7 @@ xfs_defer_cancel_list(
* done item to release the intent item; and then log a new intent item.
* The caller should provide a fresh transaction and roll it after we're done.
*/
-static int
+static void
xfs_defer_relog(
struct xfs_trans **tpp,
struct list_head *dfops)
@@ -456,31 +564,28 @@ xfs_defer_relog(
trace_xfs_defer_relog_intent((*tpp)->t_mountp, dfp);
XFS_STATS_INC((*tpp)->t_mountp, defer_relog);
- dfp->dfp_intent = xfs_trans_item_relog(dfp->dfp_intent, *tpp);
- }
- if ((*tpp)->t_flags & XFS_TRANS_DIRTY)
- return xfs_defer_trans_roll(tpp);
- return 0;
+ xfs_defer_relog_intent(*tpp, dfp);
+ }
}
/*
* Log an intent-done item for the first pending intent, and finish the work
* items.
*/
-static int
+int
xfs_defer_finish_one(
struct xfs_trans *tp,
struct xfs_defer_pending *dfp)
{
- const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type];
+ const struct xfs_defer_op_type *ops = dfp->dfp_ops;
struct xfs_btree_cur *state = NULL;
struct list_head *li, *n;
int error;
trace_xfs_defer_pending_finish(tp->t_mountp, dfp);
- dfp->dfp_done = ops->create_done(tp, dfp->dfp_intent, dfp->dfp_count);
+ xfs_defer_create_done(tp, dfp);
list_for_each_safe(li, n, &dfp->dfp_work) {
list_del(li);
dfp->dfp_count--;
@@ -517,6 +622,24 @@ out:
return error;
}
+/* Move all paused deferred work from @tp to @paused_list. */
+static void
+xfs_defer_isolate_paused(
+ struct xfs_trans *tp,
+ struct list_head *paused_list)
+{
+ struct xfs_defer_pending *dfp;
+ struct xfs_defer_pending *pli;
+
+ list_for_each_entry_safe(dfp, pli, &tp->t_dfops, dfp_list) {
+ if (!(dfp->dfp_flags & XFS_DEFER_PAUSED))
+ continue;
+
+ list_move_tail(&dfp->dfp_list, paused_list);
+ trace_xfs_defer_isolate_paused(tp->t_mountp, dfp);
+ }
+}
+
/*
* Finish all the pending work. This involves logging intent items for
* any work items that wandered in since the last transaction roll (if
@@ -532,6 +655,7 @@ xfs_defer_finish_noroll(
struct xfs_defer_pending *dfp = NULL;
int error = 0;
LIST_HEAD(dop_pending);
+ LIST_HEAD(dop_paused);
ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
@@ -550,6 +674,8 @@ xfs_defer_finish_noroll(
*/
int has_intents = xfs_defer_create_intents(*tp);
+ xfs_defer_isolate_paused(*tp, &dop_paused);
+
list_splice_init(&(*tp)->t_dfops, &dop_pending);
if (has_intents < 0) {
@@ -562,22 +688,33 @@ xfs_defer_finish_noroll(
goto out_shutdown;
/* Relog intent items to keep the log moving. */
- error = xfs_defer_relog(tp, &dop_pending);
- if (error)
- goto out_shutdown;
+ xfs_defer_relog(tp, &dop_pending);
+ xfs_defer_relog(tp, &dop_paused);
+
+ if ((*tp)->t_flags & XFS_TRANS_DIRTY) {
+ error = xfs_defer_trans_roll(tp);
+ if (error)
+ goto out_shutdown;
+ }
}
- dfp = list_first_entry(&dop_pending, struct xfs_defer_pending,
- dfp_list);
+ dfp = list_first_entry_or_null(&dop_pending,
+ struct xfs_defer_pending, dfp_list);
+ if (!dfp)
+ break;
error = xfs_defer_finish_one(*tp, dfp);
if (error && error != -EAGAIN)
goto out_shutdown;
}
+ /* Requeue the paused items in the outgoing transaction. */
+ list_splice_tail_init(&dop_paused, &(*tp)->t_dfops);
+
trace_xfs_defer_finish_done(*tp, _RET_IP_);
return 0;
out_shutdown:
+ list_splice_tail_init(&dop_paused, &dop_pending);
xfs_defer_trans_abort(*tp, &dop_pending);
xfs_force_shutdown((*tp)->t_mountp, SHUTDOWN_CORRUPT_INCORE);
trace_xfs_defer_finish_error(*tp, error);
@@ -590,6 +727,9 @@ int
xfs_defer_finish(
struct xfs_trans **tp)
{
+#ifdef DEBUG
+ struct xfs_defer_pending *dfp;
+#endif
int error;
/*
@@ -609,7 +749,10 @@ xfs_defer_finish(
}
/* Reset LOWMODE now that we've finished all the dfops. */
- ASSERT(list_empty(&(*tp)->t_dfops));
+#ifdef DEBUG
+ list_for_each_entry(dfp, &(*tp)->t_dfops, dfp_list)
+ ASSERT(dfp->dfp_flags & XFS_DEFER_PAUSED);
+#endif
(*tp)->t_flags &= ~XFS_TRANS_LOWMODE;
return 0;
}
@@ -621,48 +764,165 @@ xfs_defer_cancel(
struct xfs_mount *mp = tp->t_mountp;
trace_xfs_defer_cancel(tp, _RET_IP_);
+ xfs_defer_trans_abort(tp, &tp->t_dfops);
xfs_defer_cancel_list(mp, &tp->t_dfops);
}
+/*
+ * Return the last pending work item attached to this transaction if it matches
+ * the deferred op type.
+ */
+static inline struct xfs_defer_pending *
+xfs_defer_find_last(
+ struct xfs_trans *tp,
+ const struct xfs_defer_op_type *ops)
+{
+ struct xfs_defer_pending *dfp = NULL;
+
+ /* No dfops at all? */
+ if (list_empty(&tp->t_dfops))
+ return NULL;
+
+ dfp = list_last_entry(&tp->t_dfops, struct xfs_defer_pending,
+ dfp_list);
+
+ /* Wrong type? */
+ if (dfp->dfp_ops != ops)
+ return NULL;
+ return dfp;
+}
+
+/*
+ * Decide if we can add a deferred work item to the last dfops item attached
+ * to the transaction.
+ */
+static inline bool
+xfs_defer_can_append(
+ struct xfs_defer_pending *dfp,
+ const struct xfs_defer_op_type *ops)
+{
+ /* Already logged? */
+ if (dfp->dfp_intent)
+ return false;
+
+ /* Paused items cannot absorb more work */
+ if (dfp->dfp_flags & XFS_DEFER_PAUSED)
+ return false;
+
+ /* Already full? */
+ if (ops->max_items && dfp->dfp_count >= ops->max_items)
+ return false;
+
+ return true;
+}
+
+/* Create a new pending item at the end of the transaction list. */
+static inline struct xfs_defer_pending *
+xfs_defer_alloc(
+ struct xfs_trans *tp,
+ const struct xfs_defer_op_type *ops)
+{
+ struct xfs_defer_pending *dfp;
+
+ dfp = kmem_cache_zalloc(xfs_defer_pending_cache,
+ GFP_NOFS | __GFP_NOFAIL);
+ dfp->dfp_ops = ops;
+ INIT_LIST_HEAD(&dfp->dfp_work);
+ list_add_tail(&dfp->dfp_list, &tp->t_dfops);
+
+ return dfp;
+}
+
/* Add an item for later deferred processing. */
-void
+struct xfs_defer_pending *
xfs_defer_add(
struct xfs_trans *tp,
- enum xfs_defer_ops_type type,
- struct list_head *li)
+ struct list_head *li,
+ const struct xfs_defer_op_type *ops)
{
struct xfs_defer_pending *dfp = NULL;
- const struct xfs_defer_op_type *ops = defer_op_types[type];
ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
- BUILD_BUG_ON(ARRAY_SIZE(defer_op_types) != XFS_DEFER_OPS_TYPE_MAX);
- /*
- * Add the item to a pending item at the end of the intake list.
- * If the last pending item has the same type, reuse it. Else,
- * create a new pending item at the end of the intake list.
- */
- if (!list_empty(&tp->t_dfops)) {
- dfp = list_last_entry(&tp->t_dfops,
- struct xfs_defer_pending, dfp_list);
- if (dfp->dfp_type != type ||
- (ops->max_items && dfp->dfp_count >= ops->max_items))
- dfp = NULL;
- }
- if (!dfp) {
- dfp = kmem_cache_zalloc(xfs_defer_pending_cache,
- GFP_NOFS | __GFP_NOFAIL);
- dfp->dfp_type = type;
- dfp->dfp_intent = NULL;
- dfp->dfp_done = NULL;
- dfp->dfp_count = 0;
- INIT_LIST_HEAD(&dfp->dfp_work);
- list_add_tail(&dfp->dfp_list, &tp->t_dfops);
- }
+ dfp = xfs_defer_find_last(tp, ops);
+ if (!dfp || !xfs_defer_can_append(dfp, ops))
+ dfp = xfs_defer_alloc(tp, ops);
- list_add_tail(li, &dfp->dfp_work);
+ xfs_defer_add_item(dfp, li);
trace_xfs_defer_add_item(tp->t_mountp, dfp, li);
- dfp->dfp_count++;
+ return dfp;
+}
+
+/*
+ * Add a defer ops barrier to force two otherwise adjacent deferred work items
+ * to be tracked separately and have separate log items.
+ */
+void
+xfs_defer_add_barrier(
+ struct xfs_trans *tp)
+{
+ struct xfs_defer_pending *dfp;
+
+ ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+
+ /* If the last defer op added was a barrier, we're done. */
+ dfp = xfs_defer_find_last(tp, &xfs_barrier_defer_type);
+ if (dfp)
+ return;
+
+ dfp = xfs_defer_alloc(tp, &xfs_barrier_defer_type);
+
+ trace_xfs_defer_add_item(tp->t_mountp, dfp, NULL);
+}
+
+/*
+ * Create a pending deferred work item to replay the recovered intent item
+ * and add it to the list.
+ */
+void
+xfs_defer_start_recovery(
+ struct xfs_log_item *lip,
+ struct list_head *r_dfops,
+ const struct xfs_defer_op_type *ops)
+{
+ struct xfs_defer_pending *dfp;
+
+ dfp = kmem_cache_zalloc(xfs_defer_pending_cache,
+ GFP_NOFS | __GFP_NOFAIL);
+ dfp->dfp_ops = ops;
+ dfp->dfp_intent = lip;
+ INIT_LIST_HEAD(&dfp->dfp_work);
+ list_add_tail(&dfp->dfp_list, r_dfops);
+}
+
+/*
+ * Cancel a deferred work item created to recover a log intent item. @dfp
+ * will be freed after this function returns.
+ */
+void
+xfs_defer_cancel_recovery(
+ struct xfs_mount *mp,
+ struct xfs_defer_pending *dfp)
+{
+ xfs_defer_pending_abort(mp, dfp);
+ xfs_defer_pending_cancel_work(mp, dfp);
+}
+
+/* Replay the deferred work item created from a recovered log intent item. */
+int
+xfs_defer_finish_recovery(
+ struct xfs_mount *mp,
+ struct xfs_defer_pending *dfp,
+ struct list_head *capture_list)
+{
+ const struct xfs_defer_op_type *ops = dfp->dfp_ops;
+ int error;
+
+ /* dfp is freed by recover_work and must not be accessed afterwards */
+ error = ops->recover_work(dfp, capture_list);
+ if (error)
+ trace_xlog_intent_recovery_failed(mp, ops, error);
+ return error;
}
/*
@@ -769,7 +1029,7 @@ xfs_defer_ops_capture_abort(
{
unsigned short i;
- xfs_defer_pending_abort(mp, &dfc->dfc_dfops);
+ xfs_defer_pending_abort_list(mp, &dfc->dfc_dfops);
xfs_defer_cancel_list(mp, &dfc->dfc_dfops);
for (i = 0; i < dfc->dfc_held.dr_bufs; i++)
@@ -938,3 +1198,36 @@ xfs_defer_destroy_item_caches(void)
xfs_rmap_intent_destroy_cache();
xfs_defer_destroy_cache();
}
+
+/*
+ * Mark a deferred work item so that it will be requeued indefinitely without
+ * being finished. Caller must ensure there are no data dependencies on this
+ * work item in the meantime.
+ */
+void
+xfs_defer_item_pause(
+ struct xfs_trans *tp,
+ struct xfs_defer_pending *dfp)
+{
+ ASSERT(!(dfp->dfp_flags & XFS_DEFER_PAUSED));
+
+ dfp->dfp_flags |= XFS_DEFER_PAUSED;
+
+ trace_xfs_defer_item_pause(tp->t_mountp, dfp);
+}
+
+/*
+ * Release a paused deferred work item so that it will be finished during the
+ * next transaction roll.
+ */
+void
+xfs_defer_item_unpause(
+ struct xfs_trans *tp,
+ struct xfs_defer_pending *dfp)
+{
+ ASSERT(dfp->dfp_flags & XFS_DEFER_PAUSED);
+
+ dfp->dfp_flags &= ~XFS_DEFER_PAUSED;
+
+ trace_xfs_defer_item_unpause(tp->t_mountp, dfp);
+}
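
Annotation: the barrier, pause, and append changes above all hinge on one question: may a new work item join the dfp at the tail of t_dfops, or does it need a fresh one? The toy model below folds xfs_defer_find_last() and xfs_defer_can_append() into a single predicate. The types are simplified stand-ins, not the kernel structures; max_items == 0 means unlimited, as in the real op types.

#include <stdbool.h>
#include <stdio.h>

#define DEFER_PAUSED	(1U << 0)

struct op_type { const char *name; unsigned int max_items; };

struct pending {
	const struct op_type *ops;
	unsigned int count;
	unsigned int flags;
	bool logged;		/* stands in for dfp_intent != NULL */
};

/* find_last()'s type check and can_append()'s tests, folded into one. */
static bool can_append(const struct pending *dfp, const struct op_type *ops)
{
	if (dfp->ops != ops)
		return false;	/* wrong type: a barrier never matches */
	if (dfp->logged)
		return false;	/* intent already logged */
	if (dfp->flags & DEFER_PAUSED)
		return false;	/* paused items absorb no more work */
	if (ops->max_items && dfp->count >= ops->max_items)
		return false;	/* already full */
	return true;
}

int main(void)
{
	const struct op_type rmap = { "rmap", 0 };	/* 0 == unlimited */
	const struct op_type barrier = { "barrier", 1 };
	struct pending last = { &rmap, 3, 0, false };

	printf("append rmap work:    %d\n", can_append(&last, &rmap));
	printf("append barrier type: %d\n", can_append(&last, &barrier));
	last.flags |= DEFER_PAUSED;
	printf("append when paused:  %d\n", can_append(&last, &rmap));
	return 0;
}
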
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
index 8788ad5f6a73..18a9fb92dde8 100644
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -11,19 +11,6 @@ struct xfs_defer_op_type;
struct xfs_defer_capture;
/*
- * Header for deferred operation list.
- */
-enum xfs_defer_ops_type {
- XFS_DEFER_OPS_TYPE_BMAP,
- XFS_DEFER_OPS_TYPE_REFCOUNT,
- XFS_DEFER_OPS_TYPE_RMAP,
- XFS_DEFER_OPS_TYPE_FREE,
- XFS_DEFER_OPS_TYPE_AGFL_FREE,
- XFS_DEFER_OPS_TYPE_ATTR,
- XFS_DEFER_OPS_TYPE_MAX,
-};
-
-/*
* Save a log intent item and a list of extents, so that we can replay
* whatever action had to happen to the extent list and file the log done
* item.
@@ -33,19 +20,35 @@ struct xfs_defer_pending {
struct list_head dfp_work; /* work items */
struct xfs_log_item *dfp_intent; /* log intent item */
struct xfs_log_item *dfp_done; /* log done item */
+ const struct xfs_defer_op_type *dfp_ops;
unsigned int dfp_count; /* # extent items */
- enum xfs_defer_ops_type dfp_type;
+ unsigned int dfp_flags;
};
-void xfs_defer_add(struct xfs_trans *tp, enum xfs_defer_ops_type type,
- struct list_head *h);
+/*
+ * Create a log intent item for this deferred item, but don't actually finish
+ * the work. Caller must clear this before the final transaction commit.
+ */
+#define XFS_DEFER_PAUSED (1U << 0)
+
+#define XFS_DEFER_PENDING_STRINGS \
+ { XFS_DEFER_PAUSED, "paused" }
+
+void xfs_defer_item_pause(struct xfs_trans *tp, struct xfs_defer_pending *dfp);
+void xfs_defer_item_unpause(struct xfs_trans *tp, struct xfs_defer_pending *dfp);
+
+struct xfs_defer_pending *xfs_defer_add(struct xfs_trans *tp, struct list_head *h,
+ const struct xfs_defer_op_type *ops);
int xfs_defer_finish_noroll(struct xfs_trans **tp);
int xfs_defer_finish(struct xfs_trans **tp);
+int xfs_defer_finish_one(struct xfs_trans *tp, struct xfs_defer_pending *dfp);
void xfs_defer_cancel(struct xfs_trans *);
void xfs_defer_move(struct xfs_trans *dtp, struct xfs_trans *stp);
/* Description of a deferred type. */
struct xfs_defer_op_type {
+ const char *name;
+ unsigned int max_items;
struct xfs_log_item *(*create_intent)(struct xfs_trans *tp,
struct list_head *items, unsigned int count, bool sort);
void (*abort_intent)(struct xfs_log_item *intent);
@@ -56,7 +59,11 @@ struct xfs_defer_op_type {
void (*finish_cleanup)(struct xfs_trans *tp,
struct xfs_btree_cur *state, int error);
void (*cancel_item)(struct list_head *item);
- unsigned int max_items;
+ int (*recover_work)(struct xfs_defer_pending *dfp,
+ struct list_head *capture_list);
+ struct xfs_log_item *(*relog_intent)(struct xfs_trans *tp,
+ struct xfs_log_item *intent,
+ struct xfs_log_item *done_item);
};
extern const struct xfs_defer_op_type xfs_bmap_update_defer_type;
@@ -125,7 +132,25 @@ void xfs_defer_ops_capture_abort(struct xfs_mount *mp,
struct xfs_defer_capture *d);
void xfs_defer_resources_rele(struct xfs_defer_resources *dres);
+void xfs_defer_start_recovery(struct xfs_log_item *lip,
+ struct list_head *r_dfops, const struct xfs_defer_op_type *ops);
+void xfs_defer_cancel_recovery(struct xfs_mount *mp,
+ struct xfs_defer_pending *dfp);
+int xfs_defer_finish_recovery(struct xfs_mount *mp,
+ struct xfs_defer_pending *dfp, struct list_head *capture_list);
+
+static inline void
+xfs_defer_add_item(
+ struct xfs_defer_pending *dfp,
+ struct list_head *work)
+{
+ list_add_tail(work, &dfp->dfp_work);
+ dfp->dfp_count++;
+}
+
int __init xfs_defer_init_item_caches(void);
void xfs_defer_destroy_item_caches(void);
+void xfs_defer_add_barrier(struct xfs_trans *tp);
+
#endif /* __XFS_DEFER_H__ */
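
Annotation: XFS_DEFER_PENDING_STRINGS above follows the usual { mask, "name" } table convention that the tracepoints hand to __print_flags(). A plain-C decode of the same table shape looks like this; it is a sketch of the convention, not the ftrace implementation.

#include <stdio.h>

#define XFS_DEFER_PAUSED	(1U << 0)

static const struct { unsigned int mask; const char *name; } defer_strings[] = {
	{ XFS_DEFER_PAUSED, "paused" },
};

static void print_flags(unsigned int flags)
{
	const char *sep = "";
	unsigned int i;

	for (i = 0; i < sizeof(defer_strings) / sizeof(defer_strings[0]); i++) {
		if (flags & defer_strings[i].mask) {
			printf("%s%s", sep, defer_strings[i].name);
			sep = "|";
		}
	}
	printf("\n");
}

int main(void)
{
	print_flags(XFS_DEFER_PAUSED);	/* -> paused */
	return 0;
}
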
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index f5462fd582d5..a76673281514 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -196,7 +196,7 @@ xfs_dir_isempty(
return 1;
if (dp->i_disk_size > xfs_inode_data_fork_size(dp))
return 0;
- sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ sfp = dp->i_df.if_data;
return !sfp->count;
}
diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
index 00f960a703b2..3c256d4cc40b 100644
--- a/fs/xfs/libxfs/xfs_dir2_block.c
+++ b/fs/xfs/libxfs/xfs_dir2_block.c
@@ -1089,7 +1089,7 @@ xfs_dir2_sf_to_block(
int newoffset; /* offset from current entry */
unsigned int offset = geo->data_entry_offset;
xfs_dir2_sf_entry_t *sfep; /* sf entry pointer */
- xfs_dir2_sf_hdr_t *oldsfp; /* old shortform header */
+ struct xfs_dir2_sf_hdr *oldsfp = ifp->if_data;
xfs_dir2_sf_hdr_t *sfp; /* shortform header */
__be16 *tagp; /* end of data entry */
struct xfs_name name;
@@ -1099,10 +1099,8 @@ xfs_dir2_sf_to_block(
ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
ASSERT(dp->i_disk_size >= offsetof(struct xfs_dir2_sf_hdr, parent));
- oldsfp = (xfs_dir2_sf_hdr_t *)ifp->if_u1.if_data;
-
ASSERT(ifp->if_bytes == dp->i_disk_size);
- ASSERT(ifp->if_u1.if_data != NULL);
+ ASSERT(oldsfp != NULL);
ASSERT(dp->i_disk_size >= xfs_dir2_sf_hdr_size(oldsfp->i8count));
ASSERT(dp->i_df.if_nextents == 0);
diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h
index 7404a9ff1a92..1db2e60ba827 100644
--- a/fs/xfs/libxfs/xfs_dir2_priv.h
+++ b/fs/xfs/libxfs/xfs_dir2_priv.h
@@ -175,7 +175,8 @@ extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino);
extern int xfs_dir2_sf_lookup(struct xfs_da_args *args);
extern int xfs_dir2_sf_removename(struct xfs_da_args *args);
extern int xfs_dir2_sf_replace(struct xfs_da_args *args);
-extern xfs_failaddr_t xfs_dir2_sf_verify(struct xfs_inode *ip);
+xfs_failaddr_t xfs_dir2_sf_verify(struct xfs_mount *mp,
+ struct xfs_dir2_sf_hdr *sfp, int64_t size);
int xfs_dir2_sf_entsize(struct xfs_mount *mp,
struct xfs_dir2_sf_hdr *hdr, int len);
void xfs_dir2_sf_put_ino(struct xfs_mount *mp, struct xfs_dir2_sf_hdr *hdr,
diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c
index 8cd37e6e9d38..e1f83fc7b6ad 100644
--- a/fs/xfs/libxfs/xfs_dir2_sf.c
+++ b/fs/xfs/libxfs/xfs_dir2_sf.c
@@ -364,25 +364,23 @@ int /* error */
xfs_dir2_sf_addname(
xfs_da_args_t *args) /* operation arguments */
{
- xfs_inode_t *dp; /* incore directory inode */
+ struct xfs_inode *dp = args->dp;
+ struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data;
int error; /* error return value */
int incr_isize; /* total change in size */
int new_isize; /* size after adding name */
int objchange; /* changing to 8-byte inodes */
xfs_dir2_data_aoff_t offset = 0; /* offset for new entry */
int pick; /* which algorithm to use */
- xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
xfs_dir2_sf_entry_t *sfep = NULL; /* shortform entry */
trace_xfs_dir2_sf_addname(args);
ASSERT(xfs_dir2_sf_lookup(args) == -ENOENT);
- dp = args->dp;
ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL);
ASSERT(dp->i_disk_size >= offsetof(struct xfs_dir2_sf_hdr, parent));
ASSERT(dp->i_df.if_bytes == dp->i_disk_size);
- ASSERT(dp->i_df.if_u1.if_data != NULL);
- sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ ASSERT(sfp != NULL);
ASSERT(dp->i_disk_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
/*
* Compute entry (and change in) size.
@@ -462,20 +460,17 @@ xfs_dir2_sf_addname_easy(
{
struct xfs_inode *dp = args->dp;
struct xfs_mount *mp = dp->i_mount;
- int byteoff; /* byte offset in sf dir */
- xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
+ struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data;
+ int byteoff = (int)((char *)sfep - (char *)sfp);
- sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
- byteoff = (int)((char *)sfep - (char *)sfp);
/*
* Grow the in-inode space.
*/
- xfs_idata_realloc(dp, xfs_dir2_sf_entsize(mp, sfp, args->namelen),
+ sfp = xfs_idata_realloc(dp, xfs_dir2_sf_entsize(mp, sfp, args->namelen),
XFS_DATA_FORK);
/*
* Need to set up again due to realloc of the inode data.
*/
- sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
sfep = (xfs_dir2_sf_entry_t *)((char *)sfp + byteoff);
/*
* Fill in the new entry.
@@ -528,11 +523,10 @@ xfs_dir2_sf_addname_hard(
/*
* Copy the old directory to the stack buffer.
*/
- sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
old_isize = (int)dp->i_disk_size;
buf = kmem_alloc(old_isize, 0);
oldsfp = (xfs_dir2_sf_hdr_t *)buf;
- memcpy(oldsfp, sfp, old_isize);
+ memcpy(oldsfp, dp->i_df.if_data, old_isize);
/*
* Loop over the old directory finding the place we're going
* to insert the new entry.
@@ -556,11 +550,8 @@ xfs_dir2_sf_addname_hard(
* the data.
*/
xfs_idata_realloc(dp, -old_isize, XFS_DATA_FORK);
- xfs_idata_realloc(dp, new_isize, XFS_DATA_FORK);
- /*
- * Reset the pointer since the buffer was reallocated.
- */
- sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ sfp = xfs_idata_realloc(dp, new_isize, XFS_DATA_FORK);
+
/*
* Copy the first part of the directory, including the header.
*/
@@ -610,11 +601,10 @@ xfs_dir2_sf_addname_pick(
int i; /* entry number */
xfs_dir2_data_aoff_t offset; /* data block offset */
xfs_dir2_sf_entry_t *sfep; /* shortform entry */
- xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
+ struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data;
int size; /* entry's data size */
int used; /* data bytes used */
- sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
size = xfs_dir2_data_entsize(mp, args->namelen);
offset = args->geo->data_first_offset;
sfep = xfs_dir2_sf_firstentry(sfp);
@@ -673,14 +663,13 @@ xfs_dir2_sf_check(
{
struct xfs_inode *dp = args->dp;
struct xfs_mount *mp = dp->i_mount;
+ struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data;
int i; /* entry number */
int i8count; /* number of big inode#s */
xfs_ino_t ino; /* entry inode number */
int offset; /* data offset */
xfs_dir2_sf_entry_t *sfep; /* shortform dir entry */
- xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
- sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
offset = args->geo->data_first_offset;
ino = xfs_dir2_sf_get_parent_ino(sfp);
i8count = ino > XFS_DIR2_MAX_SHORT_INUM;
@@ -707,11 +696,10 @@ xfs_dir2_sf_check(
/* Verify the consistency of an inline directory. */
xfs_failaddr_t
xfs_dir2_sf_verify(
- struct xfs_inode *ip)
+ struct xfs_mount *mp,
+ struct xfs_dir2_sf_hdr *sfp,
+ int64_t size)
{
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
- struct xfs_dir2_sf_hdr *sfp;
struct xfs_dir2_sf_entry *sfep;
struct xfs_dir2_sf_entry *next_sfep;
char *endp;
@@ -719,15 +707,9 @@ xfs_dir2_sf_verify(
int i;
int i8count;
int offset;
- int64_t size;
int error;
uint8_t filetype;
- ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
-
- sfp = (struct xfs_dir2_sf_hdr *)ifp->if_u1.if_data;
- size = ifp->if_bytes;
-
/*
* Give up if the directory is way too short.
*/
@@ -834,15 +816,13 @@ xfs_dir2_sf_create(
ASSERT(dp->i_df.if_bytes == 0);
i8count = pino > XFS_DIR2_MAX_SHORT_INUM;
size = xfs_dir2_sf_hdr_size(i8count);
+
/*
- * Make a buffer for the data.
- */
- xfs_idata_realloc(dp, size, XFS_DATA_FORK);
- /*
- * Fill in the header,
+ * Make a buffer for the data and fill in the header.
*/
- sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ sfp = xfs_idata_realloc(dp, size, XFS_DATA_FORK);
sfp->i8count = i8count;
+
/*
* Now can put in the inode number, since i8count is set.
*/
@@ -864,9 +844,9 @@ xfs_dir2_sf_lookup(
{
struct xfs_inode *dp = args->dp;
struct xfs_mount *mp = dp->i_mount;
+ struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data;
int i; /* entry index */
xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
- xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
enum xfs_dacmp cmp; /* comparison result */
xfs_dir2_sf_entry_t *ci_sfep; /* case-insens. entry */
@@ -877,8 +857,7 @@ xfs_dir2_sf_lookup(
ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL);
ASSERT(dp->i_disk_size >= offsetof(struct xfs_dir2_sf_hdr, parent));
ASSERT(dp->i_df.if_bytes == dp->i_disk_size);
- ASSERT(dp->i_df.if_u1.if_data != NULL);
- sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ ASSERT(sfp != NULL);
ASSERT(dp->i_disk_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
/*
* Special case for .
@@ -940,13 +919,13 @@ xfs_dir2_sf_removename(
{
struct xfs_inode *dp = args->dp;
struct xfs_mount *mp = dp->i_mount;
+ struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data;
int byteoff; /* offset of removed entry */
int entsize; /* this entry's size */
int i; /* shortform entry index */
int newsize; /* new inode size */
int oldsize; /* old inode size */
xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
- xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
trace_xfs_dir2_sf_removename(args);
@@ -954,8 +933,7 @@ xfs_dir2_sf_removename(
oldsize = (int)dp->i_disk_size;
ASSERT(oldsize >= offsetof(struct xfs_dir2_sf_hdr, parent));
ASSERT(dp->i_df.if_bytes == oldsize);
- ASSERT(dp->i_df.if_u1.if_data != NULL);
- sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ ASSERT(sfp != NULL);
ASSERT(oldsize >= xfs_dir2_sf_hdr_size(sfp->i8count));
/*
* Loop over the old directory entries.
@@ -992,11 +970,12 @@ xfs_dir2_sf_removename(
*/
sfp->count--;
dp->i_disk_size = newsize;
+
/*
* Reallocate, making it smaller.
*/
- xfs_idata_realloc(dp, newsize - oldsize, XFS_DATA_FORK);
- sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ sfp = xfs_idata_realloc(dp, newsize - oldsize, XFS_DATA_FORK);
+
/*
* Are we changing inode number size?
*/
@@ -1019,13 +998,12 @@ xfs_dir2_sf_replace_needblock(
struct xfs_inode *dp,
xfs_ino_t inum)
{
+ struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data;
int newsize;
- struct xfs_dir2_sf_hdr *sfp;
if (dp->i_df.if_format != XFS_DINODE_FMT_LOCAL)
return false;
- sfp = (struct xfs_dir2_sf_hdr *)dp->i_df.if_u1.if_data;
newsize = dp->i_df.if_bytes + (sfp->count + 1) * XFS_INO64_DIFF;
return inum > XFS_DIR2_MAX_SHORT_INUM &&
@@ -1041,19 +1019,18 @@ xfs_dir2_sf_replace(
{
struct xfs_inode *dp = args->dp;
struct xfs_mount *mp = dp->i_mount;
+ struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data;
int i; /* entry index */
xfs_ino_t ino=0; /* entry old inode number */
int i8elevated; /* sf_toino8 set i8count=1 */
xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
- xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
trace_xfs_dir2_sf_replace(args);
ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL);
ASSERT(dp->i_disk_size >= offsetof(struct xfs_dir2_sf_hdr, parent));
ASSERT(dp->i_df.if_bytes == dp->i_disk_size);
- ASSERT(dp->i_df.if_u1.if_data != NULL);
- sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ ASSERT(sfp != NULL);
ASSERT(dp->i_disk_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
/*
@@ -1076,7 +1053,7 @@ xfs_dir2_sf_replace(
*/
xfs_dir2_sf_toino8(args);
i8elevated = 1;
- sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ sfp = dp->i_df.if_data;
} else
i8elevated = 0;
@@ -1157,11 +1134,11 @@ xfs_dir2_sf_toino4(
{
struct xfs_inode *dp = args->dp;
struct xfs_mount *mp = dp->i_mount;
+ struct xfs_dir2_sf_hdr *oldsfp = dp->i_df.if_data;
char *buf; /* old dir's buffer */
int i; /* entry index */
int newsize; /* new inode size */
xfs_dir2_sf_entry_t *oldsfep; /* old sf entry */
- xfs_dir2_sf_hdr_t *oldsfp; /* old sf directory */
int oldsize; /* old inode size */
xfs_dir2_sf_entry_t *sfep; /* new sf entry */
xfs_dir2_sf_hdr_t *sfp; /* new sf directory */
@@ -1175,7 +1152,6 @@ xfs_dir2_sf_toino4(
*/
oldsize = dp->i_df.if_bytes;
buf = kmem_alloc(oldsize, 0);
- oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
ASSERT(oldsfp->i8count == 1);
memcpy(buf, oldsfp, oldsize);
/*
@@ -1188,7 +1164,7 @@ xfs_dir2_sf_toino4(
* Reset our pointers, the data has moved.
*/
oldsfp = (xfs_dir2_sf_hdr_t *)buf;
- sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ sfp = dp->i_df.if_data;
/*
* Fill in the new header.
*/
@@ -1230,11 +1206,11 @@ xfs_dir2_sf_toino8(
{
struct xfs_inode *dp = args->dp;
struct xfs_mount *mp = dp->i_mount;
+ struct xfs_dir2_sf_hdr *oldsfp = dp->i_df.if_data;
char *buf; /* old dir's buffer */
int i; /* entry index */
int newsize; /* new inode size */
xfs_dir2_sf_entry_t *oldsfep; /* old sf entry */
- xfs_dir2_sf_hdr_t *oldsfp; /* old sf directory */
int oldsize; /* old inode size */
xfs_dir2_sf_entry_t *sfep; /* new sf entry */
xfs_dir2_sf_hdr_t *sfp; /* new sf directory */
@@ -1248,7 +1224,6 @@ xfs_dir2_sf_toino8(
*/
oldsize = dp->i_df.if_bytes;
buf = kmem_alloc(oldsize, 0);
- oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
ASSERT(oldsfp->i8count == 0);
memcpy(buf, oldsfp, oldsize);
/*
@@ -1261,7 +1236,7 @@ xfs_dir2_sf_toino8(
* Reset our pointers, the data has moved.
*/
oldsfp = (xfs_dir2_sf_hdr_t *)buf;
- sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ sfp = dp->i_df.if_data;
/*
* Fill in the new header.
*/
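
Annotation: the theme running through this whole file is that xfs_idata_realloc() now hands back the new if_data pointer: krealloc() may move the allocation, so every cached sfp must be refreshed or it dangles. The userspace illustration below uses plain realloc(); the kernel call cannot fail thanks to __GFP_NOFAIL, so the missing NULL checks here are a deliberate simplification.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* realloc() may move the block, so callers must adopt the returned pointer. */
static void *idata_realloc(void **datap, size_t new_size)
{
	*datap = realloc(*datap, new_size);	/* unchecked: kernel uses __GFP_NOFAIL */
	return *datap;
}

int main(void)
{
	void *fork_data = NULL;
	char *sfp;

	sfp = idata_realloc(&fork_data, 64);
	strcpy(sfp, "header");

	/* after growing, the old sfp may dangle; always take the new pointer */
	sfp = idata_realloc(&fork_data, 4096);
	printf("%s\n", sfp);

	free(fork_data);
	return 0;
}
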
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 9a88aba1589f..382ab1e71c0b 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -1008,7 +1008,7 @@ enum xfs_dinode_fmt {
* Return pointers to the data or attribute forks.
*/
#define XFS_DFORK_DPTR(dip) \
- ((char *)dip + xfs_dinode_size(dip->di_version))
+ ((void *)dip + xfs_dinode_size(dip->di_version))
#define XFS_DFORK_APTR(dip) \
(XFS_DFORK_DPTR(dip) + XFS_DFORK_BOFF(dip))
#define XFS_DFORK_PTR(dip,w) \
@@ -1156,20 +1156,6 @@ static inline bool xfs_dinode_has_large_extent_counts(
#define XFS_DFL_RTEXTSIZE (64 * 1024) /* 64kB */
#define XFS_MIN_RTEXTSIZE (4 * 1024) /* 4kB */
-#define XFS_BLOCKSIZE(mp) ((mp)->m_sb.sb_blocksize)
-#define XFS_BLOCKMASK(mp) ((mp)->m_blockmask)
-
-/*
- * RT bit manipulation macros.
- */
-#define XFS_RTMIN(a,b) ((a) < (b) ? (a) : (b))
-#define XFS_RTMAX(a,b) ((a) > (b) ? (a) : (b))
-
-#define XFS_RTLOBIT(w) xfs_lowbit32(w)
-#define XFS_RTHIBIT(w) xfs_highbit32(w)
-
-#define XFS_RTBLOCKLOG(b) xfs_highbit64(b)
-
/*
* Dquot and dquot block format definitions
*/
@@ -1272,6 +1258,9 @@ static inline time64_t xfs_dq_bigtime_to_unix(uint32_t ondisk_seconds)
#define XFS_DQ_GRACE_MIN ((int64_t)0)
#define XFS_DQ_GRACE_MAX ((int64_t)U32_MAX)
+/* Maximum id value for a quota record */
+#define XFS_DQ_ID_MAX (U32_MAX)
+
/*
* This is the main portion of the on-disk representation of quota information
* for a user. We pad this with some more expansion room to construct the on
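
Annotation: the XFS_DFORK_DPTR switch from char * to void * is what lets callers such as the new xfs_dfork_attr_shortform_size() assign the result straight to a typed pointer without a cast; the arithmetic itself relies on GCC's void-pointer extension (sizeof(void) == 1), which kernel builds always enable. A small demonstration, where the 176-byte header size is a stand-in for xfs_dinode_size() rather than the real on-disk layout:

#include <stdio.h>

#define DINODE_SIZE	176	/* stand-in for xfs_dinode_size(dip->di_version) */

struct attr_sf_hdr { unsigned short totsize; };

int main(void)
{
	unsigned char raw[512] = { 0 };
	void *dip = raw;

	/* GNU C: void * arithmetic is byte-wise, and the void * result
	 * converts to any object pointer without a cast. */
	struct attr_sf_hdr *sf = dip + DINODE_SIZE;

	printf("the fork area starts %td bytes into the inode\n",
	       (unsigned char *)sf - raw);
	return 0;
}
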
diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h
index 99e796256c5d..6296993ff8f3 100644
--- a/fs/xfs/libxfs/xfs_health.h
+++ b/fs/xfs/libxfs/xfs_health.h
@@ -68,6 +68,11 @@ struct xfs_fsop_geom;
#define XFS_SICK_INO_SYMLINK (1 << 6) /* symbolic link remote target */
#define XFS_SICK_INO_PARENT (1 << 7) /* parent pointers */
+#define XFS_SICK_INO_BMBTD_ZAPPED (1 << 8) /* data fork erased */
+#define XFS_SICK_INO_BMBTA_ZAPPED (1 << 9) /* attr fork erased */
+#define XFS_SICK_INO_DIR_ZAPPED (1 << 10) /* directory erased */
+#define XFS_SICK_INO_SYMLINK_ZAPPED (1 << 11) /* symlink erased */
+
/* Primary evidence of health problems in a given group. */
#define XFS_SICK_FS_PRIMARY (XFS_SICK_FS_COUNTERS | \
XFS_SICK_FS_UQUOTA | \
@@ -97,6 +102,11 @@ struct xfs_fsop_geom;
XFS_SICK_INO_SYMLINK | \
XFS_SICK_INO_PARENT)
+#define XFS_SICK_INO_ZAPPED (XFS_SICK_INO_BMBTD_ZAPPED | \
+ XFS_SICK_INO_BMBTA_ZAPPED | \
+ XFS_SICK_INO_DIR_ZAPPED | \
+ XFS_SICK_INO_SYMLINK_ZAPPED)
+
/* These functions must be provided by the xfs implementation. */
void xfs_fs_mark_sick(struct xfs_mount *mp, unsigned int mask);
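
Annotation: the new *_ZAPPED bits record that online repair deliberately erased a structure, as distinct from it being corrupt, and XFS_SICK_INO_ZAPPED simply ORs them so callers can test for any zapped metadata with one mask. A trivial sketch of that pattern, with illustrative constants:

#include <stdio.h>

#define SICK_BMBTD_ZAPPED	(1 << 8)
#define SICK_BMBTA_ZAPPED	(1 << 9)
#define SICK_DIR_ZAPPED		(1 << 10)
#define SICK_SYMLINK_ZAPPED	(1 << 11)

#define SICK_INO_ZAPPED		(SICK_BMBTD_ZAPPED | SICK_BMBTA_ZAPPED | \
				 SICK_DIR_ZAPPED | SICK_SYMLINK_ZAPPED)

int main(void)
{
	unsigned int sick = SICK_DIR_ZAPPED;

	if (sick & SICK_INO_ZAPPED)
		printf("some inode metadata was zapped; schedule a rebuild\n");
	return 0;
}
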
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index b83e54c70906..2361a22035b0 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -95,18 +95,28 @@ xfs_inobt_btrec_to_irec(
irec->ir_free = be64_to_cpu(rec->inobt.ir_free);
}
+/* Compute the freecount of an incore inode record. */
+uint8_t
+xfs_inobt_rec_freecount(
+ const struct xfs_inobt_rec_incore *irec)
+{
+ uint64_t realfree = irec->ir_free;
+
+ if (xfs_inobt_issparse(irec->ir_holemask))
+ realfree &= xfs_inobt_irec_to_allocmask(irec);
+ return hweight64(realfree);
+}
+
/* Simple checks for inode records. */
xfs_failaddr_t
xfs_inobt_check_irec(
- struct xfs_btree_cur *cur,
+ struct xfs_perag *pag,
const struct xfs_inobt_rec_incore *irec)
{
- uint64_t realfree;
-
/* Record has to be properly aligned within the AG. */
- if (!xfs_verify_agino(cur->bc_ag.pag, irec->ir_startino))
+ if (!xfs_verify_agino(pag, irec->ir_startino))
return __this_address;
- if (!xfs_verify_agino(cur->bc_ag.pag,
+ if (!xfs_verify_agino(pag,
irec->ir_startino + XFS_INODES_PER_CHUNK - 1))
return __this_address;
if (irec->ir_count < XFS_INODES_PER_HOLEMASK_BIT ||
@@ -115,12 +125,7 @@ xfs_inobt_check_irec(
if (irec->ir_freecount > XFS_INODES_PER_CHUNK)
return __this_address;
- /* if there are no holes, return the first available offset */
- if (!xfs_inobt_issparse(irec->ir_holemask))
- realfree = irec->ir_free;
- else
- realfree = irec->ir_free & xfs_inobt_irec_to_allocmask(irec);
- if (hweight64(realfree) != irec->ir_freecount)
+ if (xfs_inobt_rec_freecount(irec) != irec->ir_freecount)
return __this_address;
return NULL;
@@ -164,7 +169,7 @@ xfs_inobt_get_rec(
return error;
xfs_inobt_btrec_to_irec(mp, rec, irec);
- fa = xfs_inobt_check_irec(cur, irec);
+ fa = xfs_inobt_check_irec(cur->bc_ag.pag, irec);
if (fa)
return xfs_inobt_complain_bad_rec(cur, fa, irec);
@@ -1854,7 +1859,7 @@ xfs_difree_inode_chunk(
return xfs_free_extent_later(tp,
XFS_AGB_TO_FSB(mp, agno, sagbno),
M_IGEO(mp)->ialloc_blks, &XFS_RMAP_OINFO_INODES,
- XFS_AG_RESV_NONE);
+ XFS_AG_RESV_NONE, false);
}
/* holemask is only 16-bits (fits in an unsigned long) */
@@ -1900,7 +1905,8 @@ xfs_difree_inode_chunk(
ASSERT(contigblk % mp->m_sb.sb_spino_align == 0);
error = xfs_free_extent_later(tp,
XFS_AGB_TO_FSB(mp, agno, agbno), contigblk,
- &XFS_RMAP_OINFO_INODES, XFS_AG_RESV_NONE);
+ &XFS_RMAP_OINFO_INODES, XFS_AG_RESV_NONE,
+ false);
if (error)
return error;
@@ -2739,7 +2745,7 @@ xfs_ialloc_count_inodes_rec(
xfs_failaddr_t fa;
xfs_inobt_btrec_to_irec(cur->bc_mp, rec, &irec);
- fa = xfs_inobt_check_irec(cur, &irec);
+ fa = xfs_inobt_check_irec(cur->bc_ag.pag, &irec);
if (fa)
return xfs_inobt_complain_bad_rec(cur, fa, &irec);
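
Annotation: the new xfs_inobt_rec_freecount() helper centralizes the sparse-chunk masking that the irec check used to open-code: free bits that fall inside holes must not count. Below is a standalone model; irec_to_allocmask() is an assumed simplification of xfs_inobt_irec_to_allocmask() (each of the 16 holemask bits covers 4 inodes, and a set bit marks a hole), so treat the numbers as illustrative.

#include <stdint.h>
#include <stdio.h>

struct irec { uint16_t holemask; uint64_t free; };

/* Assumed simplification: each holemask bit covers 4 inodes, set == hole. */
static uint64_t irec_to_allocmask(const struct irec *ir)
{
	uint64_t mask = 0;
	int i;

	for (i = 0; i < 16; i++)
		if (!(ir->holemask & (1U << i)))
			mask |= 0xFULL << (i * 4);
	return mask;
}

static int rec_freecount(const struct irec *ir)
{
	uint64_t realfree = ir->free;

	if (ir->holemask)	/* sparse: free bits inside holes don't count */
		realfree &= irec_to_allocmask(ir);
	return __builtin_popcountll(realfree);	/* hweight64() */
}

int main(void)
{
	/* first 4 inodes are a hole, so only 60 of the 64 free bits count */
	struct irec ir = { .holemask = 0x0001, .free = ~0ULL };

	printf("freecount = %d\n", rec_freecount(&ir));
	return 0;
}
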
diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
index fe824bb04a09..f1412183bb44 100644
--- a/fs/xfs/libxfs/xfs_ialloc.h
+++ b/fs/xfs/libxfs/xfs_ialloc.h
@@ -79,6 +79,7 @@ int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino,
*/
int xfs_inobt_get_rec(struct xfs_btree_cur *cur,
xfs_inobt_rec_incore_t *rec, int *stat);
+uint8_t xfs_inobt_rec_freecount(const struct xfs_inobt_rec_incore *irec);
/*
* Inode chunk initialisation routine
@@ -93,7 +94,7 @@ union xfs_btree_rec;
void xfs_inobt_btrec_to_irec(struct xfs_mount *mp,
const union xfs_btree_rec *rec,
struct xfs_inobt_rec_incore *irec);
-xfs_failaddr_t xfs_inobt_check_irec(struct xfs_btree_cur *cur,
+xfs_failaddr_t xfs_inobt_check_irec(struct xfs_perag *pag,
const struct xfs_inobt_rec_incore *irec);
int xfs_ialloc_has_inodes_at_extent(struct xfs_btree_cur *cur,
xfs_agblock_t bno, xfs_extlen_t len,
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 9258f01c0015..42a5e1f227a0 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -161,7 +161,7 @@ __xfs_inobt_free_block(
xfs_inobt_mod_blockcount(cur, -1);
fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp));
return xfs_free_extent_later(cur->bc_tp, fsbno, 1,
- &XFS_RMAP_OINFO_INOBT, resv);
+ &XFS_RMAP_OINFO_INOBT, resv, false);
}
STATIC int
diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c
index 773cf4349428..f4e6b200cdf8 100644
--- a/fs/xfs/libxfs/xfs_iext_tree.c
+++ b/fs/xfs/libxfs/xfs_iext_tree.c
@@ -158,7 +158,7 @@ static void *
xfs_iext_find_first_leaf(
struct xfs_ifork *ifp)
{
- struct xfs_iext_node *node = ifp->if_u1.if_root;
+ struct xfs_iext_node *node = ifp->if_data;
int height;
if (!ifp->if_height)
@@ -176,7 +176,7 @@ static void *
xfs_iext_find_last_leaf(
struct xfs_ifork *ifp)
{
- struct xfs_iext_node *node = ifp->if_u1.if_root;
+ struct xfs_iext_node *node = ifp->if_data;
int height, i;
if (!ifp->if_height)
@@ -306,7 +306,7 @@ xfs_iext_find_level(
xfs_fileoff_t offset,
int level)
{
- struct xfs_iext_node *node = ifp->if_u1.if_root;
+ struct xfs_iext_node *node = ifp->if_data;
int height, i;
if (!ifp->if_height)
@@ -402,12 +402,12 @@ xfs_iext_grow(
int i;
if (ifp->if_height == 1) {
- struct xfs_iext_leaf *prev = ifp->if_u1.if_root;
+ struct xfs_iext_leaf *prev = ifp->if_data;
node->keys[0] = xfs_iext_leaf_key(prev, 0);
node->ptrs[0] = prev;
} else {
- struct xfs_iext_node *prev = ifp->if_u1.if_root;
+ struct xfs_iext_node *prev = ifp->if_data;
ASSERT(ifp->if_height > 1);
@@ -418,7 +418,7 @@ xfs_iext_grow(
for (i = 1; i < KEYS_PER_NODE; i++)
node->keys[i] = XFS_IEXT_KEY_INVALID;
- ifp->if_u1.if_root = node;
+ ifp->if_data = node;
ifp->if_height++;
}
@@ -430,7 +430,7 @@ xfs_iext_update_node(
int level,
void *ptr)
{
- struct xfs_iext_node *node = ifp->if_u1.if_root;
+ struct xfs_iext_node *node = ifp->if_data;
int height, i;
for (height = ifp->if_height; height > level; height--) {
@@ -583,11 +583,11 @@ xfs_iext_alloc_root(
{
ASSERT(ifp->if_bytes == 0);
- ifp->if_u1.if_root = kmem_zalloc(sizeof(struct xfs_iext_rec), KM_NOFS);
+ ifp->if_data = kmem_zalloc(sizeof(struct xfs_iext_rec), KM_NOFS);
ifp->if_height = 1;
/* now that we have a node step into it */
- cur->leaf = ifp->if_u1.if_root;
+ cur->leaf = ifp->if_data;
cur->pos = 0;
}
@@ -603,9 +603,9 @@ xfs_iext_realloc_root(
if (new_size / sizeof(struct xfs_iext_rec) == RECS_PER_LEAF)
new_size = NODE_SIZE;
- new = krealloc(ifp->if_u1.if_root, new_size, GFP_NOFS | __GFP_NOFAIL);
+ new = krealloc(ifp->if_data, new_size, GFP_NOFS | __GFP_NOFAIL);
memset(new + ifp->if_bytes, 0, new_size - ifp->if_bytes);
- ifp->if_u1.if_root = new;
+ ifp->if_data = new;
cur->leaf = new;
}
@@ -622,13 +622,11 @@ static inline void xfs_iext_inc_seq(struct xfs_ifork *ifp)
}
void
-xfs_iext_insert(
- struct xfs_inode *ip,
+xfs_iext_insert_raw(
+ struct xfs_ifork *ifp,
struct xfs_iext_cursor *cur,
- struct xfs_bmbt_irec *irec,
- int state)
+ struct xfs_bmbt_irec *irec)
{
- struct xfs_ifork *ifp = xfs_iext_state_to_fork(ip, state);
xfs_fileoff_t offset = irec->br_startoff;
struct xfs_iext_leaf *new = NULL;
int nr_entries, i;
@@ -662,12 +660,23 @@ xfs_iext_insert(
xfs_iext_set(cur_rec(cur), irec);
ifp->if_bytes += sizeof(struct xfs_iext_rec);
- trace_xfs_iext_insert(ip, cur, state, _RET_IP_);
-
if (new)
xfs_iext_insert_node(ifp, xfs_iext_leaf_key(new, 0), new, 2);
}
+void
+xfs_iext_insert(
+ struct xfs_inode *ip,
+ struct xfs_iext_cursor *cur,
+ struct xfs_bmbt_irec *irec,
+ int state)
+{
+ struct xfs_ifork *ifp = xfs_iext_state_to_fork(ip, state);
+
+ xfs_iext_insert_raw(ifp, cur, irec);
+ trace_xfs_iext_insert(ip, cur, state, _RET_IP_);
+}
+
static struct xfs_iext_node *
xfs_iext_rebalance_node(
struct xfs_iext_node *parent,
@@ -777,8 +786,8 @@ again:
* If we are at the root and only one entry is left we can just
* free this node and update the root pointer.
*/
- ASSERT(node == ifp->if_u1.if_root);
- ifp->if_u1.if_root = node->ptrs[0];
+ ASSERT(node == ifp->if_data);
+ ifp->if_data = node->ptrs[0];
ifp->if_height--;
kmem_free(node);
}
@@ -854,8 +863,8 @@ xfs_iext_free_last_leaf(
struct xfs_ifork *ifp)
{
ifp->if_height--;
- kmem_free(ifp->if_u1.if_root);
- ifp->if_u1.if_root = NULL;
+ kmem_free(ifp->if_data);
+ ifp->if_data = NULL;
}
void
@@ -872,7 +881,7 @@ xfs_iext_remove(
trace_xfs_iext_remove(ip, cur, state, _RET_IP_);
ASSERT(ifp->if_height > 0);
- ASSERT(ifp->if_u1.if_root != NULL);
+ ASSERT(ifp->if_data != NULL);
ASSERT(xfs_iext_valid(ifp, cur));
xfs_iext_inc_seq(ifp);
@@ -1042,9 +1051,9 @@ void
xfs_iext_destroy(
struct xfs_ifork *ifp)
{
- xfs_iext_destroy_node(ifp->if_u1.if_root, ifp->if_height);
+ xfs_iext_destroy_node(ifp->if_data, ifp->if_height);
ifp->if_bytes = 0;
ifp->if_height = 0;
- ifp->if_u1.if_root = NULL;
+ ifp->if_data = NULL;
}
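
Annotation: aside from the if_u1 rename, the structure of the incore extent tree is unchanged: xfs_iext_grow() still promotes the old root to child 0 of a freshly allocated node and bumps if_height. A compressed userspace model of just that promotion step (the kernel version also copies the first key up and poisons the remaining keys):

#include <stdio.h>
#include <stdlib.h>

#define KEYS_PER_NODE	4

struct node { long keys[KEYS_PER_NODE]; void *ptrs[KEYS_PER_NODE]; };

struct ifork { int height; void *data; };	/* data: leaf or node root */

/* Root promotion: the old root becomes child 0 of a brand-new root. */
static void iext_grow(struct ifork *ifp)
{
	struct node *node = calloc(1, sizeof(*node));	/* unchecked for brevity */

	node->ptrs[0] = ifp->data;
	ifp->data = node;
	ifp->height++;
}

int main(void)
{
	static long leaf[4];	/* pretend this is a full leaf */
	struct ifork f = { .height = 1, .data = leaf };

	iext_grow(&f);
	printf("height=%d, old root is now child 0: %d\n", f.height,
	       ((struct node *)f.data)->ptrs[0] == (void *)leaf);
	free(f.data);
	return 0;
}
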
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 5a2e7ddfa76d..f4569e18a8d0 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -50,12 +50,15 @@ xfs_init_local_fork(
mem_size++;
if (size) {
- ifp->if_u1.if_data = kmem_alloc(mem_size, KM_NOFS);
- memcpy(ifp->if_u1.if_data, data, size);
+ char *new_data = kmem_alloc(mem_size, KM_NOFS);
+
+ memcpy(new_data, data, size);
if (zero_terminate)
- ifp->if_u1.if_data[size] = '\0';
+ new_data[size] = '\0';
+
+ ifp->if_data = new_data;
} else {
- ifp->if_u1.if_data = NULL;
+ ifp->if_data = NULL;
}
ifp->if_bytes = size;
@@ -125,7 +128,7 @@ xfs_iformat_extents(
}
ifp->if_bytes = 0;
- ifp->if_u1.if_root = NULL;
+ ifp->if_data = NULL;
ifp->if_height = 0;
if (size) {
dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork);
@@ -212,7 +215,7 @@ xfs_iformat_btree(
ifp->if_broot, size);
ifp->if_bytes = 0;
- ifp->if_u1.if_root = NULL;
+ ifp->if_data = NULL;
ifp->if_height = 0;
return 0;
}
@@ -276,10 +279,9 @@ static uint16_t
xfs_dfork_attr_shortform_size(
struct xfs_dinode *dip)
{
- struct xfs_attr_shortform *atp =
- (struct xfs_attr_shortform *)XFS_DFORK_APTR(dip);
+ struct xfs_attr_sf_hdr *sf = XFS_DFORK_APTR(dip);
- return be16_to_cpu(atp->hdr.totsize);
+ return be16_to_cpu(sf->totsize);
}
void
@@ -493,7 +495,7 @@ xfs_iroot_realloc(
* byte_diff -- the change in the number of bytes, positive or negative,
* requested for the if_data array.
*/
-void
+void *
xfs_idata_realloc(
struct xfs_inode *ip,
int64_t byte_diff,
@@ -505,21 +507,18 @@ xfs_idata_realloc(
ASSERT(new_size >= 0);
ASSERT(new_size <= xfs_inode_fork_size(ip, whichfork));
- if (byte_diff == 0)
- return;
-
- if (new_size == 0) {
- kmem_free(ifp->if_u1.if_data);
- ifp->if_u1.if_data = NULL;
- ifp->if_bytes = 0;
- return;
+ if (byte_diff) {
+ ifp->if_data = krealloc(ifp->if_data, new_size,
+ GFP_NOFS | __GFP_NOFAIL);
+ if (new_size == 0)
+ ifp->if_data = NULL;
+ ifp->if_bytes = new_size;
}
- ifp->if_u1.if_data = krealloc(ifp->if_u1.if_data, new_size,
- GFP_NOFS | __GFP_NOFAIL);
- ifp->if_bytes = new_size;
+ return ifp->if_data;
}
+/* Free all memory and reset a fork back to its initial state. */
void
xfs_idestroy_fork(
struct xfs_ifork *ifp)
@@ -531,8 +530,8 @@ xfs_idestroy_fork(
switch (ifp->if_format) {
case XFS_DINODE_FMT_LOCAL:
- kmem_free(ifp->if_u1.if_data);
- ifp->if_u1.if_data = NULL;
+ kmem_free(ifp->if_data);
+ ifp->if_data = NULL;
break;
case XFS_DINODE_FMT_EXTENTS:
case XFS_DINODE_FMT_BTREE:
@@ -625,9 +624,9 @@ xfs_iflush_fork(
case XFS_DINODE_FMT_LOCAL:
if ((iip->ili_fields & dataflag[whichfork]) &&
(ifp->if_bytes > 0)) {
- ASSERT(ifp->if_u1.if_data != NULL);
+ ASSERT(ifp->if_data != NULL);
ASSERT(ifp->if_bytes <= xfs_inode_fork_size(ip, whichfork));
- memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes);
+ memcpy(cp, ifp->if_data, ifp->if_bytes);
}
break;
@@ -702,19 +701,27 @@ xfs_ifork_verify_local_data(
xfs_failaddr_t fa = NULL;
switch (VFS_I(ip)->i_mode & S_IFMT) {
- case S_IFDIR:
- fa = xfs_dir2_sf_verify(ip);
+ case S_IFDIR: {
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+ struct xfs_dir2_sf_hdr *sfp = ifp->if_data;
+
+ fa = xfs_dir2_sf_verify(mp, sfp, ifp->if_bytes);
break;
- case S_IFLNK:
- fa = xfs_symlink_shortform_verify(ip);
+ }
+ case S_IFLNK: {
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+
+ fa = xfs_symlink_shortform_verify(ifp->if_data, ifp->if_bytes);
break;
+ }
default:
break;
}
if (fa) {
xfs_inode_verifier_error(ip, -EFSCORRUPTED, "data fork",
- ip->i_df.if_u1.if_data, ip->i_df.if_bytes, fa);
+ ip->i_df.if_data, ip->i_df.if_bytes, fa);
return -EFSCORRUPTED;
}
@@ -729,14 +736,17 @@ xfs_ifork_verify_local_attr(
struct xfs_ifork *ifp = &ip->i_af;
xfs_failaddr_t fa;
- if (!xfs_inode_has_attr_fork(ip))
+ if (!xfs_inode_has_attr_fork(ip)) {
fa = __this_address;
- else
- fa = xfs_attr_shortform_verify(ip);
+ } else {
+ ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
+ fa = xfs_attr_shortform_verify(ifp->if_data, ifp->if_bytes);
+ }
if (fa) {
xfs_inode_verifier_error(ip, -EFSCORRUPTED, "attr fork",
- ifp->if_u1.if_data, ifp->if_bytes, fa);
+ ifp->if_data, ifp->if_bytes, fa);
return -EFSCORRUPTED;
}
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
index 96d307784c85..96303249d28a 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -13,14 +13,12 @@ struct xfs_dinode;
* File incore extent information, present for each of data & attr forks.
*/
struct xfs_ifork {
- int64_t if_bytes; /* bytes in if_u1 */
+ int64_t if_bytes; /* bytes in if_data */
struct xfs_btree_block *if_broot; /* file's incore btree root */
unsigned int if_seq; /* fork mod counter */
int if_height; /* height of the extent tree */
- union {
- void *if_root; /* extent tree root */
- char *if_data; /* inline file data */
- } if_u1;
+ void *if_data; /* extent tree root or
+ inline data */
xfs_extnum_t if_nextents; /* # of extents in this fork */
short if_broot_bytes; /* bytes allocated for root */
int8_t if_format; /* format of this fork */
@@ -170,7 +168,7 @@ int xfs_iformat_attr_fork(struct xfs_inode *, struct xfs_dinode *);
void xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *,
struct xfs_inode_log_item *, int);
void xfs_idestroy_fork(struct xfs_ifork *ifp);
-void xfs_idata_realloc(struct xfs_inode *ip, int64_t byte_diff,
+void *xfs_idata_realloc(struct xfs_inode *ip, int64_t byte_diff,
int whichfork);
void xfs_iroot_realloc(struct xfs_inode *, int, int);
int xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int);
@@ -180,6 +178,9 @@ void xfs_init_local_fork(struct xfs_inode *ip, int whichfork,
const void *data, int64_t size);
xfs_extnum_t xfs_iext_count(struct xfs_ifork *ifp);
+void xfs_iext_insert_raw(struct xfs_ifork *ifp,
+ struct xfs_iext_cursor *cur,
+ struct xfs_bmbt_irec *irec);
void xfs_iext_insert(struct xfs_inode *, struct xfs_iext_cursor *cur,
struct xfs_bmbt_irec *, int);
void xfs_iext_remove(struct xfs_inode *, struct xfs_iext_cursor *,
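
Annotation: collapsing the if_u1 union into a single void *if_data works because a fork never holds inline data and an extent tree at the same time; which one if_data points at is implied by if_format and if_height, like a tagged pointer. A sketch of that discipline, with illustrative enum values rather than the real format codes:

#include <stdio.h>

enum fork_fmt { FMT_LOCAL, FMT_EXTENTS };	/* illustrative, not on-disk codes */

struct ifork {
	enum fork_fmt format;
	int height;	/* extent tree height; meaningless for FMT_LOCAL */
	void *data;	/* inline payload or extent tree root, never both */
};

int main(void)
{
	char symlink_target[] = "some/where";
	struct ifork f = { .format = FMT_LOCAL, .height = 0,
			   .data = symlink_target };

	if (f.format == FMT_LOCAL)
		printf("local fork payload: %s\n", (char *)f.data);
	return 0;
}
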
diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h
index a5100a11faf9..9fe7a9564bca 100644
--- a/fs/xfs/libxfs/xfs_log_recover.h
+++ b/fs/xfs/libxfs/xfs_log_recover.h
@@ -11,6 +11,7 @@
* define how recovery should work for that type of log item.
*/
struct xlog_recover_item;
+struct xfs_defer_op_type;
/* Sorting hat for log items as they're read in. */
enum xlog_recover_reorder {
@@ -153,4 +154,11 @@ xlog_recover_resv(const struct xfs_trans_res *r)
return ret;
}
+struct xfs_defer_pending;
+
+void xlog_recover_intent_item(struct xlog *log, struct xfs_log_item *lip,
+ xfs_lsn_t lsn, const struct xfs_defer_op_type *ops);
+int xlog_recover_finish_intent(struct xfs_trans *tp,
+ struct xfs_defer_pending *dfp);
+
#endif /* __XFS_LOG_RECOVER_H__ */
diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/libxfs/xfs_ondisk.h
index 21a7e350b4c5..81885a6a028e 100644
--- a/fs/xfs/xfs_ondisk.h
+++ b/fs/xfs/libxfs/xfs_ondisk.h
@@ -7,16 +7,16 @@
#define __XFS_ONDISK_H
#define XFS_CHECK_STRUCT_SIZE(structname, size) \
- BUILD_BUG_ON_MSG(sizeof(structname) != (size), "XFS: sizeof(" \
- #structname ") is wrong, expected " #size)
+ static_assert(sizeof(structname) == (size), \
+ "XFS: sizeof(" #structname ") is wrong, expected " #size)
#define XFS_CHECK_OFFSET(structname, member, off) \
- BUILD_BUG_ON_MSG(offsetof(structname, member) != (off), \
+ static_assert(offsetof(structname, member) == (off), \
"XFS: offsetof(" #structname ", " #member ") is wrong, " \
"expected " #off)
#define XFS_CHECK_VALUE(value, expected) \
- BUILD_BUG_ON_MSG((value) != (expected), \
+ static_assert((value) == (expected), \
"XFS: value of " #value " is wrong, expected " #expected)
static inline void __init
@@ -93,13 +93,13 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, namelen, 8);
XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, name, 9);
XFS_CHECK_STRUCT_SIZE(xfs_attr_leafblock_t, 32);
- XFS_CHECK_STRUCT_SIZE(struct xfs_attr_shortform, 4);
- XFS_CHECK_OFFSET(struct xfs_attr_shortform, hdr.totsize, 0);
- XFS_CHECK_OFFSET(struct xfs_attr_shortform, hdr.count, 2);
- XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].namelen, 4);
- XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].valuelen, 5);
- XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].flags, 6);
- XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].nameval, 7);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_attr_sf_hdr, 4);
+ XFS_CHECK_OFFSET(struct xfs_attr_sf_hdr, totsize, 0);
+ XFS_CHECK_OFFSET(struct xfs_attr_sf_hdr, count, 2);
+ XFS_CHECK_OFFSET(struct xfs_attr_sf_entry, namelen, 0);
+ XFS_CHECK_OFFSET(struct xfs_attr_sf_entry, valuelen, 1);
+ XFS_CHECK_OFFSET(struct xfs_attr_sf_entry, flags, 2);
+ XFS_CHECK_OFFSET(struct xfs_attr_sf_entry, nameval, 3);
XFS_CHECK_STRUCT_SIZE(xfs_da_blkinfo_t, 12);
XFS_CHECK_STRUCT_SIZE(xfs_da_intnode_t, 16);
XFS_CHECK_STRUCT_SIZE(xfs_da_node_entry_t, 8);
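
Annotation: the move from BUILD_BUG_ON_MSG to static_assert is possible because C11 _Static_assert is a declaration and therefore legal at file scope, whereas BUILD_BUG_ON_MSG expands to code and must sit inside a function such as xfs_check_ondisk_structs(). A minimal standalone equivalent of the attr-sf checks above:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

struct attr_sf_hdr { uint16_t totsize; uint8_t count; uint8_t padding; };

/* These are declarations, so they are legal here at file scope. */
static_assert(sizeof(struct attr_sf_hdr) == 4, "attr sf header must be 4 bytes");
static_assert(offsetof(struct attr_sf_hdr, count) == 2, "count must sit at byte 2");

int main(void)
{
	return 0;
}
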
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index 646b3fa362ad..6709a7f8bad5 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -123,11 +123,9 @@ xfs_refcount_btrec_to_irec(
/* Simple checks for refcount records. */
xfs_failaddr_t
xfs_refcount_check_irec(
- struct xfs_btree_cur *cur,
+ struct xfs_perag *pag,
const struct xfs_refcount_irec *irec)
{
- struct xfs_perag *pag = cur->bc_ag.pag;
-
if (irec->rc_blockcount == 0 || irec->rc_blockcount > MAXREFCEXTLEN)
return __this_address;
@@ -179,7 +177,7 @@ xfs_refcount_get_rec(
return error;
xfs_refcount_btrec_to_irec(rec, irec);
- fa = xfs_refcount_check_irec(cur, irec);
+ fa = xfs_refcount_check_irec(cur->bc_ag.pag, irec);
if (fa)
return xfs_refcount_complain_bad_rec(cur, fa, irec);
@@ -1153,7 +1151,7 @@ xfs_refcount_adjust_extents(
tmp.rc_startblock);
error = xfs_free_extent_later(cur->bc_tp, fsbno,
tmp.rc_blockcount, NULL,
- XFS_AG_RESV_NONE);
+ XFS_AG_RESV_NONE, false);
if (error)
goto out_error;
}
@@ -1215,7 +1213,7 @@ xfs_refcount_adjust_extents(
ext.rc_startblock);
error = xfs_free_extent_later(cur->bc_tp, fsbno,
ext.rc_blockcount, NULL,
- XFS_AG_RESV_NONE);
+ XFS_AG_RESV_NONE, false);
if (error)
goto out_error;
}
@@ -1458,7 +1456,7 @@ __xfs_refcount_add(
ri->ri_blockcount = blockcount;
xfs_refcount_update_get_group(tp->t_mountp, ri);
- xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_REFCOUNT, &ri->ri_list);
+ xfs_defer_add(tp, &ri->ri_list, &xfs_refcount_update_defer_type);
}
/*
@@ -1899,7 +1897,7 @@ xfs_refcount_recover_extent(
INIT_LIST_HEAD(&rr->rr_list);
xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec);
- if (xfs_refcount_check_irec(cur, &rr->rr_rrec) != NULL ||
+ if (xfs_refcount_check_irec(cur->bc_ag.pag, &rr->rr_rrec) != NULL ||
XFS_IS_CORRUPT(cur->bc_mp,
rr->rr_rrec.rc_domain != XFS_REFC_DOMAIN_COW)) {
kfree(rr);
@@ -1985,7 +1983,7 @@ xfs_refcount_recover_cow_leftovers(
/* Free the block. */
error = xfs_free_extent_later(tp, fsb,
rr->rr_rrec.rc_blockcount, NULL,
- XFS_AG_RESV_NONE);
+ XFS_AG_RESV_NONE, false);
if (error)
goto out_trans;
@@ -2033,6 +2031,47 @@ xfs_refcount_has_records(
return xfs_btree_has_records(cur, &low, &high, NULL, outcome);
}
+struct xfs_refcount_query_range_info {
+ xfs_refcount_query_range_fn fn;
+ void *priv;
+};
+
+/* Format btree record and pass to our callback. */
+STATIC int
+xfs_refcount_query_range_helper(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *rec,
+ void *priv)
+{
+ struct xfs_refcount_query_range_info *query = priv;
+ struct xfs_refcount_irec irec;
+ xfs_failaddr_t fa;
+
+ xfs_refcount_btrec_to_irec(rec, &irec);
+ fa = xfs_refcount_check_irec(cur->bc_ag.pag, &irec);
+ if (fa)
+ return xfs_refcount_complain_bad_rec(cur, fa, &irec);
+
+ return query->fn(cur, &irec, query->priv);
+}
+
+/* Find all refcount records between two keys. */
+int
+xfs_refcount_query_range(
+ struct xfs_btree_cur *cur,
+ const struct xfs_refcount_irec *low_rec,
+ const struct xfs_refcount_irec *high_rec,
+ xfs_refcount_query_range_fn fn,
+ void *priv)
+{
+ union xfs_btree_irec low_brec = { .rc = *low_rec };
+ union xfs_btree_irec high_brec = { .rc = *high_rec };
+ struct xfs_refcount_query_range_info query = { .priv = priv, .fn = fn };
+
+ return xfs_btree_query_range(cur, &low_brec, &high_brec,
+ xfs_refcount_query_range_helper, &query);
+}
+
int __init
xfs_refcount_intent_init_cache(void)
{
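
Annotation: xfs_refcount_query_range() brings the familiar btree query-range shape to refcount records: a typed callback plus an opaque priv cookie, with the helper validating each record before the callback sees it. The userspace model below keeps only the walk-and-callback skeleton; the linear scan over an array stands in for the btree traversal and is not how the kernel iterates.

#include <stdio.h>

struct rc_rec { int start, len, refcount; };

typedef int (*query_fn)(const struct rc_rec *rec, void *priv);

/* Hand every record overlapping [lo, hi) to the callback, in the spirit
 * of xfs_btree_query_range() driving the helper above. */
static int query_range(const struct rc_rec *recs, int nr, int lo, int hi,
		       query_fn fn, void *priv)
{
	int i, error;

	for (i = 0; i < nr; i++) {
		if (recs[i].start >= hi || recs[i].start + recs[i].len <= lo)
			continue;
		error = fn(&recs[i], priv);
		if (error)
			return error;	/* callbacks can abort the walk */
	}
	return 0;
}

static int count_shared(const struct rc_rec *rec, void *priv)
{
	if (rec->refcount > 1)
		(*(int *)priv)++;
	return 0;
}

int main(void)
{
	struct rc_rec recs[] = { { 0, 10, 1 }, { 10, 5, 3 }, { 20, 8, 2 } };
	int shared = 0;

	query_range(recs, 3, 5, 25, count_shared, &shared);
	printf("shared extents in range: %d\n", shared);
	return 0;
}
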
diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h
index 783cd89ca195..9b56768a590c 100644
--- a/fs/xfs/libxfs/xfs_refcount.h
+++ b/fs/xfs/libxfs/xfs_refcount.h
@@ -117,7 +117,7 @@ extern int xfs_refcount_has_records(struct xfs_btree_cur *cur,
union xfs_btree_rec;
extern void xfs_refcount_btrec_to_irec(const union xfs_btree_rec *rec,
struct xfs_refcount_irec *irec);
-xfs_failaddr_t xfs_refcount_check_irec(struct xfs_btree_cur *cur,
+xfs_failaddr_t xfs_refcount_check_irec(struct xfs_perag *pag,
const struct xfs_refcount_irec *irec);
extern int xfs_refcount_insert(struct xfs_btree_cur *cur,
struct xfs_refcount_irec *irec, int *stat);
@@ -127,4 +127,14 @@ extern struct kmem_cache *xfs_refcount_intent_cache;
int __init xfs_refcount_intent_init_cache(void);
void xfs_refcount_intent_destroy_cache(void);
+typedef int (*xfs_refcount_query_range_fn)(
+ struct xfs_btree_cur *cur,
+ const struct xfs_refcount_irec *rec,
+ void *priv);
+
+int xfs_refcount_query_range(struct xfs_btree_cur *cur,
+ const struct xfs_refcount_irec *low_rec,
+ const struct xfs_refcount_irec *high_rec,
+ xfs_refcount_query_range_fn fn, void *priv);
+
#endif /* __XFS_REFCOUNT_H__ */
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c
index 5c3987d8dc24..0d80bd99147c 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.c
+++ b/fs/xfs/libxfs/xfs_refcount_btree.c
@@ -112,7 +112,7 @@ xfs_refcountbt_free_block(
be32_add_cpu(&agf->agf_refcount_blocks, -1);
xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS);
return xfs_free_extent_later(cur->bc_tp, fsbno, 1,
- &XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA);
+ &XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA, false);
}
STATIC int
@@ -226,7 +226,18 @@ xfs_refcountbt_verify(
level = be16_to_cpu(block->bb_level);
if (pag && xfs_perag_initialised_agf(pag)) {
- if (level >= pag->pagf_refcount_level)
+ unsigned int maxlevel = pag->pagf_refcount_level;
+
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+ /*
+ * Online repair could be rewriting the refcount btree, so
+ * we'll validate against the larger of either tree while this
+ * is going on.
+ */
+ maxlevel = max_t(unsigned int, maxlevel,
+ pag->pagf_repair_refcount_level);
+#endif
+ if (level >= maxlevel)
return __this_address;
} else if (level >= mp->m_refc_maxlevels)
return __this_address;
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index fbb0b2637463..76bf7f48cb5a 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -2567,7 +2567,7 @@ __xfs_rmap_add(
ri->ri_bmap = *bmap;
xfs_rmap_update_get_group(tp->t_mountp, ri);
- xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_RMAP, &ri->ri_list);
+ xfs_defer_add(tp, &ri->ri_list, &xfs_rmap_update_defer_type);
}
/* Map an extent into a file. */
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index c269d704314d..31100120b2c5 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -184,7 +184,7 @@ xfs_rtfind_back(
* Calculate first (leftmost) bit number to look at,
* and mask for all the relevant bits in this word.
*/
- firstbit = XFS_RTMAX((xfs_srtblock_t)(bit - len + 1), 0);
+ firstbit = max_t(xfs_srtblock_t, bit - len + 1, 0);
mask = (((xfs_rtword_t)1 << (bit - firstbit + 1)) - 1) <<
firstbit;
/*
@@ -195,7 +195,7 @@ xfs_rtfind_back(
/*
* Different. Mark where we are and return.
*/
- i = bit - XFS_RTHIBIT(wdiff);
+ i = bit - xfs_highbit32(wdiff);
*rtx = start - i + 1;
return 0;
}
@@ -233,7 +233,7 @@ xfs_rtfind_back(
/*
* Different, mark where we are and return.
*/
- i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff);
+ i += XFS_NBWORD - 1 - xfs_highbit32(wdiff);
*rtx = start - i + 1;
return 0;
}
@@ -272,7 +272,7 @@ xfs_rtfind_back(
/*
* Different, mark where we are and return.
*/
- i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff);
+ i += XFS_NBWORD - 1 - xfs_highbit32(wdiff);
*rtx = start - i + 1;
return 0;
} else
@@ -338,7 +338,7 @@ xfs_rtfind_forw(
* Calculate last (rightmost) bit number to look at,
* and mask for all the relevant bits in this word.
*/
- lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
+ lastbit = min(bit + len, XFS_NBWORD);
mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
/*
* Calculate the difference between the value there
@@ -348,7 +348,7 @@ xfs_rtfind_forw(
/*
* Different. Mark where we are and return.
*/
- i = XFS_RTLOBIT(wdiff) - bit;
+ i = xfs_lowbit32(wdiff) - bit;
*rtx = start + i - 1;
return 0;
}
@@ -386,7 +386,7 @@ xfs_rtfind_forw(
/*
* Different, mark where we are and return.
*/
- i += XFS_RTLOBIT(wdiff);
+ i += xfs_lowbit32(wdiff);
*rtx = start + i - 1;
return 0;
}
@@ -423,7 +423,7 @@ xfs_rtfind_forw(
/*
* Different, mark where we are and return.
*/
- i += XFS_RTLOBIT(wdiff);
+ i += xfs_lowbit32(wdiff);
*rtx = start + i - 1;
return 0;
} else
@@ -452,71 +452,59 @@ xfs_trans_log_rtsummary(
}
/*
- * Read and/or modify the summary information for a given extent size,
- * bitmap block combination.
- * Keeps track of a current summary block, so we don't keep reading
- * it from the buffer cache.
- *
- * Summary information is returned in *sum if specified.
- * If no delta is specified, returns summary only.
+ * Modify the summary information for a given extent size, bitmap block
+ * combination.
*/
int
-xfs_rtmodify_summary_int(
+xfs_rtmodify_summary(
struct xfs_rtalloc_args *args,
int log, /* log2 of extent size */
xfs_fileoff_t bbno, /* bitmap block number */
- int delta, /* change to make to summary info */
- xfs_suminfo_t *sum) /* out: summary info for this block */
+ int delta) /* change to make to summary info */
{
struct xfs_mount *mp = args->mp;
- int error;
- xfs_fileoff_t sb; /* summary fsblock */
- xfs_rtsumoff_t so; /* index into the summary file */
+ xfs_rtsumoff_t so = xfs_rtsumoffs(mp, log, bbno);
unsigned int infoword;
+ xfs_suminfo_t val;
+ int error;
- /*
- * Compute entry number in the summary file.
- */
- so = xfs_rtsumoffs(mp, log, bbno);
- /*
- * Compute the block number in the summary file.
- */
- sb = xfs_rtsumoffs_to_block(mp, so);
-
- error = xfs_rtsummary_read_buf(args, sb);
+ error = xfs_rtsummary_read_buf(args, xfs_rtsumoffs_to_block(mp, so));
if (error)
return error;
- /*
- * Point to the summary information, modify/log it, and/or copy it out.
- */
infoword = xfs_rtsumoffs_to_infoword(mp, so);
- if (delta) {
- xfs_suminfo_t val = xfs_suminfo_add(args, infoword, delta);
-
- if (mp->m_rsum_cache) {
- if (val == 0 && log + 1 == mp->m_rsum_cache[bbno])
- mp->m_rsum_cache[bbno] = log;
- if (val != 0 && log >= mp->m_rsum_cache[bbno])
- mp->m_rsum_cache[bbno] = log + 1;
- }
- xfs_trans_log_rtsummary(args, infoword);
- if (sum)
- *sum = val;
- } else if (sum) {
- *sum = xfs_suminfo_get(args, infoword);
+ val = xfs_suminfo_add(args, infoword, delta);
+
+ if (mp->m_rsum_cache) {
+ if (val == 0 && log + 1 == mp->m_rsum_cache[bbno])
+ mp->m_rsum_cache[bbno] = log;
+ if (val != 0 && log >= mp->m_rsum_cache[bbno])
+ mp->m_rsum_cache[bbno] = log + 1;
}
+
+ xfs_trans_log_rtsummary(args, infoword);
return 0;
}
+/*
+ * Read and return the summary information for a given extent size, bitmap block
+ * combination.
+ */
int
-xfs_rtmodify_summary(
+xfs_rtget_summary(
struct xfs_rtalloc_args *args,
int log, /* log2 of extent size */
xfs_fileoff_t bbno, /* bitmap block number */
- int delta) /* in/out: summary block number */
+ xfs_suminfo_t *sum) /* out: summary info for this block */
{
- return xfs_rtmodify_summary_int(args, log, bbno, delta, NULL);
+ struct xfs_mount *mp = args->mp;
+ xfs_rtsumoff_t so = xfs_rtsumoffs(mp, log, bbno);
+ int error;
+
+ error = xfs_rtsummary_read_buf(args, xfs_rtsumoffs_to_block(mp, so));
+ if (!error)
+ *sum = xfs_suminfo_get(args, xfs_rtsumoffs_to_infoword(mp, so));
+ return error;
}
/* Log rtbitmap block from the word @from to the byte before @next. */
@@ -585,7 +573,7 @@ xfs_rtmodify_range(
/*
* Compute first bit not changed and mask of relevant bits.
*/
- lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
+ lastbit = min(bit + len, XFS_NBWORD);
mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
/*
* Set/clear the active bits.
@@ -720,7 +708,7 @@ xfs_rtfree_range(
*/
if (preblock < start) {
error = xfs_rtmodify_summary(args,
- XFS_RTBLOCKLOG(start - preblock),
+ xfs_highbit64(start - preblock),
xfs_rtx_to_rbmblock(mp, preblock), -1);
if (error) {
return error;
@@ -732,7 +720,7 @@ xfs_rtfree_range(
*/
if (postblock > end) {
error = xfs_rtmodify_summary(args,
- XFS_RTBLOCKLOG(postblock - end),
+ xfs_highbit64(postblock - end),
xfs_rtx_to_rbmblock(mp, end + 1), -1);
if (error) {
return error;
@@ -743,7 +731,7 @@ xfs_rtfree_range(
* (new) free extent.
*/
return xfs_rtmodify_summary(args,
- XFS_RTBLOCKLOG(postblock + 1 - preblock),
+ xfs_highbit64(postblock + 1 - preblock),
xfs_rtx_to_rbmblock(mp, preblock), 1);
}
@@ -799,7 +787,7 @@ xfs_rtcheck_range(
/*
* Compute first bit not examined.
*/
- lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
+ lastbit = min(bit + len, XFS_NBWORD);
/*
* Mask of relevant bits.
*/
@@ -812,7 +800,7 @@ xfs_rtcheck_range(
/*
* Different, compute first wrong bit and return.
*/
- i = XFS_RTLOBIT(wdiff) - bit;
+ i = xfs_lowbit32(wdiff) - bit;
*new = start + i;
*stat = 0;
return 0;
@@ -851,7 +839,7 @@ xfs_rtcheck_range(
/*
* Different, compute first wrong bit and return.
*/
- i += XFS_RTLOBIT(wdiff);
+ i += xfs_lowbit32(wdiff);
*new = start + i;
*stat = 0;
return 0;
@@ -889,7 +877,7 @@ xfs_rtcheck_range(
/*
* Different, compute first wrong bit and return.
*/
- i += XFS_RTLOBIT(wdiff);
+ i += xfs_lowbit32(wdiff);
*new = start + i;
*stat = 0;
return 0;
@@ -1131,6 +1119,20 @@ xfs_rtbitmap_blockcount(
}
/*
+ * Compute the maximum level number of the realtime summary file, as defined by
+ * mkfs. The historic use of highbit32 on a 64-bit quantity prohibited correct
+ * use of rt volumes with more than 2^32 extents.
+ */
+uint8_t
+xfs_compute_rextslog(
+ xfs_rtbxlen_t rtextents)
+{
+ if (!rtextents)
+ return 0;
+ return xfs_highbit64(rtextents);
+}
+
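+/*
+ * For example, an rt volume with 2^32 extents: the old 32-bit computation
+ * would see only the low 32 bits of the count (zero) and produce a nonsense
+ * log value, while xfs_highbit64(1ULL << 32) correctly returns 32.
+ */
+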
+/*
* Compute the number of rtbitmap words needed to populate every block of a
* bitmap that is large enough to track the given number of rt extents.
*/
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.h b/fs/xfs/libxfs/xfs_rtbitmap.h
index c0637057d69c..274dc7dae1fa 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.h
+++ b/fs/xfs/libxfs/xfs_rtbitmap.h
@@ -321,8 +321,8 @@ int xfs_rtfind_forw(struct xfs_rtalloc_args *args, xfs_rtxnum_t start,
xfs_rtxnum_t limit, xfs_rtxnum_t *rtblock);
int xfs_rtmodify_range(struct xfs_rtalloc_args *args, xfs_rtxnum_t start,
xfs_rtxlen_t len, int val);
-int xfs_rtmodify_summary_int(struct xfs_rtalloc_args *args, int log,
- xfs_fileoff_t bbno, int delta, xfs_suminfo_t *sum);
+int xfs_rtget_summary(struct xfs_rtalloc_args *args, int log,
+ xfs_fileoff_t bbno, xfs_suminfo_t *sum);
int xfs_rtmodify_summary(struct xfs_rtalloc_args *args, int log,
xfs_fileoff_t bbno, int delta);
int xfs_rtfree_range(struct xfs_rtalloc_args *args, xfs_rtxnum_t start,
@@ -351,6 +351,20 @@ xfs_rtfree_extent(
int xfs_rtfree_blocks(struct xfs_trans *tp, xfs_fsblock_t rtbno,
xfs_filblks_t rtlen);
+uint8_t xfs_compute_rextslog(xfs_rtbxlen_t rtextents);
+
+/* Do we support an rt volume having this number of rtextents? */
+static inline bool
+xfs_validate_rtextents(
+ xfs_rtbxlen_t rtextents)
+{
+ /* No runt rt volumes */
+ if (rtextents == 0)
+ return false;
+
+ return true;
+}
+
xfs_filblks_t xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t
rtextents);
unsigned long long xfs_rtbitmap_wordcount(struct xfs_mount *mp,
@@ -369,6 +383,8 @@ unsigned long long xfs_rtsummary_wordcount(struct xfs_mount *mp,
# define xfs_rtsummary_read_buf(a,b) (-ENOSYS)
# define xfs_rtbuf_cache_relse(a) (0)
# define xfs_rtalloc_extent_is_free(m,t,s,l,i) (-ENOSYS)
+# define xfs_compute_rextslog(rtx) (0)
+# define xfs_validate_rtextents(rtx) (false)
static inline xfs_filblks_t
xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t rtextents)
{
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 1f74d0cd1618..4a9e8588f4c9 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -25,6 +25,7 @@
#include "xfs_da_format.h"
#include "xfs_health.h"
#include "xfs_ag.h"
+#include "xfs_rtbitmap.h"
/*
* Physical superblock buffer manipulations. Shared with libxfs in userspace.
@@ -508,8 +509,9 @@ xfs_validate_sb_common(
rbmblocks = howmany_64(sbp->sb_rextents,
NBBY * sbp->sb_blocksize);
- if (sbp->sb_rextents != rexts ||
- sbp->sb_rextslog != xfs_highbit32(sbp->sb_rextents) ||
+ if (!xfs_validate_rtextents(rexts) ||
+ sbp->sb_rextents != rexts ||
+ sbp->sb_rextslog != xfs_compute_rextslog(rexts) ||
sbp->sb_rbmblocks != rbmblocks) {
xfs_notice(mp,
"realtime geometry sanity check failed");
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index c4381388c0c1..4220d3584c1b 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -139,7 +139,7 @@ bool xfs_symlink_hdr_ok(xfs_ino_t ino, uint32_t offset,
uint32_t size, struct xfs_buf *bp);
void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp,
struct xfs_inode *ip, struct xfs_ifork *ifp);
-xfs_failaddr_t xfs_symlink_shortform_verify(struct xfs_inode *ip);
+xfs_failaddr_t xfs_symlink_shortform_verify(void *sfp, int64_t size);
/* Computed inode geometry for the filesystem. */
struct xfs_ino_geometry {
diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c
index bdc777b9ec4a..160aa20aa441 100644
--- a/fs/xfs/libxfs/xfs_symlink_remote.c
+++ b/fs/xfs/libxfs/xfs_symlink_remote.c
@@ -175,7 +175,7 @@ xfs_symlink_local_to_remote(
if (!xfs_has_crc(mp)) {
bp->b_ops = NULL;
- memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
+ memcpy(bp->b_addr, ifp->if_data, ifp->if_bytes);
xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
return;
}
@@ -191,7 +191,7 @@ xfs_symlink_local_to_remote(
buf = bp->b_addr;
buf += xfs_symlink_hdr_set(mp, ip->i_ino, 0, ifp->if_bytes, bp);
- memcpy(buf, ifp->if_u1.if_data, ifp->if_bytes);
+ memcpy(buf, ifp->if_data, ifp->if_bytes);
xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsymlink_hdr) +
ifp->if_bytes - 1);
}
@@ -202,15 +202,11 @@ xfs_symlink_local_to_remote(
*/
xfs_failaddr_t
xfs_symlink_shortform_verify(
- struct xfs_inode *ip)
+ void *sfp,
+ int64_t size)
{
- struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
- char *sfp = (char *)ifp->if_u1.if_data;
- int size = ifp->if_bytes;
char *endp = sfp + size;
- ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
-
/*
* Zero length symlinks should never occur in memory as they are
* never allowed to exist on disk.
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index 533200c4ccc2..20b5375f2d9c 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -51,7 +51,6 @@ typedef void * xfs_failaddr_t;
#define NULLRFSBLOCK ((xfs_rfsblock_t)-1)
#define NULLRTBLOCK ((xfs_rtblock_t)-1)
#define NULLFILEOFF ((xfs_fileoff_t)-1)
-#define NULLRTEXTNO ((xfs_rtxnum_t)-1)
#define NULLAGBLOCK ((xfs_agblock_t)-1)
#define NULLAGNUMBER ((xfs_agnumber_t)-1)
@@ -208,6 +207,13 @@ enum xfs_ag_resv_type {
XFS_AG_RESV_AGFL,
XFS_AG_RESV_METADATA,
XFS_AG_RESV_RMAPBT,
+
+ /*
+ * Don't increase fdblocks when freeing extent. This is a pony for
+ * the bnobt repair functions to re-free the free space without
+ * altering fdblocks. If you think you need this you're wrong.
+ */
+ XFS_AG_RESV_IGNORE,
};
/* Results of scanning a btree keyspace to check occupancy. */
diff --git a/fs/xfs/scrub/agb_bitmap.c b/fs/xfs/scrub/agb_bitmap.c
new file mode 100644
index 000000000000..573e4e062754
--- /dev/null
+++ b/fs/xfs/scrub/agb_bitmap.c
@@ -0,0 +1,103 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2018-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_shared.h"
+#include "xfs_bit.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_btree.h"
+#include "bitmap.h"
+#include "scrub/agb_bitmap.h"
+
+/*
+ * Record all btree blocks seen while iterating all records of a btree.
+ *
+ * We know that the btree query_all function starts at the left edge and walks
+ * towards the right edge of the tree. Therefore, we know that we can walk up
+ * the btree cursor towards the root; if the pointer for a given level points
+ * to the first record/key in that block, we haven't seen this block before;
+ * and therefore we need to remember that we saw this block in the btree.
+ *
+ * So if our btree is:
+ *
+ * 4
+ * / | \
+ * 1 2 3
+ *
+ * Pretend for this example that each leaf block has 100 btree records. For
+ * the first btree record, we'll observe that bc_levels[0].ptr == 1, so we
+ * record that we saw block 1. Then we observe that bc_levels[1].ptr == 1, so
+ * we record block 4. The list is [1, 4].
+ *
+ * For the second btree record, we see that bc_levels[0].ptr == 2, so we exit
+ * the loop. The list remains [1, 4].
+ *
+ * For the 101st btree record, we've moved onto leaf block 2. Now
+ * bc_levels[0].ptr == 1 again, so we record that we saw block 2. We see that
+ * bc_levels[1].ptr == 2, so we exit the loop. The list is now [1, 4, 2].
+ *
+ * For the 102nd record, bc_levels[0].ptr == 2, so we continue.
+ *
+ * For the 201st record, we've moved on to leaf block 3.
+ * bc_levels[0].ptr == 1, so we add 3 to the list. Now it is [1, 4, 2, 3].
+ *
+ * For the 300th record we just exit, with the list being [1, 4, 2, 3].
+ */
+
+/* Mark a btree block to the agblock bitmap. */
+STATIC int
+xagb_bitmap_visit_btblock(
+ struct xfs_btree_cur *cur,
+ int level,
+ void *priv)
+{
+ struct xagb_bitmap *bitmap = priv;
+ struct xfs_buf *bp;
+ xfs_fsblock_t fsbno;
+ xfs_agblock_t agbno;
+
+ xfs_btree_get_block(cur, level, &bp);
+ if (!bp)
+ return 0;
+
+ fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp));
+ agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
+
+ return xagb_bitmap_set(bitmap, agbno, 1);
+}
+
+/* Mark all (per-AG) btree blocks in the agblock bitmap. */
+int
+xagb_bitmap_set_btblocks(
+ struct xagb_bitmap *bitmap,
+ struct xfs_btree_cur *cur)
+{
+ return xfs_btree_visit_blocks(cur, xagb_bitmap_visit_btblock,
+ XFS_BTREE_VISIT_ALL, bitmap);
+}
+
+/*
+ * Record all the buffers pointed to by the btree cursor. Callers already
+ * engaged in a btree walk should call this function to capture the list of
+ * blocks going from the leaf towards the root.
+ */
+int
+xagb_bitmap_set_btcur_path(
+ struct xagb_bitmap *bitmap,
+ struct xfs_btree_cur *cur)
+{
+ int i;
+ int error;
+
+ for (i = 0; i < cur->bc_nlevels && cur->bc_levels[i].ptr == 1; i++) {
+ error = xagb_bitmap_visit_btblock(cur, i, bitmap);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
diff --git a/fs/xfs/scrub/agb_bitmap.h b/fs/xfs/scrub/agb_bitmap.h
new file mode 100644
index 000000000000..ed08f76ff4f3
--- /dev/null
+++ b/fs/xfs/scrub/agb_bitmap.h
@@ -0,0 +1,68 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2018-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_AGB_BITMAP_H__
+#define __XFS_SCRUB_AGB_BITMAP_H__
+
+/* Bitmaps, but type-checked for xfs_agblock_t */
+
+struct xagb_bitmap {
+ struct xbitmap32 agbitmap;
+};
+
+static inline void xagb_bitmap_init(struct xagb_bitmap *bitmap)
+{
+ xbitmap32_init(&bitmap->agbitmap);
+}
+
+static inline void xagb_bitmap_destroy(struct xagb_bitmap *bitmap)
+{
+ xbitmap32_destroy(&bitmap->agbitmap);
+}
+
+static inline int xagb_bitmap_clear(struct xagb_bitmap *bitmap,
+ xfs_agblock_t start, xfs_extlen_t len)
+{
+ return xbitmap32_clear(&bitmap->agbitmap, start, len);
+}
+static inline int xagb_bitmap_set(struct xagb_bitmap *bitmap,
+ xfs_agblock_t start, xfs_extlen_t len)
+{
+ return xbitmap32_set(&bitmap->agbitmap, start, len);
+}
+
+static inline bool xagb_bitmap_test(struct xagb_bitmap *bitmap,
+ xfs_agblock_t start, xfs_extlen_t *len)
+{
+ return xbitmap32_test(&bitmap->agbitmap, start, len);
+}
+
+static inline int xagb_bitmap_disunion(struct xagb_bitmap *bitmap,
+ struct xagb_bitmap *sub)
+{
+ return xbitmap32_disunion(&bitmap->agbitmap, &sub->agbitmap);
+}
+
+static inline uint32_t xagb_bitmap_hweight(struct xagb_bitmap *bitmap)
+{
+ return xbitmap32_hweight(&bitmap->agbitmap);
+}
+static inline bool xagb_bitmap_empty(struct xagb_bitmap *bitmap)
+{
+ return xbitmap32_empty(&bitmap->agbitmap);
+}
+
+static inline int xagb_bitmap_walk(struct xagb_bitmap *bitmap,
+ xbitmap32_walk_fn fn, void *priv)
+{
+ return xbitmap32_walk(&bitmap->agbitmap, fn, priv);
+}
+
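+/*
+ * Typical usage, sketched (my_fn is an illustrative callback name):
+ * initialize, populate from a btree walk, subtract another set, then
+ * iterate whatever remains:
+ *
+ *	struct xagb_bitmap bmp;
+ *
+ *	xagb_bitmap_init(&bmp);
+ *	error = xagb_bitmap_set_btblocks(&bmp, cur);
+ *	error = xagb_bitmap_disunion(&bmp, &other);
+ *	error = xagb_bitmap_walk(&bmp, my_fn, priv);
+ *	xagb_bitmap_destroy(&bmp);
+ */
+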
+int xagb_bitmap_set_btblocks(struct xagb_bitmap *bitmap,
+ struct xfs_btree_cur *cur);
+int xagb_bitmap_set_btcur_path(struct xagb_bitmap *bitmap,
+ struct xfs_btree_cur *cur);
+
+#endif /* __XFS_SCRUB_AGB_BITMAP_H__ */
diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c
index 876a2f41b063..26bd1ff68f1b 100644
--- a/fs/xfs/scrub/agheader_repair.c
+++ b/fs/xfs/scrub/agheader_repair.c
@@ -26,6 +26,7 @@
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/bitmap.h"
+#include "scrub/agb_bitmap.h"
#include "scrub/reap.h"
/* Superblock */
@@ -72,7 +73,7 @@ xrep_superblock(
/* Write this to disk. */
xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_SB_BUF);
xfs_trans_log_buf(sc->tp, bp, 0, BBTOB(bp->b_length) - 1);
- return error;
+ return 0;
}
/* AGF */
@@ -341,7 +342,7 @@ xrep_agf_commit_new(
pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level);
set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate);
- return 0;
+ return xrep_roll_ag_trans(sc);
}
/* Repair the AGF. v5 filesystems only. */
@@ -494,12 +495,11 @@ xrep_agfl_walk_rmap(
/* Strike out the blocks that are cross-linked according to the rmapbt. */
STATIC int
xrep_agfl_check_extent(
- uint64_t start,
- uint64_t len,
+ uint32_t agbno,
+ uint32_t len,
void *priv)
{
struct xrep_agfl *ra = priv;
- xfs_agblock_t agbno = start;
xfs_agblock_t last_agbno = agbno + len - 1;
int error;
@@ -647,8 +647,8 @@ struct xrep_agfl_fill {
/* Fill the AGFL with whatever blocks are in this extent. */
static int
xrep_agfl_fill(
- uint64_t start,
- uint64_t len,
+ uint32_t start,
+ uint32_t len,
void *priv)
{
struct xrep_agfl_fill *af = priv;
@@ -789,6 +789,9 @@ xrep_agfl(
/* Dump any AGFL overflow. */
error = xrep_reap_agblocks(sc, &agfl_extents, &XFS_RMAP_OINFO_AG,
XFS_AG_RESV_AGFL);
+ if (error)
+ goto err;
+
err:
xagb_bitmap_destroy(&agfl_extents);
return error;
@@ -962,7 +965,7 @@ xrep_agi_commit_new(
pag->pagi_freecount = be32_to_cpu(agi->agi_freecount);
set_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate);
- return 0;
+ return xrep_roll_ag_trans(sc);
}
/* Repair the AGI. */
diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c
index 279af72b1671..d1b8a4997dd2 100644
--- a/fs/xfs/scrub/alloc.c
+++ b/fs/xfs/scrub/alloc.c
@@ -9,13 +9,16 @@
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
#include "xfs_btree.h"
#include "xfs_alloc.h"
#include "xfs_rmap.h"
+#include "xfs_ag.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
-#include "xfs_ag.h"
+#include "scrub/repair.h"
/*
* Set us up to scrub free space btrees.
@@ -24,10 +27,19 @@ int
xchk_setup_ag_allocbt(
struct xfs_scrub *sc)
{
+ int error;
+
if (xchk_need_intent_drain(sc))
xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
- return xchk_setup_ag_btree(sc, false);
+ error = xchk_setup_ag_btree(sc, false);
+ if (error)
+ return error;
+
+ if (xchk_could_repair(sc))
+ return xrep_setup_ag_allocbt(sc);
+
+ return 0;
}
/* Free space btree scrubber. */
@@ -127,7 +139,7 @@ xchk_allocbt_rec(
struct xchk_alloc *ca = bs->private;
xfs_alloc_btrec_to_irec(rec, &irec);
- if (xfs_alloc_check_irec(bs->cur, &irec) != NULL) {
+ if (xfs_alloc_check_irec(bs->cur->bc_ag.pag, &irec) != NULL) {
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
return 0;
}
@@ -138,31 +150,27 @@ xchk_allocbt_rec(
return 0;
}
-/* Scrub the freespace btrees for some AG. */
-STATIC int
+/* Scrub one of the freespace btrees for some AG. */
+int
xchk_allocbt(
- struct xfs_scrub *sc,
- xfs_btnum_t which)
+ struct xfs_scrub *sc)
{
struct xchk_alloc ca = { };
struct xfs_btree_cur *cur;
- cur = which == XFS_BTNUM_BNO ? sc->sa.bno_cur : sc->sa.cnt_cur;
- return xchk_btree(sc, cur, xchk_allocbt_rec, &XFS_RMAP_OINFO_AG, &ca);
-}
-
-int
-xchk_bnobt(
- struct xfs_scrub *sc)
-{
- return xchk_allocbt(sc, XFS_BTNUM_BNO);
-}
+ switch (sc->sm->sm_type) {
+ case XFS_SCRUB_TYPE_BNOBT:
+ cur = sc->sa.bno_cur;
+ break;
+ case XFS_SCRUB_TYPE_CNTBT:
+ cur = sc->sa.cnt_cur;
+ break;
+ default:
+ ASSERT(0);
+ return -EIO;
+ }
-int
-xchk_cntbt(
- struct xfs_scrub *sc)
-{
- return xchk_allocbt(sc, XFS_BTNUM_CNT);
+ return xchk_btree(sc, cur, xchk_allocbt_rec, &XFS_RMAP_OINFO_AG, &ca);
}
/* xref check that the extent is not free */
diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c
new file mode 100644
index 000000000000..45edda096869
--- /dev/null
+++ b/fs/xfs/scrub/alloc_repair.c
@@ -0,0 +1,934 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2018-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_alloc.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_refcount.h"
+#include "xfs_extent_busy.h"
+#include "xfs_health.h"
+#include "xfs_bmap.h"
+#include "xfs_ialloc.h"
+#include "xfs_ag.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/bitmap.h"
+#include "scrub/agb_bitmap.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/newbt.h"
+#include "scrub/reap.h"
+
+/*
+ * Free Space Btree Repair
+ * =======================
+ *
+ * The reverse mappings are supposed to record all space usage for the entire
+ * AG. Therefore, we can recreate the free extent records in an AG by looking
+ * for gaps in the physical extents recorded in the rmapbt. These records are
+ * staged in @free_records. Identifying the gaps is more difficult on a
+ * reflink filesystem because rmap records are allowed to overlap.
+ *
+ * Because the final step of building a new index is to free the space used by
+ * the old index, repair needs to find that space. Unfortunately, all
+ * structures that live in the free space (bnobt, cntbt, rmapbt, agfl) share
+ * the same rmapbt owner code (OWN_AG), so this is not straightforward.
+ *
+ * The scan of the reverse mapping information records the space used by OWN_AG
+ * in @old_allocbt_blocks, which (at this stage) is somewhat misnamed. While
+ * walking the rmapbt records, we create a second bitmap @not_allocbt_blocks to
+ * record all visited rmap btree blocks and all blocks owned by the AGFL.
+ *
+ * After that, the definition of old_allocbt_blocks shifts. This
+ * expression identifies possible former bnobt/cntbt blocks:
+ *
+ * (OWN_AG blocks) & ~(rmapbt blocks | agfl blocks);
+ *
+ * Substituting from above definitions, that becomes:
+ *
+ * old_allocbt_blocks & ~not_allocbt_blocks
+ *
+ * The OWN_AG bitmap itself isn't needed after this point, so what we really do
+ * instead is:
+ *
+ * old_allocbt_blocks &= ~not_allocbt_blocks;
+ *
+ * After this point, @old_allocbt_blocks is a bitmap of alleged former
+ * bnobt/cntbt blocks. The xagb_bitmap_disunion operation modifies its first
+ * parameter in place to avoid copying records around.
+ *
+ * Next, some of the space described by @free_records is diverted to the newbt
+ * reservation and used to format new btree blocks. The remaining records are
+ * written to the new btree indices. We reconstruct both bnobt and cntbt at
+ * the same time since we've already done all the work.
+ *
+ * We use the prefix 'xrep_abt' here because we regenerate both free space
+ * allocation btrees at the same time.
+ */
+
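+/*
+ * As a worked example of the arithmetic above: if the OWN_AG records cover
+ * blocks 100-199, the rmapbt itself occupies blocks 150-159, and the AGFL
+ * holds blocks 190-199, then after the disunion old_allocbt_blocks contains
+ * 100-149 and 160-189, the candidate former bnobt/cntbt blocks.
+ */
+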
+struct xrep_abt {
+ /* Blocks owned by the rmapbt or the agfl. */
+ struct xagb_bitmap not_allocbt_blocks;
+
+ /* All OWN_AG blocks. */
+ struct xagb_bitmap old_allocbt_blocks;
+
+ /*
+ * New bnobt information. All btree block reservations are added to
+ * the reservation list in new_bnobt.
+ */
+ struct xrep_newbt new_bnobt;
+
+ /* new cntbt information */
+ struct xrep_newbt new_cntbt;
+
+ /* Free space extents. */
+ struct xfarray *free_records;
+
+ struct xfs_scrub *sc;
+
+ /* Number of non-null records in @free_records. */
+ uint64_t nr_real_records;
+
+ /* get_records()'s position in the free space record array. */
+ xfarray_idx_t array_cur;
+
+ /*
+ * Next block we anticipate seeing in the rmap records. If the next
+ * rmap record is greater than next_agbno, we have found unused space.
+ */
+ xfs_agblock_t next_agbno;
+
+ /* Number of free blocks in this AG. */
+ xfs_agblock_t nr_blocks;
+
+ /* Longest free extent we found in the AG. */
+ xfs_agblock_t longest;
+};
+
+/* Set up to repair AG free space btrees. */
+int
+xrep_setup_ag_allocbt(
+ struct xfs_scrub *sc)
+{
+ unsigned int busy_gen;
+
+ /*
+ * Make sure the busy extent list is clear because we can't put extents
+ * on there twice.
+ */
+ busy_gen = READ_ONCE(sc->sa.pag->pagb_gen);
+ if (xfs_extent_busy_list_empty(sc->sa.pag))
+ return 0;
+
+ return xfs_extent_busy_flush(sc->tp, sc->sa.pag, busy_gen, 0);
+}
+
+/* Check for any obvious conflicts in the free extent. */
+STATIC int
+xrep_abt_check_free_ext(
+ struct xfs_scrub *sc,
+ const struct xfs_alloc_rec_incore *rec)
+{
+ enum xbtree_recpacking outcome;
+ int error;
+
+ if (xfs_alloc_check_irec(sc->sa.pag, rec) != NULL)
+ return -EFSCORRUPTED;
+
+ /* Must not be an inode chunk. */
+ error = xfs_ialloc_has_inodes_at_extent(sc->sa.ino_cur,
+ rec->ar_startblock, rec->ar_blockcount, &outcome);
+ if (error)
+ return error;
+ if (outcome != XBTREE_RECPACKING_EMPTY)
+ return -EFSCORRUPTED;
+
+ /* Must not be shared or CoW staging. */
+ if (sc->sa.refc_cur) {
+ error = xfs_refcount_has_records(sc->sa.refc_cur,
+ XFS_REFC_DOMAIN_SHARED, rec->ar_startblock,
+ rec->ar_blockcount, &outcome);
+ if (error)
+ return error;
+ if (outcome != XBTREE_RECPACKING_EMPTY)
+ return -EFSCORRUPTED;
+
+ error = xfs_refcount_has_records(sc->sa.refc_cur,
+ XFS_REFC_DOMAIN_COW, rec->ar_startblock,
+ rec->ar_blockcount, &outcome);
+ if (error)
+ return error;
+ if (outcome != XBTREE_RECPACKING_EMPTY)
+ return -EFSCORRUPTED;
+ }
+
+ return 0;
+}
+
+/*
+ * Stash a free space record for all the space since the last bno we found
+ * all the way up to @end.
+ */
+static int
+xrep_abt_stash(
+ struct xrep_abt *ra,
+ xfs_agblock_t end)
+{
+ struct xfs_alloc_rec_incore arec = {
+ .ar_startblock = ra->next_agbno,
+ .ar_blockcount = end - ra->next_agbno,
+ };
+ struct xfs_scrub *sc = ra->sc;
+ int error = 0;
+
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ error = xrep_abt_check_free_ext(ra->sc, &arec);
+ if (error)
+ return error;
+
+ trace_xrep_abt_found(sc->mp, sc->sa.pag->pag_agno, &arec);
+
+ error = xfarray_append(ra->free_records, &arec);
+ if (error)
+ return error;
+
+ ra->nr_blocks += arec.ar_blockcount;
+ return 0;
+}
+
+/* Record extents that aren't in use from gaps in the rmap records. */
+STATIC int
+xrep_abt_walk_rmap(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xrep_abt *ra = priv;
+ int error;
+
+ /* Record all the OWN_AG blocks... */
+ if (rec->rm_owner == XFS_RMAP_OWN_AG) {
+ error = xagb_bitmap_set(&ra->old_allocbt_blocks,
+ rec->rm_startblock, rec->rm_blockcount);
+ if (error)
+ return error;
+ }
+
+ /* ...and all the rmapbt blocks... */
+ error = xagb_bitmap_set_btcur_path(&ra->not_allocbt_blocks, cur);
+ if (error)
+ return error;
+
+ /* ...and all the free space. */
+ if (rec->rm_startblock > ra->next_agbno) {
+ error = xrep_abt_stash(ra, rec->rm_startblock);
+ if (error)
+ return error;
+ }
+
+ /*
+ * rmap records can overlap on reflink filesystems, so project
+ * next_agbno as far out into the AG space as we currently know about.
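+ *
+ * For example, overlapping records [10, 30) and [15, 40) advance
+ * next_agbno to 30 and then 40, so a later record starting at block 35
+ * does not cause [30, 35) to be misreported as free space.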
+ */
+ ra->next_agbno = max_t(xfs_agblock_t, ra->next_agbno,
+ rec->rm_startblock + rec->rm_blockcount);
+ return 0;
+}
+
+/* Collect an AGFL block for the not-to-release list. */
+static int
+xrep_abt_walk_agfl(
+ struct xfs_mount *mp,
+ xfs_agblock_t agbno,
+ void *priv)
+{
+ struct xrep_abt *ra = priv;
+
+ return xagb_bitmap_set(&ra->not_allocbt_blocks, agbno, 1);
+}
+
+/*
+ * Compare two free space extents by block number. We want to sort in order of
+ * increasing block number.
+ */
+static int
+xrep_bnobt_extent_cmp(
+ const void *a,
+ const void *b)
+{
+ const struct xfs_alloc_rec_incore *ap = a;
+ const struct xfs_alloc_rec_incore *bp = b;
+
+ if (ap->ar_startblock > bp->ar_startblock)
+ return 1;
+ else if (ap->ar_startblock < bp->ar_startblock)
+ return -1;
+ return 0;
+}
+
+/*
+ * Re-sort the free extents by block number so that we can put the records into
+ * the bnobt in the correct order. Make sure the records do not overlap in
+ * physical space.
+ */
+STATIC int
+xrep_bnobt_sort_records(
+ struct xrep_abt *ra)
+{
+ struct xfs_alloc_rec_incore arec;
+ xfarray_idx_t cur = XFARRAY_CURSOR_INIT;
+ xfs_agblock_t next_agbno = 0;
+ int error;
+
+ error = xfarray_sort(ra->free_records, xrep_bnobt_extent_cmp, 0);
+ if (error)
+ return error;
+
+ while ((error = xfarray_iter(ra->free_records, &cur, &arec)) == 1) {
+ if (arec.ar_startblock < next_agbno)
+ return -EFSCORRUPTED;
+
+ next_agbno = arec.ar_startblock + arec.ar_blockcount;
+ }
+
+ return error;
+}
+
+/*
+ * Compare two free space extents by length and then block number. We want
+ * to sort first in order of increasing length and then in order of increasing
+ * block number.
+ */
+static int
+xrep_cntbt_extent_cmp(
+ const void *a,
+ const void *b)
+{
+ const struct xfs_alloc_rec_incore *ap = a;
+ const struct xfs_alloc_rec_incore *bp = b;
+
+ if (ap->ar_blockcount > bp->ar_blockcount)
+ return 1;
+ else if (ap->ar_blockcount < bp->ar_blockcount)
+ return -1;
+ return xrep_bnobt_extent_cmp(a, b);
+}
+
+/*
+ * Sort the free extents by length so that we can put the records into the
+ * cntbt in the correct order. Don't let userspace kill us if we're resorting
+ * after allocating btree blocks.
+ */
+STATIC int
+xrep_cntbt_sort_records(
+ struct xrep_abt *ra,
+ bool is_resort)
+{
+ return xfarray_sort(ra->free_records, xrep_cntbt_extent_cmp,
+ is_resort ? 0 : XFARRAY_SORT_KILLABLE);
+}
+
+/*
+ * Iterate all reverse mappings to find (1) the gaps between rmap records (all
+ * unowned space), (2) the OWN_AG extents (which encompass the free space
+ * btrees, the rmapbt, and the agfl), (3) the rmapbt blocks, and (4) the AGFL
+ * blocks. The free space is (1) + (2) - (3) - (4).
+ */
+STATIC int
+xrep_abt_find_freespace(
+ struct xrep_abt *ra)
+{
+ struct xfs_scrub *sc = ra->sc;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_agf *agf = sc->sa.agf_bp->b_addr;
+ struct xfs_buf *agfl_bp;
+ xfs_agblock_t agend;
+ int error;
+
+ xagb_bitmap_init(&ra->not_allocbt_blocks);
+
+ xrep_ag_btcur_init(sc, &sc->sa);
+
+ /*
+ * Iterate all the reverse mappings to find gaps in the physical
+ * mappings, all the OWN_AG blocks, and all the rmapbt extents.
+ */
+ error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_abt_walk_rmap, ra);
+ if (error)
+ goto err;
+
+ /* Insert a record for space between the last rmap and EOAG. */
+ agend = be32_to_cpu(agf->agf_length);
+ if (ra->next_agbno < agend) {
+ error = xrep_abt_stash(ra, agend);
+ if (error)
+ goto err;
+ }
+
+ /* Collect all the AGFL blocks. */
+ error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
+ if (error)
+ goto err;
+
+ error = xfs_agfl_walk(mp, agf, agfl_bp, xrep_abt_walk_agfl, ra);
+ if (error)
+ goto err_agfl;
+
+ /* Compute the old bnobt/cntbt blocks. */
+ error = xagb_bitmap_disunion(&ra->old_allocbt_blocks,
+ &ra->not_allocbt_blocks);
+ if (error)
+ goto err_agfl;
+
+ ra->nr_real_records = xfarray_length(ra->free_records);
+err_agfl:
+ xfs_trans_brelse(sc->tp, agfl_bp);
+err:
+ xchk_ag_btcur_free(&sc->sa);
+ xagb_bitmap_destroy(&ra->not_allocbt_blocks);
+ return error;
+}
+
+/*
+ * We're going to use the observed free space records to reserve blocks for the
+ * new free space btrees, so we play an iterative game where we try to converge
+ * on the number of blocks we need:
+ *
+ * 1. Estimate how many blocks we'll need to store the records.
+ * 2. If the first free record has more blocks than we need, we're done.
+ * We will have to re-sort the records prior to building the cntbt.
+ * 3. If that record has exactly the number of blocks we need, null out the
+ * record. We're done.
+ * 4. Otherwise, we still need more blocks. Null out the record, subtract its
+ * length from the number of blocks we need, and go back to step 1.
+ *
+ * Fortunately, we don't have to do any transaction work to play this game, so
+ * we don't have to tear down the staging cursors.
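+ *
+ * A worked example: if the two geometry estimates total seven blocks and
+ * the longest remaining free extent spans ten, step 2 applies.  Seven
+ * blocks are carved off the front of that record for the reservation, the
+ * record shrinks to three blocks, and we note that the array must be
+ * re-sorted by length.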
+ */
+STATIC int
+xrep_abt_reserve_space(
+ struct xrep_abt *ra,
+ struct xfs_btree_cur *bno_cur,
+ struct xfs_btree_cur *cnt_cur,
+ bool *needs_resort)
+{
+ struct xfs_scrub *sc = ra->sc;
+ xfarray_idx_t record_nr;
+ unsigned int allocated = 0;
+ int error = 0;
+
+ record_nr = xfarray_length(ra->free_records) - 1;
+ do {
+ struct xfs_alloc_rec_incore arec;
+ uint64_t required;
+ unsigned int desired;
+ unsigned int len;
+
+ /* Compute how many blocks we'll need. */
+ error = xfs_btree_bload_compute_geometry(cnt_cur,
+ &ra->new_cntbt.bload, ra->nr_real_records);
+ if (error)
+ break;
+
+ error = xfs_btree_bload_compute_geometry(bno_cur,
+ &ra->new_bnobt.bload, ra->nr_real_records);
+ if (error)
+ break;
+
+ /* How many btree blocks do we need to store all records? */
+ required = ra->new_bnobt.bload.nr_blocks +
+ ra->new_cntbt.bload.nr_blocks;
+ ASSERT(required < INT_MAX);
+
+ /* If we've reserved enough blocks, we're done. */
+ if (allocated >= required)
+ break;
+
+ desired = required - allocated;
+
+ /* We need space but there's none left; bye! */
+ if (ra->nr_real_records == 0) {
+ error = -ENOSPC;
+ break;
+ }
+
+ /* Grab the first record from the list. */
+ error = xfarray_load(ra->free_records, record_nr, &arec);
+ if (error)
+ break;
+
+ ASSERT(arec.ar_blockcount <= UINT_MAX);
+ len = min_t(unsigned int, arec.ar_blockcount, desired);
+
+ trace_xrep_newbt_alloc_ag_blocks(sc->mp, sc->sa.pag->pag_agno,
+ arec.ar_startblock, len, XFS_RMAP_OWN_AG);
+
+ error = xrep_newbt_add_extent(&ra->new_bnobt, sc->sa.pag,
+ arec.ar_startblock, len);
+ if (error)
+ break;
+ allocated += len;
+ ra->nr_blocks -= len;
+
+ if (arec.ar_blockcount > desired) {
+ /*
+ * Record has more space than we need. The number of
+ * free records doesn't change, so shrink the free
+ * record, inform the caller that the records are no
+ * longer sorted by length, and exit.
+ */
+ arec.ar_startblock += desired;
+ arec.ar_blockcount -= desired;
+ error = xfarray_store(ra->free_records, record_nr,
+ &arec);
+ if (error)
+ break;
+
+ *needs_resort = true;
+ return 0;
+ }
+
+ /*
+ * We're going to use up the entire record, so unset it and
+ * move on to the next one. This changes the number of free
+ * records (but doesn't break the sorting order), so we must
+ * go around the loop once more to re-run _bload_init.
+ */
+ error = xfarray_unset(ra->free_records, record_nr);
+ if (error)
+ break;
+ ra->nr_real_records--;
+ record_nr--;
+ } while (1);
+
+ return error;
+}
+
+STATIC int
+xrep_abt_dispose_one(
+ struct xrep_abt *ra,
+ struct xrep_newbt_resv *resv)
+{
+ struct xfs_scrub *sc = ra->sc;
+ struct xfs_perag *pag = sc->sa.pag;
+ xfs_agblock_t free_agbno = resv->agbno + resv->used;
+ xfs_extlen_t free_aglen = resv->len - resv->used;
+ int error;
+
+ ASSERT(pag == resv->pag);
+
+ /* Add a deferred rmap for each extent we used. */
+ if (resv->used > 0)
+ xfs_rmap_alloc_extent(sc->tp, pag->pag_agno, resv->agbno,
+ resv->used, XFS_RMAP_OWN_AG);
+
+ /*
+ * For each reserved btree block we didn't use, add it to the free
+ * space btree. We didn't touch fdblocks when we reserved them, so
+ * we don't touch it now.
+ */
+ if (free_aglen == 0)
+ return 0;
+
+ trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, free_agbno,
+ free_aglen, ra->new_bnobt.oinfo.oi_owner);
+
+ error = __xfs_free_extent(sc->tp, resv->pag, free_agbno, free_aglen,
+ &ra->new_bnobt.oinfo, XFS_AG_RESV_IGNORE, true);
+ if (error)
+ return error;
+
+ return xrep_defer_finish(sc);
+}
+
+/*
+ * Deal with all the space we reserved. Blocks that were allocated for the
+ * free space btrees need to have a (deferred) rmap added for the OWN_AG
+ * allocation, and blocks that didn't get used can be freed via the usual
+ * (deferred) means.
+ */
+STATIC void
+xrep_abt_dispose_reservations(
+ struct xrep_abt *ra,
+ int error)
+{
+ struct xrep_newbt_resv *resv, *n;
+
+ if (error)
+ goto junkit;
+
+ list_for_each_entry_safe(resv, n, &ra->new_bnobt.resv_list, list) {
+ error = xrep_abt_dispose_one(ra, resv);
+ if (error)
+ goto junkit;
+ }
+
+junkit:
+ list_for_each_entry_safe(resv, n, &ra->new_bnobt.resv_list, list) {
+ xfs_perag_put(resv->pag);
+ list_del(&resv->list);
+ kfree(resv);
+ }
+
+ xrep_newbt_cancel(&ra->new_bnobt);
+ xrep_newbt_cancel(&ra->new_cntbt);
+}
+
+/* Retrieve free space data for bulk load. */
+STATIC int
+xrep_abt_get_records(
+ struct xfs_btree_cur *cur,
+ unsigned int idx,
+ struct xfs_btree_block *block,
+ unsigned int nr_wanted,
+ void *priv)
+{
+ struct xfs_alloc_rec_incore *arec = &cur->bc_rec.a;
+ struct xrep_abt *ra = priv;
+ union xfs_btree_rec *block_rec;
+ unsigned int loaded;
+ int error;
+
+ for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
+ error = xfarray_load_next(ra->free_records, &ra->array_cur,
+ arec);
+ if (error)
+ return error;
+
+ ra->longest = max(ra->longest, arec->ar_blockcount);
+
+ block_rec = xfs_btree_rec_addr(cur, idx, block);
+ cur->bc_ops->init_rec_from_cur(cur, block_rec);
+ }
+
+ return loaded;
+}
+
+/* Feed one of the new btree blocks to the bulk loader. */
+STATIC int
+xrep_abt_claim_block(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr,
+ void *priv)
+{
+ struct xrep_abt *ra = priv;
+
+ return xrep_newbt_claim_block(cur, &ra->new_bnobt, ptr);
+}
+
+/*
+ * Reset the AGF counters to reflect the free space btrees that we just
+ * rebuilt, then reinitialize the per-AG data.
+ */
+STATIC int
+xrep_abt_reset_counters(
+ struct xrep_abt *ra)
+{
+ struct xfs_scrub *sc = ra->sc;
+ struct xfs_perag *pag = sc->sa.pag;
+ struct xfs_agf *agf = sc->sa.agf_bp->b_addr;
+ unsigned int freesp_btreeblks = 0;
+
+ /*
+ * Compute the contribution to agf_btreeblks for the new free space
+ * btrees. This is the computed btree size minus anything we didn't
+ * use.
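+ *
+ * For example, a three-block bnobt and a two-block cntbt with no
+ * unused reservations contribute (3 - 1) + (2 - 1) = 3 blocks; the
+ * "- 1" terms exclude each tree's root block.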
+ */
+ freesp_btreeblks += ra->new_bnobt.bload.nr_blocks - 1;
+ freesp_btreeblks += ra->new_cntbt.bload.nr_blocks - 1;
+
+ freesp_btreeblks -= xrep_newbt_unused_blocks(&ra->new_bnobt);
+ freesp_btreeblks -= xrep_newbt_unused_blocks(&ra->new_cntbt);
+
+ /*
+ * The AGF header contains extra information related to the free space
+ * btrees, so we must update those fields here.
+ */
+ agf->agf_btreeblks = cpu_to_be32(freesp_btreeblks +
+ (be32_to_cpu(agf->agf_rmap_blocks) - 1));
+ agf->agf_freeblks = cpu_to_be32(ra->nr_blocks);
+ agf->agf_longest = cpu_to_be32(ra->longest);
+ xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_BTREEBLKS |
+ XFS_AGF_LONGEST |
+ XFS_AGF_FREEBLKS);
+
+ /*
+ * After we commit the new btree to disk, it is possible that the
+ * process to reap the old btree blocks will race with the AIL trying
+ * to checkpoint the old btree blocks into the filesystem. If the new
+ * tree is shorter than the old one, the allocbt write verifier will
+ * fail and the AIL will shut down the filesystem.
+ *
+ * To avoid this, save the old incore btree height values as the alt
+ * height values before re-initializing the perag info from the updated
+ * AGF to capture all the new values.
+ */
+ pag->pagf_repair_levels[XFS_BTNUM_BNOi] = pag->pagf_levels[XFS_BTNUM_BNOi];
+ pag->pagf_repair_levels[XFS_BTNUM_CNTi] = pag->pagf_levels[XFS_BTNUM_CNTi];
+
+ /* Reinitialize with the values we just logged. */
+ return xrep_reinit_pagf(sc);
+}
+
+/*
+ * Use the collected free space information to stage new free space btrees.
+ * If this is successful we'll return with the new btree root
+ * information logged to the repair transaction but not yet committed.
+ */
+STATIC int
+xrep_abt_build_new_trees(
+ struct xrep_abt *ra)
+{
+ struct xfs_scrub *sc = ra->sc;
+ struct xfs_btree_cur *bno_cur;
+ struct xfs_btree_cur *cnt_cur;
+ struct xfs_perag *pag = sc->sa.pag;
+ bool needs_resort = false;
+ int error;
+
+ /*
+ * Sort the free extents by length so that we can set up the free space
+ * btrees in as few extents as possible. This reduces the amount of
+ * deferred rmap / free work we have to do at the end.
+ */
+ error = xrep_cntbt_sort_records(ra, false);
+ if (error)
+ return error;
+
+ /*
+ * Prepare to construct the new btree by reserving disk space for the
+ * new btree and setting up all the accounting information we'll need
+ * to root the new btree while it's under construction and before we
+ * attach it to the AG header.
+ */
+ xrep_newbt_init_bare(&ra->new_bnobt, sc);
+ xrep_newbt_init_bare(&ra->new_cntbt, sc);
+
+ ra->new_bnobt.bload.get_records = xrep_abt_get_records;
+ ra->new_cntbt.bload.get_records = xrep_abt_get_records;
+
+ ra->new_bnobt.bload.claim_block = xrep_abt_claim_block;
+ ra->new_cntbt.bload.claim_block = xrep_abt_claim_block;
+
+ /* Allocate cursors for the staged btrees. */
+ bno_cur = xfs_allocbt_stage_cursor(sc->mp, &ra->new_bnobt.afake,
+ pag, XFS_BTNUM_BNO);
+ cnt_cur = xfs_allocbt_stage_cursor(sc->mp, &ra->new_cntbt.afake,
+ pag, XFS_BTNUM_CNT);
+
+ /* Last chance to abort before we start committing fixes. */
+ if (xchk_should_terminate(sc, &error))
+ goto err_cur;
+
+ /* Reserve the space we'll need for the new btrees. */
+ error = xrep_abt_reserve_space(ra, bno_cur, cnt_cur, &needs_resort);
+ if (error)
+ goto err_cur;
+
+ /*
+ * If we need to re-sort the free extents by length, do so now so that
+ * we can put the records into the cntbt in the correct order.
+ */
+ if (needs_resort) {
+ error = xrep_cntbt_sort_records(ra, needs_resort);
+ if (error)
+ goto err_cur;
+ }
+
+ /*
+ * Due to btree slack factors, it's possible for a new btree to be one
+ * level taller than the old btree. Update the alternate incore btree
+ * height so that we don't trip the verifiers when writing the new
+ * btree blocks to disk.
+ */
+ pag->pagf_repair_levels[XFS_BTNUM_BNOi] =
+ ra->new_bnobt.bload.btree_height;
+ pag->pagf_repair_levels[XFS_BTNUM_CNTi] =
+ ra->new_cntbt.bload.btree_height;
+
+ /* Load the free space by length tree. */
+ ra->array_cur = XFARRAY_CURSOR_INIT;
+ ra->longest = 0;
+ error = xfs_btree_bload(cnt_cur, &ra->new_cntbt.bload, ra);
+ if (error)
+ goto err_levels;
+
+ error = xrep_bnobt_sort_records(ra);
+ if (error)
+ return error;
+
+ /* Load the free space by block number tree. */
+ ra->array_cur = XFARRAY_CURSOR_INIT;
+ error = xfs_btree_bload(bno_cur, &ra->new_bnobt.bload, ra);
+ if (error)
+ goto err_levels;
+
+ /*
+ * Install the new btrees in the AG header. After this point the old
+ * btrees are no longer accessible and the new trees are live.
+ */
+ xfs_allocbt_commit_staged_btree(bno_cur, sc->tp, sc->sa.agf_bp);
+ xfs_btree_del_cursor(bno_cur, 0);
+ xfs_allocbt_commit_staged_btree(cnt_cur, sc->tp, sc->sa.agf_bp);
+ xfs_btree_del_cursor(cnt_cur, 0);
+
+ /* Reset the AGF counters now that we've changed the btree shape. */
+ error = xrep_abt_reset_counters(ra);
+ if (error)
+ goto err_newbt;
+
+ /* Dispose of any unused blocks and the accounting information. */
+ xrep_abt_dispose_reservations(ra, error);
+
+ return xrep_roll_ag_trans(sc);
+
+err_levels:
+ pag->pagf_repair_levels[XFS_BTNUM_BNOi] = 0;
+ pag->pagf_repair_levels[XFS_BTNUM_CNTi] = 0;
+err_cur:
+ xfs_btree_del_cursor(cnt_cur, error);
+ xfs_btree_del_cursor(bno_cur, error);
+err_newbt:
+ xrep_abt_dispose_reservations(ra, error);
+ return error;
+}
+
+/*
+ * Now that we've logged the roots of the new btrees, invalidate all of the
+ * old blocks and free them.
+ */
+STATIC int
+xrep_abt_remove_old_trees(
+ struct xrep_abt *ra)
+{
+ struct xfs_perag *pag = ra->sc->sa.pag;
+ int error;
+
+ /* Free the old btree blocks if they're not in use. */
+ error = xrep_reap_agblocks(ra->sc, &ra->old_allocbt_blocks,
+ &XFS_RMAP_OINFO_AG, XFS_AG_RESV_IGNORE);
+ if (error)
+ return error;
+
+ /*
+ * Now that we've zapped all the old allocbt blocks we can turn off
+ * the alternate height mechanism.
+ */
+ pag->pagf_repair_levels[XFS_BTNUM_BNOi] = 0;
+ pag->pagf_repair_levels[XFS_BTNUM_CNTi] = 0;
+ return 0;
+}
+
+/* Repair the freespace btrees for some AG. */
+int
+xrep_allocbt(
+ struct xfs_scrub *sc)
+{
+ struct xrep_abt *ra;
+ struct xfs_mount *mp = sc->mp;
+ char *descr;
+ int error;
+
+ /* We require the rmapbt to rebuild anything. */
+ if (!xfs_has_rmapbt(mp))
+ return -EOPNOTSUPP;
+
+ ra = kzalloc(sizeof(struct xrep_abt), XCHK_GFP_FLAGS);
+ if (!ra)
+ return -ENOMEM;
+ ra->sc = sc;
+
+ /* We rebuild both data structures. */
+ sc->sick_mask = XFS_SICK_AG_BNOBT | XFS_SICK_AG_CNTBT;
+
+ /*
+ * Make sure the busy extent list is clear because we can't put extents
+ * on there twice. In theory we cleared this before we started, but
+ * let's not risk the filesystem.
+ */
+ if (!xfs_extent_busy_list_empty(sc->sa.pag)) {
+ error = -EDEADLOCK;
+ goto out_ra;
+ }
+
+ /* Set up enough storage to handle maximally fragmented free space. */
+ descr = xchk_xfile_ag_descr(sc, "free space records");
+ error = xfarray_create(descr, mp->m_sb.sb_agblocks / 2,
+ sizeof(struct xfs_alloc_rec_incore),
+ &ra->free_records);
+ kfree(descr);
+ if (error)
+ goto out_ra;
+
+ /* Collect the free space data and find the old btree blocks. */
+ xagb_bitmap_init(&ra->old_allocbt_blocks);
+ error = xrep_abt_find_freespace(ra);
+ if (error)
+ goto out_bitmap;
+
+ /* Rebuild the free space information. */
+ error = xrep_abt_build_new_trees(ra);
+ if (error)
+ goto out_bitmap;
+
+ /* Kill the old trees. */
+ error = xrep_abt_remove_old_trees(ra);
+ if (error)
+ goto out_bitmap;
+
+out_bitmap:
+ xagb_bitmap_destroy(&ra->old_allocbt_blocks);
+ xfarray_destroy(ra->free_records);
+out_ra:
+ kfree(ra);
+ return error;
+}
+
+/* Make sure both btrees are ok after we've rebuilt them. */
+int
+xrep_revalidate_allocbt(
+ struct xfs_scrub *sc)
+{
+ __u32 old_type = sc->sm->sm_type;
+ int error;
+
+ /*
+ * We must update sm_type temporarily so that the tree-to-tree cross
+ * reference checks will work in the correct direction, and also so
+ * that tracing will report correctly if there are more errors.
+ */
+ sc->sm->sm_type = XFS_SCRUB_TYPE_BNOBT;
+ error = xchk_allocbt(sc);
+ if (error)
+ goto out;
+
+ sc->sm->sm_type = XFS_SCRUB_TYPE_CNTBT;
+ error = xchk_allocbt(sc);
+out:
+ sc->sm->sm_type = old_type;
+ return error;
+}
diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c
index 6c16d9530cca..83c7feb38714 100644
--- a/fs/xfs/scrub/attr.c
+++ b/fs/xfs/scrub/attr.c
@@ -527,28 +527,23 @@ xchk_xattr_check_sf(
struct xfs_scrub *sc)
{
struct xchk_xattr_buf *ab = sc->buf;
- struct xfs_attr_shortform *sf;
- struct xfs_attr_sf_entry *sfe;
+ struct xfs_ifork *ifp = &sc->ip->i_af;
+ struct xfs_attr_sf_hdr *sf = ifp->if_data;
+ struct xfs_attr_sf_entry *sfe = xfs_attr_sf_firstentry(sf);
struct xfs_attr_sf_entry *next;
- struct xfs_ifork *ifp;
- unsigned char *end;
+ unsigned char *end = ifp->if_data + ifp->if_bytes;
int i;
int error = 0;
- ifp = xfs_ifork_ptr(sc->ip, XFS_ATTR_FORK);
-
bitmap_zero(ab->usedmap, ifp->if_bytes);
- sf = (struct xfs_attr_shortform *)sc->ip->i_af.if_u1.if_data;
- end = (unsigned char *)ifp->if_u1.if_data + ifp->if_bytes;
- xchk_xattr_set_map(sc, ab->usedmap, 0, sizeof(sf->hdr));
+ xchk_xattr_set_map(sc, ab->usedmap, 0, sizeof(*sf));
- sfe = &sf->list[0];
if ((unsigned char *)sfe > end) {
xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0);
return 0;
}
- for (i = 0; i < sf->hdr.count; i++) {
+ for (i = 0; i < sf->count; i++) {
unsigned char *name = sfe->nameval;
unsigned char *value = &sfe->nameval[sfe->namelen];
diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c
index e0c89a9a0ca0..1449bb5262d9 100644
--- a/fs/xfs/scrub/bitmap.c
+++ b/fs/xfs/scrub/bitmap.c
@@ -16,7 +16,9 @@
#include <linux/interval_tree_generic.h>
-struct xbitmap_node {
+/* u64 bitmap */
+
+struct xbitmap64_node {
struct rb_node bn_rbnode;
/* First set bit of this interval and subtree. */
@@ -39,72 +41,72 @@ struct xbitmap_node {
* forward-declare them anyway for clarity.
*/
static inline void
-xbitmap_tree_insert(struct xbitmap_node *node, struct rb_root_cached *root);
+xbitmap64_tree_insert(struct xbitmap64_node *node, struct rb_root_cached *root);
static inline void
-xbitmap_tree_remove(struct xbitmap_node *node, struct rb_root_cached *root);
+xbitmap64_tree_remove(struct xbitmap64_node *node, struct rb_root_cached *root);
-static inline struct xbitmap_node *
-xbitmap_tree_iter_first(struct rb_root_cached *root, uint64_t start,
+static inline struct xbitmap64_node *
+xbitmap64_tree_iter_first(struct rb_root_cached *root, uint64_t start,
uint64_t last);
-static inline struct xbitmap_node *
-xbitmap_tree_iter_next(struct xbitmap_node *node, uint64_t start,
+static inline struct xbitmap64_node *
+xbitmap64_tree_iter_next(struct xbitmap64_node *node, uint64_t start,
uint64_t last);
-INTERVAL_TREE_DEFINE(struct xbitmap_node, bn_rbnode, uint64_t,
- __bn_subtree_last, START, LAST, static inline, xbitmap_tree)
+INTERVAL_TREE_DEFINE(struct xbitmap64_node, bn_rbnode, uint64_t,
+ __bn_subtree_last, START, LAST, static inline, xbitmap64_tree)
/* Iterate each interval of a bitmap. Do not change the bitmap. */
-#define for_each_xbitmap_extent(bn, bitmap) \
+#define for_each_xbitmap64_extent(bn, bitmap) \
for ((bn) = rb_entry_safe(rb_first(&(bitmap)->xb_root.rb_root), \
- struct xbitmap_node, bn_rbnode); \
+ struct xbitmap64_node, bn_rbnode); \
(bn) != NULL; \
(bn) = rb_entry_safe(rb_next(&(bn)->bn_rbnode), \
- struct xbitmap_node, bn_rbnode))
+ struct xbitmap64_node, bn_rbnode))
/* Clear a range of this bitmap. */
int
-xbitmap_clear(
- struct xbitmap *bitmap,
+xbitmap64_clear(
+ struct xbitmap64 *bitmap,
uint64_t start,
uint64_t len)
{
- struct xbitmap_node *bn;
- struct xbitmap_node *new_bn;
+ struct xbitmap64_node *bn;
+ struct xbitmap64_node *new_bn;
uint64_t last = start + len - 1;
- while ((bn = xbitmap_tree_iter_first(&bitmap->xb_root, start, last))) {
+ while ((bn = xbitmap64_tree_iter_first(&bitmap->xb_root, start, last))) {
if (bn->bn_start < start && bn->bn_last > last) {
uint64_t old_last = bn->bn_last;
/* overlaps with the entire clearing range */
- xbitmap_tree_remove(bn, &bitmap->xb_root);
+ xbitmap64_tree_remove(bn, &bitmap->xb_root);
bn->bn_last = start - 1;
- xbitmap_tree_insert(bn, &bitmap->xb_root);
+ xbitmap64_tree_insert(bn, &bitmap->xb_root);
/* add an extent */
- new_bn = kmalloc(sizeof(struct xbitmap_node),
+ new_bn = kmalloc(sizeof(struct xbitmap64_node),
XCHK_GFP_FLAGS);
if (!new_bn)
return -ENOMEM;
new_bn->bn_start = last + 1;
new_bn->bn_last = old_last;
- xbitmap_tree_insert(new_bn, &bitmap->xb_root);
+ xbitmap64_tree_insert(new_bn, &bitmap->xb_root);
} else if (bn->bn_start < start) {
/* overlaps with the left side of the clearing range */
- xbitmap_tree_remove(bn, &bitmap->xb_root);
+ xbitmap64_tree_remove(bn, &bitmap->xb_root);
bn->bn_last = start - 1;
- xbitmap_tree_insert(bn, &bitmap->xb_root);
+ xbitmap64_tree_insert(bn, &bitmap->xb_root);
} else if (bn->bn_last > last) {
/* overlaps with the right side of the clearing range */
- xbitmap_tree_remove(bn, &bitmap->xb_root);
+ xbitmap64_tree_remove(bn, &bitmap->xb_root);
bn->bn_start = last + 1;
- xbitmap_tree_insert(bn, &bitmap->xb_root);
+ xbitmap64_tree_insert(bn, &bitmap->xb_root);
break;
} else {
/* in the middle of the clearing range */
- xbitmap_tree_remove(bn, &bitmap->xb_root);
+ xbitmap64_tree_remove(bn, &bitmap->xb_root);
kfree(bn);
}
}
@@ -114,59 +116,59 @@ xbitmap_clear(
/* Set a range of this bitmap. */
int
-xbitmap_set(
- struct xbitmap *bitmap,
+xbitmap64_set(
+ struct xbitmap64 *bitmap,
uint64_t start,
uint64_t len)
{
- struct xbitmap_node *left;
- struct xbitmap_node *right;
+ struct xbitmap64_node *left;
+ struct xbitmap64_node *right;
uint64_t last = start + len - 1;
int error;
/* Is this whole range already set? */
- left = xbitmap_tree_iter_first(&bitmap->xb_root, start, last);
+ left = xbitmap64_tree_iter_first(&bitmap->xb_root, start, last);
if (left && left->bn_start <= start && left->bn_last >= last)
return 0;
/* Clear out everything in the range we want to set. */
- error = xbitmap_clear(bitmap, start, len);
+ error = xbitmap64_clear(bitmap, start, len);
if (error)
return error;
/* Do we have a left-adjacent extent? */
- left = xbitmap_tree_iter_first(&bitmap->xb_root, start - 1, start - 1);
+ left = xbitmap64_tree_iter_first(&bitmap->xb_root, start - 1, start - 1);
ASSERT(!left || left->bn_last + 1 == start);
/* Do we have a right-adjacent extent? */
- right = xbitmap_tree_iter_first(&bitmap->xb_root, last + 1, last + 1);
+ right = xbitmap64_tree_iter_first(&bitmap->xb_root, last + 1, last + 1);
ASSERT(!right || right->bn_start == last + 1);
if (left && right) {
/* combine left and right adjacent extent */
- xbitmap_tree_remove(left, &bitmap->xb_root);
- xbitmap_tree_remove(right, &bitmap->xb_root);
+ xbitmap64_tree_remove(left, &bitmap->xb_root);
+ xbitmap64_tree_remove(right, &bitmap->xb_root);
left->bn_last = right->bn_last;
- xbitmap_tree_insert(left, &bitmap->xb_root);
+ xbitmap64_tree_insert(left, &bitmap->xb_root);
kfree(right);
} else if (left) {
/* combine with left extent */
- xbitmap_tree_remove(left, &bitmap->xb_root);
+ xbitmap64_tree_remove(left, &bitmap->xb_root);
left->bn_last = last;
- xbitmap_tree_insert(left, &bitmap->xb_root);
+ xbitmap64_tree_insert(left, &bitmap->xb_root);
} else if (right) {
/* combine with right extent */
- xbitmap_tree_remove(right, &bitmap->xb_root);
+ xbitmap64_tree_remove(right, &bitmap->xb_root);
right->bn_start = start;
- xbitmap_tree_insert(right, &bitmap->xb_root);
+ xbitmap64_tree_insert(right, &bitmap->xb_root);
} else {
/* add an extent */
- left = kmalloc(sizeof(struct xbitmap_node), XCHK_GFP_FLAGS);
+ left = kmalloc(sizeof(struct xbitmap64_node), XCHK_GFP_FLAGS);
if (!left)
return -ENOMEM;
left->bn_start = start;
left->bn_last = last;
- xbitmap_tree_insert(left, &bitmap->xb_root);
+ xbitmap64_tree_insert(left, &bitmap->xb_root);
}
return 0;
@@ -174,21 +176,21 @@ xbitmap_set(
/* Free everything related to this bitmap. */
void
-xbitmap_destroy(
- struct xbitmap *bitmap)
+xbitmap64_destroy(
+ struct xbitmap64 *bitmap)
{
- struct xbitmap_node *bn;
+ struct xbitmap64_node *bn;
- while ((bn = xbitmap_tree_iter_first(&bitmap->xb_root, 0, -1ULL))) {
- xbitmap_tree_remove(bn, &bitmap->xb_root);
+ while ((bn = xbitmap64_tree_iter_first(&bitmap->xb_root, 0, -1ULL))) {
+ xbitmap64_tree_remove(bn, &bitmap->xb_root);
kfree(bn);
}
}
/* Set up a per-AG block bitmap. */
void
-xbitmap_init(
- struct xbitmap *bitmap)
+xbitmap64_init(
+ struct xbitmap64 *bitmap)
{
bitmap->xb_root = RB_ROOT_CACHED;
}
@@ -208,18 +210,18 @@ xbitmap_init(
* This is the logical equivalent of bitmap &= ~sub.
*/
int
-xbitmap_disunion(
- struct xbitmap *bitmap,
- struct xbitmap *sub)
+xbitmap64_disunion(
+ struct xbitmap64 *bitmap,
+ struct xbitmap64 *sub)
{
- struct xbitmap_node *bn;
+ struct xbitmap64_node *bn;
int error;
- if (xbitmap_empty(bitmap) || xbitmap_empty(sub))
+ if (xbitmap64_empty(bitmap) || xbitmap64_empty(sub))
return 0;
- for_each_xbitmap_extent(bn, sub) {
- error = xbitmap_clear(bitmap, bn->bn_start,
+ for_each_xbitmap64_extent(bn, sub) {
+ error = xbitmap64_clear(bitmap, bn->bn_start,
bn->bn_last - bn->bn_start + 1);
if (error)
return error;
@@ -228,88 +230,273 @@ xbitmap_disunion(
return 0;
}
+/* How many bits are set in this bitmap? */
+uint64_t
+xbitmap64_hweight(
+ struct xbitmap64 *bitmap)
+{
+ struct xbitmap64_node *bn;
+ uint64_t ret = 0;
+
+ for_each_xbitmap64_extent(bn, bitmap)
+ ret += bn->bn_last - bn->bn_start + 1;
+
+ return ret;
+}
+
+/* Call a function for every run of set bits in this bitmap. */
+int
+xbitmap64_walk(
+ struct xbitmap64 *bitmap,
+ xbitmap64_walk_fn fn,
+ void *priv)
+{
+ struct xbitmap64_node *bn;
+ int error = 0;
+
+ for_each_xbitmap64_extent(bn, bitmap) {
+ error = fn(bn->bn_start, bn->bn_last - bn->bn_start + 1, priv);
+ if (error)
+ break;
+ }
+
+ return error;
+}
+
+/* Does this bitmap have no bits set at all? */
+bool
+xbitmap64_empty(
+ struct xbitmap64 *bitmap)
+{
+ return bitmap->xb_root.rb_root.rb_node == NULL;
+}
+
+/* Is the start of the range set or clear? And for how long? */
+bool
+xbitmap64_test(
+ struct xbitmap64 *bitmap,
+ uint64_t start,
+ uint64_t *len)
+{
+ struct xbitmap64_node *bn;
+ uint64_t last = start + *len - 1;
+
+ bn = xbitmap64_tree_iter_first(&bitmap->xb_root, start, last);
+ if (!bn)
+ return false;
+ if (bn->bn_start <= start) {
+ if (bn->bn_last < last)
+ *len = bn->bn_last - start + 1;
+ return true;
+ }
+ *len = bn->bn_start - start;
+ return false;
+}
+
+/* u32 bitmap */
+
+struct xbitmap32_node {
+ struct rb_node bn_rbnode;
+
+ /* First set bit of this interval and subtree. */
+ uint32_t bn_start;
+
+ /* Last set bit of this interval. */
+ uint32_t bn_last;
+
+ /* Last set bit of this subtree. Do not touch this. */
+ uint32_t __bn_subtree_last;
+};
+
+/* Define our own interval tree type with uint32_t parameters. */
+
/*
- * Record all btree blocks seen while iterating all records of a btree.
- *
- * We know that the btree query_all function starts at the left edge and walks
- * towards the right edge of the tree. Therefore, we know that we can walk up
- * the btree cursor towards the root; if the pointer for a given level points
- * to the first record/key in that block, we haven't seen this block before;
- * and therefore we need to remember that we saw this block in the btree.
- *
- * So if our btree is:
- *
- * 4
- * / | \
- * 1 2 3
- *
- * Pretend for this example that each leaf block has 100 btree records. For
- * the first btree record, we'll observe that bc_levels[0].ptr == 1, so we
- * record that we saw block 1. Then we observe that bc_levels[1].ptr == 1, so
- * we record block 4. The list is [1, 4].
- *
- * For the second btree record, we see that bc_levels[0].ptr == 2, so we exit
- * the loop. The list remains [1, 4].
- *
- * For the 101st btree record, we've moved onto leaf block 2. Now
- * bc_levels[0].ptr == 1 again, so we record that we saw block 2. We see that
- * bc_levels[1].ptr == 2, so we exit the loop. The list is now [1, 4, 2].
- *
- * For the 102nd record, bc_levels[0].ptr == 2, so we continue.
- *
- * For the 201st record, we've moved on to leaf block 3.
- * bc_levels[0].ptr == 1, so we add 3 to the list. Now it is [1, 4, 2, 3].
- *
- * For the 300th record we just exit, with the list being [1, 4, 2, 3].
+ * These functions are defined by the INTERVAL_TREE_DEFINE macro, but we'll
+ * forward-declare them anyway for clarity.
*/
+static inline void
+xbitmap32_tree_insert(struct xbitmap32_node *node, struct rb_root_cached *root);
-/* Mark a btree block to the agblock bitmap. */
-STATIC int
-xagb_bitmap_visit_btblock(
- struct xfs_btree_cur *cur,
- int level,
- void *priv)
+static inline void
+xbitmap32_tree_remove(struct xbitmap32_node *node, struct rb_root_cached *root);
+
+static inline struct xbitmap32_node *
+xbitmap32_tree_iter_first(struct rb_root_cached *root, uint32_t start,
+ uint32_t last);
+
+static inline struct xbitmap32_node *
+xbitmap32_tree_iter_next(struct xbitmap32_node *node, uint32_t start,
+ uint32_t last);
+
+INTERVAL_TREE_DEFINE(struct xbitmap32_node, bn_rbnode, uint32_t,
+ __bn_subtree_last, START, LAST, static inline, xbitmap32_tree)
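+
+/*
+ * Illustration (not from the original patch): START and LAST above are
+ * assumed to be the usual interval tree accessor macros, presumably
+ * defined earlier in this file for the u64 variant, along the lines of:
+ *
+ * #define START(node) ((node)->bn_start)
+ * #define LAST(node) ((node)->bn_last)
+ */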
+
+/* Iterate each interval of a bitmap. Do not change the bitmap. */
+#define for_each_xbitmap32_extent(bn, bitmap) \
+ for ((bn) = rb_entry_safe(rb_first(&(bitmap)->xb_root.rb_root), \
+ struct xbitmap32_node, bn_rbnode); \
+ (bn) != NULL; \
+ (bn) = rb_entry_safe(rb_next(&(bn)->bn_rbnode), \
+ struct xbitmap32_node, bn_rbnode))
+
+/* Clear a range of this bitmap. */
+int
+xbitmap32_clear(
+ struct xbitmap32 *bitmap,
+ uint32_t start,
+ uint32_t len)
{
- struct xagb_bitmap *bitmap = priv;
- struct xfs_buf *bp;
- xfs_fsblock_t fsbno;
- xfs_agblock_t agbno;
+ struct xbitmap32_node *bn;
+ struct xbitmap32_node *new_bn;
+ uint32_t last = start + len - 1;
- xfs_btree_get_block(cur, level, &bp);
- if (!bp)
- return 0;
+ while ((bn = xbitmap32_tree_iter_first(&bitmap->xb_root, start, last))) {
+ if (bn->bn_start < start && bn->bn_last > last) {
+ uint32_t old_last = bn->bn_last;
- fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp));
- agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
+ /* overlaps with the entire clearing range */
+ xbitmap32_tree_remove(bn, &bitmap->xb_root);
+ bn->bn_last = start - 1;
+ xbitmap32_tree_insert(bn, &bitmap->xb_root);
- return xagb_bitmap_set(bitmap, agbno, 1);
+ /* add an extent */
+ new_bn = kmalloc(sizeof(struct xbitmap32_node),
+ XCHK_GFP_FLAGS);
+ if (!new_bn)
+ return -ENOMEM;
+ new_bn->bn_start = last + 1;
+ new_bn->bn_last = old_last;
+ xbitmap32_tree_insert(new_bn, &bitmap->xb_root);
+ } else if (bn->bn_start < start) {
+ /* overlaps with the left side of the clearing range */
+ xbitmap32_tree_remove(bn, &bitmap->xb_root);
+ bn->bn_last = start - 1;
+ xbitmap32_tree_insert(bn, &bitmap->xb_root);
+ } else if (bn->bn_last > last) {
+ /* overlaps with the right side of the clearing range */
+ xbitmap32_tree_remove(bn, &bitmap->xb_root);
+ bn->bn_start = last + 1;
+ xbitmap32_tree_insert(bn, &bitmap->xb_root);
+ break;
+ } else {
+ /* in the middle of the clearing range */
+ xbitmap32_tree_remove(bn, &bitmap->xb_root);
+ kfree(bn);
+ }
+ }
+
+ return 0;
}
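
/*
 * Worked example (editor's illustration, not part of the patch): if the
 * bitmap holds the single extent [0, 31], xbitmap32_clear(bitmap, 10, 5)
 * computes last = 14, takes the first branch above, trims the node down
 * to [0, 9], and inserts a new node covering [15, 31].
 */
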
-/* Mark all (per-AG) btree blocks in the agblock bitmap. */
+/* Set a range of this bitmap. */
int
-xagb_bitmap_set_btblocks(
- struct xagb_bitmap *bitmap,
- struct xfs_btree_cur *cur)
+xbitmap32_set(
+ struct xbitmap32 *bitmap,
+ uint32_t start,
+ uint32_t len)
{
- return xfs_btree_visit_blocks(cur, xagb_bitmap_visit_btblock,
- XFS_BTREE_VISIT_ALL, bitmap);
+ struct xbitmap32_node *left;
+ struct xbitmap32_node *right;
+ uint32_t last = start + len - 1;
+ int error;
+
+ /* Is this whole range already set? */
+ left = xbitmap32_tree_iter_first(&bitmap->xb_root, start, last);
+ if (left && left->bn_start <= start && left->bn_last >= last)
+ return 0;
+
+ /* Clear out everything in the range we want to set. */
+ error = xbitmap32_clear(bitmap, start, len);
+ if (error)
+ return error;
+
+ /* Do we have a left-adjacent extent? */
+ left = xbitmap32_tree_iter_first(&bitmap->xb_root, start - 1, start - 1);
+ ASSERT(!left || left->bn_last + 1 == start);
+
+ /* Do we have a right-adjacent extent? */
+ right = xbitmap32_tree_iter_first(&bitmap->xb_root, last + 1, last + 1);
+ ASSERT(!right || right->bn_start == last + 1);
+
+ if (left && right) {
+ /* combine left and right adjacent extent */
+ xbitmap32_tree_remove(left, &bitmap->xb_root);
+ xbitmap32_tree_remove(right, &bitmap->xb_root);
+ left->bn_last = right->bn_last;
+ xbitmap32_tree_insert(left, &bitmap->xb_root);
+ kfree(right);
+ } else if (left) {
+ /* combine with left extent */
+ xbitmap32_tree_remove(left, &bitmap->xb_root);
+ left->bn_last = last;
+ xbitmap32_tree_insert(left, &bitmap->xb_root);
+ } else if (right) {
+ /* combine with right extent */
+ xbitmap32_tree_remove(right, &bitmap->xb_root);
+ right->bn_start = start;
+ xbitmap32_tree_insert(right, &bitmap->xb_root);
+ } else {
+ /* add an extent */
+ left = kmalloc(sizeof(struct xbitmap32_node), XCHK_GFP_FLAGS);
+ if (!left)
+ return -ENOMEM;
+ left->bn_start = start;
+ left->bn_last = last;
+ xbitmap32_tree_insert(left, &bitmap->xb_root);
+ }
+
+ return 0;
+}
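+
+/*
+ * Worked example (editor's illustration, not part of the patch): with
+ * [5, 9] and [15, 19] already set, xbitmap32_set(bitmap, 10, 5) finds
+ * [5, 9] left-adjacent and [15, 19] right-adjacent, so the two nodes are
+ * merged into a single [5, 19] extent and the right node is freed.
+ */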
+
+/* Free everything related to this bitmap. */
+void
+xbitmap32_destroy(
+ struct xbitmap32 *bitmap)
+{
+ struct xbitmap32_node *bn;
+
+ while ((bn = xbitmap32_tree_iter_first(&bitmap->xb_root, 0, -1U))) {
+ xbitmap32_tree_remove(bn, &bitmap->xb_root);
+ kfree(bn);
+ }
+}
+
+/* Set up a u32 bitmap. */
+void
+xbitmap32_init(
+ struct xbitmap32 *bitmap)
+{
+ bitmap->xb_root = RB_ROOT_CACHED;
}
/*
- * Record all the buffers pointed to by the btree cursor. Callers already
- * engaged in a btree walk should call this function to capture the list of
- * blocks going from the leaf towards the root.
+ * Remove all the blocks mentioned in @sub from the extents in @bitmap.
+ *
+ * The intent is that callers will iterate the rmapbt for all of its records
+ * for a given owner to generate @bitmap; and iterate all the blocks of the
+ * metadata structures that are not being rebuilt and have the same rmapbt
+ * owner to generate @sub. This routine subtracts all the extents
+ * mentioned in sub from all the extents linked in @bitmap, which leaves
+ * @bitmap as the list of blocks that are not accounted for, which we assume
+ * are the dead blocks of the old metadata structure. The blocks mentioned in
+ * @bitmap can be reaped.
+ *
+ * This is the logical equivalent of bitmap &= ~sub.
*/
int
-xagb_bitmap_set_btcur_path(
- struct xagb_bitmap *bitmap,
- struct xfs_btree_cur *cur)
+xbitmap32_disunion(
+ struct xbitmap32 *bitmap,
+ struct xbitmap32 *sub)
{
- int i;
+ struct xbitmap32_node *bn;
int error;
- for (i = 0; i < cur->bc_nlevels && cur->bc_levels[i].ptr == 1; i++) {
- error = xagb_bitmap_visit_btblock(cur, i, bitmap);
+ if (xbitmap32_empty(bitmap) || xbitmap32_empty(sub))
+ return 0;
+
+ for_each_xbitmap32_extent(bn, sub) {
+ error = xbitmap32_clear(bitmap, bn->bn_start,
+ bn->bn_last - bn->bn_start + 1);
if (error)
return error;
}
@@ -318,14 +505,14 @@ xagb_bitmap_set_btcur_path(
}
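
/*
 * Usage sketch (editor's illustration, not part of the patch; error
 * handling elided). The disunion helper supports reap-style logic:
 *
 * struct xbitmap32 owned, in_use;
 *
 * xbitmap32_init(&owned);
 * xbitmap32_init(&in_use);
 * xbitmap32_set(&owned, 10, 20);       (blocks 10..29 owned)
 * xbitmap32_set(&in_use, 15, 5);       (blocks 15..19 still live)
 * xbitmap32_disunion(&owned, &in_use); (owned is now 10..14 and 20..29)
 * (reap whatever extents remain in owned, then clean up)
 * xbitmap32_destroy(&in_use);
 * xbitmap32_destroy(&owned);
 */
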
/* How many bits are set in this bitmap? */
-uint64_t
-xbitmap_hweight(
- struct xbitmap *bitmap)
+uint32_t
+xbitmap32_hweight(
+ struct xbitmap32 *bitmap)
{
- struct xbitmap_node *bn;
- uint64_t ret = 0;
+ struct xbitmap32_node *bn;
+ uint32_t ret = 0;
- for_each_xbitmap_extent(bn, bitmap)
+ for_each_xbitmap32_extent(bn, bitmap)
ret += bn->bn_last - bn->bn_start + 1;
return ret;
@@ -333,15 +520,15 @@ xbitmap_hweight(
/* Call a function for every run of set bits in this bitmap. */
int
-xbitmap_walk(
- struct xbitmap *bitmap,
- xbitmap_walk_fn fn,
+xbitmap32_walk(
+ struct xbitmap32 *bitmap,
+ xbitmap32_walk_fn fn,
void *priv)
{
- struct xbitmap_node *bn;
+ struct xbitmap32_node *bn;
int error = 0;
- for_each_xbitmap_extent(bn, bitmap) {
+ for_each_xbitmap32_extent(bn, bitmap) {
error = fn(bn->bn_start, bn->bn_last - bn->bn_start + 1, priv);
if (error)
break;
@@ -352,23 +539,23 @@ xbitmap_walk(
/* Does this bitmap have no bits set at all? */
bool
-xbitmap_empty(
- struct xbitmap *bitmap)
+xbitmap32_empty(
+ struct xbitmap32 *bitmap)
{
return bitmap->xb_root.rb_root.rb_node == NULL;
}
/* Is the start of the range set or clear? And for how long? */
bool
-xbitmap_test(
- struct xbitmap *bitmap,
- uint64_t start,
- uint64_t *len)
+xbitmap32_test(
+ struct xbitmap32 *bitmap,
+ uint32_t start,
+ uint32_t *len)
{
- struct xbitmap_node *bn;
- uint64_t last = start + *len - 1;
+ struct xbitmap32_node *bn;
+ uint32_t last = start + *len - 1;
- bn = xbitmap_tree_iter_first(&bitmap->xb_root, start, last);
+ bn = xbitmap32_tree_iter_first(&bitmap->xb_root, start, last);
if (!bn)
return false;
if (bn->bn_start <= start) {
diff --git a/fs/xfs/scrub/bitmap.h b/fs/xfs/scrub/bitmap.h
index 4fe58bad6734..2df8911606d6 100644
--- a/fs/xfs/scrub/bitmap.h
+++ b/fs/xfs/scrub/bitmap.h
@@ -6,17 +6,19 @@
#ifndef __XFS_SCRUB_BITMAP_H__
#define __XFS_SCRUB_BITMAP_H__
-struct xbitmap {
+/* u64 bitmap */
+
+struct xbitmap64 {
struct rb_root_cached xb_root;
};
-void xbitmap_init(struct xbitmap *bitmap);
-void xbitmap_destroy(struct xbitmap *bitmap);
+void xbitmap64_init(struct xbitmap64 *bitmap);
+void xbitmap64_destroy(struct xbitmap64 *bitmap);
-int xbitmap_clear(struct xbitmap *bitmap, uint64_t start, uint64_t len);
-int xbitmap_set(struct xbitmap *bitmap, uint64_t start, uint64_t len);
-int xbitmap_disunion(struct xbitmap *bitmap, struct xbitmap *sub);
-uint64_t xbitmap_hweight(struct xbitmap *bitmap);
+int xbitmap64_clear(struct xbitmap64 *bitmap, uint64_t start, uint64_t len);
+int xbitmap64_set(struct xbitmap64 *bitmap, uint64_t start, uint64_t len);
+int xbitmap64_disunion(struct xbitmap64 *bitmap, struct xbitmap64 *sub);
+uint64_t xbitmap64_hweight(struct xbitmap64 *bitmap);
/*
* Return codes for the bitmap iterator functions are 0 to continue iterating,
@@ -25,84 +27,39 @@ uint64_t xbitmap_hweight(struct xbitmap *bitmap);
* iteration, because neither bitmap iterator ever generates that error code on
* its own. Callers must not modify the bitmap while walking it.
*/
-typedef int (*xbitmap_walk_fn)(uint64_t start, uint64_t len, void *priv);
-int xbitmap_walk(struct xbitmap *bitmap, xbitmap_walk_fn fn,
+typedef int (*xbitmap64_walk_fn)(uint64_t start, uint64_t len, void *priv);
+int xbitmap64_walk(struct xbitmap64 *bitmap, xbitmap64_walk_fn fn,
void *priv);
-bool xbitmap_empty(struct xbitmap *bitmap);
-bool xbitmap_test(struct xbitmap *bitmap, uint64_t start, uint64_t *len);
+bool xbitmap64_empty(struct xbitmap64 *bitmap);
+bool xbitmap64_test(struct xbitmap64 *bitmap, uint64_t start, uint64_t *len);
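+
+/*
+ * Example walk function (illustration, not part of the patch): count the
+ * runs of set bits in a bitmap. Returning 0 continues the walk; any
+ * nonzero value (conventionally -ECANCELED) stops it early.
+ *
+ * static int count_runs(uint64_t start, uint64_t len, void *priv)
+ * {
+ *	unsigned int *nr = priv;
+ *
+ *	(*nr)++;
+ *	return 0;
+ * }
+ *
+ * unsigned int nr = 0;
+ * int error = xbitmap64_walk(bitmap, count_runs, &nr);
+ */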
-/* Bitmaps, but for type-checked for xfs_agblock_t */
+/* u32 bitmap */
-struct xagb_bitmap {
- struct xbitmap agbitmap;
+struct xbitmap32 {
+ struct rb_root_cached xb_root;
};
-static inline void xagb_bitmap_init(struct xagb_bitmap *bitmap)
-{
- xbitmap_init(&bitmap->agbitmap);
-}
-
-static inline void xagb_bitmap_destroy(struct xagb_bitmap *bitmap)
-{
- xbitmap_destroy(&bitmap->agbitmap);
-}
-
-static inline int xagb_bitmap_clear(struct xagb_bitmap *bitmap,
- xfs_agblock_t start, xfs_extlen_t len)
-{
- return xbitmap_clear(&bitmap->agbitmap, start, len);
-}
-static inline int xagb_bitmap_set(struct xagb_bitmap *bitmap,
- xfs_agblock_t start, xfs_extlen_t len)
-{
- return xbitmap_set(&bitmap->agbitmap, start, len);
-}
-
-static inline bool
-xagb_bitmap_test(
- struct xagb_bitmap *bitmap,
- xfs_agblock_t start,
- xfs_extlen_t *len)
-{
- uint64_t biglen = *len;
- bool ret;
-
- ret = xbitmap_test(&bitmap->agbitmap, start, &biglen);
-
- if (start + biglen >= UINT_MAX) {
- ASSERT(0);
- biglen = UINT_MAX - start;
- }
-
- *len = biglen;
- return ret;
-}
-
-static inline int xagb_bitmap_disunion(struct xagb_bitmap *bitmap,
- struct xagb_bitmap *sub)
-{
- return xbitmap_disunion(&bitmap->agbitmap, &sub->agbitmap);
-}
+void xbitmap32_init(struct xbitmap32 *bitmap);
+void xbitmap32_destroy(struct xbitmap32 *bitmap);
-static inline uint32_t xagb_bitmap_hweight(struct xagb_bitmap *bitmap)
-{
- return xbitmap_hweight(&bitmap->agbitmap);
-}
-static inline bool xagb_bitmap_empty(struct xagb_bitmap *bitmap)
-{
- return xbitmap_empty(&bitmap->agbitmap);
-}
+int xbitmap32_clear(struct xbitmap32 *bitmap, uint32_t start, uint32_t len);
+int xbitmap32_set(struct xbitmap32 *bitmap, uint32_t start, uint32_t len);
+int xbitmap32_disunion(struct xbitmap32 *bitmap, struct xbitmap32 *sub);
+uint32_t xbitmap32_hweight(struct xbitmap32 *bitmap);
-static inline int xagb_bitmap_walk(struct xagb_bitmap *bitmap,
- xbitmap_walk_fn fn, void *priv)
-{
- return xbitmap_walk(&bitmap->agbitmap, fn, priv);
-}
+/*
+ * Return codes for the bitmap iterator functions are 0 to continue iterating,
+ * and non-zero to stop iterating. Any non-zero value will be passed up to the
+ * iteration caller. The special value -ECANCELED can be used to stop
+ * iteration, because neither bitmap iterator ever generates that error code on
+ * its own. Callers must not modify the bitmap while walking it.
+ */
+typedef int (*xbitmap32_walk_fn)(uint32_t start, uint32_t len, void *priv);
+int xbitmap32_walk(struct xbitmap32 *bitmap, xbitmap32_walk_fn fn,
+ void *priv);
-int xagb_bitmap_set_btblocks(struct xagb_bitmap *bitmap,
- struct xfs_btree_cur *cur);
-int xagb_bitmap_set_btcur_path(struct xagb_bitmap *bitmap,
- struct xfs_btree_cur *cur);
+bool xbitmap32_empty(struct xbitmap32 *bitmap);
+bool xbitmap32_test(struct xbitmap32 *bitmap, uint32_t start, uint32_t *len);
#endif /* __XFS_SCRUB_BITMAP_H__ */
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index 06d8c1996a33..b169cddde6da 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -19,9 +19,11 @@
#include "xfs_bmap_btree.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
+#include "xfs_health.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
+#include "scrub/health.h"
#include "xfs_ag.h"
/* Set us up with an inode's bmap. */
@@ -48,9 +50,18 @@ xchk_setup_inode_bmap(
if (S_ISREG(VFS_I(sc->ip)->i_mode) &&
sc->sm->sm_type != XFS_SCRUB_TYPE_BMBTA) {
struct address_space *mapping = VFS_I(sc->ip)->i_mapping;
+ bool is_repair = xchk_could_repair(sc);
xchk_ilock(sc, XFS_MMAPLOCK_EXCL);
+ /* Break all our leases, we're going to mess with things. */
+ if (is_repair) {
+ error = xfs_break_layouts(VFS_I(sc->ip),
+ &sc->ilock_flags, BREAK_WRITE);
+ if (error)
+ goto out;
+ }
+
inode_dio_wait(VFS_I(sc->ip));
/*
@@ -71,6 +82,15 @@ xchk_setup_inode_bmap(
error = filemap_fdatawait_keep_errors(mapping);
if (error && (error != -ENOSPC && error != -EIO))
goto out;
+
+ /* Drop the page cache if we're repairing block mappings. */
+ if (is_repair) {
+ error = invalidate_inode_pages2(
+ VFS_I(sc->ip)->i_mapping);
+ if (error)
+ goto out;
+ }
+
}
/* Got the inode, lock it and we're ready to go. */
@@ -78,6 +98,10 @@ xchk_setup_inode_bmap(
if (error)
goto out;
+ error = xchk_ino_dqattach(sc);
+ if (error)
+ goto out;
+
xchk_ilock(sc, XFS_ILOCK_EXCL);
out:
/* scrub teardown will unlock and release the inode */
@@ -633,6 +657,82 @@ xchk_bmap_check_ag_rmaps(
}
/*
+ * Decide if we want to scan the reverse mappings to determine if the attr
+ * fork /really/ has zero space mappings.
+ */
+STATIC bool
+xchk_bmap_check_empty_attrfork(
+ struct xfs_inode *ip)
+{
+ struct xfs_ifork *ifp = &ip->i_af;
+
+ /*
+ * If the dinode repair found a bad attr fork, it will reset the fork
+ * to extents format with zero records and wait for this scrubber
+ * to reconstruct the block mappings. If the fork is not in this
+ * state, then the fork cannot have been zapped.
+ */
+ if (ifp->if_format != XFS_DINODE_FMT_EXTENTS || ifp->if_nextents != 0)
+ return false;
+
+ /*
+ * Files can have an attr fork in EXTENTS format with zero records for
+ * several reasons:
+ *
+ * a) an attr set created a fork but ran out of space
+ * b) attr replace deleted an old attr but failed during the set step
+ * c) the data fork was in btree format when all attrs were deleted, so
+ * the fork was left in place
+ * d) the inode repair code zapped the fork
+ *
+ * Only in case (d) do we want to scan the rmapbt to see if we need to
+ * rebuild the attr fork. The fork zap code clears all DAC permission
+ * bits and zeroes the uid and gid, so avoid the scan if any of those
+ * three conditions are not met.
+ */
+ if ((VFS_I(ip)->i_mode & 0777) != 0)
+ return false;
+ if (!uid_eq(VFS_I(ip)->i_uid, GLOBAL_ROOT_UID))
+ return false;
+ if (!gid_eq(VFS_I(ip)->i_gid, GLOBAL_ROOT_GID))
+ return false;
+
+ return true;
+}
+
+/*
+ * Decide if we want to scan the reverse mappings to determine if the data
+ * fork /really/ has zero space mappings.
+ */
+STATIC bool
+xchk_bmap_check_empty_datafork(
+ struct xfs_inode *ip)
+{
+ struct xfs_ifork *ifp = &ip->i_df;
+
+ /* Don't support realtime rmap checks yet. */
+ if (XFS_IS_REALTIME_INODE(ip))
+ return false;
+
+ /*
+ * If the dinode repair found a bad data fork, it will reset the fork
+ * to extents format with zero records and wait for this scrubber
+ * to reconstruct the block mappings. If the fork is not in this
+ * state, then the fork cannot have been zapped.
+ */
+ if (ifp->if_format != XFS_DINODE_FMT_EXTENTS || ifp->if_nextents != 0)
+ return false;
+
+ /*
+ * If we encounter an empty data fork along with evidence that the fork
+ * might not really be empty, we need to scan the reverse mappings to
+ * decide if we're going to rebuild the fork. Data forks with nonzero
+ * file size are scanned.
+ */
+ return i_size_read(VFS_I(ip)) != 0;
+}
+
+/*
* Decide if we want to walk every rmap btree in the fs to make sure that each
* rmap for this file fork has corresponding bmbt entries.
*/
@@ -641,7 +741,6 @@ xchk_bmap_want_check_rmaps(
struct xchk_bmap_info *info)
{
struct xfs_scrub *sc = info->sc;
- struct xfs_ifork *ifp;
if (!xfs_has_rmapbt(sc->mp))
return false;
@@ -650,28 +749,10 @@ xchk_bmap_want_check_rmaps(
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
return false;
- /* Don't support realtime rmap checks yet. */
- if (info->is_rt)
- return false;
-
- /*
- * The inode repair code zaps broken inode forks by resetting them back
- * to EXTENTS format and zero extent records. If we encounter a fork
- * in this state along with evidence that the fork isn't supposed to be
- * empty, we need to scan the reverse mappings to decide if we're going
- * to rebuild the fork. Data forks with nonzero file size are scanned.
- * xattr forks are never empty of content, so they are always scanned.
- */
- ifp = xfs_ifork_ptr(sc->ip, info->whichfork);
- if (ifp->if_format == XFS_DINODE_FMT_EXTENTS && ifp->if_nextents == 0) {
- if (info->whichfork == XFS_DATA_FORK &&
- i_size_read(VFS_I(sc->ip)) == 0)
- return false;
-
- return true;
- }
+ if (info->whichfork == XFS_ATTR_FORK)
+ return xchk_bmap_check_empty_attrfork(sc->ip);
- return false;
+ return xchk_bmap_check_empty_datafork(sc->ip);
}
/* Make sure each rmap has a corresponding bmbt entry. */
@@ -939,7 +1020,20 @@ int
xchk_bmap_data(
struct xfs_scrub *sc)
{
- return xchk_bmap(sc, XFS_DATA_FORK);
+ int error;
+
+ if (xchk_file_looks_zapped(sc, XFS_SICK_INO_BMBTD_ZAPPED)) {
+ xchk_ino_set_corrupt(sc, sc->ip->i_ino);
+ return 0;
+ }
+
+ error = xchk_bmap(sc, XFS_DATA_FORK);
+ if (error)
+ return error;
+
+ /* If the data fork is clean, it is clearly not zapped. */
+ xchk_mark_healthy_if_clean(sc, XFS_SICK_INO_BMBTD_ZAPPED);
+ return 0;
}
/* Scrub an inode's attr fork. */
@@ -947,7 +1041,27 @@ int
xchk_bmap_attr(
struct xfs_scrub *sc)
{
- return xchk_bmap(sc, XFS_ATTR_FORK);
+ int error;
+
+ /*
+ * If the attr fork has been zapped, it's possible that forkoff was
+ * reset to zero and hence sc->ip->i_afp is NULL. We don't want the
+ * NULL ifp check in xchk_bmap to conclude that the attr fork is ok,
+ * so short circuit that logic by setting the corruption flag and
+ * returning immediately.
+ */
+ if (xchk_file_looks_zapped(sc, XFS_SICK_INO_BMBTA_ZAPPED)) {
+ xchk_ino_set_corrupt(sc, sc->ip->i_ino);
+ return 0;
+ }
+
+ error = xchk_bmap(sc, XFS_ATTR_FORK);
+ if (error)
+ return error;
+
+ /* If the attr fork is clean, it is clearly not zapped. */
+ xchk_mark_healthy_if_clean(sc, XFS_SICK_INO_BMBTA_ZAPPED);
+ return 0;
}
/* Scrub an inode's CoW fork. */
diff --git a/fs/xfs/scrub/bmap_repair.c b/fs/xfs/scrub/bmap_repair.c
new file mode 100644
index 000000000000..a4bb89fdd510
--- /dev/null
+++ b/fs/xfs/scrub/bmap_repair.c
@@ -0,0 +1,867 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2018-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_inode_fork.h"
+#include "xfs_alloc.h"
+#include "xfs_rtalloc.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_refcount.h"
+#include "xfs_quota.h"
+#include "xfs_ialloc.h"
+#include "xfs_ag.h"
+#include "xfs_reflink.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/bitmap.h"
+#include "scrub/fsb_bitmap.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/newbt.h"
+#include "scrub/reap.h"
+
+/*
+ * Inode Fork Block Mapping (BMBT) Repair
+ * ======================================
+ *
+ * Gather all the rmap records for the inode and fork we're fixing, reset the
+ * incore fork, then recreate the btree.
+ */
+
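+/*
+ * Rough call graph of the repair below (editor's summary of this file):
+ *
+ * xrep_bmap(sc, whichfork, allow_unwritten)
+ *     xrep_bmap_check_inputs()      reject forks we cannot rebuild
+ *     xrep_bmap_find_mappings()     rmapbt scan plus delalloc scan
+ *     xrep_bmap_build_new_fork()    sort records, stage and commit fork
+ *     xrep_bmap_remove_old_tree()   reap the old bmbt blocks
+ */
+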
+enum reflink_scan_state {
+ RLS_IRRELEVANT = -1, /* not applicable to this file */
+ RLS_UNKNOWN, /* shared extent scans required */
+ RLS_SET_IFLAG, /* iflag must be set */
+};
+
+struct xrep_bmap {
+ /* Old bmbt blocks */
+ struct xfsb_bitmap old_bmbt_blocks;
+
+ /* New fork. */
+ struct xrep_newbt new_bmapbt;
+
+ /* List of new bmap records. */
+ struct xfarray *bmap_records;
+
+ struct xfs_scrub *sc;
+
+ /* How many blocks did we find allocated to this file? */
+ xfs_rfsblock_t nblocks;
+
+ /* How many bmbt blocks did we find for this fork? */
+ xfs_rfsblock_t old_bmbt_block_count;
+
+ /* get_records()'s position in the free space record array. */
+ xfarray_idx_t array_cur;
+
+ /* How many real (non-hole, non-delalloc) mappings do we have? */
+ uint64_t real_mappings;
+
+ /* Which fork are we fixing? */
+ int whichfork;
+
+ /* Should the REFLINK flag be set when the repair is over? */
+ enum reflink_scan_state reflink_scan;
+
+ /* Do we allow unwritten extents? */
+ bool allow_unwritten;
+};
+
+/* Is this space extent shared? Flag the inode if it is. */
+STATIC int
+xrep_bmap_discover_shared(
+ struct xrep_bmap *rb,
+ xfs_fsblock_t startblock,
+ xfs_filblks_t blockcount)
+{
+ struct xfs_scrub *sc = rb->sc;
+ xfs_agblock_t agbno;
+ xfs_agblock_t fbno;
+ xfs_extlen_t flen;
+ int error;
+
+ agbno = XFS_FSB_TO_AGBNO(sc->mp, startblock);
+ error = xfs_refcount_find_shared(sc->sa.refc_cur, agbno, blockcount,
+ &fbno, &flen, false);
+ if (error)
+ return error;
+
+ if (fbno != NULLAGBLOCK)
+ rb->reflink_scan = RLS_SET_IFLAG;
+
+ return 0;
+}
+
+/* Remember this reverse-mapping as a series of bmap records. */
+STATIC int
+xrep_bmap_from_rmap(
+ struct xrep_bmap *rb,
+ xfs_fileoff_t startoff,
+ xfs_fsblock_t startblock,
+ xfs_filblks_t blockcount,
+ bool unwritten)
+{
+ struct xfs_bmbt_irec irec = {
+ .br_startoff = startoff,
+ .br_startblock = startblock,
+ .br_state = unwritten ? XFS_EXT_UNWRITTEN : XFS_EXT_NORM,
+ };
+ struct xfs_bmbt_rec rbe;
+ struct xfs_scrub *sc = rb->sc;
+ int error = 0;
+
+ /*
+ * If we're repairing the data fork of a non-reflinked regular file on
+ * a reflink filesystem, we need to figure out if this space extent is
+ * shared.
+ */
+ if (rb->reflink_scan == RLS_UNKNOWN && !unwritten) {
+ error = xrep_bmap_discover_shared(rb, startblock, blockcount);
+ if (error)
+ return error;
+ }
+
+ do {
+ xfs_failaddr_t fa;
+
+ irec.br_blockcount = min_t(xfs_filblks_t, blockcount,
+ XFS_MAX_BMBT_EXTLEN);
+
+ fa = xfs_bmap_validate_extent(sc->ip, rb->whichfork, &irec);
+ if (fa)
+ return -EFSCORRUPTED;
+
+ xfs_bmbt_disk_set_all(&rbe, &irec);
+
+ trace_xrep_bmap_found(sc->ip, rb->whichfork, &irec);
+
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ error = xfarray_append(rb->bmap_records, &rbe);
+ if (error)
+ return error;
+
+ rb->real_mappings++;
+
+ irec.br_startblock += irec.br_blockcount;
+ irec.br_startoff += irec.br_blockcount;
+ blockcount -= irec.br_blockcount;
+ } while (blockcount > 0);
+
+ return 0;
+}
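+
+/*
+ * Worked example (editor's illustration, not part of the patch): a single
+ * rmap record covering 3,000,000 blocks is emitted by the loop above as
+ * two bmap records, one of XFS_MAX_BMBT_EXTLEN (2097151) blocks and one
+ * of the remaining 902849 blocks, each validated separately.
+ */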
+
+/* Check for any obvious errors or conflicts in the file mapping. */
+STATIC int
+xrep_bmap_check_fork_rmap(
+ struct xrep_bmap *rb,
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec)
+{
+ struct xfs_scrub *sc = rb->sc;
+ enum xbtree_recpacking outcome;
+ int error;
+
+ /*
+ * Data extents for rt files are never stored on the data device, but
+ * everything else (xattrs, bmbt blocks) can be.
+ */
+ if (XFS_IS_REALTIME_INODE(sc->ip) &&
+ !(rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)))
+ return -EFSCORRUPTED;
+
+ /* Check that this is within the AG. */
+ if (!xfs_verify_agbext(cur->bc_ag.pag, rec->rm_startblock,
+ rec->rm_blockcount))
+ return -EFSCORRUPTED;
+
+ /* Check the file offset range. */
+ if (!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) &&
+ !xfs_verify_fileext(sc->mp, rec->rm_offset, rec->rm_blockcount))
+ return -EFSCORRUPTED;
+
+ /* No contradictory flags. */
+ if ((rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)) &&
+ (rec->rm_flags & XFS_RMAP_UNWRITTEN))
+ return -EFSCORRUPTED;
+
+ /* Make sure this isn't free space. */
+ error = xfs_alloc_has_records(sc->sa.bno_cur, rec->rm_startblock,
+ rec->rm_blockcount, &outcome);
+ if (error)
+ return error;
+ if (outcome != XBTREE_RECPACKING_EMPTY)
+ return -EFSCORRUPTED;
+
+ /* Must not be an inode chunk. */
+ error = xfs_ialloc_has_inodes_at_extent(sc->sa.ino_cur,
+ rec->rm_startblock, rec->rm_blockcount, &outcome);
+ if (error)
+ return error;
+ if (outcome != XBTREE_RECPACKING_EMPTY)
+ return -EFSCORRUPTED;
+
+ return 0;
+}
+
+/* Record extents that belong to this inode's fork. */
+STATIC int
+xrep_bmap_walk_rmap(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xrep_bmap *rb = priv;
+ struct xfs_mount *mp = cur->bc_mp;
+ xfs_fsblock_t fsbno;
+ int error = 0;
+
+ if (xchk_should_terminate(rb->sc, &error))
+ return error;
+
+ if (rec->rm_owner != rb->sc->ip->i_ino)
+ return 0;
+
+ error = xrep_bmap_check_fork_rmap(rb, cur, rec);
+ if (error)
+ return error;
+
+ /*
+ * Record all blocks allocated to this file even if the extent isn't
+ * for the fork we're rebuilding so that we can reset di_nblocks later.
+ */
+ rb->nblocks += rec->rm_blockcount;
+
+ /* If this rmap isn't for the fork we want, we're done. */
+ if (rb->whichfork == XFS_DATA_FORK &&
+ (rec->rm_flags & XFS_RMAP_ATTR_FORK))
+ return 0;
+ if (rb->whichfork == XFS_ATTR_FORK &&
+ !(rec->rm_flags & XFS_RMAP_ATTR_FORK))
+ return 0;
+
+ /* Reject unwritten extents if we don't allow those. */
+ if ((rec->rm_flags & XFS_RMAP_UNWRITTEN) && !rb->allow_unwritten)
+ return -EFSCORRUPTED;
+
+ fsbno = XFS_AGB_TO_FSB(mp, cur->bc_ag.pag->pag_agno,
+ rec->rm_startblock);
+
+ if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK) {
+ rb->old_bmbt_block_count += rec->rm_blockcount;
+ return xfsb_bitmap_set(&rb->old_bmbt_blocks, fsbno,
+ rec->rm_blockcount);
+ }
+
+ return xrep_bmap_from_rmap(rb, rec->rm_offset, fsbno,
+ rec->rm_blockcount,
+ rec->rm_flags & XFS_RMAP_UNWRITTEN);
+}
+
+/*
+ * Compare two block mapping records. We want to sort in order of increasing
+ * file offset.
+ */
+static int
+xrep_bmap_extent_cmp(
+ const void *a,
+ const void *b)
+{
+ const struct xfs_bmbt_rec *ba = a;
+ const struct xfs_bmbt_rec *bb = b;
+ xfs_fileoff_t ao = xfs_bmbt_disk_get_startoff(ba);
+ xfs_fileoff_t bo = xfs_bmbt_disk_get_startoff(bb);
+
+ if (ao > bo)
+ return 1;
+ else if (ao < bo)
+ return -1;
+ return 0;
+}
+
+/*
+ * Sort the bmap extents by fork offset or else the records will be in the
+ * wrong order. Ensure there are no overlaps in the file offset ranges.
+ */
+STATIC int
+xrep_bmap_sort_records(
+ struct xrep_bmap *rb)
+{
+ struct xfs_bmbt_irec irec;
+ xfs_fileoff_t next_off = 0;
+ xfarray_idx_t array_cur;
+ int error;
+
+ error = xfarray_sort(rb->bmap_records, xrep_bmap_extent_cmp,
+ XFARRAY_SORT_KILLABLE);
+ if (error)
+ return error;
+
+ foreach_xfarray_idx(rb->bmap_records, array_cur) {
+ struct xfs_bmbt_rec rec;
+
+ if (xchk_should_terminate(rb->sc, &error))
+ return error;
+
+ error = xfarray_load(rb->bmap_records, array_cur, &rec);
+ if (error)
+ return error;
+
+ xfs_bmbt_disk_get_all(&rec, &irec);
+
+ if (irec.br_startoff < next_off)
+ return -EFSCORRUPTED;
+
+ next_off = irec.br_startoff + irec.br_blockcount;
+ }
+
+ return 0;
+}
+
+/* Scan one AG for reverse mappings that we can turn into extent maps. */
+STATIC int
+xrep_bmap_scan_ag(
+ struct xrep_bmap *rb,
+ struct xfs_perag *pag)
+{
+ struct xfs_scrub *sc = rb->sc;
+ int error;
+
+ error = xrep_ag_init(sc, pag, &sc->sa);
+ if (error)
+ return error;
+
+ error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_bmap_walk_rmap, rb);
+ xchk_ag_free(sc, &sc->sa);
+ return error;
+}
+
+/* Find the delalloc extents from the old incore extent tree. */
+STATIC int
+xrep_bmap_find_delalloc(
+ struct xrep_bmap *rb)
+{
+ struct xfs_bmbt_irec irec;
+ struct xfs_iext_cursor icur;
+ struct xfs_bmbt_rec rbe;
+ struct xfs_inode *ip = rb->sc->ip;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, rb->whichfork);
+ int error = 0;
+
+ /*
+ * Skip this scan if we don't expect to find delayed allocation
+ * reservations in this fork.
+ */
+ if (rb->whichfork == XFS_ATTR_FORK || ip->i_delayed_blks == 0)
+ return 0;
+
+ for_each_xfs_iext(ifp, &icur, &irec) {
+ if (!isnullstartblock(irec.br_startblock))
+ continue;
+
+ xfs_bmbt_disk_set_all(&rbe, &irec);
+
+ trace_xrep_bmap_found(ip, rb->whichfork, &irec);
+
+ if (xchk_should_terminate(rb->sc, &error))
+ return error;
+
+ error = xfarray_append(rb->bmap_records, &rbe);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
+/*
+ * Collect block mappings for this fork of this inode and decide if we have
+ * enough space to rebuild. Caller is responsible for cleaning up the list if
+ * anything goes wrong.
+ */
+STATIC int
+xrep_bmap_find_mappings(
+ struct xrep_bmap *rb)
+{
+ struct xfs_scrub *sc = rb->sc;
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno;
+ int error = 0;
+
+ /* Iterate the rmaps for extents. */
+ for_each_perag(sc->mp, agno, pag) {
+ error = xrep_bmap_scan_ag(rb, pag);
+ if (error) {
+ xfs_perag_rele(pag);
+ return error;
+ }
+ }
+
+ return xrep_bmap_find_delalloc(rb);
+}
+
+/* Retrieve real extent mappings for bulk loading the bmap btree. */
+STATIC int
+xrep_bmap_get_records(
+ struct xfs_btree_cur *cur,
+ unsigned int idx,
+ struct xfs_btree_block *block,
+ unsigned int nr_wanted,
+ void *priv)
+{
+ struct xfs_bmbt_rec rec;
+ struct xfs_bmbt_irec *irec = &cur->bc_rec.b;
+ struct xrep_bmap *rb = priv;
+ union xfs_btree_rec *block_rec;
+ unsigned int loaded;
+ int error;
+
+ for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
+ do {
+ error = xfarray_load(rb->bmap_records, rb->array_cur++,
+ &rec);
+ if (error)
+ return error;
+
+ xfs_bmbt_disk_get_all(&rec, irec);
+ } while (isnullstartblock(irec->br_startblock));
+
+ block_rec = xfs_btree_rec_addr(cur, idx, block);
+ cur->bc_ops->init_rec_from_cur(cur, block_rec);
+ }
+
+ return loaded;
+}
+
+/* Feed one of the new btree blocks to the bulk loader. */
+STATIC int
+xrep_bmap_claim_block(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr,
+ void *priv)
+{
+ struct xrep_bmap *rb = priv;
+
+ return xrep_newbt_claim_block(cur, &rb->new_bmapbt, ptr);
+}
+
+/* Figure out how much space we need to create the incore btree root block. */
+STATIC size_t
+xrep_bmap_iroot_size(
+ struct xfs_btree_cur *cur,
+ unsigned int level,
+ unsigned int nr_this_level,
+ void *priv)
+{
+ ASSERT(level > 0);
+
+ return XFS_BMAP_BROOT_SPACE_CALC(cur->bc_mp, nr_this_level);
+}
+
+/* Update the inode counters. */
+STATIC int
+xrep_bmap_reset_counters(
+ struct xrep_bmap *rb)
+{
+ struct xfs_scrub *sc = rb->sc;
+ struct xbtree_ifakeroot *ifake = &rb->new_bmapbt.ifake;
+ int64_t delta;
+
+ if (rb->reflink_scan == RLS_SET_IFLAG)
+ sc->ip->i_diflags2 |= XFS_DIFLAG2_REFLINK;
+
+ /*
+ * Update the inode block counts to reflect the extents we found in the
+ * rmapbt.
+ */
+ delta = ifake->if_blocks - rb->old_bmbt_block_count;
+ sc->ip->i_nblocks = rb->nblocks + delta;
+ xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE);
+
+ /*
+ * Adjust the quota counts by the difference in size between the old
+ * and new bmbt.
+ */
+ xfs_trans_mod_dquot_byino(sc->tp, sc->ip, XFS_TRANS_DQ_BCOUNT, delta);
+ return 0;
+}
+
+/*
+ * Create a new iext tree and load it with block mappings. If the inode is
+ * in extents format, that's all we need to do to commit the new mappings.
+ * If it is in btree format, this takes care of preloading the incore tree.
+ */
+STATIC int
+xrep_bmap_extents_load(
+ struct xrep_bmap *rb)
+{
+ struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec irec;
+ struct xfs_ifork *ifp = rb->new_bmapbt.ifake.if_fork;
+ xfarray_idx_t array_cur;
+ int error;
+
+ ASSERT(ifp->if_bytes == 0);
+
+ /* Add all the mappings (incl. delalloc) to the incore extent tree. */
+ xfs_iext_first(ifp, &icur);
+ foreach_xfarray_idx(rb->bmap_records, array_cur) {
+ struct xfs_bmbt_rec rec;
+
+ error = xfarray_load(rb->bmap_records, array_cur, &rec);
+ if (error)
+ return error;
+
+ xfs_bmbt_disk_get_all(&rec, &irec);
+
+ xfs_iext_insert_raw(ifp, &icur, &irec);
+ if (!isnullstartblock(irec.br_startblock))
+ ifp->if_nextents++;
+
+ xfs_iext_next(ifp, &icur);
+ }
+
+ return xrep_ino_ensure_extent_count(rb->sc, rb->whichfork,
+ ifp->if_nextents);
+}
+
+/*
+ * Reserve new btree blocks, bulk load the bmap records into the ondisk btree,
+ * and load the incore extent tree.
+ */
+STATIC int
+xrep_bmap_btree_load(
+ struct xrep_bmap *rb,
+ struct xfs_btree_cur *bmap_cur)
+{
+ struct xfs_scrub *sc = rb->sc;
+ int error;
+
+ /* Compute how many blocks we'll need. */
+ error = xfs_btree_bload_compute_geometry(bmap_cur,
+ &rb->new_bmapbt.bload, rb->real_mappings);
+ if (error)
+ return error;
+
+ /* Last chance to abort before we start committing fixes. */
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ /*
+ * Guess how many blocks we're going to need to rebuild an entire bmap
+ * from the number of extents we found, and pump up our transaction to
+ * have sufficient block reservation. We're allowed to exceed file
+ * quota to repair inconsistent metadata.
+ */
+ error = xfs_trans_reserve_more_inode(sc->tp, sc->ip,
+ rb->new_bmapbt.bload.nr_blocks, 0, true);
+ if (error)
+ return error;
+
+ /* Reserve the space we'll need for the new btree. */
+ error = xrep_newbt_alloc_blocks(&rb->new_bmapbt,
+ rb->new_bmapbt.bload.nr_blocks);
+ if (error)
+ return error;
+
+ /* Add all observed bmap records. */
+ rb->array_cur = XFARRAY_CURSOR_INIT;
+ error = xfs_btree_bload(bmap_cur, &rb->new_bmapbt.bload, rb);
+ if (error)
+ return error;
+
+ /*
+ * Load the new bmap records into the new incore extent tree to
+ * preserve delalloc reservations for regular files. The directory
+ * code loads the extent tree during xfs_dir_open and assumes
+ * thereafter that it remains loaded, so we must not violate that
+ * assumption.
+ */
+ return xrep_bmap_extents_load(rb);
+}
+
+/*
+ * Use the collected bmap information to stage a new bmap fork. If this is
+ * successful we'll return with the new fork information logged to the repair
+ * transaction but not yet committed. The caller must ensure that the inode
+ * is joined to the transaction; the inode will be joined to a clean
+ * transaction when the function returns.
+ */
+STATIC int
+xrep_bmap_build_new_fork(
+ struct xrep_bmap *rb)
+{
+ struct xfs_owner_info oinfo;
+ struct xfs_scrub *sc = rb->sc;
+ struct xfs_btree_cur *bmap_cur;
+ struct xbtree_ifakeroot *ifake = &rb->new_bmapbt.ifake;
+ int error;
+
+ error = xrep_bmap_sort_records(rb);
+ if (error)
+ return error;
+
+ /*
+ * Prepare to construct the new fork by initializing the new btree
+ * structure and creating a fake ifork in the ifakeroot structure.
+ */
+ xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, rb->whichfork);
+ error = xrep_newbt_init_inode(&rb->new_bmapbt, sc, rb->whichfork,
+ &oinfo);
+ if (error)
+ return error;
+
+ rb->new_bmapbt.bload.get_records = xrep_bmap_get_records;
+ rb->new_bmapbt.bload.claim_block = xrep_bmap_claim_block;
+ rb->new_bmapbt.bload.iroot_size = xrep_bmap_iroot_size;
+ bmap_cur = xfs_bmbt_stage_cursor(sc->mp, sc->ip, ifake);
+
+ /*
+ * Figure out the size and format of the new fork, then fill it with
+ * all the bmap records we've found. Join the inode to the transaction
+ * so that we can roll the transaction while holding the inode locked.
+ */
+ if (rb->real_mappings <= XFS_IFORK_MAXEXT(sc->ip, rb->whichfork)) {
+ ifake->if_fork->if_format = XFS_DINODE_FMT_EXTENTS;
+ error = xrep_bmap_extents_load(rb);
+ } else {
+ ifake->if_fork->if_format = XFS_DINODE_FMT_BTREE;
+ error = xrep_bmap_btree_load(rb, bmap_cur);
+ }
+ if (error)
+ goto err_cur;
+
+ /*
+ * Install the new fork in the inode. After this point the old mapping
+ * data are no longer accessible and the new tree is live. We delete
+ * the cursor immediately after committing the staged root because the
+ * staged fork might be in extents format.
+ */
+ xfs_bmbt_commit_staged_btree(bmap_cur, sc->tp, rb->whichfork);
+ xfs_btree_del_cursor(bmap_cur, 0);
+
+ /* Reset the inode counters now that we've changed the fork. */
+ error = xrep_bmap_reset_counters(rb);
+ if (error)
+ goto err_newbt;
+
+ /* Dispose of any unused blocks and the accounting information. */
+ error = xrep_newbt_commit(&rb->new_bmapbt);
+ if (error)
+ return error;
+
+ return xrep_roll_trans(sc);
+
+err_cur:
+ if (bmap_cur)
+ xfs_btree_del_cursor(bmap_cur, error);
+err_newbt:
+ xrep_newbt_cancel(&rb->new_bmapbt);
+ return error;
+}
+
+/*
+ * Now that we've logged the new inode btree, invalidate all of the old blocks
+ * and free them, if there were any.
+ */
+STATIC int
+xrep_bmap_remove_old_tree(
+ struct xrep_bmap *rb)
+{
+ struct xfs_scrub *sc = rb->sc;
+ struct xfs_owner_info oinfo;
+
+ /* Free the old bmbt blocks if they're not in use. */
+ xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, rb->whichfork);
+ return xrep_reap_fsblocks(sc, &rb->old_bmbt_blocks, &oinfo);
+}
+
+/* Check for garbage inputs. Returns -ECANCELED if there's nothing to do. */
+STATIC int
+xrep_bmap_check_inputs(
+ struct xfs_scrub *sc,
+ int whichfork)
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, whichfork);
+
+ ASSERT(whichfork == XFS_DATA_FORK || whichfork == XFS_ATTR_FORK);
+
+ if (!xfs_has_rmapbt(sc->mp))
+ return -EOPNOTSUPP;
+
+ /* No fork means nothing to rebuild. */
+ if (!ifp)
+ return -ECANCELED;
+
+ /*
+ * We only know how to repair extent mappings, which is to say that we
+ * only support extents and btree fork format. Repairs to a local
+ * format fork require a higher level repair function, so we do not
+ * have any work to do here.
+ */
+ switch (ifp->if_format) {
+ case XFS_DINODE_FMT_DEV:
+ case XFS_DINODE_FMT_LOCAL:
+ case XFS_DINODE_FMT_UUID:
+ return -ECANCELED;
+ case XFS_DINODE_FMT_EXTENTS:
+ case XFS_DINODE_FMT_BTREE:
+ break;
+ default:
+ return -EFSCORRUPTED;
+ }
+
+ if (whichfork == XFS_ATTR_FORK)
+ return 0;
+
+ /* Only files, symlinks, and directories get to have data forks. */
+ switch (VFS_I(sc->ip)->i_mode & S_IFMT) {
+ case S_IFREG:
+ case S_IFDIR:
+ case S_IFLNK:
+ /* ok */
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ /* Don't know how to rebuild realtime data forks. */
+ if (XFS_IS_REALTIME_INODE(sc->ip))
+ return -EOPNOTSUPP;
+
+ return 0;
+}
+
+/* Set up the initial state of the reflink scan. */
+static inline enum reflink_scan_state
+xrep_bmap_init_reflink_scan(
+ struct xfs_scrub *sc,
+ int whichfork)
+{
+ /* cannot share on non-reflink filesystem */
+ if (!xfs_has_reflink(sc->mp))
+ return RLS_IRRELEVANT;
+
+ /* preserve flag if it's already set */
+ if (xfs_is_reflink_inode(sc->ip))
+ return RLS_SET_IFLAG;
+
+ /* can only share regular files */
+ if (!S_ISREG(VFS_I(sc->ip)->i_mode))
+ return RLS_IRRELEVANT;
+
+ /* cannot share attr fork extents */
+ if (whichfork != XFS_DATA_FORK)
+ return RLS_IRRELEVANT;
+
+ /* cannot share realtime extents */
+ if (XFS_IS_REALTIME_INODE(sc->ip))
+ return RLS_IRRELEVANT;
+
+ return RLS_UNKNOWN;
+}
+
+/* Repair an inode fork. */
+int
+xrep_bmap(
+ struct xfs_scrub *sc,
+ int whichfork,
+ bool allow_unwritten)
+{
+ struct xrep_bmap *rb;
+ char *descr;
+ unsigned int max_bmbt_recs;
+ bool large_extcount;
+ int error = 0;
+
+ error = xrep_bmap_check_inputs(sc, whichfork);
+ if (error == -ECANCELED)
+ return 0;
+ if (error)
+ return error;
+
+ rb = kzalloc(sizeof(struct xrep_bmap), XCHK_GFP_FLAGS);
+ if (!rb)
+ return -ENOMEM;
+ rb->sc = sc;
+ rb->whichfork = whichfork;
+ rb->reflink_scan = xrep_bmap_init_reflink_scan(sc, whichfork);
+ rb->allow_unwritten = allow_unwritten;
+
+ /* Set up enough storage to handle the max records for this fork. */
+ large_extcount = xfs_has_large_extent_counts(sc->mp);
+ max_bmbt_recs = xfs_iext_max_nextents(large_extcount, whichfork);
+ descr = xchk_xfile_ino_descr(sc, "%s fork mapping records",
+ whichfork == XFS_DATA_FORK ? "data" : "attr");
+ error = xfarray_create(descr, max_bmbt_recs,
+ sizeof(struct xfs_bmbt_rec), &rb->bmap_records);
+ kfree(descr);
+ if (error)
+ goto out_rb;
+
+ /* Collect all reverse mappings for this fork's extents. */
+ xfsb_bitmap_init(&rb->old_bmbt_blocks);
+ error = xrep_bmap_find_mappings(rb);
+ if (error)
+ goto out_bitmap;
+
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+
+ /* Rebuild the bmap information. */
+ error = xrep_bmap_build_new_fork(rb);
+ if (error)
+ goto out_bitmap;
+
+ /* Kill the old tree. */
+ error = xrep_bmap_remove_old_tree(rb);
+ if (error)
+ goto out_bitmap;
+
+out_bitmap:
+ xfsb_bitmap_destroy(&rb->old_bmbt_blocks);
+ xfarray_destroy(rb->bmap_records);
+out_rb:
+ kfree(rb);
+ return error;
+}
+
+/* Repair an inode's data fork. */
+int
+xrep_bmap_data(
+ struct xfs_scrub *sc)
+{
+ return xrep_bmap(sc, XFS_DATA_FORK, true);
+}
+
+/* Repair an inode's attr fork. */
+int
+xrep_bmap_attr(
+ struct xfs_scrub *sc)
+{
+ return xrep_bmap(sc, XFS_ATTR_FORK, false);
+}
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index de24532fe083..81f2b96bb5a7 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -25,6 +25,7 @@
#include "xfs_trans_priv.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
+#include "xfs_dir2_priv.h"
#include "xfs_attr.h"
#include "xfs_reflink.h"
#include "xfs_ag.h"
@@ -604,6 +605,7 @@ xchk_ag_free(
struct xchk_ag *sa)
{
xchk_ag_btcur_free(sa);
+ xrep_reset_perag_resv(sc);
if (sa->agf_bp) {
xfs_trans_brelse(sc->tp, sa->agf_bp);
sa->agf_bp = NULL;
@@ -733,6 +735,8 @@ xchk_iget(
xfs_ino_t inum,
struct xfs_inode **ipp)
{
+ ASSERT(sc->tp != NULL);
+
return xfs_iget(sc->mp, sc->tp, inum, XFS_IGET_UNTRUSTED, 0, ipp);
}
@@ -816,6 +820,26 @@ again:
return 0;
}
+#ifdef CONFIG_XFS_QUOTA
+/*
+ * Try to attach dquots to this inode if we think we might want to repair it.
+ * Callers must not hold any ILOCKs. If the dquots are broken and cannot be
+ * attached, a quotacheck will be scheduled.
+ */
+int
+xchk_ino_dqattach(
+ struct xfs_scrub *sc)
+{
+ ASSERT(sc->tp != NULL);
+ ASSERT(sc->ip != NULL);
+
+ if (!xchk_could_repair(sc))
+ return 0;
+
+ return xrep_ino_dqattach(sc);
+}
+#endif
+
/* Install an inode that we opened by handle for scrubbing. */
int
xchk_install_handle_inode(
@@ -882,8 +906,8 @@ xchk_iget_for_scrubbing(
if (!xfs_verify_ino(sc->mp, sc->sm->sm_ino))
return -ENOENT;
- /* Try a regular untrusted iget. */
- error = xchk_iget(sc, sc->sm->sm_ino, &ip);
+ /* Try a safe untrusted iget. */
+ error = xchk_iget_safe(sc, sc->sm->sm_ino, &ip);
if (!error)
return xchk_install_handle_inode(sc, ip);
if (error == -ENOENT)
@@ -1027,6 +1051,11 @@ xchk_setup_inode_contents(
error = xchk_trans_alloc(sc, resblks);
if (error)
goto out;
+
+ error = xchk_ino_dqattach(sc);
+ if (error)
+ goto out;
+
xchk_ilock(sc, XFS_ILOCK_EXCL);
out:
/* scrub teardown will unlock and release the inode for us */
@@ -1132,6 +1161,7 @@ xchk_metadata_inode_subtype(
unsigned int scrub_type)
{
__u32 smtype = sc->sm->sm_type;
+ unsigned int sick_mask = sc->sick_mask;
int error;
sc->sm->sm_type = scrub_type;
@@ -1149,6 +1179,7 @@ xchk_metadata_inode_subtype(
break;
}
+ sc->sick_mask = sick_mask;
sc->sm->sm_type = smtype;
return error;
}
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index cabdc0e16838..da09580b454a 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -103,9 +103,15 @@ xchk_setup_rtsummary(struct xfs_scrub *sc)
}
#endif
#ifdef CONFIG_XFS_QUOTA
+int xchk_ino_dqattach(struct xfs_scrub *sc);
int xchk_setup_quota(struct xfs_scrub *sc);
#else
static inline int
+xchk_ino_dqattach(struct xfs_scrub *sc)
+{
+ return 0;
+}
+static inline int
xchk_setup_quota(struct xfs_scrub *sc)
{
return -ENOENT;
@@ -151,6 +157,11 @@ void xchk_iunlock(struct xfs_scrub *sc, unsigned int ilock_flags);
void xchk_buffer_recheck(struct xfs_scrub *sc, struct xfs_buf *bp);
+/*
+ * Grab the inode at @inum. The caller must have created a scrub transaction
+ * so that we can confirm the inumber by walking the inobt and not deadlock on
+ * a loop in the inobt.
+ */
int xchk_iget(struct xfs_scrub *sc, xfs_ino_t inum, struct xfs_inode **ipp);
int xchk_iget_agi(struct xfs_scrub *sc, xfs_ino_t inum,
struct xfs_buf **agi_bpp, struct xfs_inode **ipp);
@@ -158,6 +169,26 @@ void xchk_irele(struct xfs_scrub *sc, struct xfs_inode *ip);
int xchk_install_handle_inode(struct xfs_scrub *sc, struct xfs_inode *ip);
/*
+ * Safe version of (untrusted) xchk_iget that uses an empty transaction to
+ * avoid deadlocking on loops in the inobt. This should only be used in a
+ * scrub or repair setup routine, and only prior to grabbing a transaction.
+ */
+static inline int
+xchk_iget_safe(struct xfs_scrub *sc, xfs_ino_t inum, struct xfs_inode **ipp)
+{
+ int error;
+
+ ASSERT(sc->tp == NULL);
+
+ error = xchk_trans_alloc(sc, 0);
+ if (error)
+ return error;
+ error = xchk_iget(sc, inum, ipp);
+ xchk_trans_cancel(sc);
+ return error;
+}
+
+/*
* Don't bother cross-referencing if we already found corruption or cross
* referencing discrepancies.
*/
@@ -167,6 +198,8 @@ static inline bool xchk_skip_xref(struct xfs_scrub_metadata *sm)
XFS_SCRUB_OFLAG_XCORRUPT);
}
+bool xchk_dir_looks_zapped(struct xfs_inode *dp);
+
#ifdef CONFIG_XFS_ONLINE_REPAIR
/* Decide if a repair is required. */
static inline bool xchk_needs_repair(const struct xfs_scrub_metadata *sm)
@@ -175,8 +208,21 @@ static inline bool xchk_needs_repair(const struct xfs_scrub_metadata *sm)
XFS_SCRUB_OFLAG_XCORRUPT |
XFS_SCRUB_OFLAG_PREEN);
}
+
+/*
+ * "Should we prepare for a repair?"
+ *
+ * Return true if the caller permits us to repair metadata and we're not
+ * setting up for a post-repair evaluation.
+ */
+static inline bool xchk_could_repair(const struct xfs_scrub *sc)
+{
+ return (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) &&
+ !(sc->flags & XREP_ALREADY_FIXED);
+}
#else
# define xchk_needs_repair(sc) (false)
+# define xchk_could_repair(sc) (false)
#endif /* CONFIG_XFS_ONLINE_REPAIR */
int xchk_metadata_inode_forks(struct xfs_scrub *sc);
@@ -188,6 +234,16 @@ int xchk_metadata_inode_forks(struct xfs_scrub *sc);
#define xchk_xfile_descr(sc, fmt, ...) \
kasprintf(XCHK_GFP_FLAGS, "XFS (%s): " fmt, \
(sc)->mp->m_super->s_id, ##__VA_ARGS__)
+#define xchk_xfile_ag_descr(sc, fmt, ...) \
+ kasprintf(XCHK_GFP_FLAGS, "XFS (%s): AG 0x%x " fmt, \
+ (sc)->mp->m_super->s_id, \
+ (sc)->sa.pag ? (sc)->sa.pag->pag_agno : (sc)->sm->sm_agno, \
+ ##__VA_ARGS__)
+#define xchk_xfile_ino_descr(sc, fmt, ...) \
+ kasprintf(XCHK_GFP_FLAGS, "XFS (%s): inode 0x%llx " fmt, \
+ (sc)->mp->m_super->s_id, \
+ (sc)->ip ? (sc)->ip->i_ino : (sc)->sm->sm_ino, \
+ ##__VA_ARGS__)
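+
+/*
+ * Illustration (not part of the patch; device and inode values made up):
+ * xchk_xfile_ino_descr(sc, "%s fork mapping records", "data") would
+ * produce a string like "XFS (sda1): inode 0x85 data fork mapping
+ * records".
+ */
+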
/*
* Setting up a hook to wait for intents to drain is costly -- we have to take
diff --git a/fs/xfs/scrub/cow_repair.c b/fs/xfs/scrub/cow_repair.c
new file mode 100644
index 000000000000..1e82c727af8e
--- /dev/null
+++ b/fs/xfs/scrub/cow_repair.c
@@ -0,0 +1,614 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2022-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_inode_fork.h"
+#include "xfs_alloc.h"
+#include "xfs_bmap.h"
+#include "xfs_rmap.h"
+#include "xfs_refcount.h"
+#include "xfs_quota.h"
+#include "xfs_ialloc.h"
+#include "xfs_ag.h"
+#include "xfs_error.h"
+#include "xfs_errortag.h"
+#include "xfs_icache.h"
+#include "xfs_refcount_btree.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/bitmap.h"
+#include "scrub/off_bitmap.h"
+#include "scrub/fsb_bitmap.h"
+#include "scrub/reap.h"
+
+/*
+ * CoW Fork Mapping Repair
+ * =======================
+ *
+ * Although CoW staging extents are owned by incore CoW inode forks, on disk
+ * they are owned by the refcount btree. The ondisk metadata does not record
+ * any ownership information, which limits what we can do to repair the
+ * mappings in the CoW fork. At most, we can replace ifork mappings that lack
+ * an entry in the refcount btree or are described by a reverse mapping record
+ * whose owner is not OWN_COW.
+ *
+ * Replacing extents is also tricky -- we can't touch written CoW fork extents
+ * since they are undergoing writeback, and delalloc extents do not require
+ * repair since they only exist incore. Hence the most we can do is find the
+ * bad parts of unwritten mappings, allocate a replacement set of blocks, and
+ * replace the incore mapping. We use the regular reaping process to unmap
+ * or free the discarded blocks, as appropriate.
+ */
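+
+/*
+ * Editor's summary of the scan below: for each unwritten CoW fork
+ * mapping, xrep_cow_find_bad() queries the refcount and rmap btrees and
+ * records in bad_fileoffs every file range that is (a) shared, via
+ * xrep_cow_mark_shared_staging(); (b) missing a CoW staging record, via
+ * xrep_cow_mark_missing_staging(); or (c) claimed by an rmap whose owner
+ * is not XFS_RMAP_OWN_COW, via xrep_cow_mark_missing_staging_rmap().
+ * Replacement blocks for the marked ranges are then allocated with
+ * xrep_cow_alloc().
+ */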
+struct xrep_cow {
+ struct xfs_scrub *sc;
+
+ /* Bitmap of file offset ranges that need replacing. */
+ struct xoff_bitmap bad_fileoffs;
+
+ /* Bitmap of fsblocks that were removed from the CoW fork. */
+ struct xfsb_bitmap old_cowfork_fsblocks;
+
+ /* CoW fork mappings used to scan for bad CoW staging extents. */
+ struct xfs_bmbt_irec irec;
+
+ /* refcount btree block number of irec.br_startblock */
+ unsigned int irec_startbno;
+
+ /* refcount btree block number of the next refcount record we expect */
+ unsigned int next_bno;
+};
+
+/* CoW staging extent. */
+struct xrep_cow_extent {
+ xfs_fsblock_t fsbno;
+ xfs_extlen_t len;
+};
+
+/*
+ * Mark the part of the file range that corresponds to the given physical
+ * space. Caller must ensure that the physical range is within xc->irec.
+ */
+STATIC int
+xrep_cow_mark_file_range(
+ struct xrep_cow *xc,
+ xfs_fsblock_t startblock,
+ xfs_filblks_t blockcount)
+{
+ xfs_fileoff_t startoff;
+
+ startoff = xc->irec.br_startoff +
+ (startblock - xc->irec.br_startblock);
+
+ trace_xrep_cow_mark_file_range(xc->sc->ip, startblock, startoff,
+ blockcount);
+
+ return xoff_bitmap_set(&xc->bad_fileoffs, startoff, blockcount);
+}
+
+/*
+ * Trim @src to fit within the CoW fork mapping being examined, and put the
+ * result in @dst.
+ */
+static inline void
+xrep_cow_trim_refcount(
+ struct xrep_cow *xc,
+ struct xfs_refcount_irec *dst,
+ const struct xfs_refcount_irec *src)
+{
+ unsigned int adj;
+
+ memcpy(dst, src, sizeof(*dst));
+
+ if (dst->rc_startblock < xc->irec_startbno) {
+ adj = xc->irec_startbno - dst->rc_startblock;
+ dst->rc_blockcount -= adj;
+ dst->rc_startblock += adj;
+ }
+
+ if (dst->rc_startblock + dst->rc_blockcount >
+ xc->irec_startbno + xc->irec.br_blockcount) {
+ adj = (dst->rc_startblock + dst->rc_blockcount) -
+ (xc->irec_startbno + xc->irec.br_blockcount);
+ dst->rc_blockcount -= adj;
+ }
+}
+
+/* Mark any shared CoW staging extents. */
+STATIC int
+xrep_cow_mark_shared_staging(
+ struct xfs_btree_cur *cur,
+ const struct xfs_refcount_irec *rec,
+ void *priv)
+{
+ struct xrep_cow *xc = priv;
+ struct xfs_refcount_irec rrec;
+ xfs_fsblock_t fsbno;
+
+ if (!xfs_refcount_check_domain(rec) ||
+ rec->rc_domain != XFS_REFC_DOMAIN_SHARED)
+ return -EFSCORRUPTED;
+
+ xrep_cow_trim_refcount(xc, &rrec, rec);
+
+ fsbno = XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno,
+ rrec.rc_startblock);
+ return xrep_cow_mark_file_range(xc, fsbno, rrec.rc_blockcount);
+}
+
+/*
+ * Mark any portion of the CoW fork file offset range where there is not a CoW
+ * staging extent record in the refcountbt, and keep a record of where we did
+ * find correct refcountbt records. Staging records are always cleaned out at
+ * mount time, so any two inodes trying to map the same staging area would have
+ * already taken the fs down due to refcount btree verifier errors. Hence this
+ * inode should be the sole creator of the staging extent records ondisk.
+ */
+STATIC int
+xrep_cow_mark_missing_staging(
+ struct xfs_btree_cur *cur,
+ const struct xfs_refcount_irec *rec,
+ void *priv)
+{
+ struct xrep_cow *xc = priv;
+ struct xfs_refcount_irec rrec;
+ int error;
+
+ if (!xfs_refcount_check_domain(rec) ||
+ rec->rc_domain != XFS_REFC_DOMAIN_COW)
+ return -EFSCORRUPTED;
+
+ xrep_cow_trim_refcount(xc, &rrec, rec);
+
+ if (xc->next_bno >= rrec.rc_startblock)
+ goto next;
+
+ error = xrep_cow_mark_file_range(xc,
+ XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno,
+ xc->next_bno),
+ rrec.rc_startblock - xc->next_bno);
+ if (error)
+ return error;
+
+next:
+ xc->next_bno = rrec.rc_startblock + rrec.rc_blockcount;
+ return 0;
+}
+
+/*
+ * Mark any area that does not correspond to a CoW staging rmap. These are
+ * cross-linked areas that must be avoided.
+ */
+STATIC int
+xrep_cow_mark_missing_staging_rmap(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xrep_cow *xc = priv;
+ xfs_fsblock_t fsbno;
+ xfs_agblock_t rec_bno;
+ xfs_extlen_t rec_len;
+ unsigned int adj;
+
+ if (rec->rm_owner == XFS_RMAP_OWN_COW)
+ return 0;
+
+ rec_bno = rec->rm_startblock;
+ rec_len = rec->rm_blockcount;
+ if (rec_bno < xc->irec_startbno) {
+ adj = xc->irec_startbno - rec_bno;
+ rec_len -= adj;
+ rec_bno += adj;
+ }
+
+ if (rec_bno + rec_len > xc->irec_startbno + xc->irec.br_blockcount) {
+ adj = (rec_bno + rec_len) -
+ (xc->irec_startbno + xc->irec.br_blockcount);
+ rec_len -= adj;
+ }
+
+ fsbno = XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno, rec_bno);
+ return xrep_cow_mark_file_range(xc, fsbno, rec_len);
+}
+
+/*
+ * Find any part of the CoW fork mapping that isn't a single-owner CoW staging
+ * extent and mark the corresponding part of the file range in the bitmap.
+ */
+STATIC int
+xrep_cow_find_bad(
+ struct xrep_cow *xc)
+{
+ struct xfs_refcount_irec rc_low = { 0 };
+ struct xfs_refcount_irec rc_high = { 0 };
+ struct xfs_rmap_irec rm_low = { 0 };
+ struct xfs_rmap_irec rm_high = { 0 };
+ struct xfs_perag *pag;
+ struct xfs_scrub *sc = xc->sc;
+ xfs_agnumber_t agno;
+ int error;
+
+ agno = XFS_FSB_TO_AGNO(sc->mp, xc->irec.br_startblock);
+ xc->irec_startbno = XFS_FSB_TO_AGBNO(sc->mp, xc->irec.br_startblock);
+
+ pag = xfs_perag_get(sc->mp, agno);
+ if (!pag)
+ return -EFSCORRUPTED;
+
+ error = xrep_ag_init(sc, pag, &sc->sa);
+ if (error)
+ goto out_pag;
+
+ /* Mark any CoW fork extents that are shared. */
+ rc_low.rc_startblock = xc->irec_startbno;
+ rc_high.rc_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1;
+ rc_low.rc_domain = rc_high.rc_domain = XFS_REFC_DOMAIN_SHARED;
+ error = xfs_refcount_query_range(sc->sa.refc_cur, &rc_low, &rc_high,
+ xrep_cow_mark_shared_staging, xc);
+ if (error)
+ goto out_sa;
+
+ /* Make sure there are CoW staging extents for the whole mapping. */
+ rc_low.rc_startblock = xc->irec_startbno;
+ rc_high.rc_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1;
+ rc_low.rc_domain = rc_high.rc_domain = XFS_REFC_DOMAIN_COW;
+ xc->next_bno = xc->irec_startbno;
+ error = xfs_refcount_query_range(sc->sa.refc_cur, &rc_low, &rc_high,
+ xrep_cow_mark_missing_staging, xc);
+ if (error)
+ goto out_sa;
+
+ if (xc->next_bno < xc->irec_startbno + xc->irec.br_blockcount) {
+ error = xrep_cow_mark_file_range(xc,
+ XFS_AGB_TO_FSB(sc->mp, pag->pag_agno,
+ xc->next_bno),
+ xc->irec_startbno + xc->irec.br_blockcount -
+ xc->next_bno);
+ if (error)
+ goto out_sa;
+ }
+
+ /* Mark any area that has an rmap that isn't a CoW staging extent. */
+ rm_low.rm_startblock = xc->irec_startbno;
+ memset(&rm_high, 0xFF, sizeof(rm_high));
+ rm_high.rm_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1;
+ error = xfs_rmap_query_range(sc->sa.rmap_cur, &rm_low, &rm_high,
+ xrep_cow_mark_missing_staging_rmap, xc);
+ if (error)
+ goto out_sa;
+
+ /*
+ * If userspace is forcing us to rebuild the CoW fork or someone turned
+ * on the debugging knob, replace everything in the CoW fork.
+ */
+ if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) ||
+ XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) {
+ error = xrep_cow_mark_file_range(xc, xc->irec.br_startblock,
+ xc->irec.br_blockcount);
+ if (error)
+ goto out_sa;
+ }
+
+out_sa:
+ xchk_ag_free(sc, &sc->sa);
+out_pag:
+ xfs_perag_put(pag);
+ return error;
+}
+
+/*
+ * Allocate a replacement CoW staging extent of up to the given number of
+ * blocks, and fill out the mapping.
+ */
+STATIC int
+xrep_cow_alloc(
+ struct xfs_scrub *sc,
+ xfs_extlen_t maxlen,
+ struct xrep_cow_extent *repl)
+{
+ struct xfs_alloc_arg args = {
+ .tp = sc->tp,
+ .mp = sc->mp,
+ .oinfo = XFS_RMAP_OINFO_SKIP_UPDATE,
+ .minlen = 1,
+ .maxlen = maxlen,
+ .prod = 1,
+ .resv = XFS_AG_RESV_NONE,
+ .datatype = XFS_ALLOC_USERDATA,
+ };
+ int error;
+
+ error = xfs_trans_reserve_more(sc->tp, maxlen, 0);
+ if (error)
+ return error;
+
+ error = xfs_alloc_vextent_start_ag(&args,
+ XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino));
+ if (error)
+ return error;
+ if (args.fsbno == NULLFSBLOCK)
+ return -ENOSPC;
+
+ xfs_refcount_alloc_cow_extent(sc->tp, args.fsbno, args.len);
+
+ repl->fsbno = args.fsbno;
+ repl->len = args.len;
+ return 0;
+}
+
+/*
+ * Look up the current CoW fork mapping so that we only allocate enough to
+ * replace a single mapping. If we don't find a mapping that covers the start
+ * of the file range, or we find a delalloc or written extent, something is
+ * seriously wrong, since we didn't drop the ILOCK.
+ */
+static inline int
+xrep_cow_find_mapping(
+ struct xrep_cow *xc,
+ struct xfs_iext_cursor *icur,
+ xfs_fileoff_t startoff,
+ struct xfs_bmbt_irec *got)
+{
+ struct xfs_inode *ip = xc->sc->ip;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
+
+ if (!xfs_iext_lookup_extent(ip, ifp, startoff, icur, got))
+ goto bad;
+
+ if (got->br_startoff > startoff)
+ goto bad;
+
+ if (got->br_blockcount == 0)
+ goto bad;
+
+ if (isnullstartblock(got->br_startblock))
+ goto bad;
+
+ if (xfs_bmap_is_written_extent(got))
+ goto bad;
+
+ return 0;
+bad:
+ ASSERT(0);
+ return -EFSCORRUPTED;
+}
+
+#define REPLACE_LEFT_SIDE (1U << 0)
+#define REPLACE_RIGHT_SIDE (1U << 1)
+
+/*
+ * Given a CoW fork mapping @got and a replacement mapping @repl, remap the
+ * beginning of @got with the space described by @repl.
+ */
+static inline void
+xrep_cow_replace_mapping(
+ struct xfs_inode *ip,
+ struct xfs_iext_cursor *icur,
+ const struct xfs_bmbt_irec *got,
+ const struct xrep_cow_extent *repl)
+{
+ struct xfs_bmbt_irec new = *got; /* struct copy */
+
+ ASSERT(repl->len > 0);
+ ASSERT(!isnullstartblock(got->br_startblock));
+
+ trace_xrep_cow_replace_mapping(ip, got, repl->fsbno, repl->len);
+
+ if (got->br_blockcount == repl->len) {
+ /*
+ * The new extent is a complete replacement for the existing
+ * extent. Update the COW fork record.
+ */
+ new.br_startblock = repl->fsbno;
+ xfs_iext_update_extent(ip, BMAP_COWFORK, icur, &new);
+ return;
+ }
+
+ /*
+ * The new extent can replace the beginning of the COW fork record.
+ * Move the left side of @got upwards, then insert the new record.
+ */
+ new.br_startoff += repl->len;
+ new.br_startblock += repl->len;
+ new.br_blockcount -= repl->len;
+ xfs_iext_update_extent(ip, BMAP_COWFORK, icur, &new);
+
+ new.br_startoff = got->br_startoff;
+ new.br_startblock = repl->fsbno;
+ new.br_blockcount = repl->len;
+ xfs_iext_insert(ip, icur, &new, BMAP_COWFORK);
+}
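+
+/*
+ * A sketch of the partial-replacement case above, with made-up numbers: if
+ * @got maps file offsets [200, 250) to blocks [1000, 1050) and @repl
+ * supplies 10 blocks at 3000, the update shifts @got to map [210, 250) to
+ * [1010, 1050), and the insert adds a new record mapping [200, 210) to
+ * [3000, 3010).
+ */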
+
+/*
+ * Replace the unwritten CoW staging extent backing the given file range with a
+ * new space extent that isn't as problematic.
+ */
+STATIC int
+xrep_cow_replace_range(
+ struct xrep_cow *xc,
+ xfs_fileoff_t startoff,
+ xfs_extlen_t *blockcount)
+{
+ struct xfs_iext_cursor icur;
+ struct xrep_cow_extent repl;
+ struct xfs_bmbt_irec got;
+ struct xfs_scrub *sc = xc->sc;
+ xfs_fileoff_t nextoff;
+ xfs_extlen_t alloc_len;
+ int error;
+
+ /*
+ * Put the existing CoW fork mapping in @got. If @got ends before the
+ * range we were asked to replace, truncate the range so that we only
+ * replace one extent mapping at a time.
+ */
+ error = xrep_cow_find_mapping(xc, &icur, startoff, &got);
+ if (error)
+ return error;
+ nextoff = min(startoff + *blockcount,
+ got.br_startoff + got.br_blockcount);
+
+ /*
+ * Allocate a replacement extent. If the allocation doesn't fill all the
+ * blocks, shorten the range that will be replaced in this step.
+ */
+ alloc_len = min_t(xfs_fileoff_t, XFS_MAX_BMBT_EXTLEN,
+ nextoff - startoff);
+ error = xrep_cow_alloc(sc, alloc_len, &repl);
+ if (error)
+ return error;
+
+ /*
+ * Replace the old mapping with the new one, and commit the metadata
+ * changes made so far.
+ */
+ xrep_cow_replace_mapping(sc->ip, &icur, &got, &repl);
+
+ xfs_inode_set_cowblocks_tag(sc->ip);
+ error = xfs_defer_finish(&sc->tp);
+ if (error)
+ return error;
+
+ /* Note the old CoW staging extents; we'll reap them all later. */
+ error = xfsb_bitmap_set(&xc->old_cowfork_fsblocks, got.br_startblock,
+ repl.len);
+ if (error)
+ return error;
+
+ *blockcount = repl.len;
+ return 0;
+}
+
+/*
+ * Replace a bad part of an unwritten CoW staging extent with freshly
+ * allocated space.
+ */
+STATIC int
+xrep_cow_replace(
+ uint64_t startoff,
+ uint64_t blockcount,
+ void *priv)
+{
+ struct xrep_cow *xc = priv;
+ int error = 0;
+
+ while (blockcount > 0) {
+ xfs_extlen_t len = min_t(xfs_filblks_t, blockcount,
+ XFS_MAX_BMBT_EXTLEN);
+
+ error = xrep_cow_replace_range(xc, startoff, &len);
+ if (error)
+ break;
+
+ blockcount -= len;
+ startoff += len;
+ }
+
+ return error;
+}
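+
+/*
+ * A note on the loop above: each pass replaces at most XFS_MAX_BMBT_EXTLEN
+ * blocks, and xrep_cow_replace_range may shorten @len further if the
+ * mapping ends early or the allocator returns fewer blocks, so each
+ * iteration advances by however much was actually replaced. For example
+ * (made-up size), a 3,000,000-block range takes at least two passes given
+ * the 2,097,151-block per-mapping limit.
+ */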
+
+/*
+ * Repair an inode's CoW fork. The CoW fork is an in-core structure, so
+ * there's no btree to rebuild. Instead, we replace any mappings that are
+ * cross-linked or lack ondisk CoW fork records in the refcount btree.
+ */
+int
+xrep_bmap_cow(
+ struct xfs_scrub *sc)
+{
+ struct xrep_cow *xc;
+ struct xfs_iext_cursor icur;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, XFS_COW_FORK);
+ int error;
+
+ if (!xfs_has_rmapbt(sc->mp) || !xfs_has_reflink(sc->mp))
+ return -EOPNOTSUPP;
+
+ if (!ifp)
+ return 0;
+
+ /* realtime files aren't supported yet */
+ if (XFS_IS_REALTIME_INODE(sc->ip))
+ return -EOPNOTSUPP;
+
+ /*
+ * If we're somehow not in extents format, then reinitialize it to
+ * an empty extent mapping fork and exit.
+ */
+ if (ifp->if_format != XFS_DINODE_FMT_EXTENTS) {
+ ifp->if_format = XFS_DINODE_FMT_EXTENTS;
+ ifp->if_nextents = 0;
+ return 0;
+ }
+
+ xc = kzalloc(sizeof(struct xrep_cow), XCHK_GFP_FLAGS);
+ if (!xc)
+ return -ENOMEM;
+
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+
+ xc->sc = sc;
+ xoff_bitmap_init(&xc->bad_fileoffs);
+ xfsb_bitmap_init(&xc->old_cowfork_fsblocks);
+
+ for_each_xfs_iext(ifp, &icur, &xc->irec) {
+ if (xchk_should_terminate(sc, &error))
+ goto out_bitmap;
+
+ /*
+ * delalloc reservations only exist incore, so there is no
+ * ondisk metadata that we can examine. Hence we leave them
+ * alone.
+ */
+ if (isnullstartblock(xc->irec.br_startblock))
+ continue;
+
+ /*
+ * COW fork extents are only in the written state if writeback
+ * is actively writing to disk. We cannot restart the write
+ * at a different disk address since we've already issued the
+ * IO, so we leave these alone and hope for the best.
+ */
+ if (xfs_bmap_is_written_extent(&xc->irec))
+ continue;
+
+ error = xrep_cow_find_bad(xc);
+ if (error)
+ goto out_bitmap;
+ }
+
+ /* Replace any bad unwritten mappings with fresh allocations. */
+ error = xoff_bitmap_walk(&xc->bad_fileoffs, xrep_cow_replace, xc);
+ if (error)
+ goto out_bitmap;
+
+ /*
+ * Reap as many of the old CoW blocks as we can. They are owned ondisk
+ * by the refcount btree, not the inode, so it is correct to treat them
+ * like inode metadata.
+ */
+ error = xrep_reap_fsblocks(sc, &xc->old_cowfork_fsblocks,
+ &XFS_RMAP_OINFO_COW);
+ if (error)
+ goto out_bitmap;
+
+out_bitmap:
+ xfsb_bitmap_destroy(&xc->old_cowfork_fsblocks);
+ xoff_bitmap_destroy(&xc->bad_fileoffs);
+ kfree(xc);
+ return error;
+}
diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c
index 0b491784b759..d86ab51af928 100644
--- a/fs/xfs/scrub/dir.c
+++ b/fs/xfs/scrub/dir.c
@@ -15,10 +15,12 @@
#include "xfs_icache.h"
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
+#include "xfs_health.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/dabtree.h"
#include "scrub/readdir.h"
+#include "scrub/health.h"
/* Set us up to scrub directories. */
int
@@ -760,6 +762,11 @@ xchk_directory(
if (!S_ISDIR(VFS_I(sc->ip)->i_mode))
return -ENOENT;
+ if (xchk_file_looks_zapped(sc, XFS_SICK_INO_DIR_ZAPPED)) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+ return 0;
+ }
+
/* Plausible size? */
if (sc->ip->i_disk_size < xfs_dir2_sf_hdr_size(0)) {
xchk_ino_set_corrupt(sc, sc->ip->i_ino);
@@ -784,7 +791,36 @@ xchk_directory(
/* Look up every name in this directory by hash. */
error = xchk_dir_walk(sc, sc->ip, xchk_dir_actor, NULL);
- if (error == -ECANCELED)
- error = 0;
- return error;
+ if (error && error != -ECANCELED)
+ return error;
+
+ /* If the dir is clean, it is clearly not zapped. */
+ xchk_mark_healthy_if_clean(sc, XFS_SICK_INO_DIR_ZAPPED);
+ return 0;
+}
+
+/*
+ * Decide if this directory has been zapped to satisfy the inode and ifork
+ * verifiers. Checking and repairing of the directory contents should be
+ * postponed until the zapped data fork has been rebuilt.
+ */
+bool
+xchk_dir_looks_zapped(
+ struct xfs_inode *dp)
+{
+ /* Repair zapped this dir's data fork a short time ago */
+ if (xfs_ifork_zapped(dp, XFS_DATA_FORK))
+ return true;
+
+ /*
+ * If the dinode repair found a bad data fork, it will reset the fork
+ * to extents format with zero records and wait for the bmapbtd
+ * scrubber to reconstruct the block mappings. Directories always
+ * contain some content, so this is a clear sign of a zapped directory.
+ * The state checked by xfs_ifork_zapped is not persisted, so this is
+ * the secondary strategy if repairs are interrupted by a crash or an
+ * unmount.
+ */
+ return dp->i_df.if_format == XFS_DINODE_FMT_EXTENTS &&
+ dp->i_df.if_nextents == 0;
}
diff --git a/fs/xfs/scrub/dqiterate.c b/fs/xfs/scrub/dqiterate.c
new file mode 100644
index 000000000000..20c4daedd48d
--- /dev/null
+++ b/fs/xfs/scrub/dqiterate.c
@@ -0,0 +1,211 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_bit.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_quota.h"
+#include "xfs_qm.h"
+#include "xfs_bmap.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/quota.h"
+#include "scrub/trace.h"
+
+/* Initialize a dquot iteration cursor. */
+void
+xchk_dqiter_init(
+ struct xchk_dqiter *cursor,
+ struct xfs_scrub *sc,
+ xfs_dqtype_t dqtype)
+{
+ cursor->sc = sc;
+ cursor->bmap.br_startoff = NULLFILEOFF;
+ cursor->dqtype = dqtype & XFS_DQTYPE_REC_MASK;
+ cursor->quota_ip = xfs_quota_inode(sc->mp, cursor->dqtype);
+ cursor->id = 0;
+}
+
+/*
+ * Ensure that the cached data fork mapping for the dqiter cursor is fresh and
+ * covers the dquot pointed to by the scan cursor.
+ */
+STATIC int
+xchk_dquot_iter_revalidate_bmap(
+ struct xchk_dqiter *cursor)
+{
+ struct xfs_quotainfo *qi = cursor->sc->mp->m_quotainfo;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(cursor->quota_ip,
+ XFS_DATA_FORK);
+ xfs_fileoff_t fileoff;
+ xfs_dqid_t this_id = cursor->id;
+ int nmaps = 1;
+ int error;
+
+ fileoff = this_id / qi->qi_dqperchunk;
+
+ /*
+ * If we have a mapping for cursor->id and it's still fresh, there's
+ * no need to reread the bmbt.
+ */
+ if (cursor->bmap.br_startoff != NULLFILEOFF &&
+ cursor->if_seq == ifp->if_seq &&
+ cursor->bmap.br_startoff + cursor->bmap.br_blockcount > fileoff)
+ return 0;
+
+ /* Look up the data fork mapping for the dquot id of interest. */
+ error = xfs_bmapi_read(cursor->quota_ip, fileoff,
+ XFS_MAX_FILEOFF - fileoff, &cursor->bmap, &nmaps, 0);
+ if (error)
+ return error;
+ if (!nmaps) {
+ ASSERT(nmaps > 0);
+ return -EFSCORRUPTED;
+ }
+ if (cursor->bmap.br_startoff > fileoff) {
+ ASSERT(cursor->bmap.br_startoff == fileoff);
+ return -EFSCORRUPTED;
+ }
+
+ cursor->if_seq = ifp->if_seq;
+ trace_xchk_dquot_iter_revalidate_bmap(cursor, cursor->id);
+ return 0;
+}
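+
+/*
+ * A worked example of the id-to-offset math above (the qi_dqperchunk value
+ * is hypothetical): if qi_dqperchunk were 30, dquot id 100 would live in
+ * quota file block 100 / 30 = 3, which holds ids 90 through 119. The
+ * cached mapping stays usable only while it still covers that offset and
+ * the data fork sequence number is unchanged.
+ */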
+
+/* Advance the dqiter cursor to the next non-sparse region of the quota file. */
+STATIC int
+xchk_dquot_iter_advance_bmap(
+ struct xchk_dqiter *cursor,
+ uint64_t *next_ondisk_id)
+{
+ struct xfs_quotainfo *qi = cursor->sc->mp->m_quotainfo;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(cursor->quota_ip,
+ XFS_DATA_FORK);
+ xfs_fileoff_t fileoff;
+ uint64_t next_id;
+ int nmaps = 1;
+ int error;
+
+ /* Find the dquot id for the next non-hole mapping. */
+ do {
+ fileoff = cursor->bmap.br_startoff + cursor->bmap.br_blockcount;
+ if (fileoff > XFS_DQ_ID_MAX / qi->qi_dqperchunk) {
+ /* The hole goes beyond the max dquot id, we're done */
+ *next_ondisk_id = -1ULL;
+ return 0;
+ }
+
+ error = xfs_bmapi_read(cursor->quota_ip, fileoff,
+ XFS_MAX_FILEOFF - fileoff, &cursor->bmap,
+ &nmaps, 0);
+ if (error)
+ return error;
+ if (!nmaps) {
+ /* Must have reached the end of the mappings. */
+ *next_ondisk_id = -1ULL;
+ return 0;
+ }
+ if (cursor->bmap.br_startoff > fileoff) {
+ ASSERT(cursor->bmap.br_startoff == fileoff);
+ return -EFSCORRUPTED;
+ }
+ } while (!xfs_bmap_is_real_extent(&cursor->bmap));
+
+ next_id = cursor->bmap.br_startoff * qi->qi_dqperchunk;
+ if (next_id > XFS_DQ_ID_MAX) {
+ /* The next mapping starts beyond the max dquot id, we're done */
+ *next_ondisk_id = -1ULL;
+ return 0;
+ }
+
+ /* Propose jumping forward to the dquot in the next allocated block. */
+ *next_ondisk_id = next_id;
+ cursor->if_seq = ifp->if_seq;
+ trace_xchk_dquot_iter_advance_bmap(cursor, *next_ondisk_id);
+ return 0;
+}
+
+/*
+ * Find the id of the next highest incore dquot. Normally this will correspond
+ * exactly with the quota file block mappings, but repair might have erased a
+ * mapping because it was crosslinked; in that case, we need to re-allocate the
+ * space so that we can reset q_blkno.
+ */
+STATIC void
+xchk_dquot_iter_advance_incore(
+ struct xchk_dqiter *cursor,
+ uint64_t *next_incore_id)
+{
+ struct xfs_quotainfo *qi = cursor->sc->mp->m_quotainfo;
+ struct radix_tree_root *tree = xfs_dquot_tree(qi, cursor->dqtype);
+ struct xfs_dquot *dq;
+ unsigned int nr_found;
+
+ *next_incore_id = -1ULL;
+
+ mutex_lock(&qi->qi_tree_lock);
+ nr_found = radix_tree_gang_lookup(tree, (void **)&dq, cursor->id, 1);
+ if (nr_found)
+ *next_incore_id = dq->q_id;
+ mutex_unlock(&qi->qi_tree_lock);
+
+ trace_xchk_dquot_iter_advance_incore(cursor, *next_incore_id);
+}
+
+/*
+ * Walk all incore dquots of this filesystem. Caller must initialize
+ * @cursor with xchk_dqiter_init before the first call, and must not hold
+ * the quota file ILOCK.
+ * Returns 1 and a valid *@dqpp; 0 and *@dqpp == NULL when there are no more
+ * dquots to iterate; or a negative errno.
+ */
+int
+xchk_dquot_iter(
+ struct xchk_dqiter *cursor,
+ struct xfs_dquot **dqpp)
+{
+ struct xfs_mount *mp = cursor->sc->mp;
+ struct xfs_dquot *dq = NULL;
+ uint64_t next_ondisk, next_incore = -1ULL;
+ unsigned int lock_mode;
+ int error = 0;
+
+ if (cursor->id > XFS_DQ_ID_MAX)
+ return 0;
+ next_ondisk = cursor->id;
+
+ /* Revalidate and/or advance the cursor. */
+ lock_mode = xfs_ilock_data_map_shared(cursor->quota_ip);
+ error = xchk_dquot_iter_revalidate_bmap(cursor);
+ if (!error && !xfs_bmap_is_real_extent(&cursor->bmap))
+ error = xchk_dquot_iter_advance_bmap(cursor, &next_ondisk);
+ xfs_iunlock(cursor->quota_ip, lock_mode);
+ if (error)
+ return error;
+
+ if (next_ondisk > cursor->id)
+ xchk_dquot_iter_advance_incore(cursor, &next_incore);
+
+ /* Pick the next dquot in the sequence and return it. */
+ cursor->id = min(next_ondisk, next_incore);
+ if (cursor->id > XFS_DQ_ID_MAX)
+ return 0;
+
+ trace_xchk_dquot_iter(cursor, cursor->id);
+
+ error = xfs_qm_dqget(mp, cursor->id, cursor->dqtype, false, &dq);
+ if (error)
+ return error;
+
+ cursor->id = dq->q_id + 1;
+ *dqpp = dq;
+ return 1;
+}
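+
+/*
+ * A minimal usage sketch for this iterator (examine_dquot is a
+ * hypothetical callback, not part of this patch):
+ *
+ *    struct xchk_dqiter cursor = { };
+ *    struct xfs_dquot *dq;
+ *    int error;
+ *
+ *    xchk_dqiter_init(&cursor, sc, XFS_DQTYPE_USER);
+ *    while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) {
+ *        error = examine_dquot(sc, dq);
+ *        xfs_qm_dqput(dq);
+ *        if (error)
+ *            break;
+ *    }
+ */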
diff --git a/fs/xfs/scrub/fsb_bitmap.h b/fs/xfs/scrub/fsb_bitmap.h
new file mode 100644
index 000000000000..40b462c1dd0d
--- /dev/null
+++ b/fs/xfs/scrub/fsb_bitmap.h
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2018-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_FSB_BITMAP_H__
+#define __XFS_SCRUB_FSB_BITMAP_H__
+
+/* Bitmaps, but type-checked for xfs_fsblock_t */
+
+struct xfsb_bitmap {
+ struct xbitmap64 fsbitmap;
+};
+
+static inline void xfsb_bitmap_init(struct xfsb_bitmap *bitmap)
+{
+ xbitmap64_init(&bitmap->fsbitmap);
+}
+
+static inline void xfsb_bitmap_destroy(struct xfsb_bitmap *bitmap)
+{
+ xbitmap64_destroy(&bitmap->fsbitmap);
+}
+
+static inline int xfsb_bitmap_set(struct xfsb_bitmap *bitmap,
+ xfs_fsblock_t start, xfs_filblks_t len)
+{
+ return xbitmap64_set(&bitmap->fsbitmap, start, len);
+}
+
+static inline int xfsb_bitmap_walk(struct xfsb_bitmap *bitmap,
+ xbitmap64_walk_fn fn, void *priv)
+{
+ return xbitmap64_walk(&bitmap->fsbitmap, fn, priv);
+}
+
+#endif /* __XFS_SCRUB_FSB_BITMAP_H__ */
diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c
index 5e2b09ed6e29..531006910ca9 100644
--- a/fs/xfs/scrub/health.c
+++ b/fs/xfs/scrub/health.c
@@ -10,8 +10,6 @@
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
-#include "xfs_trans_resv.h"
-#include "xfs_mount.h"
#include "xfs_ag.h"
#include "xfs_health.h"
#include "scrub/scrub.h"
@@ -118,6 +116,38 @@ xchk_health_mask_for_scrub_type(
}
/*
+ * If the scrub state is clean, add @mask to the scrub sick mask to clear
+ * additional sick flags from the metadata object's sick state.
+ */
+void
+xchk_mark_healthy_if_clean(
+ struct xfs_scrub *sc,
+ unsigned int mask)
+{
+ if (!(sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
+ XFS_SCRUB_OFLAG_XCORRUPT)))
+ sc->sick_mask |= mask;
+}
+
+/*
+ * If we're scrubbing a piece of file metadata for the first time, does it look
+ * like it has been zapped? Skip the check if we just repaired the metadata
+ * and are revalidating it.
+ */
+bool
+xchk_file_looks_zapped(
+ struct xfs_scrub *sc,
+ unsigned int mask)
+{
+ ASSERT((mask & ~XFS_SICK_INO_ZAPPED) == 0);
+
+ if (sc->flags & XREP_ALREADY_FIXED)
+ return false;
+
+ return xfs_inode_has_sickness(sc->ip, mask);
+}
+
+/*
* Update filesystem health assessments based on what we found and did.
*
* If the scrubber finds errors, we mark sick whatever's mentioned in
diff --git a/fs/xfs/scrub/health.h b/fs/xfs/scrub/health.h
index 66a273f8585b..a731b2467399 100644
--- a/fs/xfs/scrub/health.h
+++ b/fs/xfs/scrub/health.h
@@ -10,5 +10,7 @@ unsigned int xchk_health_mask_for_scrub_type(__u32 scrub_type);
void xchk_update_health(struct xfs_scrub *sc);
bool xchk_ag_btree_healthy_enough(struct xfs_scrub *sc, struct xfs_perag *pag,
xfs_btnum_t btnum);
+void xchk_mark_healthy_if_clean(struct xfs_scrub *sc, unsigned int mask);
+bool xchk_file_looks_zapped(struct xfs_scrub *sc, unsigned int mask);
#endif /* __XFS_SCRUB_HEALTH_H__ */
diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c
index fb7bbf47ae5d..a720fc62262a 100644
--- a/fs/xfs/scrub/ialloc.c
+++ b/fs/xfs/scrub/ialloc.c
@@ -585,7 +585,7 @@ xchk_iallocbt_rec(
uint16_t holemask;
xfs_inobt_btrec_to_irec(mp, rec, &irec);
- if (xfs_inobt_check_irec(bs->cur, &irec) != NULL) {
+ if (xfs_inobt_check_irec(bs->cur->bc_ag.pag, &irec) != NULL) {
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
return 0;
}
@@ -708,11 +708,10 @@ xchk_iallocbt_xref_rmap_inodes(
xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0);
}
-/* Scrub the inode btrees for some AG. */
-STATIC int
+/* Scrub one of the inode btrees for some AG. */
+int
xchk_iallocbt(
- struct xfs_scrub *sc,
- xfs_btnum_t which)
+ struct xfs_scrub *sc)
{
struct xfs_btree_cur *cur;
struct xchk_iallocbt iabt = {
@@ -720,9 +719,23 @@ xchk_iallocbt(
.next_startino = NULLAGINO,
.next_cluster_ino = NULLAGINO,
};
+ xfs_btnum_t which;
int error;
- cur = which == XFS_BTNUM_INO ? sc->sa.ino_cur : sc->sa.fino_cur;
+ switch (sc->sm->sm_type) {
+ case XFS_SCRUB_TYPE_INOBT:
+ cur = sc->sa.ino_cur;
+ which = XFS_BTNUM_INO;
+ break;
+ case XFS_SCRUB_TYPE_FINOBT:
+ cur = sc->sa.fino_cur;
+ which = XFS_BTNUM_FINO;
+ break;
+ default:
+ ASSERT(0);
+ return -EIO;
+ }
+
error = xchk_btree(sc, cur, xchk_iallocbt_rec, &XFS_RMAP_OINFO_INOBT,
&iabt);
if (error)
@@ -743,20 +756,6 @@ xchk_iallocbt(
return error;
}
-int
-xchk_inobt(
- struct xfs_scrub *sc)
-{
- return xchk_iallocbt(sc, XFS_BTNUM_INO);
-}
-
-int
-xchk_finobt(
- struct xfs_scrub *sc)
-{
- return xchk_iallocbt(sc, XFS_BTNUM_FINO);
-}
-
/* See if an inode btree has (or doesn't have) an inode chunk record. */
static inline void
xchk_xref_inode_check(
diff --git a/fs/xfs/scrub/ialloc_repair.c b/fs/xfs/scrub/ialloc_repair.c
new file mode 100644
index 000000000000..b3f7182dd2f5
--- /dev/null
+++ b/fs/xfs/scrub/ialloc_repair.c
@@ -0,0 +1,884 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2018-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_icache.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_log.h"
+#include "xfs_trans_priv.h"
+#include "xfs_error.h"
+#include "xfs_health.h"
+#include "xfs_ag.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/bitmap.h"
+#include "scrub/agb_bitmap.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/newbt.h"
+#include "scrub/reap.h"
+
+/*
+ * Inode Btree Repair
+ * ==================
+ *
+ * A quick refresher of inode btrees on a v5 filesystem:
+ *
+ * - Inode records are read into memory in units of 'inode clusters'. The
+ * number of inodes that fit in a cluster buffer is the smallest number of
+ * inodes that can be allocated or freed at once. Clusters are never
+ * smaller than one fs block
+ * though they can span multiple blocks. The size (in fs blocks) is
+ * computed with xfs_icluster_size_fsb(). The fs block alignment of a
+ * cluster is computed with xfs_ialloc_cluster_alignment().
+ *
+ * - Each inode btree record can describe a single 'inode chunk'. The chunk
+ * size is defined to be 64 inodes. If sparse inodes are enabled, every
+ * inobt record must be aligned to the chunk size; if not, every record must
+ * be aligned to the start of a cluster. It is possible to construct an XFS
+ * geometry where one inobt record maps to multiple inode clusters; it is
+ * also possible to construct a geometry where multiple inobt records map to
+ * different parts of one inode cluster.
+ *
+ * - If sparse inodes are not enabled, the smallest unit of allocation for
+ * inode records is enough to contain one inode chunk's worth of inodes.
+ *
+ * - If sparse inodes are enabled, the holemask field will be active. Each
+ * bit of the holemask represents 4 potential inodes; if set, the
+ * corresponding space does *not* contain inodes and must be left alone.
+ * Clusters cannot be smaller than 4 inodes. The smallest unit of allocation
+ * of inode records is one inode cluster.
+ *
+ * So what's the rebuild algorithm?
+ *
+ * Iterate the reverse mapping records looking for OWN_INODES and OWN_INOBT
+ * records. The OWN_INOBT records are the old inode btree blocks and will be
+ * cleared out after we've rebuilt the tree. Each possible inode cluster
+ * within an OWN_INODES record will be read in; for each possible inobt record
+ * associated with that cluster, compute the freemask from the
+ * i_mode data in the inode chunk. For sparse inodes the holemask will be
+ * calculated by creating the properly aligned inobt record and punching out
+ * any chunk that's missing. Inode allocations and frees grab the AGI first,
+ * so repair protects itself from concurrent access by locking the AGI.
+ *
+ * Once we've reconstructed all the inode records, we can create new inode
+ * btree roots and reload the btrees. We rebuild both inode trees at the same
+ * time because they have the same rmap owner and it would be more complex to
+ * figure out if the other tree isn't in need of a rebuild and which OWN_INOBT
+ * blocks it owns. We have all the data we need to build both, so dump
+ * everything and start over.
+ *
+ * We use the prefix 'xrep_ibt' because we rebuild both inode btrees at once.
+ */
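+
+/*
+ * A concrete geometry sketch (the sizes are only an example): with
+ * 256-byte inodes and 4096-byte blocks there are 16 inodes per block, so a
+ * 64-inode chunk spans 4 blocks. With an 8192-byte cluster buffer, each
+ * cluster holds 32 inodes and two clusters make up one chunk. Each of the
+ * 16 holemask bits covers 4 inodes, so the whole mask spans exactly one
+ * 64-inode chunk.
+ */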
+
+struct xrep_ibt {
+ /* Record under construction. */
+ struct xfs_inobt_rec_incore rie;
+
+ /* new inobt information */
+ struct xrep_newbt new_inobt;
+
+ /* new finobt information */
+ struct xrep_newbt new_finobt;
+
+ /* Old inode btree blocks we found in the rmap. */
+ struct xagb_bitmap old_iallocbt_blocks;
+
+ /* Reconstructed inode records. */
+ struct xfarray *inode_records;
+
+ struct xfs_scrub *sc;
+
+ /* Number of inodes assigned disk space. */
+ unsigned int icount;
+
+ /* Number of inodes in use. */
+ unsigned int iused;
+
+ /* Number of finobt records needed. */
+ unsigned int finobt_recs;
+
+ /* get_records()'s position in the inode record array. */
+ xfarray_idx_t array_cur;
+};
+
+/*
+ * Is this inode in use? If the inode is in memory we can tell from i_mode,
+ * otherwise we have to check di_mode in the on-disk buffer. We only care
+ * that the high (i.e. non-permission) bits of the mode are zero. This
+ * should be safe because repair keeps all AG headers locked until the end,
+ * and any process trying to perform an inode allocation/free must lock the
+ * AGI.
+ *
+ * @cluster_ag_base is the inode offset of the cluster within the AG.
+ * @cluster_bp is the cluster buffer.
+ * @cluster_index is the inode offset within the inode cluster.
+ */
+STATIC int
+xrep_ibt_check_ifree(
+ struct xrep_ibt *ri,
+ xfs_agino_t cluster_ag_base,
+ struct xfs_buf *cluster_bp,
+ unsigned int cluster_index,
+ bool *inuse)
+{
+ struct xfs_scrub *sc = ri->sc;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_dinode *dip;
+ xfs_ino_t fsino;
+ xfs_agino_t agino;
+ xfs_agnumber_t agno = ri->sc->sa.pag->pag_agno;
+ unsigned int cluster_buf_base;
+ unsigned int offset;
+ int error;
+
+ agino = cluster_ag_base + cluster_index;
+ fsino = XFS_AGINO_TO_INO(mp, agno, agino);
+
+ /* Inode uncached or half assembled, read disk buffer */
+ cluster_buf_base = XFS_INO_TO_OFFSET(mp, cluster_ag_base);
+ offset = (cluster_buf_base + cluster_index) * mp->m_sb.sb_inodesize;
+ if (offset >= BBTOB(cluster_bp->b_length))
+ return -EFSCORRUPTED;
+ dip = xfs_buf_offset(cluster_bp, offset);
+ if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC)
+ return -EFSCORRUPTED;
+
+ if (dip->di_version >= 3 && be64_to_cpu(dip->di_ino) != fsino)
+ return -EFSCORRUPTED;
+
+ /* Will the in-core inode tell us if it's in use? */
+ error = xchk_inode_is_allocated(sc, agino, inuse);
+ if (!error)
+ return 0;
+
+ *inuse = dip->di_mode != 0;
+ return 0;
+}
+
+/* Stash the accumulated inobt record for rebuilding. */
+STATIC int
+xrep_ibt_stash(
+ struct xrep_ibt *ri)
+{
+ int error = 0;
+
+ if (xchk_should_terminate(ri->sc, &error))
+ return error;
+
+ ri->rie.ir_freecount = xfs_inobt_rec_freecount(&ri->rie);
+ if (xfs_inobt_check_irec(ri->sc->sa.pag, &ri->rie) != NULL)
+ return -EFSCORRUPTED;
+
+ if (ri->rie.ir_freecount > 0)
+ ri->finobt_recs++;
+
+ trace_xrep_ibt_found(ri->sc->mp, ri->sc->sa.pag->pag_agno, &ri->rie);
+
+ error = xfarray_append(ri->inode_records, &ri->rie);
+ if (error)
+ return error;
+
+ ri->rie.ir_startino = NULLAGINO;
+ return 0;
+}
+
+/*
+ * Given an extent of inodes and an inode cluster buffer, calculate the
+ * location of the corresponding inobt record (creating it if necessary),
+ * then update the parts of the holemask and freemask of that record that
+ * correspond to the inode extent we were given.
+ *
+ * @cluster_ir_startino is the AG inode number of an inobt record that we're
+ * proposing to create for this inode cluster. If sparse inodes are enabled,
+ * we must round down to a chunk boundary to find the actual sparse record.
+ * @cluster_bp is the buffer of the inode cluster.
+ * @nr_inodes is the number of inodes to check from the cluster.
+ */
+STATIC int
+xrep_ibt_cluster_record(
+ struct xrep_ibt *ri,
+ xfs_agino_t cluster_ir_startino,
+ struct xfs_buf *cluster_bp,
+ unsigned int nr_inodes)
+{
+ struct xfs_scrub *sc = ri->sc;
+ struct xfs_mount *mp = sc->mp;
+ xfs_agino_t ir_startino;
+ unsigned int cluster_base;
+ unsigned int cluster_index;
+ int error = 0;
+
+ ir_startino = cluster_ir_startino;
+ if (xfs_has_sparseinodes(mp))
+ ir_startino = rounddown(ir_startino, XFS_INODES_PER_CHUNK);
+ cluster_base = cluster_ir_startino - ir_startino;
+
+ /*
+ * If the accumulated inobt record doesn't map this cluster, add it to
+ * the list and reset it.
+ */
+ if (ri->rie.ir_startino != NULLAGINO &&
+ ri->rie.ir_startino + XFS_INODES_PER_CHUNK <= ir_startino) {
+ error = xrep_ibt_stash(ri);
+ if (error)
+ return error;
+ }
+
+ if (ri->rie.ir_startino == NULLAGINO) {
+ ri->rie.ir_startino = ir_startino;
+ ri->rie.ir_free = XFS_INOBT_ALL_FREE;
+ ri->rie.ir_holemask = 0xFFFF;
+ ri->rie.ir_count = 0;
+ }
+
+ /* Record the whole cluster. */
+ ri->icount += nr_inodes;
+ ri->rie.ir_count += nr_inodes;
+ ri->rie.ir_holemask &= ~xfs_inobt_maskn(
+ cluster_base / XFS_INODES_PER_HOLEMASK_BIT,
+ nr_inodes / XFS_INODES_PER_HOLEMASK_BIT);
+
+ /* Which inodes within this cluster are free? */
+ for (cluster_index = 0; cluster_index < nr_inodes; cluster_index++) {
+ bool inuse = false;
+
+ error = xrep_ibt_check_ifree(ri, cluster_ir_startino,
+ cluster_bp, cluster_index, &inuse);
+ if (error)
+ return error;
+ if (!inuse)
+ continue;
+ ri->iused++;
+ ri->rie.ir_free &= ~XFS_INOBT_MASK(cluster_base +
+ cluster_index);
+ }
+ return 0;
+}
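+
+/*
+ * To make the holemask arithmetic above concrete (made-up geometry): for a
+ * 32-inode cluster at cluster_base 32 within a 64-inode chunk, the code
+ * clears xfs_inobt_maskn(32 / 4, 32 / 4) = bits 8-15, marking the second
+ * half of the chunk as present rather than a hole.
+ */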
+
+/*
+ * For each inode cluster covering the physical extent recorded by the rmapbt,
+ * we must calculate the properly aligned startino of that cluster, then
+ * iterate each cluster to fill in used and filled masks appropriately. We
+ * then use the (startino, used, filled) information to construct the
+ * appropriate inode records.
+ */
+STATIC int
+xrep_ibt_process_cluster(
+ struct xrep_ibt *ri,
+ xfs_agblock_t cluster_bno)
+{
+ struct xfs_imap imap;
+ struct xfs_buf *cluster_bp;
+ struct xfs_scrub *sc = ri->sc;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_ino_geometry *igeo = M_IGEO(mp);
+ xfs_agino_t cluster_ag_base;
+ xfs_agino_t irec_index;
+ unsigned int nr_inodes;
+ int error;
+
+ nr_inodes = min_t(unsigned int, igeo->inodes_per_cluster,
+ XFS_INODES_PER_CHUNK);
+
+ /*
+ * Grab the inode cluster buffer. This is safe to do with a broken
+ * inobt because imap_to_bp directly maps the buffer without touching
+ * either inode btree.
+ */
+ imap.im_blkno = XFS_AGB_TO_DADDR(mp, sc->sa.pag->pag_agno, cluster_bno);
+ imap.im_len = XFS_FSB_TO_BB(mp, igeo->blocks_per_cluster);
+ imap.im_boffset = 0;
+ error = xfs_imap_to_bp(mp, sc->tp, &imap, &cluster_bp);
+ if (error)
+ return error;
+
+ /*
+ * Record the contents of each possible inobt record mapping this
+ * cluster.
+ */
+ cluster_ag_base = XFS_AGB_TO_AGINO(mp, cluster_bno);
+ for (irec_index = 0;
+ irec_index < igeo->inodes_per_cluster;
+ irec_index += XFS_INODES_PER_CHUNK) {
+ error = xrep_ibt_cluster_record(ri,
+ cluster_ag_base + irec_index, cluster_bp,
+ nr_inodes);
+ if (error)
+ break;
+ }
+
+ xfs_trans_brelse(sc->tp, cluster_bp);
+ return error;
+}
+
+/* Check for any obvious conflicts in the inode chunk extent. */
+STATIC int
+xrep_ibt_check_inode_ext(
+ struct xfs_scrub *sc,
+ xfs_agblock_t agbno,
+ xfs_extlen_t len)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_ino_geometry *igeo = M_IGEO(mp);
+ xfs_agino_t agino;
+ enum xbtree_recpacking outcome;
+ int error;
+
+ /* Inode records must be within the AG. */
+ if (!xfs_verify_agbext(sc->sa.pag, agbno, len))
+ return -EFSCORRUPTED;
+
+ /* The entire record must align to the inode cluster size. */
+ if (!IS_ALIGNED(agbno, igeo->blocks_per_cluster) ||
+ !IS_ALIGNED(agbno + len, igeo->blocks_per_cluster))
+ return -EFSCORRUPTED;
+
+ /*
+ * The entire record must also adhere to the inode cluster alignment
+ * size if sparse inodes are not enabled.
+ */
+ if (!xfs_has_sparseinodes(mp) &&
+ (!IS_ALIGNED(agbno, igeo->cluster_align) ||
+ !IS_ALIGNED(agbno + len, igeo->cluster_align)))
+ return -EFSCORRUPTED;
+
+ /*
+ * On a sparse inode fs, this cluster could be part of a sparse chunk.
+ * Sparse clusters must be aligned to sparse chunk alignment.
+ */
+ if (xfs_has_sparseinodes(mp) &&
+ (!IS_ALIGNED(agbno, mp->m_sb.sb_spino_align) ||
+ !IS_ALIGNED(agbno + len, mp->m_sb.sb_spino_align)))
+ return -EFSCORRUPTED;
+
+ /* Make sure the entire range of blocks are valid AG inodes. */
+ agino = XFS_AGB_TO_AGINO(mp, agbno);
+ if (!xfs_verify_agino(sc->sa.pag, agino))
+ return -EFSCORRUPTED;
+
+ agino = XFS_AGB_TO_AGINO(mp, agbno + len) - 1;
+ if (!xfs_verify_agino(sc->sa.pag, agino))
+ return -EFSCORRUPTED;
+
+ /* Make sure this isn't free space. */
+ error = xfs_alloc_has_records(sc->sa.bno_cur, agbno, len, &outcome);
+ if (error)
+ return error;
+ if (outcome != XBTREE_RECPACKING_EMPTY)
+ return -EFSCORRUPTED;
+
+ return 0;
+}
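+
+/*
+ * For example (hypothetical values): with blocks_per_cluster = 2, an
+ * extent starting at agbno 17 fails the IS_ALIGNED(agbno, 2) check above
+ * and is rejected as corrupt, while a 4-block extent at agbno 16 passes
+ * both the start and end alignment checks.
+ */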
+
+/* Found a fragment of the old inode btrees; dispose of them later. */
+STATIC int
+xrep_ibt_record_old_btree_blocks(
+ struct xrep_ibt *ri,
+ const struct xfs_rmap_irec *rec)
+{
+ if (!xfs_verify_agbext(ri->sc->sa.pag, rec->rm_startblock,
+ rec->rm_blockcount))
+ return -EFSCORRUPTED;
+
+ return xagb_bitmap_set(&ri->old_iallocbt_blocks, rec->rm_startblock,
+ rec->rm_blockcount);
+}
+
+/* Record extents that belong to inode cluster blocks. */
+STATIC int
+xrep_ibt_record_inode_blocks(
+ struct xrep_ibt *ri,
+ const struct xfs_rmap_irec *rec)
+{
+ struct xfs_mount *mp = ri->sc->mp;
+ struct xfs_ino_geometry *igeo = M_IGEO(mp);
+ xfs_agblock_t cluster_base;
+ int error;
+
+ error = xrep_ibt_check_inode_ext(ri->sc, rec->rm_startblock,
+ rec->rm_blockcount);
+ if (error)
+ return error;
+
+ trace_xrep_ibt_walk_rmap(mp, ri->sc->sa.pag->pag_agno,
+ rec->rm_startblock, rec->rm_blockcount, rec->rm_owner,
+ rec->rm_offset, rec->rm_flags);
+
+ /*
+ * Record the free/hole masks for each inode cluster that could be
+ * mapped by this rmap record.
+ */
+ for (cluster_base = 0;
+ cluster_base < rec->rm_blockcount;
+ cluster_base += igeo->blocks_per_cluster) {
+ error = xrep_ibt_process_cluster(ri,
+ rec->rm_startblock + cluster_base);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
+STATIC int
+xrep_ibt_walk_rmap(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xrep_ibt *ri = priv;
+ int error = 0;
+
+ if (xchk_should_terminate(ri->sc, &error))
+ return error;
+
+ switch (rec->rm_owner) {
+ case XFS_RMAP_OWN_INOBT:
+ return xrep_ibt_record_old_btree_blocks(ri, rec);
+ case XFS_RMAP_OWN_INODES:
+ return xrep_ibt_record_inode_blocks(ri, rec);
+ }
+ return 0;
+}
+
+/*
+ * Iterate all reverse mappings to find the inodes (OWN_INODES) and the inode
+ * btrees (OWN_INOBT). Figure out if we have enough free space to reconstruct
+ * the inode btrees. The caller must clean up the lists if anything goes
+ * wrong.
+ */
+STATIC int
+xrep_ibt_find_inodes(
+ struct xrep_ibt *ri)
+{
+ struct xfs_scrub *sc = ri->sc;
+ int error;
+
+ ri->rie.ir_startino = NULLAGINO;
+
+ /* Collect all reverse mappings for inode blocks. */
+ xrep_ag_btcur_init(sc, &sc->sa);
+ error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_ibt_walk_rmap, ri);
+ xchk_ag_btcur_free(&sc->sa);
+ if (error)
+ return error;
+
+ /* If we have a record ready to go, add it to the array. */
+ if (ri->rie.ir_startino != NULLAGINO)
+ return xrep_ibt_stash(ri);
+
+ return 0;
+}
+
+/* Update the AGI counters. */
+STATIC int
+xrep_ibt_reset_counters(
+ struct xrep_ibt *ri)
+{
+ struct xfs_scrub *sc = ri->sc;
+ struct xfs_agi *agi = sc->sa.agi_bp->b_addr;
+ unsigned int freecount = ri->icount - ri->iused;
+
+ /* Trigger inode count recalculation */
+ xfs_force_summary_recalc(sc->mp);
+
+ /*
+ * The AGI header contains extra information related to the inode
+ * btrees, so we must update those fields here.
+ */
+ agi->agi_count = cpu_to_be32(ri->icount);
+ agi->agi_freecount = cpu_to_be32(freecount);
+ xfs_ialloc_log_agi(sc->tp, sc->sa.agi_bp,
+ XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
+
+ /* Reinitialize with the values we just logged. */
+ return xrep_reinit_pagi(sc);
+}
+
+/* Retrieve finobt data for bulk load. */
+STATIC int
+xrep_fibt_get_records(
+ struct xfs_btree_cur *cur,
+ unsigned int idx,
+ struct xfs_btree_block *block,
+ unsigned int nr_wanted,
+ void *priv)
+{
+ struct xfs_inobt_rec_incore *irec = &cur->bc_rec.i;
+ struct xrep_ibt *ri = priv;
+ union xfs_btree_rec *block_rec;
+ unsigned int loaded;
+ int error;
+
+ for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
+ do {
+ error = xfarray_load(ri->inode_records,
+ ri->array_cur++, irec);
+ } while (error == 0 && xfs_inobt_rec_freecount(irec) == 0);
+ if (error)
+ return error;
+
+ block_rec = xfs_btree_rec_addr(cur, idx, block);
+ cur->bc_ops->init_rec_from_cur(cur, block_rec);
+ }
+
+ return loaded;
+}
+
+/* Retrieve inobt data for bulk load. */
+STATIC int
+xrep_ibt_get_records(
+ struct xfs_btree_cur *cur,
+ unsigned int idx,
+ struct xfs_btree_block *block,
+ unsigned int nr_wanted,
+ void *priv)
+{
+ struct xfs_inobt_rec_incore *irec = &cur->bc_rec.i;
+ struct xrep_ibt *ri = priv;
+ union xfs_btree_rec *block_rec;
+ unsigned int loaded;
+ int error;
+
+ for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
+ error = xfarray_load(ri->inode_records, ri->array_cur++, irec);
+ if (error)
+ return error;
+
+ block_rec = xfs_btree_rec_addr(cur, idx, block);
+ cur->bc_ops->init_rec_from_cur(cur, block_rec);
+ }
+
+ return loaded;
+}
+
+/* Feed one of the new inobt blocks to the bulk loader. */
+STATIC int
+xrep_ibt_claim_block(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr,
+ void *priv)
+{
+ struct xrep_ibt *ri = priv;
+
+ return xrep_newbt_claim_block(cur, &ri->new_inobt, ptr);
+}
+
+/* Feed one of the new finobt blocks to the bulk loader. */
+STATIC int
+xrep_fibt_claim_block(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr,
+ void *priv)
+{
+ struct xrep_ibt *ri = priv;
+
+ return xrep_newbt_claim_block(cur, &ri->new_finobt, ptr);
+}
+
+/* Make sure the records do not overlap in inumber address space. */
+STATIC int
+xrep_ibt_check_overlap(
+ struct xrep_ibt *ri)
+{
+ struct xfs_inobt_rec_incore irec;
+ xfarray_idx_t cur;
+ xfs_agino_t next_agino = 0;
+ int error = 0;
+
+ foreach_xfarray_idx(ri->inode_records, cur) {
+ if (xchk_should_terminate(ri->sc, &error))
+ return error;
+
+ error = xfarray_load(ri->inode_records, cur, &irec);
+ if (error)
+ return error;
+
+ if (irec.ir_startino < next_agino)
+ return -EFSCORRUPTED;
+
+ next_agino = irec.ir_startino + XFS_INODES_PER_CHUNK;
+ }
+
+ return error;
+}
+
+/* Build new inode btrees and dispose of the old one. */
+STATIC int
+xrep_ibt_build_new_trees(
+ struct xrep_ibt *ri)
+{
+ struct xfs_scrub *sc = ri->sc;
+ struct xfs_btree_cur *ino_cur;
+ struct xfs_btree_cur *fino_cur = NULL;
+ xfs_fsblock_t fsbno;
+ bool need_finobt;
+ int error;
+
+ need_finobt = xfs_has_finobt(sc->mp);
+
+ /*
+ * Create new btrees for staging all the inobt records we collected
+ * earlier. The records were collected in order of increasing agino,
+ * so we do not have to sort them. Ensure there are no overlapping
+ * records.
+ */
+ error = xrep_ibt_check_overlap(ri);
+ if (error)
+ return error;
+
+ /*
+ * The new inode btrees will not be rooted in the AGI until we've
+ * successfully rebuilt the tree.
+ *
+ * Start by setting up the inobt staging cursor.
+ */
+ fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno,
+ XFS_IBT_BLOCK(sc->mp));
+ xrep_newbt_init_ag(&ri->new_inobt, sc, &XFS_RMAP_OINFO_INOBT, fsbno,
+ XFS_AG_RESV_NONE);
+ ri->new_inobt.bload.claim_block = xrep_ibt_claim_block;
+ ri->new_inobt.bload.get_records = xrep_ibt_get_records;
+
+ ino_cur = xfs_inobt_stage_cursor(sc->sa.pag, &ri->new_inobt.afake,
+ XFS_BTNUM_INO);
+ error = xfs_btree_bload_compute_geometry(ino_cur, &ri->new_inobt.bload,
+ xfarray_length(ri->inode_records));
+ if (error)
+ goto err_inocur;
+
+ /* Set up finobt staging cursor. */
+ if (need_finobt) {
+ enum xfs_ag_resv_type resv = XFS_AG_RESV_METADATA;
+
+ if (sc->mp->m_finobt_nores)
+ resv = XFS_AG_RESV_NONE;
+
+ fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno,
+ XFS_FIBT_BLOCK(sc->mp));
+ xrep_newbt_init_ag(&ri->new_finobt, sc, &XFS_RMAP_OINFO_INOBT,
+ fsbno, resv);
+ ri->new_finobt.bload.claim_block = xrep_fibt_claim_block;
+ ri->new_finobt.bload.get_records = xrep_fibt_get_records;
+
+ fino_cur = xfs_inobt_stage_cursor(sc->sa.pag,
+ &ri->new_finobt.afake, XFS_BTNUM_FINO);
+ error = xfs_btree_bload_compute_geometry(fino_cur,
+ &ri->new_finobt.bload, ri->finobt_recs);
+ if (error)
+ goto err_finocur;
+ }
+
+ /* Last chance to abort before we start committing fixes. */
+ if (xchk_should_terminate(sc, &error))
+ goto err_finocur;
+
+ /* Reserve all the space we need to build the new btrees. */
+ error = xrep_newbt_alloc_blocks(&ri->new_inobt,
+ ri->new_inobt.bload.nr_blocks);
+ if (error)
+ goto err_finocur;
+
+ if (need_finobt) {
+ error = xrep_newbt_alloc_blocks(&ri->new_finobt,
+ ri->new_finobt.bload.nr_blocks);
+ if (error)
+ goto err_finocur;
+ }
+
+ /* Add all inobt records. */
+ ri->array_cur = XFARRAY_CURSOR_INIT;
+ error = xfs_btree_bload(ino_cur, &ri->new_inobt.bload, ri);
+ if (error)
+ goto err_finocur;
+
+ /* Add all finobt records. */
+ if (need_finobt) {
+ ri->array_cur = XFARRAY_CURSOR_INIT;
+ error = xfs_btree_bload(fino_cur, &ri->new_finobt.bload, ri);
+ if (error)
+ goto err_finocur;
+ }
+
+ /*
+ * Install the new btrees in the AG header. After this point the old
+ * btrees are no longer accessible and the new trees are live.
+ */
+ xfs_inobt_commit_staged_btree(ino_cur, sc->tp, sc->sa.agi_bp);
+ xfs_btree_del_cursor(ino_cur, 0);
+
+ if (fino_cur) {
+ xfs_inobt_commit_staged_btree(fino_cur, sc->tp, sc->sa.agi_bp);
+ xfs_btree_del_cursor(fino_cur, 0);
+ }
+
+ /* Reset the AGI counters now that we've changed the inode roots. */
+ error = xrep_ibt_reset_counters(ri);
+ if (error)
+ goto err_finobt;
+
+ /* Free unused blocks and bitmap. */
+ if (need_finobt) {
+ error = xrep_newbt_commit(&ri->new_finobt);
+ if (error)
+ goto err_inobt;
+ }
+ error = xrep_newbt_commit(&ri->new_inobt);
+ if (error)
+ return error;
+
+ return xrep_roll_ag_trans(sc);
+
+err_finocur:
+ if (need_finobt)
+ xfs_btree_del_cursor(fino_cur, error);
+err_inocur:
+ xfs_btree_del_cursor(ino_cur, error);
+err_finobt:
+ if (need_finobt)
+ xrep_newbt_cancel(&ri->new_finobt);
+err_inobt:
+ xrep_newbt_cancel(&ri->new_inobt);
+ return error;
+}
+
+/*
+ * Now that we've logged the roots of the new btrees, invalidate all of the
+ * old blocks and free them.
+ */
+STATIC int
+xrep_ibt_remove_old_trees(
+ struct xrep_ibt *ri)
+{
+ struct xfs_scrub *sc = ri->sc;
+ int error;
+
+ /*
+ * Free the old inode btree blocks if they're not in use. It's ok to
+ * reap with XFS_AG_RESV_NONE even if the finobt had a per-AG
+ * reservation because we reset the reservation before releasing the
+ * AGI and AGF header buffer locks.
+ */
+ error = xrep_reap_agblocks(sc, &ri->old_iallocbt_blocks,
+ &XFS_RMAP_OINFO_INOBT, XFS_AG_RESV_NONE);
+ if (error)
+ return error;
+
+ /*
+ * If the finobt is enabled and has a per-AG reservation, make sure we
+ * reinitialize the per-AG reservations.
+ */
+ if (xfs_has_finobt(sc->mp) && !sc->mp->m_finobt_nores)
+ sc->flags |= XREP_RESET_PERAG_RESV;
+
+ return 0;
+}
+
+/* Repair both inode btrees. */
+int
+xrep_iallocbt(
+ struct xfs_scrub *sc)
+{
+ struct xrep_ibt *ri;
+ struct xfs_mount *mp = sc->mp;
+ char *descr;
+ xfs_agino_t first_agino, last_agino;
+ int error = 0;
+
+ /* We require the rmapbt to rebuild anything. */
+ if (!xfs_has_rmapbt(mp))
+ return -EOPNOTSUPP;
+
+ ri = kzalloc(sizeof(struct xrep_ibt), XCHK_GFP_FLAGS);
+ if (!ri)
+ return -ENOMEM;
+ ri->sc = sc;
+
+ /* We rebuild both inode btrees. */
+ sc->sick_mask = XFS_SICK_AG_INOBT | XFS_SICK_AG_FINOBT;
+
+ /* Set up enough storage to handle an AG with nothing but inodes. */
+ xfs_agino_range(mp, sc->sa.pag->pag_agno, &first_agino, &last_agino);
+ last_agino /= XFS_INODES_PER_CHUNK;
+ descr = xchk_xfile_ag_descr(sc, "inode index records");
+ error = xfarray_create(descr, last_agino,
+ sizeof(struct xfs_inobt_rec_incore),
+ &ri->inode_records);
+ kfree(descr);
+ if (error)
+ goto out_ri;
+
+ /* Collect the inode data and find the old btree blocks. */
+ xagb_bitmap_init(&ri->old_iallocbt_blocks);
+ error = xrep_ibt_find_inodes(ri);
+ if (error)
+ goto out_bitmap;
+
+ /* Rebuild the inode indexes. */
+ error = xrep_ibt_build_new_trees(ri);
+ if (error)
+ goto out_bitmap;
+
+ /* Kill the old tree. */
+ error = xrep_ibt_remove_old_trees(ri);
+ if (error)
+ goto out_bitmap;
+
+out_bitmap:
+ xagb_bitmap_destroy(&ri->old_iallocbt_blocks);
+ xfarray_destroy(ri->inode_records);
+out_ri:
+ kfree(ri);
+ return error;
+}
+
+/* Make sure both btrees are ok after we've rebuilt them. */
+int
+xrep_revalidate_iallocbt(
+ struct xfs_scrub *sc)
+{
+ __u32 old_type = sc->sm->sm_type;
+ int error;
+
+ /*
+ * We must update sm_type temporarily so that the tree-to-tree cross
+ * reference checks will work in the correct direction, and also so
+ * that tracing will report correctly if there are more errors.
+ */
+ sc->sm->sm_type = XFS_SCRUB_TYPE_INOBT;
+ error = xchk_iallocbt(sc);
+ if (error)
+ goto out;
+
+ if (xfs_has_finobt(sc->mp)) {
+ sc->sm->sm_type = XFS_SCRUB_TYPE_FINOBT;
+ error = xchk_iallocbt(sc);
+ }
+
+out:
+ sc->sm->sm_type = old_type;
+ return error;
+}
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index 889f556bc98f..6e2fe2d6250b 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -25,6 +25,7 @@
#include "scrub/common.h"
#include "scrub/btree.h"
#include "scrub/trace.h"
+#include "scrub/repair.h"
/* Prepare the attached inode for scrubbing. */
static inline int
@@ -39,6 +40,10 @@ xchk_prepare_iscrub(
if (error)
return error;
+ error = xchk_ino_dqattach(sc);
+ if (error)
+ return error;
+
xchk_ilock(sc, XFS_ILOCK_EXCL);
return 0;
}
@@ -95,8 +100,8 @@ xchk_setup_inode(
if (!xfs_verify_ino(sc->mp, sc->sm->sm_ino))
return -ENOENT;
- /* Try a regular untrusted iget. */
- error = xchk_iget(sc, sc->sm->sm_ino, &ip);
+ /* Try a safe untrusted iget. */
+ error = xchk_iget_safe(sc, sc->sm->sm_ino, &ip);
if (!error)
return xchk_install_handle_iscrub(sc, ip);
if (error == -ENOENT)
@@ -181,8 +186,11 @@ xchk_setup_inode(
* saying the inode is allocated and the icache being unable to load
* the inode until we can flag the corruption in xchk_inode. The
* scrub function has to note the corruption, since we're not really
- * supposed to do that from the setup function.
+ * supposed to do that from the setup function. Save the mapping to
+ * make repairs to the ondisk inode buffer.
*/
+ if (xchk_could_repair(sc))
+ xrep_setup_inode(sc, &imap);
return 0;
out_cancel:
@@ -338,6 +346,10 @@ xchk_inode_flags2(
if (xfs_dinode_has_bigtime(dip) && !xfs_has_bigtime(mp))
goto bad;
+ /* no large extent counts without the filesystem feature */
+ if ((flags2 & XFS_DIFLAG2_NREXT64) && !xfs_has_large_extent_counts(mp))
+ goto bad;
+
return;
bad:
xchk_ino_set_corrupt(sc, ino);
@@ -548,7 +560,7 @@ xchk_dinode(
}
/* di_forkoff */
- if (XFS_DFORK_APTR(dip) >= (char *)dip + mp->m_sb.sb_inodesize)
+ if (XFS_DFORK_BOFF(dip) >= mp->m_sb.sb_inodesize)
xchk_ino_set_corrupt(sc, ino);
if (naextents != 0 && dip->di_forkoff == 0)
xchk_ino_set_corrupt(sc, ino);
diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c
new file mode 100644
index 000000000000..0ca62d59f84a
--- /dev/null
+++ b/fs/xfs/scrub/inode_repair.c
@@ -0,0 +1,1525 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2018-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_inode_buf.h"
+#include "xfs_inode_fork.h"
+#include "xfs_ialloc.h"
+#include "xfs_da_format.h"
+#include "xfs_reflink.h"
+#include "xfs_alloc.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_bmap_util.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_quota_defs.h"
+#include "xfs_quota.h"
+#include "xfs_ag.h"
+#include "xfs_rtbitmap.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_log_priv.h"
+#include "xfs_health.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+
+/*
+ * Inode Record Repair
+ * ===================
+ *
+ * Roughly speaking, inode problems can be classified based on whether or not
+ * they trip the dinode verifiers. If those trip, then we won't be able to
+ * xfs_iget ourselves the inode.
+ *
+ * Therefore, the xrep_dinode_* functions fix anything that would trip the
+ * inode buffer verifier or the dinode verifier. The xrep_inode_* functions
+ * fix things on live incore inodes. The inode repair functions make decisions
+ * with security and usability implications when reviving a file:
+ *
+ * - Files with zero di_mode or a garbage di_mode are converted to a regular
+ * file that only root can read. This file may not actually contain user data
+ * if the file was not previously a regular file. Setuid and setgid bits
+ * are cleared.
+ *
+ * - Zero-size directories can be truncated to look empty. It is necessary to
+ * run the bmapbtd and directory repair functions to fully rebuild the
+ * directory.
+ *
+ * - Zero-size symbolic link targets can be truncated to '?'. It is necessary
+ * to run the bmapbtd and symlink repair functions to salvage the symlink.
+ *
+ * - Invalid extent size hints will be removed.
+ *
+ * - Quotacheck will be scheduled if we repaired an inode that was so badly
+ * damaged that the ondisk inode had to be rebuilt.
+ *
+ * - Invalid user, group, or project IDs (aka -1U) will be reset to zero.
+ * Setuid and setgid bits are cleared.
+ *
+ * - Data and attr forks are reset to extents format with zero extents if the
+ * fork data is inconsistent. It is necessary to run the bmapbtd or bmapbta
+ * repair functions to recover the space mapping.
+ *
+ * - ACLs will not be recovered if the attr fork is zapped or the extended
+ * attribute structure itself requires salvaging.
+ *
+ * - If the attr fork is zapped, the user and group ids are reset to root and
+ * the setuid and setgid bits are removed.
+ */
+
+/*
+ * All the information we need to repair the ondisk inode if we can't iget the
+ * incore inode. We don't allocate this buffer unless we're going to perform
+ * a repair to the ondisk inode cluster buffer.
+ */
+struct xrep_inode {
+ /* Inode mapping that we saved from the initial lookup attempt. */
+ struct xfs_imap imap;
+
+ struct xfs_scrub *sc;
+
+ /* Blocks in use on the data device by data extents or bmbt blocks. */
+ xfs_rfsblock_t data_blocks;
+
+ /* Blocks in use on the rt device. */
+ xfs_rfsblock_t rt_blocks;
+
+ /* Blocks in use by the attr fork. */
+ xfs_rfsblock_t attr_blocks;
+
+ /* Number of data device extents for the data fork. */
+ xfs_extnum_t data_extents;
+
+ /*
+ * Number of realtime device extents for the data fork. If
+ * data_extents and rt_extents indicate that the data fork has extents
+ * on both devices, we'll just back away slowly.
+ */
+ xfs_extnum_t rt_extents;
+
+ /* Number of (data device) extents for the attr fork. */
+ xfs_aextnum_t attr_extents;
+
+ /* Sick state to set after zapping parts of the inode. */
+ unsigned int ino_sick_mask;
+
+ /* Must we remove all access from this file? */
+ bool zap_acls;
+};
+
+/*
+ * Setup function for inode repair. @imap contains the ondisk inode mapping
+ * information so that we can correct the ondisk inode cluster buffer if
+ * necessary to make iget work.
+ */
+int
+xrep_setup_inode(
+ struct xfs_scrub *sc,
+ const struct xfs_imap *imap)
+{
+ struct xrep_inode *ri;
+
+ sc->buf = kzalloc(sizeof(struct xrep_inode), XCHK_GFP_FLAGS);
+ if (!sc->buf)
+ return -ENOMEM;
+
+ ri = sc->buf;
+ memcpy(&ri->imap, imap, sizeof(struct xfs_imap));
+ ri->sc = sc;
+ return 0;
+}
+
+/*
+ * Make sure this ondisk inode can pass the inode buffer verifier. This is
+ * not the same as the dinode verifier.
+ */
+STATIC void
+xrep_dinode_buf_core(
+ struct xfs_scrub *sc,
+ struct xfs_buf *bp,
+ unsigned int ioffset)
+{
+ struct xfs_dinode *dip = xfs_buf_offset(bp, ioffset);
+ struct xfs_trans *tp = sc->tp;
+ struct xfs_mount *mp = sc->mp;
+ xfs_agino_t agino;
+ bool crc_ok = false;
+ bool magic_ok = false;
+ bool unlinked_ok = false;
+
+ agino = be32_to_cpu(dip->di_next_unlinked);
+
+ if (xfs_verify_agino_or_null(bp->b_pag, agino))
+ unlinked_ok = true;
+
+ if (dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
+ xfs_dinode_good_version(mp, dip->di_version))
+ magic_ok = true;
+
+ if (xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize,
+ XFS_DINODE_CRC_OFF))
+ crc_ok = true;
+
+ if (magic_ok && unlinked_ok && crc_ok)
+ return;
+
+ if (!magic_ok) {
+ dip->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
+ dip->di_version = 3;
+ }
+ if (!unlinked_ok)
+ dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
+ xfs_dinode_calc_crc(mp, dip);
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF);
+ xfs_trans_log_buf(tp, bp, ioffset,
+ ioffset + sizeof(struct xfs_dinode) - 1);
+}
+
+/* Make sure this inode cluster buffer can pass the inode buffer verifier. */
+STATIC void
+xrep_dinode_buf(
+ struct xfs_scrub *sc,
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = sc->mp;
+ int i;
+ int ni;
+
+ ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock;
+ for (i = 0; i < ni; i++)
+ xrep_dinode_buf_core(sc, bp, i << mp->m_sb.sb_inodelog);
+}
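+
+/*
+ * For example (illustrative geometry, not mandated by the verifier): with
+ * 4096-byte blocks and 512-byte inodes, sb_inopblock = 8 and sb_inodelog = 9,
+ * so a one-block cluster buffer is checked at byte offsets 0, 512, 1024,
+ * ..., 3584.
+ */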
+
+/* Reinitialize things that never change in an inode. */
+STATIC void
+xrep_dinode_header(
+ struct xfs_scrub *sc,
+ struct xfs_dinode *dip)
+{
+ trace_xrep_dinode_header(sc, dip);
+
+ dip->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
+ if (!xfs_dinode_good_version(sc->mp, dip->di_version))
+ dip->di_version = 3;
+ dip->di_ino = cpu_to_be64(sc->sm->sm_ino);
+ uuid_copy(&dip->di_uuid, &sc->mp->m_sb.sb_meta_uuid);
+ dip->di_gen = cpu_to_be32(sc->sm->sm_gen);
+}
+
+/* Turn di_mode into /something/ recognizable. */
+STATIC void
+xrep_dinode_mode(
+ struct xrep_inode *ri,
+ struct xfs_dinode *dip)
+{
+ struct xfs_scrub *sc = ri->sc;
+ uint16_t mode = be16_to_cpu(dip->di_mode);
+
+ trace_xrep_dinode_mode(sc, dip);
+
+ if (mode == 0 || xfs_mode_to_ftype(mode) != XFS_DIR3_FT_UNKNOWN)
+ return;
+
+ /* bad mode, so we set it to a file that only root can read */
+ mode = S_IFREG;
+ dip->di_mode = cpu_to_be16(mode);
+ dip->di_uid = 0;
+ dip->di_gid = 0;
+ ri->zap_acls = true;
+}
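+
+/*
+ * For example: di_mode = 0100644 (a regular file) maps to
+ * XFS_DIR3_FT_REG_FILE and is left alone, while a mode whose S_IFMT bits
+ * match no known file type maps to XFS_DIR3_FT_UNKNOWN, so the mode is
+ * replaced with a bare S_IFREG and the ids are reset to root.
+ */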
+
+/* Fix any conflicting flags that the verifiers complain about. */
+STATIC void
+xrep_dinode_flags(
+ struct xfs_scrub *sc,
+ struct xfs_dinode *dip,
+ bool isrt)
+{
+ struct xfs_mount *mp = sc->mp;
+ uint64_t flags2 = be64_to_cpu(dip->di_flags2);
+ uint16_t flags = be16_to_cpu(dip->di_flags);
+ uint16_t mode = be16_to_cpu(dip->di_mode);
+
+ trace_xrep_dinode_flags(sc, dip);
+
+ if (isrt)
+ flags |= XFS_DIFLAG_REALTIME;
+ else
+ flags &= ~XFS_DIFLAG_REALTIME;
+
+ /*
+ * For regular files on a reflink filesystem, set the REFLINK flag to
+ * protect shared extents. A later stage will actually check those
+ * extents and clear the flag if possible.
+ */
+ if (xfs_has_reflink(mp) && S_ISREG(mode))
+ flags2 |= XFS_DIFLAG2_REFLINK;
+ else
+ flags2 &= ~(XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE);
+ if (flags & XFS_DIFLAG_REALTIME)
+ flags2 &= ~XFS_DIFLAG2_REFLINK;
+ if (!xfs_has_bigtime(mp))
+ flags2 &= ~XFS_DIFLAG2_BIGTIME;
+ if (!xfs_has_large_extent_counts(mp))
+ flags2 &= ~XFS_DIFLAG2_NREXT64;
+ if (flags2 & XFS_DIFLAG2_NREXT64)
+ dip->di_nrext64_pad = 0;
+ else if (dip->di_version >= 3)
+ dip->di_v3_pad = 0;
+ dip->di_flags = cpu_to_be16(flags);
+ dip->di_flags2 = cpu_to_be64(flags2);
+}
+
+/*
+ * Blow out symlink; now it points nowhere. We don't have to worry about
+ * incore state because this inode is failing the verifiers.
+ */
+STATIC void
+xrep_dinode_zap_symlink(
+ struct xrep_inode *ri,
+ struct xfs_dinode *dip)
+{
+ struct xfs_scrub *sc = ri->sc;
+ char *p;
+
+ trace_xrep_dinode_zap_symlink(sc, dip);
+
+ dip->di_format = XFS_DINODE_FMT_LOCAL;
+ dip->di_size = cpu_to_be64(1);
+ p = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
+ *p = '?';
+ ri->ino_sick_mask |= XFS_SICK_INO_SYMLINK_ZAPPED;
+}
+
+/*
+ * Blow out dir, make the parent point to the root. In the future repair will
+ * reconstruct this directory for us. Note that there's no in-core directory
+ * inode because the sf verifier tripped, so we don't have to worry about the
+ * dentry cache.
+ */
+STATIC void
+xrep_dinode_zap_dir(
+ struct xrep_inode *ri,
+ struct xfs_dinode *dip)
+{
+ struct xfs_scrub *sc = ri->sc;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_dir2_sf_hdr *sfp;
+ int i8count;
+
+ trace_xrep_dinode_zap_dir(sc, dip);
+
+ dip->di_format = XFS_DINODE_FMT_LOCAL;
+ i8count = mp->m_sb.sb_rootino > XFS_DIR2_MAX_SHORT_INUM;
+ sfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
+ sfp->count = 0;
+ sfp->i8count = i8count;
+ xfs_dir2_sf_put_parent_ino(sfp, mp->m_sb.sb_rootino);
+ dip->di_size = cpu_to_be64(xfs_dir2_sf_hdr_size(i8count));
+ ri->ino_sick_mask |= XFS_SICK_INO_DIR_ZAPPED;
+}
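+
+/*
+ * For instance (illustrative numbers): with a 32-bit root inode number, the
+ * rebuilt shortform header is { count = 0, i8count = 0, parent = sb_rootino }
+ * and di_size becomes xfs_dir2_sf_hdr_size(0) = 6 bytes.
+ */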
+
+/* Make sure we don't have a garbage file size. */
+STATIC void
+xrep_dinode_size(
+ struct xrep_inode *ri,
+ struct xfs_dinode *dip)
+{
+ struct xfs_scrub *sc = ri->sc;
+ uint64_t size = be64_to_cpu(dip->di_size);
+ uint16_t mode = be16_to_cpu(dip->di_mode);
+
+ trace_xrep_dinode_size(sc, dip);
+
+ switch (mode & S_IFMT) {
+ case S_IFIFO:
+ case S_IFCHR:
+ case S_IFBLK:
+ case S_IFSOCK:
+ /* di_size can't be nonzero for special files */
+ dip->di_size = 0;
+ break;
+ case S_IFREG:
+ /* Regular files can't be larger than 2^63-1 bytes. */
+ dip->di_size = cpu_to_be64(size & ~(1ULL << 63));
+ break;
+ case S_IFLNK:
+ /*
+ * Truncate ridiculously oversized symlinks. If the size is
+ * zero, reset it to point to the current directory. Both of
+ * these conditions trigger dinode verifier errors, so there
+ * is no in-core state to reset.
+ */
+ if (size > XFS_SYMLINK_MAXLEN)
+ dip->di_size = cpu_to_be64(XFS_SYMLINK_MAXLEN);
+ else if (size == 0)
+ xrep_dinode_zap_symlink(ri, dip);
+ break;
+ case S_IFDIR:
+ /*
+ * Directories can't have a size larger than 32G. If the size
+ * is zero, reset it to an empty directory. Both of these
+ * conditions trigger dinode verifier errors, so there is no
+ * in-core state to reset.
+ */
+ if (size > XFS_DIR2_SPACE_SIZE)
+ dip->di_size = cpu_to_be64(XFS_DIR2_SPACE_SIZE);
+ else if (size == 0)
+ xrep_dinode_zap_dir(ri, dip);
+ break;
+ }
+}
+
+/* Fix extent size hints. */
+STATIC void
+xrep_dinode_extsize_hints(
+ struct xfs_scrub *sc,
+ struct xfs_dinode *dip)
+{
+ struct xfs_mount *mp = sc->mp;
+ uint64_t flags2 = be64_to_cpu(dip->di_flags2);
+ uint16_t flags = be16_to_cpu(dip->di_flags);
+ uint16_t mode = be16_to_cpu(dip->di_mode);
+
+ xfs_failaddr_t fa;
+
+ trace_xrep_dinode_extsize_hints(sc, dip);
+
+ fa = xfs_inode_validate_extsize(mp, be32_to_cpu(dip->di_extsize),
+ mode, flags);
+ if (fa) {
+ dip->di_extsize = 0;
+ dip->di_flags &= ~cpu_to_be16(XFS_DIFLAG_EXTSIZE |
+ XFS_DIFLAG_EXTSZINHERIT);
+ }
+
+ if (dip->di_version < 3)
+ return;
+
+ fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize),
+ mode, flags, flags2);
+ if (fa) {
+ dip->di_cowextsize = 0;
+ dip->di_flags2 &= ~cpu_to_be64(XFS_DIFLAG2_COWEXTSIZE);
+ }
+}
+
+/* Count extents and blocks for an inode given an rmap. */
+STATIC int
+xrep_dinode_walk_rmap(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xrep_inode *ri = priv;
+ int error = 0;
+
+ if (xchk_should_terminate(ri->sc, &error))
+ return error;
+
+ /* We only care about this inode. */
+ if (rec->rm_owner != ri->sc->sm->sm_ino)
+ return 0;
+
+ if (rec->rm_flags & XFS_RMAP_ATTR_FORK) {
+ ri->attr_blocks += rec->rm_blockcount;
+ if (!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK))
+ ri->attr_extents++;
+
+ return 0;
+ }
+
+ ri->data_blocks += rec->rm_blockcount;
+ if (!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK))
+ ri->data_extents++;
+
+ return 0;
+}
+
+/* Count extents and blocks for an inode from all AG rmap data. */
+STATIC int
+xrep_dinode_count_ag_rmaps(
+ struct xrep_inode *ri,
+ struct xfs_perag *pag)
+{
+ struct xfs_btree_cur *cur;
+ struct xfs_buf *agf;
+ int error;
+
+ error = xfs_alloc_read_agf(pag, ri->sc->tp, 0, &agf);
+ if (error)
+ return error;
+
+ cur = xfs_rmapbt_init_cursor(ri->sc->mp, ri->sc->tp, agf, pag);
+ error = xfs_rmap_query_all(cur, xrep_dinode_walk_rmap, ri);
+ xfs_btree_del_cursor(cur, error);
+ xfs_trans_brelse(ri->sc->tp, agf);
+ return error;
+}
+
+/* Count extents and blocks for a given inode from all rmap data. */
+STATIC int
+xrep_dinode_count_rmaps(
+ struct xrep_inode *ri)
+{
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno;
+ int error;
+
+ if (!xfs_has_rmapbt(ri->sc->mp) || xfs_has_realtime(ri->sc->mp))
+ return -EOPNOTSUPP;
+
+ for_each_perag(ri->sc->mp, agno, pag) {
+ error = xrep_dinode_count_ag_rmaps(ri, pag);
+ if (error) {
+ xfs_perag_rele(pag);
+ return error;
+ }
+ }
+
+ /* Can't have extents on both the rt and the data device. */
+ if (ri->data_extents && ri->rt_extents)
+ return -EFSCORRUPTED;
+
+ trace_xrep_dinode_count_rmaps(ri->sc,
+ ri->data_blocks, ri->rt_blocks, ri->attr_blocks,
+ ri->data_extents, ri->rt_extents, ri->attr_extents);
+ return 0;
+}
+
+/* Return true if this extents-format ifork looks like garbage. */
+STATIC bool
+xrep_dinode_bad_extents_fork(
+ struct xfs_scrub *sc,
+ struct xfs_dinode *dip,
+ unsigned int dfork_size,
+ int whichfork)
+{
+ struct xfs_bmbt_irec new;
+ struct xfs_bmbt_rec *dp;
+ xfs_extnum_t nex;
+ bool isrt;
+ unsigned int i;
+
+ nex = xfs_dfork_nextents(dip, whichfork);
+ if (nex > dfork_size / sizeof(struct xfs_bmbt_rec))
+ return true;
+
+ dp = XFS_DFORK_PTR(dip, whichfork);
+
+ isrt = dip->di_flags & cpu_to_be16(XFS_DIFLAG_REALTIME);
+ for (i = 0; i < nex; i++, dp++) {
+ xfs_failaddr_t fa;
+
+ xfs_bmbt_disk_get_all(dp, &new);
+ fa = xfs_bmap_validate_extent_raw(sc->mp, isrt, whichfork,
+ &new);
+ if (fa)
+ return true;
+ }
+
+ return false;
+}
+
+/* Return true if this btree-format ifork looks like garbage. */
+STATIC bool
+xrep_dinode_bad_bmbt_fork(
+ struct xfs_scrub *sc,
+ struct xfs_dinode *dip,
+ unsigned int dfork_size,
+ int whichfork)
+{
+ struct xfs_bmdr_block *dfp;
+ xfs_extnum_t nex;
+ unsigned int i;
+ unsigned int dmxr;
+ unsigned int nrecs;
+ unsigned int level;
+
+ nex = xfs_dfork_nextents(dip, whichfork);
+ if (nex <= dfork_size / sizeof(struct xfs_bmbt_rec))
+ return true;
+
+ if (dfork_size < sizeof(struct xfs_bmdr_block))
+ return true;
+
+ dfp = XFS_DFORK_PTR(dip, whichfork);
+ nrecs = be16_to_cpu(dfp->bb_numrecs);
+ level = be16_to_cpu(dfp->bb_level);
+
+ if (nrecs == 0 || XFS_BMDR_SPACE_CALC(nrecs) > dfork_size)
+ return true;
+ if (level == 0 || level >= XFS_BM_MAXLEVELS(sc->mp, whichfork))
+ return true;
+
+ dmxr = xfs_bmdr_maxrecs(dfork_size, 0);
+ for (i = 1; i <= nrecs; i++) {
+ struct xfs_bmbt_key *fkp;
+ xfs_bmbt_ptr_t *fpp;
+ xfs_fileoff_t fileoff;
+ xfs_fsblock_t fsbno;
+
+ fkp = XFS_BMDR_KEY_ADDR(dfp, i);
+ fileoff = be64_to_cpu(fkp->br_startoff);
+ if (!xfs_verify_fileoff(sc->mp, fileoff))
+ return true;
+
+ fpp = XFS_BMDR_PTR_ADDR(dfp, i, dmxr);
+ fsbno = be64_to_cpu(*fpp);
+ if (!xfs_verify_fsbno(sc->mp, fsbno))
+ return true;
+ }
+
+ return false;
+}
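+
+/*
+ * Rough sketch of the ondisk bmbt root layout validated above; the header is
+ * two be16 fields, and the pointer array offset is derived from
+ * xfs_bmdr_maxrecs(dfork_size, 0):
+ *
+ *  +----------------------+  start of the fork area
+ *  | bb_level, bb_numrecs |  struct xfs_bmdr_block
+ *  +----------------------+
+ *  | keys[1..numrecs]     |  startoff of each subtree
+ *  +----------------------+  ptrs begin after dmxr key slots
+ *  | ptrs[1..numrecs]     |  block number of each subtree
+ *  +----------------------+
+ */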
+
+/*
+ * Check the data fork for things that will fail the ifork verifiers or the
+ * ifork formatters.
+ */
+STATIC bool
+xrep_dinode_check_dfork(
+ struct xfs_scrub *sc,
+ struct xfs_dinode *dip,
+ uint16_t mode)
+{
+ void *dfork_ptr;
+ int64_t data_size;
+ unsigned int fmt;
+ unsigned int dfork_size;
+
+ /*
+ * Verifier functions take signed int64_t, so check for bogus negative
+ * values first.
+ */
+ data_size = be64_to_cpu(dip->di_size);
+ if (data_size < 0)
+ return true;
+
+ fmt = XFS_DFORK_FORMAT(dip, XFS_DATA_FORK);
+ switch (mode & S_IFMT) {
+ case S_IFIFO:
+ case S_IFCHR:
+ case S_IFBLK:
+ case S_IFSOCK:
+ if (fmt != XFS_DINODE_FMT_DEV)
+ return true;
+ break;
+ case S_IFREG:
+ if (fmt == XFS_DINODE_FMT_LOCAL)
+ return true;
+ fallthrough;
+ case S_IFLNK:
+ case S_IFDIR:
+ switch (fmt) {
+ case XFS_DINODE_FMT_LOCAL:
+ case XFS_DINODE_FMT_EXTENTS:
+ case XFS_DINODE_FMT_BTREE:
+ break;
+ default:
+ return true;
+ }
+ break;
+ default:
+ return true;
+ }
+
+ dfork_size = XFS_DFORK_SIZE(dip, sc->mp, XFS_DATA_FORK);
+ dfork_ptr = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
+
+ switch (fmt) {
+ case XFS_DINODE_FMT_DEV:
+ break;
+ case XFS_DINODE_FMT_LOCAL:
+ /* dir/symlink structure cannot be larger than the fork */
+ if (data_size > dfork_size)
+ return true;
+ /* directory structure must pass verification. */
+ if (S_ISDIR(mode) &&
+ xfs_dir2_sf_verify(sc->mp, dfork_ptr, data_size) != NULL)
+ return true;
+ /* symlink structure must pass verification. */
+ if (S_ISLNK(mode) &&
+ xfs_symlink_shortform_verify(dfork_ptr, data_size) != NULL)
+ return true;
+ break;
+ case XFS_DINODE_FMT_EXTENTS:
+ if (xrep_dinode_bad_extents_fork(sc, dip, dfork_size,
+ XFS_DATA_FORK))
+ return true;
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ if (xrep_dinode_bad_bmbt_fork(sc, dip, dfork_size,
+ XFS_DATA_FORK))
+ return true;
+ break;
+ default:
+ return true;
+ }
+
+ return false;
+}
+
+static void
+xrep_dinode_set_data_nextents(
+ struct xfs_dinode *dip,
+ xfs_extnum_t nextents)
+{
+ if (xfs_dinode_has_large_extent_counts(dip))
+ dip->di_big_nextents = cpu_to_be64(nextents);
+ else
+ dip->di_nextents = cpu_to_be32(nextents);
+}
+
+static void
+xrep_dinode_set_attr_nextents(
+ struct xfs_dinode *dip,
+ xfs_extnum_t nextents)
+{
+ if (xfs_dinode_has_large_extent_counts(dip))
+ dip->di_big_anextents = cpu_to_be32(nextents);
+ else
+ dip->di_anextents = cpu_to_be16(nextents);
+}
+
+/* Reset the data fork to something sane. */
+STATIC void
+xrep_dinode_zap_dfork(
+ struct xrep_inode *ri,
+ struct xfs_dinode *dip,
+ uint16_t mode)
+{
+ struct xfs_scrub *sc = ri->sc;
+
+ trace_xrep_dinode_zap_dfork(sc, dip);
+
+ ri->ino_sick_mask |= XFS_SICK_INO_BMBTD_ZAPPED;
+
+ xrep_dinode_set_data_nextents(dip, 0);
+ ri->data_blocks = 0;
+ ri->rt_blocks = 0;
+
+ /* Special files always get reset to DEV */
+ switch (mode & S_IFMT) {
+ case S_IFIFO:
+ case S_IFCHR:
+ case S_IFBLK:
+ case S_IFSOCK:
+ dip->di_format = XFS_DINODE_FMT_DEV;
+ dip->di_size = 0;
+ return;
+ }
+
+ /*
+ * If we have data extents, reset to an empty map and hope the user
+ * will run the bmapbtd checker next.
+ */
+ if (ri->data_extents || ri->rt_extents || S_ISREG(mode)) {
+ dip->di_format = XFS_DINODE_FMT_EXTENTS;
+ return;
+ }
+
+ /* Otherwise, reset the local format to the minimum. */
+ switch (mode & S_IFMT) {
+ case S_IFLNK:
+ xrep_dinode_zap_symlink(ri, dip);
+ break;
+ case S_IFDIR:
+ xrep_dinode_zap_dir(ri, dip);
+ break;
+ }
+}
+
+/*
+ * Check the attr fork for things that will fail the ifork verifiers or the
+ * ifork formatters.
+ */
+STATIC bool
+xrep_dinode_check_afork(
+ struct xfs_scrub *sc,
+ struct xfs_dinode *dip)
+{
+ struct xfs_attr_sf_hdr *afork_ptr;
+ size_t attr_size;
+ unsigned int afork_size;
+
+ if (XFS_DFORK_BOFF(dip) == 0)
+ return dip->di_aformat != XFS_DINODE_FMT_EXTENTS ||
+ xfs_dfork_attr_extents(dip) != 0;
+
+ afork_size = XFS_DFORK_SIZE(dip, sc->mp, XFS_ATTR_FORK);
+ afork_ptr = XFS_DFORK_PTR(dip, XFS_ATTR_FORK);
+
+ switch (XFS_DFORK_FORMAT(dip, XFS_ATTR_FORK)) {
+ case XFS_DINODE_FMT_LOCAL:
+ /* Fork has to be large enough to extract the xattr size. */
+ if (afork_size < sizeof(struct xfs_attr_sf_hdr))
+ return true;
+
+ /* xattr structure cannot be larger than the fork */
+ attr_size = be16_to_cpu(afork_ptr->totsize);
+ if (attr_size > afork_size)
+ return true;
+
+ /* xattr structure must pass verification. */
+ return xfs_attr_shortform_verify(afork_ptr, attr_size) != NULL;
+ case XFS_DINODE_FMT_EXTENTS:
+ if (xrep_dinode_bad_extents_fork(sc, dip, afork_size,
+ XFS_ATTR_FORK))
+ return true;
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ if (xrep_dinode_bad_bmbt_fork(sc, dip, afork_size,
+ XFS_ATTR_FORK))
+ return true;
+ break;
+ default:
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Reset the attr fork to empty. Since the attr fork could have contained
+ * ACLs, make the file readable only by root.
+ */
+STATIC void
+xrep_dinode_zap_afork(
+ struct xrep_inode *ri,
+ struct xfs_dinode *dip,
+ uint16_t mode)
+{
+ struct xfs_scrub *sc = ri->sc;
+
+ trace_xrep_dinode_zap_afork(sc, dip);
+
+ ri->ino_sick_mask |= XFS_SICK_INO_BMBTA_ZAPPED;
+
+ dip->di_aformat = XFS_DINODE_FMT_EXTENTS;
+ xrep_dinode_set_attr_nextents(dip, 0);
+ ri->attr_blocks = 0;
+
+ /*
+ * If the data fork is in btree format, removing the attr fork entirely
+ * might cause verifier failures if the next level down in the bmbt
+ * could now fit in the data fork area.
+ */
+ if (dip->di_format != XFS_DINODE_FMT_BTREE)
+ dip->di_forkoff = 0;
+ dip->di_mode = cpu_to_be16(mode & ~0777);
+ dip->di_uid = 0;
+ dip->di_gid = 0;
+}
+
+/* Make sure the fork offset is a sensible value. */
+STATIC void
+xrep_dinode_ensure_forkoff(
+ struct xrep_inode *ri,
+ struct xfs_dinode *dip,
+ uint16_t mode)
+{
+ struct xfs_bmdr_block *bmdr;
+ struct xfs_scrub *sc = ri->sc;
+ xfs_extnum_t attr_extents, data_extents;
+ size_t bmdr_minsz = XFS_BMDR_SPACE_CALC(1);
+ unsigned int lit_sz = XFS_LITINO(sc->mp);
+ unsigned int afork_min, dfork_min;
+
+ trace_xrep_dinode_ensure_forkoff(sc, dip);
+
+ /*
+ * Before calling this function, xrep_dinode_core ensured that both
+ * forks actually fit inside their respective literal areas. If this
+ * was not the case, the fork was reset to FMT_EXTENTS with zero
+ * records. If the rmapbt scan found attr or data fork blocks, this
+ * will be noted in the dinode_stats, and we must leave enough room
+ * for the bmap repair code to reconstruct the mapping structure.
+ *
+ * First, compute the minimum space required for the attr fork.
+ */
+ switch (dip->di_aformat) {
+ case XFS_DINODE_FMT_LOCAL:
+ /*
+ * If we still have a shortform xattr structure at all, that
+ * means the attr fork area was exactly large enough to fit
+ * the sf structure.
+ */
+ afork_min = XFS_DFORK_SIZE(dip, sc->mp, XFS_ATTR_FORK);
+ break;
+ case XFS_DINODE_FMT_EXTENTS:
+ attr_extents = xfs_dfork_attr_extents(dip);
+ if (attr_extents) {
+ /*
+ * We must maintain sufficient space to hold the entire
+ * extent map array in the data fork. Note that we
+ * previously zapped the fork if it had no chance of
+ * fitting in the inode.
+ */
+ afork_min = sizeof(struct xfs_bmbt_rec) * attr_extents;
+ } else if (ri->attr_extents > 0) {
+ /*
+ * The attr fork thinks it has zero extents, but we
+ * found some xattr extents. We need to leave enough
+ * empty space here so that the incore attr fork will
+ * get created (and hence trigger the attr fork bmap
+ * repairer).
+ */
+ afork_min = bmdr_minsz;
+ } else {
+ /* No extents on disk or found in rmapbt. */
+ afork_min = 0;
+ }
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ /* Must have space for btree header and key/pointers. */
+ bmdr = XFS_DFORK_PTR(dip, XFS_ATTR_FORK);
+ afork_min = XFS_BMAP_BROOT_SPACE(sc->mp, bmdr);
+ break;
+ default:
+ /* We should never see any other formats. */
+ afork_min = 0;
+ break;
+ }
+
+ /* Compute the minimum space required for the data fork. */
+ switch (dip->di_format) {
+ case XFS_DINODE_FMT_DEV:
+ dfork_min = sizeof(__be32);
+ break;
+ case XFS_DINODE_FMT_UUID:
+ dfork_min = sizeof(uuid_t);
+ break;
+ case XFS_DINODE_FMT_LOCAL:
+ /*
+ * If we still have a shortform data fork at all, that means
+ * the data fork area was large enough to fit whatever was in
+ * there.
+ */
+ dfork_min = be64_to_cpu(dip->di_size);
+ break;
+ case XFS_DINODE_FMT_EXTENTS:
+ data_extents = xfs_dfork_data_extents(dip);
+ if (data_extents) {
+ /*
+ * We must maintain sufficient space to hold the entire
+ * extent map array in the data fork. Note that we
+ * previously zapped the fork if it had no chance of
+ * fitting in the inode.
+ */
+ dfork_min = sizeof(struct xfs_bmbt_rec) * data_extents;
+ } else if (ri->data_extents > 0 || ri->rt_extents > 0) {
+ /*
+ * The data fork thinks it has zero extents, but we
+ * found some data extents. We need to leave enough
+ * empty space here so that the data fork bmap repair
+ * will recover the mappings.
+ */
+ dfork_min = bmdr_minsz;
+ } else {
+ /* No extents on disk or found in rmapbt. */
+ dfork_min = 0;
+ }
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ /* Must have space for btree header and key/pointers. */
+ bmdr = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
+ dfork_min = XFS_BMAP_BROOT_SPACE(sc->mp, bmdr);
+ break;
+ default:
+ dfork_min = 0;
+ break;
+ }
+
+ /*
+ * Round all values up to the nearest 8 bytes, because that is the
+ * precision of di_forkoff.
+ */
+ afork_min = roundup(afork_min, 8);
+ dfork_min = roundup(dfork_min, 8);
+ bmdr_minsz = roundup(bmdr_minsz, 8);
+
+ ASSERT(dfork_min <= lit_sz);
+ ASSERT(afork_min <= lit_sz);
+
+ /*
+ * If the data fork was zapped and we don't have enough space for the
+ * recovery fork, move the attr fork up.
+ */
+ if (dip->di_format == XFS_DINODE_FMT_EXTENTS &&
+ xfs_dfork_data_extents(dip) == 0 &&
+ (ri->data_extents > 0 || ri->rt_extents > 0) &&
+ bmdr_minsz > XFS_DFORK_DSIZE(dip, sc->mp)) {
+ if (bmdr_minsz + afork_min > lit_sz) {
+ /*
+ * The attr fork and the stub fork we need to recover
+ * the data fork won't both fit. Zap the attr fork.
+ */
+ xrep_dinode_zap_afork(ri, dip, mode);
+ afork_min = bmdr_minsz;
+ } else {
+ void *before, *after;
+
+ /* Otherwise, just slide the attr fork up. */
+ before = XFS_DFORK_APTR(dip);
+ dip->di_forkoff = bmdr_minsz >> 3;
+ after = XFS_DFORK_APTR(dip);
+ memmove(after, before, XFS_DFORK_ASIZE(dip, sc->mp));
+ }
+ }
+
+ /*
+ * If the attr fork was zapped and we don't have enough space for the
+ * recovery fork, move the attr fork down.
+ */
+ if (dip->di_aformat == XFS_DINODE_FMT_EXTENTS &&
+ xfs_dfork_attr_extents(dip) == 0 &&
+ ri->attr_extents > 0 &&
+ bmdr_minsz > XFS_DFORK_ASIZE(dip, sc->mp)) {
+ if (dip->di_format == XFS_DINODE_FMT_BTREE) {
+ /*
+ * If the data fork is in btree format then we can't
+ * adjust forkoff because that runs the risk of
+ * violating the extents/btree format transition rules.
+ */
+ } else if (bmdr_minsz + dfork_min > lit_sz) {
+ /*
+ * If we can't move the attr fork, too bad, we lose the
+ * attr fork and leak its blocks.
+ */
+ xrep_dinode_zap_afork(ri, dip, mode);
+ } else {
+ /*
+ * Otherwise, just slide the attr fork down. The attr
+ * fork is empty, so we don't have any old contents to
+ * move here.
+ */
+ dip->di_forkoff = (lit_sz - bmdr_minsz) >> 3;
+ }
+ }
+}
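+
+/*
+ * Illustration of the 8-byte di_forkoff precision used above (values are
+ * examples only): reserving 48 bytes for the data fork encodes as
+ * di_forkoff = 48 >> 3 = 6, and the attr fork then begins 48 bytes into the
+ * inode literal area.
+ */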
+
+/*
+ * Zap the data/attr forks if we spot anything that isn't going to pass the
+ * ifork verifiers or the ifork formatters, because we need to get the inode
+ * into good enough shape that the higher level repair functions can run.
+ */
+STATIC void
+xrep_dinode_zap_forks(
+ struct xrep_inode *ri,
+ struct xfs_dinode *dip)
+{
+ struct xfs_scrub *sc = ri->sc;
+ xfs_extnum_t data_extents;
+ xfs_extnum_t attr_extents;
+ xfs_filblks_t nblocks;
+ uint16_t mode;
+ bool zap_datafork = false;
+ bool zap_attrfork = ri->zap_acls;
+
+ trace_xrep_dinode_zap_forks(sc, dip);
+
+ mode = be16_to_cpu(dip->di_mode);
+
+ data_extents = xfs_dfork_data_extents(dip);
+ attr_extents = xfs_dfork_attr_extents(dip);
+ nblocks = be64_to_cpu(dip->di_nblocks);
+
+ /* Inode counters don't make sense? */
+ if (data_extents > nblocks)
+ zap_datafork = true;
+ if (attr_extents > nblocks)
+ zap_attrfork = true;
+ if (data_extents + attr_extents > nblocks)
+ zap_datafork = zap_attrfork = true;
+
+ if (!zap_datafork)
+ zap_datafork = xrep_dinode_check_dfork(sc, dip, mode);
+ if (!zap_attrfork)
+ zap_attrfork = xrep_dinode_check_afork(sc, dip);
+
+ /* Zap whatever's bad. */
+ if (zap_attrfork)
+ xrep_dinode_zap_afork(ri, dip, mode);
+ if (zap_datafork)
+ xrep_dinode_zap_dfork(ri, dip, mode);
+ xrep_dinode_ensure_forkoff(ri, dip, mode);
+
+ /*
+ * Zero di_nblocks if we don't have any extents at all to satisfy the
+ * buffer verifier.
+ */
+ data_extents = xfs_dfork_data_extents(dip);
+ attr_extents = xfs_dfork_attr_extents(dip);
+ if (data_extents + attr_extents == 0)
+ dip->di_nblocks = 0;
+}
+
+/* Inode didn't pass dinode verifiers, so fix the raw buffer and retry iget. */
+STATIC int
+xrep_dinode_core(
+ struct xrep_inode *ri)
+{
+ struct xfs_scrub *sc = ri->sc;
+ struct xfs_buf *bp;
+ struct xfs_dinode *dip;
+ xfs_ino_t ino = sc->sm->sm_ino;
+ int error;
+ int iget_error;
+
+ /* Figure out what this inode had mapped in both forks. */
+ error = xrep_dinode_count_rmaps(ri);
+ if (error)
+ return error;
+
+ /* Read the inode cluster buffer. */
+ error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp,
+ ri->imap.im_blkno, ri->imap.im_len, XBF_UNMAPPED, &bp,
+ NULL);
+ if (error)
+ return error;
+
+ /* Make sure we can pass the inode buffer verifier. */
+ xrep_dinode_buf(sc, bp);
+ bp->b_ops = &xfs_inode_buf_ops;
+
+ /* Fix everything the verifier will complain about. */
+ dip = xfs_buf_offset(bp, ri->imap.im_boffset);
+ xrep_dinode_header(sc, dip);
+ xrep_dinode_mode(ri, dip);
+ xrep_dinode_flags(sc, dip, ri->rt_extents > 0);
+ xrep_dinode_size(ri, dip);
+ xrep_dinode_extsize_hints(sc, dip);
+ xrep_dinode_zap_forks(ri, dip);
+
+ /* Write out the inode. */
+ trace_xrep_dinode_fixed(sc, dip);
+ xfs_dinode_calc_crc(sc->mp, dip);
+ xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_DINO_BUF);
+ xfs_trans_log_buf(sc->tp, bp, ri->imap.im_boffset,
+ ri->imap.im_boffset + sc->mp->m_sb.sb_inodesize - 1);
+
+ /*
+ * In theory, we've fixed the ondisk inode record enough that we should
+ * be able to load the inode into the cache. Try to iget that inode
+ * now while we hold the AGI and the inode cluster buffer and take the
+ * IOLOCK so that we can continue with repairs without anyone else
+ * accessing the inode. If iget fails, we still need to commit the
+ * changes.
+ */
+ iget_error = xchk_iget(sc, ino, &sc->ip);
+ if (!iget_error)
+ xchk_ilock(sc, XFS_IOLOCK_EXCL);
+
+ /*
+ * Commit the inode cluster buffer updates and drop the AGI buffer that
+ * we've been holding since scrub setup. From here on out, repairs
+ * deal only with the cached inode.
+ */
+ error = xrep_trans_commit(sc);
+ if (error)
+ return error;
+
+ if (iget_error)
+ return iget_error;
+
+ error = xchk_trans_alloc(sc, 0);
+ if (error)
+ return error;
+
+ error = xrep_ino_dqattach(sc);
+ if (error)
+ return error;
+
+ xchk_ilock(sc, XFS_ILOCK_EXCL);
+ if (ri->ino_sick_mask)
+ xfs_inode_mark_sick(sc->ip, ri->ino_sick_mask);
+ return 0;
+}
+
+/* Fix everything xfs_dinode_verify cares about. */
+STATIC int
+xrep_dinode_problems(
+ struct xrep_inode *ri)
+{
+ struct xfs_scrub *sc = ri->sc;
+ int error;
+
+ error = xrep_dinode_core(ri);
+ if (error)
+ return error;
+
+ /* We had to fix a totally busted inode, schedule quotacheck. */
+ if (XFS_IS_UQUOTA_ON(sc->mp))
+ xrep_force_quotacheck(sc, XFS_DQTYPE_USER);
+ if (XFS_IS_GQUOTA_ON(sc->mp))
+ xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP);
+ if (XFS_IS_PQUOTA_ON(sc->mp))
+ xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ);
+
+ return 0;
+}
+
+/*
+ * Fix problems that the verifiers don't care about. In general these are
+ * errors that don't cause easily detectable problems elsewhere in the
+ * kernel, so we don't check them all that rigorously.
+ */
+
+/* Make sure block and extent counts are ok. */
+STATIC int
+xrep_inode_blockcounts(
+ struct xfs_scrub *sc)
+{
+ struct xfs_ifork *ifp;
+ xfs_filblks_t count;
+ xfs_filblks_t acount;
+ xfs_extnum_t nextents;
+ int error;
+
+ trace_xrep_inode_blockcounts(sc);
+
+ /* Set data fork counters from the data fork mappings. */
+ error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_DATA_FORK,
+ &nextents, &count);
+ if (error)
+ return error;
+ if (xfs_is_reflink_inode(sc->ip)) {
+ /*
+ * data fork blockcount can exceed physical storage if a user
+ * reflinks the same block over and over again.
+ */
+ ;
+ } else if (XFS_IS_REALTIME_INODE(sc->ip)) {
+ if (count >= sc->mp->m_sb.sb_rblocks)
+ return -EFSCORRUPTED;
+ } else {
+ if (count >= sc->mp->m_sb.sb_dblocks)
+ return -EFSCORRUPTED;
+ }
+ error = xrep_ino_ensure_extent_count(sc, XFS_DATA_FORK, nextents);
+ if (error)
+ return error;
+ sc->ip->i_df.if_nextents = nextents;
+
+ /* Set attr fork counters from the attr fork mappings. */
+ ifp = xfs_ifork_ptr(sc->ip, XFS_ATTR_FORK);
+ if (ifp) {
+ error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK,
+ &nextents, &acount);
+ if (error)
+ return error;
+ if (acount >= sc->mp->m_sb.sb_dblocks)
+ return -EFSCORRUPTED;
+ error = xrep_ino_ensure_extent_count(sc, XFS_ATTR_FORK,
+ nextents);
+ if (error)
+ return error;
+ ifp->if_nextents = nextents;
+ } else {
+ acount = 0;
+ }
+
+ sc->ip->i_nblocks = count + acount;
+ return 0;
+}
+
+/* Check for invalid uid/gid/prid. */
+STATIC void
+xrep_inode_ids(
+ struct xfs_scrub *sc)
+{
+ bool dirty = false;
+
+ trace_xrep_inode_ids(sc);
+
+ if (!uid_valid(VFS_I(sc->ip)->i_uid)) {
+ i_uid_write(VFS_I(sc->ip), 0);
+ dirty = true;
+ if (XFS_IS_UQUOTA_ON(sc->mp))
+ xrep_force_quotacheck(sc, XFS_DQTYPE_USER);
+ }
+
+ if (!gid_valid(VFS_I(sc->ip)->i_gid)) {
+ i_gid_write(VFS_I(sc->ip), 0);
+ dirty = true;
+ if (XFS_IS_GQUOTA_ON(sc->mp))
+ xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP);
+ }
+
+ if (sc->ip->i_projid == -1U) {
+ sc->ip->i_projid = 0;
+ dirty = true;
+ if (XFS_IS_PQUOTA_ON(sc->mp))
+ xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ);
+ }
+
+ /* strip setuid/setgid if we touched any of the ids */
+ if (dirty)
+ VFS_I(sc->ip)->i_mode &= ~(S_ISUID | S_ISGID);
+}
+
+static inline void
+xrep_clamp_timestamp(
+ struct xfs_inode *ip,
+ struct timespec64 *ts)
+{
+ ts->tv_nsec = clamp_t(long, ts->tv_nsec, 0, NSEC_PER_SEC);
+ *ts = timestamp_truncate(*ts, VFS_I(ip));
+}
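+
+/*
+ * For example (illustrative values): a corrupt tv_nsec of 1999999999 is
+ * clamped to NSEC_PER_SEC, and timestamp_truncate() then confines the whole
+ * timestamp to the range and granularity that this filesystem supports.
+ */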
+
+/* Nanosecond counters can't hold more than 1 billion nanoseconds. */
+STATIC void
+xrep_inode_timestamps(
+ struct xfs_inode *ip)
+{
+ struct timespec64 tstamp;
+ struct inode *inode = VFS_I(ip);
+
+ tstamp = inode_get_atime(inode);
+ xrep_clamp_timestamp(ip, &tstamp);
+ inode_set_atime_to_ts(inode, tstamp);
+
+ tstamp = inode_get_mtime(inode);
+ xrep_clamp_timestamp(ip, &tstamp);
+ inode_set_mtime_to_ts(inode, tstamp);
+
+ tstamp = inode_get_ctime(inode);
+ xrep_clamp_timestamp(ip, &tstamp);
+ inode_set_ctime_to_ts(inode, tstamp);
+
+ xrep_clamp_timestamp(ip, &ip->i_crtime);
+}
+
+/* Fix inode flags that don't make sense together. */
+STATIC void
+xrep_inode_flags(
+ struct xfs_scrub *sc)
+{
+ uint16_t mode;
+
+ trace_xrep_inode_flags(sc);
+
+ mode = VFS_I(sc->ip)->i_mode;
+
+ /* Clear junk flags */
+ if (sc->ip->i_diflags & ~XFS_DIFLAG_ANY)
+ sc->ip->i_diflags &= ~XFS_DIFLAG_ANY;
+
+ /* NEWRTBM only applies to realtime bitmaps */
+ if (sc->ip->i_ino == sc->mp->m_sb.sb_rbmino)
+ sc->ip->i_diflags |= XFS_DIFLAG_NEWRTBM;
+ else
+ sc->ip->i_diflags &= ~XFS_DIFLAG_NEWRTBM;
+
+ /* These only make sense for directories. */
+ if (!S_ISDIR(mode))
+ sc->ip->i_diflags &= ~(XFS_DIFLAG_RTINHERIT |
+ XFS_DIFLAG_EXTSZINHERIT |
+ XFS_DIFLAG_PROJINHERIT |
+ XFS_DIFLAG_NOSYMLINKS);
+
+ /* These only make sense for files. */
+ if (!S_ISREG(mode))
+ sc->ip->i_diflags &= ~(XFS_DIFLAG_REALTIME |
+ XFS_DIFLAG_EXTSIZE);
+
+ /* These only make sense for non-rt files. */
+ if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME)
+ sc->ip->i_diflags &= ~XFS_DIFLAG_FILESTREAM;
+
+ /* Immutable and append only? Drop the append. */
+ if ((sc->ip->i_diflags & XFS_DIFLAG_IMMUTABLE) &&
+ (sc->ip->i_diflags & XFS_DIFLAG_APPEND))
+ sc->ip->i_diflags &= ~XFS_DIFLAG_APPEND;
+
+ /* Clear junk flags. */
+ if (sc->ip->i_diflags2 & ~XFS_DIFLAG2_ANY)
+ sc->ip->i_diflags2 &= ~XFS_DIFLAG2_ANY;
+
+ /* No reflink flag unless we support it and it's a file. */
+ if (!xfs_has_reflink(sc->mp) || !S_ISREG(mode))
+ sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
+
+ /* DAX only applies to files and dirs. */
+ if (!(S_ISREG(mode) || S_ISDIR(mode)))
+ sc->ip->i_diflags2 &= ~XFS_DIFLAG2_DAX;
+
+ /* No reflink files on the realtime device. */
+ if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME)
+ sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
+}
+
+/*
+ * Fix size problems with block/node format directories. If we fail to find
+ * the extent list, just bail out and let the bmapbtd repair functions clean
+ * up that mess.
+ */
+STATIC void
+xrep_inode_blockdir_size(
+ struct xfs_scrub *sc)
+{
+ struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec got;
+ struct xfs_ifork *ifp;
+ xfs_fileoff_t off;
+ int error;
+
+ trace_xrep_inode_blockdir_size(sc);
+
+ error = xfs_iread_extents(sc->tp, sc->ip, XFS_DATA_FORK);
+ if (error)
+ return;
+
+ /* Find the last block before 32G; this is the dir size. */
+ ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
+ off = XFS_B_TO_FSB(sc->mp, XFS_DIR2_SPACE_SIZE);
+ if (!xfs_iext_lookup_extent_before(sc->ip, ifp, &off, &icur, &got)) {
+ /* zero-extents directory? */
+ return;
+ }
+
+ off = got.br_startoff + got.br_blockcount;
+ sc->ip->i_disk_size = min_t(loff_t, XFS_DIR2_SPACE_SIZE,
+ XFS_FSB_TO_B(sc->mp, off));
+}
+
+/* Fix size problems with short format directories. */
+STATIC void
+xrep_inode_sfdir_size(
+ struct xfs_scrub *sc)
+{
+ struct xfs_ifork *ifp;
+
+ trace_xrep_inode_sfdir_size(sc);
+
+ ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
+ sc->ip->i_disk_size = ifp->if_bytes;
+}
+
+/*
+ * Fix any irregularities in a directory inode's size now that we can iterate
+ * extent maps and access other regular inode data.
+ */
+STATIC void
+xrep_inode_dir_size(
+ struct xfs_scrub *sc)
+{
+ trace_xrep_inode_dir_size(sc);
+
+ switch (sc->ip->i_df.if_format) {
+ case XFS_DINODE_FMT_EXTENTS:
+ case XFS_DINODE_FMT_BTREE:
+ xrep_inode_blockdir_size(sc);
+ break;
+ case XFS_DINODE_FMT_LOCAL:
+ xrep_inode_sfdir_size(sc);
+ break;
+ }
+}
+
+/* Fix extent size hint problems. */
+STATIC void
+xrep_inode_extsize(
+ struct xfs_scrub *sc)
+{
+ /* Fix misaligned extent size hints on a directory. */
+ if ((sc->ip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
+ (sc->ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) &&
+ xfs_extlen_to_rtxmod(sc->mp, sc->ip->i_extsize) > 0) {
+ sc->ip->i_extsize = 0;
+ sc->ip->i_diflags &= ~XFS_DIFLAG_EXTSZINHERIT;
+ }
+}
+
+/* Fix any irregularities in an inode that the verifiers don't catch. */
+STATIC int
+xrep_inode_problems(
+ struct xfs_scrub *sc)
+{
+ int error;
+
+ error = xrep_inode_blockcounts(sc);
+ if (error)
+ return error;
+ xrep_inode_timestamps(sc->ip);
+ xrep_inode_flags(sc);
+ xrep_inode_ids(sc);
+ /*
+ * Now that we can scan the data fork extents, we can do a better job
+ * fixing directory sizes than xrep_dinode_size could.
+ */
+ if (S_ISDIR(VFS_I(sc->ip)->i_mode))
+ xrep_inode_dir_size(sc);
+ xrep_inode_extsize(sc);
+
+ trace_xrep_inode_fixed(sc);
+ xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE);
+ return xrep_roll_trans(sc);
+}
+
+/* Repair an inode's fields. */
+int
+xrep_inode(
+ struct xfs_scrub *sc)
+{
+ int error = 0;
+
+ /*
+ * No inode? That means we failed the _iget verifiers. Repair all
+ * the things that the inode verifiers care about, then retry _iget.
+ */
+ if (!sc->ip) {
+ struct xrep_inode *ri = sc->buf;
+
+ ASSERT(ri != NULL);
+
+ error = xrep_dinode_problems(ri);
+ if (error)
+ return error;
+
+ /* By this point we had better have a working incore inode. */
+ if (!sc->ip)
+ return -EFSCORRUPTED;
+ }
+
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+
+ /* If we found corruption of any kind, try to fix it. */
+ if ((sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) ||
+ (sc->sm->sm_flags & XFS_SCRUB_OFLAG_XCORRUPT)) {
+ error = xrep_inode_problems(sc);
+ if (error)
+ return error;
+ }
+
+ /* See if we can clear the reflink flag. */
+ if (xfs_is_reflink_inode(sc->ip)) {
+ error = xfs_reflink_clear_inode_flag(sc->ip, &sc->tp);
+ if (error)
+ return error;
+ }
+
+ return xrep_defer_finish(sc);
+}
diff --git a/fs/xfs/scrub/newbt.c b/fs/xfs/scrub/newbt.c
new file mode 100644
index 000000000000..bb6d980b4fcd
--- /dev/null
+++ b/fs/xfs/scrub/newbt.c
@@ -0,0 +1,559 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2022-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "xfs_rmap.h"
+#include "xfs_ag.h"
+#include "xfs_defer.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/newbt.h"
+
+/*
+ * Estimate proper slack values for a btree that's being reloaded.
+ *
+ * Under most circumstances, we'll take whatever default loading value the
+ * btree bulk loading code calculates for us. However, there are some
+ * exceptions to this rule:
+ *
+ * (0) If someone turned one of the debug knobs.
+ * (1) If this is a per-AG btree and the AG has less than 10% space free.
+ * (2) If this is an inode fork btree and the FS has less than 10% space free.
+ *
+ * In any of these cases, format the new btree blocks almost completely full to
+ * minimize space usage.
+ */
+static void
+xrep_newbt_estimate_slack(
+ struct xrep_newbt *xnr)
+{
+ struct xfs_scrub *sc = xnr->sc;
+ struct xfs_btree_bload *bload = &xnr->bload;
+ uint64_t free;
+ uint64_t sz;
+
+ /*
+ * The xfs_globals values are set to -1 (i.e. take the bload defaults)
+ * unless someone has set them otherwise, so we just pull the values
+ * here.
+ */
+ bload->leaf_slack = xfs_globals.bload_leaf_slack;
+ bload->node_slack = xfs_globals.bload_node_slack;
+
+ if (sc->ops->type == ST_PERAG) {
+ free = sc->sa.pag->pagf_freeblks;
+ sz = xfs_ag_block_count(sc->mp, sc->sa.pag->pag_agno);
+ } else {
+ free = percpu_counter_sum(&sc->mp->m_fdblocks);
+ sz = sc->mp->m_sb.sb_dblocks;
+ }
+
+ /* No further changes if there's more than 10% free space left. */
+ if (free >= div_u64(sz, 10))
+ return;
+
+ /*
+ * We're low on space; load the btrees as tightly as possible. Leave
+ * a couple of open slots in each btree block so that we don't end up
+ * splitting the btrees like crazy after a mount.
+ */
+ if (bload->leaf_slack < 0)
+ bload->leaf_slack = 2;
+ if (bload->node_slack < 0)
+ bload->node_slack = 2;
+}
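+
+/*
+ * Worked example of the heuristic above (numbers are illustrative): an AG of
+ * sz = 1048576 blocks with free = 52428 free blocks has free <
+ * div_u64(sz, 10) = 104857, so any slack value still set to -1 is tightened
+ * to 2 records per block.
+ */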
+
+/* Initialize accounting resources for staging a new AG btree. */
+void
+xrep_newbt_init_ag(
+ struct xrep_newbt *xnr,
+ struct xfs_scrub *sc,
+ const struct xfs_owner_info *oinfo,
+ xfs_fsblock_t alloc_hint,
+ enum xfs_ag_resv_type resv)
+{
+ memset(xnr, 0, sizeof(struct xrep_newbt));
+ xnr->sc = sc;
+ xnr->oinfo = *oinfo; /* structure copy */
+ xnr->alloc_hint = alloc_hint;
+ xnr->resv = resv;
+ INIT_LIST_HEAD(&xnr->resv_list);
+ xnr->bload.max_dirty = XFS_B_TO_FSBT(sc->mp, 256U << 10); /* 256K */
+ xrep_newbt_estimate_slack(xnr);
+}
+
+/* Initialize accounting resources for staging a new inode fork btree. */
+int
+xrep_newbt_init_inode(
+ struct xrep_newbt *xnr,
+ struct xfs_scrub *sc,
+ int whichfork,
+ const struct xfs_owner_info *oinfo)
+{
+ struct xfs_ifork *ifp;
+
+ ifp = kmem_cache_zalloc(xfs_ifork_cache, XCHK_GFP_FLAGS);
+ if (!ifp)
+ return -ENOMEM;
+
+ xrep_newbt_init_ag(xnr, sc, oinfo,
+ XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino),
+ XFS_AG_RESV_NONE);
+ xnr->ifake.if_fork = ifp;
+ xnr->ifake.if_fork_size = xfs_inode_fork_size(sc->ip, whichfork);
+ return 0;
+}
+
+/*
+ * Initialize accounting resources for staging a new btree. Callers are
+ * expected to add their own reservations (and clean them up) manually.
+ */
+void
+xrep_newbt_init_bare(
+ struct xrep_newbt *xnr,
+ struct xfs_scrub *sc)
+{
+ xrep_newbt_init_ag(xnr, sc, &XFS_RMAP_OINFO_ANY_OWNER, NULLFSBLOCK,
+ XFS_AG_RESV_NONE);
+}
+
+/*
+ * Designate specific blocks to be used to build our new btree. @pag must be
+ * a passive reference.
+ */
+STATIC int
+xrep_newbt_add_blocks(
+ struct xrep_newbt *xnr,
+ struct xfs_perag *pag,
+ const struct xfs_alloc_arg *args)
+{
+ struct xfs_mount *mp = xnr->sc->mp;
+ struct xrep_newbt_resv *resv;
+ int error;
+
+ resv = kmalloc(sizeof(struct xrep_newbt_resv), XCHK_GFP_FLAGS);
+ if (!resv)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&resv->list);
+ resv->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
+ resv->len = args->len;
+ resv->used = 0;
+ resv->pag = xfs_perag_hold(pag);
+
+ if (args->tp) {
+ ASSERT(xnr->oinfo.oi_offset == 0);
+
+ error = xfs_alloc_schedule_autoreap(args, true, &resv->autoreap);
+ if (error)
+ goto out_pag;
+ }
+
+ list_add_tail(&resv->list, &xnr->resv_list);
+ return 0;
+out_pag:
+ xfs_perag_put(resv->pag);
+ kfree(resv);
+ return error;
+}
+
+/*
+ * Add an extent to the new btree reservation pool. Callers are required to
+ * reap this reservation manually if the repair is cancelled. @pag must be a
+ * passive reference.
+ */
+int
+xrep_newbt_add_extent(
+ struct xrep_newbt *xnr,
+ struct xfs_perag *pag,
+ xfs_agblock_t agbno,
+ xfs_extlen_t len)
+{
+ struct xfs_mount *mp = xnr->sc->mp;
+ struct xfs_alloc_arg args = {
+ .tp = NULL, /* no autoreap */
+ .oinfo = xnr->oinfo,
+ .fsbno = XFS_AGB_TO_FSB(mp, pag->pag_agno, agbno),
+ .len = len,
+ .resv = xnr->resv,
+ };
+
+ return xrep_newbt_add_blocks(xnr, pag, &args);
+}
+
+/* Don't let our allocation hint take us beyond this AG */
+static inline void
+xrep_newbt_validate_ag_alloc_hint(
+ struct xrep_newbt *xnr)
+{
+ struct xfs_scrub *sc = xnr->sc;
+ xfs_agnumber_t agno = XFS_FSB_TO_AGNO(sc->mp, xnr->alloc_hint);
+
+ if (agno == sc->sa.pag->pag_agno &&
+ xfs_verify_fsbno(sc->mp, xnr->alloc_hint))
+ return;
+
+ xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno,
+ XFS_AGFL_BLOCK(sc->mp) + 1);
+}
+
+/* Allocate disk space for a new per-AG btree. */
+STATIC int
+xrep_newbt_alloc_ag_blocks(
+ struct xrep_newbt *xnr,
+ uint64_t nr_blocks)
+{
+ struct xfs_scrub *sc = xnr->sc;
+ struct xfs_mount *mp = sc->mp;
+ int error = 0;
+
+ ASSERT(sc->sa.pag != NULL);
+
+ while (nr_blocks > 0) {
+ struct xfs_alloc_arg args = {
+ .tp = sc->tp,
+ .mp = mp,
+ .oinfo = xnr->oinfo,
+ .minlen = 1,
+ .maxlen = nr_blocks,
+ .prod = 1,
+ .resv = xnr->resv,
+ };
+ xfs_agnumber_t agno;
+
+ xrep_newbt_validate_ag_alloc_hint(xnr);
+
+ error = xfs_alloc_vextent_near_bno(&args, xnr->alloc_hint);
+ if (error)
+ return error;
+ if (args.fsbno == NULLFSBLOCK)
+ return -ENOSPC;
+
+ agno = XFS_FSB_TO_AGNO(mp, args.fsbno);
+
+ trace_xrep_newbt_alloc_ag_blocks(mp, agno,
+ XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len,
+ xnr->oinfo.oi_owner);
+
+ if (agno != sc->sa.pag->pag_agno) {
+ ASSERT(agno == sc->sa.pag->pag_agno);
+ return -EFSCORRUPTED;
+ }
+
+ error = xrep_newbt_add_blocks(xnr, sc->sa.pag, &args);
+ if (error)
+ return error;
+
+ nr_blocks -= args.len;
+ xnr->alloc_hint = args.fsbno + args.len;
+
+ error = xrep_defer_finish(sc);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
+/* Don't let our allocation hint take us beyond EOFS */
+static inline void
+xrep_newbt_validate_file_alloc_hint(
+ struct xrep_newbt *xnr)
+{
+ struct xfs_scrub *sc = xnr->sc;
+
+ if (xfs_verify_fsbno(sc->mp, xnr->alloc_hint))
+ return;
+
+ xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, 0, XFS_AGFL_BLOCK(sc->mp) + 1);
+}
+
+/* Allocate disk space for our new file-based btree. */
+STATIC int
+xrep_newbt_alloc_file_blocks(
+ struct xrep_newbt *xnr,
+ uint64_t nr_blocks)
+{
+ struct xfs_scrub *sc = xnr->sc;
+ struct xfs_mount *mp = sc->mp;
+ int error = 0;
+
+ while (nr_blocks > 0) {
+ struct xfs_alloc_arg args = {
+ .tp = sc->tp,
+ .mp = mp,
+ .oinfo = xnr->oinfo,
+ .minlen = 1,
+ .maxlen = nr_blocks,
+ .prod = 1,
+ .resv = xnr->resv,
+ };
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno;
+
+ xrep_newbt_validate_file_alloc_hint(xnr);
+
+ error = xfs_alloc_vextent_start_ag(&args, xnr->alloc_hint);
+ if (error)
+ return error;
+ if (args.fsbno == NULLFSBLOCK)
+ return -ENOSPC;
+
+ agno = XFS_FSB_TO_AGNO(mp, args.fsbno);
+
+ trace_xrep_newbt_alloc_file_blocks(mp, agno,
+ XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len,
+ xnr->oinfo.oi_owner);
+
+ pag = xfs_perag_get(mp, agno);
+ if (!pag) {
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+
+ error = xrep_newbt_add_blocks(xnr, pag, &args);
+ xfs_perag_put(pag);
+ if (error)
+ return error;
+
+ nr_blocks -= args.len;
+ xnr->alloc_hint = args.fsbno + args.len;
+
+ error = xrep_defer_finish(sc);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
+/* Allocate disk space for our new btree. */
+int
+xrep_newbt_alloc_blocks(
+ struct xrep_newbt *xnr,
+ uint64_t nr_blocks)
+{
+ if (xnr->sc->ip)
+ return xrep_newbt_alloc_file_blocks(xnr, nr_blocks);
+ return xrep_newbt_alloc_ag_blocks(xnr, nr_blocks);
+}
+
+/*
+ * Free the unused part of a space extent that was reserved for a new ondisk
+ * structure. Returns the number of EFIs logged or a negative errno.
+ */
+STATIC int
+xrep_newbt_free_extent(
+ struct xrep_newbt *xnr,
+ struct xrep_newbt_resv *resv,
+ bool btree_committed)
+{
+ struct xfs_scrub *sc = xnr->sc;
+ xfs_agblock_t free_agbno = resv->agbno;
+ xfs_extlen_t free_aglen = resv->len;
+ xfs_fsblock_t fsbno;
+ int error;
+
+ if (!btree_committed || resv->used == 0) {
+ /*
+ * If we're not committing a new btree or we didn't use the
+ * space reservation, let the existing EFI free the entire
+ * space extent.
+ */
+ trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno,
+ free_agbno, free_aglen, xnr->oinfo.oi_owner);
+ xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap);
+ return 1;
+ }
+
+ /*
+ * We used space and committed the btree. Cancel the autoreap, remove
+ * the written blocks from the reservation, and possibly log a new EFI
+ * to free any unused reservation space.
+ */
+ xfs_alloc_cancel_autoreap(sc->tp, &resv->autoreap);
+ free_agbno += resv->used;
+ free_aglen -= resv->used;
+
+ if (free_aglen == 0)
+ return 0;
+
+ trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, free_agbno,
+ free_aglen, xnr->oinfo.oi_owner);
+
+ ASSERT(xnr->resv != XFS_AG_RESV_AGFL);
+ ASSERT(xnr->resv != XFS_AG_RESV_IGNORE);
+
+ /*
+ * Use EFIs to free the reservations. This reduces the chance
+ * that we leak blocks if the system goes down.
+ */
+ fsbno = XFS_AGB_TO_FSB(sc->mp, resv->pag->pag_agno, free_agbno);
+ error = xfs_free_extent_later(sc->tp, fsbno, free_aglen, &xnr->oinfo,
+ xnr->resv, true);
+ if (error)
+ return error;
+
+ return 1;
+}
+
+/* Free all the accounting info and disk space we reserved for a new btree. */
+STATIC int
+xrep_newbt_free(
+ struct xrep_newbt *xnr,
+ bool btree_committed)
+{
+ struct xfs_scrub *sc = xnr->sc;
+ struct xrep_newbt_resv *resv, *n;
+ unsigned int freed = 0;
+ int error = 0;
+
+ /*
+ * If the filesystem already went down, we can't free the blocks. Skip
+ * ahead to freeing the incore metadata because we can't fix anything.
+ */
+ if (xfs_is_shutdown(sc->mp))
+ goto junkit;
+
+ list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
+ int ret;
+
+ ret = xrep_newbt_free_extent(xnr, resv, btree_committed);
+ list_del(&resv->list);
+ xfs_perag_put(resv->pag);
+ kfree(resv);
+ if (ret < 0) {
+ error = ret;
+ goto junkit;
+ }
+
+ freed += ret;
+ if (freed >= XREP_MAX_ITRUNCATE_EFIS) {
+ error = xrep_defer_finish(sc);
+ if (error)
+ goto junkit;
+ freed = 0;
+ }
+ }
+
+ if (freed)
+ error = xrep_defer_finish(sc);
+
+junkit:
+ /*
+ * If we still have reservations attached to @xnr, cleanup must have
+ * failed and the filesystem is about to go down. Clean up the incore
+ * reservations and try to commit to freeing the space we used.
+ */
+ list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
+ xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap);
+ list_del(&resv->list);
+ xfs_perag_put(resv->pag);
+ kfree(resv);
+ }
+
+ if (sc->ip) {
+ kmem_cache_free(xfs_ifork_cache, xnr->ifake.if_fork);
+ xnr->ifake.if_fork = NULL;
+ }
+
+ return error;
+}
+
+/*
+ * Free all the accounting info and unused disk space allocations after
+ * committing a new btree.
+ */
+int
+xrep_newbt_commit(
+ struct xrep_newbt *xnr)
+{
+ return xrep_newbt_free(xnr, true);
+}
+
+/*
+ * Free all the accounting info and all of the disk space we reserved for a new
+ * btree that we're not going to commit. We want to try to roll things back
+ * cleanly for things like ENOSPC midway through allocation.
+ */
+void
+xrep_newbt_cancel(
+ struct xrep_newbt *xnr)
+{
+ xrep_newbt_free(xnr, false);
+}
+
+/* Feed one of the reserved btree blocks to the bulk loader. */
+int
+xrep_newbt_claim_block(
+ struct xfs_btree_cur *cur,
+ struct xrep_newbt *xnr,
+ union xfs_btree_ptr *ptr)
+{
+ struct xrep_newbt_resv *resv;
+ struct xfs_mount *mp = cur->bc_mp;
+ xfs_agblock_t agbno;
+
+ /*
+ * The first item in the list should always have a free block unless
+ * we're completely out.
+ */
+ resv = list_first_entry(&xnr->resv_list, struct xrep_newbt_resv, list);
+ if (resv->used == resv->len)
+ return -ENOSPC;
+
+ /*
+ * Peel off a block from the start of the reservation. We allocate
+ * blocks in order to place blocks on disk in increasing record or key
+ * order. The block reservations tend to end up on the list in
+ * decreasing order, which hopefully results in leaf blocks ending up
+ * together.
+ */
+ agbno = resv->agbno + resv->used;
+ resv->used++;
+
+ /* If we used all the blocks in this reservation, move it to the end. */
+ if (resv->used == resv->len)
+ list_move_tail(&resv->list, &xnr->resv_list);
+
+ trace_xrep_newbt_claim_block(mp, resv->pag->pag_agno, agbno, 1,
+ xnr->oinfo.oi_owner);
+
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ ptr->l = cpu_to_be64(XFS_AGB_TO_FSB(mp, resv->pag->pag_agno,
+ agbno));
+ else
+ ptr->s = cpu_to_be32(agbno);
+
+ /* Relog all the EFIs. */
+ return xrep_defer_finish(xnr->sc);
+}
+
+/* How many reserved blocks are unused? */
+unsigned int
+xrep_newbt_unused_blocks(
+ struct xrep_newbt *xnr)
+{
+ struct xrep_newbt_resv *resv;
+ unsigned int unused = 0;
+
+ list_for_each_entry(resv, &xnr->resv_list, list)
+ unused += resv->len - resv->used;
+ return unused;
+}
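+
+/*
+ * A minimal usage sketch of this API as a btree rebuilder might call it;
+ * oinfo, alloc_hint, and nr_blocks are assumed to be supplied by the caller:
+ *
+ *  struct xrep_newbt xnr;
+ *  int error;
+ *
+ *  xrep_newbt_init_ag(&xnr, sc, &oinfo, alloc_hint, XFS_AG_RESV_NONE);
+ *  error = xrep_newbt_alloc_blocks(&xnr, nr_blocks);
+ *  if (!error)
+ *      ...bulk load the staged btree, taking each new block through
+ *         xrep_newbt_claim_block()...
+ *  if (error)
+ *      xrep_newbt_cancel(&xnr);
+ *  else
+ *      error = xrep_newbt_commit(&xnr);
+ */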
diff --git a/fs/xfs/scrub/newbt.h b/fs/xfs/scrub/newbt.h
new file mode 100644
index 000000000000..89f8e3970b1f
--- /dev/null
+++ b/fs/xfs/scrub/newbt.h
@@ -0,0 +1,68 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2022-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_NEWBT_H__
+#define __XFS_SCRUB_NEWBT_H__
+
+struct xrep_newbt_resv {
+ /* Link to list of extents that we've reserved. */
+ struct list_head list;
+
+ struct xfs_perag *pag;
+
+ /* Automatically frees this reservation if we don't commit. */
+ struct xfs_alloc_autoreap autoreap;
+
+ /* AG block of the extent we reserved. */
+ xfs_agblock_t agbno;
+
+ /* Length of the reservation. */
+ xfs_extlen_t len;
+
+ /* How much of this reservation has been used. */
+ xfs_extlen_t used;
+};
+
+struct xrep_newbt {
+ struct xfs_scrub *sc;
+
+ /* List of extents that we've reserved. */
+ struct list_head resv_list;
+
+ /* Fake root for new btree. */
+ union {
+ struct xbtree_afakeroot afake;
+ struct xbtree_ifakeroot ifake;
+ };
+
+ /* rmap owner of these blocks */
+ struct xfs_owner_info oinfo;
+
+ /* btree geometry for the bulk loader */
+ struct xfs_btree_bload bload;
+
+ /* Allocation hint */
+ xfs_fsblock_t alloc_hint;
+
+ /* per-ag reservation type */
+ enum xfs_ag_resv_type resv;
+};
+
+void xrep_newbt_init_bare(struct xrep_newbt *xnr, struct xfs_scrub *sc);
+void xrep_newbt_init_ag(struct xrep_newbt *xnr, struct xfs_scrub *sc,
+ const struct xfs_owner_info *oinfo, xfs_fsblock_t alloc_hint,
+ enum xfs_ag_resv_type resv);
+int xrep_newbt_init_inode(struct xrep_newbt *xnr, struct xfs_scrub *sc,
+ int whichfork, const struct xfs_owner_info *oinfo);
+int xrep_newbt_alloc_blocks(struct xrep_newbt *xnr, uint64_t nr_blocks);
+int xrep_newbt_add_extent(struct xrep_newbt *xnr, struct xfs_perag *pag,
+ xfs_agblock_t agbno, xfs_extlen_t len);
+void xrep_newbt_cancel(struct xrep_newbt *xnr);
+int xrep_newbt_commit(struct xrep_newbt *xnr);
+int xrep_newbt_claim_block(struct xfs_btree_cur *cur, struct xrep_newbt *xnr,
+ union xfs_btree_ptr *ptr);
+unsigned int xrep_newbt_unused_blocks(struct xrep_newbt *xnr);
+
+#endif /* __XFS_SCRUB_NEWBT_H__ */
diff --git a/fs/xfs/scrub/off_bitmap.h b/fs/xfs/scrub/off_bitmap.h
new file mode 100644
index 000000000000..0d3f9e6c1aad
--- /dev/null
+++ b/fs/xfs/scrub/off_bitmap.h
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2022-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_OFF_BITMAP_H__
+#define __XFS_SCRUB_OFF_BITMAP_H__
+
+/* Bitmaps, but type-checked for xfs_fileoff_t */
+
+struct xoff_bitmap {
+ struct xbitmap64 offbitmap;
+};
+
+static inline void xoff_bitmap_init(struct xoff_bitmap *bitmap)
+{
+ xbitmap64_init(&bitmap->offbitmap);
+}
+
+static inline void xoff_bitmap_destroy(struct xoff_bitmap *bitmap)
+{
+ xbitmap64_destroy(&bitmap->offbitmap);
+}
+
+static inline int xoff_bitmap_set(struct xoff_bitmap *bitmap,
+ xfs_fileoff_t off, xfs_filblks_t len)
+{
+ return xbitmap64_set(&bitmap->offbitmap, off, len);
+}
+
+static inline int xoff_bitmap_walk(struct xoff_bitmap *bitmap,
+ xbitmap64_walk_fn fn, void *priv)
+{
+ return xbitmap64_walk(&bitmap->offbitmap, fn, priv);
+}
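+
+/*
+ * A minimal usage sketch; walk_fn and priv are assumed to be supplied by the
+ * caller:
+ *
+ *  struct xoff_bitmap map;
+ *  int error;
+ *
+ *  xoff_bitmap_init(&map);
+ *  error = xoff_bitmap_set(&map, off, len);
+ *  if (!error)
+ *      error = xoff_bitmap_walk(&map, walk_fn, priv);
+ *  xoff_bitmap_destroy(&map);
+ */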
+
+#endif /* __XFS_SCRUB_OFF_BITMAP_H__ */
diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c
index e6155d86f791..7db873672146 100644
--- a/fs/xfs/scrub/parent.c
+++ b/fs/xfs/scrub/parent.c
@@ -156,6 +156,16 @@ xchk_parent_validate(
goto out_rele;
}
+ /*
+ * We cannot yet validate this parent pointer if the directory looks as
+ * though it has been zapped by the inode record repair code.
+ */
+ if (xchk_dir_looks_zapped(dp)) {
+ error = -EBUSY;
+ xchk_set_incomplete(sc);
+ goto out_unlock;
+ }
+
/* Look for a directory entry in the parent pointing to the child. */
error = xchk_dir_walk(sc, dp, xchk_parent_actor, &spc);
if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error))
@@ -217,6 +227,13 @@ xchk_parent(
*/
error = xchk_parent_validate(sc, parent_ino);
} while (error == -EAGAIN);
+ if (error == -EBUSY) {
+ /*
+ * We could not scan a directory, so we marked the check
+ * incomplete. No further error return is necessary.
+ */
+ return 0;
+ }
return error;
}
diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c
index 5671c8153433..183d531875ea 100644
--- a/fs/xfs/scrub/quota.c
+++ b/fs/xfs/scrub/quota.c
@@ -6,6 +6,7 @@
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
+#include "xfs_bit.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
@@ -17,9 +18,10 @@
#include "xfs_bmap.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
+#include "scrub/quota.h"
/* Convert a scrub type code to a DQ flag, or return 0 if error. */
-static inline xfs_dqtype_t
+xfs_dqtype_t
xchk_quota_to_dqtype(
struct xfs_scrub *sc)
{
@@ -75,14 +77,70 @@ struct xchk_quota_info {
xfs_dqid_t last_id;
};
+/* There's a written block backing this dquot, right? */
+STATIC int
+xchk_quota_item_bmap(
+ struct xfs_scrub *sc,
+ struct xfs_dquot *dq,
+ xfs_fileoff_t offset)
+{
+ struct xfs_bmbt_irec irec;
+ struct xfs_mount *mp = sc->mp;
+ int nmaps = 1;
+ int error;
+
+ if (!xfs_verify_fileoff(mp, offset)) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+ return 0;
+ }
+
+ if (dq->q_fileoffset != offset) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+ return 0;
+ }
+
+ error = xfs_bmapi_read(sc->ip, offset, 1, &irec, &nmaps, 0);
+ if (error)
+ return error;
+
+ if (nmaps != 1) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+ return 0;
+ }
+
+ if (!xfs_verify_fsbno(mp, irec.br_startblock))
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+ if (XFS_FSB_TO_DADDR(mp, irec.br_startblock) != dq->q_blkno)
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+ if (!xfs_bmap_is_written_extent(&irec))
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+
+ return 0;
+}
+
+/* Complain if a quota timer is incorrectly set. */
+static inline void
+xchk_quota_item_timer(
+ struct xfs_scrub *sc,
+ xfs_fileoff_t offset,
+ const struct xfs_dquot_res *res)
+{
+ if ((res->softlimit && res->count > res->softlimit) ||
+ (res->hardlimit && res->count > res->hardlimit)) {
+ if (!res->timer)
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+ } else {
+ if (res->timer)
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+ }
+}
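+
+/*
+ * For example (illustrative numbers): blk.count = 150 with blk.softlimit =
+ * 100 requires a grace timer; blk.count = 50 with the same softlimit, or a
+ * softlimit of zero (meaning no limit), requires that no timer be set.
+ */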
+
/* Scrub the fields in an individual quota item. */
STATIC int
xchk_quota_item(
- struct xfs_dquot *dq,
- xfs_dqtype_t dqtype,
- void *priv)
+ struct xchk_quota_info *sqi,
+ struct xfs_dquot *dq)
{
- struct xchk_quota_info *sqi = priv;
struct xfs_scrub *sc = sqi->sc;
struct xfs_mount *mp = sc->mp;
struct xfs_quotainfo *qi = mp->m_quotainfo;
@@ -94,6 +152,17 @@ xchk_quota_item(
return error;
/*
+ * We want to validate the bmap record for the storage backing this
+ * dquot, so we need to lock the dquot and the quota file. For quota
+ * operations, the locking order is first the ILOCK and then the dquot.
+ * However, dqiterate gave us a locked dquot, so drop the dquot lock to
+ * get the ILOCK.
+ */
+ xfs_dqunlock(dq);
+ xchk_ilock(sc, XFS_ILOCK_SHARED);
+ xfs_dqlock(dq);
+
+ /*
* Except for the root dquot, the actual dquot we got must either have
* the same or higher id as we saw before.
*/
@@ -103,6 +172,11 @@ xchk_quota_item(
sqi->last_id = dq->q_id;
+ error = xchk_quota_item_bmap(sc, dq, offset);
+ xchk_iunlock(sc, XFS_ILOCK_SHARED);
+ if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, offset, &error))
+ return error;
+
/*
* Warn if the hard limits are larger than the fs.
* Administrators can do this, though in production this seems
@@ -166,6 +240,10 @@ xchk_quota_item(
dq->q_rtb.count > dq->q_rtb.hardlimit)
xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset);
+ xchk_quota_item_timer(sc, offset, &dq->q_blk);
+ xchk_quota_item_timer(sc, offset, &dq->q_ino);
+ xchk_quota_item_timer(sc, offset, &dq->q_rtb);
+
out:
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
return -ECANCELED;
@@ -191,7 +269,7 @@ xchk_quota_data_fork(
return error;
/* Check for data fork problems that apply only to quota files. */
- max_dqid_off = ((xfs_dqid_t)-1) / qi->qi_dqperchunk;
+ max_dqid_off = XFS_DQ_ID_MAX / qi->qi_dqperchunk;
ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
for_each_xfs_iext(ifp, &icur, &irec) {
if (xchk_should_terminate(sc, &error))
@@ -218,9 +296,11 @@ int
xchk_quota(
struct xfs_scrub *sc)
{
- struct xchk_quota_info sqi;
+ struct xchk_dqiter cursor = { };
+ struct xchk_quota_info sqi = { .sc = sc };
struct xfs_mount *mp = sc->mp;
struct xfs_quotainfo *qi = mp->m_quotainfo;
+ struct xfs_dquot *dq;
xfs_dqtype_t dqtype;
int error = 0;
@@ -239,10 +319,15 @@ xchk_quota(
* functions.
*/
xchk_iunlock(sc, sc->ilock_flags);
- sqi.sc = sc;
- sqi.last_id = 0;
- error = xfs_qm_dqiterate(mp, dqtype, xchk_quota_item, &sqi);
- xchk_ilock(sc, XFS_ILOCK_EXCL);
+
+ /* Now look for things that the quota verifiers won't complain about. */
+ xchk_dqiter_init(&cursor, sc, dqtype);
+ while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) {
+ error = xchk_quota_item(&sqi, dq);
+ xfs_qm_dqput(dq);
+ if (error)
+ break;
+ }
if (error == -ECANCELED)
error = 0;
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK,
diff --git a/fs/xfs/scrub/quota.h b/fs/xfs/scrub/quota.h
new file mode 100644
index 000000000000..6c7134ce2385
--- /dev/null
+++ b/fs/xfs/scrub/quota.h
@@ -0,0 +1,36 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2018-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_QUOTA_H__
+#define __XFS_SCRUB_QUOTA_H__
+
+xfs_dqtype_t xchk_quota_to_dqtype(struct xfs_scrub *sc);
+
+/* dquot iteration code */
+
+struct xchk_dqiter {
+ struct xfs_scrub *sc;
+
+ /* Quota file that we're walking. */
+ struct xfs_inode *quota_ip;
+
+ /* Cached data fork mapping for the dquot. */
+ struct xfs_bmbt_irec bmap;
+
+ /* The next dquot to scan. */
+ uint64_t id;
+
+ /* Quota type (user/group/project). */
+ xfs_dqtype_t dqtype;
+
+ /* Data fork sequence number to detect stale mappings. */
+ unsigned int if_seq;
+};
+
+void xchk_dqiter_init(struct xchk_dqiter *cursor, struct xfs_scrub *sc,
+ xfs_dqtype_t dqtype);
+int xchk_dquot_iter(struct xchk_dqiter *cursor, struct xfs_dquot **dqpp);
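+
+/*
+ * Typical iteration pattern (illustrative; this mirrors the callers in
+ * quota.c and quota_repair.c):
+ *
+ *    struct xchk_dqiter cursor = { };
+ *    struct xfs_dquot *dq;
+ *
+ *    xchk_dqiter_init(&cursor, sc, dqtype);
+ *    while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) {
+ *            error = do_something(sc, dq);
+ *            xfs_qm_dqput(dq);
+ *            if (error)
+ *                    break;
+ *    }
+ *
+ * The dquot is returned locked with a reference held; xfs_qm_dqput
+ * drops both.  do_something() is a hypothetical per-dquot callback.
+ */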
+
+#endif /* __XFS_SCRUB_QUOTA_H__ */
diff --git a/fs/xfs/scrub/quota_repair.c b/fs/xfs/scrub/quota_repair.c
new file mode 100644
index 000000000000..0bab4c30cb85
--- /dev/null
+++ b/fs/xfs/scrub/quota_repair.c
@@ -0,0 +1,575 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2018-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_inode_fork.h"
+#include "xfs_alloc.h"
+#include "xfs_bmap.h"
+#include "xfs_quota.h"
+#include "xfs_qm.h"
+#include "xfs_dquot.h"
+#include "xfs_dquot_item.h"
+#include "xfs_reflink.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_trans_space.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/quota.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+
+/*
+ * Quota Repair
+ * ============
+ *
+ * Quota repairs are fairly simplistic; we fix everything that the dquot
+ * verifiers complain about, cap any counters or limits that make no sense,
+ * and schedule a quotacheck if we had to fix anything. We also repair any
+ * data fork extent records that don't apply to metadata files.
+ */
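+
+/*
+ * Illustrative userspace sketch (not part of this patch) of the
+ * clamp-and-mark-dirty pattern that xrep_quota_item uses below; the
+ * helper name is hypothetical:
+ *
+ *    static bool clamp_soft_limit(uint64_t *soft, uint64_t hard)
+ *    {
+ *            if (*soft > hard) {
+ *                    *soft = hard;   // cap a nonsensical limit
+ *                    return true;    // caller must log the dquot
+ *            }
+ *            return false;
+ *    }
+ */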
+
+struct xrep_quota_info {
+ struct xfs_scrub *sc;
+ bool need_quotacheck;
+};
+
+/*
+ * Allocate a new block into a sparse hole in the quota file backing this
+ * dquot, initialize the block, and commit the whole mess.
+ */
+STATIC int
+xrep_quota_item_fill_bmap_hole(
+ struct xfs_scrub *sc,
+ struct xfs_dquot *dq,
+ struct xfs_bmbt_irec *irec)
+{
+ struct xfs_buf *bp;
+ struct xfs_mount *mp = sc->mp;
+ int nmaps = 1;
+ int error;
+
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+
+ /* Map a block into the file. */
+ error = xfs_trans_reserve_more(sc->tp, XFS_QM_DQALLOC_SPACE_RES(mp),
+ 0);
+ if (error)
+ return error;
+
+ error = xfs_bmapi_write(sc->tp, sc->ip, dq->q_fileoffset,
+ XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, 0,
+ irec, &nmaps);
+ if (error)
+ return error;
+ if (nmaps != 1)
+ return -ENOSPC;
+
+ dq->q_blkno = XFS_FSB_TO_DADDR(mp, irec->br_startblock);
+
+ trace_xrep_dquot_item_fill_bmap_hole(sc->mp, dq->q_type, dq->q_id);
+
+ /* Initialize the new block. */
+ error = xfs_trans_get_buf(sc->tp, mp->m_ddev_targp, dq->q_blkno,
+ mp->m_quotainfo->qi_dqchunklen, 0, &bp);
+ if (error)
+ return error;
+ bp->b_ops = &xfs_dquot_buf_ops;
+
+ xfs_qm_init_dquot_blk(sc->tp, dq->q_id, dq->q_type, bp);
+ xfs_buf_set_ref(bp, XFS_DQUOT_REF);
+
+ /*
+ * Finish the mapping transactions and roll one more time to
+ * disconnect sc->ip from sc->tp.
+ */
+ error = xrep_defer_finish(sc);
+ if (error)
+ return error;
+ return xfs_trans_roll(&sc->tp);
+}
+
+/* Make sure there's a written block backing this dquot */
+STATIC int
+xrep_quota_item_bmap(
+ struct xfs_scrub *sc,
+ struct xfs_dquot *dq,
+ bool *dirty)
+{
+ struct xfs_bmbt_irec irec;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_quotainfo *qi = mp->m_quotainfo;
+ xfs_fileoff_t offset = dq->q_id / qi->qi_dqperchunk;
+ int nmaps = 1;
+ int error;
+
+ /* The computed file offset should always be valid. */
+ if (!xfs_verify_fileoff(mp, offset)) {
+ ASSERT(xfs_verify_fileoff(mp, offset));
+ return -EFSCORRUPTED;
+ }
+ dq->q_fileoffset = offset;
+
+ error = xfs_bmapi_read(sc->ip, offset, 1, &irec, &nmaps, 0);
+ if (error)
+ return error;
+
+ if (nmaps < 1 || !xfs_bmap_is_real_extent(&irec)) {
+ /* Hole/delalloc extent; allocate a real block. */
+ error = xrep_quota_item_fill_bmap_hole(sc, dq, &irec);
+ if (error)
+ return error;
+ } else if (irec.br_state != XFS_EXT_NORM) {
+ /* Unwritten extent?  We converted those to written earlier. */
+ ASSERT(irec.br_state == XFS_EXT_NORM);
+ return -EFSCORRUPTED;
+ } else if (dq->q_blkno != XFS_FSB_TO_DADDR(mp, irec.br_startblock)) {
+ /*
+ * If the cached daddr is incorrect, repair probably punched a
+ * hole out of the quota file and filled it back in with a new
+ * block. Update the block mapping in the dquot.
+ */
+ dq->q_blkno = XFS_FSB_TO_DADDR(mp, irec.br_startblock);
+ }
+
+ *dirty = true;
+ return 0;
+}
+
+/* Reset quota timers if incorrectly set. */
+static inline void
+xrep_quota_item_timer(
+ struct xfs_scrub *sc,
+ const struct xfs_dquot_res *res,
+ bool *dirty)
+{
+ if ((res->softlimit && res->count > res->softlimit) ||
+ (res->hardlimit && res->count > res->hardlimit)) {
+ if (!res->timer)
+ *dirty = true;
+ } else {
+ if (res->timer)
+ *dirty = true;
+ }
+}
+
+/* Scrub the fields in an individual quota item. */
+STATIC int
+xrep_quota_item(
+ struct xrep_quota_info *rqi,
+ struct xfs_dquot *dq)
+{
+ struct xfs_scrub *sc = rqi->sc;
+ struct xfs_mount *mp = sc->mp;
+ xfs_ino_t fs_icount;
+ bool dirty = false;
+ int error = 0;
+
+ /* Last chance to abort before we start committing fixes. */
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ /*
+ * We might need to fix holes in the bmap record for the storage
+ * backing this dquot, so we need to lock the dquot and the quota file.
+ * dqiterate gave us a locked dquot, so drop the dquot lock to get the
+ * ILOCK_EXCL.
+ */
+ xfs_dqunlock(dq);
+ xchk_ilock(sc, XFS_ILOCK_EXCL);
+ xfs_dqlock(dq);
+
+ error = xrep_quota_item_bmap(sc, dq, &dirty);
+ xchk_iunlock(sc, XFS_ILOCK_EXCL);
+ if (error)
+ return error;
+
+ /* Check the limits. */
+ if (dq->q_blk.softlimit > dq->q_blk.hardlimit) {
+ dq->q_blk.softlimit = dq->q_blk.hardlimit;
+ dirty = true;
+ }
+
+ if (dq->q_ino.softlimit > dq->q_ino.hardlimit) {
+ dq->q_ino.softlimit = dq->q_ino.hardlimit;
+ dirty = true;
+ }
+
+ if (dq->q_rtb.softlimit > dq->q_rtb.hardlimit) {
+ dq->q_rtb.softlimit = dq->q_rtb.hardlimit;
+ dirty = true;
+ }
+
+ /*
+ * Check that usage doesn't exceed physical limits. However, on
+ * a reflink filesystem we're allowed to exceed physical space
+ * if there are no quota limits. We don't know what the real number
+ * is, but we can make quotacheck find out for us.
+ */
+ if (!xfs_has_reflink(mp) && dq->q_blk.count > mp->m_sb.sb_dblocks) {
+ dq->q_blk.reserved -= dq->q_blk.count;
+ dq->q_blk.reserved += mp->m_sb.sb_dblocks;
+ dq->q_blk.count = mp->m_sb.sb_dblocks;
+ rqi->need_quotacheck = true;
+ dirty = true;
+ }
+ fs_icount = percpu_counter_sum(&mp->m_icount);
+ if (dq->q_ino.count > fs_icount) {
+ dq->q_ino.reserved -= dq->q_ino.count;
+ dq->q_ino.reserved += fs_icount;
+ dq->q_ino.count = fs_icount;
+ rqi->need_quotacheck = true;
+ dirty = true;
+ }
+ if (dq->q_rtb.count > mp->m_sb.sb_rblocks) {
+ dq->q_rtb.reserved -= dq->q_rtb.count;
+ dq->q_rtb.reserved += mp->m_sb.sb_rblocks;
+ dq->q_rtb.count = mp->m_sb.sb_rblocks;
+ rqi->need_quotacheck = true;
+ dirty = true;
+ }
+
+ xrep_quota_item_timer(sc, &dq->q_blk, &dirty);
+ xrep_quota_item_timer(sc, &dq->q_ino, &dirty);
+ xrep_quota_item_timer(sc, &dq->q_rtb, &dirty);
+
+ if (!dirty)
+ return 0;
+
+ trace_xrep_dquot_item(sc->mp, dq->q_type, dq->q_id);
+
+ dq->q_flags |= XFS_DQFLAG_DIRTY;
+ xfs_trans_dqjoin(sc->tp, dq);
+ if (dq->q_id) {
+ xfs_qm_adjust_dqlimits(dq);
+ xfs_qm_adjust_dqtimers(dq);
+ }
+ xfs_trans_log_dquot(sc->tp, dq);
+ error = xfs_trans_roll(&sc->tp);
+ xfs_dqlock(dq);
+ return error;
+}
+
+/* Fix a quota timer so that we can pass the verifier. */
+STATIC void
+xrep_quota_fix_timer(
+ struct xfs_mount *mp,
+ const struct xfs_disk_dquot *ddq,
+ __be64 softlimit,
+ __be64 countnow,
+ __be32 *timer,
+ time64_t timelimit)
+{
+ uint64_t soft = be64_to_cpu(softlimit);
+ uint64_t count = be64_to_cpu(countnow);
+ time64_t new_timer;
+ uint32_t t;
+
+ if (!soft || count <= soft || *timer != 0)
+ return;
+
+ new_timer = xfs_dquot_set_timeout(mp,
+ ktime_get_real_seconds() + timelimit);
+ if (ddq->d_type & XFS_DQTYPE_BIGTIME)
+ t = xfs_dq_unix_to_bigtime(new_timer);
+ else
+ t = new_timer;
+
+ *timer = cpu_to_be32(t);
+}
+
+/* Fix anything the verifiers complain about. */
+STATIC int
+xrep_quota_block(
+ struct xfs_scrub *sc,
+ xfs_daddr_t daddr,
+ xfs_dqtype_t dqtype,
+ xfs_dqid_t id)
+{
+ struct xfs_dqblk *dqblk;
+ struct xfs_disk_dquot *ddq;
+ struct xfs_quotainfo *qi = sc->mp->m_quotainfo;
+ struct xfs_def_quota *defq = xfs_get_defquota(qi, dqtype);
+ struct xfs_buf *bp = NULL;
+ enum xfs_blft buftype = 0;
+ int i;
+ int error;
+
+ error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp, daddr,
+ qi->qi_dqchunklen, 0, &bp, &xfs_dquot_buf_ops);
+ switch (error) {
+ case -EFSBADCRC:
+ case -EFSCORRUPTED:
+ /* Failed verifier, retry read with no ops. */
+ error = xfs_trans_read_buf(sc->mp, sc->tp,
+ sc->mp->m_ddev_targp, daddr, qi->qi_dqchunklen,
+ 0, &bp, NULL);
+ if (error)
+ return error;
+ break;
+ case 0:
+ dqblk = bp->b_addr;
+ ddq = &dqblk[0].dd_diskdq;
+
+ /*
+ * If there's nothing that would impede a dqiterate, we're
+ * done.
+ */
+ if ((ddq->d_type & XFS_DQTYPE_REC_MASK) != dqtype ||
+ id == be32_to_cpu(ddq->d_id)) {
+ xfs_trans_brelse(sc->tp, bp);
+ return 0;
+ }
+ break;
+ default:
+ return error;
+ }
+
+ /* Something's wrong with the block, fix the whole thing. */
+ dqblk = bp->b_addr;
+ bp->b_ops = &xfs_dquot_buf_ops;
+ for (i = 0; i < qi->qi_dqperchunk; i++, dqblk++) {
+ ddq = &dqblk->dd_diskdq;
+
+ trace_xrep_disk_dquot(sc->mp, dqtype, id + i);
+
+ ddq->d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
+ ddq->d_version = XFS_DQUOT_VERSION;
+ ddq->d_type = dqtype;
+ ddq->d_id = cpu_to_be32(id + i);
+
+ if (xfs_has_bigtime(sc->mp) && ddq->d_id)
+ ddq->d_type |= XFS_DQTYPE_BIGTIME;
+
+ xrep_quota_fix_timer(sc->mp, ddq, ddq->d_blk_softlimit,
+ ddq->d_bcount, &ddq->d_btimer,
+ defq->blk.time);
+
+ xrep_quota_fix_timer(sc->mp, ddq, ddq->d_ino_softlimit,
+ ddq->d_icount, &ddq->d_itimer,
+ defq->ino.time);
+
+ xrep_quota_fix_timer(sc->mp, ddq, ddq->d_rtb_softlimit,
+ ddq->d_rtbcount, &ddq->d_rtbtimer,
+ defq->rtb.time);
+
+ /* We only support v5 filesystems so always set these. */
+ uuid_copy(&dqblk->dd_uuid, &sc->mp->m_sb.sb_meta_uuid);
+ xfs_update_cksum((char *)dqblk, sizeof(struct xfs_dqblk),
+ XFS_DQUOT_CRC_OFF);
+ dqblk->dd_lsn = 0;
+ }
+ switch (dqtype) {
+ case XFS_DQTYPE_USER:
+ buftype = XFS_BLFT_UDQUOT_BUF;
+ break;
+ case XFS_DQTYPE_GROUP:
+ buftype = XFS_BLFT_GDQUOT_BUF;
+ break;
+ case XFS_DQTYPE_PROJ:
+ buftype = XFS_BLFT_PDQUOT_BUF;
+ break;
+ }
+ xfs_trans_buf_set_type(sc->tp, bp, buftype);
+ xfs_trans_log_buf(sc->tp, bp, 0, BBTOB(bp->b_length) - 1);
+ return xrep_roll_trans(sc);
+}
+
+/*
+ * Repair a quota file's data fork. The function returns with the inode
+ * joined.
+ */
+STATIC int
+xrep_quota_data_fork(
+ struct xfs_scrub *sc,
+ xfs_dqtype_t dqtype)
+{
+ struct xfs_bmbt_irec irec = { 0 };
+ struct xfs_iext_cursor icur;
+ struct xfs_quotainfo *qi = sc->mp->m_quotainfo;
+ struct xfs_ifork *ifp;
+ xfs_fileoff_t max_dqid_off;
+ xfs_fileoff_t off;
+ xfs_fsblock_t fsbno;
+ bool truncate = false;
+ bool joined = false;
+ int error = 0;
+
+ error = xrep_metadata_inode_forks(sc);
+ if (error)
+ goto out;
+
+ /* Check for data fork problems that apply only to quota files. */
+ max_dqid_off = XFS_DQ_ID_MAX / qi->qi_dqperchunk;
+ ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
+ for_each_xfs_iext(ifp, &icur, &irec) {
+ if (isnullstartblock(irec.br_startblock)) {
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+
+ if (irec.br_startoff > max_dqid_off ||
+ irec.br_startoff + irec.br_blockcount - 1 > max_dqid_off) {
+ truncate = true;
+ break;
+ }
+
+ /* Convert unwritten extents to real ones. */
+ if (irec.br_state == XFS_EXT_UNWRITTEN) {
+ struct xfs_bmbt_irec nrec;
+ int nmap = 1;
+
+ if (!joined) {
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+ joined = true;
+ }
+
+ error = xfs_bmapi_write(sc->tp, sc->ip,
+ irec.br_startoff, irec.br_blockcount,
+ XFS_BMAPI_CONVERT, 0, &nrec, &nmap);
+ if (error)
+ goto out;
+ if (nmap != 1) {
+ error = -ENOSPC;
+ goto out;
+ }
+ ASSERT(nrec.br_startoff == irec.br_startoff);
+ ASSERT(nrec.br_blockcount == irec.br_blockcount);
+
+ error = xfs_defer_finish(&sc->tp);
+ if (error)
+ goto out;
+ }
+ }
+
+ if (!joined) {
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+ joined = true;
+ }
+
+ if (truncate) {
+ /* Erase everything after the block containing the max dquot */
+ error = xfs_bunmapi_range(&sc->tp, sc->ip, 0,
+ max_dqid_off * sc->mp->m_sb.sb_blocksize,
+ XFS_MAX_FILEOFF);
+ if (error)
+ goto out;
+
+ /* Remove all CoW reservations. */
+ error = xfs_reflink_cancel_cow_blocks(sc->ip, &sc->tp, 0,
+ XFS_MAX_FILEOFF, true);
+ if (error)
+ goto out;
+ sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
+
+ /*
+ * Always re-log the inode so that our permanent transaction
+ * can keep on rolling it forward in the log.
+ */
+ xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE);
+ }
+
+ /* Now go fix anything that fails the verifiers. */
+ for_each_xfs_iext(ifp, &icur, &irec) {
+ for (fsbno = irec.br_startblock, off = irec.br_startoff;
+ fsbno < irec.br_startblock + irec.br_blockcount;
+ fsbno += XFS_DQUOT_CLUSTER_SIZE_FSB,
+ off += XFS_DQUOT_CLUSTER_SIZE_FSB) {
+ error = xrep_quota_block(sc,
+ XFS_FSB_TO_DADDR(sc->mp, fsbno),
+ dqtype, off * qi->qi_dqperchunk);
+ if (error)
+ goto out;
+ }
+ }
+
+out:
+ return error;
+}
+
+/*
+ * Go fix anything in the quota items that we might have complained about. Now
+ * that we've checked the quota inode data fork we have to drop ILOCK_EXCL to
+ * use the regular dquot functions.
+ */
+STATIC int
+xrep_quota_problems(
+ struct xfs_scrub *sc,
+ xfs_dqtype_t dqtype)
+{
+ struct xchk_dqiter cursor = { };
+ struct xrep_quota_info rqi = { .sc = sc };
+ struct xfs_dquot *dq;
+ int error;
+
+ xchk_dqiter_init(&cursor, sc, dqtype);
+ while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) {
+ error = xrep_quota_item(&rqi, dq);
+ xfs_qm_dqput(dq);
+ if (error)
+ break;
+ }
+ if (error)
+ return error;
+
+ /* Make a quotacheck happen. */
+ if (rqi.need_quotacheck)
+ xrep_force_quotacheck(sc, dqtype);
+ return 0;
+}
+
+/* Repair all of a quota type's items. */
+int
+xrep_quota(
+ struct xfs_scrub *sc)
+{
+ xfs_dqtype_t dqtype;
+ int error;
+
+ dqtype = xchk_quota_to_dqtype(sc);
+
+ /*
+ * Re-take the ILOCK so that we can fix any problems that we found
+ * with the data fork mappings, or with the dquot bufs themselves.
+ */
+ if (!(sc->ilock_flags & XFS_ILOCK_EXCL))
+ xchk_ilock(sc, XFS_ILOCK_EXCL);
+ error = xrep_quota_data_fork(sc, dqtype);
+ if (error)
+ return error;
+
+ /*
+ * Finish deferred items and roll the transaction to unjoin the quota
+ * inode from the transaction so that we can unlock the quota inode; we
+ * play only with dquots from now on.
+ */
+ error = xrep_defer_finish(sc);
+ if (error)
+ return error;
+ error = xfs_trans_roll(&sc->tp);
+ if (error)
+ return error;
+ xchk_iunlock(sc, sc->ilock_flags);
+
+ /* Fix anything the dquot verifiers don't complain about. */
+ error = xrep_quota_problems(sc, dqtype);
+ if (error)
+ return error;
+
+ return xrep_trans_commit(sc);
+}
diff --git a/fs/xfs/scrub/readdir.c b/fs/xfs/scrub/readdir.c
index e51c1544be63..16462332c897 100644
--- a/fs/xfs/scrub/readdir.c
+++ b/fs/xfs/scrub/readdir.c
@@ -36,16 +36,14 @@ xchk_dir_walk_sf(
struct xfs_mount *mp = dp->i_mount;
struct xfs_da_geometry *geo = mp->m_dir_geo;
struct xfs_dir2_sf_entry *sfep;
- struct xfs_dir2_sf_hdr *sfp;
+ struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data;
xfs_ino_t ino;
xfs_dir2_dataptr_t dapos;
unsigned int i;
int error;
ASSERT(dp->i_df.if_bytes == dp->i_disk_size);
- ASSERT(dp->i_df.if_u1.if_data != NULL);
-
- sfp = (struct xfs_dir2_sf_hdr *)dp->i_df.if_u1.if_data;
+ ASSERT(sfp != NULL);
/* dot entry */
dapos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c
index 86a62420e02c..f99eca799809 100644
--- a/fs/xfs/scrub/reap.c
+++ b/fs/xfs/scrub/reap.c
@@ -20,6 +20,7 @@
#include "xfs_ialloc_btree.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
+#include "xfs_refcount.h"
#include "xfs_refcount_btree.h"
#include "xfs_extent_busy.h"
#include "xfs_ag.h"
@@ -31,11 +32,14 @@
#include "xfs_da_btree.h"
#include "xfs_attr.h"
#include "xfs_attr_remote.h"
+#include "xfs_defer.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/bitmap.h"
+#include "scrub/agb_bitmap.h"
+#include "scrub/fsb_bitmap.h"
#include "scrub/reap.h"
/*
@@ -73,10 +77,10 @@
* with only the same rmap owner but the block is not owned by something with
* the same rmap owner, the block will be freed.
*
- * The caller is responsible for locking the AG headers for the entire rebuild
- * operation so that nothing else can sneak in and change the AG state while
- * we're not looking. We must also invalidate any buffers associated with
- * @bitmap.
+ * The caller is responsible for locking the AG headers/inode for the entire
+ * rebuild operation so that nothing else can sneak in and change the incore
+ * state while we're not looking. We must also invalidate any buffers
+ * associated with @bitmap.
*/
/* Information about reaping extents after a repair. */
@@ -247,7 +251,7 @@ xreap_agextent_binval(
max_fsbs = min_t(xfs_agblock_t, agbno_next - bno,
xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX));
- for (fsbcount = 1; fsbcount < max_fsbs; fsbcount++) {
+ for (fsbcount = 1; fsbcount <= max_fsbs; fsbcount++) {
struct xfs_buf *bp = NULL;
xfs_daddr_t daddr;
int error;
@@ -377,6 +381,17 @@ xreap_agextent_iter(
trace_xreap_dispose_unmap_extent(sc->sa.pag, agbno, *aglenp);
rs->force_roll = true;
+
+ if (rs->oinfo == &XFS_RMAP_OINFO_COW) {
+ /*
+ * If we're unmapping CoW staging extents, remove the
+ * records from the refcountbt, which will remove the
+ * rmap record as well.
+ */
+ xfs_refcount_free_cow_extent(sc->tp, fsbno, *aglenp);
+ return 0;
+ }
+
return xfs_rmap_free(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno,
*aglenp, rs->oinfo);
}
@@ -395,6 +410,26 @@ xreap_agextent_iter(
return 0;
}
+ /*
+ * If we're getting rid of CoW staging extents, use deferred work items
+ * to remove the refcountbt records (which removes the rmap records)
+ * and free the extent. We're not worried about the system going down
+ * here because log recovery walks the refcount btree to clean out the
+ * CoW staging extents.
+ */
+ if (rs->oinfo == &XFS_RMAP_OINFO_COW) {
+ ASSERT(rs->resv == XFS_AG_RESV_NONE);
+
+ xfs_refcount_free_cow_extent(sc->tp, fsbno, *aglenp);
+ error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, NULL,
+ rs->resv, true);
+ if (error)
+ return error;
+
+ rs->force_roll = true;
+ return 0;
+ }
+
/* Put blocks back on the AGFL one at a time. */
if (rs->resv == XFS_AG_RESV_AGFL) {
ASSERT(*aglenp == 1);
@@ -409,13 +444,17 @@ xreap_agextent_iter(
/*
* Use deferred frees to get rid of the old btree blocks to try to
* minimize the window in which we could crash and lose the old blocks.
+ * Add a defer ops barrier every other extent to avoid stressing the
+ * system with large EFIs.
*/
- error = __xfs_free_extent_later(sc->tp, fsbno, *aglenp, rs->oinfo,
+ error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, rs->oinfo,
rs->resv, true);
if (error)
return error;
rs->deferred++;
+ if (rs->deferred % 2 == 0)
+ xfs_defer_add_barrier(sc->tp);
return 0;
}
@@ -425,13 +464,12 @@ xreap_agextent_iter(
*/
STATIC int
xreap_agmeta_extent(
- uint64_t fsbno,
- uint64_t len,
+ uint32_t agbno,
+ uint32_t len,
void *priv)
{
struct xreap_state *rs = priv;
struct xfs_scrub *sc = rs->sc;
- xfs_agblock_t agbno = fsbno;
xfs_agblock_t agbno_next = agbno + len;
int error = 0;
@@ -496,3 +534,115 @@ xrep_reap_agblocks(
return 0;
}
+
+/*
+ * Break a file metadata extent into sub-extents by fate (crosslinked, not
+ * crosslinked), and dispose of each sub-extent separately. The extent must
+ * not cross an AG boundary.
+ */
+STATIC int
+xreap_fsmeta_extent(
+ uint64_t fsbno,
+ uint64_t len,
+ void *priv)
+{
+ struct xreap_state *rs = priv;
+ struct xfs_scrub *sc = rs->sc;
+ xfs_agnumber_t agno = XFS_FSB_TO_AGNO(sc->mp, fsbno);
+ xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno);
+ xfs_agblock_t agbno_next = agbno + len;
+ int error = 0;
+
+ ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
+ ASSERT(sc->ip != NULL);
+ ASSERT(!sc->sa.pag);
+
+ /*
+ * We're reaping blocks after repairing file metadata, which means that
+ * we have to init the xchk_ag structure ourselves.
+ */
+ sc->sa.pag = xfs_perag_get(sc->mp, agno);
+ if (!sc->sa.pag)
+ return -EFSCORRUPTED;
+
+ error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &sc->sa.agf_bp);
+ if (error)
+ goto out_pag;
+
+ while (agbno < agbno_next) {
+ xfs_extlen_t aglen;
+ bool crosslinked;
+
+ error = xreap_agextent_select(rs, agbno, agbno_next,
+ &crosslinked, &aglen);
+ if (error)
+ goto out_agf;
+
+ error = xreap_agextent_iter(rs, agbno, &aglen, crosslinked);
+ if (error)
+ goto out_agf;
+
+ if (xreap_want_defer_finish(rs)) {
+ /*
+ * Hold the AGF buffer across the deferred chain
+ * processing.
+ */
+ error = xrep_defer_finish(sc);
+ if (error)
+ goto out_agf;
+ xreap_defer_finish_reset(rs);
+ } else if (xreap_want_roll(rs)) {
+ /*
+ * Hold the AGF buffer across the transaction roll so
+ * that we don't have to reattach it to the scrub
+ * context.
+ */
+ xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
+ error = xfs_trans_roll_inode(&sc->tp, sc->ip);
+ xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
+ if (error)
+ goto out_agf;
+ xreap_reset(rs);
+ }
+
+ agbno += aglen;
+ }
+
+out_agf:
+ xfs_trans_brelse(sc->tp, sc->sa.agf_bp);
+ sc->sa.agf_bp = NULL;
+out_pag:
+ xfs_perag_put(sc->sa.pag);
+ sc->sa.pag = NULL;
+ return error;
+}
+
+/*
+ * Dispose of every block of every fs metadata extent in the bitmap.
+ * Do not use this to dispose of the mappings in an ondisk inode fork.
+ */
+int
+xrep_reap_fsblocks(
+ struct xfs_scrub *sc,
+ struct xfsb_bitmap *bitmap,
+ const struct xfs_owner_info *oinfo)
+{
+ struct xreap_state rs = {
+ .sc = sc,
+ .oinfo = oinfo,
+ .resv = XFS_AG_RESV_NONE,
+ };
+ int error;
+
+ ASSERT(xfs_has_rmapbt(sc->mp));
+ ASSERT(sc->ip != NULL);
+
+ error = xfsb_bitmap_walk(bitmap, xreap_fsmeta_extent, &rs);
+ if (error)
+ return error;
+
+ if (xreap_dirty(&rs))
+ return xrep_defer_finish(sc);
+
+ return 0;
+}
diff --git a/fs/xfs/scrub/reap.h b/fs/xfs/scrub/reap.h
index fe24626af164..0b69f16dd98f 100644
--- a/fs/xfs/scrub/reap.h
+++ b/fs/xfs/scrub/reap.h
@@ -6,7 +6,12 @@
#ifndef __XFS_SCRUB_REAP_H__
#define __XFS_SCRUB_REAP_H__
+struct xagb_bitmap;
+struct xfsb_bitmap;
+
int xrep_reap_agblocks(struct xfs_scrub *sc, struct xagb_bitmap *bitmap,
const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type);
+int xrep_reap_fsblocks(struct xfs_scrub *sc, struct xfsb_bitmap *bitmap,
+ const struct xfs_owner_info *oinfo);
#endif /* __XFS_SCRUB_REAP_H__ */
diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c
index 304ea1e1bfb0..bf22f245bbfa 100644
--- a/fs/xfs/scrub/refcount.c
+++ b/fs/xfs/scrub/refcount.c
@@ -441,7 +441,7 @@ xchk_refcountbt_rec(
struct xchk_refcbt_records *rrc = bs->private;
xfs_refcount_btrec_to_irec(rec, &irec);
- if (xfs_refcount_check_irec(bs->cur, &irec) != NULL) {
+ if (xfs_refcount_check_irec(bs->cur->bc_ag.pag, &irec) != NULL) {
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
return 0;
}
diff --git a/fs/xfs/scrub/refcount_repair.c b/fs/xfs/scrub/refcount_repair.c
new file mode 100644
index 000000000000..f38fccc42a20
--- /dev/null
+++ b/fs/xfs/scrub/refcount_repair.c
@@ -0,0 +1,794 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2018-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
+#include "xfs_inode.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_alloc.h"
+#include "xfs_ialloc.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_refcount.h"
+#include "xfs_refcount_btree.h"
+#include "xfs_error.h"
+#include "xfs_ag.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/bitmap.h"
+#include "scrub/agb_bitmap.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/newbt.h"
+#include "scrub/reap.h"
+
+/*
+ * Rebuilding the Reference Count Btree
+ * ====================================
+ *
+ * This algorithm is "borrowed" from xfs_repair. Imagine the rmap
+ * entries as rectangles representing extents of physical blocks, and
+ * that the rectangles can be laid down to allow them to overlap each
+ * other; then we know that we must emit a refcnt btree entry wherever
+ * the amount of overlap changes, i.e. the emission stimulus is
+ * level-triggered:
+ *
+ * - ---
+ * -- ----- ---- --- ------
+ * -- ---- ----------- ---- ---------
+ * -------------------------------- -----------
+ * ^ ^ ^^ ^^ ^ ^^ ^^^ ^^^^ ^ ^^ ^ ^ ^
+ * 2 1 23 21 3 43 234 2123 1 01 2 3 0
+ *
+ * For our purposes, a rmap is a tuple (startblock, len, fileoff, owner).
+ *
+ * Note that in the actual refcnt btree we don't store the refcount < 2
+ * cases because the bnobt tells us which blocks are free; single-use
+ * blocks aren't recorded in the bnobt or the refcntbt. If the rmapbt
+ * supports storing multiple entries covering a given block we could
+ * theoretically dispense with the refcntbt and simply count rmaps, but
+ * that's inefficient in the (hot) write path, so we'll take the cost of
+ * the extra tree to save time. Also there's no guarantee that rmap
+ * will be enabled.
+ *
+ * Given an array of rmaps sorted by physical block number, a starting
+ * physical block (sp), a bag to hold rmaps that cover sp, and the next
+ * physical block where the level changes (np), we can reconstruct the
+ * refcount btree as follows:
+ *
+ * While there are still unprocessed rmaps in the array,
+ * - Set sp to the physical block (pblk) of the next unprocessed rmap.
+ * - Add to the bag all rmaps in the array where startblock == sp.
+ * - Set np to the physical block where the bag size will change. This
+ * is the minimum of (the pblk of the next unprocessed rmap) and
+ * (startblock + len of each rmap in the bag).
+ * - Record the bag size as old_bag_size.
+ *
+ * - While the bag isn't empty,
+ * - Remove from the bag all rmaps where startblock + len == np.
+ * - Add to the bag all rmaps in the array where startblock == np.
+ * - If the bag size isn't old_bag_size, store the refcount entry
+ * (sp, np - sp, bag_size) in the refcnt btree.
+ * - If the bag is empty, break out of the inner loop.
+ * - Set old_bag_size to the bag size
+ * - Set sp = np.
+ * - Set np to the physical block where the bag size will change.
+ * This is the minimum of (the pblk of the next unprocessed rmap)
+ * and (startblock + len of each rmap in the bag).
+ *
+ * Like all the other repairers, we make a list of all the refcount
+ * records we need, then reinitialize the refcount btree root and
+ * insert all the records.
+ */
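+
+/*
+ * Self-contained userspace sketch of the sweep described above
+ * (illustrative only, not part of this patch; assumes <stdio.h> and
+ * <stdlib.h>).  Each (start, len) interval stands in for one shareable
+ * rmap, a +1/-1 event sweep plays the role of the rmap bag, and a
+ * refcount record is emitted whenever the overlap level changes:
+ *
+ *    struct ev { unsigned int pos; int delta; };
+ *
+ *    static int ev_cmp(const void *a, const void *b)
+ *    {
+ *            const struct ev *ea = a, *eb = b;
+ *
+ *            if (ea->pos != eb->pos)
+ *                    return ea->pos < eb->pos ? -1 : 1;
+ *            return 0;
+ *    }
+ *
+ *    static void emit_refcounts(const unsigned int (*ivl)[2],
+ *                               unsigned int nr, struct ev *evs)
+ *    {
+ *            unsigned int i, n = 0, pos, last = 0;
+ *            int count = 0, newcount;
+ *
+ *            for (i = 0; i < nr; i++) {
+ *                    evs[n++] = (struct ev){ ivl[i][0], +1 };
+ *                    evs[n++] = (struct ev){ ivl[i][0] + ivl[i][1], -1 };
+ *            }
+ *            qsort(evs, n, sizeof(*evs), ev_cmp);
+ *
+ *            for (i = 0; i < n; ) {
+ *                    pos = evs[i].pos;
+ *                    newcount = count;
+ *                    while (i < n && evs[i].pos == pos)
+ *                            newcount += evs[i++].delta;
+ *                    if (newcount == count)
+ *                            continue;       // level unchanged
+ *                    if (count >= 2)
+ *                            printf("start %u len %u refcount %d\n",
+ *                                   last, pos - last, count);
+ *                    last = pos;
+ *                    count = newcount;
+ *            }
+ *    }
+ */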
+
+/* The only parts of the rmap that we care about for computing refcounts. */
+struct xrep_refc_rmap {
+ xfs_agblock_t startblock;
+ xfs_extlen_t blockcount;
+} __packed;
+
+struct xrep_refc {
+ /* refcount extents */
+ struct xfarray *refcount_records;
+
+ /* new refcountbt information */
+ struct xrep_newbt new_btree;
+
+ /* old refcountbt blocks */
+ struct xagb_bitmap old_refcountbt_blocks;
+
+ struct xfs_scrub *sc;
+
+ /* get_records()'s position in the refcount record array. */
+ xfarray_idx_t array_cur;
+
+ /* # of refcountbt blocks */
+ xfs_extlen_t btblocks;
+};
+
+/* Check for any obvious conflicts with this shared/CoW staging extent. */
+STATIC int
+xrep_refc_check_ext(
+ struct xfs_scrub *sc,
+ const struct xfs_refcount_irec *rec)
+{
+ enum xbtree_recpacking outcome;
+ int error;
+
+ if (xfs_refcount_check_irec(sc->sa.pag, rec) != NULL)
+ return -EFSCORRUPTED;
+
+ /* Make sure this isn't free space. */
+ error = xfs_alloc_has_records(sc->sa.bno_cur, rec->rc_startblock,
+ rec->rc_blockcount, &outcome);
+ if (error)
+ return error;
+ if (outcome != XBTREE_RECPACKING_EMPTY)
+ return -EFSCORRUPTED;
+
+ /* Must not be an inode chunk. */
+ error = xfs_ialloc_has_inodes_at_extent(sc->sa.ino_cur,
+ rec->rc_startblock, rec->rc_blockcount, &outcome);
+ if (error)
+ return error;
+ if (outcome != XBTREE_RECPACKING_EMPTY)
+ return -EFSCORRUPTED;
+
+ return 0;
+}
+
+/* Record a reference count extent. */
+STATIC int
+xrep_refc_stash(
+ struct xrep_refc *rr,
+ enum xfs_refc_domain domain,
+ xfs_agblock_t agbno,
+ xfs_extlen_t len,
+ uint64_t refcount)
+{
+ struct xfs_refcount_irec irec = {
+ .rc_startblock = agbno,
+ .rc_blockcount = len,
+ .rc_domain = domain,
+ };
+ struct xfs_scrub *sc = rr->sc;
+ int error = 0;
+
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ irec.rc_refcount = min_t(uint64_t, MAXREFCOUNT, refcount);
+
+ error = xrep_refc_check_ext(rr->sc, &irec);
+ if (error)
+ return error;
+
+ trace_xrep_refc_found(sc->sa.pag, &irec);
+
+ return xfarray_append(rr->refcount_records, &irec);
+}
+
+/* Record a CoW staging extent. */
+STATIC int
+xrep_refc_stash_cow(
+ struct xrep_refc *rr,
+ xfs_agblock_t agbno,
+ xfs_extlen_t len)
+{
+ return xrep_refc_stash(rr, XFS_REFC_DOMAIN_COW, agbno, len, 1);
+}
+
+/* Decide if an rmap could describe a shared extent. */
+static inline bool
+xrep_refc_rmap_shareable(
+ struct xfs_mount *mp,
+ const struct xfs_rmap_irec *rmap)
+{
+ /* AG metadata are never shareable */
+ if (XFS_RMAP_NON_INODE_OWNER(rmap->rm_owner))
+ return false;
+
+ /* Metadata in files are never shareable */
+ if (xfs_internal_inum(mp, rmap->rm_owner))
+ return false;
+
+ /* Metadata and unwritten file blocks are not shareable. */
+ if (rmap->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK |
+ XFS_RMAP_UNWRITTEN))
+ return false;
+
+ return true;
+}
+
+/*
+ * Walk along the reverse mapping records until we find one that could describe
+ * a shared extent.
+ */
+STATIC int
+xrep_refc_walk_rmaps(
+ struct xrep_refc *rr,
+ struct xrep_refc_rmap *rrm,
+ bool *have_rec)
+{
+ struct xfs_rmap_irec rmap;
+ struct xfs_btree_cur *cur = rr->sc->sa.rmap_cur;
+ struct xfs_mount *mp = cur->bc_mp;
+ int have_gt;
+ int error = 0;
+
+ *have_rec = false;
+
+ /*
+ * Loop through the remaining rmaps. Remember CoW staging
+ * extents and the refcountbt blocks from the old tree for later
+ * disposal. We can only share written data fork extents, so
+ * keep looping until we find an rmap for one.
+ */
+ do {
+ if (xchk_should_terminate(rr->sc, &error))
+ return error;
+
+ error = xfs_btree_increment(cur, 0, &have_gt);
+ if (error)
+ return error;
+ if (!have_gt)
+ return 0;
+
+ error = xfs_rmap_get_rec(cur, &rmap, &have_gt);
+ if (error)
+ return error;
+ if (XFS_IS_CORRUPT(mp, !have_gt))
+ return -EFSCORRUPTED;
+
+ if (rmap.rm_owner == XFS_RMAP_OWN_COW) {
+ error = xrep_refc_stash_cow(rr, rmap.rm_startblock,
+ rmap.rm_blockcount);
+ if (error)
+ return error;
+ } else if (rmap.rm_owner == XFS_RMAP_OWN_REFC) {
+ /* refcountbt block, dump it when we're done. */
+ rr->btblocks += rmap.rm_blockcount;
+ error = xagb_bitmap_set(&rr->old_refcountbt_blocks,
+ rmap.rm_startblock, rmap.rm_blockcount);
+ if (error)
+ return error;
+ }
+ } while (!xrep_refc_rmap_shareable(mp, &rmap));
+
+ rrm->startblock = rmap.rm_startblock;
+ rrm->blockcount = rmap.rm_blockcount;
+ *have_rec = true;
+ return 0;
+}
+
+static inline uint32_t
+xrep_refc_encode_startblock(
+ const struct xfs_refcount_irec *irec)
+{
+ uint32_t start;
+
+ start = irec->rc_startblock & ~XFS_REFC_COWFLAG;
+ if (irec->rc_domain == XFS_REFC_DOMAIN_COW)
+ start |= XFS_REFC_COWFLAG;
+
+ return start;
+}
+
+/* Sort in the same order as the ondisk records. */
+static int
+xrep_refc_extent_cmp(
+ const void *a,
+ const void *b)
+{
+ const struct xfs_refcount_irec *ap = a;
+ const struct xfs_refcount_irec *bp = b;
+ uint32_t sa, sb;
+
+ sa = xrep_refc_encode_startblock(ap);
+ sb = xrep_refc_encode_startblock(bp);
+
+ if (sa > sb)
+ return 1;
+ if (sa < sb)
+ return -1;
+ return 0;
+}
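+
+/*
+ * Example of the resulting sort order (XFS_REFC_COWFLAG is bit 31): a
+ * shared extent starting at agbno 100 encodes as 100, while a CoW
+ * staging extent at agbno 100 encodes as 0x80000064, so every CoW
+ * record sorts after every shared record, matching the ondisk order.
+ */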
+
+/*
+ * Sort the refcount extents by startblock or else the btree records will be in
+ * the wrong order. Make sure the records do not overlap in physical space.
+ */
+STATIC int
+xrep_refc_sort_records(
+ struct xrep_refc *rr)
+{
+ struct xfs_refcount_irec irec;
+ xfarray_idx_t cur;
+ enum xfs_refc_domain dom = XFS_REFC_DOMAIN_SHARED;
+ xfs_agblock_t next_agbno = 0;
+ int error;
+
+ error = xfarray_sort(rr->refcount_records, xrep_refc_extent_cmp,
+ XFARRAY_SORT_KILLABLE);
+ if (error)
+ return error;
+
+ foreach_xfarray_idx(rr->refcount_records, cur) {
+ if (xchk_should_terminate(rr->sc, &error))
+ return error;
+
+ error = xfarray_load(rr->refcount_records, cur, &irec);
+ if (error)
+ return error;
+
+ if (dom == XFS_REFC_DOMAIN_SHARED &&
+ irec.rc_domain == XFS_REFC_DOMAIN_COW) {
+ dom = irec.rc_domain;
+ next_agbno = 0;
+ }
+
+ if (dom != irec.rc_domain)
+ return -EFSCORRUPTED;
+ if (irec.rc_startblock < next_agbno)
+ return -EFSCORRUPTED;
+
+ next_agbno = irec.rc_startblock + irec.rc_blockcount;
+ }
+
+ return error;
+}
+
+#define RRM_NEXT(r) ((r).startblock + (r).blockcount)
+/*
+ * Find the next block where the refcount changes, given the next rmap we
+ * looked at and the ones we're already tracking.
+ */
+static inline int
+xrep_refc_next_edge(
+ struct xfarray *rmap_bag,
+ struct xrep_refc_rmap *next_rrm,
+ bool next_valid,
+ xfs_agblock_t *nbnop)
+{
+ struct xrep_refc_rmap rrm;
+ xfarray_idx_t array_cur = XFARRAY_CURSOR_INIT;
+ xfs_agblock_t nbno = NULLAGBLOCK;
+ int error;
+
+ if (next_valid)
+ nbno = next_rrm->startblock;
+
+ while ((error = xfarray_iter(rmap_bag, &array_cur, &rrm)) == 1)
+ nbno = min_t(xfs_agblock_t, nbno, RRM_NEXT(rrm));
+
+ if (error)
+ return error;
+
+ /*
+ * We should have found /something/ because either next_rrm is the next
+ * interesting rmap to look at after emitting this refcount extent, or
+ * there are other rmaps in rmap_bag contributing to the current
+ * sharing count. But if something is seriously wrong, bail out.
+ */
+ if (nbno == NULLAGBLOCK)
+ return -EFSCORRUPTED;
+
+ *nbnop = nbno;
+ return 0;
+}
+
+/*
+ * Walk forward through the rmap btree to collect all rmaps starting at
+ * @bno in @rmap_bag. These represent the file(s) that share ownership of
+ * the current block. Upon return, the rmap cursor points to the last record
+ * satisfying the startblock constraint.
+ */
+static int
+xrep_refc_push_rmaps_at(
+ struct xrep_refc *rr,
+ struct xfarray *rmap_bag,
+ xfs_agblock_t bno,
+ struct xrep_refc_rmap *rrm,
+ bool *have,
+ uint64_t *stack_sz)
+{
+ struct xfs_scrub *sc = rr->sc;
+ int have_gt;
+ int error;
+
+ while (*have && rrm->startblock == bno) {
+ error = xfarray_store_anywhere(rmap_bag, rrm);
+ if (error)
+ return error;
+ (*stack_sz)++;
+ error = xrep_refc_walk_rmaps(rr, rrm, have);
+ if (error)
+ return error;
+ }
+
+ error = xfs_btree_decrement(sc->sa.rmap_cur, 0, &have_gt);
+ if (error)
+ return error;
+ if (XFS_IS_CORRUPT(sc->mp, !have_gt))
+ return -EFSCORRUPTED;
+
+ return 0;
+}
+
+/* Iterate all the rmap records to generate reference count data. */
+STATIC int
+xrep_refc_find_refcounts(
+ struct xrep_refc *rr)
+{
+ struct xrep_refc_rmap rrm;
+ struct xfs_scrub *sc = rr->sc;
+ struct xfarray *rmap_bag;
+ char *descr;
+ uint64_t old_stack_sz;
+ uint64_t stack_sz = 0;
+ xfs_agblock_t sbno;
+ xfs_agblock_t cbno;
+ xfs_agblock_t nbno;
+ bool have;
+ int error;
+
+ xrep_ag_btcur_init(sc, &sc->sa);
+
+ /*
+ * Set up a sparse array to store all the rmap records that we're
+ * tracking to generate a reference count record. If this exceeds
+ * MAXREFCOUNT, we clamp rc_refcount.
+ */
+ descr = xchk_xfile_ag_descr(sc, "rmap record bag");
+ error = xfarray_create(descr, 0, sizeof(struct xrep_refc_rmap),
+ &rmap_bag);
+ kfree(descr);
+ if (error)
+ goto out_cur;
+
+ /* Start the rmapbt cursor to the left of all records. */
+ error = xfs_btree_goto_left_edge(sc->sa.rmap_cur);
+ if (error)
+ goto out_bag;
+
+ /* Process reverse mappings into refcount data. */
+ while (xfs_btree_has_more_records(sc->sa.rmap_cur)) {
+ /* Push all rmaps with pblk == sbno onto the stack */
+ error = xrep_refc_walk_rmaps(rr, &rrm, &have);
+ if (error)
+ goto out_bag;
+ if (!have)
+ break;
+ sbno = cbno = rrm.startblock;
+ error = xrep_refc_push_rmaps_at(rr, rmap_bag, sbno,
+ &rrm, &have, &stack_sz);
+ if (error)
+ goto out_bag;
+
+ /* Set nbno to the bno of the next refcount change */
+ error = xrep_refc_next_edge(rmap_bag, &rrm, have, &nbno);
+ if (error)
+ goto out_bag;
+
+ ASSERT(nbno > sbno);
+ old_stack_sz = stack_sz;
+
+ /* While stack isn't empty... */
+ while (stack_sz) {
+ xfarray_idx_t array_cur = XFARRAY_CURSOR_INIT;
+
+ /* Pop all rmaps that end at nbno */
+ while ((error = xfarray_iter(rmap_bag, &array_cur,
+ &rrm)) == 1) {
+ if (RRM_NEXT(rrm) != nbno)
+ continue;
+ error = xfarray_unset(rmap_bag, array_cur - 1);
+ if (error)
+ goto out_bag;
+ stack_sz--;
+ }
+ if (error)
+ goto out_bag;
+
+ /* Push array items that start at nbno */
+ error = xrep_refc_walk_rmaps(rr, &rrm, &have);
+ if (error)
+ goto out_bag;
+ if (have) {
+ error = xrep_refc_push_rmaps_at(rr, rmap_bag,
+ nbno, &rrm, &have, &stack_sz);
+ if (error)
+ goto out_bag;
+ }
+
+ /* Emit refcount if necessary */
+ ASSERT(nbno > cbno);
+ if (stack_sz != old_stack_sz) {
+ if (old_stack_sz > 1) {
+ error = xrep_refc_stash(rr,
+ XFS_REFC_DOMAIN_SHARED,
+ cbno, nbno - cbno,
+ old_stack_sz);
+ if (error)
+ goto out_bag;
+ }
+ cbno = nbno;
+ }
+
+ /* Stack empty, go find the next rmap */
+ if (stack_sz == 0)
+ break;
+ old_stack_sz = stack_sz;
+ sbno = nbno;
+
+ /* Set nbno to the bno of the next refcount change */
+ error = xrep_refc_next_edge(rmap_bag, &rrm, have,
+ &nbno);
+ if (error)
+ goto out_bag;
+
+ ASSERT(nbno > sbno);
+ }
+ }
+
+ ASSERT(stack_sz == 0);
+out_bag:
+ xfarray_destroy(rmap_bag);
+out_cur:
+ xchk_ag_btcur_free(&sc->sa);
+ return error;
+}
+#undef RRM_NEXT
+
+/* Retrieve refcountbt data for bulk load. */
+STATIC int
+xrep_refc_get_records(
+ struct xfs_btree_cur *cur,
+ unsigned int idx,
+ struct xfs_btree_block *block,
+ unsigned int nr_wanted,
+ void *priv)
+{
+ struct xfs_refcount_irec *irec = &cur->bc_rec.rc;
+ struct xrep_refc *rr = priv;
+ union xfs_btree_rec *block_rec;
+ unsigned int loaded;
+ int error;
+
+ for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
+ error = xfarray_load(rr->refcount_records, rr->array_cur++,
+ irec);
+ if (error)
+ return error;
+
+ block_rec = xfs_btree_rec_addr(cur, idx, block);
+ cur->bc_ops->init_rec_from_cur(cur, block_rec);
+ }
+
+ return loaded;
+}
+
+/* Feed one of the new btree blocks to the bulk loader. */
+STATIC int
+xrep_refc_claim_block(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr,
+ void *priv)
+{
+ struct xrep_refc *rr = priv;
+
+ return xrep_newbt_claim_block(cur, &rr->new_btree, ptr);
+}
+
+/* Update the AGF counters. */
+STATIC int
+xrep_refc_reset_counters(
+ struct xrep_refc *rr)
+{
+ struct xfs_scrub *sc = rr->sc;
+ struct xfs_perag *pag = sc->sa.pag;
+
+ /*
+ * After we commit the new btree to disk, it is possible that the
+ * process to reap the old btree blocks will race with the AIL trying
+ * to checkpoint the old btree blocks into the filesystem. If the new
+ * tree is shorter than the old one, the refcountbt write verifier will
+ * fail and the AIL will shut down the filesystem.
+ *
+ * To avoid this, save the old incore btree height values as the alt
+ * height values before re-initializing the perag info from the updated
+ * AGF to capture all the new values.
+ */
+ pag->pagf_repair_refcount_level = pag->pagf_refcount_level;
+
+ /* Reinitialize with the values we just logged. */
+ return xrep_reinit_pagf(sc);
+}
+
+/*
+ * Use the collected refcount information to stage a new refcount btree. If
+ * this is successful we'll return with the new btree root information logged
+ * to the repair transaction but not yet committed.
+ */
+STATIC int
+xrep_refc_build_new_tree(
+ struct xrep_refc *rr)
+{
+ struct xfs_scrub *sc = rr->sc;
+ struct xfs_btree_cur *refc_cur;
+ struct xfs_perag *pag = sc->sa.pag;
+ xfs_fsblock_t fsbno;
+ int error;
+
+ error = xrep_refc_sort_records(rr);
+ if (error)
+ return error;
+
+ /*
+ * Prepare to construct the new btree by reserving disk space for the
+ * new btree and setting up all the accounting information we'll need
+ * to root the new btree while it's under construction and before we
+ * attach it to the AG header.
+ */
+ fsbno = XFS_AGB_TO_FSB(sc->mp, pag->pag_agno, xfs_refc_block(sc->mp));
+ xrep_newbt_init_ag(&rr->new_btree, sc, &XFS_RMAP_OINFO_REFC, fsbno,
+ XFS_AG_RESV_METADATA);
+ rr->new_btree.bload.get_records = xrep_refc_get_records;
+ rr->new_btree.bload.claim_block = xrep_refc_claim_block;
+
+ /* Compute how many blocks we'll need. */
+ refc_cur = xfs_refcountbt_stage_cursor(sc->mp, &rr->new_btree.afake,
+ pag);
+ error = xfs_btree_bload_compute_geometry(refc_cur,
+ &rr->new_btree.bload,
+ xfarray_length(rr->refcount_records));
+ if (error)
+ goto err_cur;
+
+ /* Last chance to abort before we start committing fixes. */
+ if (xchk_should_terminate(sc, &error))
+ goto err_cur;
+
+ /* Reserve the space we'll need for the new btree. */
+ error = xrep_newbt_alloc_blocks(&rr->new_btree,
+ rr->new_btree.bload.nr_blocks);
+ if (error)
+ goto err_cur;
+
+ /*
+ * Due to btree slack factors, it's possible for a new btree to be one
+ * level taller than the old btree. Update the incore btree height so
+ * that we don't trip the verifiers when writing the new btree blocks
+ * to disk.
+ */
+ pag->pagf_repair_refcount_level = rr->new_btree.bload.btree_height;
+
+ /* Add all observed refcount records. */
+ rr->array_cur = XFARRAY_CURSOR_INIT;
+ error = xfs_btree_bload(refc_cur, &rr->new_btree.bload, rr);
+ if (error)
+ goto err_level;
+
+ /*
+ * Install the new btree in the AG header. After this point the old
+ * btree is no longer accessible and the new tree is live.
+ */
+ xfs_refcountbt_commit_staged_btree(refc_cur, sc->tp, sc->sa.agf_bp);
+ xfs_btree_del_cursor(refc_cur, 0);
+
+ /* Reset the AGF counters now that we've changed the btree shape. */
+ error = xrep_refc_reset_counters(rr);
+ if (error)
+ goto err_newbt;
+
+ /* Dispose of any unused blocks and the accounting information. */
+ error = xrep_newbt_commit(&rr->new_btree);
+ if (error)
+ return error;
+
+ return xrep_roll_ag_trans(sc);
+
+err_level:
+ pag->pagf_repair_refcount_level = 0;
+err_cur:
+ xfs_btree_del_cursor(refc_cur, error);
+err_newbt:
+ xrep_newbt_cancel(&rr->new_btree);
+ return error;
+}
+
+/*
+ * Now that we've logged the roots of the new btrees, invalidate all of the
+ * old blocks and free them.
+ */
+STATIC int
+xrep_refc_remove_old_tree(
+ struct xrep_refc *rr)
+{
+ struct xfs_scrub *sc = rr->sc;
+ struct xfs_perag *pag = sc->sa.pag;
+ int error;
+
+ /* Free the old refcountbt blocks if they're not in use. */
+ error = xrep_reap_agblocks(sc, &rr->old_refcountbt_blocks,
+ &XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA);
+ if (error)
+ return error;
+
+ /*
+ * Now that we've zapped all the old refcountbt blocks we can turn off
+ * the alternate height mechanism and reset the per-AG space
+ * reservations.
+ */
+ pag->pagf_repair_refcount_level = 0;
+ sc->flags |= XREP_RESET_PERAG_RESV;
+ return 0;
+}
+
+/* Rebuild the refcount btree. */
+int
+xrep_refcountbt(
+ struct xfs_scrub *sc)
+{
+ struct xrep_refc *rr;
+ struct xfs_mount *mp = sc->mp;
+ char *descr;
+ int error;
+
+ /* We require the rmapbt to rebuild anything. */
+ if (!xfs_has_rmapbt(mp))
+ return -EOPNOTSUPP;
+
+ rr = kzalloc(sizeof(struct xrep_refc), XCHK_GFP_FLAGS);
+ if (!rr)
+ return -ENOMEM;
+ rr->sc = sc;
+
+ /* Set up enough storage to handle one refcount record per block. */
+ descr = xchk_xfile_ag_descr(sc, "reference count records");
+ error = xfarray_create(descr, mp->m_sb.sb_agblocks,
+ sizeof(struct xfs_refcount_irec),
+ &rr->refcount_records);
+ kfree(descr);
+ if (error)
+ goto out_rr;
+
+ /* Collect all reference counts. */
+ xagb_bitmap_init(&rr->old_refcountbt_blocks);
+ error = xrep_refc_find_refcounts(rr);
+ if (error)
+ goto out_bitmap;
+
+ /* Rebuild the refcount information. */
+ error = xrep_refc_build_new_tree(rr);
+ if (error)
+ goto out_bitmap;
+
+ /* Kill the old tree. */
+ error = xrep_refc_remove_old_tree(rr);
+ if (error)
+ goto out_bitmap;
+
+out_bitmap:
+ xagb_bitmap_destroy(&rr->old_refcountbt_blocks);
+ xfarray_destroy(rr->refcount_records);
+out_rr:
+ kfree(rr);
+ return error;
+}
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 1b8b5439f2d7..745d5b8f405a 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -27,6 +27,9 @@
#include "xfs_quota.h"
#include "xfs_qm.h"
#include "xfs_defer.h"
+#include "xfs_errortag.h"
+#include "xfs_error.h"
+#include "xfs_reflink.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -176,6 +179,16 @@ xrep_roll_ag_trans(
return 0;
}
+/* Roll the scrub transaction, holding the primary metadata locked. */
+int
+xrep_roll_trans(
+ struct xfs_scrub *sc)
+{
+ if (!sc->ip)
+ return xrep_roll_ag_trans(sc);
+ return xfs_trans_roll_inode(&sc->tp, sc->ip);
+}
+
/* Finish all deferred work attached to the repair transaction. */
int
xrep_defer_finish(
@@ -673,6 +686,7 @@ xrep_find_ag_btree_roots(
return error;
}
+#ifdef CONFIG_XFS_QUOTA
/* Force a quotacheck the next time we mount. */
void
xrep_force_quotacheck(
@@ -699,10 +713,10 @@ xrep_force_quotacheck(
*
* This function ensures that the appropriate dquots are attached to an inode.
* We cannot allow the dquot code to allocate an on-disk dquot block here
- * because we're already in transaction context with the inode locked. The
- * on-disk dquot should already exist anyway. If the quota code signals
- * corruption or missing quota information, schedule quotacheck, which will
- * repair corruptions in the quota metadata.
+ * because we're already in transaction context. The on-disk dquot should
+ * already exist anyway. If the quota code signals corruption or missing quota
+ * information, schedule quotacheck, which will repair corruptions in the quota
+ * metadata.
*/
int
xrep_ino_dqattach(
@@ -710,7 +724,10 @@ xrep_ino_dqattach(
{
int error;
- error = xfs_qm_dqattach_locked(sc->ip, false);
+ ASSERT(sc->tp != NULL);
+ ASSERT(sc->ip != NULL);
+
+ error = xfs_qm_dqattach(sc->ip);
switch (error) {
case -EFSBADCRC:
case -EFSCORRUPTED:
@@ -734,3 +751,367 @@ xrep_ino_dqattach(
return error;
}
+#endif /* CONFIG_XFS_QUOTA */
+
+/*
+ * Ensure that the inode being repaired is ready to handle a certain number of
+ * extents, or return EFSCORRUPTED. Caller must hold the ILOCK of the inode
+ * being repaired and have joined it to the scrub transaction.
+ */
+int
+xrep_ino_ensure_extent_count(
+ struct xfs_scrub *sc,
+ int whichfork,
+ xfs_extnum_t nextents)
+{
+ xfs_extnum_t max_extents;
+ bool inode_has_nrext64;
+
+ inode_has_nrext64 = xfs_inode_has_large_extent_counts(sc->ip);
+ max_extents = xfs_iext_max_nextents(inode_has_nrext64, whichfork);
+ if (nextents <= max_extents)
+ return 0;
+ if (inode_has_nrext64)
+ return -EFSCORRUPTED;
+ if (!xfs_has_large_extent_counts(sc->mp))
+ return -EFSCORRUPTED;
+
+ max_extents = xfs_iext_max_nextents(true, whichfork);
+ if (nextents > max_extents)
+ return -EFSCORRUPTED;
+
+ sc->ip->i_diflags2 |= XFS_DIFLAG2_NREXT64;
+ xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE);
+ return 0;
+}
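+
+/*
+ * In summary (illustrative restatement of the ladder above): if
+ * @nextents already fits, do nothing; if the inode already uses large
+ * extent counters or the filesystem cannot support them, fail with
+ * -EFSCORRUPTED; otherwise upgrade the inode to NREXT64, provided the
+ * count fits in the larger ondisk field.
+ */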
+
+/*
+ * Initialize all the btree cursors for an AG repair except for the btree that
+ * we're rebuilding.
+ */
+void
+xrep_ag_btcur_init(
+ struct xfs_scrub *sc,
+ struct xchk_ag *sa)
+{
+ struct xfs_mount *mp = sc->mp;
+
+ /* Set up a bnobt cursor for cross-referencing. */
+ if (sc->sm->sm_type != XFS_SCRUB_TYPE_BNOBT &&
+ sc->sm->sm_type != XFS_SCRUB_TYPE_CNTBT) {
+ sa->bno_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp,
+ sc->sa.pag, XFS_BTNUM_BNO);
+ sa->cnt_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp,
+ sc->sa.pag, XFS_BTNUM_CNT);
+ }
+
+ /* Set up an inobt cursor for cross-referencing. */
+ if (sc->sm->sm_type != XFS_SCRUB_TYPE_INOBT &&
+ sc->sm->sm_type != XFS_SCRUB_TYPE_FINOBT) {
+ sa->ino_cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp,
+ sa->agi_bp, XFS_BTNUM_INO);
+ if (xfs_has_finobt(mp))
+ sa->fino_cur = xfs_inobt_init_cursor(sc->sa.pag,
+ sc->tp, sa->agi_bp, XFS_BTNUM_FINO);
+ }
+
+ /* Set up a rmapbt cursor for cross-referencing. */
+ if (sc->sm->sm_type != XFS_SCRUB_TYPE_RMAPBT &&
+ xfs_has_rmapbt(mp))
+ sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, sa->agf_bp,
+ sc->sa.pag);
+
+ /* Set up a refcountbt cursor for cross-referencing. */
+ if (sc->sm->sm_type != XFS_SCRUB_TYPE_REFCNTBT &&
+ xfs_has_reflink(mp))
+ sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp,
+ sa->agf_bp, sc->sa.pag);
+}
+
+/*
+ * Reinitialize the in-core AG state after a repair by rereading the AGF
+ * buffer. We had better get the same AGF buffer as the one that's attached
+ * to the scrub context.
+ */
+int
+xrep_reinit_pagf(
+ struct xfs_scrub *sc)
+{
+ struct xfs_perag *pag = sc->sa.pag;
+ struct xfs_buf *bp;
+ int error;
+
+ ASSERT(pag);
+ ASSERT(xfs_perag_initialised_agf(pag));
+
+ clear_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate);
+ error = xfs_alloc_read_agf(pag, sc->tp, 0, &bp);
+ if (error)
+ return error;
+
+ if (bp != sc->sa.agf_bp) {
+ ASSERT(bp == sc->sa.agf_bp);
+ return -EFSCORRUPTED;
+ }
+
+ return 0;
+}
+
+/*
+ * Reinitialize the in-core AG state after a repair by rereading the AGI
+ * buffer. We had better get the same AGI buffer as the one that's attached
+ * to the scrub context.
+ */
+int
+xrep_reinit_pagi(
+ struct xfs_scrub *sc)
+{
+ struct xfs_perag *pag = sc->sa.pag;
+ struct xfs_buf *bp;
+ int error;
+
+ ASSERT(pag);
+ ASSERT(xfs_perag_initialised_agi(pag));
+
+ clear_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate);
+ error = xfs_ialloc_read_agi(pag, sc->tp, &bp);
+ if (error)
+ return error;
+
+ if (bp != sc->sa.agi_bp) {
+ ASSERT(bp == sc->sa.agi_bp);
+ return -EFSCORRUPTED;
+ }
+
+ return 0;
+}
+
+/*
+ * Given an active reference to a perag structure, load AG headers and cursors.
+ * This should only be called to scan an AG while repairing file-based metadata.
+ */
+int
+xrep_ag_init(
+ struct xfs_scrub *sc,
+ struct xfs_perag *pag,
+ struct xchk_ag *sa)
+{
+ int error;
+
+ ASSERT(!sa->pag);
+
+ error = xfs_ialloc_read_agi(pag, sc->tp, &sa->agi_bp);
+ if (error)
+ return error;
+
+ error = xfs_alloc_read_agf(pag, sc->tp, 0, &sa->agf_bp);
+ if (error)
+ return error;
+
+ /* Grab our own passive reference from the caller's ref. */
+ sa->pag = xfs_perag_hold(pag);
+ xrep_ag_btcur_init(sc, sa);
+ return 0;
+}
+
+/* Reinitialize the per-AG block reservation for the AG we just fixed. */
+int
+xrep_reset_perag_resv(
+ struct xfs_scrub *sc)
+{
+ int error;
+
+ if (!(sc->flags & XREP_RESET_PERAG_RESV))
+ return 0;
+
+ ASSERT(sc->sa.pag != NULL);
+ ASSERT(sc->ops->type == ST_PERAG);
+ ASSERT(sc->tp);
+
+ sc->flags &= ~XREP_RESET_PERAG_RESV;
+ error = xfs_ag_resv_free(sc->sa.pag);
+ if (error)
+ goto out;
+ error = xfs_ag_resv_init(sc->sa.pag, sc->tp);
+ if (error == -ENOSPC) {
+ xfs_err(sc->mp,
+"Insufficient free space to reset per-AG reservation for AG %u after repair.",
+ sc->sa.pag->pag_agno);
+ error = 0;
+ }
+
+out:
+ return error;
+}
+
+/* Decide if we are going to call the repair function for a scrub type. */
+bool
+xrep_will_attempt(
+ struct xfs_scrub *sc)
+{
+ /* Userspace asked us to rebuild the structure regardless. */
+ if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD)
+ return true;
+
+ /* Let debug users force us into the repair routines. */
+ if (XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR))
+ return true;
+
+ /* Metadata is corrupt or failed cross-referencing. */
+ if (xchk_needs_repair(sc->sm))
+ return true;
+
+ return false;
+}
+
+/* Try to fix some part of a metadata inode by calling another scrubber. */
+STATIC int
+xrep_metadata_inode_subtype(
+ struct xfs_scrub *sc,
+ unsigned int scrub_type)
+{
+ __u32 smtype = sc->sm->sm_type;
+ __u32 smflags = sc->sm->sm_flags;
+ unsigned int sick_mask = sc->sick_mask;
+ int error;
+
+ /*
+ * Let's see if the inode needs repair. We're going to open-code calls
+ * to the scrub and repair functions so that we can hang on to the
+ * resources that we already acquired instead of using the standard
+ * setup/teardown routines.
+ */
+ sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
+ sc->sm->sm_type = scrub_type;
+
+ switch (scrub_type) {
+ case XFS_SCRUB_TYPE_INODE:
+ error = xchk_inode(sc);
+ break;
+ case XFS_SCRUB_TYPE_BMBTD:
+ error = xchk_bmap_data(sc);
+ break;
+ case XFS_SCRUB_TYPE_BMBTA:
+ error = xchk_bmap_attr(sc);
+ break;
+ default:
+ ASSERT(0);
+ error = -EFSCORRUPTED;
+ }
+ if (error)
+ goto out;
+
+ if (!xrep_will_attempt(sc))
+ goto out;
+
+ /*
+ * Repair some part of the inode. This will potentially join the inode
+ * to the transaction.
+ */
+ switch (scrub_type) {
+ case XFS_SCRUB_TYPE_INODE:
+ error = xrep_inode(sc);
+ break;
+ case XFS_SCRUB_TYPE_BMBTD:
+ error = xrep_bmap(sc, XFS_DATA_FORK, false);
+ break;
+ case XFS_SCRUB_TYPE_BMBTA:
+ error = xrep_bmap(sc, XFS_ATTR_FORK, false);
+ break;
+ }
+ if (error)
+ goto out;
+
+ /*
+ * Finish all deferred intent items and then roll the transaction so
+ * that the inode will not be joined to the transaction when we exit
+ * the function.
+ */
+ error = xfs_defer_finish(&sc->tp);
+ if (error)
+ goto out;
+ error = xfs_trans_roll(&sc->tp);
+ if (error)
+ goto out;
+
+ /*
+ * Clear the corruption flags and re-check the metadata that we just
+ * repaired.
+ */
+ sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
+
+ switch (scrub_type) {
+ case XFS_SCRUB_TYPE_INODE:
+ error = xchk_inode(sc);
+ break;
+ case XFS_SCRUB_TYPE_BMBTD:
+ error = xchk_bmap_data(sc);
+ break;
+ case XFS_SCRUB_TYPE_BMBTA:
+ error = xchk_bmap_attr(sc);
+ break;
+ }
+ if (error)
+ goto out;
+
+ /* If corruption persists, the repair has failed. */
+ if (xchk_needs_repair(sc->sm)) {
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+out:
+ sc->sick_mask = sick_mask;
+ sc->sm->sm_type = smtype;
+ sc->sm->sm_flags = smflags;
+ return error;
+}
+
+/*
+ * Repair the ondisk forks of a metadata inode. The caller must ensure that
+ * sc->ip points to the metadata inode and the ILOCK is held on that inode.
+ * The inode must not be joined to the transaction before the call, and will
+ * not be afterwards.
+ */
+int
+xrep_metadata_inode_forks(
+ struct xfs_scrub *sc)
+{
+ bool dirty = false;
+ int error;
+
+ /* Repair the inode record and the data fork. */
+ error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_INODE);
+ if (error)
+ return error;
+
+ error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTD);
+ if (error)
+ return error;
+
+ /* Make sure the attr fork looks ok before we delete it. */
+ error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTA);
+ if (error)
+ return error;
+
+ /* Clear the reflink flag since metadata never shares. */
+ if (xfs_is_reflink_inode(sc->ip)) {
+ dirty = true;
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+ error = xfs_reflink_clear_inode_flag(sc->ip, &sc->tp);
+ if (error)
+ return error;
+ }
+
+ /*
+ * If we modified the inode, roll the transaction but don't rejoin the
+ * inode to the new transaction because xrep_bmap_data can do that.
+ */
+ if (dirty) {
+ error = xfs_trans_roll(&sc->tp);
+ if (error)
+ return error;
+ dirty = false;
+ }
+
+ return 0;
+}
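A minimal caller sketch of the contract documented above (the helper name is hypothetical; the real call site is xrep_rtbitmap() later in this patch): the inode arrives ILOCKed and unjoined, and stays unjoined on return, so a caller joins it explicitly if it has more repair work to do.

	STATIC int
	xrep_example_metafile(
		struct xfs_scrub	*sc)
	{
		int			error;

		/* sc->ip is ILOCKed and not joined to sc->tp here. */
		error = xrep_metadata_inode_forks(sc);
		if (error)
			return error;

		/* Still unjoined on return; join explicitly to continue. */
		xfs_trans_ijoin(sc->tp, sc->ip, 0);
		return 0;
	}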
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index 60d2a9ae5f2e..17114327e6fa 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -28,15 +28,28 @@ static inline int xrep_notsupported(struct xfs_scrub *sc)
/* Repair helpers */
int xrep_attempt(struct xfs_scrub *sc, struct xchk_stats_run *run);
+bool xrep_will_attempt(struct xfs_scrub *sc);
void xrep_failure(struct xfs_mount *mp);
int xrep_roll_ag_trans(struct xfs_scrub *sc);
+int xrep_roll_trans(struct xfs_scrub *sc);
int xrep_defer_finish(struct xfs_scrub *sc);
bool xrep_ag_has_space(struct xfs_perag *pag, xfs_extlen_t nr_blocks,
enum xfs_ag_resv_type type);
xfs_extlen_t xrep_calc_ag_resblks(struct xfs_scrub *sc);
+static inline int
+xrep_trans_commit(
+ struct xfs_scrub *sc)
+{
+ int error = xfs_trans_commit(sc->tp);
+
+ sc->tp = NULL;
+ return error;
+}
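A usage sketch for the new commit helper, inferred from the helper body: clearing sc->tp after the commit means teardown code will not try to cancel a transaction that no longer exists.

	error = xrep_trans_commit(sc);
	if (error)
		return error;
	/* sc->tp is NULL from here on. */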
+
struct xbitmap;
struct xagb_bitmap;
+struct xfsb_bitmap;
int xrep_fix_freelist(struct xfs_scrub *sc, bool can_shrink);
@@ -57,8 +70,35 @@ struct xrep_find_ag_btree {
int xrep_find_ag_btree_roots(struct xfs_scrub *sc, struct xfs_buf *agf_bp,
struct xrep_find_ag_btree *btree_info, struct xfs_buf *agfl_bp);
+
+#ifdef CONFIG_XFS_QUOTA
void xrep_force_quotacheck(struct xfs_scrub *sc, xfs_dqtype_t type);
int xrep_ino_dqattach(struct xfs_scrub *sc);
+#else
+# define xrep_force_quotacheck(sc, type) ((void)0)
+# define xrep_ino_dqattach(sc) (0)
+#endif /* CONFIG_XFS_QUOTA */
+
+int xrep_ino_ensure_extent_count(struct xfs_scrub *sc, int whichfork,
+ xfs_extnum_t nextents);
+int xrep_reset_perag_resv(struct xfs_scrub *sc);
+int xrep_bmap(struct xfs_scrub *sc, int whichfork, bool allow_unwritten);
+int xrep_metadata_inode_forks(struct xfs_scrub *sc);
+
+/* Repair setup functions */
+int xrep_setup_ag_allocbt(struct xfs_scrub *sc);
+
+struct xfs_imap;
+int xrep_setup_inode(struct xfs_scrub *sc, const struct xfs_imap *imap);
+
+void xrep_ag_btcur_init(struct xfs_scrub *sc, struct xchk_ag *sa);
+int xrep_ag_init(struct xfs_scrub *sc, struct xfs_perag *pag,
+ struct xchk_ag *sa);
+
+/* Metadata revalidators */
+
+int xrep_revalidate_allocbt(struct xfs_scrub *sc);
+int xrep_revalidate_iallocbt(struct xfs_scrub *sc);
/* Metadata repairers */
@@ -67,9 +107,34 @@ int xrep_superblock(struct xfs_scrub *sc);
int xrep_agf(struct xfs_scrub *sc);
int xrep_agfl(struct xfs_scrub *sc);
int xrep_agi(struct xfs_scrub *sc);
+int xrep_allocbt(struct xfs_scrub *sc);
+int xrep_iallocbt(struct xfs_scrub *sc);
+int xrep_refcountbt(struct xfs_scrub *sc);
+int xrep_inode(struct xfs_scrub *sc);
+int xrep_bmap_data(struct xfs_scrub *sc);
+int xrep_bmap_attr(struct xfs_scrub *sc);
+int xrep_bmap_cow(struct xfs_scrub *sc);
+
+#ifdef CONFIG_XFS_RT
+int xrep_rtbitmap(struct xfs_scrub *sc);
+#else
+# define xrep_rtbitmap xrep_notsupported
+#endif /* CONFIG_XFS_RT */
+
+#ifdef CONFIG_XFS_QUOTA
+int xrep_quota(struct xfs_scrub *sc);
+#else
+# define xrep_quota xrep_notsupported
+#endif /* CONFIG_XFS_QUOTA */
+
+int xrep_reinit_pagf(struct xfs_scrub *sc);
+int xrep_reinit_pagi(struct xfs_scrub *sc);
#else
+#define xrep_ino_dqattach(sc) (0)
+#define xrep_will_attempt(sc) (false)
+
static inline int
xrep_attempt(
struct xfs_scrub *sc,
@@ -87,11 +152,45 @@ xrep_calc_ag_resblks(
return 0;
}
+static inline int
+xrep_reset_perag_resv(
+ struct xfs_scrub *sc)
+{
+ if (!(sc->flags & XREP_RESET_PERAG_RESV))
+ return 0;
+
+ ASSERT(0);
+ return -EOPNOTSUPP;
+}
+
+/* repair setup functions for no-repair */
+static inline int
+xrep_setup_nothing(
+ struct xfs_scrub *sc)
+{
+ return 0;
+}
+#define xrep_setup_ag_allocbt xrep_setup_nothing
+
+#define xrep_setup_inode(sc, imap) ((void)0)
+
+#define xrep_revalidate_allocbt (NULL)
+#define xrep_revalidate_iallocbt (NULL)
+
#define xrep_probe xrep_notsupported
#define xrep_superblock xrep_notsupported
#define xrep_agf xrep_notsupported
#define xrep_agfl xrep_notsupported
#define xrep_agi xrep_notsupported
+#define xrep_allocbt xrep_notsupported
+#define xrep_iallocbt xrep_notsupported
+#define xrep_refcountbt xrep_notsupported
+#define xrep_inode xrep_notsupported
+#define xrep_bmap_data xrep_notsupported
+#define xrep_bmap_attr xrep_notsupported
+#define xrep_bmap_cow xrep_notsupported
+#define xrep_rtbitmap xrep_notsupported
+#define xrep_quota xrep_notsupported
#endif /* CONFIG_XFS_ONLINE_REPAIR */
diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c
index d29a26ecddd6..c99d1714f283 100644
--- a/fs/xfs/scrub/rmap.c
+++ b/fs/xfs/scrub/rmap.c
@@ -24,6 +24,7 @@
#include "scrub/common.h"
#include "scrub/btree.h"
#include "scrub/bitmap.h"
+#include "scrub/agb_bitmap.h"
/*
* Set us up to scrub reverse mapping btrees.
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c
index 41a1d89ae8e6..441ca9977652 100644
--- a/fs/xfs/scrub/rtbitmap.c
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -14,17 +14,33 @@
#include "xfs_rtbitmap.h"
#include "xfs_inode.h"
#include "xfs_bmap.h"
+#include "xfs_bit.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
+#include "scrub/repair.h"
+#include "scrub/rtbitmap.h"
/* Set us up with the realtime metadata locked. */
int
xchk_setup_rtbitmap(
struct xfs_scrub *sc)
{
+ struct xfs_mount *mp = sc->mp;
+ struct xchk_rtbitmap *rtb;
int error;
- error = xchk_trans_alloc(sc, 0);
+ rtb = kzalloc(sizeof(struct xchk_rtbitmap), XCHK_GFP_FLAGS);
+ if (!rtb)
+ return -ENOMEM;
+ sc->buf = rtb;
+
+ if (xchk_could_repair(sc)) {
+ error = xrep_setup_rtbitmap(sc, rtb);
+ if (error)
+ return error;
+ }
+
+ error = xchk_trans_alloc(sc, rtb->resblks);
if (error)
return error;
@@ -32,7 +48,22 @@ xchk_setup_rtbitmap(
if (error)
return error;
+ error = xchk_ino_dqattach(sc);
+ if (error)
+ return error;
+
xchk_ilock(sc, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP);
+
+ /*
+ * Now that we've locked the rtbitmap, we can't race with growfsrt
+ * trying to expand the bitmap or change the size of the rt volume.
+ * Hence it is safe to compute and check the geometry values.
+ */
+ if (mp->m_sb.sb_rblocks) {
+ rtb->rextents = xfs_rtb_to_rtx(mp, mp->m_sb.sb_rblocks);
+ rtb->rextslog = xfs_compute_rextslog(rtb->rextents);
+ rtb->rbmblocks = xfs_rtbitmap_blockcount(mp, rtb->rextents);
+ }
return 0;
}
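A worked example of the geometry computed above, using hypothetical mkfs parameters (not taken from this patch) and assuming xfs_compute_rextslog() returns the high bit of the extent count: a 1 GiB rt device with 4096-byte blocks and one block per rt extent gives

	rextents  = xfs_rtb_to_rtx(mp, 262144)          = 262144
	rextslog  = xfs_highbit64(262144)               = 18
	rbmblocks = howmany(262144 bits, 4096 * 8 bits) = 8

so xchk_rtbitmap() below expects sb_rbmblocks == 8 and a bitmap file of at least 8 fsblocks (32 KiB).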
@@ -63,21 +94,30 @@ STATIC int
xchk_rtbitmap_check_extents(
struct xfs_scrub *sc)
{
- struct xfs_mount *mp = sc->mp;
struct xfs_bmbt_irec map;
- xfs_rtblock_t off;
- int nmap;
+ struct xfs_iext_cursor icur;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_inode *ip = sc->ip;
+ xfs_fileoff_t off = 0;
+ xfs_fileoff_t endoff;
int error = 0;
- for (off = 0; off < mp->m_sb.sb_rbmblocks;) {
+ /* Mappings may not cross or lie beyond EOF. */
+ endoff = XFS_B_TO_FSB(mp, ip->i_disk_size);
+ if (xfs_iext_lookup_extent(ip, &ip->i_df, endoff, &icur, &map)) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, endoff);
+ return 0;
+ }
+
+ while (off < endoff) {
+ int nmap = 1;
+
if (xchk_should_terminate(sc, &error) ||
(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
break;
/* Make sure we have a written extent. */
- nmap = 1;
- error = xfs_bmapi_read(mp->m_rbmip, off,
- mp->m_sb.sb_rbmblocks - off, &map, &nmap,
+ error = xfs_bmapi_read(ip, off, endoff - off, &map, &nmap,
XFS_DATA_FORK);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, off, &error))
break;
@@ -98,12 +138,48 @@ int
xchk_rtbitmap(
struct xfs_scrub *sc)
{
+ struct xfs_mount *mp = sc->mp;
+ struct xchk_rtbitmap *rtb = sc->buf;
int error;
- /* Is the size of the rtbitmap correct? */
- if (sc->mp->m_rbmip->i_disk_size !=
- XFS_FSB_TO_B(sc->mp, sc->mp->m_sb.sb_rbmblocks)) {
- xchk_ino_set_corrupt(sc, sc->mp->m_rbmip->i_ino);
+ /* Is sb_rextents correct? */
+ if (mp->m_sb.sb_rextents != rtb->rextents) {
+ xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino);
+ return 0;
+ }
+
+ /* Is sb_rextslog correct? */
+ if (mp->m_sb.sb_rextslog != rtb->rextslog) {
+ xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino);
+ return 0;
+ }
+
+ /*
+	 * Is sb_rbmblocks large enough to handle the current rt volume? In no
+	 * case can we exceed 2^32 bitmap blocks, since the superblock field is
+	 * a u32.
+ */
+ if (rtb->rbmblocks > U32_MAX) {
+ xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino);
+ return 0;
+ }
+ if (mp->m_sb.sb_rbmblocks != rtb->rbmblocks) {
+ xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino);
+ return 0;
+ }
+
+ /* The bitmap file length must be aligned to an fsblock. */
+ if (mp->m_rbmip->i_disk_size & mp->m_blockmask) {
+ xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino);
+ return 0;
+ }
+
+ /*
+ * Is the bitmap file itself large enough to handle the rt volume?
+ * growfsrt expands the bitmap file before updating sb_rextents, so the
+ * file can be larger than sb_rbmblocks.
+ */
+ if (mp->m_rbmip->i_disk_size < XFS_FSB_TO_B(mp, rtb->rbmblocks)) {
+ xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino);
return 0;
}
@@ -116,12 +192,11 @@ xchk_rtbitmap(
if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
return error;
- error = xfs_rtalloc_query_all(sc->mp, sc->tp, xchk_rtbitmap_rec, sc);
+ error = xfs_rtalloc_query_all(mp, sc->tp, xchk_rtbitmap_rec, sc);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error))
- goto out;
+ return error;
-out:
- return error;
+ return 0;
}
/* xref check that the extent is not free in the rtbitmap */
diff --git a/fs/xfs/scrub/rtbitmap.h b/fs/xfs/scrub/rtbitmap.h
new file mode 100644
index 000000000000..85304ff019e1
--- /dev/null
+++ b/fs/xfs/scrub/rtbitmap.h
@@ -0,0 +1,22 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_RTBITMAP_H__
+#define __XFS_SCRUB_RTBITMAP_H__
+
+struct xchk_rtbitmap {
+ uint64_t rextents;
+ uint64_t rbmblocks;
+ unsigned int rextslog;
+ unsigned int resblks;
+};
+
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+int xrep_setup_rtbitmap(struct xfs_scrub *sc, struct xchk_rtbitmap *rtb);
+#else
+# define xrep_setup_rtbitmap(sc, rtb) (0)
+#endif /* CONFIG_XFS_ONLINE_REPAIR */
+
+#endif /* __XFS_SCRUB_RTBITMAP_H__ */
diff --git a/fs/xfs/scrub/rtbitmap_repair.c b/fs/xfs/scrub/rtbitmap_repair.c
new file mode 100644
index 000000000000..46f5d5f605c9
--- /dev/null
+++ b/fs/xfs/scrub/rtbitmap_repair.c
@@ -0,0 +1,202 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2020-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_btree.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_bit.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_btree.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/xfile.h"
+#include "scrub/rtbitmap.h"
+
+/* Set up to repair the realtime bitmap file metadata. */
+int
+xrep_setup_rtbitmap(
+ struct xfs_scrub *sc,
+ struct xchk_rtbitmap *rtb)
+{
+ struct xfs_mount *mp = sc->mp;
+ unsigned long long blocks = 0;
+
+ /*
+ * Reserve enough blocks to write out a completely new bmbt for a
+ * maximally fragmented bitmap file. We do not hold the rtbitmap
+ * ILOCK yet, so this is entirely speculative.
+ */
+ blocks = xfs_bmbt_calc_size(mp, mp->m_sb.sb_rbmblocks);
+ if (blocks > UINT_MAX)
+ return -EOPNOTSUPP;
+
+ rtb->resblks += blocks;
+ return 0;
+}
+
+/*
+ * Make sure that the given range of the data fork of the realtime file is
+ * mapped to written blocks. The caller must ensure that the inode is joined
+ * to the transaction.
+ */
+STATIC int
+xrep_rtbitmap_data_mappings(
+ struct xfs_scrub *sc,
+ xfs_filblks_t len)
+{
+ struct xfs_bmbt_irec map;
+ xfs_fileoff_t off = 0;
+ int error;
+
+ ASSERT(sc->ip != NULL);
+
+ while (off < len) {
+ int nmaps = 1;
+
+ /*
+		 * If a real extent already maps this block, then we're in
+		 * good shape.
+ */
+ error = xfs_bmapi_read(sc->ip, off, len - off, &map, &nmaps,
+ XFS_DATA_FORK);
+ if (error)
+ return error;
+ if (nmaps == 0) {
+ ASSERT(nmaps != 0);
+ return -EFSCORRUPTED;
+ }
+
+ /*
+		 * Written extents are fine. Holes are left alone because we
+		 * lack the free space information needed to fill them.
+ */
+ if (xfs_bmap_is_written_extent(&map) ||
+ map.br_startblock == HOLESTARTBLOCK) {
+ off = map.br_startoff + map.br_blockcount;
+ continue;
+ }
+
+ /*
+		 * If we find a delalloc reservation then something is seriously
+		 * wrong. Bail out.
+ */
+ if (map.br_startblock == DELAYSTARTBLOCK)
+ return -EFSCORRUPTED;
+
+ /* Make sure we're really converting an unwritten extent. */
+ if (map.br_state != XFS_EXT_UNWRITTEN) {
+ ASSERT(map.br_state == XFS_EXT_UNWRITTEN);
+ return -EFSCORRUPTED;
+ }
+
+ /* Make sure this block has a real zeroed extent mapped. */
+ nmaps = 1;
+ error = xfs_bmapi_write(sc->tp, sc->ip, map.br_startoff,
+ map.br_blockcount,
+ XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO,
+ 0, &map, &nmaps);
+ if (error)
+ return error;
+ if (nmaps != 1)
+ return -EFSCORRUPTED;
+
+ /* Commit new extent and all deferred work. */
+ error = xrep_defer_finish(sc);
+ if (error)
+ return error;
+
+ off = map.br_startoff + map.br_blockcount;
+ }
+
+ return 0;
+}
+
+/* Fix broken rt volume geometry. */
+STATIC int
+xrep_rtbitmap_geometry(
+ struct xfs_scrub *sc,
+ struct xchk_rtbitmap *rtb)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_trans *tp = sc->tp;
+
+ /* Superblock fields */
+ if (mp->m_sb.sb_rextents != rtb->rextents)
+ xfs_trans_mod_sb(sc->tp, XFS_TRANS_SB_REXTENTS,
+ rtb->rextents - mp->m_sb.sb_rextents);
+
+ if (mp->m_sb.sb_rbmblocks != rtb->rbmblocks)
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBMBLOCKS,
+ rtb->rbmblocks - mp->m_sb.sb_rbmblocks);
+
+ if (mp->m_sb.sb_rextslog != rtb->rextslog)
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSLOG,
+ rtb->rextslog - mp->m_sb.sb_rextslog);
+
+ /* Fix broken isize */
+ sc->ip->i_disk_size = roundup_64(sc->ip->i_disk_size,
+ mp->m_sb.sb_blocksize);
+
+ if (sc->ip->i_disk_size < XFS_FSB_TO_B(mp, rtb->rbmblocks))
+ sc->ip->i_disk_size = XFS_FSB_TO_B(mp, rtb->rbmblocks);
+
+ xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE);
+ return xrep_roll_trans(sc);
+}
+
+/* Repair the realtime bitmap file metadata. */
+int
+xrep_rtbitmap(
+ struct xfs_scrub *sc)
+{
+ struct xchk_rtbitmap *rtb = sc->buf;
+ struct xfs_mount *mp = sc->mp;
+ unsigned long long blocks = 0;
+ int error;
+
+ /* Impossibly large rtbitmap means we can't touch the filesystem. */
+ if (rtb->rbmblocks > U32_MAX)
+ return 0;
+
+ /*
+ * If the size of the rt bitmap file is larger than what we reserved,
+ * figure out if we need to adjust the block reservation in the
+ * transaction.
+ */
+ blocks = xfs_bmbt_calc_size(mp, rtb->rbmblocks);
+ if (blocks > UINT_MAX)
+ return -EOPNOTSUPP;
+ if (blocks > rtb->resblks) {
+ error = xfs_trans_reserve_more(sc->tp, blocks, 0);
+ if (error)
+ return error;
+
+ rtb->resblks += blocks;
+ }
+
+ /* Fix inode core and forks. */
+ error = xrep_metadata_inode_forks(sc);
+ if (error)
+ return error;
+
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+
+ /* Ensure no unwritten extents. */
+ error = xrep_rtbitmap_data_mappings(sc, rtb->rbmblocks);
+ if (error)
+ return error;
+
+ /* Fix inconsistent bitmap geometry */
+ return xrep_rtbitmap_geometry(sc, rtb);
+}
diff --git a/fs/xfs/scrub/rtsummary.c b/fs/xfs/scrub/rtsummary.c
index 8b15c47408d0..fabd0ed9dfa6 100644
--- a/fs/xfs/scrub/rtsummary.c
+++ b/fs/xfs/scrub/rtsummary.c
@@ -31,6 +31,18 @@
* (potentially large) amount of data in pageable memory.
*/
+struct xchk_rtsummary {
+ struct xfs_rtalloc_args args;
+
+ uint64_t rextents;
+ uint64_t rbmblocks;
+ uint64_t rsumsize;
+ unsigned int rsumlevels;
+
+ /* Memory buffer for the summary comparison. */
+ union xfs_suminfo_raw words[];
+};
+
/* Set us up to check the rtsummary file. */
int
xchk_setup_rtsummary(
@@ -38,8 +50,15 @@ xchk_setup_rtsummary(
{
struct xfs_mount *mp = sc->mp;
char *descr;
+ struct xchk_rtsummary *rts;
int error;
+ rts = kvzalloc(struct_size(rts, words, mp->m_blockwsize),
+ XCHK_GFP_FLAGS);
+ if (!rts)
+ return -ENOMEM;
+ sc->buf = rts;
+
/*
* Create an xfile to construct a new rtsummary file. The xfile allows
* us to avoid pinning kernel memory for this purpose.
@@ -54,15 +73,14 @@ xchk_setup_rtsummary(
if (error)
return error;
- /* Allocate a memory buffer for the summary comparison. */
- sc->buf = kvmalloc(mp->m_sb.sb_blocksize, XCHK_GFP_FLAGS);
- if (!sc->buf)
- return -ENOMEM;
-
error = xchk_install_live_inode(sc, mp->m_rsumip);
if (error)
return error;
+ error = xchk_ino_dqattach(sc);
+ if (error)
+ return error;
+
/*
* Locking order requires us to take the rtbitmap first. We must be
* careful to unlock it ourselves when we are done with the rtbitmap
@@ -71,13 +89,29 @@ xchk_setup_rtsummary(
*/
xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
xchk_ilock(sc, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM);
+
+ /*
+ * Now that we've locked the rtbitmap and rtsummary, we can't race with
+ * growfsrt trying to expand the summary or change the size of the rt
+ * volume. Hence it is safe to compute and check the geometry values.
+ */
+ if (mp->m_sb.sb_rblocks) {
+ xfs_filblks_t rsumblocks;
+ int rextslog;
+
+ rts->rextents = xfs_rtb_to_rtx(mp, mp->m_sb.sb_rblocks);
+ rextslog = xfs_compute_rextslog(rts->rextents);
+ rts->rsumlevels = rextslog + 1;
+ rts->rbmblocks = xfs_rtbitmap_blockcount(mp, rts->rextents);
+ rsumblocks = xfs_rtsummary_blockcount(mp, rts->rsumlevels,
+ rts->rbmblocks);
+ rts->rsumsize = XFS_FSB_TO_B(mp, rsumblocks);
+ }
return 0;
}
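Continuing the hypothetical geometry from the rtbitmap example above, and assuming xfs_rtsummary_blockcount() rounds rsumlevels * rbmblocks summary words up to whole blocks:

	rsumlevels = rextslog + 1        = 19
	words      = 19 * 8              = 152
	words/blk  = 4096 >> XFS_WORDLOG = 1024
	rsumblocks = howmany(152, 1024)  = 1,  so rsumsize = 4096 bytes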
/* Helper functions to record suminfo words in an xfile. */
-typedef unsigned int xchk_rtsumoff_t;
-
static inline int
xfsum_load(
struct xfs_scrub *sc,
@@ -143,7 +177,7 @@ xchk_rtsum_record_free(
/* Compute the relevant location in the rtsum file. */
rbmoff = xfs_rtx_to_rbmblock(mp, rec->ar_startext);
- lenlog = XFS_RTBLOCKLOG(rec->ar_extcount);
+ lenlog = xfs_highbit64(rec->ar_extcount);
offs = xfs_rtsumoffs(mp, lenlog, rbmoff);
rtbno = xfs_rtx_to_rtb(mp, rec->ar_startext);
@@ -188,19 +222,29 @@ STATIC int
xchk_rtsum_compare(
struct xfs_scrub *sc)
{
- struct xfs_rtalloc_args args = {
- .mp = sc->mp,
- .tp = sc->tp,
- };
- struct xfs_mount *mp = sc->mp;
struct xfs_bmbt_irec map;
- xfs_fileoff_t off;
- xchk_rtsumoff_t sumoff = 0;
- int nmap;
+ struct xfs_iext_cursor icur;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_inode *ip = sc->ip;
+ struct xchk_rtsummary *rts = sc->buf;
+ xfs_fileoff_t off = 0;
+ xfs_fileoff_t endoff;
+ xfs_rtsumoff_t sumoff = 0;
+ int error = 0;
+
+ rts->args.mp = sc->mp;
+ rts->args.tp = sc->tp;
+
+ /* Mappings may not cross or lie beyond EOF. */
+ endoff = XFS_B_TO_FSB(mp, ip->i_disk_size);
+ if (xfs_iext_lookup_extent(ip, &ip->i_df, endoff, &icur, &map)) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, endoff);
+ return 0;
+ }
- for (off = 0; off < XFS_B_TO_FSB(mp, mp->m_rsumsize); off++) {
- union xfs_suminfo_raw *ondisk_info;
- int error = 0;
+ while (off < endoff) {
+ int nmap = 1;
if (xchk_should_terminate(sc, &error))
return error;
@@ -208,8 +252,7 @@ xchk_rtsum_compare(
return 0;
/* Make sure we have a written extent. */
- nmap = 1;
- error = xfs_bmapi_read(mp->m_rsumip, off, 1, &map, &nmap,
+ error = xfs_bmapi_read(ip, off, endoff - off, &map, &nmap,
XFS_DATA_FORK);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, off, &error))
return error;
@@ -219,24 +262,33 @@ xchk_rtsum_compare(
return 0;
}
+ off += map.br_blockcount;
+ }
+
+ for (off = 0; off < endoff; off++) {
+ union xfs_suminfo_raw *ondisk_info;
+
/* Read a block's worth of ondisk rtsummary file. */
- error = xfs_rtsummary_read_buf(&args, off);
+ error = xfs_rtsummary_read_buf(&rts->args, off);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, off, &error))
return error;
/* Read a block's worth of computed rtsummary file. */
- error = xfsum_copyout(sc, sumoff, sc->buf, mp->m_blockwsize);
+ error = xfsum_copyout(sc, sumoff, rts->words, mp->m_blockwsize);
if (error) {
- xfs_rtbuf_cache_relse(&args);
+ xfs_rtbuf_cache_relse(&rts->args);
return error;
}
- ondisk_info = xfs_rsumblock_infoptr(&args, 0);
- if (memcmp(ondisk_info, sc->buf,
- mp->m_blockwsize << XFS_WORDLOG) != 0)
+ ondisk_info = xfs_rsumblock_infoptr(&rts->args, 0);
+ if (memcmp(ondisk_info, rts->words,
+ mp->m_blockwsize << XFS_WORDLOG) != 0) {
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, off);
+ xfs_rtbuf_cache_relse(&rts->args);
+ return error;
+ }
- xfs_rtbuf_cache_relse(&args);
+ xfs_rtbuf_cache_relse(&rts->args);
sumoff += mp->m_blockwsize;
}
@@ -249,8 +301,43 @@ xchk_rtsummary(
struct xfs_scrub *sc)
{
struct xfs_mount *mp = sc->mp;
+ struct xchk_rtsummary *rts = sc->buf;
int error = 0;
+ /* Is sb_rextents correct? */
+ if (mp->m_sb.sb_rextents != rts->rextents) {
+ xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino);
+ goto out_rbm;
+ }
+
+ /* Is m_rsumlevels correct? */
+ if (mp->m_rsumlevels != rts->rsumlevels) {
+ xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino);
+ goto out_rbm;
+ }
+
+ /* Is m_rsumsize correct? */
+ if (mp->m_rsumsize != rts->rsumsize) {
+ xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino);
+ goto out_rbm;
+ }
+
+ /* The summary file length must be aligned to an fsblock. */
+ if (mp->m_rsumip->i_disk_size & mp->m_blockmask) {
+ xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino);
+ goto out_rbm;
+ }
+
+ /*
+ * Is the summary file itself large enough to handle the rt volume?
+ * growfsrt expands the summary file before updating sb_rextents, so
+ * the file can be larger than rsumsize.
+ */
+ if (mp->m_rsumip->i_disk_size < rts->rsumsize) {
+ xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino);
+ goto out_rbm;
+ }
+
/* Invoke the fork scrubber. */
error = xchk_metadata_inode_forks(sc);
if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 4849efcaa33a..caf324c2b991 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -14,8 +14,6 @@
#include "xfs_inode.h"
#include "xfs_quota.h"
#include "xfs_qm.h"
-#include "xfs_errortag.h"
-#include "xfs_error.h"
#include "xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
@@ -238,27 +236,31 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
[XFS_SCRUB_TYPE_BNOBT] = { /* bnobt */
.type = ST_PERAG,
.setup = xchk_setup_ag_allocbt,
- .scrub = xchk_bnobt,
- .repair = xrep_notsupported,
+ .scrub = xchk_allocbt,
+ .repair = xrep_allocbt,
+ .repair_eval = xrep_revalidate_allocbt,
},
[XFS_SCRUB_TYPE_CNTBT] = { /* cntbt */
.type = ST_PERAG,
.setup = xchk_setup_ag_allocbt,
- .scrub = xchk_cntbt,
- .repair = xrep_notsupported,
+ .scrub = xchk_allocbt,
+ .repair = xrep_allocbt,
+ .repair_eval = xrep_revalidate_allocbt,
},
[XFS_SCRUB_TYPE_INOBT] = { /* inobt */
.type = ST_PERAG,
.setup = xchk_setup_ag_iallocbt,
- .scrub = xchk_inobt,
- .repair = xrep_notsupported,
+ .scrub = xchk_iallocbt,
+ .repair = xrep_iallocbt,
+ .repair_eval = xrep_revalidate_iallocbt,
},
[XFS_SCRUB_TYPE_FINOBT] = { /* finobt */
.type = ST_PERAG,
.setup = xchk_setup_ag_iallocbt,
- .scrub = xchk_finobt,
+ .scrub = xchk_iallocbt,
.has = xfs_has_finobt,
- .repair = xrep_notsupported,
+ .repair = xrep_iallocbt,
+ .repair_eval = xrep_revalidate_iallocbt,
},
[XFS_SCRUB_TYPE_RMAPBT] = { /* rmapbt */
.type = ST_PERAG,
@@ -272,31 +274,31 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
.setup = xchk_setup_ag_refcountbt,
.scrub = xchk_refcountbt,
.has = xfs_has_reflink,
- .repair = xrep_notsupported,
+ .repair = xrep_refcountbt,
},
[XFS_SCRUB_TYPE_INODE] = { /* inode record */
.type = ST_INODE,
.setup = xchk_setup_inode,
.scrub = xchk_inode,
- .repair = xrep_notsupported,
+ .repair = xrep_inode,
},
[XFS_SCRUB_TYPE_BMBTD] = { /* inode data fork */
.type = ST_INODE,
.setup = xchk_setup_inode_bmap,
.scrub = xchk_bmap_data,
- .repair = xrep_notsupported,
+ .repair = xrep_bmap_data,
},
[XFS_SCRUB_TYPE_BMBTA] = { /* inode attr fork */
.type = ST_INODE,
.setup = xchk_setup_inode_bmap,
.scrub = xchk_bmap_attr,
- .repair = xrep_notsupported,
+ .repair = xrep_bmap_attr,
},
[XFS_SCRUB_TYPE_BMBTC] = { /* inode CoW fork */
.type = ST_INODE,
.setup = xchk_setup_inode_bmap,
.scrub = xchk_bmap_cow,
- .repair = xrep_notsupported,
+ .repair = xrep_bmap_cow,
},
[XFS_SCRUB_TYPE_DIR] = { /* directory */
.type = ST_INODE,
@@ -326,33 +328,31 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
.type = ST_FS,
.setup = xchk_setup_rtbitmap,
.scrub = xchk_rtbitmap,
- .has = xfs_has_realtime,
- .repair = xrep_notsupported,
+ .repair = xrep_rtbitmap,
},
[XFS_SCRUB_TYPE_RTSUM] = { /* realtime summary */
.type = ST_FS,
.setup = xchk_setup_rtsummary,
.scrub = xchk_rtsummary,
- .has = xfs_has_realtime,
.repair = xrep_notsupported,
},
[XFS_SCRUB_TYPE_UQUOTA] = { /* user quota */
.type = ST_FS,
.setup = xchk_setup_quota,
.scrub = xchk_quota,
- .repair = xrep_notsupported,
+ .repair = xrep_quota,
},
[XFS_SCRUB_TYPE_GQUOTA] = { /* group quota */
.type = ST_FS,
.setup = xchk_setup_quota,
.scrub = xchk_quota,
- .repair = xrep_notsupported,
+ .repair = xrep_quota,
},
[XFS_SCRUB_TYPE_PQUOTA] = { /* project quota */
.type = ST_FS,
.setup = xchk_setup_quota,
.scrub = xchk_quota,
- .repair = xrep_notsupported,
+ .repair = xrep_quota,
},
[XFS_SCRUB_TYPE_FSCOUNTERS] = { /* fs summary counters */
.type = ST_FS,
@@ -531,7 +531,10 @@ retry_op:
/* Scrub for errors. */
check_start = xchk_stats_now();
- error = sc->ops->scrub(sc);
+ if ((sc->flags & XREP_ALREADY_FIXED) && sc->ops->repair_eval != NULL)
+ error = sc->ops->repair_eval(sc);
+ else
+ error = sc->ops->scrub(sc);
run.scrub_ns += xchk_stats_elapsed_ns(check_start);
if (error == -EDEADLOCK && !(sc->flags & XCHK_TRY_HARDER))
goto try_harder;
@@ -542,23 +545,12 @@ retry_op:
xchk_update_health(sc);
- if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) &&
- !(sc->flags & XREP_ALREADY_FIXED)) {
- bool needs_fix = xchk_needs_repair(sc->sm);
-
- /* Userspace asked us to rebuild the structure regardless. */
- if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD)
- needs_fix = true;
-
- /* Let debug users force us into the repair routines. */
- if (XFS_TEST_ERROR(needs_fix, mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR))
- needs_fix = true;
-
+ if (xchk_could_repair(sc)) {
/*
* If userspace asked for a repair but it wasn't necessary,
* report that back to userspace.
*/
- if (!needs_fix) {
+ if (!xrep_will_attempt(sc)) {
sc->sm->sm_flags |= XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED;
goto out_nofix;
}
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
index 1ef9c6b4842a..7fc50654c4fe 100644
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@@ -35,6 +35,14 @@ struct xchk_meta_ops {
/* Repair or optimize the metadata. */
int (*repair)(struct xfs_scrub *);
+ /*
+ * Re-scrub the metadata we repaired, in case there's extra work that
+ * we need to do to check our repair work. If this is NULL, we'll use
+ * the ->scrub function pointer, assuming that the regular scrub is
+ * sufficient.
+ */
+ int (*repair_eval)(struct xfs_scrub *sc);
+
/* Decide if we even have this piece of metadata. */
bool (*has)(struct xfs_mount *);
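A sketch of how a scrub type opts in to the new hook, patterned on the bnobt/cntbt entries earlier in this patch; every name here is hypothetical:

	[XFS_SCRUB_TYPE_FOOBT] = {	/* hypothetical btree */
		.type		= ST_PERAG,
		.setup		= xchk_setup_ag_foobt,
		.scrub		= xchk_foobt,
		.repair		= xrep_foobt,
		.repair_eval	= xrep_revalidate_foobt,
	},

When XREP_ALREADY_FIXED is set and repair_eval is non-NULL, the dispatch hunk in scrub.c above calls repair_eval instead of scrub.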
@@ -113,6 +121,7 @@ struct xfs_scrub {
#define XCHK_HAVE_FREEZE_PROT (1U << 1) /* do we have freeze protection? */
#define XCHK_FSGATES_DRAIN (1U << 2) /* defer ops draining enabled */
#define XCHK_NEED_DRAIN (1U << 3) /* scrub needs to drain defer ops */
+#define XREP_RESET_PERAG_RESV (1U << 30) /* must reset AG space reservation */
#define XREP_ALREADY_FIXED (1U << 31) /* checking our repair work */
/*
@@ -129,10 +138,8 @@ int xchk_superblock(struct xfs_scrub *sc);
int xchk_agf(struct xfs_scrub *sc);
int xchk_agfl(struct xfs_scrub *sc);
int xchk_agi(struct xfs_scrub *sc);
-int xchk_bnobt(struct xfs_scrub *sc);
-int xchk_cntbt(struct xfs_scrub *sc);
-int xchk_inobt(struct xfs_scrub *sc);
-int xchk_finobt(struct xfs_scrub *sc);
+int xchk_allocbt(struct xfs_scrub *sc);
+int xchk_iallocbt(struct xfs_scrub *sc);
int xchk_rmapbt(struct xfs_scrub *sc);
int xchk_refcountbt(struct xfs_scrub *sc);
int xchk_inode(struct xfs_scrub *sc);
diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c
index 38708fb9a5d7..ddff86713df3 100644
--- a/fs/xfs/scrub/symlink.c
+++ b/fs/xfs/scrub/symlink.c
@@ -12,8 +12,10 @@
#include "xfs_log_format.h"
#include "xfs_inode.h"
#include "xfs_symlink.h"
+#include "xfs_health.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
+#include "scrub/health.h"
/* Set us up to scrub a symbolic link. */
int
@@ -41,29 +43,37 @@ xchk_symlink(
if (!S_ISLNK(VFS_I(ip)->i_mode))
return -ENOENT;
+
+ if (xchk_file_looks_zapped(sc, XFS_SICK_INO_SYMLINK_ZAPPED)) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+ return 0;
+ }
+
ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
len = ip->i_disk_size;
/* Plausible size? */
if (len > XFS_SYMLINK_MAXLEN || len <= 0) {
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
- goto out;
+ return 0;
}
/* Inline symlink? */
if (ifp->if_format == XFS_DINODE_FMT_LOCAL) {
if (len > xfs_inode_data_fork_size(ip) ||
- len > strnlen(ifp->if_u1.if_data, xfs_inode_data_fork_size(ip)))
+ len > strnlen(ifp->if_data, xfs_inode_data_fork_size(ip)))
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
- goto out;
+ return 0;
}
/* Remote symlink; must read the contents. */
error = xfs_readlink_bmap_ilocked(sc->ip, sc->buf);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error))
- goto out;
+ return error;
if (strnlen(sc->buf, XFS_SYMLINK_MAXLEN) < len)
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
-out:
- return error;
+
+ /* If a remote symlink is clean, it is clearly not zapped. */
+ xchk_mark_healthy_if_clean(sc, XFS_SICK_INO_SYMLINK_ZAPPED);
+ return 0;
}
diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c
index 29afa4851235..d0e24ffaf754 100644
--- a/fs/xfs/scrub/trace.c
+++ b/fs/xfs/scrub/trace.c
@@ -14,9 +14,12 @@
#include "xfs_btree.h"
#include "xfs_ag.h"
#include "xfs_rtbitmap.h"
+#include "xfs_quota.h"
+#include "xfs_quota_defs.h"
#include "scrub/scrub.h"
#include "scrub/xfile.h"
#include "scrub/xfarray.h"
+#include "scrub/quota.h"
/* Figure out which block the btree cursor was pointing to. */
static inline xfs_fsblock_t
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index 4a8bc6f3c8f2..6bbb4e8639dc 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -19,6 +19,7 @@
struct xfile;
struct xfarray;
struct xfarray_sortinfo;
+struct xchk_dqiter;
/*
* ftrace's __print_symbolic requires that all enum values be wrapped in the
@@ -106,6 +107,7 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS);
{ XCHK_HAVE_FREEZE_PROT, "nofreeze" }, \
{ XCHK_FSGATES_DRAIN, "fsgates_drain" }, \
{ XCHK_NEED_DRAIN, "need_drain" }, \
+ { XREP_RESET_PERAG_RESV, "reset_perag_resv" }, \
{ XREP_ALREADY_FIXED, "already_fixed" }
DECLARE_EVENT_CLASS(xchk_class,
@@ -347,6 +349,54 @@ DEFINE_EVENT(xchk_fblock_error_class, name, \
DEFINE_SCRUB_FBLOCK_ERROR_EVENT(xchk_fblock_error);
DEFINE_SCRUB_FBLOCK_ERROR_EVENT(xchk_fblock_warning);
+#ifdef CONFIG_XFS_QUOTA
+DECLARE_EVENT_CLASS(xchk_dqiter_class,
+ TP_PROTO(struct xchk_dqiter *cursor, uint64_t id),
+ TP_ARGS(cursor, id),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_dqtype_t, dqtype)
+ __field(xfs_ino_t, ino)
+ __field(unsigned long long, cur_id)
+ __field(unsigned long long, id)
+ __field(xfs_fileoff_t, startoff)
+ __field(xfs_fsblock_t, startblock)
+ __field(xfs_filblks_t, blockcount)
+ __field(xfs_exntst_t, state)
+ ),
+ TP_fast_assign(
+ __entry->dev = cursor->sc->ip->i_mount->m_super->s_dev;
+ __entry->dqtype = cursor->dqtype;
+ __entry->ino = cursor->quota_ip->i_ino;
+ __entry->cur_id = cursor->id;
+ __entry->startoff = cursor->bmap.br_startoff;
+ __entry->startblock = cursor->bmap.br_startblock;
+ __entry->blockcount = cursor->bmap.br_blockcount;
+ __entry->state = cursor->bmap.br_state;
+ __entry->id = id;
+ ),
+ TP_printk("dev %d:%d dquot type %s ino 0x%llx cursor_id 0x%llx startoff 0x%llx startblock 0x%llx blockcount 0x%llx state %u id 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->dqtype, XFS_DQTYPE_STRINGS),
+ __entry->ino,
+ __entry->cur_id,
+ __entry->startoff,
+ __entry->startblock,
+ __entry->blockcount,
+ __entry->state,
+ __entry->id)
+);
+
+#define DEFINE_SCRUB_DQITER_EVENT(name) \
+DEFINE_EVENT(xchk_dqiter_class, name, \
+ TP_PROTO(struct xchk_dqiter *cursor, uint64_t id), \
+ TP_ARGS(cursor, id))
+DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter_revalidate_bmap);
+DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter_advance_bmap);
+DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter_advance_incore);
+DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter);
+#endif /* CONFIG_XFS_QUOTA */
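These events would be emitted from the dquot iterator roughly as below (a sketch; the actual call sites live in scrub/quota.c, which is not part of this hunk, and the dq->q_id field is an assumption):

	trace_xchk_dquot_iter(cursor, dq->q_id);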
+
TRACE_EVENT(xchk_incomplete,
TP_PROTO(struct xfs_scrub *sc, void *ret_ip),
TP_ARGS(sc, ret_ip),
@@ -1172,37 +1222,125 @@ DEFINE_EVENT(xrep_rmap_class, name, \
xfs_agblock_t agbno, xfs_extlen_t len, \
uint64_t owner, uint64_t offset, unsigned int flags), \
TP_ARGS(mp, agno, agbno, len, owner, offset, flags))
-DEFINE_REPAIR_RMAP_EVENT(xrep_alloc_extent_fn);
-DEFINE_REPAIR_RMAP_EVENT(xrep_ialloc_extent_fn);
+DEFINE_REPAIR_RMAP_EVENT(xrep_ibt_walk_rmap);
DEFINE_REPAIR_RMAP_EVENT(xrep_rmap_extent_fn);
-DEFINE_REPAIR_RMAP_EVENT(xrep_bmap_extent_fn);
+DEFINE_REPAIR_RMAP_EVENT(xrep_bmap_walk_rmap);
-TRACE_EVENT(xrep_refcount_extent_fn,
+TRACE_EVENT(xrep_abt_found,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
- struct xfs_refcount_irec *irec),
- TP_ARGS(mp, agno, irec),
+ const struct xfs_alloc_rec_incore *rec),
+ TP_ARGS(mp, agno, rec),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
__field(xfs_agblock_t, startblock)
__field(xfs_extlen_t, blockcount)
- __field(xfs_nlink_t, refcount)
),
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
__entry->agno = agno;
- __entry->startblock = irec->rc_startblock;
- __entry->blockcount = irec->rc_blockcount;
- __entry->refcount = irec->rc_refcount;
+ __entry->startblock = rec->ar_startblock;
+ __entry->blockcount = rec->ar_blockcount;
+ ),
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->startblock,
+ __entry->blockcount)
+)
+
+TRACE_EVENT(xrep_ibt_found,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ const struct xfs_inobt_rec_incore *rec),
+ TP_ARGS(mp, agno, rec),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, startino)
+ __field(uint16_t, holemask)
+ __field(uint8_t, count)
+ __field(uint8_t, freecount)
+ __field(uint64_t, freemask)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->startino = rec->ir_startino;
+ __entry->holemask = rec->ir_holemask;
+ __entry->count = rec->ir_count;
+ __entry->freecount = rec->ir_freecount;
+ __entry->freemask = rec->ir_free;
+ ),
+ TP_printk("dev %d:%d agno 0x%x agino 0x%x holemask 0x%x count 0x%x freecount 0x%x freemask 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->startino,
+ __entry->holemask,
+ __entry->count,
+ __entry->freecount,
+ __entry->freemask)
+)
+
+TRACE_EVENT(xrep_refc_found,
+ TP_PROTO(struct xfs_perag *pag, const struct xfs_refcount_irec *rec),
+ TP_ARGS(pag, rec),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(enum xfs_refc_domain, domain)
+ __field(xfs_agblock_t, startblock)
+ __field(xfs_extlen_t, blockcount)
+ __field(xfs_nlink_t, refcount)
+ ),
+ TP_fast_assign(
+ __entry->dev = pag->pag_mount->m_super->s_dev;
+ __entry->agno = pag->pag_agno;
+ __entry->domain = rec->rc_domain;
+ __entry->startblock = rec->rc_startblock;
+ __entry->blockcount = rec->rc_blockcount;
+ __entry->refcount = rec->rc_refcount;
),
- TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u",
+ TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
+ __print_symbolic(__entry->domain, XFS_REFC_DOMAIN_STRINGS),
__entry->startblock,
__entry->blockcount,
__entry->refcount)
)
+TRACE_EVENT(xrep_bmap_found,
+ TP_PROTO(struct xfs_inode *ip, int whichfork,
+ struct xfs_bmbt_irec *irec),
+ TP_ARGS(ip, whichfork, irec),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(int, whichfork)
+ __field(xfs_fileoff_t, lblk)
+ __field(xfs_filblks_t, len)
+ __field(xfs_fsblock_t, pblk)
+ __field(int, state)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->whichfork = whichfork;
+ __entry->lblk = irec->br_startoff;
+ __entry->len = irec->br_blockcount;
+ __entry->pblk = irec->br_startblock;
+ __entry->state = irec->br_state;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx whichfork %s fileoff 0x%llx fsbcount 0x%llx startblock 0x%llx state %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
+ __entry->lblk,
+ __entry->len,
+ __entry->pblk,
+ __entry->state)
+);
+
TRACE_EVENT(xrep_findroot_block,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
uint32_t magic, uint16_t level),
@@ -1299,39 +1437,327 @@ TRACE_EVENT(xrep_reset_counters,
MAJOR(__entry->dev), MINOR(__entry->dev))
)
-TRACE_EVENT(xrep_ialloc_insert,
+DECLARE_EVENT_CLASS(xrep_newbt_extent_class,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
- xfs_agino_t startino, uint16_t holemask, uint8_t count,
- uint8_t freecount, uint64_t freemask),
- TP_ARGS(mp, agno, startino, holemask, count, freecount, freemask),
+ xfs_agblock_t agbno, xfs_extlen_t len,
+ int64_t owner),
+ TP_ARGS(mp, agno, agbno, len, owner),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
- __field(xfs_agino_t, startino)
- __field(uint16_t, holemask)
- __field(uint8_t, count)
- __field(uint8_t, freecount)
- __field(uint64_t, freemask)
+ __field(xfs_agblock_t, agbno)
+ __field(xfs_extlen_t, len)
+ __field(int64_t, owner)
),
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
__entry->agno = agno;
- __entry->startino = startino;
- __entry->holemask = holemask;
- __entry->count = count;
- __entry->freecount = freecount;
- __entry->freemask = freemask;
+ __entry->agbno = agbno;
+ __entry->len = len;
+ __entry->owner = owner;
),
- TP_printk("dev %d:%d agno 0x%x startino 0x%x holemask 0x%x count %u freecount %u freemask 0x%llx",
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x owner 0x%llx",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
- __entry->startino,
- __entry->holemask,
- __entry->count,
- __entry->freecount,
- __entry->freemask)
+ __entry->agbno,
+ __entry->len,
+ __entry->owner)
+);
+#define DEFINE_NEWBT_EXTENT_EVENT(name) \
+DEFINE_EVENT(xrep_newbt_extent_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+ xfs_agblock_t agbno, xfs_extlen_t len, \
+ int64_t owner), \
+ TP_ARGS(mp, agno, agbno, len, owner))
+DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_alloc_ag_blocks);
+DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_alloc_file_blocks);
+DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_free_blocks);
+DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_claim_block);
+
+DECLARE_EVENT_CLASS(xrep_dinode_class,
+ TP_PROTO(struct xfs_scrub *sc, struct xfs_dinode *dip),
+ TP_ARGS(sc, dip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(uint16_t, mode)
+ __field(uint8_t, version)
+ __field(uint8_t, format)
+ __field(uint32_t, uid)
+ __field(uint32_t, gid)
+ __field(uint64_t, size)
+ __field(uint64_t, nblocks)
+ __field(uint32_t, extsize)
+ __field(uint32_t, nextents)
+ __field(uint16_t, anextents)
+ __field(uint8_t, forkoff)
+ __field(uint8_t, aformat)
+ __field(uint16_t, flags)
+ __field(uint32_t, gen)
+ __field(uint64_t, flags2)
+ __field(uint32_t, cowextsize)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->ino = sc->sm->sm_ino;
+ __entry->mode = be16_to_cpu(dip->di_mode);
+ __entry->version = dip->di_version;
+ __entry->format = dip->di_format;
+ __entry->uid = be32_to_cpu(dip->di_uid);
+ __entry->gid = be32_to_cpu(dip->di_gid);
+ __entry->size = be64_to_cpu(dip->di_size);
+ __entry->nblocks = be64_to_cpu(dip->di_nblocks);
+ __entry->extsize = be32_to_cpu(dip->di_extsize);
+ __entry->nextents = be32_to_cpu(dip->di_nextents);
+ __entry->anextents = be16_to_cpu(dip->di_anextents);
+ __entry->forkoff = dip->di_forkoff;
+ __entry->aformat = dip->di_aformat;
+ __entry->flags = be16_to_cpu(dip->di_flags);
+ __entry->gen = be32_to_cpu(dip->di_gen);
+ __entry->flags2 = be64_to_cpu(dip->di_flags2);
+ __entry->cowextsize = be32_to_cpu(dip->di_cowextsize);
+ ),
+ TP_printk("dev %d:%d ino 0x%llx mode 0x%x version %u format %u uid %u gid %u disize 0x%llx nblocks 0x%llx extsize %u nextents %u anextents %u forkoff 0x%x aformat %u flags 0x%x gen 0x%x flags2 0x%llx cowextsize %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->mode,
+ __entry->version,
+ __entry->format,
+ __entry->uid,
+ __entry->gid,
+ __entry->size,
+ __entry->nblocks,
+ __entry->extsize,
+ __entry->nextents,
+ __entry->anextents,
+ __entry->forkoff,
+ __entry->aformat,
+ __entry->flags,
+ __entry->gen,
+ __entry->flags2,
+ __entry->cowextsize)
)
+#define DEFINE_REPAIR_DINODE_EVENT(name) \
+DEFINE_EVENT(xrep_dinode_class, name, \
+ TP_PROTO(struct xfs_scrub *sc, struct xfs_dinode *dip), \
+ TP_ARGS(sc, dip))
+DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_header);
+DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_mode);
+DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_flags);
+DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_size);
+DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_extsize_hints);
+DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_zap_symlink);
+DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_zap_dir);
+DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_fixed);
+DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_zap_forks);
+DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_zap_dfork);
+DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_zap_afork);
+DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_ensure_forkoff);
+
+DECLARE_EVENT_CLASS(xrep_inode_class,
+ TP_PROTO(struct xfs_scrub *sc),
+ TP_ARGS(sc),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_fsize_t, size)
+ __field(xfs_rfsblock_t, nblocks)
+ __field(uint16_t, flags)
+ __field(uint64_t, flags2)
+ __field(uint32_t, nextents)
+ __field(uint8_t, format)
+ __field(uint32_t, anextents)
+ __field(uint8_t, aformat)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->ino = sc->sm->sm_ino;
+ __entry->size = sc->ip->i_disk_size;
+ __entry->nblocks = sc->ip->i_nblocks;
+ __entry->flags = sc->ip->i_diflags;
+ __entry->flags2 = sc->ip->i_diflags2;
+ __entry->nextents = sc->ip->i_df.if_nextents;
+ __entry->format = sc->ip->i_df.if_format;
+ __entry->anextents = sc->ip->i_af.if_nextents;
+ __entry->aformat = sc->ip->i_af.if_format;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx disize 0x%llx nblocks 0x%llx flags 0x%x flags2 0x%llx nextents %u format %u anextents %u aformat %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->size,
+ __entry->nblocks,
+ __entry->flags,
+ __entry->flags2,
+ __entry->nextents,
+ __entry->format,
+ __entry->anextents,
+ __entry->aformat)
+)
+
+#define DEFINE_REPAIR_INODE_EVENT(name) \
+DEFINE_EVENT(xrep_inode_class, name, \
+ TP_PROTO(struct xfs_scrub *sc), \
+ TP_ARGS(sc))
+DEFINE_REPAIR_INODE_EVENT(xrep_inode_blockcounts);
+DEFINE_REPAIR_INODE_EVENT(xrep_inode_ids);
+DEFINE_REPAIR_INODE_EVENT(xrep_inode_flags);
+DEFINE_REPAIR_INODE_EVENT(xrep_inode_blockdir_size);
+DEFINE_REPAIR_INODE_EVENT(xrep_inode_sfdir_size);
+DEFINE_REPAIR_INODE_EVENT(xrep_inode_dir_size);
+DEFINE_REPAIR_INODE_EVENT(xrep_inode_fixed);
+
+TRACE_EVENT(xrep_dinode_count_rmaps,
+ TP_PROTO(struct xfs_scrub *sc, xfs_rfsblock_t data_blocks,
+ xfs_rfsblock_t rt_blocks, xfs_rfsblock_t attr_blocks,
+ xfs_extnum_t data_extents, xfs_extnum_t rt_extents,
+ xfs_aextnum_t attr_extents),
+ TP_ARGS(sc, data_blocks, rt_blocks, attr_blocks, data_extents,
+ rt_extents, attr_extents),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_rfsblock_t, data_blocks)
+ __field(xfs_rfsblock_t, rt_blocks)
+ __field(xfs_rfsblock_t, attr_blocks)
+ __field(xfs_extnum_t, data_extents)
+ __field(xfs_extnum_t, rt_extents)
+ __field(xfs_aextnum_t, attr_extents)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->ino = sc->sm->sm_ino;
+ __entry->data_blocks = data_blocks;
+ __entry->rt_blocks = rt_blocks;
+ __entry->attr_blocks = attr_blocks;
+ __entry->data_extents = data_extents;
+ __entry->rt_extents = rt_extents;
+ __entry->attr_extents = attr_extents;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx dblocks 0x%llx rtblocks 0x%llx ablocks 0x%llx dextents %llu rtextents %llu aextents %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->data_blocks,
+ __entry->rt_blocks,
+ __entry->attr_blocks,
+ __entry->data_extents,
+ __entry->rt_extents,
+ __entry->attr_extents)
+);
+
+TRACE_EVENT(xrep_cow_mark_file_range,
+ TP_PROTO(struct xfs_inode *ip, xfs_fsblock_t startblock,
+ xfs_fileoff_t startoff, xfs_filblks_t blockcount),
+ TP_ARGS(ip, startblock, startoff, blockcount),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_fsblock_t, startblock)
+ __field(xfs_fileoff_t, startoff)
+ __field(xfs_filblks_t, blockcount)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->startoff = startoff;
+ __entry->startblock = startblock;
+ __entry->blockcount = blockcount;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx fileoff 0x%llx startblock 0x%llx fsbcount 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->startoff,
+ __entry->startblock,
+ __entry->blockcount)
+);
+
+TRACE_EVENT(xrep_cow_replace_mapping,
+ TP_PROTO(struct xfs_inode *ip, const struct xfs_bmbt_irec *irec,
+ xfs_fsblock_t new_startblock, xfs_extlen_t new_blockcount),
+ TP_ARGS(ip, irec, new_startblock, new_blockcount),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_fsblock_t, startblock)
+ __field(xfs_fileoff_t, startoff)
+ __field(xfs_filblks_t, blockcount)
+ __field(xfs_exntst_t, state)
+ __field(xfs_fsblock_t, new_startblock)
+ __field(xfs_extlen_t, new_blockcount)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->startoff = irec->br_startoff;
+ __entry->startblock = irec->br_startblock;
+ __entry->blockcount = irec->br_blockcount;
+ __entry->state = irec->br_state;
+ __entry->new_startblock = new_startblock;
+ __entry->new_blockcount = new_blockcount;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx startoff 0x%llx startblock 0x%llx fsbcount 0x%llx state 0x%x new_startblock 0x%llx new_fsbcount 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->startoff,
+ __entry->startblock,
+ __entry->blockcount,
+ __entry->state,
+ __entry->new_startblock,
+ __entry->new_blockcount)
+);
+
+TRACE_EVENT(xrep_cow_free_staging,
+ TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno,
+ xfs_extlen_t blockcount),
+ TP_ARGS(pag, agbno, blockcount),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(xfs_extlen_t, blockcount)
+ ),
+ TP_fast_assign(
+ __entry->dev = pag->pag_mount->m_super->s_dev;
+ __entry->agno = pag->pag_agno;
+ __entry->agbno = agbno;
+ __entry->blockcount = blockcount;
+ ),
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agbno,
+ __entry->blockcount)
+);
+
+#ifdef CONFIG_XFS_QUOTA
+DECLARE_EVENT_CLASS(xrep_dquot_class,
+ TP_PROTO(struct xfs_mount *mp, uint8_t type, uint32_t id),
+ TP_ARGS(mp, type, id),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(uint8_t, type)
+ __field(uint32_t, id)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->id = id;
+ __entry->type = type;
+ ),
+ TP_printk("dev %d:%d type %s id 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_flags(__entry->type, "|", XFS_DQTYPE_STRINGS),
+ __entry->id)
+);
+
+#define DEFINE_XREP_DQUOT_EVENT(name) \
+DEFINE_EVENT(xrep_dquot_class, name, \
+ TP_PROTO(struct xfs_mount *mp, uint8_t type, uint32_t id), \
+ TP_ARGS(mp, type, id))
+DEFINE_XREP_DQUOT_EVENT(xrep_dquot_item);
+DEFINE_XREP_DQUOT_EVENT(xrep_disk_dquot);
+DEFINE_XREP_DQUOT_EVENT(xrep_dquot_item_fill_bmap_hole);
+#endif /* CONFIG_XFS_QUOTA */
+
#endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */
#endif /* _TRACE_XFS_SCRUB_TRACE_H */
diff --git a/fs/xfs/scrub/xfarray.h b/fs/xfs/scrub/xfarray.h
index 4ecac01363d9..62b9c506fdd1 100644
--- a/fs/xfs/scrub/xfarray.h
+++ b/fs/xfs/scrub/xfarray.h
@@ -54,6 +54,28 @@ static inline int xfarray_append(struct xfarray *array, const void *ptr)
uint64_t xfarray_length(struct xfarray *array);
int xfarray_load_next(struct xfarray *array, xfarray_idx_t *idx, void *rec);
+/*
+ * Iterate the non-null elements in a sparse xfarray. Callers should
+ * initialize *idx to XFARRAY_CURSOR_INIT before the first call; on return, it
+ * will be set to one more than the index of the record that was retrieved.
+ * Returns 1 if a record was retrieved, 0 if there weren't any more records, or
+ * a negative errno.
+ */
+static inline int
+xfarray_iter(
+ struct xfarray *array,
+ xfarray_idx_t *idx,
+ void *rec)
+{
+ int ret = xfarray_load_next(array, idx, rec);
+
+ if (ret == -ENODATA)
+ return 0;
+ if (ret == 0)
+ return 1;
+ return ret;
+}
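A usage sketch for the iterator, following the cursor convention in the comment above; the record type is hypothetical:

	xfarray_idx_t	cur = XFARRAY_CURSOR_INIT;
	struct xfoo_rec	rec;	/* hypothetical record type */
	int		ret;

	while ((ret = xfarray_iter(array, &cur, &rec)) == 1) {
		/* process rec; cur already points past it */
	}
	if (ret < 0)
		return ret;	/* negative errno from the xfile layer */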
+
/* Declarations for xfile array sort functionality. */
typedef cmp_func_t xfarray_cmp_fn;
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 465d7630bb21..813f85156b0c 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -584,7 +584,7 @@ const struct address_space_operations xfs_address_space_operations = {
.bmap = xfs_vm_bmap,
.migrate_folio = filemap_migrate_folio,
.is_partially_uptodate = iomap_is_partially_uptodate,
- .error_remove_page = generic_error_remove_page,
+ .error_remove_folio = generic_error_remove_folio,
.swap_activate = xfs_iomap_swapfile_activate,
};
diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c
index 36fe2abb16e6..9e02111bd890 100644
--- a/fs/xfs/xfs_attr_item.c
+++ b/fs/xfs/xfs_attr_item.c
@@ -33,8 +33,6 @@ struct kmem_cache *xfs_attrd_cache;
static const struct xfs_item_ops xfs_attri_item_ops;
static const struct xfs_item_ops xfs_attrd_item_ops;
-static struct xfs_attrd_log_item *xfs_trans_get_attrd(struct xfs_trans *tp,
- struct xfs_attri_log_item *attrip);
static inline struct xfs_attri_log_item *ATTRI_ITEM(struct xfs_log_item *lip)
{
@@ -310,47 +308,6 @@ xfs_attrd_item_intent(
return &ATTRD_ITEM(lip)->attrd_attrip->attri_item;
}
-/*
- * Performs one step of an attribute update intent and marks the attrd item
- * dirty.. An attr operation may be a set or a remove. Note that the
- * transaction is marked dirty regardless of whether the operation succeeds or
- * fails to support the ATTRI/ATTRD lifecycle rules.
- */
-STATIC int
-xfs_xattri_finish_update(
- struct xfs_attr_intent *attr,
- struct xfs_attrd_log_item *attrdp)
-{
- struct xfs_da_args *args = attr->xattri_da_args;
- int error;
-
- if (XFS_TEST_ERROR(false, args->dp->i_mount, XFS_ERRTAG_LARP)) {
- error = -EIO;
- goto out;
- }
-
- error = xfs_attr_set_iter(attr);
- if (!error && attr->xattri_dela_state != XFS_DAS_DONE)
- error = -EAGAIN;
-out:
- /*
- * Mark the transaction dirty, even on error. This ensures the
- * transaction is aborted, which:
- *
- * 1.) releases the ATTRI and frees the ATTRD
- * 2.) shuts down the filesystem
- */
- args->trans->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE;
-
- /*
- * attr intent/done items are null when logged attributes are disabled
- */
- if (attrdp)
- set_bit(XFS_LI_DIRTY, &attrdp->attrd_item.li_flags);
-
- return error;
-}
-
/* Log an attr to the intent item. */
STATIC void
xfs_attr_log_item(
@@ -360,9 +317,6 @@ xfs_attr_log_item(
{
struct xfs_attri_log_format *attrp;
- tp->t_flags |= XFS_TRANS_DIRTY;
- set_bit(XFS_LI_DIRTY, &attrip->attri_item.li_flags);
-
/*
* At this point the xfs_attr_intent has been constructed, and we've
* created the log intent. Fill in the attri log item and log format
@@ -419,7 +373,6 @@ xfs_attr_create_intent(
}
attrip = xfs_attri_init(mp, attr->xattri_nameval);
- xfs_trans_add_item(tp, &attrip->attri_item);
xfs_attr_log_item(tp, attrip, attr);
return &attrip->attri_item;
@@ -447,23 +400,33 @@ xfs_attr_finish_item(
struct xfs_btree_cur **state)
{
struct xfs_attr_intent *attr;
- struct xfs_attrd_log_item *done_item = NULL;
+ struct xfs_da_args *args;
int error;
attr = container_of(item, struct xfs_attr_intent, xattri_list);
- if (done)
- done_item = ATTRD_ITEM(done);
+ args = attr->xattri_da_args;
- /*
- * Always reset trans after EAGAIN cycle
- * since the transaction is new
- */
- attr->xattri_da_args->trans = tp;
+ /* Reset trans after EAGAIN cycle since the transaction is new */
+ args->trans = tp;
- error = xfs_xattri_finish_update(attr, done_item);
- if (error != -EAGAIN)
- xfs_attr_free_item(attr);
+ if (XFS_TEST_ERROR(false, args->dp->i_mount, XFS_ERRTAG_LARP)) {
+ error = -EIO;
+ goto out;
+ }
+ /* If an attr removal is trivially complete, we're done. */
+ if (attr->xattri_op_flags == XFS_ATTRI_OP_FLAGS_REMOVE &&
+ !xfs_inode_hasattr(args->dp)) {
+ error = 0;
+ goto out;
+ }
+
+ error = xfs_attr_set_iter(attr);
+ if (!error && attr->xattri_dela_state != XFS_DAS_DONE)
+ return -EAGAIN;
+
+out:
+ xfs_attr_free_item(attr);
return error;
}
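The -EAGAIN handling above relies on the deferred-ops contract, paraphrased here as an assumption rather than quoted from the defer core: returning -EAGAIN keeps the work item alive, the transaction is rolled, and ->finish_item runs again with the fresh tp, which is why args->trans must be reset on entry.

	/* Assumed shape of the defer-core retry around ->finish_item: */
	error = ops->finish_item(tp, done, item, &state);
	if (error == -EAGAIN) {
		/*
		 * Work remains: the core rolls the transaction and calls
		 * ->finish_item again with the new tp, so the work item
		 * must not be freed on this path.
		 */
	}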
@@ -532,41 +495,22 @@ xfs_attri_validate(
return xfs_verify_ino(mp, attrp->alfi_ino);
}
-/*
- * Process an attr intent item that was recovered from the log. We need to
- * delete the attr that it describes.
- */
-STATIC int
-xfs_attri_item_recover(
- struct xfs_log_item *lip,
- struct list_head *capture_list)
+static inline struct xfs_attr_intent *
+xfs_attri_recover_work(
+ struct xfs_mount *mp,
+ struct xfs_defer_pending *dfp,
+ struct xfs_attri_log_format *attrp,
+ struct xfs_inode **ipp,
+ struct xfs_attri_log_nameval *nv)
{
- struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip);
struct xfs_attr_intent *attr;
- struct xfs_mount *mp = lip->li_log->l_mp;
- struct xfs_inode *ip;
struct xfs_da_args *args;
- struct xfs_trans *tp;
- struct xfs_trans_res resv;
- struct xfs_attri_log_format *attrp;
- struct xfs_attri_log_nameval *nv = attrip->attri_nameval;
- int error;
- int total;
int local;
- struct xfs_attrd_log_item *done_item = NULL;
-
- /*
- * First check the validity of the attr described by the ATTRI. If any
- * are bad, then assume that all are bad and just toss the ATTRI.
- */
- attrp = &attrip->attri_format;
- if (!xfs_attri_validate(mp, attrp) ||
- !xfs_attr_namecheck(nv->name.i_addr, nv->name.i_len))
- return -EFSCORRUPTED;
+ int error;
- error = xlog_recover_iget(mp, attrp->alfi_ino, &ip);
+ error = xlog_recover_iget(mp, attrp->alfi_ino, ipp);
if (error)
- return error;
+ return ERR_PTR(error);
attr = kmem_zalloc(sizeof(struct xfs_attr_intent) +
sizeof(struct xfs_da_args), KM_NOFS);
@@ -584,7 +528,7 @@ xfs_attri_item_recover(
attr->xattri_nameval = xfs_attri_log_nameval_get(nv);
ASSERT(attr->xattri_nameval);
- args->dp = ip;
+ args->dp = *ipp;
args->geo = mp->m_attr_geo;
args->whichfork = XFS_ATTR_FORK;
args->name = nv->name.i_addr;
@@ -608,43 +552,65 @@ xfs_attri_item_recover(
attr->xattri_dela_state = xfs_attr_init_add_state(args);
break;
case XFS_ATTRI_OP_FLAGS_REMOVE:
- if (!xfs_inode_hasattr(args->dp))
- goto out;
attr->xattri_dela_state = xfs_attr_init_remove_state(args);
break;
- default:
- ASSERT(0);
- error = -EFSCORRUPTED;
- goto out;
}
+ xfs_defer_add_item(dfp, &attr->xattri_list);
+ return attr;
+}
+
+/*
+ * Process an attr intent item that was recovered from the log. We need to
+ * delete the attr that it describes.
+ */
+STATIC int
+xfs_attr_recover_work(
+ struct xfs_defer_pending *dfp,
+ struct list_head *capture_list)
+{
+ struct xfs_log_item *lip = dfp->dfp_intent;
+ struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip);
+ struct xfs_attr_intent *attr;
+ struct xfs_mount *mp = lip->li_log->l_mp;
+ struct xfs_inode *ip;
+ struct xfs_da_args *args;
+ struct xfs_trans *tp;
+ struct xfs_trans_res resv;
+ struct xfs_attri_log_format *attrp;
+ struct xfs_attri_log_nameval *nv = attrip->attri_nameval;
+ int error;
+ int total;
+
+ /*
+ * First check the validity of the attr described by the ATTRI. If any
+ * are bad, then assume that all are bad and just toss the ATTRI.
+ */
+ attrp = &attrip->attri_format;
+ if (!xfs_attri_validate(mp, attrp) ||
+ !xfs_attr_namecheck(nv->name.i_addr, nv->name.i_len))
+ return -EFSCORRUPTED;
+
+ attr = xfs_attri_recover_work(mp, dfp, attrp, &ip, nv);
+ if (IS_ERR(attr))
+ return PTR_ERR(attr);
+ args = attr->xattri_da_args;
+
xfs_init_attr_trans(args, &resv, &total);
resv = xlog_recover_resv(&resv);
error = xfs_trans_alloc(mp, &resv, total, 0, XFS_TRANS_RESERVE, &tp);
if (error)
- goto out;
-
+ return error;
args->trans = tp;
- done_item = xfs_trans_get_attrd(tp, attrip);
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
- error = xfs_xattri_finish_update(attr, done_item);
- if (error == -EAGAIN) {
- /*
- * There's more work to do, so add the intent item to this
- * transaction so that we can continue it later.
- */
- xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_ATTR, &attr->xattri_list);
- error = xfs_defer_ops_capture_and_commit(tp, capture_list);
- if (error)
- goto out_unlock;
-
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- xfs_irele(ip);
- return 0;
- }
+ error = xlog_recover_finish_intent(tp, dfp);
+ if (error == -EFSCORRUPTED)
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ &attrip->attri_format,
+ sizeof(attrip->attri_format));
if (error) {
xfs_trans_cancel(tp);
goto out_unlock;
@@ -654,18 +620,16 @@ xfs_attri_item_recover(
out_unlock:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
xfs_irele(ip);
-out:
- xfs_attr_free_item(attr);
return error;
}
/* Re-log an intent item to push the log tail forward. */
static struct xfs_log_item *
-xfs_attri_item_relog(
+xfs_attr_relog_intent(
+ struct xfs_trans *tp,
struct xfs_log_item *intent,
- struct xfs_trans *tp)
+ struct xfs_log_item *done_item)
{
- struct xfs_attrd_log_item *attrdp;
struct xfs_attri_log_item *old_attrip;
struct xfs_attri_log_item *new_attrip;
struct xfs_attri_log_format *new_attrp;
@@ -674,10 +638,6 @@ xfs_attri_item_relog(
old_attrip = ATTRI_ITEM(intent);
old_attrp = &old_attrip->attri_format;
- tp->t_flags |= XFS_TRANS_DIRTY;
- attrdp = xfs_trans_get_attrd(tp, old_attrip);
- set_bit(XFS_LI_DIRTY, &attrdp->attrd_item.li_flags);
-
/*
* Create a new log item that shares the same name/value buffer as the
* old log item.
@@ -691,12 +651,43 @@ xfs_attri_item_relog(
new_attrp->alfi_name_len = old_attrp->alfi_name_len;
new_attrp->alfi_attr_filter = old_attrp->alfi_attr_filter;
- xfs_trans_add_item(tp, &new_attrip->attri_item);
- set_bit(XFS_LI_DIRTY, &new_attrip->attri_item.li_flags);
-
return &new_attrip->attri_item;
}
+/* Get an ATTRD so we can process all the attrs. */
+static struct xfs_log_item *
+xfs_attr_create_done(
+ struct xfs_trans *tp,
+ struct xfs_log_item *intent,
+ unsigned int count)
+{
+ struct xfs_attri_log_item *attrip;
+ struct xfs_attrd_log_item *attrdp;
+
+ attrip = ATTRI_ITEM(intent);
+
+ attrdp = kmem_cache_zalloc(xfs_attrd_cache, GFP_NOFS | __GFP_NOFAIL);
+
+ xfs_log_item_init(tp->t_mountp, &attrdp->attrd_item, XFS_LI_ATTRD,
+ &xfs_attrd_item_ops);
+ attrdp->attrd_attrip = attrip;
+ attrdp->attrd_format.alfd_alf_id = attrip->attri_format.alfi_id;
+
+ return &attrdp->attrd_item;
+}
+
+const struct xfs_defer_op_type xfs_attr_defer_type = {
+ .name = "attr",
+ .max_items = 1,
+ .create_intent = xfs_attr_create_intent,
+ .abort_intent = xfs_attr_abort_intent,
+ .create_done = xfs_attr_create_done,
+ .finish_item = xfs_attr_finish_item,
+ .cancel_item = xfs_attr_cancel_item,
+ .recover_work = xfs_attr_recover_work,
+ .relog_intent = xfs_attr_relog_intent,
+};
+
STATIC int
xlog_recover_attri_commit_pass2(
struct xlog *log,
@@ -767,63 +758,13 @@ xlog_recover_attri_commit_pass2(
attrip = xfs_attri_init(mp, nv);
memcpy(&attrip->attri_format, attri_formatp, len);
- /*
- * The ATTRI has two references. One for the ATTRD and one for ATTRI to
- * ensure it makes it into the AIL. Insert the ATTRI into the AIL
- * directly and drop the ATTRI reference. Note that
- * xfs_trans_ail_update() drops the AIL lock.
- */
- xfs_trans_ail_insert(log->l_ailp, &attrip->attri_item, lsn);
- xfs_attri_release(attrip);
+ xlog_recover_intent_item(log, &attrip->attri_item, lsn,
+ &xfs_attr_defer_type);
xfs_attri_log_nameval_put(nv);
return 0;
}
/*
- * This routine is called to allocate an "attr free done" log item.
- */
-static struct xfs_attrd_log_item *
-xfs_trans_get_attrd(struct xfs_trans *tp,
- struct xfs_attri_log_item *attrip)
-{
- struct xfs_attrd_log_item *attrdp;
-
- ASSERT(tp != NULL);
-
- attrdp = kmem_cache_zalloc(xfs_attrd_cache, GFP_NOFS | __GFP_NOFAIL);
-
- xfs_log_item_init(tp->t_mountp, &attrdp->attrd_item, XFS_LI_ATTRD,
- &xfs_attrd_item_ops);
- attrdp->attrd_attrip = attrip;
- attrdp->attrd_format.alfd_alf_id = attrip->attri_format.alfi_id;
-
- xfs_trans_add_item(tp, &attrdp->attrd_item);
- return attrdp;
-}
-
-/* Get an ATTRD so we can process all the attrs. */
-static struct xfs_log_item *
-xfs_attr_create_done(
- struct xfs_trans *tp,
- struct xfs_log_item *intent,
- unsigned int count)
-{
- if (!intent)
- return NULL;
-
- return &xfs_trans_get_attrd(tp, ATTRI_ITEM(intent))->attrd_item;
-}
-
-const struct xfs_defer_op_type xfs_attr_defer_type = {
- .max_items = 1,
- .create_intent = xfs_attr_create_intent,
- .abort_intent = xfs_attr_abort_intent,
- .create_done = xfs_attr_create_done,
- .finish_item = xfs_attr_finish_item,
- .cancel_item = xfs_attr_cancel_item,
-};
-
-/*
* This routine is called when an ATTRD format structure is found in a committed
* transaction in the log. Its purpose is to cancel the corresponding ATTRI if
* it was still in the log. To do this it searches the AIL for the ATTRI with
@@ -857,9 +798,7 @@ static const struct xfs_item_ops xfs_attri_item_ops = {
.iop_format = xfs_attri_item_format,
.iop_unpin = xfs_attri_item_unpin,
.iop_release = xfs_attri_item_release,
- .iop_recover = xfs_attri_item_recover,
.iop_match = xfs_attri_item_match,
- .iop_relog = xfs_attri_item_relog,
};
const struct xlog_recover_item_ops xlog_attri_item_ops = {
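The hunks above convert ATTRI recovery to the deferred-ops framework: xfs_attri_recover_work() now returns either a work item or an errno encoded in the pointer via ERR_PTR(), which the caller unpacks with IS_ERR()/PTR_ERR(). A minimal userspace sketch of that encode/decode convention, assuming only standard C (the demo_* names are illustrative, not kernel API):

#include <stdio.h>
#include <stdlib.h>

#define DEMO_MAX_ERRNO	4095

/* Stuff a small negative errno into an otherwise-invalid pointer value. */
static void *demo_err_ptr(long error)
{
	return (void *)error;
}

static long demo_ptr_err(const void *ptr)
{
	return (long)ptr;
}

/* The top page of the address space is reserved for encoded errnos. */
static int demo_is_err(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-DEMO_MAX_ERRNO;
}

/* Allocate a work item, or hand back an encoded errno on failure. */
static void *demo_recover_work(int simulate_failure)
{
	if (simulate_failure)
		return demo_err_ptr(-117);	/* like -EFSCORRUPTED */
	return malloc(64);
}

int main(void)
{
	void *work = demo_recover_work(1);

	if (demo_is_err(work)) {
		printf("recovery failed: %ld\n", demo_ptr_err(work));
		return 1;
	}
	free(work);
	return 0;
}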
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index 99bbbe1a0e44..e368ad671e26 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -56,14 +56,13 @@ xfs_attr_shortform_list(
struct xfs_attrlist_cursor_kern *cursor = &context->cursor;
struct xfs_inode *dp = context->dp;
struct xfs_attr_sf_sort *sbuf, *sbp;
- struct xfs_attr_shortform *sf;
+ struct xfs_attr_sf_hdr *sf = dp->i_af.if_data;
struct xfs_attr_sf_entry *sfe;
int sbsize, nsbuf, count, i;
int error = 0;
- sf = (struct xfs_attr_shortform *)dp->i_af.if_u1.if_data;
ASSERT(sf != NULL);
- if (!sf->hdr.count)
+ if (!sf->count)
return 0;
trace_xfs_attr_list_sf(context);
@@ -79,8 +78,8 @@ xfs_attr_shortform_list(
*/
if (context->bufsize == 0 ||
(XFS_ISRESET_CURSOR(cursor) &&
- (dp->i_af.if_bytes + sf->hdr.count * 16) < context->bufsize)) {
- for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
+ (dp->i_af.if_bytes + sf->count * 16) < context->bufsize)) {
+ for (i = 0, sfe = xfs_attr_sf_firstentry(sf); i < sf->count; i++) {
if (XFS_IS_CORRUPT(context->dp->i_mount,
!xfs_attr_namecheck(sfe->nameval,
sfe->namelen)))
@@ -109,7 +108,7 @@ xfs_attr_shortform_list(
/*
* It didn't all fit, so we have to sort everything on hashval.
*/
- sbsize = sf->hdr.count * sizeof(*sbuf);
+ sbsize = sf->count * sizeof(*sbuf);
sbp = sbuf = kmem_alloc(sbsize, KM_NOFS);
/*
@@ -117,7 +116,7 @@ xfs_attr_shortform_list(
* the relevant info from only those that match into a buffer.
*/
nsbuf = 0;
- for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
+ for (i = 0, sfe = xfs_attr_sf_firstentry(sf); i < sf->count; i++) {
if (unlikely(
((char *)sfe < (char *)sf) ||
((char *)sfe >= ((char *)sf + dp->i_af.if_bytes)))) {
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index e736a0844c89..52fb8a148b7d 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -221,51 +221,6 @@ static const struct xfs_item_ops xfs_bud_item_ops = {
.iop_intent = xfs_bud_item_intent,
};
-static struct xfs_bud_log_item *
-xfs_trans_get_bud(
- struct xfs_trans *tp,
- struct xfs_bui_log_item *buip)
-{
- struct xfs_bud_log_item *budp;
-
- budp = kmem_cache_zalloc(xfs_bud_cache, GFP_KERNEL | __GFP_NOFAIL);
- xfs_log_item_init(tp->t_mountp, &budp->bud_item, XFS_LI_BUD,
- &xfs_bud_item_ops);
- budp->bud_buip = buip;
- budp->bud_format.bud_bui_id = buip->bui_format.bui_id;
-
- xfs_trans_add_item(tp, &budp->bud_item);
- return budp;
-}
-
-/*
- * Finish an bmap update and log it to the BUD. Note that the
- * transaction is marked dirty regardless of whether the bmap update
- * succeeds or fails to support the BUI/BUD lifecycle rules.
- */
-static int
-xfs_trans_log_finish_bmap_update(
- struct xfs_trans *tp,
- struct xfs_bud_log_item *budp,
- struct xfs_bmap_intent *bi)
-{
- int error;
-
- error = xfs_bmap_finish_one(tp, bi);
-
- /*
- * Mark the transaction dirty, even on error. This ensures the
- * transaction is aborted, which:
- *
- * 1.) releases the BUI and frees the BUD
- * 2.) shuts down the filesystem
- */
- tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE;
- set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags);
-
- return error;
-}
-
/* Sort bmap intents by inode. */
static int
xfs_bmap_update_diff_items(
@@ -314,9 +269,6 @@ xfs_bmap_update_log_item(
uint next_extent;
struct xfs_map_extent *map;
- tp->t_flags |= XFS_TRANS_DIRTY;
- set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags);
-
/*
* atomic_inc_return gives us the value after the increment;
* we want to use it as an array index so we need to subtract 1 from
@@ -346,7 +298,6 @@ xfs_bmap_update_create_intent(
ASSERT(count == XFS_BUI_MAX_FAST_EXTENTS);
- xfs_trans_add_item(tp, &buip->bui_item);
if (sort)
list_sort(mp, items, xfs_bmap_update_diff_items);
list_for_each_entry(bi, items, bi_list)
@@ -354,14 +305,23 @@ xfs_bmap_update_create_intent(
return &buip->bui_item;
}
-/* Get an BUD so we can process all the deferred rmap updates. */
+/* Get a BUD so we can process all the deferred bmap updates. */
static struct xfs_log_item *
xfs_bmap_update_create_done(
struct xfs_trans *tp,
struct xfs_log_item *intent,
unsigned int count)
{
- return &xfs_trans_get_bud(tp, BUI_ITEM(intent))->bud_item;
+ struct xfs_bui_log_item *buip = BUI_ITEM(intent);
+ struct xfs_bud_log_item *budp;
+
+ budp = kmem_cache_zalloc(xfs_bud_cache, GFP_KERNEL | __GFP_NOFAIL);
+ xfs_log_item_init(tp->t_mountp, &budp->bud_item, XFS_LI_BUD,
+ &xfs_bud_item_ops);
+ budp->bud_buip = buip;
+ budp->bud_format.bud_bui_id = buip->bui_format.bui_id;
+
+ return &budp->bud_item;
}
/* Take a passive ref to the AG containing the space we're mapping. */
@@ -392,7 +352,7 @@ xfs_bmap_update_put_group(
xfs_perag_intent_put(bi->bi_pag);
}
-/* Process a deferred rmap update. */
+/* Process a deferred bmap update. */
STATIC int
xfs_bmap_update_finish_item(
struct xfs_trans *tp,
@@ -405,7 +365,7 @@ xfs_bmap_update_finish_item(
bi = container_of(item, struct xfs_bmap_intent, bi_list);
- error = xfs_trans_log_finish_bmap_update(tp, BUD_ITEM(done), bi);
+ error = xfs_bmap_finish_one(tp, bi);
if (!error && bi->bi_bmap.br_blockcount > 0) {
ASSERT(bi->bi_type == XFS_BMAP_UNMAP);
return -EAGAIN;
@@ -437,15 +397,6 @@ xfs_bmap_update_cancel_item(
kmem_cache_free(xfs_bmap_intent_cache, bi);
}
-const struct xfs_defer_op_type xfs_bmap_update_defer_type = {
- .max_items = XFS_BUI_MAX_FAST_EXTENTS,
- .create_intent = xfs_bmap_update_create_intent,
- .abort_intent = xfs_bmap_update_abort_intent,
- .create_done = xfs_bmap_update_create_done,
- .finish_item = xfs_bmap_update_finish_item,
- .cancel_item = xfs_bmap_update_cancel_item,
-};
-
/* Is this recovered BUI ok? */
static inline bool
xfs_bui_validate(
@@ -480,23 +431,53 @@ xfs_bui_validate(
return xfs_verify_fsbext(mp, map->me_startblock, map->me_len);
}
+static inline struct xfs_bmap_intent *
+xfs_bui_recover_work(
+ struct xfs_mount *mp,
+ struct xfs_defer_pending *dfp,
+ struct xfs_inode **ipp,
+ struct xfs_map_extent *map)
+{
+ struct xfs_bmap_intent *bi;
+ int error;
+
+ error = xlog_recover_iget(mp, map->me_owner, ipp);
+ if (error)
+ return ERR_PTR(error);
+
+ bi = kmem_cache_zalloc(xfs_bmap_intent_cache, GFP_NOFS | __GFP_NOFAIL);
+ bi->bi_whichfork = (map->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ?
+ XFS_ATTR_FORK : XFS_DATA_FORK;
+ bi->bi_type = map->me_flags & XFS_BMAP_EXTENT_TYPE_MASK;
+ bi->bi_bmap.br_startblock = map->me_startblock;
+ bi->bi_bmap.br_startoff = map->me_startoff;
+ bi->bi_bmap.br_blockcount = map->me_len;
+ bi->bi_bmap.br_state = (map->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ?
+ XFS_EXT_UNWRITTEN : XFS_EXT_NORM;
+ bi->bi_owner = *ipp;
+ xfs_bmap_update_get_group(mp, bi);
+
+ xfs_defer_add_item(dfp, &bi->bi_list);
+ return bi;
+}
+
/*
* Process a bmap update intent item that was recovered from the log.
* We need to update some inode's bmbt.
*/
STATIC int
-xfs_bui_item_recover(
- struct xfs_log_item *lip,
+xfs_bmap_recover_work(
+ struct xfs_defer_pending *dfp,
struct list_head *capture_list)
{
- struct xfs_bmap_intent fake = { };
struct xfs_trans_res resv;
+ struct xfs_log_item *lip = dfp->dfp_intent;
struct xfs_bui_log_item *buip = BUI_ITEM(lip);
struct xfs_trans *tp;
struct xfs_inode *ip = NULL;
struct xfs_mount *mp = lip->li_log->l_mp;
struct xfs_map_extent *map;
- struct xfs_bud_log_item *budp;
+ struct xfs_bmap_intent *work;
int iext_delta;
int error = 0;
@@ -507,13 +488,9 @@ xfs_bui_item_recover(
}
map = &buip->bui_format.bui_extents[0];
- fake.bi_whichfork = (map->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ?
- XFS_ATTR_FORK : XFS_DATA_FORK;
- fake.bi_type = map->me_flags & XFS_BMAP_EXTENT_TYPE_MASK;
-
- error = xlog_recover_iget(mp, map->me_owner, &ip);
- if (error)
- return error;
+ work = xfs_bui_recover_work(mp, dfp, &ip, map);
+ if (IS_ERR(work))
+ return PTR_ERR(work);
/* Allocate transaction and do the work. */
resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate);
@@ -522,42 +499,27 @@ xfs_bui_item_recover(
if (error)
goto err_rele;
- budp = xfs_trans_get_bud(tp, buip);
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
- if (fake.bi_type == XFS_BMAP_MAP)
+ if (work->bi_type == XFS_BMAP_MAP)
iext_delta = XFS_IEXT_ADD_NOSPLIT_CNT;
else
iext_delta = XFS_IEXT_PUNCH_HOLE_CNT;
- error = xfs_iext_count_may_overflow(ip, fake.bi_whichfork, iext_delta);
+ error = xfs_iext_count_may_overflow(ip, work->bi_whichfork, iext_delta);
if (error == -EFBIG)
error = xfs_iext_count_upgrade(tp, ip, iext_delta);
if (error)
goto err_cancel;
- fake.bi_owner = ip;
- fake.bi_bmap.br_startblock = map->me_startblock;
- fake.bi_bmap.br_startoff = map->me_startoff;
- fake.bi_bmap.br_blockcount = map->me_len;
- fake.bi_bmap.br_state = (map->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ?
- XFS_EXT_UNWRITTEN : XFS_EXT_NORM;
-
- xfs_bmap_update_get_group(mp, &fake);
- error = xfs_trans_log_finish_bmap_update(tp, budp, &fake);
+ error = xlog_recover_finish_intent(tp, dfp);
if (error == -EFSCORRUPTED)
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, map,
- sizeof(*map));
- xfs_bmap_update_put_group(&fake);
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ &buip->bui_format, sizeof(buip->bui_format));
if (error)
goto err_cancel;
- if (fake.bi_bmap.br_blockcount > 0) {
- ASSERT(fake.bi_type == XFS_BMAP_UNMAP);
- xfs_bmap_unmap_extent(tp, ip, &fake.bi_bmap);
- }
-
/*
* Commit transaction, which frees the transaction and saves the inode
* for later replay activities.
@@ -579,21 +541,13 @@ err_rele:
return error;
}
-STATIC bool
-xfs_bui_item_match(
- struct xfs_log_item *lip,
- uint64_t intent_id)
-{
- return BUI_ITEM(lip)->bui_format.bui_id == intent_id;
-}
-
/* Relog an intent item to push the log tail forward. */
static struct xfs_log_item *
-xfs_bui_item_relog(
+xfs_bmap_relog_intent(
+ struct xfs_trans *tp,
struct xfs_log_item *intent,
- struct xfs_trans *tp)
+ struct xfs_log_item *done_item)
{
- struct xfs_bud_log_item *budp;
struct xfs_bui_log_item *buip;
struct xfs_map_extent *map;
unsigned int count;
@@ -601,27 +555,40 @@ xfs_bui_item_relog(
count = BUI_ITEM(intent)->bui_format.bui_nextents;
map = BUI_ITEM(intent)->bui_format.bui_extents;
- tp->t_flags |= XFS_TRANS_DIRTY;
- budp = xfs_trans_get_bud(tp, BUI_ITEM(intent));
- set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags);
-
buip = xfs_bui_init(tp->t_mountp);
memcpy(buip->bui_format.bui_extents, map, count * sizeof(*map));
atomic_set(&buip->bui_next_extent, count);
- xfs_trans_add_item(tp, &buip->bui_item);
- set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags);
+
return &buip->bui_item;
}
+const struct xfs_defer_op_type xfs_bmap_update_defer_type = {
+ .name = "bmap",
+ .max_items = XFS_BUI_MAX_FAST_EXTENTS,
+ .create_intent = xfs_bmap_update_create_intent,
+ .abort_intent = xfs_bmap_update_abort_intent,
+ .create_done = xfs_bmap_update_create_done,
+ .finish_item = xfs_bmap_update_finish_item,
+ .cancel_item = xfs_bmap_update_cancel_item,
+ .recover_work = xfs_bmap_recover_work,
+ .relog_intent = xfs_bmap_relog_intent,
+};
+
+STATIC bool
+xfs_bui_item_match(
+ struct xfs_log_item *lip,
+ uint64_t intent_id)
+{
+ return BUI_ITEM(lip)->bui_format.bui_id == intent_id;
+}
+
static const struct xfs_item_ops xfs_bui_item_ops = {
.flags = XFS_ITEM_INTENT,
.iop_size = xfs_bui_item_size,
.iop_format = xfs_bui_item_format,
.iop_unpin = xfs_bui_item_unpin,
.iop_release = xfs_bui_item_release,
- .iop_recover = xfs_bui_item_recover,
.iop_match = xfs_bui_item_match,
- .iop_relog = xfs_bui_item_relog,
};
static inline void
@@ -681,12 +648,9 @@ xlog_recover_bui_commit_pass2(
buip = xfs_bui_init(mp);
xfs_bui_copy_format(&buip->bui_format, bui_formatp);
atomic_set(&buip->bui_next_extent, bui_formatp->bui_nextents);
- /*
- * Insert the intent into the AIL directly and drop one reference so
- * that finishing or canceling the work will drop the other.
- */
- xfs_trans_ail_insert(log->l_ailp, &buip->bui_item, lsn);
- xfs_bui_release(buip);
+
+ xlog_recover_intent_item(log, &buip->bui_item, lsn,
+ &xfs_bmap_update_defer_type);
return 0;
}
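Throughout these conversions the finish_item callbacks receive a struct list_head and map it back to the containing intent with container_of(), as xfs_bmap_update_finish_item() does above. A self-contained sketch of that pointer arithmetic, using illustrative demo_* names:

#include <stddef.h>
#include <stdio.h>

#define demo_container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct demo_list_head {
	struct demo_list_head *next, *prev;
};

struct demo_bmap_intent {
	int			bi_type;
	struct demo_list_head	bi_list;	/* linked into the defer list */
};

static int demo_finish_item(struct demo_list_head *item)
{
	/* Recover the intent that embeds this list_head. */
	struct demo_bmap_intent *bi =
		demo_container_of(item, struct demo_bmap_intent, bi_list);

	printf("finishing intent of type %d\n", bi->bi_type);
	return 0;
}

int main(void)
{
	struct demo_bmap_intent bi = { .bi_type = 1 };

	return demo_finish_item(&bi.bi_list);
}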
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 731260a5af6d..c2531c28905c 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -69,147 +69,6 @@ xfs_zero_extent(
GFP_NOFS, 0);
}
-#ifdef CONFIG_XFS_RT
-int
-xfs_bmap_rtalloc(
- struct xfs_bmalloca *ap)
-{
- struct xfs_mount *mp = ap->ip->i_mount;
- xfs_fileoff_t orig_offset = ap->offset;
- xfs_rtxnum_t rtx;
- xfs_rtxlen_t prod = 0; /* product factor for allocators */
- xfs_extlen_t mod = 0; /* product factor for allocators */
- xfs_rtxlen_t ralen = 0; /* realtime allocation length */
- xfs_extlen_t align; /* minimum allocation alignment */
- xfs_extlen_t orig_length = ap->length;
- xfs_extlen_t minlen = mp->m_sb.sb_rextsize;
- xfs_rtxlen_t raminlen;
- bool rtlocked = false;
- bool ignore_locality = false;
- int error;
-
- align = xfs_get_extsz_hint(ap->ip);
-retry:
- prod = xfs_extlen_to_rtxlen(mp, align);
- error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
- align, 1, ap->eof, 0,
- ap->conv, &ap->offset, &ap->length);
- if (error)
- return error;
- ASSERT(ap->length);
- ASSERT(xfs_extlen_to_rtxmod(mp, ap->length) == 0);
-
- /*
- * If we shifted the file offset downward to satisfy an extent size
- * hint, increase minlen by that amount so that the allocator won't
- * give us an allocation that's too short to cover at least one of the
- * blocks that the caller asked for.
- */
- if (ap->offset != orig_offset)
- minlen += orig_offset - ap->offset;
-
- /*
- * If the offset & length are not perfectly aligned
- * then kill prod, it will just get us in trouble.
- */
- div_u64_rem(ap->offset, align, &mod);
- if (mod || ap->length % align)
- prod = 1;
- /*
- * Set ralen to be the actual requested length in rtextents.
- *
- * If the old value was close enough to XFS_BMBT_MAX_EXTLEN that
- * we rounded up to it, cut it back so it's valid again.
- * Note that if it's a really large request (bigger than
- * XFS_BMBT_MAX_EXTLEN), we don't hear about that number, and can't
- * adjust the starting point to match it.
- */
- ralen = xfs_extlen_to_rtxlen(mp, min(ap->length, XFS_MAX_BMBT_EXTLEN));
-
- /*
- * Lock out modifications to both the RT bitmap and summary inodes
- */
- if (!rtlocked) {
- xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL|XFS_ILOCK_RTBITMAP);
- xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL);
- xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL|XFS_ILOCK_RTSUM);
- xfs_trans_ijoin(ap->tp, mp->m_rsumip, XFS_ILOCK_EXCL);
- rtlocked = true;
- }
-
- /*
- * If it's an allocation to an empty file at offset 0,
- * pick an extent that will space things out in the rt area.
- */
- if (ap->eof && ap->offset == 0) {
- error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx);
- if (error)
- return error;
- ap->blkno = xfs_rtx_to_rtb(mp, rtx);
- } else {
- ap->blkno = 0;
- }
-
- xfs_bmap_adjacent(ap);
-
- /*
- * Realtime allocation, done through xfs_rtallocate_extent.
- */
- if (ignore_locality)
- rtx = 0;
- else
- rtx = xfs_rtb_to_rtx(mp, ap->blkno);
- raminlen = max_t(xfs_rtxlen_t, 1, xfs_extlen_to_rtxlen(mp, minlen));
- error = xfs_rtallocate_extent(ap->tp, rtx, raminlen, ralen, &ralen,
- ap->wasdel, prod, &rtx);
- if (error)
- return error;
-
- if (rtx != NULLRTEXTNO) {
- ap->blkno = xfs_rtx_to_rtb(mp, rtx);
- ap->length = xfs_rtxlen_to_extlen(mp, ralen);
- ap->ip->i_nblocks += ap->length;
- xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
- if (ap->wasdel)
- ap->ip->i_delayed_blks -= ap->length;
- /*
- * Adjust the disk quota also. This was reserved
- * earlier.
- */
- xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
- ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT :
- XFS_TRANS_DQ_RTBCOUNT, ap->length);
- return 0;
- }
-
- if (align > mp->m_sb.sb_rextsize) {
- /*
- * We previously enlarged the request length to try to satisfy
- * an extent size hint. The allocator didn't return anything,
- * so reset the parameters to the original values and try again
- * without alignment criteria.
- */
- ap->offset = orig_offset;
- ap->length = orig_length;
- minlen = align = mp->m_sb.sb_rextsize;
- goto retry;
- }
-
- if (!ignore_locality && ap->blkno != 0) {
- /*
- * If we can't allocate near a specific rt extent, try again
- * without locality criteria.
- */
- ignore_locality = true;
- goto retry;
- }
-
- ap->blkno = NULLFSBLOCK;
- ap->length = 0;
- return 0;
-}
-#endif /* CONFIG_XFS_RT */
-
/*
* Extent tree block counting routines.
*/
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index 6888078f5c31..77ecbb753ef2 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -47,7 +47,7 @@ int xfs_bmap_extsize_align(struct xfs_mount *mp, struct xfs_bmbt_irec *gotp,
struct xfs_bmbt_irec *prevp, xfs_extlen_t extsz,
int rt, int eof, int delay, int convert,
xfs_fileoff_t *offp, xfs_extlen_t *lenp);
-void xfs_bmap_adjacent(struct xfs_bmalloca *ap);
+bool xfs_bmap_adjacent(struct xfs_bmalloca *ap);
int xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip,
int whichfork, struct xfs_bmbt_irec *rec,
int *is_empty);
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 545c7991b9b5..8e5bd50d29fe 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -169,7 +169,7 @@ xfs_buf_stale(
atomic_set(&bp->b_lru_ref, 0);
if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
- (list_lru_del(&bp->b_target->bt_lru, &bp->b_lru)))
+ (list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru)))
atomic_dec(&bp->b_hold);
ASSERT(atomic_read(&bp->b_hold) >= 1);
@@ -1047,7 +1047,7 @@ xfs_buf_rele(
* buffer for the LRU and clear the (now stale) dispose list
* state flag
*/
- if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
+ if (list_lru_add_obj(&bp->b_target->bt_lru, &bp->b_lru)) {
bp->b_state &= ~XFS_BSTATE_DISPOSE;
atomic_inc(&bp->b_hold);
}
@@ -1060,7 +1060,7 @@ xfs_buf_rele(
* was on was the disposal list
*/
if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
- list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
+ list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru);
} else {
ASSERT(list_empty(&bp->b_lru));
}
@@ -2049,6 +2049,14 @@ error_free:
return NULL;
}
+static inline void
+xfs_buf_list_del(
+ struct xfs_buf *bp)
+{
+ list_del_init(&bp->b_list);
+ wake_up_var(&bp->b_list);
+}
+
/*
* Cancel a delayed write list.
*
@@ -2066,7 +2074,7 @@ xfs_buf_delwri_cancel(
xfs_buf_lock(bp);
bp->b_flags &= ~_XBF_DELWRI_Q;
- list_del_init(&bp->b_list);
+ xfs_buf_list_del(bp);
xfs_buf_relse(bp);
}
}
@@ -2120,6 +2128,34 @@ xfs_buf_delwri_queue(
}
/*
+ * Queue a buffer to this delwri list as part of a data integrity operation.
+ * If the buffer is on any other delwri list, we'll wait for that to clear
+ * so that the caller can submit the buffer for IO and wait for the result.
+ * Callers must ensure the buffer is not already on the list.
+ */
+void
+xfs_buf_delwri_queue_here(
+ struct xfs_buf *bp,
+ struct list_head *buffer_list)
+{
+ /*
+ * We need this buffer to end up on the /caller's/ delwri list, not any
+ * old list. This can happen if the buffer is marked stale (which
+ * clears DELWRI_Q) after the AIL queues the buffer to its list but
+ * before the AIL has a chance to submit the list.
+ */
+ while (!list_empty(&bp->b_list)) {
+ xfs_buf_unlock(bp);
+ wait_var_event(&bp->b_list, list_empty(&bp->b_list));
+ xfs_buf_lock(bp);
+ }
+
+ ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
+
+ xfs_buf_delwri_queue(bp, buffer_list);
+}
+
+/*
* Compare function is more complex than it needs to be because
* the return value is only 32 bits and we are doing comparisons
* on 64 bit values
@@ -2181,7 +2217,7 @@ xfs_buf_delwri_submit_buffers(
* reference and remove it from the list here.
*/
if (!(bp->b_flags & _XBF_DELWRI_Q)) {
- list_del_init(&bp->b_list);
+ xfs_buf_list_del(bp);
xfs_buf_relse(bp);
continue;
}
@@ -2201,7 +2237,7 @@ xfs_buf_delwri_submit_buffers(
list_move_tail(&bp->b_list, wait_list);
} else {
bp->b_flags |= XBF_ASYNC;
- list_del_init(&bp->b_list);
+ xfs_buf_list_del(bp);
}
__xfs_buf_submit(bp, false);
}
@@ -2255,7 +2291,7 @@ xfs_buf_delwri_submit(
while (!list_empty(&wait_list)) {
bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
- list_del_init(&bp->b_list);
+ xfs_buf_list_del(bp);
/*
* Wait on the locked buffer, check for errors and unlock and
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index c86e16419656..b470de08a46c 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -319,6 +319,7 @@ extern void xfs_buf_stale(struct xfs_buf *bp);
/* Delayed Write Buffer Routines */
extern void xfs_buf_delwri_cancel(struct list_head *);
extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *);
+void xfs_buf_delwri_queue_here(struct xfs_buf *bp, struct list_head *bl);
extern int xfs_buf_delwri_submit(struct list_head *);
extern int xfs_buf_delwri_submit_nowait(struct list_head *);
extern int xfs_buf_delwri_pushbuf(struct xfs_buf *, struct list_head *);
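xfs_buf_delwri_queue_here() pairs with the wake_up_var() call in the new xfs_buf_list_del() helper: waiters sleep until the buffer drops off whatever delwri list currently owns it. A rough pthread rendition of that wait-for-condition handshake, treating a mutex/condvar pair as a stand-in for wait_var_event()/wake_up_var():

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static int on_other_list = 1;

/* Analogue of xfs_buf_list_del(): take the buffer off and wake waiters. */
static void *demo_list_del(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	on_other_list = 0;
	pthread_cond_broadcast(&cond);
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, demo_list_del, NULL);

	/* Analogue of wait_var_event(&bp->b_list, list_empty(&bp->b_list)). */
	pthread_mutex_lock(&lock);
	while (on_other_list)
		pthread_cond_wait(&cond, &lock);
	pthread_mutex_unlock(&lock);

	pthread_join(t, NULL);
	puts("buffer is free to join the caller's delwri list");
	return 0;
}

Build with -pthread. The kernel variant additionally drops and retakes the buffer lock around the wait, as the comment in the hunk explains.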
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index 9f3ceb461515..cc6dc56f455d 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -18,6 +18,7 @@
#include "xfs_bmap.h"
#include "xfs_trans.h"
#include "xfs_error.h"
+#include "xfs_health.h"
/*
* Directory file type support functions
@@ -51,7 +52,7 @@ xfs_dir2_sf_getdents(
struct xfs_mount *mp = dp->i_mount;
xfs_dir2_dataptr_t off; /* current entry's offset */
xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
- xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
+ struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data;
xfs_dir2_dataptr_t dot_offset;
xfs_dir2_dataptr_t dotdot_offset;
xfs_ino_t ino;
@@ -59,9 +60,7 @@ xfs_dir2_sf_getdents(
ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL);
ASSERT(dp->i_df.if_bytes == dp->i_disk_size);
- ASSERT(dp->i_df.if_u1.if_data != NULL);
-
- sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ ASSERT(sfp != NULL);
/*
* If the block number in the offset is out of range, we're done.
@@ -519,6 +518,8 @@ xfs_readdir(
if (xfs_is_shutdown(dp->i_mount))
return -EIO;
+ if (xfs_ifork_zapped(dp, XFS_DATA_FORK))
+ return -EIO;
ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
ASSERT(xfs_isilocked(dp, XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index a013b87ab8d5..b4f20d9c8f98 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -172,14 +172,14 @@ xfs_qm_adjust_dqtimers(
/*
* initialize a buffer full of dquots and log the whole thing
*/
-STATIC void
+void
xfs_qm_init_dquot_blk(
struct xfs_trans *tp,
- struct xfs_mount *mp,
xfs_dqid_t id,
xfs_dqtype_t type,
struct xfs_buf *bp)
{
+ struct xfs_mount *mp = tp->t_mountp;
struct xfs_quotainfo *q = mp->m_quotainfo;
struct xfs_dqblk *d;
xfs_dqid_t curid;
@@ -353,7 +353,7 @@ xfs_dquot_disk_alloc(
* Make a chunk of dquots out of this buffer and log
* the entire thing.
*/
- xfs_qm_init_dquot_blk(tp, mp, dqp->q_id, qtype, bp);
+ xfs_qm_init_dquot_blk(tp, dqp->q_id, qtype, bp);
xfs_buf_set_ref(bp, XFS_DQUOT_REF);
/*
@@ -1065,7 +1065,7 @@ xfs_qm_dqput(
struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo;
trace_xfs_dqput_free(dqp);
- if (list_lru_add(&qi->qi_lru, &dqp->q_lru))
+ if (list_lru_add_obj(&qi->qi_lru, &dqp->q_lru))
XFS_STATS_INC(dqp->q_mount, xs_qm_dquot_unused);
}
xfs_dqunlock(dqp);
@@ -1362,34 +1362,3 @@ xfs_qm_exit(void)
kmem_cache_destroy(xfs_dqtrx_cache);
kmem_cache_destroy(xfs_dquot_cache);
}
-
-/*
- * Iterate every dquot of a particular type. The caller must ensure that the
- * particular quota type is active. iter_fn can return negative error codes,
- * or -ECANCELED to indicate that it wants to stop iterating.
- */
-int
-xfs_qm_dqiterate(
- struct xfs_mount *mp,
- xfs_dqtype_t type,
- xfs_qm_dqiterate_fn iter_fn,
- void *priv)
-{
- struct xfs_dquot *dq;
- xfs_dqid_t id = 0;
- int error;
-
- do {
- error = xfs_qm_dqget_next(mp, id, type, &dq);
- if (error == -ENOENT)
- return 0;
- if (error)
- return error;
-
- error = iter_fn(dq, type, priv);
- id = dq->q_id + 1;
- xfs_qm_dqput(dq);
- } while (error == 0 && id != 0);
-
- return error;
-}
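The removed xfs_qm_dqiterate() was the usual "walk objects by id and let the callback veto" iterator: the callback may return a negative errno, and -ECANCELED stops the walk and is handed back to the caller. A generic userspace sketch of that shape (demo_* names are illustrative):

#include <errno.h>
#include <stdio.h>

typedef int (*demo_iter_fn)(unsigned int id, void *priv);

static int demo_iterate(unsigned int nr, demo_iter_fn fn, void *priv)
{
	unsigned int id;
	int error = 0;

	for (id = 0; id < nr; id++) {
		error = fn(id, priv);
		if (error)
			break;	/* includes -ECANCELED from the callback */
	}
	return error;
}

static int demo_visit(unsigned int id, void *priv)
{
	(void)priv;
	printf("visiting dquot id %u\n", id);
	return id == 2 ? -ECANCELED : 0;
}

int main(void)
{
	return demo_iterate(10, demo_visit, NULL) == -ECANCELED ? 0 : 1;
}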
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index 80c8f851a2f3..956272d9b302 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -234,12 +234,10 @@ static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp)
return dqp;
}
-typedef int (*xfs_qm_dqiterate_fn)(struct xfs_dquot *dq,
- xfs_dqtype_t type, void *priv);
-int xfs_qm_dqiterate(struct xfs_mount *mp, xfs_dqtype_t type,
- xfs_qm_dqiterate_fn iter_fn, void *priv);
-
time64_t xfs_dquot_set_timeout(struct xfs_mount *mp, time64_t timeout);
time64_t xfs_dquot_set_grace_period(time64_t grace);
+void xfs_qm_init_dquot_blk(struct xfs_trans *tp, xfs_dqid_t id, xfs_dqtype_t
+ type, struct xfs_buf *bp);
+
#endif /* __XFS_DQUOT_H__ */
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
index 9ecfdcdc752f..2ccde32c9a9e 100644
--- a/fs/xfs/xfs_extent_busy.c
+++ b/fs/xfs/xfs_extent_busy.c
@@ -678,3 +678,16 @@ xfs_extent_busy_ag_cmp(
diff = b1->bno - b2->bno;
return diff;
}
+
+/* Are there any busy extents in this AG? */
+bool
+xfs_extent_busy_list_empty(
+ struct xfs_perag *pag)
+{
+ bool res;
+
+ spin_lock(&pag->pagb_lock);
+ res = RB_EMPTY_ROOT(&pag->pagb_tree);
+ spin_unlock(&pag->pagb_lock);
+ return res;
+}
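The new xfs_extent_busy_list_empty() is the classic "sample a predicate under the lock" helper. A pthread rendition of the same shape, with illustrative demo_* names and a counter standing in for the pagb_tree rbtree:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct demo_perag {
	pthread_mutex_t	lock;		/* stands in for pagb_lock */
	int		nr_busy;	/* stands in for the rbtree */
};

static bool demo_busy_list_empty(struct demo_perag *pag)
{
	bool res;

	/* Take the lock so the answer is coherent at the time of the check. */
	pthread_mutex_lock(&pag->lock);
	res = pag->nr_busy == 0;
	pthread_mutex_unlock(&pag->lock);
	return res;
}

int main(void)
{
	struct demo_perag pag = { PTHREAD_MUTEX_INITIALIZER, 0 };

	printf("empty: %d\n", demo_busy_list_empty(&pag));
	return 0;
}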
diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h
index 0639aab336f3..470032de3139 100644
--- a/fs/xfs/xfs_extent_busy.h
+++ b/fs/xfs/xfs_extent_busy.h
@@ -85,4 +85,6 @@ static inline void xfs_extent_busy_sort(struct list_head *list)
list_sort(NULL, list, xfs_extent_busy_ag_cmp);
}
+bool xfs_extent_busy_list_empty(struct xfs_perag *pag);
+
#endif /* __XFS_EXTENT_BUSY_H__ */
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 3fa8789820ad..1d1185fca6a5 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -304,39 +304,6 @@ static const struct xfs_item_ops xfs_efd_item_ops = {
};
/*
- * Allocate an "extent free done" log item that will hold nextents worth of
- * extents. The caller must use all nextents extents, because we are not
- * flexible about this at all.
- */
-static struct xfs_efd_log_item *
-xfs_trans_get_efd(
- struct xfs_trans *tp,
- struct xfs_efi_log_item *efip,
- unsigned int nextents)
-{
- struct xfs_efd_log_item *efdp;
-
- ASSERT(nextents > 0);
-
- if (nextents > XFS_EFD_MAX_FAST_EXTENTS) {
- efdp = kzalloc(xfs_efd_log_item_sizeof(nextents),
- GFP_KERNEL | __GFP_NOFAIL);
- } else {
- efdp = kmem_cache_zalloc(xfs_efd_cache,
- GFP_KERNEL | __GFP_NOFAIL);
- }
-
- xfs_log_item_init(tp->t_mountp, &efdp->efd_item, XFS_LI_EFD,
- &xfs_efd_item_ops);
- efdp->efd_efip = efip;
- efdp->efd_format.efd_nextents = nextents;
- efdp->efd_format.efd_efi_id = efip->efi_format.efi_id;
-
- xfs_trans_add_item(tp, &efdp->efd_item);
- return efdp;
-}
-
-/*
* Fill the EFD with all extents from the EFI when we need to roll the
* transaction and continue with a new EFI.
*
@@ -364,69 +331,6 @@ xfs_efd_from_efi(
efdp->efd_next_extent = efip->efi_format.efi_nextents;
}
-/*
- * Free an extent and log it to the EFD. Note that the transaction is marked
- * dirty regardless of whether the extent free succeeds or fails to support the
- * EFI/EFD lifecycle rules.
- */
-static int
-xfs_trans_free_extent(
- struct xfs_trans *tp,
- struct xfs_efd_log_item *efdp,
- struct xfs_extent_free_item *xefi)
-{
- struct xfs_owner_info oinfo = { };
- struct xfs_mount *mp = tp->t_mountp;
- struct xfs_extent *extp;
- uint next_extent;
- xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp,
- xefi->xefi_startblock);
- int error;
-
- oinfo.oi_owner = xefi->xefi_owner;
- if (xefi->xefi_flags & XFS_EFI_ATTR_FORK)
- oinfo.oi_flags |= XFS_OWNER_INFO_ATTR_FORK;
- if (xefi->xefi_flags & XFS_EFI_BMBT_BLOCK)
- oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK;
-
- trace_xfs_bmap_free_deferred(tp->t_mountp, xefi->xefi_pag->pag_agno, 0,
- agbno, xefi->xefi_blockcount);
-
- error = __xfs_free_extent(tp, xefi->xefi_pag, agbno,
- xefi->xefi_blockcount, &oinfo, xefi->xefi_agresv,
- xefi->xefi_flags & XFS_EFI_SKIP_DISCARD);
-
- /*
- * Mark the transaction dirty, even on error. This ensures the
- * transaction is aborted, which:
- *
- * 1.) releases the EFI and frees the EFD
- * 2.) shuts down the filesystem
- */
- tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE;
- set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags);
-
- /*
- * If we need a new transaction to make progress, the caller will log a
- * new EFI with the current contents. It will also log an EFD to cancel
- * the existing EFI, and so we need to copy all the unprocessed extents
- * in this EFI to the EFD so this works correctly.
- */
- if (error == -EAGAIN) {
- xfs_efd_from_efi(efdp);
- return error;
- }
-
- next_extent = efdp->efd_next_extent;
- ASSERT(next_extent < efdp->efd_format.efd_nextents);
- extp = &(efdp->efd_format.efd_extents[next_extent]);
- extp->ext_start = xefi->xefi_startblock;
- extp->ext_len = xefi->xefi_blockcount;
- efdp->efd_next_extent++;
-
- return error;
-}
-
/* Sort bmap items by AG. */
static int
xfs_extent_free_diff_items(
@@ -453,9 +357,6 @@ xfs_extent_free_log_item(
uint next_extent;
struct xfs_extent *extp;
- tp->t_flags |= XFS_TRANS_DIRTY;
- set_bit(XFS_LI_DIRTY, &efip->efi_item.li_flags);
-
/*
* atomic_inc_return gives us the value after the increment;
* we want to use it as an array index so we need to subtract 1 from
@@ -481,7 +382,6 @@ xfs_extent_free_create_intent(
ASSERT(count > 0);
- xfs_trans_add_item(tp, &efip->efi_item);
if (sort)
list_sort(mp, items, xfs_extent_free_diff_items);
list_for_each_entry(xefi, items, xefi_list)
@@ -496,7 +396,26 @@ xfs_extent_free_create_done(
struct xfs_log_item *intent,
unsigned int count)
{
- return &xfs_trans_get_efd(tp, EFI_ITEM(intent), count)->efd_item;
+ struct xfs_efi_log_item *efip = EFI_ITEM(intent);
+ struct xfs_efd_log_item *efdp;
+
+ ASSERT(count > 0);
+
+ if (count > XFS_EFD_MAX_FAST_EXTENTS) {
+ efdp = kzalloc(xfs_efd_log_item_sizeof(count),
+ GFP_KERNEL | __GFP_NOFAIL);
+ } else {
+ efdp = kmem_cache_zalloc(xfs_efd_cache,
+ GFP_KERNEL | __GFP_NOFAIL);
+ }
+
+ xfs_log_item_init(tp->t_mountp, &efdp->efd_item, XFS_LI_EFD,
+ &xfs_efd_item_ops);
+ efdp->efd_efip = efip;
+ efdp->efd_format.efd_nextents = count;
+ efdp->efd_format.efd_efi_id = efip->efi_format.efi_id;
+
+ return &efdp->efd_item;
}
/* Take a passive ref to the AG containing the space we're freeing. */
@@ -527,19 +446,49 @@ xfs_extent_free_finish_item(
struct list_head *item,
struct xfs_btree_cur **state)
{
+ struct xfs_owner_info oinfo = { };
struct xfs_extent_free_item *xefi;
- int error;
+ struct xfs_efd_log_item *efdp = EFD_ITEM(done);
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_extent *extp;
+ uint next_extent;
+ xfs_agblock_t agbno;
+ int error = 0;
xefi = container_of(item, struct xfs_extent_free_item, xefi_list);
+ agbno = XFS_FSB_TO_AGBNO(mp, xefi->xefi_startblock);
+
+ oinfo.oi_owner = xefi->xefi_owner;
+ if (xefi->xefi_flags & XFS_EFI_ATTR_FORK)
+ oinfo.oi_flags |= XFS_OWNER_INFO_ATTR_FORK;
+ if (xefi->xefi_flags & XFS_EFI_BMBT_BLOCK)
+ oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK;
- error = xfs_trans_free_extent(tp, EFD_ITEM(done), xefi);
+ trace_xfs_bmap_free_deferred(tp->t_mountp, xefi->xefi_pag->pag_agno, 0,
+ agbno, xefi->xefi_blockcount);
/*
- * Don't free the XEFI if we need a new transaction to complete
- * processing of it.
+ * If we need a new transaction to make progress, the caller will log a
+ * new EFI with the current contents. It will also log an EFD to cancel
+ * the existing EFI, and so we need to copy all the unprocessed extents
+ * in this EFI to the EFD so this works correctly.
*/
- if (error == -EAGAIN)
+ if (!(xefi->xefi_flags & XFS_EFI_CANCELLED))
+ error = __xfs_free_extent(tp, xefi->xefi_pag, agbno,
+ xefi->xefi_blockcount, &oinfo, xefi->xefi_agresv,
+ xefi->xefi_flags & XFS_EFI_SKIP_DISCARD);
+ if (error == -EAGAIN) {
+ xfs_efd_from_efi(efdp);
return error;
+ }
+
+ /* Add the work we finished to the EFD, even though nobody uses that */
+ next_extent = efdp->efd_next_extent;
+ ASSERT(next_extent < efdp->efd_format.efd_nextents);
+ extp = &(efdp->efd_format.efd_extents[next_extent]);
+ extp->ext_start = xefi->xefi_startblock;
+ extp->ext_len = xefi->xefi_blockcount;
+ efdp->efd_next_extent++;
xfs_extent_free_put_group(xefi);
kmem_cache_free(xfs_extfree_item_cache, xefi);
@@ -567,15 +516,6 @@ xfs_extent_free_cancel_item(
kmem_cache_free(xfs_extfree_item_cache, xefi);
}
-const struct xfs_defer_op_type xfs_extent_free_defer_type = {
- .max_items = XFS_EFI_MAX_FAST_EXTENTS,
- .create_intent = xfs_extent_free_create_intent,
- .abort_intent = xfs_extent_free_abort_intent,
- .create_done = xfs_extent_free_create_done,
- .finish_item = xfs_extent_free_finish_item,
- .cancel_item = xfs_extent_free_cancel_item,
-};
-
/*
* AGFL blocks are accounted differently in the reserve pools and are not
* inserted into the busy extent list.
@@ -610,16 +550,6 @@ xfs_agfl_free_finish_item(
error = xfs_free_agfl_block(tp, xefi->xefi_pag->pag_agno,
agbno, agbp, &oinfo);
- /*
- * Mark the transaction dirty, even on error. This ensures the
- * transaction is aborted, which:
- *
- * 1.) releases the EFI and frees the EFD
- * 2.) shuts down the filesystem
- */
- tp->t_flags |= XFS_TRANS_DIRTY;
- set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags);
-
next_extent = efdp->efd_next_extent;
ASSERT(next_extent < efdp->efd_format.efd_nextents);
extp = &(efdp->efd_format.efd_extents[next_extent]);
@@ -632,16 +562,6 @@ xfs_agfl_free_finish_item(
return error;
}
-/* sub-type with special handling for AGFL deferred frees */
-const struct xfs_defer_op_type xfs_agfl_free_defer_type = {
- .max_items = XFS_EFI_MAX_FAST_EXTENTS,
- .create_intent = xfs_extent_free_create_intent,
- .abort_intent = xfs_extent_free_abort_intent,
- .create_done = xfs_extent_free_create_done,
- .finish_item = xfs_agfl_free_finish_item,
- .cancel_item = xfs_extent_free_cancel_item,
-};
-
/* Is this recovered EFI ok? */
static inline bool
xfs_efi_validate_ext(
@@ -651,23 +571,41 @@ xfs_efi_validate_ext(
return xfs_verify_fsbext(mp, extp->ext_start, extp->ext_len);
}
+static inline void
+xfs_efi_recover_work(
+ struct xfs_mount *mp,
+ struct xfs_defer_pending *dfp,
+ struct xfs_extent *extp)
+{
+ struct xfs_extent_free_item *xefi;
+
+ xefi = kmem_cache_zalloc(xfs_extfree_item_cache,
+ GFP_KERNEL | __GFP_NOFAIL);
+ xefi->xefi_startblock = extp->ext_start;
+ xefi->xefi_blockcount = extp->ext_len;
+ xefi->xefi_agresv = XFS_AG_RESV_NONE;
+ xefi->xefi_owner = XFS_RMAP_OWN_UNKNOWN;
+ xfs_extent_free_get_group(mp, xefi);
+
+ xfs_defer_add_item(dfp, &xefi->xefi_list);
+}
+
/*
* Process an extent free intent item that was recovered from
* the log. We need to free the extents that it describes.
*/
STATIC int
-xfs_efi_item_recover(
- struct xfs_log_item *lip,
+xfs_extent_free_recover_work(
+ struct xfs_defer_pending *dfp,
struct list_head *capture_list)
{
struct xfs_trans_res resv;
+ struct xfs_log_item *lip = dfp->dfp_intent;
struct xfs_efi_log_item *efip = EFI_ITEM(lip);
struct xfs_mount *mp = lip->li_log->l_mp;
- struct xfs_efd_log_item *efdp;
struct xfs_trans *tp;
int i;
int error = 0;
- bool requeue_only = false;
/*
* First check the validity of the extents described by the
@@ -682,55 +620,22 @@ xfs_efi_item_recover(
sizeof(efip->efi_format));
return -EFSCORRUPTED;
}
+
+ xfs_efi_recover_work(mp, dfp, &efip->efi_format.efi_extents[i]);
}
resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate);
error = xfs_trans_alloc(mp, &resv, 0, 0, 0, &tp);
if (error)
return error;
- efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents);
-
- for (i = 0; i < efip->efi_format.efi_nextents; i++) {
- struct xfs_extent_free_item fake = {
- .xefi_owner = XFS_RMAP_OWN_UNKNOWN,
- .xefi_agresv = XFS_AG_RESV_NONE,
- };
- struct xfs_extent *extp;
- extp = &efip->efi_format.efi_extents[i];
-
- fake.xefi_startblock = extp->ext_start;
- fake.xefi_blockcount = extp->ext_len;
-
- if (!requeue_only) {
- xfs_extent_free_get_group(mp, &fake);
- error = xfs_trans_free_extent(tp, efdp, &fake);
- xfs_extent_free_put_group(&fake);
- }
-
- /*
- * If we can't free the extent without potentially deadlocking,
-		 * requeue the rest of the extents so that they get
- * run again later with a new transaction context.
- */
- if (error == -EAGAIN || requeue_only) {
- error = xfs_free_extent_later(tp, fake.xefi_startblock,
- fake.xefi_blockcount,
- &XFS_RMAP_OINFO_ANY_OWNER,
- fake.xefi_agresv);
- if (!error) {
- requeue_only = true;
- continue;
- }
- }
-
- if (error == -EFSCORRUPTED)
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- extp, sizeof(*extp));
- if (error)
- goto abort_error;
-
- }
+ error = xlog_recover_finish_intent(tp, dfp);
+ if (error == -EFSCORRUPTED)
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ &efip->efi_format,
+ sizeof(efip->efi_format));
+ if (error)
+ goto abort_error;
return xfs_defer_ops_capture_and_commit(tp, capture_list);
@@ -739,21 +644,14 @@ abort_error:
return error;
}
-STATIC bool
-xfs_efi_item_match(
- struct xfs_log_item *lip,
- uint64_t intent_id)
-{
- return EFI_ITEM(lip)->efi_format.efi_id == intent_id;
-}
-
/* Relog an intent item to push the log tail forward. */
static struct xfs_log_item *
-xfs_efi_item_relog(
+xfs_extent_free_relog_intent(
+ struct xfs_trans *tp,
struct xfs_log_item *intent,
- struct xfs_trans *tp)
+ struct xfs_log_item *done_item)
{
- struct xfs_efd_log_item *efdp;
+ struct xfs_efd_log_item *efdp = EFD_ITEM(done_item);
struct xfs_efi_log_item *efip;
struct xfs_extent *extp;
unsigned int count;
@@ -761,29 +659,56 @@ xfs_efi_item_relog(
count = EFI_ITEM(intent)->efi_format.efi_nextents;
extp = EFI_ITEM(intent)->efi_format.efi_extents;
- tp->t_flags |= XFS_TRANS_DIRTY;
- efdp = xfs_trans_get_efd(tp, EFI_ITEM(intent), count);
efdp->efd_next_extent = count;
memcpy(efdp->efd_format.efd_extents, extp, count * sizeof(*extp));
- set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags);
efip = xfs_efi_init(tp->t_mountp, count);
memcpy(efip->efi_format.efi_extents, extp, count * sizeof(*extp));
atomic_set(&efip->efi_next_extent, count);
- xfs_trans_add_item(tp, &efip->efi_item);
- set_bit(XFS_LI_DIRTY, &efip->efi_item.li_flags);
+
return &efip->efi_item;
}
+const struct xfs_defer_op_type xfs_extent_free_defer_type = {
+ .name = "extent_free",
+ .max_items = XFS_EFI_MAX_FAST_EXTENTS,
+ .create_intent = xfs_extent_free_create_intent,
+ .abort_intent = xfs_extent_free_abort_intent,
+ .create_done = xfs_extent_free_create_done,
+ .finish_item = xfs_extent_free_finish_item,
+ .cancel_item = xfs_extent_free_cancel_item,
+ .recover_work = xfs_extent_free_recover_work,
+ .relog_intent = xfs_extent_free_relog_intent,
+};
+
+/* sub-type with special handling for AGFL deferred frees */
+const struct xfs_defer_op_type xfs_agfl_free_defer_type = {
+ .name = "agfl_free",
+ .max_items = XFS_EFI_MAX_FAST_EXTENTS,
+ .create_intent = xfs_extent_free_create_intent,
+ .abort_intent = xfs_extent_free_abort_intent,
+ .create_done = xfs_extent_free_create_done,
+ .finish_item = xfs_agfl_free_finish_item,
+ .cancel_item = xfs_extent_free_cancel_item,
+ .recover_work = xfs_extent_free_recover_work,
+ .relog_intent = xfs_extent_free_relog_intent,
+};
+
+STATIC bool
+xfs_efi_item_match(
+ struct xfs_log_item *lip,
+ uint64_t intent_id)
+{
+ return EFI_ITEM(lip)->efi_format.efi_id == intent_id;
+}
+
static const struct xfs_item_ops xfs_efi_item_ops = {
.flags = XFS_ITEM_INTENT,
.iop_size = xfs_efi_item_size,
.iop_format = xfs_efi_item_format,
.iop_unpin = xfs_efi_item_unpin,
.iop_release = xfs_efi_item_release,
- .iop_recover = xfs_efi_item_recover,
.iop_match = xfs_efi_item_match,
- .iop_relog = xfs_efi_item_relog,
};
/*
@@ -820,12 +745,9 @@ xlog_recover_efi_commit_pass2(
return error;
}
atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents);
- /*
- * Insert the intent into the AIL directly and drop one reference so
- * that finishing or canceling the work will drop the other.
- */
- xfs_trans_ail_insert(log->l_ailp, &efip->efi_item, lsn);
- xfs_efi_release(efip);
+
+ xlog_recover_intent_item(log, &efip->efi_item, lsn,
+ &xfs_extent_free_defer_type);
return 0;
}
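With .recover_work and .relog_intent wired up, EFI processing now dispatches entirely through the xfs_defer_op_type tables above instead of per-item iop_recover/iop_relog hooks. A stripped-down sketch of that ops-table dispatch shape, with illustrative demo_* names:

#include <stdio.h>

struct demo_defer_op_type {
	const char	*name;
	int		(*finish_item)(void *item);
	void		(*cancel_item)(void *item);
};

static int demo_free_finish(void *item)
{
	printf("freeing extent work item %p\n", item);
	return 0;
}

static void demo_free_cancel(void *item)
{
	printf("cancelling extent work item %p\n", item);
}

static const struct demo_defer_op_type demo_extent_free_type = {
	.name		= "extent_free",
	.finish_item	= demo_free_finish,
	.cancel_item	= demo_free_cancel,
};

/* Generic core: finish one item through whatever ops table it carries. */
static int demo_finish_one(const struct demo_defer_op_type *ops, void *item)
{
	int error = ops->finish_item(item);

	if (error)
		ops->cancel_item(item);
	return error;
}

int main(void)
{
	int dummy;

	return demo_finish_one(&demo_extent_free_type, &dummy);
}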
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 7cb75cb6b8e9..83f708f62ed9 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -134,6 +134,10 @@ xfs_growfs_data_private(
if (delta < 0 && nagcount < 2)
return -EINVAL;
+ /* No work to do */
+ if (delta == 0)
+ return 0;
+
oagcount = mp->m_sb.sb_agcount;
/* allocate the new per-ag structures */
if (nagcount > oagcount) {
@@ -153,7 +157,7 @@ xfs_growfs_data_private(
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata, -delta, 0,
0, &tp);
if (error)
- return error;
+ goto out_free_unused_perag;
last_pag = xfs_perag_get(mp, oagcount - 1);
if (delta > 0) {
@@ -227,6 +231,9 @@ xfs_growfs_data_private(
out_trans_cancel:
xfs_trans_cancel(tp);
+out_free_unused_perag:
+ if (nagcount > oagcount)
+ xfs_free_unused_perag_range(mp, oagcount, nagcount);
return error;
}
@@ -344,59 +351,20 @@ xfs_growfs_log(
}
/*
- * exported through ioctl XFS_IOC_FSCOUNTS
- */
-
-void
-xfs_fs_counts(
- xfs_mount_t *mp,
- xfs_fsop_counts_t *cnt)
-{
- cnt->allocino = percpu_counter_read_positive(&mp->m_icount);
- cnt->freeino = percpu_counter_read_positive(&mp->m_ifree);
- cnt->freedata = percpu_counter_read_positive(&mp->m_fdblocks) -
- xfs_fdblocks_unavailable(mp);
- cnt->freertx = percpu_counter_read_positive(&mp->m_frextents);
-}
-
-/*
- * exported through ioctl XFS_IOC_SET_RESBLKS & XFS_IOC_GET_RESBLKS
- *
- * xfs_reserve_blocks is called to set m_resblks
- * in the in-core mount table. The number of unused reserved blocks
- * is kept in m_resblks_avail.
- *
* Reserve the requested number of blocks if available. Otherwise return
* as many as possible to satisfy the request. The actual number
- * reserved are returned in outval
- *
- * A null inval pointer indicates that only the current reserved blocks
- * available should be returned no settings are changed.
+ * reserved is reflected in mp->m_resblks.
*/
-
int
xfs_reserve_blocks(
- xfs_mount_t *mp,
- uint64_t *inval,
- xfs_fsop_resblks_t *outval)
+ struct xfs_mount *mp,
+ uint64_t request)
{
int64_t lcounter, delta;
int64_t fdblks_delta = 0;
- uint64_t request;
int64_t free;
int error = 0;
- /* If inval is null, report current values and return */
- if (inval == (uint64_t *)NULL) {
- if (!outval)
- return -EINVAL;
- outval->resblks = mp->m_resblks;
- outval->resblks_avail = mp->m_resblks_avail;
- return 0;
- }
-
- request = *inval;
-
/*
 * With per-cpu counters, this becomes an interesting problem: we need
 * to work out if we are freeing or allocating blocks first, then we can
@@ -466,11 +434,6 @@ xfs_reserve_blocks(
spin_lock(&mp->m_sb_lock);
}
out:
- if (outval) {
- outval->resblks = mp->m_resblks;
- outval->resblks_avail = mp->m_resblks_avail;
- }
-
spin_unlock(&mp->m_sb_lock);
return error;
}
@@ -482,9 +445,9 @@ xfs_fs_goingdown(
{
switch (inflags) {
case XFS_FSOP_GOING_FLAGS_DEFAULT: {
- if (!freeze_bdev(mp->m_super->s_bdev)) {
+ if (!bdev_freeze(mp->m_super->s_bdev)) {
xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
- thaw_bdev(mp->m_super->s_bdev);
+ bdev_thaw(mp->m_super->s_bdev);
}
break;
}
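With outval gone, xfs_reserve_blocks() only adjusts m_resblks/m_resblks_avail against the free-block counter. A rough single-threaded model of that arithmetic (illustrative only; the kernel version must also tolerate racy per-cpu counters and retries under m_sb_lock):

#include <stdint.h>
#include <stdio.h>

struct demo_mount {
	int64_t	free;		/* free blocks, like m_fdblocks */
	int64_t	resblks;	/* reserved pool size, like m_resblks */
	int64_t	resblks_avail;	/* unused reservation, like m_resblks_avail */
};

static void demo_reserve_blocks(struct demo_mount *mp, int64_t request)
{
	int64_t delta = request - mp->resblks;

	mp->resblks = request;
	if (delta < 0) {
		/* Shrinking: hand any excess back to the free pool. */
		if (mp->resblks_avail > request) {
			mp->free += mp->resblks_avail - request;
			mp->resblks_avail = request;
		}
	} else if (delta > 0) {
		/* Growing: take only what is actually free right now. */
		int64_t take = delta < mp->free ? delta : mp->free;

		mp->free -= take;
		mp->resblks_avail += take;
	}
}

int main(void)
{
	struct demo_mount mp = { .free = 100, .resblks = 0, .resblks_avail = 0 };

	demo_reserve_blocks(&mp, 8192);	/* request exceeds free space */
	printf("resblks=%lld avail=%lld free=%lld\n",
	       (long long)mp.resblks, (long long)mp.resblks_avail,
	       (long long)mp.free);
	return 0;
}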
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
index 2cffe51a31e8..44457b0a0593 100644
--- a/fs/xfs/xfs_fsops.h
+++ b/fs/xfs/xfs_fsops.h
@@ -6,14 +6,12 @@
#ifndef __XFS_FSOPS_H__
#define __XFS_FSOPS_H__
-extern int xfs_growfs_data(struct xfs_mount *mp, struct xfs_growfs_data *in);
-extern int xfs_growfs_log(struct xfs_mount *mp, struct xfs_growfs_log *in);
-extern void xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt);
-extern int xfs_reserve_blocks(xfs_mount_t *mp, uint64_t *inval,
- xfs_fsop_resblks_t *outval);
-extern int xfs_fs_goingdown(xfs_mount_t *mp, uint32_t inflags);
+int xfs_growfs_data(struct xfs_mount *mp, struct xfs_growfs_data *in);
+int xfs_growfs_log(struct xfs_mount *mp, struct xfs_growfs_log *in);
+int xfs_reserve_blocks(struct xfs_mount *mp, uint64_t request);
+int xfs_fs_goingdown(struct xfs_mount *mp, uint32_t inflags);
-extern int xfs_fs_reserve_ag_blocks(struct xfs_mount *mp);
-extern int xfs_fs_unreserve_ag_blocks(struct xfs_mount *mp);
+int xfs_fs_reserve_ag_blocks(struct xfs_mount *mp);
+int xfs_fs_unreserve_ag_blocks(struct xfs_mount *mp);
#endif /* __XFS_FSOPS_H__ */
diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c
index 9edc1f2bc939..f18fec0adf66 100644
--- a/fs/xfs/xfs_globals.c
+++ b/fs/xfs/xfs_globals.c
@@ -44,4 +44,16 @@ struct xfs_globals xfs_globals = {
.pwork_threads = -1, /* automatic thread detection */
.larp = false, /* log attribute replay */
#endif
+
+ /*
+ * Leave this many record slots empty when bulk loading btrees. By
+ * default we load new btree leaf blocks 75% full.
+ */
+ .bload_leaf_slack = -1,
+
+ /*
+ * Leave this many key/ptr slots empty when bulk loading btrees. By
+ * default we load new btree node blocks 75% full.
+ */
+ .bload_node_slack = -1,
};
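The two new knobs express slack as "record slots left empty per block", with -1 selecting the default noted in the comments (load blocks 75% full). A sketch of how such a knob could map to records-per-block; this illustrates the stated default and is not the actual xfs_btree_bload computation:

#include <stdio.h>

static unsigned int demo_bload_level(int slack, unsigned int maxrecs)
{
	if (slack < 0)
		return maxrecs * 3 / 4;		/* default: 75% full */
	if ((unsigned int)slack >= maxrecs)
		return 1;			/* always keep one record */
	return maxrecs - slack;
}

int main(void)
{
	printf("records per block: %u\n", demo_bload_level(-1, 16));
	return 0;
}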
diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c
index 72a075bb2c10..9a57afee9338 100644
--- a/fs/xfs/xfs_health.c
+++ b/fs/xfs/xfs_health.c
@@ -222,7 +222,7 @@ xfs_inode_mark_sick(
struct xfs_inode *ip,
unsigned int mask)
{
- ASSERT(!(mask & ~XFS_SICK_INO_PRIMARY));
+ ASSERT(!(mask & ~(XFS_SICK_INO_PRIMARY | XFS_SICK_INO_ZAPPED)));
trace_xfs_inode_mark_sick(ip, mask);
spin_lock(&ip->i_flags_lock);
@@ -246,7 +246,7 @@ xfs_inode_mark_healthy(
struct xfs_inode *ip,
unsigned int mask)
{
- ASSERT(!(mask & ~XFS_SICK_INO_PRIMARY));
+ ASSERT(!(mask & ~(XFS_SICK_INO_PRIMARY | XFS_SICK_INO_ZAPPED)));
trace_xfs_inode_mark_healthy(ip, mask);
spin_lock(&ip->i_flags_lock);
@@ -369,6 +369,10 @@ static const struct ioctl_sick_map ino_map[] = {
{ XFS_SICK_INO_XATTR, XFS_BS_SICK_XATTR },
{ XFS_SICK_INO_SYMLINK, XFS_BS_SICK_SYMLINK },
{ XFS_SICK_INO_PARENT, XFS_BS_SICK_PARENT },
+ { XFS_SICK_INO_BMBTD_ZAPPED, XFS_BS_SICK_BMBTD },
+ { XFS_SICK_INO_BMBTA_ZAPPED, XFS_BS_SICK_BMBTA },
+ { XFS_SICK_INO_DIR_ZAPPED, XFS_BS_SICK_DIR },
+ { XFS_SICK_INO_SYMLINK_ZAPPED, XFS_BS_SICK_SYMLINK },
{ 0, 0 },
};
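The ino_map[] additions follow the usual zero-terminated translation-table shape: walk { internal bit, reported bit } pairs and OR in whatever matches the inode's sick mask. A self-contained sketch with illustrative names and made-up bit values:

#include <stdio.h>

struct demo_sick_map {
	unsigned int	sick_mask;	/* in-kernel health bit */
	unsigned int	ioctl_mask;	/* bit reported to userspace */
};

static const struct demo_sick_map demo_ino_map[] = {
	{ 1u << 0, 1u << 8 },	/* e.g. DIR_ZAPPED -> BS_SICK_DIR */
	{ 1u << 1, 1u << 9 },	/* e.g. SYMLINK_ZAPPED -> BS_SICK_SYMLINK */
	{ 0, 0 },		/* terminator, as in ino_map[] */
};

static unsigned int demo_translate_sick(unsigned int sick)
{
	const struct demo_sick_map *m;
	unsigned int out = 0;

	for (m = demo_ino_map; m->sick_mask; m++)
		if (sick & m->sick_mask)
			out |= m->ioctl_mask;
	return out;
}

int main(void)
{
	printf("reported: 0x%x\n", demo_translate_sick(1u << 0));
	return 0;
}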
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index c0f1c89786c2..1fd94958aa97 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -37,15 +37,10 @@
#include "xfs_reflink.h"
#include "xfs_ag.h"
#include "xfs_log_priv.h"
+#include "xfs_health.h"
struct kmem_cache *xfs_inode_cache;
-/*
- * Used in xfs_itruncate_extents(). This is the maximum number of extents
- * freed from a file in a single transaction.
- */
-#define XFS_ITRUNC_MAX_EXTENTS 2
-
STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *);
STATIC int xfs_iunlink_remove(struct xfs_trans *tp, struct xfs_perag *pag,
struct xfs_inode *);
@@ -661,6 +656,8 @@ xfs_lookup(
if (xfs_is_shutdown(dp->i_mount))
return -EIO;
+ if (xfs_ifork_zapped(dp, XFS_DATA_FORK))
+ return -EIO;
error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
if (error)
@@ -875,7 +872,7 @@ xfs_init_new_inode(
case S_IFLNK:
ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
ip->i_df.if_bytes = 0;
- ip->i_df.if_u1.if_root = NULL;
+ ip->i_df.if_data = NULL;
break;
default:
ASSERT(0);
@@ -978,6 +975,8 @@ xfs_create(
if (xfs_is_shutdown(mp))
return -EIO;
+ if (xfs_ifork_zapped(dp, XFS_DATA_FORK))
+ return -EIO;
prid = xfs_get_initial_prid(dp);
@@ -1217,6 +1216,8 @@ xfs_link(
if (xfs_is_shutdown(mp))
return -EIO;
+ if (xfs_ifork_zapped(tdp, XFS_DATA_FORK))
+ return -EIO;
error = xfs_qm_dqattach(sip);
if (error)
@@ -1339,7 +1340,6 @@ xfs_itruncate_extents_flags(
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp = *tpp;
xfs_fileoff_t first_unmap_block;
- xfs_filblks_t unmap_len;
int error = 0;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
@@ -1371,19 +1371,10 @@ xfs_itruncate_extents_flags(
return 0;
}
- unmap_len = XFS_MAX_FILEOFF - first_unmap_block + 1;
- while (unmap_len > 0) {
- ASSERT(tp->t_highest_agno == NULLAGNUMBER);
- error = __xfs_bunmapi(tp, ip, first_unmap_block, &unmap_len,
- flags, XFS_ITRUNC_MAX_EXTENTS);
- if (error)
- goto out;
-
- /* free the just unmapped extents */
- error = xfs_defer_finish(&tp);
- if (error)
- goto out;
- }
+ error = xfs_bunmapi_range(&tp, ip, flags, first_unmap_block,
+ XFS_MAX_FILEOFF);
+ if (error)
+ goto out;
if (whichfork == XFS_DATA_FORK) {
/* Remove all pending CoW reservations. */
@@ -2387,8 +2378,8 @@ xfs_ifree(
* already been freed by xfs_attr_inactive.
*/
if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
- kmem_free(ip->i_df.if_u1.if_data);
- ip->i_df.if_u1.if_data = NULL;
+ kmem_free(ip->i_df.if_data);
+ ip->i_df.if_data = NULL;
ip->i_df.if_bytes = 0;
}
@@ -2506,6 +2497,8 @@ xfs_remove(
if (xfs_is_shutdown(mp))
return -EIO;
+ if (xfs_ifork_zapped(dp, XFS_DATA_FORK))
+ return -EIO;
error = xfs_qm_dqattach(dp);
if (error)
@@ -3758,3 +3751,29 @@ xfs_inode_reload_unlinked(
return error;
}
+
+/* Has this inode fork been zapped by repair? */
+bool
+xfs_ifork_zapped(
+ const struct xfs_inode *ip,
+ int whichfork)
+{
+ unsigned int datamask = 0;
+
+ switch (whichfork) {
+ case XFS_DATA_FORK:
+ switch (ip->i_vnode.i_mode & S_IFMT) {
+ case S_IFDIR:
+ datamask = XFS_SICK_INO_DIR_ZAPPED;
+ break;
+ case S_IFLNK:
+ datamask = XFS_SICK_INO_SYMLINK_ZAPPED;
+ break;
+ }
+ return ip->i_sick & (XFS_SICK_INO_BMBTD_ZAPPED | datamask);
+ case XFS_ATTR_FORK:
+ return ip->i_sick & XFS_SICK_INO_BMBTA_ZAPPED;
+ default:
+ return false;
+ }
+}
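Call sites such as xfs_lookup(), xfs_create() and xfs_readdir() use this predicate to fail fast with EIO rather than operate on a fork that online repair has zapped. A tiny sketch of that guard pattern, with illustrative demo_* names and made-up health bits:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct demo_inode {
	unsigned int	sick;		/* health bits, like ip->i_sick */
};

#define DEMO_SICK_DIR_ZAPPED	(1u << 0)

static bool demo_ifork_zapped(const struct demo_inode *ip)
{
	return ip->sick & DEMO_SICK_DIR_ZAPPED;
}

static int demo_lookup(struct demo_inode *dp)
{
	if (demo_ifork_zapped(dp))
		return -EIO;	/* force a repair before using this dir */
	return 0;
}

int main(void)
{
	struct demo_inode dir = { .sick = DEMO_SICK_DIR_ZAPPED };

	printf("lookup: %d\n", demo_lookup(&dir));
	return 0;
}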
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 3beb470f1892..97f63bacd4c2 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -622,4 +622,6 @@ xfs_inode_unlinked_incomplete(
int xfs_inode_reload_unlinked_bucket(struct xfs_trans *tp, struct xfs_inode *ip);
int xfs_inode_reload_unlinked(struct xfs_inode *ip);
+bool xfs_ifork_zapped(const struct xfs_inode *ip, int whichfork);
+
#endif /* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index cd7803fda8b1..0aee97ba0be8 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -352,11 +352,10 @@ xfs_inode_item_format_data_fork(
~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT | XFS_ILOG_DEV);
if ((iip->ili_fields & XFS_ILOG_DDATA) &&
ip->i_df.if_bytes > 0) {
- ASSERT(ip->i_df.if_u1.if_data != NULL);
+ ASSERT(ip->i_df.if_data != NULL);
ASSERT(ip->i_disk_size > 0);
xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_ILOCAL,
- ip->i_df.if_u1.if_data,
- ip->i_df.if_bytes);
+ ip->i_df.if_data, ip->i_df.if_bytes);
ilf->ilf_dsize = (unsigned)ip->i_df.if_bytes;
ilf->ilf_size++;
} else {
@@ -431,10 +430,9 @@ xfs_inode_item_format_attr_fork(
if ((iip->ili_fields & XFS_ILOG_ADATA) &&
ip->i_af.if_bytes > 0) {
- ASSERT(ip->i_af.if_u1.if_data != NULL);
+ ASSERT(ip->i_af.if_data != NULL);
xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_LOCAL,
- ip->i_af.if_u1.if_data,
- ip->i_af.if_bytes);
+ ip->i_af.if_data, ip->i_af.if_bytes);
ilf->ilf_asize = (unsigned)ip->i_af.if_bytes;
ilf->ilf_size++;
} else {
@@ -557,6 +555,9 @@ xfs_inode_to_log_dinode(
memset(to->di_pad2, 0, sizeof(to->di_pad2));
uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid);
to->di_v3_pad = 0;
+
+ /* dummy value for initialisation */
+ to->di_crc = 0;
} else {
to->di_version = 2;
to->di_flushiter = ip->i_flushiter;
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 6c3919687ea6..f02b6e558af5 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1872,6 +1872,63 @@ xfs_fs_eofblocks_from_user(
return 0;
}
+static int
+xfs_ioctl_getset_resblocks(
+ struct file *filp,
+ unsigned int cmd,
+ void __user *arg)
+{
+ struct xfs_mount *mp = XFS_I(file_inode(filp))->i_mount;
+ struct xfs_fsop_resblks fsop = { };
+ int error;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (cmd == XFS_IOC_SET_RESBLKS) {
+ if (xfs_is_readonly(mp))
+ return -EROFS;
+
+ if (copy_from_user(&fsop, arg, sizeof(fsop)))
+ return -EFAULT;
+
+ error = mnt_want_write_file(filp);
+ if (error)
+ return error;
+ error = xfs_reserve_blocks(mp, fsop.resblks);
+ mnt_drop_write_file(filp);
+ if (error)
+ return error;
+ }
+
+ spin_lock(&mp->m_sb_lock);
+ fsop.resblks = mp->m_resblks;
+ fsop.resblks_avail = mp->m_resblks_avail;
+ spin_unlock(&mp->m_sb_lock);
+
+ if (copy_to_user(arg, &fsop, sizeof(fsop)))
+ return -EFAULT;
+ return 0;
+}
+
+static int
+xfs_ioctl_fs_counts(
+ struct xfs_mount *mp,
+ struct xfs_fsop_counts __user *uarg)
+{
+ struct xfs_fsop_counts out = {
+ .allocino = percpu_counter_read_positive(&mp->m_icount),
+ .freeino = percpu_counter_read_positive(&mp->m_ifree),
+ .freedata = percpu_counter_read_positive(&mp->m_fdblocks) -
+ xfs_fdblocks_unavailable(mp),
+ .freertx = percpu_counter_read_positive(&mp->m_frextents),
+ };
+
+ if (copy_to_user(uarg, &out, sizeof(out)))
+ return -EFAULT;
+ return 0;
+}
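For illustration, a small userspace sketch (not part of this patch; error handling trimmed, the mount point path is a placeholder, and the header name assumes the xfsprogs development headers are installed) drives the merged reserve-pool handler through the existing ioctl pair:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <xfs/xfs.h>	/* XFS_IOC_{GET,SET}_RESBLKS, struct xfs_fsop_resblks */

int main(void)
{
	struct xfs_fsop_resblks fsop = { 0 };
	int fd = open("/mnt/xfs", O_RDONLY);	/* placeholder mount point */

	if (fd < 0 || ioctl(fd, XFS_IOC_GET_RESBLKS, &fsop) < 0)
		return 1;
	printf("reserved %llu blocks, %llu available\n",
	       (unsigned long long)fsop.resblks,
	       (unsigned long long)fsop.resblks_avail);

	/* Requires CAP_SYS_ADMIN; the kernel reports the resulting state. */
	fsop.resblks = 8192;
	ioctl(fd, XFS_IOC_SET_RESBLKS, &fsop);
	close(fd);
	return 0;
}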
+
/*
* These long-unused ioctls were removed from the official ioctl API in 5.17,
* but retain these definitions so that we can log warnings about them.
@@ -2008,60 +2065,12 @@ xfs_file_ioctl(
return error;
}
- case XFS_IOC_FSCOUNTS: {
- xfs_fsop_counts_t out;
+ case XFS_IOC_FSCOUNTS:
+ return xfs_ioctl_fs_counts(mp, arg);
- xfs_fs_counts(mp, &out);
-
- if (copy_to_user(arg, &out, sizeof(out)))
- return -EFAULT;
- return 0;
- }
-
- case XFS_IOC_SET_RESBLKS: {
- xfs_fsop_resblks_t inout;
- uint64_t in;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (xfs_is_readonly(mp))
- return -EROFS;
-
- if (copy_from_user(&inout, arg, sizeof(inout)))
- return -EFAULT;
-
- error = mnt_want_write_file(filp);
- if (error)
- return error;
-
- /* input parameter is passed in resblks field of structure */
- in = inout.resblks;
- error = xfs_reserve_blocks(mp, &in, &inout);
- mnt_drop_write_file(filp);
- if (error)
- return error;
-
- if (copy_to_user(arg, &inout, sizeof(inout)))
- return -EFAULT;
- return 0;
- }
-
- case XFS_IOC_GET_RESBLKS: {
- xfs_fsop_resblks_t out;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- error = xfs_reserve_blocks(mp, NULL, &out);
- if (error)
- return error;
-
- if (copy_to_user(arg, &out, sizeof(out)))
- return -EFAULT;
-
- return 0;
- }
+ case XFS_IOC_SET_RESBLKS:
+ case XFS_IOC_GET_RESBLKS:
+ return xfs_ioctl_getset_resblocks(filp, cmd, arg);
case XFS_IOC_FSGROWFSDATA: {
struct xfs_growfs_data in;
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index ee206facf0dc..a1650fc81382 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1542,6 +1542,7 @@ xlog_alloc_log(
log->l_covered_state = XLOG_STATE_COVER_IDLE;
set_bit(XLOG_ACTIVE_RECOVERY, &log->l_opstate);
INIT_DELAYED_WORK(&log->l_work, xfs_log_worker);
+ INIT_LIST_HEAD(&log->r_dfops);
log->l_prev_block = -1;
/* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index fa3ad1d7b31c..e30c06ec20e3 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -407,6 +407,7 @@ struct xlog {
long l_opstate; /* operational state */
uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */
struct list_head *l_buf_cancel_table;
+ struct list_head r_dfops; /* recovered log intent items */
int l_iclog_hsize; /* size of iclog header */
int l_iclog_heads; /* # of iclog header sectors */
uint l_sectBBsize; /* sector size in BBs (2^n) */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index a1e18b24971a..1251c81e55f9 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1723,30 +1723,24 @@ xlog_clear_stale_blocks(
*/
void
xlog_recover_release_intent(
- struct xlog *log,
- unsigned short intent_type,
- uint64_t intent_id)
+ struct xlog *log,
+ unsigned short intent_type,
+ uint64_t intent_id)
{
- struct xfs_ail_cursor cur;
- struct xfs_log_item *lip;
- struct xfs_ail *ailp = log->l_ailp;
+ struct xfs_defer_pending *dfp, *n;
+
+ list_for_each_entry_safe(dfp, n, &log->r_dfops, dfp_list) {
+ struct xfs_log_item *lip = dfp->dfp_intent;
- spin_lock(&ailp->ail_lock);
- for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); lip != NULL;
- lip = xfs_trans_ail_cursor_next(ailp, &cur)) {
if (lip->li_type != intent_type)
continue;
if (!lip->li_ops->iop_match(lip, intent_id))
continue;
- spin_unlock(&ailp->ail_lock);
- lip->li_ops->iop_release(lip);
- spin_lock(&ailp->ail_lock);
- break;
- }
+ ASSERT(xlog_item_is_intent(lip));
- xfs_trans_ail_cursor_done(&cur);
- spin_unlock(&ailp->ail_lock);
+ xfs_defer_cancel_recovery(log->l_mp, dfp);
+ }
}
int
@@ -1939,6 +1933,29 @@ xlog_buf_readahead(
xfs_buf_readahead(log->l_mp->m_ddev_targp, blkno, len, ops);
}
+/*
+ * Create a deferred work structure for resuming and tracking the progress of a
+ * log intent item that was found during recovery.
+ */
+void
+xlog_recover_intent_item(
+ struct xlog *log,
+ struct xfs_log_item *lip,
+ xfs_lsn_t lsn,
+ const struct xfs_defer_op_type *ops)
+{
+ ASSERT(xlog_item_is_intent(lip));
+
+ xfs_defer_start_recovery(lip, &log->r_dfops, ops);
+
+ /*
+ * Insert the intent into the AIL directly and drop one reference so
+ * that finishing or canceling the work will drop the other.
+ */
+ xfs_trans_ail_insert(log->l_ailp, lip, lsn);
+ lip->li_ops->iop_unpin(lip, 0);
+}
+
STATIC int
xlog_recover_items_pass2(
struct xlog *log,
@@ -2533,36 +2550,26 @@ xlog_abort_defer_ops(
*/
STATIC int
xlog_recover_process_intents(
- struct xlog *log)
+ struct xlog *log)
{
LIST_HEAD(capture_list);
- struct xfs_ail_cursor cur;
- struct xfs_log_item *lip;
- struct xfs_ail *ailp;
- int error = 0;
+ struct xfs_defer_pending *dfp, *n;
+ int error = 0;
#if defined(DEBUG) || defined(XFS_WARN)
- xfs_lsn_t last_lsn;
-#endif
+ xfs_lsn_t last_lsn;
- ailp = log->l_ailp;
- spin_lock(&ailp->ail_lock);
-#if defined(DEBUG) || defined(XFS_WARN)
last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block);
#endif
- for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
- lip != NULL;
- lip = xfs_trans_ail_cursor_next(ailp, &cur)) {
- const struct xfs_item_ops *ops;
- if (!xlog_item_is_intent(lip))
- break;
+ list_for_each_entry_safe(dfp, n, &log->r_dfops, dfp_list) {
+ ASSERT(xlog_item_is_intent(dfp->dfp_intent));
/*
* We should never see a redo item with a LSN higher than
* the last transaction we found in the log at the start
* of recovery.
*/
- ASSERT(XFS_LSN_CMP(last_lsn, lip->li_lsn) >= 0);
+ ASSERT(XFS_LSN_CMP(last_lsn, dfp->dfp_intent->li_lsn) >= 0);
/*
* NOTE: If your intent processing routine can create more
@@ -2571,21 +2578,14 @@ xlog_recover_process_intents(
* replayed in the wrong order!
*
* The recovery function can free the log item, so we must not
- * access lip after it returns.
+ * access dfp->dfp_intent after it returns. It must dispose of
+ * @dfp if it returns 0.
*/
- spin_unlock(&ailp->ail_lock);
- ops = lip->li_ops;
- error = ops->iop_recover(lip, &capture_list);
- spin_lock(&ailp->ail_lock);
- if (error) {
- trace_xlog_intent_recovery_failed(log->l_mp, error,
- ops->iop_recover);
+ error = xfs_defer_finish_recovery(log->l_mp, dfp,
+ &capture_list);
+ if (error)
break;
- }
}
-
- xfs_trans_ail_cursor_done(&cur);
- spin_unlock(&ailp->ail_lock);
if (error)
goto err;
@@ -2606,27 +2606,34 @@ err:
*/
STATIC void
xlog_recover_cancel_intents(
- struct xlog *log)
+ struct xlog *log)
{
- struct xfs_log_item *lip;
- struct xfs_ail_cursor cur;
- struct xfs_ail *ailp;
-
- ailp = log->l_ailp;
- spin_lock(&ailp->ail_lock);
- lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
- while (lip != NULL) {
- if (!xlog_item_is_intent(lip))
- break;
+ struct xfs_defer_pending *dfp, *n;
- spin_unlock(&ailp->ail_lock);
- lip->li_ops->iop_release(lip);
- spin_lock(&ailp->ail_lock);
- lip = xfs_trans_ail_cursor_next(ailp, &cur);
+ list_for_each_entry_safe(dfp, n, &log->r_dfops, dfp_list) {
+ ASSERT(xlog_item_is_intent(dfp->dfp_intent));
+
+ xfs_defer_cancel_recovery(log->l_mp, dfp);
}
+}
- xfs_trans_ail_cursor_done(&cur);
- spin_unlock(&ailp->ail_lock);
+/*
+ * Transfer ownership of the recovered pending work to the recovery transaction
+ * and try to finish the work. If there is more work to be done, the dfp will
+ * remain attached to the transaction. If not, the dfp is freed.
+ */
+int
+xlog_recover_finish_intent(
+ struct xfs_trans *tp,
+ struct xfs_defer_pending *dfp)
+{
+ int error;
+
+ list_move(&dfp->dfp_list, &tp->t_dfops);
+ error = xfs_defer_finish_one(tp, dfp);
+ if (error == -EAGAIN)
+ return 0;
+ return error;
}
/*
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index aed5be5508fe..aabb25dc3efa 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -637,7 +637,6 @@ xfs_mountfs(
struct xfs_sb *sbp = &(mp->m_sb);
struct xfs_inode *rip;
struct xfs_ino_geometry *igeo = M_IGEO(mp);
- uint64_t resblks;
uint quotamount = 0;
uint quotaflags = 0;
int error = 0;
@@ -974,8 +973,7 @@ xfs_mountfs(
* we were already there on the last unmount. Warn if this occurs.
*/
if (!xfs_is_readonly(mp)) {
- resblks = xfs_default_resblks(mp);
- error = xfs_reserve_blocks(mp, &resblks, NULL);
+ error = xfs_reserve_blocks(mp, xfs_default_resblks(mp));
if (error)
xfs_warn(mp,
"Unable to allocate reserve blocks. Continuing without reserve pool.");
@@ -1053,7 +1051,6 @@ void
xfs_unmountfs(
struct xfs_mount *mp)
{
- uint64_t resblks;
int error;
/*
@@ -1090,8 +1087,7 @@ xfs_unmountfs(
* we only ever apply deltas to the superblock and hence the incore
* value does not matter....
*/
- resblks = 0;
- error = xfs_reserve_blocks(mp, &resblks, NULL);
+ error = xfs_reserve_blocks(mp, 0);
if (error)
xfs_warn(mp, "Unable to free reserved block pool. "
"Freespace may not be correct on next mount.");
diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c
index a7daa522e00f..fa50e5308292 100644
--- a/fs/xfs/xfs_notify_failure.c
+++ b/fs/xfs/xfs_notify_failure.c
@@ -22,6 +22,7 @@
#include <linux/mm.h>
#include <linux/dax.h>
+#include <linux/fs.h>
struct xfs_failure_info {
xfs_agblock_t startblock;
@@ -73,10 +74,16 @@ xfs_dax_failure_fn(
struct xfs_mount *mp = cur->bc_mp;
struct xfs_inode *ip;
struct xfs_failure_info *notify = data;
+ struct address_space *mapping;
+ pgoff_t pgoff;
+ unsigned long pgcnt;
int error = 0;
if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) ||
(rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) {
+ /* Continue the query because this isn't a failure. */
+ if (notify->mf_flags & MF_MEM_PRE_REMOVE)
+ return 0;
notify->want_shutdown = true;
return 0;
}
@@ -92,15 +99,61 @@ xfs_dax_failure_fn(
return 0;
}
- error = mf_dax_kill_procs(VFS_I(ip)->i_mapping,
- xfs_failure_pgoff(mp, rec, notify),
- xfs_failure_pgcnt(mp, rec, notify),
- notify->mf_flags);
+ mapping = VFS_I(ip)->i_mapping;
+ pgoff = xfs_failure_pgoff(mp, rec, notify);
+ pgcnt = xfs_failure_pgcnt(mp, rec, notify);
+
+ /* Continue the rmap query if the inode isn't a dax file. */
+ if (dax_mapping(mapping))
+ error = mf_dax_kill_procs(mapping, pgoff, pgcnt,
+ notify->mf_flags);
+
+	/* Invalidate the page cache entries for the affected dax pages. */
+ if (notify->mf_flags & MF_MEM_PRE_REMOVE)
+ invalidate_inode_pages2_range(mapping, pgoff,
+ pgoff + pgcnt - 1);
+
xfs_irele(ip);
return error;
}
static int
+xfs_dax_notify_failure_freeze(
+ struct xfs_mount *mp)
+{
+ struct super_block *sb = mp->m_super;
+ int error;
+
+ error = freeze_super(sb, FREEZE_HOLDER_KERNEL);
+ if (error)
+ xfs_emerg(mp, "already frozen by kernel, err=%d", error);
+
+ return error;
+}
+
+static void
+xfs_dax_notify_failure_thaw(
+ struct xfs_mount *mp,
+ bool kernel_frozen)
+{
+ struct super_block *sb = mp->m_super;
+ int error;
+
+ if (kernel_frozen) {
+ error = thaw_super(sb, FREEZE_HOLDER_KERNEL);
+ if (error)
+ xfs_emerg(mp, "still frozen after notify failure, err=%d",
+ error);
+ }
+
+ /*
+	 * Also thaw the userspace freeze anyway, because the device is about
+	 * to be removed immediately.
+ */
+ thaw_super(sb, FREEZE_HOLDER_USERSPACE);
+}
+
+static int
xfs_dax_notify_ddev_failure(
struct xfs_mount *mp,
xfs_daddr_t daddr,
@@ -112,15 +165,29 @@ xfs_dax_notify_ddev_failure(
struct xfs_btree_cur *cur = NULL;
struct xfs_buf *agf_bp = NULL;
int error = 0;
+ bool kernel_frozen = false;
xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, daddr);
xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, fsbno);
xfs_fsblock_t end_fsbno = XFS_DADDR_TO_FSB(mp,
daddr + bblen - 1);
xfs_agnumber_t end_agno = XFS_FSB_TO_AGNO(mp, end_fsbno);
+ if (mf_flags & MF_MEM_PRE_REMOVE) {
+ xfs_info(mp, "Device is about to be removed!");
+ /*
+ * Freeze fs to prevent new mappings from being created.
+		 * - Keep going if someone else already holds the kernel freeze.
+		 * - Keep going on other errors too, because this device is
+		 *   starting to fail.
+		 * - If the kernel freeze is taken successfully here, thaw it
+		 *   here as well at the end.
+ */
+ kernel_frozen = xfs_dax_notify_failure_freeze(mp) == 0;
+ }
+
error = xfs_trans_alloc_empty(mp, &tp);
if (error)
- return error;
+ goto out;
for (; agno <= end_agno; agno++) {
struct xfs_rmap_irec ri_low = { };
@@ -165,11 +232,26 @@ xfs_dax_notify_ddev_failure(
}
xfs_trans_cancel(tp);
- if (error || notify.want_shutdown) {
+
+ /*
+	 * In the pre-remove case, shut down the fs with a forced unmount,
+	 * which won't fail, so errors can be ignored. Otherwise, shut the
+	 * filesystem down with the CORRUPT flag if an error occurred or
+	 * notify.want_shutdown was set during the rmap query.
+ */
+ if (mf_flags & MF_MEM_PRE_REMOVE)
+ xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
+ else if (error || notify.want_shutdown) {
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
if (!error)
error = -EFSCORRUPTED;
}
+
+out:
+ /* Thaw the fs if it has been frozen before. */
+ if (mf_flags & MF_MEM_PRE_REMOVE)
+ xfs_dax_notify_failure_thaw(mp, kernel_frozen);
+
return error;
}
@@ -197,6 +279,14 @@ xfs_dax_notify_failure(
if (mp->m_logdev_targp && mp->m_logdev_targp->bt_daxdev == dax_dev &&
mp->m_logdev_targp != mp->m_ddev_targp) {
+ /*
+ * In the pre-remove case the failure notification is attempting
+ * to trigger a force unmount. The expectation is that the
+ * device is still present, but its removal is in progress and
+		 * cannot be cancelled, so proceed with accessing the log device.
+ */
+ if (mf_flags & MF_MEM_PRE_REMOVE)
+ return 0;
xfs_err(mp, "ondisk log corrupt, shutting down fs!");
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
return -EFSCORRUPTED;
@@ -210,6 +300,12 @@ xfs_dax_notify_failure(
ddev_start = mp->m_ddev_targp->bt_dax_part_off;
ddev_end = ddev_start + bdev_nr_bytes(mp->m_ddev_targp->bt_bdev) - 1;
+ /* Notify failure on the whole device. */
+ if (offset == 0 && len == U64_MAX) {
+ offset = ddev_start;
+ len = bdev_nr_bytes(mp->m_ddev_targp->bt_bdev);
+ }
+
/* Ignore the range out of filesystem area */
if (offset + len - 1 < ddev_start)
return -ENXIO;
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 94a7932ac570..67d0a8564ff3 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -171,7 +171,7 @@ xfs_qm_dqpurge(
* hits zero, so it really should be on the freelist here.
*/
ASSERT(!list_empty(&dqp->q_lru));
- list_lru_del(&qi->qi_lru, &dqp->q_lru);
+ list_lru_del_obj(&qi->qi_lru, &dqp->q_lru);
XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot_unused);
xfs_qm_dqdestroy(dqp);
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index dcc785fdd345..e0d56489f3b2 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -127,7 +127,10 @@ xfs_qm_vop_dqalloc(struct xfs_inode *ip, kuid_t kuid, kgid_t kgid,
}
#define xfs_trans_dup_dqinfo(tp, tp2)
#define xfs_trans_free_dqinfo(tp)
-#define xfs_trans_mod_dquot_byino(tp, ip, fields, delta) do { } while (0)
+static inline void xfs_trans_mod_dquot_byino(struct xfs_trans *tp,
+ struct xfs_inode *ip, uint field, int64_t delta)
+{
+}
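Replacing the empty macro with a typed static inline is a common kernel pattern: when quota support is compiled out, callers still get full argument type checking instead of silent token deletion. A minimal illustration (hypothetical caller; assumes the usual XFS_TRANS_DQ_BCOUNT field flag):

/* With the old macro this call compiled even if the arguments were
 * wrong or misspelled, because the preprocessor discarded them.  The
 * inline stub makes the !CONFIG_XFS_QUOTA build reject bad callers.
 */
static inline void example_caller(struct xfs_trans *tp, struct xfs_inode *ip)
{
	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 8);
}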
#define xfs_trans_apply_dquot_deltas(tp)
#define xfs_trans_unreserve_and_mod_dquots(tp)
static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp,
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 2d4444d61e98..20ad8086da60 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -227,52 +227,6 @@ static const struct xfs_item_ops xfs_cud_item_ops = {
.iop_intent = xfs_cud_item_intent,
};
-static struct xfs_cud_log_item *
-xfs_trans_get_cud(
- struct xfs_trans *tp,
- struct xfs_cui_log_item *cuip)
-{
- struct xfs_cud_log_item *cudp;
-
- cudp = kmem_cache_zalloc(xfs_cud_cache, GFP_KERNEL | __GFP_NOFAIL);
- xfs_log_item_init(tp->t_mountp, &cudp->cud_item, XFS_LI_CUD,
- &xfs_cud_item_ops);
- cudp->cud_cuip = cuip;
- cudp->cud_format.cud_cui_id = cuip->cui_format.cui_id;
-
- xfs_trans_add_item(tp, &cudp->cud_item);
- return cudp;
-}
-
-/*
- * Finish an refcount update and log it to the CUD. Note that the
- * transaction is marked dirty regardless of whether the refcount
- * update succeeds or fails to support the CUI/CUD lifecycle rules.
- */
-static int
-xfs_trans_log_finish_refcount_update(
- struct xfs_trans *tp,
- struct xfs_cud_log_item *cudp,
- struct xfs_refcount_intent *ri,
- struct xfs_btree_cur **pcur)
-{
- int error;
-
- error = xfs_refcount_finish_one(tp, ri, pcur);
-
- /*
- * Mark the transaction dirty, even on error. This ensures the
- * transaction is aborted, which:
- *
- * 1.) releases the CUI and frees the CUD
- * 2.) shuts down the filesystem
- */
- tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE;
- set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags);
-
- return error;
-}
-
/* Sort refcount intents by AG. */
static int
xfs_refcount_update_diff_items(
@@ -318,9 +272,6 @@ xfs_refcount_update_log_item(
uint next_extent;
struct xfs_phys_extent *pmap;
- tp->t_flags |= XFS_TRANS_DIRTY;
- set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags);
-
/*
* atomic_inc_return gives us the value after the increment;
* we want to use it as an array index so we need to subtract 1 from
@@ -347,7 +298,6 @@ xfs_refcount_update_create_intent(
ASSERT(count > 0);
- xfs_trans_add_item(tp, &cuip->cui_item);
if (sort)
list_sort(mp, items, xfs_refcount_update_diff_items);
list_for_each_entry(ri, items, ri_list)
@@ -362,7 +312,16 @@ xfs_refcount_update_create_done(
struct xfs_log_item *intent,
unsigned int count)
{
- return &xfs_trans_get_cud(tp, CUI_ITEM(intent))->cud_item;
+ struct xfs_cui_log_item *cuip = CUI_ITEM(intent);
+ struct xfs_cud_log_item *cudp;
+
+ cudp = kmem_cache_zalloc(xfs_cud_cache, GFP_KERNEL | __GFP_NOFAIL);
+ xfs_log_item_init(tp->t_mountp, &cudp->cud_item, XFS_LI_CUD,
+ &xfs_cud_item_ops);
+ cudp->cud_cuip = cuip;
+ cudp->cud_format.cud_cui_id = cuip->cui_format.cui_id;
+
+ return &cudp->cud_item;
}
/* Take a passive ref to the AG containing the space we're refcounting. */
@@ -397,10 +356,9 @@ xfs_refcount_update_finish_item(
int error;
ri = container_of(item, struct xfs_refcount_intent, ri_list);
- error = xfs_trans_log_finish_refcount_update(tp, CUD_ITEM(done), ri,
- state);
/* Did we run out of reservation? Requeue what we didn't finish. */
+ error = xfs_refcount_finish_one(tp, ri, state);
if (!error && ri->ri_blockcount > 0) {
ASSERT(ri->ri_type == XFS_REFCOUNT_INCREASE ||
ri->ri_type == XFS_REFCOUNT_DECREASE);
@@ -433,16 +391,6 @@ xfs_refcount_update_cancel_item(
kmem_cache_free(xfs_refcount_intent_cache, ri);
}
-const struct xfs_defer_op_type xfs_refcount_update_defer_type = {
- .max_items = XFS_CUI_MAX_FAST_EXTENTS,
- .create_intent = xfs_refcount_update_create_intent,
- .abort_intent = xfs_refcount_update_abort_intent,
- .create_done = xfs_refcount_update_create_done,
- .finish_item = xfs_refcount_update_finish_item,
- .finish_cleanup = xfs_refcount_finish_one_cleanup,
- .cancel_item = xfs_refcount_update_cancel_item,
-};
-
/* Is this recovered CUI ok? */
static inline bool
xfs_cui_validate_phys(
@@ -468,23 +416,38 @@ xfs_cui_validate_phys(
return xfs_verify_fsbext(mp, pmap->pe_startblock, pmap->pe_len);
}
+static inline void
+xfs_cui_recover_work(
+ struct xfs_mount *mp,
+ struct xfs_defer_pending *dfp,
+ struct xfs_phys_extent *pmap)
+{
+ struct xfs_refcount_intent *ri;
+
+ ri = kmem_cache_alloc(xfs_refcount_intent_cache,
+ GFP_NOFS | __GFP_NOFAIL);
+ ri->ri_type = pmap->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK;
+ ri->ri_startblock = pmap->pe_startblock;
+ ri->ri_blockcount = pmap->pe_len;
+ xfs_refcount_update_get_group(mp, ri);
+
+ xfs_defer_add_item(dfp, &ri->ri_list);
+}
+
/*
* Process a refcount update intent item that was recovered from the log.
* We need to update the refcountbt.
*/
STATIC int
-xfs_cui_item_recover(
- struct xfs_log_item *lip,
+xfs_refcount_recover_work(
+ struct xfs_defer_pending *dfp,
struct list_head *capture_list)
{
struct xfs_trans_res resv;
+ struct xfs_log_item *lip = dfp->dfp_intent;
struct xfs_cui_log_item *cuip = CUI_ITEM(lip);
- struct xfs_cud_log_item *cudp;
struct xfs_trans *tp;
- struct xfs_btree_cur *rcur = NULL;
struct xfs_mount *mp = lip->li_log->l_mp;
- unsigned int refc_type;
- bool requeue_only = false;
int i;
int error = 0;
@@ -501,6 +464,8 @@ xfs_cui_item_recover(
sizeof(cuip->cui_format));
return -EFSCORRUPTED;
}
+
+ xfs_cui_recover_work(mp, dfp, &cuip->cui_format.cui_extents[i]);
}
/*
@@ -521,100 +486,28 @@ xfs_cui_item_recover(
if (error)
return error;
- cudp = xfs_trans_get_cud(tp, cuip);
-
- for (i = 0; i < cuip->cui_format.cui_nextents; i++) {
- struct xfs_refcount_intent fake = { };
- struct xfs_phys_extent *pmap;
-
- pmap = &cuip->cui_format.cui_extents[i];
- refc_type = pmap->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK;
- switch (refc_type) {
- case XFS_REFCOUNT_INCREASE:
- case XFS_REFCOUNT_DECREASE:
- case XFS_REFCOUNT_ALLOC_COW:
- case XFS_REFCOUNT_FREE_COW:
- fake.ri_type = refc_type;
- break;
- default:
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- &cuip->cui_format,
- sizeof(cuip->cui_format));
- error = -EFSCORRUPTED;
- goto abort_error;
- }
-
- fake.ri_startblock = pmap->pe_startblock;
- fake.ri_blockcount = pmap->pe_len;
-
- if (!requeue_only) {
- xfs_refcount_update_get_group(mp, &fake);
- error = xfs_trans_log_finish_refcount_update(tp, cudp,
- &fake, &rcur);
- xfs_refcount_update_put_group(&fake);
- }
- if (error == -EFSCORRUPTED)
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- &cuip->cui_format,
- sizeof(cuip->cui_format));
- if (error)
- goto abort_error;
-
- /* Requeue what we didn't finish. */
- if (fake.ri_blockcount > 0) {
- struct xfs_bmbt_irec irec = {
- .br_startblock = fake.ri_startblock,
- .br_blockcount = fake.ri_blockcount,
- };
-
- switch (fake.ri_type) {
- case XFS_REFCOUNT_INCREASE:
- xfs_refcount_increase_extent(tp, &irec);
- break;
- case XFS_REFCOUNT_DECREASE:
- xfs_refcount_decrease_extent(tp, &irec);
- break;
- case XFS_REFCOUNT_ALLOC_COW:
- xfs_refcount_alloc_cow_extent(tp,
- irec.br_startblock,
- irec.br_blockcount);
- break;
- case XFS_REFCOUNT_FREE_COW:
- xfs_refcount_free_cow_extent(tp,
- irec.br_startblock,
- irec.br_blockcount);
- break;
- default:
- ASSERT(0);
- }
- requeue_only = true;
- }
- }
+ error = xlog_recover_finish_intent(tp, dfp);
+ if (error == -EFSCORRUPTED)
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ &cuip->cui_format,
+ sizeof(cuip->cui_format));
+ if (error)
+ goto abort_error;
- xfs_refcount_finish_one_cleanup(tp, rcur, error);
return xfs_defer_ops_capture_and_commit(tp, capture_list);
abort_error:
- xfs_refcount_finish_one_cleanup(tp, rcur, error);
xfs_trans_cancel(tp);
return error;
}
-STATIC bool
-xfs_cui_item_match(
- struct xfs_log_item *lip,
- uint64_t intent_id)
-{
- return CUI_ITEM(lip)->cui_format.cui_id == intent_id;
-}
-
/* Relog an intent item to push the log tail forward. */
static struct xfs_log_item *
-xfs_cui_item_relog(
+xfs_refcount_relog_intent(
+ struct xfs_trans *tp,
struct xfs_log_item *intent,
- struct xfs_trans *tp)
+ struct xfs_log_item *done_item)
{
- struct xfs_cud_log_item *cudp;
struct xfs_cui_log_item *cuip;
struct xfs_phys_extent *pmap;
unsigned int count;
@@ -622,27 +515,41 @@ xfs_cui_item_relog(
count = CUI_ITEM(intent)->cui_format.cui_nextents;
pmap = CUI_ITEM(intent)->cui_format.cui_extents;
- tp->t_flags |= XFS_TRANS_DIRTY;
- cudp = xfs_trans_get_cud(tp, CUI_ITEM(intent));
- set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags);
-
cuip = xfs_cui_init(tp->t_mountp, count);
memcpy(cuip->cui_format.cui_extents, pmap, count * sizeof(*pmap));
atomic_set(&cuip->cui_next_extent, count);
- xfs_trans_add_item(tp, &cuip->cui_item);
- set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags);
+
return &cuip->cui_item;
}
+const struct xfs_defer_op_type xfs_refcount_update_defer_type = {
+ .name = "refcount",
+ .max_items = XFS_CUI_MAX_FAST_EXTENTS,
+ .create_intent = xfs_refcount_update_create_intent,
+ .abort_intent = xfs_refcount_update_abort_intent,
+ .create_done = xfs_refcount_update_create_done,
+ .finish_item = xfs_refcount_update_finish_item,
+ .finish_cleanup = xfs_refcount_finish_one_cleanup,
+ .cancel_item = xfs_refcount_update_cancel_item,
+ .recover_work = xfs_refcount_recover_work,
+ .relog_intent = xfs_refcount_relog_intent,
+};
+
+STATIC bool
+xfs_cui_item_match(
+ struct xfs_log_item *lip,
+ uint64_t intent_id)
+{
+ return CUI_ITEM(lip)->cui_format.cui_id == intent_id;
+}
+
static const struct xfs_item_ops xfs_cui_item_ops = {
.flags = XFS_ITEM_INTENT,
.iop_size = xfs_cui_item_size,
.iop_format = xfs_cui_item_format,
.iop_unpin = xfs_cui_item_unpin,
.iop_release = xfs_cui_item_release,
- .iop_recover = xfs_cui_item_recover,
.iop_match = xfs_cui_item_match,
- .iop_relog = xfs_cui_item_relog,
};
static inline void
@@ -696,12 +603,9 @@ xlog_recover_cui_commit_pass2(
cuip = xfs_cui_init(mp, cui_formatp->cui_nextents);
xfs_cui_copy_format(&cuip->cui_format, cui_formatp);
atomic_set(&cuip->cui_next_extent, cui_formatp->cui_nextents);
- /*
- * Insert the intent into the AIL directly and drop one reference so
- * that finishing or canceling the work will drop the other.
- */
- xfs_trans_ail_insert(log->l_ailp, &cuip->cui_item, lsn);
- xfs_cui_release(cuip);
+
+ xlog_recover_intent_item(log, &cuip->cui_item, lsn,
+ &xfs_refcount_update_defer_type);
return 0;
}
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index e5b62dc28466..d5ca8bcae65b 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -618,7 +618,7 @@ xfs_reflink_cancel_cow_blocks(
error = xfs_free_extent_later(*tpp, del.br_startblock,
del.br_blockcount, NULL,
- XFS_AG_RESV_NONE);
+ XFS_AG_RESV_NONE, false);
if (error)
break;
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 0e0e747028da..79ad0087aeca 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -225,23 +225,6 @@ static const struct xfs_item_ops xfs_rud_item_ops = {
.iop_intent = xfs_rud_item_intent,
};
-static struct xfs_rud_log_item *
-xfs_trans_get_rud(
- struct xfs_trans *tp,
- struct xfs_rui_log_item *ruip)
-{
- struct xfs_rud_log_item *rudp;
-
- rudp = kmem_cache_zalloc(xfs_rud_cache, GFP_KERNEL | __GFP_NOFAIL);
- xfs_log_item_init(tp->t_mountp, &rudp->rud_item, XFS_LI_RUD,
- &xfs_rud_item_ops);
- rudp->rud_ruip = ruip;
- rudp->rud_format.rud_rui_id = ruip->rui_format.rui_id;
-
- xfs_trans_add_item(tp, &rudp->rud_item);
- return rudp;
-}
-
/* Set the map extent flags for this reverse mapping. */
static void
xfs_trans_set_rmap_flags(
@@ -285,35 +268,6 @@ xfs_trans_set_rmap_flags(
}
}
-/*
- * Finish an rmap update and log it to the RUD. Note that the transaction is
- * marked dirty regardless of whether the rmap update succeeds or fails to
- * support the RUI/RUD lifecycle rules.
- */
-static int
-xfs_trans_log_finish_rmap_update(
- struct xfs_trans *tp,
- struct xfs_rud_log_item *rudp,
- struct xfs_rmap_intent *ri,
- struct xfs_btree_cur **pcur)
-{
- int error;
-
- error = xfs_rmap_finish_one(tp, ri, pcur);
-
- /*
- * Mark the transaction dirty, even on error. This ensures the
- * transaction is aborted, which:
- *
- * 1.) releases the RUI and frees the RUD
- * 2.) shuts down the filesystem
- */
- tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE;
- set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags);
-
- return error;
-}
-
/* Sort rmap intents by AG. */
static int
xfs_rmap_update_diff_items(
@@ -340,9 +294,6 @@ xfs_rmap_update_log_item(
uint next_extent;
struct xfs_map_extent *map;
- tp->t_flags |= XFS_TRANS_DIRTY;
- set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags);
-
/*
* atomic_inc_return gives us the value after the increment;
* we want to use it as an array index so we need to subtract 1 from
@@ -372,7 +323,6 @@ xfs_rmap_update_create_intent(
ASSERT(count > 0);
- xfs_trans_add_item(tp, &ruip->rui_item);
if (sort)
list_sort(mp, items, xfs_rmap_update_diff_items);
list_for_each_entry(ri, items, ri_list)
@@ -387,7 +337,16 @@ xfs_rmap_update_create_done(
struct xfs_log_item *intent,
unsigned int count)
{
- return &xfs_trans_get_rud(tp, RUI_ITEM(intent))->rud_item;
+ struct xfs_rui_log_item *ruip = RUI_ITEM(intent);
+ struct xfs_rud_log_item *rudp;
+
+ rudp = kmem_cache_zalloc(xfs_rud_cache, GFP_KERNEL | __GFP_NOFAIL);
+ xfs_log_item_init(tp->t_mountp, &rudp->rud_item, XFS_LI_RUD,
+ &xfs_rud_item_ops);
+ rudp->rud_ruip = ruip;
+ rudp->rud_format.rud_rui_id = ruip->rui_format.rui_id;
+
+ return &rudp->rud_item;
}
/* Take a passive ref to the AG containing the space we're rmapping. */
@@ -423,8 +382,7 @@ xfs_rmap_update_finish_item(
ri = container_of(item, struct xfs_rmap_intent, ri_list);
- error = xfs_trans_log_finish_rmap_update(tp, RUD_ITEM(done), ri,
- state);
+ error = xfs_rmap_finish_one(tp, ri, state);
xfs_rmap_update_put_group(ri);
kmem_cache_free(xfs_rmap_intent_cache, ri);
@@ -452,16 +410,6 @@ xfs_rmap_update_cancel_item(
kmem_cache_free(xfs_rmap_intent_cache, ri);
}
-const struct xfs_defer_op_type xfs_rmap_update_defer_type = {
- .max_items = XFS_RUI_MAX_FAST_EXTENTS,
- .create_intent = xfs_rmap_update_create_intent,
- .abort_intent = xfs_rmap_update_abort_intent,
- .create_done = xfs_rmap_update_create_done,
- .finish_item = xfs_rmap_update_finish_item,
- .finish_cleanup = xfs_rmap_finish_one_cleanup,
- .cancel_item = xfs_rmap_update_cancel_item,
-};
-
/* Is this recovered RUI ok? */
static inline bool
xfs_rui_validate_map(
@@ -498,20 +446,72 @@ xfs_rui_validate_map(
return xfs_verify_fsbext(mp, map->me_startblock, map->me_len);
}
+static inline void
+xfs_rui_recover_work(
+ struct xfs_mount *mp,
+ struct xfs_defer_pending *dfp,
+ const struct xfs_map_extent *map)
+{
+ struct xfs_rmap_intent *ri;
+
+ ri = kmem_cache_alloc(xfs_rmap_intent_cache, GFP_NOFS | __GFP_NOFAIL);
+
+ switch (map->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) {
+ case XFS_RMAP_EXTENT_MAP:
+ ri->ri_type = XFS_RMAP_MAP;
+ break;
+ case XFS_RMAP_EXTENT_MAP_SHARED:
+ ri->ri_type = XFS_RMAP_MAP_SHARED;
+ break;
+ case XFS_RMAP_EXTENT_UNMAP:
+ ri->ri_type = XFS_RMAP_UNMAP;
+ break;
+ case XFS_RMAP_EXTENT_UNMAP_SHARED:
+ ri->ri_type = XFS_RMAP_UNMAP_SHARED;
+ break;
+ case XFS_RMAP_EXTENT_CONVERT:
+ ri->ri_type = XFS_RMAP_CONVERT;
+ break;
+ case XFS_RMAP_EXTENT_CONVERT_SHARED:
+ ri->ri_type = XFS_RMAP_CONVERT_SHARED;
+ break;
+ case XFS_RMAP_EXTENT_ALLOC:
+ ri->ri_type = XFS_RMAP_ALLOC;
+ break;
+ case XFS_RMAP_EXTENT_FREE:
+ ri->ri_type = XFS_RMAP_FREE;
+ break;
+ default:
+ ASSERT(0);
+ return;
+ }
+
+ ri->ri_owner = map->me_owner;
+ ri->ri_whichfork = (map->me_flags & XFS_RMAP_EXTENT_ATTR_FORK) ?
+ XFS_ATTR_FORK : XFS_DATA_FORK;
+ ri->ri_bmap.br_startblock = map->me_startblock;
+ ri->ri_bmap.br_startoff = map->me_startoff;
+ ri->ri_bmap.br_blockcount = map->me_len;
+ ri->ri_bmap.br_state = (map->me_flags & XFS_RMAP_EXTENT_UNWRITTEN) ?
+ XFS_EXT_UNWRITTEN : XFS_EXT_NORM;
+ xfs_rmap_update_get_group(mp, ri);
+
+ xfs_defer_add_item(dfp, &ri->ri_list);
+}
+
/*
* Process an rmap update intent item that was recovered from the log.
* We need to update the rmapbt.
*/
STATIC int
-xfs_rui_item_recover(
- struct xfs_log_item *lip,
+xfs_rmap_recover_work(
+ struct xfs_defer_pending *dfp,
struct list_head *capture_list)
{
struct xfs_trans_res resv;
+ struct xfs_log_item *lip = dfp->dfp_intent;
struct xfs_rui_log_item *ruip = RUI_ITEM(lip);
- struct xfs_rud_log_item *rudp;
struct xfs_trans *tp;
- struct xfs_btree_cur *rcur = NULL;
struct xfs_mount *mp = lip->li_log->l_mp;
int i;
int error = 0;
@@ -529,6 +529,8 @@ xfs_rui_item_recover(
sizeof(ruip->rui_format));
return -EFSCORRUPTED;
}
+
+ xfs_rui_recover_work(mp, dfp, &ruip->rui_format.rui_extents[i]);
}
resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate);
@@ -536,91 +538,29 @@ xfs_rui_item_recover(
XFS_TRANS_RESERVE, &tp);
if (error)
return error;
- rudp = xfs_trans_get_rud(tp, ruip);
- for (i = 0; i < ruip->rui_format.rui_nextents; i++) {
- struct xfs_rmap_intent fake = { };
- struct xfs_map_extent *map;
-
- map = &ruip->rui_format.rui_extents[i];
- switch (map->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) {
- case XFS_RMAP_EXTENT_MAP:
- fake.ri_type = XFS_RMAP_MAP;
- break;
- case XFS_RMAP_EXTENT_MAP_SHARED:
- fake.ri_type = XFS_RMAP_MAP_SHARED;
- break;
- case XFS_RMAP_EXTENT_UNMAP:
- fake.ri_type = XFS_RMAP_UNMAP;
- break;
- case XFS_RMAP_EXTENT_UNMAP_SHARED:
- fake.ri_type = XFS_RMAP_UNMAP_SHARED;
- break;
- case XFS_RMAP_EXTENT_CONVERT:
- fake.ri_type = XFS_RMAP_CONVERT;
- break;
- case XFS_RMAP_EXTENT_CONVERT_SHARED:
- fake.ri_type = XFS_RMAP_CONVERT_SHARED;
- break;
- case XFS_RMAP_EXTENT_ALLOC:
- fake.ri_type = XFS_RMAP_ALLOC;
- break;
- case XFS_RMAP_EXTENT_FREE:
- fake.ri_type = XFS_RMAP_FREE;
- break;
- default:
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- &ruip->rui_format,
- sizeof(ruip->rui_format));
- error = -EFSCORRUPTED;
- goto abort_error;
- }
-
- fake.ri_owner = map->me_owner;
- fake.ri_whichfork = (map->me_flags & XFS_RMAP_EXTENT_ATTR_FORK) ?
- XFS_ATTR_FORK : XFS_DATA_FORK;
- fake.ri_bmap.br_startblock = map->me_startblock;
- fake.ri_bmap.br_startoff = map->me_startoff;
- fake.ri_bmap.br_blockcount = map->me_len;
- fake.ri_bmap.br_state = (map->me_flags & XFS_RMAP_EXTENT_UNWRITTEN) ?
- XFS_EXT_UNWRITTEN : XFS_EXT_NORM;
-
- xfs_rmap_update_get_group(mp, &fake);
- error = xfs_trans_log_finish_rmap_update(tp, rudp, &fake,
- &rcur);
- if (error == -EFSCORRUPTED)
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- map, sizeof(*map));
- xfs_rmap_update_put_group(&fake);
- if (error)
- goto abort_error;
-
- }
+ error = xlog_recover_finish_intent(tp, dfp);
+ if (error == -EFSCORRUPTED)
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ &ruip->rui_format,
+ sizeof(ruip->rui_format));
+ if (error)
+ goto abort_error;
- xfs_rmap_finish_one_cleanup(tp, rcur, error);
return xfs_defer_ops_capture_and_commit(tp, capture_list);
abort_error:
- xfs_rmap_finish_one_cleanup(tp, rcur, error);
xfs_trans_cancel(tp);
return error;
}
-STATIC bool
-xfs_rui_item_match(
- struct xfs_log_item *lip,
- uint64_t intent_id)
-{
- return RUI_ITEM(lip)->rui_format.rui_id == intent_id;
-}
-
/* Relog an intent item to push the log tail forward. */
static struct xfs_log_item *
-xfs_rui_item_relog(
+xfs_rmap_relog_intent(
+ struct xfs_trans *tp,
struct xfs_log_item *intent,
- struct xfs_trans *tp)
+ struct xfs_log_item *done_item)
{
- struct xfs_rud_log_item *rudp;
struct xfs_rui_log_item *ruip;
struct xfs_map_extent *map;
unsigned int count;
@@ -628,27 +568,41 @@ xfs_rui_item_relog(
count = RUI_ITEM(intent)->rui_format.rui_nextents;
map = RUI_ITEM(intent)->rui_format.rui_extents;
- tp->t_flags |= XFS_TRANS_DIRTY;
- rudp = xfs_trans_get_rud(tp, RUI_ITEM(intent));
- set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags);
-
ruip = xfs_rui_init(tp->t_mountp, count);
memcpy(ruip->rui_format.rui_extents, map, count * sizeof(*map));
atomic_set(&ruip->rui_next_extent, count);
- xfs_trans_add_item(tp, &ruip->rui_item);
- set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags);
+
return &ruip->rui_item;
}
+const struct xfs_defer_op_type xfs_rmap_update_defer_type = {
+ .name = "rmap",
+ .max_items = XFS_RUI_MAX_FAST_EXTENTS,
+ .create_intent = xfs_rmap_update_create_intent,
+ .abort_intent = xfs_rmap_update_abort_intent,
+ .create_done = xfs_rmap_update_create_done,
+ .finish_item = xfs_rmap_update_finish_item,
+ .finish_cleanup = xfs_rmap_finish_one_cleanup,
+ .cancel_item = xfs_rmap_update_cancel_item,
+ .recover_work = xfs_rmap_recover_work,
+ .relog_intent = xfs_rmap_relog_intent,
+};
+
+STATIC bool
+xfs_rui_item_match(
+ struct xfs_log_item *lip,
+ uint64_t intent_id)
+{
+ return RUI_ITEM(lip)->rui_format.rui_id == intent_id;
+}
+
static const struct xfs_item_ops xfs_rui_item_ops = {
.flags = XFS_ITEM_INTENT,
.iop_size = xfs_rui_item_size,
.iop_format = xfs_rui_item_format,
.iop_unpin = xfs_rui_item_unpin,
.iop_release = xfs_rui_item_release,
- .iop_recover = xfs_rui_item_recover,
.iop_match = xfs_rui_item_match,
- .iop_relog = xfs_rui_item_relog,
};
static inline void
@@ -702,12 +656,9 @@ xlog_recover_rui_commit_pass2(
ruip = xfs_rui_init(mp, rui_formatp->rui_nextents);
xfs_rui_copy_format(&ruip->rui_format, rui_formatp);
atomic_set(&ruip->rui_next_extent, rui_formatp->rui_nextents);
- /*
- * Insert the intent into the AIL directly and drop one reference so
- * that finishing or canceling the work will drop the other.
- */
- xfs_trans_ail_insert(log->l_ailp, &ruip->rui_item, lsn);
- xfs_rui_release(ruip);
+
+ xlog_recover_intent_item(log, &ruip->rui_item, lsn,
+ &xfs_rmap_update_defer_type);
return 0;
}
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 88c48de5c9c8..8649d981a097 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -14,28 +14,14 @@
#include "xfs_inode.h"
#include "xfs_bmap.h"
#include "xfs_bmap_btree.h"
+#include "xfs_bmap_util.h"
#include "xfs_trans.h"
#include "xfs_trans_space.h"
#include "xfs_icache.h"
#include "xfs_rtalloc.h"
#include "xfs_sb.h"
#include "xfs_rtbitmap.h"
-
-/*
- * Read and return the summary information for a given extent size,
- * bitmap block combination.
- * Keeps track of a current summary block, so we don't keep reading
- * it from the buffer cache.
- */
-static int
-xfs_rtget_summary(
- struct xfs_rtalloc_args *args,
- int log, /* log2 of extent size */
- xfs_fileoff_t bbno, /* bitmap block number */
- xfs_suminfo_t *sum) /* out: summary info for this block */
-{
- return xfs_rtmodify_summary_int(args, log, bbno, 0, sum);
-}
+#include "xfs_quota.h"
/*
* Return whether there are any free extents in the size range given
@@ -154,56 +140,55 @@ xfs_rtallocate_range(
* properly update the summary.
*/
error = xfs_rtfind_back(args, start, 0, &preblock);
- if (error) {
+ if (error)
return error;
- }
+
/*
* Find the next allocated block (end of free extent).
*/
error = xfs_rtfind_forw(args, end, mp->m_sb.sb_rextents - 1,
&postblock);
- if (error) {
+ if (error)
return error;
- }
+
/*
* Decrement the summary information corresponding to the entire
* (old) free extent.
*/
error = xfs_rtmodify_summary(args,
- XFS_RTBLOCKLOG(postblock + 1 - preblock),
+ xfs_highbit64(postblock + 1 - preblock),
xfs_rtx_to_rbmblock(mp, preblock), -1);
- if (error) {
+ if (error)
return error;
- }
+
/*
* If there are blocks not being allocated at the front of the
* old extent, add summary data for them to be free.
*/
if (preblock < start) {
error = xfs_rtmodify_summary(args,
- XFS_RTBLOCKLOG(start - preblock),
+ xfs_highbit64(start - preblock),
xfs_rtx_to_rbmblock(mp, preblock), 1);
- if (error) {
+ if (error)
return error;
- }
}
+
/*
* If there are blocks not being allocated at the end of the
* old extent, add summary data for them to be free.
*/
if (postblock > end) {
error = xfs_rtmodify_summary(args,
- XFS_RTBLOCKLOG(postblock - end),
+ xfs_highbit64(postblock - end),
xfs_rtx_to_rbmblock(mp, end + 1), 1);
- if (error) {
+ if (error)
return error;
- }
}
+
/*
* Modify the bitmap to mark this extent allocated.
*/
- error = xfs_rtmodify_range(args, start, len, 0);
- return error;
+ return xfs_rtmodify_range(args, start, len, 0);
}
/*
@@ -265,21 +250,17 @@ xfs_rtallocate_extent_block(
* If it's not so then next will contain the first non-free.
*/
error = xfs_rtcheck_range(args, i, maxlen, 1, &next, &stat);
- if (error) {
+ if (error)
return error;
- }
if (stat) {
/*
* i for maxlen is all free, allocate and return that.
*/
- error = xfs_rtallocate_range(args, i, maxlen);
- if (error) {
- return error;
- }
- *len = maxlen;
- *rtx = i;
- return 0;
+ bestlen = maxlen;
+ besti = i;
+ goto allocate;
}
+
/*
* In the case where we have a variable-sized allocation
* request, figure out how big this free piece is,
@@ -298,45 +279,44 @@ xfs_rtallocate_extent_block(
/*
* If not done yet, find the start of the next free space.
*/
- if (next < end) {
- error = xfs_rtfind_forw(args, next, end, &i);
- if (error) {
- return error;
- }
- } else
+ if (next >= end)
break;
+ error = xfs_rtfind_forw(args, next, end, &i);
+ if (error)
+ return error;
}
+
/*
* Searched the whole thing & didn't find a maxlen free extent.
*/
- if (minlen < maxlen && besti != -1) {
- xfs_rtxlen_t p; /* amount to trim length by */
-
+ if (minlen > maxlen || besti == -1) {
/*
- * If size should be a multiple of prod, make that so.
+ * Allocation failed. Set *nextp to the next block to try.
*/
- if (prod > 1) {
- div_u64_rem(bestlen, prod, &p);
- if (p)
- bestlen -= p;
- }
+ *nextp = next;
+ return -ENOSPC;
+ }
- /*
- * Allocate besti for bestlen & return that.
- */
- error = xfs_rtallocate_range(args, besti, bestlen);
- if (error) {
- return error;
- }
- *len = bestlen;
- *rtx = besti;
- return 0;
+ /*
+ * If size should be a multiple of prod, make that so.
+ */
+ if (prod > 1) {
+ xfs_rtxlen_t p; /* amount to trim length by */
+
+ div_u64_rem(bestlen, prod, &p);
+ if (p)
+ bestlen -= p;
}
+
/*
- * Allocation failed. Set *nextp to the next block to try.
+ * Allocate besti for bestlen & return that.
*/
- *nextp = next;
- *rtx = NULLRTEXTNO;
+allocate:
+ error = xfs_rtallocate_range(args, besti, bestlen);
+ if (error)
+ return error;
+ *len = bestlen;
+ *rtx = besti;
return 0;
}
@@ -367,52 +347,33 @@ xfs_rtallocate_extent_exact(
* Check if the range in question (for maxlen) is free.
*/
error = xfs_rtcheck_range(args, start, maxlen, 1, &next, &isfree);
- if (error) {
+ if (error)
return error;
- }
- if (isfree) {
+
+ if (!isfree) {
/*
- * If it is, allocate it and return success.
+ * If not, allocate what there is, if it's at least minlen.
*/
- error = xfs_rtallocate_range(args, start, maxlen);
- if (error) {
- return error;
- }
- *len = maxlen;
- *rtx = start;
- return 0;
- }
- /*
- * If not, allocate what there is, if it's at least minlen.
- */
- maxlen = next - start;
- if (maxlen < minlen) {
+ maxlen = next - start;
+ if (maxlen < minlen)
+ return -ENOSPC;
+
/*
- * Failed, return failure status.
+ * Trim off tail of extent, if prod is specified.
*/
- *rtx = NULLRTEXTNO;
- return 0;
- }
- /*
- * Trim off tail of extent, if prod is specified.
- */
- if (prod > 1 && (i = maxlen % prod)) {
- maxlen -= i;
- if (maxlen < minlen) {
- /*
- * Now we can't do it, return failure status.
- */
- *rtx = NULLRTEXTNO;
- return 0;
+ if (prod > 1 && (i = maxlen % prod)) {
+ maxlen -= i;
+ if (maxlen < minlen)
+ return -ENOSPC;
}
}
+
/*
* Allocate what we can and return it.
*/
error = xfs_rtallocate_range(args, start, maxlen);
- if (error) {
+ if (error)
return error;
- }
*len = maxlen;
*rtx = start;
return 0;
@@ -441,7 +402,6 @@ xfs_rtallocate_extent_near(
int j; /* secondary loop control */
int log2len; /* log2 of minlen */
xfs_rtxnum_t n; /* next rtext to try */
- xfs_rtxnum_t r; /* result rtext */
ASSERT(minlen % prod == 0);
ASSERT(maxlen % prod == 0);
@@ -455,26 +415,18 @@ xfs_rtallocate_extent_near(
/* Make sure we don't run off the end of the rt volume. */
maxlen = xfs_rtallocate_clamp_len(mp, start, maxlen, prod);
- if (maxlen < minlen) {
- *rtx = NULLRTEXTNO;
- return 0;
- }
+ if (maxlen < minlen)
+ return -ENOSPC;
/*
* Try the exact allocation first.
*/
error = xfs_rtallocate_extent_exact(args, start, minlen, maxlen, len,
- prod, &r);
- if (error) {
+ prod, rtx);
+ if (error != -ENOSPC)
return error;
- }
- /*
- * If the exact allocation worked, return that.
- */
- if (r != NULLRTEXTNO) {
- *rtx = r;
- return 0;
- }
+
bbno = xfs_rtx_to_rbmblock(mp, start);
i = 0;
j = -1;
@@ -490,9 +442,9 @@ xfs_rtallocate_extent_near(
*/
error = xfs_rtany_summary(args, log2len, mp->m_rsumlevels - 1,
bbno + i, &maxlog);
- if (error) {
+ if (error)
return error;
- }
+
/*
* If there are any useful extents starting here, try
* allocating one.
@@ -511,17 +463,9 @@ xfs_rtallocate_extent_near(
*/
error = xfs_rtallocate_extent_block(args,
bbno + i, minlen, maxavail, len,
- &n, prod, &r);
- if (error) {
+ &n, prod, rtx);
+ if (error != -ENOSPC)
return error;
- }
- /*
- * If it worked, return it.
- */
- if (r != NULLRTEXTNO) {
- *rtx = r;
- return 0;
- }
}
/*
* On the negative side of the starting location.
@@ -555,17 +499,9 @@ xfs_rtallocate_extent_near(
error = xfs_rtallocate_extent_block(args,
bbno + j, minlen,
maxavail, len, &n, prod,
- &r);
- if (error) {
+ rtx);
+ if (error != -ENOSPC)
return error;
- }
- /*
- * If it works, return the extent.
- */
- if (r != NULLRTEXTNO) {
- *rtx = r;
- return 0;
- }
}
}
}
@@ -599,8 +535,53 @@ xfs_rtallocate_extent_near(
else
break;
}
- *rtx = NULLRTEXTNO;
- return 0;
+ return -ENOSPC;
+}
+
+static int
+xfs_rtalloc_sumlevel(
+ struct xfs_rtalloc_args *args,
+ int l, /* level number */
+ xfs_rtxlen_t minlen, /* minimum length to allocate */
+ xfs_rtxlen_t maxlen, /* maximum length to allocate */
+ xfs_rtxlen_t prod, /* extent product factor */
+ xfs_rtxlen_t *len, /* out: actual length allocated */
+ xfs_rtxnum_t *rtx) /* out: start rtext allocated */
+{
+ xfs_fileoff_t i; /* bitmap block number */
+
+ for (i = 0; i < args->mp->m_sb.sb_rbmblocks; i++) {
+ xfs_suminfo_t sum; /* summary information for extents */
+ xfs_rtxnum_t n; /* next rtext to be tried */
+ int error;
+
+ error = xfs_rtget_summary(args, l, i, &sum);
+ if (error)
+ return error;
+
+ /*
+ * Nothing there, on to the next block.
+ */
+ if (!sum)
+ continue;
+
+ /*
+ * Try allocating the extent.
+ */
+ error = xfs_rtallocate_extent_block(args, i, minlen, maxlen,
+ len, &n, prod, rtx);
+ if (error != -ENOSPC)
+ return error;
+
+ /*
+ * If the "next block to try" returned from the allocator is
+ * beyond the next bitmap block, skip to that bitmap block.
+ */
+ if (xfs_rtx_to_rbmblock(args->mp, n) > i + 1)
+ i = xfs_rtx_to_rbmblock(args->mp, n) - 1;
+ }
+
+ return -ENOSPC;
}
/*
@@ -617,13 +598,8 @@ xfs_rtallocate_extent_size(
xfs_rtxlen_t prod, /* extent product factor */
xfs_rtxnum_t *rtx) /* out: start rtext allocated */
{
- struct xfs_mount *mp = args->mp;
int error;
- xfs_fileoff_t i; /* bitmap block number */
int l; /* level number (loop control) */
- xfs_rtxnum_t n; /* next rtext to be tried */
- xfs_rtxnum_t r; /* result rtext number */
- xfs_suminfo_t sum; /* summary information for extents */
ASSERT(minlen % prod == 0);
ASSERT(maxlen % prod == 0);
@@ -631,119 +607,46 @@ xfs_rtallocate_extent_size(
/*
* Loop over all the levels starting with maxlen.
- * At each level, look at all the bitmap blocks, to see if there
- * are extents starting there that are long enough (>= maxlen).
- * Note, only on the initial level can the allocation fail if
- * the summary says there's an extent.
+ *
+ * At each level, look at all the bitmap blocks, to see if there are
+ * extents starting there that are long enough (>= maxlen).
+ *
+ * Note, only on the initial level can the allocation fail if the
+ * summary says there's an extent.
*/
- for (l = xfs_highbit32(maxlen); l < mp->m_rsumlevels; l++) {
- /*
- * Loop over all the bitmap blocks.
- */
- for (i = 0; i < mp->m_sb.sb_rbmblocks; i++) {
- /*
- * Get the summary for this level/block.
- */
- error = xfs_rtget_summary(args, l, i, &sum);
- if (error) {
- return error;
- }
- /*
- * Nothing there, on to the next block.
- */
- if (!sum)
- continue;
- /*
- * Try allocating the extent.
- */
- error = xfs_rtallocate_extent_block(args, i, maxlen,
- maxlen, len, &n, prod, &r);
- if (error) {
- return error;
- }
- /*
- * If it worked, return that.
- */
- if (r != NULLRTEXTNO) {
- *rtx = r;
- return 0;
- }
- /*
- * If the "next block to try" returned from the
- * allocator is beyond the next bitmap block,
- * skip to that bitmap block.
- */
- if (xfs_rtx_to_rbmblock(mp, n) > i + 1)
- i = xfs_rtx_to_rbmblock(mp, n) - 1;
- }
+ for (l = xfs_highbit32(maxlen); l < args->mp->m_rsumlevels; l++) {
+ error = xfs_rtalloc_sumlevel(args, l, minlen, maxlen, prod, len,
+ rtx);
+ if (error != -ENOSPC)
+ return error;
}
+
/*
- * Didn't find any maxlen blocks. Try smaller ones, unless
- * we're asking for a fixed size extent.
+ * Didn't find any maxlen blocks. Try smaller ones, unless we are
+ * looking for a fixed size extent.
*/
- if (minlen > --maxlen) {
- *rtx = NULLRTEXTNO;
- return 0;
- }
+ if (minlen > --maxlen)
+ return -ENOSPC;
ASSERT(minlen != 0);
ASSERT(maxlen != 0);
/*
* Loop over sizes, from maxlen down to minlen.
- * This time, when we do the allocations, allow smaller ones
- * to succeed.
+ *
+ * This time, when we do the allocations, allow smaller ones to succeed,
+ * but make sure the specified minlen/maxlen are in the possible range
+ * for this summary level.
*/
for (l = xfs_highbit32(maxlen); l >= xfs_highbit32(minlen); l--) {
- /*
- * Loop over all the bitmap blocks, try an allocation
- * starting in that block.
- */
- for (i = 0; i < mp->m_sb.sb_rbmblocks; i++) {
- /*
- * Get the summary information for this level/block.
- */
- error = xfs_rtget_summary(args, l, i, &sum);
- if (error) {
- return error;
- }
- /*
- * If nothing there, go on to next.
- */
- if (!sum)
- continue;
- /*
- * Try the allocation. Make sure the specified
- * minlen/maxlen are in the possible range for
- * this summary level.
- */
- error = xfs_rtallocate_extent_block(args, i,
- XFS_RTMAX(minlen, 1 << l),
- XFS_RTMIN(maxlen, (1 << (l + 1)) - 1),
- len, &n, prod, &r);
- if (error) {
- return error;
- }
- /*
- * If it worked, return that extent.
- */
- if (r != NULLRTEXTNO) {
- *rtx = r;
- return 0;
- }
- /*
- * If the "next block to try" returned from the
- * allocator is beyond the next bitmap block,
- * skip to that bitmap block.
- */
- if (xfs_rtx_to_rbmblock(mp, n) > i + 1)
- i = xfs_rtx_to_rbmblock(mp, n) - 1;
- }
+ error = xfs_rtalloc_sumlevel(args, l,
+ max_t(xfs_rtxlen_t, minlen, 1 << l),
+ min_t(xfs_rtxlen_t, maxlen, (1 << (l + 1)) - 1),
+ prod, len, rtx);
+ if (error != -ENOSPC)
+ return error;
}
- /*
- * Got nothing, return failure.
- */
- *rtx = NULLRTEXTNO;
- return 0;
+
+ return -ENOSPC;
}
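A worked example may help here. Summary level l tracks free extents of length 2^l to 2^(l+1)-1 rtextents, which is why the second pass clamps minlen/maxlen per level. The standalone demo below (illustrative only, toy numbers) prints the clamped range the allocator would probe at each level for a 6..19 rtextent request:

#include <stdio.h>

static int highbit32(unsigned int v)
{
	return 31 - __builtin_clz(v);	/* index of the highest set bit */
}

int main(void)
{
	unsigned int minlen = 6, maxlen = 19;
	int l;

	for (l = highbit32(maxlen); l >= highbit32(minlen); l--) {
		unsigned int lo = minlen > (1U << l) ? minlen : (1U << l);
		unsigned int hi = maxlen < ((1U << (l + 1)) - 1) ?
				  maxlen : ((1U << (l + 1)) - 1);

		printf("level %d: probe free extents of %u..%u rtextents\n",
		       l, lo, hi);
	}
	return 0;
}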
/*
@@ -963,8 +866,10 @@ xfs_growfs_rt(
*/
nrextents = nrblocks;
do_div(nrextents, in->extsize);
+ if (!xfs_validate_rtextents(nrextents))
+ return -EINVAL;
nrbmblocks = xfs_rtbitmap_blockcount(mp, nrextents);
- nrextslog = xfs_highbit32(nrextents);
+ nrextslog = xfs_compute_rextslog(nrextents);
nrsumlevels = nrextslog + 1;
nrsumblocks = xfs_rtsummary_blockcount(mp, nrsumlevels, nrbmblocks);
nrsumsize = XFS_FSB_TO_B(mp, nrsumblocks);
@@ -1031,11 +936,14 @@ xfs_growfs_rt(
nsbp->sb_rblocks = min(nrblocks, nrblocks_step);
nsbp->sb_rextents = xfs_rtb_to_rtx(nmp, nsbp->sb_rblocks);
ASSERT(nsbp->sb_rextents != 0);
- nsbp->sb_rextslog = xfs_highbit32(nsbp->sb_rextents);
+ nsbp->sb_rextslog = xfs_compute_rextslog(nsbp->sb_rextents);
nrsumlevels = nmp->m_rsumlevels = nsbp->sb_rextslog + 1;
nrsumblocks = xfs_rtsummary_blockcount(mp, nrsumlevels,
nsbp->sb_rbmblocks);
nmp->m_rsumsize = nrsumsize = XFS_FSB_TO_B(mp, nrsumblocks);
+ /* recompute growfsrt reservation from new rsumsize */
+ xfs_trans_resv_calc(nmp, &nmp->m_resv);
+
/*
* Start a transaction, get the log reservation.
*/
@@ -1122,6 +1030,8 @@ error_cancel:
*/
mp->m_rsumlevels = nrsumlevels;
mp->m_rsumsize = nrsumsize;
+ /* recompute growfsrt reservation from new rsumsize */
+ xfs_trans_resv_calc(mp, &mp->m_resv);
error = xfs_trans_commit(tp);
if (error)
@@ -1160,81 +1070,6 @@ out_free:
}
/*
- * Allocate an extent in the realtime subvolume, with the usual allocation
- * parameters. The length units are all in realtime extents, as is the
- * result block number.
- */
-int
-xfs_rtallocate_extent(
- struct xfs_trans *tp,
- xfs_rtxnum_t start, /* starting rtext number to allocate */
- xfs_rtxlen_t minlen, /* minimum length to allocate */
- xfs_rtxlen_t maxlen, /* maximum length to allocate */
- xfs_rtxlen_t *len, /* out: actual length allocated */
- int wasdel, /* was a delayed allocation extent */
- xfs_rtxlen_t prod, /* extent product factor */
- xfs_rtxnum_t *rtblock) /* out: start rtext allocated */
-{
- struct xfs_rtalloc_args args = {
- .mp = tp->t_mountp,
- .tp = tp,
- };
- int error; /* error value */
- xfs_rtxnum_t r; /* result allocated rtext */
-
- ASSERT(xfs_isilocked(args.mp->m_rbmip, XFS_ILOCK_EXCL));
- ASSERT(minlen > 0 && minlen <= maxlen);
-
- /*
- * If prod is set then figure out what to do to minlen and maxlen.
- */
- if (prod > 1) {
- xfs_rtxlen_t i;
-
- if ((i = maxlen % prod))
- maxlen -= i;
- if ((i = minlen % prod))
- minlen += prod - i;
- if (maxlen < minlen) {
- *rtblock = NULLRTEXTNO;
- return 0;
- }
- }
-
-retry:
- if (start == 0) {
- error = xfs_rtallocate_extent_size(&args, minlen,
- maxlen, len, prod, &r);
- } else {
- error = xfs_rtallocate_extent_near(&args, start, minlen,
- maxlen, len, prod, &r);
- }
-
- xfs_rtbuf_cache_relse(&args);
- if (error)
- return error;
-
- /*
- * If it worked, update the superblock.
- */
- if (r != NULLRTEXTNO) {
- long slen = (long)*len;
-
- ASSERT(*len >= minlen && *len <= maxlen);
- if (wasdel)
- xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FREXTENTS, -slen);
- else
- xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, -slen);
- } else if (prod > 1) {
- prod = 1;
- goto retry;
- }
-
- *rtblock = r;
- return 0;
-}
-
-/*
* Initialize realtime fields in the mount structure.
*/
int /* error */
@@ -1412,7 +1247,7 @@ xfs_rtunmount_inodes(
* of rtextents and the fraction.
* The fraction sequence is 0, 1/2, 1/4, 3/4, 1/8, ..., 7/8, 1/16, ...
*/
-int /* error */
+static int
xfs_rtpick_extent(
xfs_mount_t *mp, /* file system mount point */
xfs_trans_t *tp, /* transaction pointer */
@@ -1451,3 +1286,177 @@ xfs_rtpick_extent(
*pick = b;
return 0;
}
+
+static void
+xfs_rtalloc_align_minmax(
+ xfs_rtxlen_t *raminlen,
+ xfs_rtxlen_t *ramaxlen,
+ xfs_rtxlen_t *prod)
+{
+ xfs_rtxlen_t newmaxlen = *ramaxlen;
+ xfs_rtxlen_t newminlen = *raminlen;
+ xfs_rtxlen_t slack;
+
+ slack = newmaxlen % *prod;
+ if (slack)
+ newmaxlen -= slack;
+ slack = newminlen % *prod;
+ if (slack)
+ newminlen += *prod - slack;
+
+ /*
+ * If adjusting for extent size hint alignment produces an invalid
+ * min/max len combination, go ahead without it.
+ */
+ if (newmaxlen < newminlen) {
+ *prod = 1;
+ return;
+ }
+ *ramaxlen = newmaxlen;
+ *raminlen = newminlen;
+}
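A quick worked example of the rounding (toy numbers, illustrative only): with prod = 4 and a requested range of 6..13 rtextents, maxlen rounds down to 12 and minlen rounds up to 8; a 6..7 request would round to an impossible 8..4, so the helper falls back to prod = 1 instead:

#include <stdio.h>

int main(void)
{
	unsigned int prod = 4, minlen = 6, maxlen = 13;
	unsigned int newmax = maxlen - maxlen % prod;			/* 12 */
	unsigned int newmin = minlen + (prod - minlen % prod) % prod;	/* 8 */

	if (newmax < newmin) {
		prod = 1;	/* alignment impossible, drop the hint */
	} else {
		minlen = newmin;
		maxlen = newmax;
	}
	printf("allocate %u..%u rtextents in multiples of %u\n",
	       minlen, maxlen, prod);
	return 0;
}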
+
+int
+xfs_bmap_rtalloc(
+ struct xfs_bmalloca *ap)
+{
+ struct xfs_mount *mp = ap->ip->i_mount;
+ xfs_fileoff_t orig_offset = ap->offset;
+ xfs_rtxnum_t start; /* allocation hint rtextent no */
+ xfs_rtxnum_t rtx; /* actually allocated rtextent no */
+ xfs_rtxlen_t prod = 0; /* product factor for allocators */
+	xfs_extlen_t		mod = 0;   /* offset alignment remainder */
+ xfs_rtxlen_t ralen = 0; /* realtime allocation length */
+ xfs_extlen_t align; /* minimum allocation alignment */
+ xfs_extlen_t orig_length = ap->length;
+ xfs_extlen_t minlen = mp->m_sb.sb_rextsize;
+ xfs_rtxlen_t raminlen;
+ bool rtlocked = false;
+ bool ignore_locality = false;
+ struct xfs_rtalloc_args args = {
+ .mp = mp,
+ .tp = ap->tp,
+ };
+ int error;
+
+ align = xfs_get_extsz_hint(ap->ip);
+retry:
+ error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
+ align, 1, ap->eof, 0,
+ ap->conv, &ap->offset, &ap->length);
+ if (error)
+ return error;
+ ASSERT(ap->length);
+ ASSERT(xfs_extlen_to_rtxmod(mp, ap->length) == 0);
+
+ /*
+ * If we shifted the file offset downward to satisfy an extent size
+ * hint, increase minlen by that amount so that the allocator won't
+ * give us an allocation that's too short to cover at least one of the
+ * blocks that the caller asked for.
+ */
+ if (ap->offset != orig_offset)
+ minlen += orig_offset - ap->offset;
+
+ /*
+ * Set ralen to be the actual requested length in rtextents.
+ *
+	 * If the old value was close enough to XFS_MAX_BMBT_EXTLEN that
+	 * we rounded up to it, cut it back so it's valid again.
+	 * Note that if it's a really large request (bigger than
+	 * XFS_MAX_BMBT_EXTLEN), we don't hear about that number, and can't
+ * adjust the starting point to match it.
+ */
+ ralen = xfs_extlen_to_rtxlen(mp, min(ap->length, XFS_MAX_BMBT_EXTLEN));
+ raminlen = max_t(xfs_rtxlen_t, 1, xfs_extlen_to_rtxlen(mp, minlen));
+ ASSERT(raminlen > 0);
+ ASSERT(raminlen <= ralen);
+
+ /*
+ * Lock out modifications to both the RT bitmap and summary inodes
+ */
+ if (!rtlocked) {
+ xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL|XFS_ILOCK_RTBITMAP);
+ xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL);
+ xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL|XFS_ILOCK_RTSUM);
+ xfs_trans_ijoin(ap->tp, mp->m_rsumip, XFS_ILOCK_EXCL);
+ rtlocked = true;
+ }
+
+ if (ignore_locality) {
+ start = 0;
+ } else if (xfs_bmap_adjacent(ap)) {
+ start = xfs_rtb_to_rtx(mp, ap->blkno);
+ } else if (ap->eof && ap->offset == 0) {
+ /*
+ * If it's an allocation to an empty file at offset 0, pick an
+ * extent that will space things out in the rt area.
+ */
+ error = xfs_rtpick_extent(mp, ap->tp, ralen, &start);
+ if (error)
+ return error;
+ } else {
+ start = 0;
+ }
+
+ /*
+ * Only bother calculating a real prod factor if offset & length are
+ * perfectly aligned, otherwise it will just get us in trouble.
+ */
+ div_u64_rem(ap->offset, align, &mod);
+ if (mod || ap->length % align) {
+ prod = 1;
+ } else {
+ prod = xfs_extlen_to_rtxlen(mp, align);
+ if (prod > 1)
+ xfs_rtalloc_align_minmax(&raminlen, &ralen, &prod);
+ }
+
+ if (start) {
+ error = xfs_rtallocate_extent_near(&args, start, raminlen,
+ ralen, &ralen, prod, &rtx);
+ } else {
+ error = xfs_rtallocate_extent_size(&args, raminlen,
+ ralen, &ralen, prod, &rtx);
+ }
+ xfs_rtbuf_cache_relse(&args);
+
+ if (error == -ENOSPC) {
+ if (align > mp->m_sb.sb_rextsize) {
+ /*
+ * We previously enlarged the request length to try to
+ * satisfy an extent size hint. The allocator didn't
+ * return anything, so reset the parameters to the
+ * original values and try again without alignment
+ * criteria.
+ */
+ ap->offset = orig_offset;
+ ap->length = orig_length;
+ minlen = align = mp->m_sb.sb_rextsize;
+ goto retry;
+ }
+
+ if (!ignore_locality && start != 0) {
+ /*
+ * If we can't allocate near a specific rt extent, try
+ * again without locality criteria.
+ */
+ ignore_locality = true;
+ goto retry;
+ }
+
+ ap->blkno = NULLFSBLOCK;
+ ap->length = 0;
+ return 0;
+ }
+ if (error)
+ return error;
+
+ xfs_trans_mod_sb(ap->tp, ap->wasdel ?
+ XFS_TRANS_SB_RES_FREXTENTS : XFS_TRANS_SB_FREXTENTS,
+ -(long)ralen);
+ ap->blkno = xfs_rtx_to_rtb(mp, rtx);
+ ap->length = xfs_rtxlen_to_extlen(mp, ralen);
+ xfs_bmap_alloc_account(ap);
+ return 0;
+}
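+
+/*
+ * Summary of the fallback ladder above: try an allocation honouring
+ * both the extent size hint alignment and the locality hint; on ENOSPC
+ * relax the alignment back to the rt extent size and retry; failing
+ * that, retry once more ignoring locality; only then hand back a
+ * NULLFSBLOCK, zero-length mapping so the caller sees out-of-space.
+ */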
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index f7cb9ffe51ca..a6836da9bebe 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -13,27 +13,6 @@ struct xfs_trans;
#ifdef CONFIG_XFS_RT
/*
- * Function prototypes for exported functions.
- */
-
-/*
- * Allocate an extent in the realtime subvolume, with the usual allocation
- * parameters. The length units are all in realtime extents, as is the
- * result block number.
- */
-int /* error */
-xfs_rtallocate_extent(
- struct xfs_trans *tp, /* transaction pointer */
- xfs_rtxnum_t start, /* starting rtext number to allocate */
- xfs_rtxlen_t minlen, /* minimum length to allocate */
- xfs_rtxlen_t maxlen, /* maximum length to allocate */
- xfs_rtxlen_t *len, /* out: actual length allocated */
- int wasdel, /* was a delayed allocation extent */
- xfs_rtxlen_t prod, /* extent product factor */
- xfs_rtxnum_t *rtblock); /* out: start rtext allocated */
-
-
-/*
* Initialize realtime fields in the mount structure.
*/
int /* error */
@@ -52,20 +31,6 @@ xfs_rtmount_inodes(
struct xfs_mount *mp); /* file system mount structure */
/*
- * Pick an extent for allocation at the start of a new realtime file.
- * Use the sequence number stored in the atime field of the bitmap inode.
- * Translate this to a fraction of the rtextents, and return the product
- * of rtextents and the fraction.
- * The fraction sequence is 0, 1/2, 1/4, 3/4, 1/8, ..., 7/8, 1/16, ...
- */
-int /* error */
-xfs_rtpick_extent(
- struct xfs_mount *mp, /* file system mount point */
- struct xfs_trans *tp, /* transaction pointer */
- xfs_rtxlen_t len, /* allocation length (rtextents) */
- xfs_rtxnum_t *pick); /* result rt extent */
-
-/*
* Grow the realtime area of the filesystem.
*/
int
@@ -75,8 +40,6 @@ xfs_growfs_rt(
int xfs_rtalloc_reinit_frextents(struct xfs_mount *mp);
#else
-# define xfs_rtallocate_extent(t,b,min,max,l,f,p,rb) (-ENOSYS)
-# define xfs_rtpick_extent(m,t,l,rb) (-ENOSYS)
# define xfs_growfs_rt(mp,in) (-ENOSYS)
# define xfs_rtalloc_reinit_frextents(m) (0)
static inline int /* error */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 764304595e8b..aff20ddd4a9f 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -366,8 +366,9 @@ xfs_blkdev_get(
{
int error = 0;
- *handlep = bdev_open_by_path(name, BLK_OPEN_READ | BLK_OPEN_WRITE,
- mp->m_super, &fs_holder_ops);
+ *handlep = bdev_open_by_path(name,
+ BLK_OPEN_READ | BLK_OPEN_WRITE | BLK_OPEN_RESTRICT_WRITES,
+ mp->m_super, &fs_holder_ops);
if (IS_ERR(*handlep)) {
error = PTR_ERR(*handlep);
*handlep = NULL;
@@ -439,18 +440,12 @@ xfs_open_devices(
int error;
/*
- * blkdev_put() can't be called under s_umount, see the comment
- * in get_tree_bdev() for more details
- */
- up_write(&sb->s_umount);
-
- /*
* Open real time and log devices - order is important.
*/
if (mp->m_logname) {
error = xfs_blkdev_get(mp, mp->m_logname, &logdev_handle);
if (error)
- goto out_relock;
+ return error;
}
if (mp->m_rtname) {
@@ -493,10 +488,7 @@ xfs_open_devices(
bdev_release(logdev_handle);
}
- error = 0;
-out_relock:
- down_write(&sb->s_umount);
- return error;
+ return 0;
out_free_rtdev_targ:
if (mp->m_rtdev_targp)
@@ -509,7 +501,7 @@ out_relock:
out_close_logdev:
if (logdev_handle)
bdev_release(logdev_handle);
- goto out_relock;
+ return error;
}
/*
@@ -759,10 +751,6 @@ static void
xfs_mount_free(
struct xfs_mount *mp)
{
- /*
- * Free the buftargs here because blkdev_put needs to be called outside
- * of sb->s_umount, which is held around the call to ->put_super.
- */
if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
xfs_free_buftarg(mp->m_logdev_targp);
if (mp->m_rtdev_targp)
@@ -906,10 +894,8 @@ xfs_fs_statfs(
STATIC void
xfs_save_resvblks(struct xfs_mount *mp)
{
- uint64_t resblks = 0;
-
mp->m_resblks_save = mp->m_resblks;
- xfs_reserve_blocks(mp, &resblks, NULL);
+ xfs_reserve_blocks(mp, 0);
}
STATIC void
@@ -923,7 +909,7 @@ xfs_restore_resvblks(struct xfs_mount *mp)
} else
resblks = xfs_default_resblks(mp);
- xfs_reserve_blocks(mp, &resblks, NULL);
+ xfs_reserve_blocks(mp, resblks);
}
/*
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 85e433df6a3f..92974a4414c8 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -23,6 +23,7 @@
#include "xfs_trans.h"
#include "xfs_ialloc.h"
#include "xfs_error.h"
+#include "xfs_health.h"
/* ----- Kernel only functions below ----- */
int
@@ -108,6 +109,8 @@ xfs_readlink(
if (xfs_is_shutdown(mp))
return -EIO;
+ if (xfs_ifork_zapped(ip, XFS_DATA_FORK))
+ return -EIO;
xfs_ilock(ip, XFS_ILOCK_SHARED);
@@ -128,10 +131,10 @@ xfs_readlink(
* The VFS crashes on a NULL pointer, so return -EFSCORRUPTED
* if if_data is junk.
*/
- if (XFS_IS_CORRUPT(ip->i_mount, !ip->i_df.if_u1.if_data))
+ if (XFS_IS_CORRUPT(ip->i_mount, !ip->i_df.if_data))
goto out;
- memcpy(link, ip->i_df.if_u1.if_data, pathlen + 1);
+ memcpy(link, ip->i_df.if_data, pathlen + 1);
error = 0;
} else {
error = xfs_readlink_bmap_ilocked(ip, link);
diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
index f78ad6b10ea5..276696a07040 100644
--- a/fs/xfs/xfs_sysctl.h
+++ b/fs/xfs/xfs_sysctl.h
@@ -85,6 +85,8 @@ struct xfs_globals {
int pwork_threads; /* parallel workqueue threads */
bool larp; /* log attribute replay */
#endif
+ int bload_leaf_slack; /* btree bulk load leaf slack */
+ int bload_node_slack; /* btree bulk load node slack */
int log_recovery_delay; /* log recovery delay (secs) */
int mount_delay; /* mount setup delay (secs) */
bool bug_on_assert; /* BUG() the kernel on assert failure */
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index a3c6b1548723..17485666b672 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -229,6 +229,15 @@ pwork_threads_show(
}
XFS_SYSFS_ATTR_RW(pwork_threads);
+/*
+ * The "LARP" (Logged extended Attribute Recovery Persistence) debugging knob
+ * sets the XFS_DA_OP_LOGGED flag on all xfs_attr_set operations performed on
+ * V5 filesystems. As a result, the intermediate progress of all setxattr and
+ * removexattr operations is tracked via the log and can be restarted during
+ * recovery. This is useful for testing xattr recovery prior to merging of the
+ * parent pointer feature, which requires it to maintain consistency, and may be
+ * enabled for userspace xattrs in the future.
+ */
static ssize_t
larp_store(
struct kobject *kobject,
@@ -253,6 +262,58 @@ larp_show(
XFS_SYSFS_ATTR_RW(larp);
#endif /* DEBUG */
+STATIC ssize_t
+bload_leaf_slack_store(
+ struct kobject *kobject,
+ const char *buf,
+ size_t count)
+{
+ int ret;
+ int val;
+
+ ret = kstrtoint(buf, 0, &val);
+ if (ret)
+ return ret;
+
+ xfs_globals.bload_leaf_slack = val;
+ return count;
+}
+
+STATIC ssize_t
+bload_leaf_slack_show(
+ struct kobject *kobject,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.bload_leaf_slack);
+}
+XFS_SYSFS_ATTR_RW(bload_leaf_slack);
+
+STATIC ssize_t
+bload_node_slack_store(
+ struct kobject *kobject,
+ const char *buf,
+ size_t count)
+{
+ int ret;
+ int val;
+
+ ret = kstrtoint(buf, 0, &val);
+ if (ret)
+ return ret;
+
+ xfs_globals.bload_node_slack = val;
+ return count;
+}
+
+STATIC ssize_t
+bload_node_slack_show(
+ struct kobject *kobject,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.bload_node_slack);
+}
+XFS_SYSFS_ATTR_RW(bload_node_slack);
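+
+/*
+ * Both knobs are expected to appear under the xfs debug kobject, e.g.
+ * /sys/fs/xfs/debug/bload_leaf_slack. A small non-negative value
+ * reserves that many free record slots in each newly built btree
+ * block, while a negative value leaves the slack choice to the bulk
+ * loader's own heuristic.
+ */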
+
static struct attribute *xfs_dbg_attrs[] = {
ATTR_LIST(bug_on_assert),
ATTR_LIST(log_recovery_delay),
@@ -262,6 +323,8 @@ static struct attribute *xfs_dbg_attrs[] = {
ATTR_LIST(pwork_threads),
ATTR_LIST(larp),
#endif
+ ATTR_LIST(bload_leaf_slack),
+ ATTR_LIST(bload_node_slack),
NULL,
};
ATTRIBUTE_GROUPS(xfs_dbg);
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 3926cf7f2a6e..0984a1c884c7 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -67,6 +67,7 @@ struct xfs_buf_log_format;
struct xfs_inode_log_format;
struct xfs_bmbt_irec;
struct xfs_btree_cur;
+struct xfs_defer_op_type;
struct xfs_refcount_irec;
struct xfs_fsmap;
struct xfs_rmap_irec;
@@ -145,21 +146,23 @@ DEFINE_ATTR_LIST_EVENT(xfs_attr_leaf_list);
DEFINE_ATTR_LIST_EVENT(xfs_attr_node_list);
TRACE_EVENT(xlog_intent_recovery_failed,
- TP_PROTO(struct xfs_mount *mp, int error, void *function),
- TP_ARGS(mp, error, function),
+ TP_PROTO(struct xfs_mount *mp, const struct xfs_defer_op_type *ops,
+ int error),
+ TP_ARGS(mp, ops, error),
TP_STRUCT__entry(
__field(dev_t, dev)
+ __string(name, ops->name)
__field(int, error)
- __field(void *, function)
),
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
+ __assign_str(name, ops->name);
__entry->error = error;
- __entry->function = function;
),
- TP_printk("dev %d:%d error %d function %pS",
+ TP_printk("dev %d:%d optype %s error %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->error, __entry->function)
+ __get_str(name),
+ __entry->error)
);
DECLARE_EVENT_CLASS(xfs_perag_class,
@@ -2549,22 +2552,25 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_class,
TP_ARGS(mp, dfp),
TP_STRUCT__entry(
__field(dev_t, dev)
- __field(int, type)
+ __string(name, dfp->dfp_ops->name)
__field(void *, intent)
+ __field(unsigned int, flags)
__field(char, committed)
__field(int, nr)
),
TP_fast_assign(
__entry->dev = mp ? mp->m_super->s_dev : 0;
- __entry->type = dfp->dfp_type;
+ __assign_str(name, dfp->dfp_ops->name);
__entry->intent = dfp->dfp_intent;
+ __entry->flags = dfp->dfp_flags;
__entry->committed = dfp->dfp_done != NULL;
__entry->nr = dfp->dfp_count;
),
- TP_printk("dev %d:%d optype %d intent %p committed %d nr %d",
+ TP_printk("dev %d:%d optype %s intent %p flags %s committed %d nr %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->type,
+ __get_str(name),
__entry->intent,
+ __print_flags(__entry->flags, "|", XFS_DEFER_PENDING_STRINGS),
__entry->committed,
__entry->nr)
)
@@ -2675,6 +2681,9 @@ DEFINE_DEFER_PENDING_EVENT(xfs_defer_cancel_list);
DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_finish);
DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_abort);
DEFINE_DEFER_PENDING_EVENT(xfs_defer_relog_intent);
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_isolate_paused);
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_item_pause);
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_item_unpause);
#define DEFINE_BMAP_FREE_DEFERRED_EVENT DEFINE_PHYS_EXTENT_DEFERRED_EVENT
DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_defer);
@@ -2688,25 +2697,28 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_item_class,
TP_ARGS(mp, dfp, item),
TP_STRUCT__entry(
__field(dev_t, dev)
- __field(int, type)
+ __string(name, dfp->dfp_ops->name)
__field(void *, intent)
__field(void *, item)
__field(char, committed)
+ __field(unsigned int, flags)
__field(int, nr)
),
TP_fast_assign(
__entry->dev = mp ? mp->m_super->s_dev : 0;
- __entry->type = dfp->dfp_type;
+ __assign_str(name, dfp->dfp_ops->name);
__entry->intent = dfp->dfp_intent;
__entry->item = item;
__entry->committed = dfp->dfp_done != NULL;
+ __entry->flags = dfp->dfp_flags;
__entry->nr = dfp->dfp_count;
),
- TP_printk("dev %d:%d optype %d intent %p item %p committed %d nr %d",
+ TP_printk("dev %d:%d optype %s intent %p item %p flags %s committed %d nr %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->type,
+ __get_str(name),
__entry->intent,
__entry->item,
+ __print_flags(__entry->flags, "|", XFS_DEFER_PENDING_STRINGS),
__entry->committed,
__entry->nr)
)
@@ -4399,8 +4411,6 @@ DEFINE_DAS_STATE_EVENT(xfs_attr_remove_iter_return);
DEFINE_DAS_STATE_EVENT(xfs_attr_rmtval_alloc);
DEFINE_DAS_STATE_EVENT(xfs_attr_rmtval_remove_return);
DEFINE_DAS_STATE_EVENT(xfs_attr_defer_add);
-DEFINE_DAS_STATE_EVENT(xfs_attr_defer_replace);
-DEFINE_DAS_STATE_EVENT(xfs_attr_defer_remove);
TRACE_EVENT(xfs_force_shutdown,
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 305c9d07bf1b..12d45e93f07d 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1237,6 +1237,68 @@ out_cancel:
}
/*
+ * Try to reserve more blocks for a transaction.
+ *
+ * This is for callers that need to attach resources to a transaction, scan
+ * those resources to determine the space reservation requirements, and then
+ * modify the attached resources. In other words, online repair. This can
+ * fail due to ENOSPC, so the caller must be able to cancel the transaction
+ * without shutting down the fs.
+ */
+int
+xfs_trans_reserve_more(
+ struct xfs_trans *tp,
+ unsigned int blocks,
+ unsigned int rtextents)
+{
+ struct xfs_trans_res resv = { };
+
+ return xfs_trans_reserve(tp, &resv, blocks, rtextents);
+}
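+
+/*
+ * A hypothetical caller might look like:
+ *
+ *    error = xfs_trans_reserve_more(tp, resblks, 0);
+ *    if (error)
+ *        goto out_cancel;
+ *
+ * Cancelling the still-clean transaction on ENOSPC backs out without
+ * triggering a filesystem shutdown.
+ */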
+
+/*
+ * Try to reserve more blocks and file quota for a transaction. The same
+ * usage conditions as for xfs_trans_reserve_more apply.
+ */
+int
+xfs_trans_reserve_more_inode(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ unsigned int dblocks,
+ unsigned int rblocks,
+ bool force_quota)
+{
+ struct xfs_trans_res resv = { };
+ struct xfs_mount *mp = ip->i_mount;
+ unsigned int rtx = xfs_extlen_to_rtxlen(mp, rblocks);
+ int error;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+ error = xfs_trans_reserve(tp, &resv, dblocks, rtx);
+ if (error)
+ return error;
+
+ if (!XFS_IS_QUOTA_ON(mp) || xfs_is_quota_inode(&mp->m_sb, ip->i_ino))
+ return 0;
+
+ if (tp->t_flags & XFS_TRANS_RESERVE)
+ force_quota = true;
+
+ error = xfs_trans_reserve_quota_nblks(tp, ip, dblocks, rblocks,
+ force_quota);
+ if (!error)
+ return 0;
+
+ /* Quota failed, give back the new reservation. */
+ xfs_mod_fdblocks(mp, dblocks, tp->t_flags & XFS_TRANS_RESERVE);
+ tp->t_blk_res -= dblocks;
+ xfs_mod_frextents(mp, rtx);
+ tp->t_rtx_res -= rtx;
+ return error;
+}
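+
+/*
+ * Note the manual unwind above: only the blocks and rt extents added
+ * by this call are handed back on quota failure, and t_blk_res and
+ * t_rtx_res are trimmed to match, so a later xfs_trans_cancel()
+ * releases just the original reservation.
+ */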
+
+/*
 * Allocate a transaction in preparation for inode creation by reserving quota
 * against the given dquots. Callers are not required to hold any inode locks.
 */
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 6e3646d524ce..08ce757c7454 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -78,11 +78,7 @@ struct xfs_item_ops {
xfs_lsn_t (*iop_committed)(struct xfs_log_item *, xfs_lsn_t);
uint (*iop_push)(struct xfs_log_item *, struct list_head *);
void (*iop_release)(struct xfs_log_item *);
- int (*iop_recover)(struct xfs_log_item *lip,
- struct list_head *capture_list);
bool (*iop_match)(struct xfs_log_item *item, uint64_t id);
- struct xfs_log_item *(*iop_relog)(struct xfs_log_item *intent,
- struct xfs_trans *tp);
struct xfs_log_item *(*iop_intent)(struct xfs_log_item *intent_done);
};
@@ -168,6 +164,8 @@ typedef struct xfs_trans {
int xfs_trans_alloc(struct xfs_mount *mp, struct xfs_trans_res *resp,
uint blocks, uint rtextents, uint flags,
struct xfs_trans **tpp);
+int xfs_trans_reserve_more(struct xfs_trans *tp,
+ unsigned int blocks, unsigned int rtextents);
int xfs_trans_alloc_empty(struct xfs_mount *mp,
struct xfs_trans **tpp);
void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t);
@@ -247,19 +245,13 @@ void xfs_trans_buf_copy_type(struct xfs_buf *dst_bp,
extern struct kmem_cache *xfs_trans_cache;
-static inline struct xfs_log_item *
-xfs_trans_item_relog(
- struct xfs_log_item *lip,
- struct xfs_trans *tp)
-{
- return lip->li_ops->iop_relog(lip, tp);
-}
-
struct xfs_dquot;
int xfs_trans_alloc_inode(struct xfs_inode *ip, struct xfs_trans_res *resv,
unsigned int dblocks, unsigned int rblocks, bool force,
struct xfs_trans **tpp);
+int xfs_trans_reserve_more_inode(struct xfs_trans *tp, struct xfs_inode *ip,
+ unsigned int dblocks, unsigned int rblocks, bool force_quota);
int xfs_trans_alloc_icreate(struct xfs_mount *mp, struct xfs_trans_res *resv,
struct xfs_dquot *udqp, struct xfs_dquot *gdqp,
struct xfs_dquot *pdqp, unsigned int dblocks,
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index 987843f84d03..364104e1b38a 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -136,6 +136,9 @@ xfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused,
};
int error;
+ if (xfs_ifork_zapped(XFS_I(inode), XFS_ATTR_FORK))
+ return -EIO;
+
error = xfs_attr_get(&args);
if (error)
return error;
@@ -294,6 +297,9 @@ xfs_vn_listxattr(
struct inode *inode = d_inode(dentry);
int error;
+ if (xfs_ifork_zapped(XFS_I(inode), XFS_ATTR_FORK))
+ return -EIO;
+
/*
* First read the regular on-disk attributes.
*/
diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c
index b2c9b35df8f7..6ab2318a9c8e 100644
--- a/fs/zonefs/file.c
+++ b/fs/zonefs/file.c
@@ -180,7 +180,7 @@ const struct address_space_operations zonefs_file_aops = {
.invalidate_folio = iomap_invalidate_folio,
.migrate_folio = filemap_migrate_folio,
.is_partially_uptodate = iomap_is_partially_uptodate,
- .error_remove_page = generic_error_remove_page,
+ .error_remove_folio = generic_error_remove_folio,
.swap_activate = zonefs_swap_activate,
};