From 200237d6746faaeaf7f4ff4abbf13f3917cee60a Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 5 Dec 2016 12:31:31 +1100 Subject: xfs: Move AGI buffer type setting to xfs_read_agi We've missed properly setting the buffer type for an AGI transaction in 3 spots now, so just move it into xfs_read_agi() and set it if we are in a transaction to avoid the problem in the future. This is similar to how it is done in i.e. the dir3 and attr3 read functions. Signed-off-by: Eric Sandeen Reviewed-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_ialloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/xfs/libxfs') diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 51b4e0de1fdc..c482b9716347 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2450,8 +2450,6 @@ xfs_ialloc_log_agi( ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC)); #endif - xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGI_BUF); - /* * Compute byte offsets for the first and last fields in the first * region and log the agi buffer. This only logs up through @@ -2592,6 +2590,8 @@ xfs_read_agi( XFS_FSS_TO_BB(mp, 1), 0, bpp, &xfs_agi_buf_ops); if (error) return error; + if (tp) + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_AGI_BUF); xfs_buf_set_ref(*bpp, XFS_AGI_REF); return 0; -- cgit v1.2.3 From 7710517fc37b1899722707883b54694ea710b3c0 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 5 Dec 2016 12:31:50 +1100 Subject: xfs: pass state not whichfork to trace_xfs_extlist When xfs_bmap_trace_exlist called trace_xfs_extlist, it sent in the "whichfork" var instead of the bmap "state" as expected (even though state was already set up for this purpose). As a result, the xfs_bmap_class in tracing code used "whichfork" not state in xfs_iext_state_to_fork(), and got the wrong ifork pointer. It all goes downhill from there, including an ASSERT when ifp_bytes is empty by the time it reaches xfs_iext_get_ext(): XFS: Assertion failed: idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t) Signed-off-by: Eric Sandeen Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_bmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/xfs/libxfs') diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 6b7e6eb29414..e4120fcefcc8 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -531,7 +531,7 @@ xfs_bmap_trace_exlist( ifp = XFS_IFORK_PTR(ip, whichfork); ASSERT(cnt == xfs_iext_count(ifp)); for (idx = 0; idx < cnt; idx++) - trace_xfs_extlist(ip, idx, whichfork, caller_ip); + trace_xfs_extlist(ip, idx, state, caller_ip); } /* -- cgit v1.2.3 From c44a1f22626c153976289e1cd67bdcdfefc16e1f Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 5 Dec 2016 12:32:00 +1100 Subject: xfs: handle cow fork in xfs_bmap_trace_exlist By inspection, xfs_bmap_trace_exlist isn't handling cow forks, and will trace the data fork instead. Fix this by setting state appropriately if whichfork == XFS_COW_FORK. ()___() < @ @ > | | {o_o} (|) Signed-off-by: Eric Sandeen Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_bmap.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs/xfs/libxfs') diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index e4120fcefcc8..23aa70b2790c 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -518,7 +518,7 @@ void xfs_bmap_trace_exlist( xfs_inode_t *ip, /* incore inode pointer */ xfs_extnum_t cnt, /* count of entries in the list */ - int whichfork, /* data or attr fork */ + int whichfork, /* data or attr or cow fork */ unsigned long caller_ip) { xfs_extnum_t idx; /* extent record index */ @@ -527,6 +527,8 @@ xfs_bmap_trace_exlist( if (whichfork == XFS_ATTR_FORK) state |= BMAP_ATTRFORK; + else if (whichfork == XFS_COW_FORK) + state |= BMAP_COWFORK; ifp = XFS_IFORK_PTR(ip, whichfork); ASSERT(cnt == xfs_iext_count(ifp)); -- cgit v1.2.3 From f7a136aee3c1c3f7daf87197b3b3c361744a2812 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 5 Dec 2016 12:32:14 +1100 Subject: xfs: several xattr functions can be void There are a handful of xattr functions which now return nothing but zero. They can be made void, chased through calling functions, and error handling etc can be removed. Signed-off-by: Eric Sandeen Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr_leaf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/xfs/libxfs') diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index 4f2aed04f827..91f51637f8af 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -77,7 +77,7 @@ int xfs_attr3_leaf_add(struct xfs_buf *leaf_buffer, struct xfs_da_args *args); int xfs_attr3_leaf_remove(struct xfs_buf *leaf_buffer, struct xfs_da_args *args); -int xfs_attr3_leaf_list_int(struct xfs_buf *bp, +void xfs_attr3_leaf_list_int(struct xfs_buf *bp, struct xfs_attr_list_context *context); /* -- cgit v1.2.3 From d2a047f31e86941fa896e0e3271536d50aba415e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 5 Dec 2016 12:32:50 +1100 Subject: xfs: forbid AG btrees with level == 0 There is no such thing as a zero-level AG btree since even a single-node zero-records btree has one level. Btree cursor constructors read cur_nlevels straight from disk and then access things like cur_bufs[cur_nlevels - 1] which is /really/ bad if cur_nlevels is zero! Therefore, strengthen the verifiers to prevent this possibility. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_alloc.c | 10 +++++++--- fs/xfs/libxfs/xfs_ialloc.c | 9 ++++++++- 2 files changed, 15 insertions(+), 4 deletions(-) (limited to 'fs/xfs/libxfs') diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index effb64cf714f..5050056a0b06 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -2455,12 +2455,15 @@ xfs_agf_verify( be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp))) return false; - if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > XFS_BTREE_MAXLEVELS || + if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 || + be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) < 1 || + be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > XFS_BTREE_MAXLEVELS || be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) > XFS_BTREE_MAXLEVELS) return false; if (xfs_sb_version_hasrmapbt(&mp->m_sb) && - be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > XFS_BTREE_MAXLEVELS) + (be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 || + be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > XFS_BTREE_MAXLEVELS)) return false; /* @@ -2477,7 +2480,8 @@ xfs_agf_verify( return false; if (xfs_sb_version_hasreflink(&mp->m_sb) && - be32_to_cpu(agf->agf_refcount_level) > XFS_BTREE_MAXLEVELS) + (be32_to_cpu(agf->agf_refcount_level) < 1 || + be32_to_cpu(agf->agf_refcount_level) > XFS_BTREE_MAXLEVELS)) return false; return true;; diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index c482b9716347..d45c03779dae 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2510,8 +2510,15 @@ xfs_agi_verify( if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum))) return false; - if (be32_to_cpu(agi->agi_level) > XFS_BTREE_MAXLEVELS) + if (be32_to_cpu(agi->agi_level) < 1 || + be32_to_cpu(agi->agi_level) > XFS_BTREE_MAXLEVELS) return false; + + if (xfs_sb_version_hasfinobt(&mp->m_sb) && + (be32_to_cpu(agi->agi_free_level) < 1 || + be32_to_cpu(agi->agi_free_level) > XFS_BTREE_MAXLEVELS)) + return false; + /* * during growfs operations, the perag is not fully initialised, * so we can't use it for any useful checking. growfs ensures we can't -- cgit v1.2.3 From bb3be7e7c1c18e1b141d4cadeb98cc89ecf78099 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 5 Dec 2016 12:33:54 +1100 Subject: xfs: check for bogus values in btree block headers When we're reading a btree block, make sure that what we retrieved matches the owner and level; and has a plausible number of records. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_btree.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'fs/xfs/libxfs') diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 0e80993c8a59..21e6a6ab6b9a 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -1769,8 +1769,28 @@ xfs_btree_lookup_get_block( if (error) return error; + /* Check the inode owner since the verifiers don't. */ + if (xfs_sb_version_hascrc(&cur->bc_mp->m_sb) && + (cur->bc_flags & XFS_BTREE_LONG_PTRS) && + be64_to_cpu((*blkp)->bb_u.l.bb_owner) != + cur->bc_private.b.ip->i_ino) + goto out_bad; + + /* Did we get the level we were looking for? */ + if (be16_to_cpu((*blkp)->bb_level) != level) + goto out_bad; + + /* Check that internal nodes have at least one record. */ + if (level != 0 && be16_to_cpu((*blkp)->bb_numrecs) == 0) + goto out_bad; + xfs_btree_setbuf(cur, level, bp); return 0; + +out_bad: + *blkp = NULL; + xfs_trans_brelse(cur->bc_tp, bp); + return -EFSCORRUPTED; } /* -- cgit v1.2.3 From 356a3225222e5bc4df88aef3419fb6424f18ab69 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 5 Dec 2016 12:36:56 +1100 Subject: xfs: complain if we don't get nextents bmap records When reading into memory all extents of a btree-format inode fork, complain if the number of extents we find is not the same as the number of extents reported in the inode core. This is needed to stop an IO action from accessing the garbage areas of the in-core fork. [dchinner: removed redundant assert] Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_bmap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/xfs/libxfs') diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 23aa70b2790c..829ad632533b 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -1377,8 +1377,9 @@ xfs_bmap_read_extents( return error; block = XFS_BUF_TO_BLOCK(bp); } + if (i != XFS_IFORK_NEXTENTS(ip, whichfork)) + return -EFSCORRUPTED; ASSERT(i == xfs_iext_count(ifp)); - ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork)); XFS_BMAP_TRACE_EXLIST(ip, i, whichfork); return 0; error0: -- cgit v1.2.3 From 96a3aefb8ffde23180130460b0b2407b328eb727 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 5 Dec 2016 12:37:47 +1100 Subject: xfs: don't crash if reading a directory results in an unexpected hole In xfs_dir3_data_read, we can encounter the situation where err == 0 and *bpp == NULL if the given bno offset happens to be a hole; this leads to a crash if we try to set the buffer type after the _da_read_buf call. Holes can happen due to corrupt or malicious entries in the bmbt data, so be a little more careful when we're handling buffers. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_dir2_data.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/xfs/libxfs') diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index 725fc7841fde..e526f5a5f0be 100644 --- a/fs/xfs/libxfs/xfs_dir2_data.c +++ b/fs/xfs/libxfs/xfs_dir2_data.c @@ -329,7 +329,7 @@ xfs_dir3_data_read( err = xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp, XFS_DATA_FORK, &xfs_dir3_data_buf_ops); - if (!err && tp) + if (!err && tp && *bpp) xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_DATA_BUF); return err; } -- cgit v1.2.3 From 0f352f8ee8412bd9d34fb2a6411241da61175c0e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 5 Dec 2016 12:38:11 +1100 Subject: xfs: error out if trying to add attrs and anextents > 0 We shouldn't assert if somehow we end up trying to add an attr fork to an inode that apparently already has attr extents because this is an indication of on-disk corruption. Instead, return an error code to userspace. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_bmap.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs/xfs/libxfs') diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 829ad632533b..29ffc0569ce1 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -1153,6 +1153,10 @@ xfs_bmap_add_attrfork( goto trans_cancel; if (XFS_IFORK_Q(ip)) goto trans_cancel; + if (ip->i_d.di_anextents != 0) { + error = -EFSCORRUPTED; + goto trans_cancel; + } if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) { /* * For inodes coming from pre-6.2 filesystems. @@ -1160,7 +1164,6 @@ xfs_bmap_add_attrfork( ASSERT(ip->i_d.di_aformat == 0); ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; } - ASSERT(ip->i_d.di_anextents == 0); xfs_trans_ijoin(tp, ip, 0); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); -- cgit v1.2.3 From ef388e2054feedaeb05399ed654bdb06f385d294 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 5 Dec 2016 12:38:38 +1100 Subject: xfs: don't allow di_size with high bit set The on-disk field di_size is used to set i_size, which is a signed integer of loff_t. If the high bit of di_size is set, we'll end up with a negative i_size, which will cause all sorts of problems. Since the VFS won't let us create a file with such length, we should catch them here in the verifier too. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_inode_buf.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs/xfs/libxfs') diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 134424fac434..c906e50515f0 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -392,6 +392,14 @@ xfs_dinode_verify( if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC)) return false; + /* don't allow invalid i_size */ + if (be64_to_cpu(dip->di_size) & (1ULL << 63)) + return false; + + /* No zero-length symlinks. */ + if (S_ISLNK(be16_to_cpu(dip->di_mode)) && dip->di_size == 0) + return false; + /* only version 3 or greater inodes are extensively verified here */ if (dip->di_version < 3) return true; -- cgit v1.2.3 From 11ef38afe98cc7ad1a46ef24945232ec1760d5e2 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 5 Dec 2016 14:38:58 +1100 Subject: xfs: make xfs btree stats less huge Embedding a switch statement in every btree stats inc/add adds a lot of code overhead to the core btree infrastructure paths. Stats are supposed to be small and lightweight, but the btree stats have become big and bloated as we've added more btrees. It needs fixing because the reflink code will just add more overhead again. Convert the v2 btree stats to arrays instead of independent variables, and instead use the type to index the specific btree array via an enum. This allows us to use array based indexing to update the stats, rather than having to derefence variables specific to the btree type. If we then wrap the xfsstats structure in a union and place uint32_t array beside it, and calculate the correct btree stats array base array index when creating a btree cursor, we can easily access entries in the stats structure without having to switch names based on the btree type. We then replace with the switch statement with a simple set of stats wrapper macros, resulting in a significant simplification of the btree stats code, and: text data bss dec hex filename 48905 144 8 49057 bfa1 fs/xfs/libxfs/xfs_btree.o.old 36793 144 8 36945 9051 fs/xfs/libxfs/xfs_btree.o it reduces the core btree infrastructure code size by close to 25%! Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_alloc_btree.c | 4 ++++ fs/xfs/libxfs/xfs_bmap_btree.c | 1 + fs/xfs/libxfs/xfs_btree.h | 43 ++++---------------------------------- fs/xfs/libxfs/xfs_ialloc_btree.c | 2 ++ fs/xfs/libxfs/xfs_refcount_btree.c | 1 + fs/xfs/libxfs/xfs_rmap_btree.c | 1 + 6 files changed, 13 insertions(+), 39 deletions(-) (limited to 'fs/xfs/libxfs') diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 5ba2dac5e67c..44cfcd03c451 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -428,6 +428,10 @@ xfs_allocbt_init_cursor( cur->bc_btnum = btnum; cur->bc_blocklog = mp->m_sb.sb_blocklog; cur->bc_ops = &xfs_allocbt_ops; + if (btnum == XFS_BTNUM_BNO) + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2); + else + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtc_2); if (btnum == XFS_BTNUM_CNT) { cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]); diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 8007d2ba9aef..94ad31d372ab 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -803,6 +803,7 @@ xfs_bmbt_init_cursor( cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1; cur->bc_btnum = XFS_BTNUM_BMAP; cur->bc_blocklog = mp->m_sb.sb_blocklog; + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_bmbt_2); cur->bc_ops = &xfs_bmbt_ops; cur->bc_flags = XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE; diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index c2b01d1c79ee..b69b947c4c1b 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -96,46 +96,10 @@ union xfs_btree_rec { /* * Generic stats interface */ -#define __XFS_BTREE_STATS_INC(mp, type, stat) \ - XFS_STATS_INC(mp, xs_ ## type ## _2_ ## stat) #define XFS_BTREE_STATS_INC(cur, stat) \ -do { \ - struct xfs_mount *__mp = cur->bc_mp; \ - switch (cur->bc_btnum) { \ - case XFS_BTNUM_BNO: __XFS_BTREE_STATS_INC(__mp, abtb, stat); break; \ - case XFS_BTNUM_CNT: __XFS_BTREE_STATS_INC(__mp, abtc, stat); break; \ - case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_INC(__mp, bmbt, stat); break; \ - case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(__mp, ibt, stat); break; \ - case XFS_BTNUM_FINO: __XFS_BTREE_STATS_INC(__mp, fibt, stat); break; \ - case XFS_BTNUM_RMAP: __XFS_BTREE_STATS_INC(__mp, rmap, stat); break; \ - case XFS_BTNUM_REFC: __XFS_BTREE_STATS_INC(__mp, refcbt, stat); break; \ - case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \ - } \ -} while (0) - -#define __XFS_BTREE_STATS_ADD(mp, type, stat, val) \ - XFS_STATS_ADD(mp, xs_ ## type ## _2_ ## stat, val) -#define XFS_BTREE_STATS_ADD(cur, stat, val) \ -do { \ - struct xfs_mount *__mp = cur->bc_mp; \ - switch (cur->bc_btnum) { \ - case XFS_BTNUM_BNO: \ - __XFS_BTREE_STATS_ADD(__mp, abtb, stat, val); break; \ - case XFS_BTNUM_CNT: \ - __XFS_BTREE_STATS_ADD(__mp, abtc, stat, val); break; \ - case XFS_BTNUM_BMAP: \ - __XFS_BTREE_STATS_ADD(__mp, bmbt, stat, val); break; \ - case XFS_BTNUM_INO: \ - __XFS_BTREE_STATS_ADD(__mp, ibt, stat, val); break; \ - case XFS_BTNUM_FINO: \ - __XFS_BTREE_STATS_ADD(__mp, fibt, stat, val); break; \ - case XFS_BTNUM_RMAP: \ - __XFS_BTREE_STATS_ADD(__mp, rmap, stat, val); break; \ - case XFS_BTNUM_REFC: \ - __XFS_BTREE_STATS_ADD(__mp, refcbt, stat, val); break; \ - case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \ - } \ -} while (0) + XFS_STATS_INC_OFF((cur)->bc_mp, (cur)->bc_statoff + __XBTS_ ## stat) +#define XFS_BTREE_STATS_ADD(cur, stat, val) \ + XFS_STATS_ADD_OFF((cur)->bc_mp, (cur)->bc_statoff + __XBTS_ ## stat, val) #define XFS_BTREE_MAXLEVELS 9 /* max of all btrees */ @@ -253,6 +217,7 @@ typedef struct xfs_btree_cur __uint8_t bc_nlevels; /* number of levels in the tree */ __uint8_t bc_blocklog; /* log2(blocksize) of btree blocks */ xfs_btnum_t bc_btnum; /* identifies which btree type */ + int bc_statoff; /* offset of btre stats array */ union { struct { /* needed for BNO, CNT, INO */ struct xfs_buf *agbp; /* agf/agi buffer pointer */ diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index eab68ae2e011..e7ff8ef0e5a7 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -365,9 +365,11 @@ xfs_inobt_init_cursor( if (btnum == XFS_BTNUM_INO) { cur->bc_nlevels = be32_to_cpu(agi->agi_level); cur->bc_ops = &xfs_inobt_ops; + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_ibt_2); } else { cur->bc_nlevels = be32_to_cpu(agi->agi_free_level); cur->bc_ops = &xfs_finobt_ops; + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_fibt_2); } cur->bc_blocklog = mp->m_sb.sb_blocklog; diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 453bb2757ec2..6fb2215f8ff7 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -354,6 +354,7 @@ xfs_refcountbt_init_cursor( cur->bc_btnum = XFS_BTNUM_REFC; cur->bc_blocklog = mp->m_sb.sb_blocklog; cur->bc_ops = &xfs_refcountbt_ops; + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_refcbt_2); cur->bc_nlevels = be32_to_cpu(agf->agf_refcount_level); diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index 83e672ff7577..de25771764ba 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -484,6 +484,7 @@ xfs_rmapbt_init_cursor( cur->bc_blocklog = mp->m_sb.sb_blocklog; cur->bc_ops = &xfs_rmapbt_ops; cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]); + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2); cur->bc_private.a.agbp = agbp; cur->bc_private.a.agno = agno; -- cgit v1.2.3 From cae028df53449905c944603df624ac94bc619661 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 5 Dec 2016 14:40:32 +1100 Subject: xfs: optimise CRC updates Nick Piggin reported that the CRC overhead in an fsync heavy workload was higher than expected on a Power8 machine. Part of this was to do with the fact that the power8 CRC implementation is not efficient for CRC lengths of less than 512 bytes, and so the way we split the CRCs over the CRC field means a lot of the CRCs are reduced to being less than than optimal size. To optimise this, change the CRC update mechanism to zero the CRC field first, and then compute the CRC in one pass over the buffer and write the result back into the buffer. We can do this safely because anything writing a CRC has exclusive access to the buffer the CRC is being calculated over. We leave the CRC verify code the same - it still splits the CRC calculation - because we do not want read-only operations modifying the underlying buffer. This is because read-only operations may not have an exclusive access to the buffer guaranteed, and so temporary modifications could leak out to to other processes accessing the buffer concurrently. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_cksum.h | 26 ++++++++++++++++++++++---- fs/xfs/libxfs/xfs_inode_buf.c | 2 +- 2 files changed, 23 insertions(+), 5 deletions(-) (limited to 'fs/xfs/libxfs') diff --git a/fs/xfs/libxfs/xfs_cksum.h b/fs/xfs/libxfs/xfs_cksum.h index fad1676ad8cd..a416c7cb23ea 100644 --- a/fs/xfs/libxfs/xfs_cksum.h +++ b/fs/xfs/libxfs/xfs_cksum.h @@ -6,10 +6,11 @@ /* * Calculate the intermediate checksum for a buffer that has the CRC field * inside it. The offset of the 32bit crc fields is passed as the - * cksum_offset parameter. + * cksum_offset parameter. We do not modify the buffer during verification, + * hence we have to split the CRC calculation across the cksum_offset. */ static inline __uint32_t -xfs_start_cksum(char *buffer, size_t length, unsigned long cksum_offset) +xfs_start_cksum_safe(char *buffer, size_t length, unsigned long cksum_offset) { __uint32_t zero = 0; __uint32_t crc; @@ -25,6 +26,20 @@ xfs_start_cksum(char *buffer, size_t length, unsigned long cksum_offset) length - (cksum_offset + sizeof(__be32))); } +/* + * Fast CRC method where the buffer is modified. Callers must have exclusive + * access to the buffer while the calculation takes place. + */ +static inline __uint32_t +xfs_start_cksum_update(char *buffer, size_t length, unsigned long cksum_offset) +{ + /* zero the CRC field */ + *(__le32 *)(buffer + cksum_offset) = 0; + + /* single pass CRC calculation for the entire buffer */ + return crc32c(XFS_CRC_SEED, buffer, length); +} + /* * Convert the intermediate checksum to the final ondisk format. * @@ -40,11 +55,14 @@ xfs_end_cksum(__uint32_t crc) /* * Helper to generate the checksum for a buffer. + * + * This modifies the buffer temporarily - callers must have exclusive + * access to the buffer while the calculation takes place. */ static inline void xfs_update_cksum(char *buffer, size_t length, unsigned long cksum_offset) { - __uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset); + __uint32_t crc = xfs_start_cksum_update(buffer, length, cksum_offset); *(__le32 *)(buffer + cksum_offset) = xfs_end_cksum(crc); } @@ -55,7 +73,7 @@ xfs_update_cksum(char *buffer, size_t length, unsigned long cksum_offset) static inline int xfs_verify_cksum(char *buffer, size_t length, unsigned long cksum_offset) { - __uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset); + __uint32_t crc = xfs_start_cksum_safe(buffer, length, cksum_offset); return *(__le32 *)(buffer + cksum_offset) == xfs_end_cksum(crc); } diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index c906e50515f0..9004665d679a 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -444,7 +444,7 @@ xfs_dinode_calc_crc( return; ASSERT(xfs_sb_version_hascrc(&mp->m_sb)); - crc = xfs_start_cksum((char *)dip, mp->m_sb.sb_inodesize, + crc = xfs_start_cksum_update((char *)dip, mp->m_sb.sb_inodesize, XFS_DINODE_CRC_OFF); dip->di_crc = xfs_end_cksum(crc); } -- cgit v1.2.3