aboutsummaryrefslogtreecommitdiff
path: root/fs/btrfs/inode.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/inode.c')
-rw-r--r-- fs/btrfs/inode.c 1301
1 files changed, 1009 insertions, 292 deletions
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3b2403b6127f..aa0a60ee26cb 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -66,6 +66,11 @@ struct btrfs_dio_data {
struct extent_changeset *data_reserved;
};
+struct btrfs_rename_ctx {
+ /* Output field. Stores the index number of the old directory entry. */
+ u64 index;
+};
+
static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
@@ -234,12 +239,14 @@ static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
* no overlapping inline items exist in the btree
*/
static int insert_inline_extent(struct btrfs_trans_handle *trans,
- struct btrfs_path *path, bool extent_inserted,
- struct btrfs_root *root, struct inode *inode,
- u64 start, size_t size, size_t compressed_size,
+ struct btrfs_path *path,
+ struct btrfs_inode *inode, bool extent_inserted,
+ size_t size, size_t compressed_size,
int compress_type,
- struct page **compressed_pages)
+ struct page **compressed_pages,
+ bool update_i_size)
{
+ struct btrfs_root *root = inode->root;
struct extent_buffer *leaf;
struct page *page = NULL;
char *kaddr;
@@ -247,7 +254,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
struct btrfs_file_extent_item *ei;
int ret;
size_t cur_size = size;
- unsigned long offset;
+ u64 i_size;
ASSERT((compressed_size > 0 && compressed_pages) ||
(compressed_size == 0 && !compressed_pages));
@@ -259,8 +266,8 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
struct btrfs_key key;
size_t datasize;
- key.objectid = btrfs_ino(BTRFS_I(inode));
- key.offset = start;
+ key.objectid = btrfs_ino(inode);
+ key.offset = 0;
key.type = BTRFS_EXTENT_DATA_KEY;
datasize = btrfs_file_extent_calc_inline_size(cur_size);
@@ -298,12 +305,10 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
btrfs_set_file_extent_compression(leaf, ei,
compress_type);
} else {
- page = find_get_page(inode->i_mapping,
- start >> PAGE_SHIFT);
+ page = find_get_page(inode->vfs_inode.i_mapping, 0);
btrfs_set_file_extent_compression(leaf, ei, 0);
kaddr = kmap_atomic(page);
- offset = offset_in_page(start);
- write_extent_buffer(leaf, kaddr + offset, ptr, size);
+ write_extent_buffer(leaf, kaddr, ptr, size);
kunmap_atomic(kaddr);
put_page(page);
}
@@ -314,21 +319,25 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
* We align size to sectorsize for inline extents just for simplicity
* sake.
*/
- size = ALIGN(size, root->fs_info->sectorsize);
- ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start, size);
+ ret = btrfs_inode_set_file_extent_range(inode, 0,
+ ALIGN(size, root->fs_info->sectorsize));
if (ret)
goto fail;
/*
- * we're an inline extent, so nobody can
- * extend the file past i_size without locking
- * a page we already have locked.
+ * We're an inline extent, so nobody can extend the file past i_size
+ * without locking a page we already have locked.
*
- * We must do any isize and inode updates
- * before we unlock the pages. Otherwise we
- * could end up racing with unlink.
+ * We must do any i_size and inode updates before we unlock the pages.
+ * Otherwise we could end up racing with unlink.
*/
- BTRFS_I(inode)->disk_i_size = inode->i_size;
+ i_size = i_size_read(&inode->vfs_inode);
+ if (update_i_size && size > i_size) {
+ i_size_write(&inode->vfs_inode, size);
+ i_size = size;
+ }
+ inode->disk_i_size = i_size;
+
fail:
return ret;
}
@@ -339,35 +348,31 @@ fail:
* does the checks required to make sure the data is small enough
* to fit as an inline extent.
*/
-static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start,
- u64 end, size_t compressed_size,
+static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size,
+ size_t compressed_size,
int compress_type,
- struct page **compressed_pages)
+ struct page **compressed_pages,
+ bool update_i_size)
{
struct btrfs_drop_extents_args drop_args = { 0 };
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_trans_handle *trans;
- u64 isize = i_size_read(&inode->vfs_inode);
- u64 actual_end = min(end + 1, isize);
- u64 inline_len = actual_end - start;
- u64 aligned_end = ALIGN(end, fs_info->sectorsize);
- u64 data_len = inline_len;
+ u64 data_len = (compressed_size ?: size);
int ret;
struct btrfs_path *path;
- if (compressed_size)
- data_len = compressed_size;
-
- if (start > 0 ||
- actual_end > fs_info->sectorsize ||
+ /*
+ * We can create an inline extent if it ends at or beyond the current
+ * i_size, is no larger than a sector (decompressed), and the (possibly
+ * compressed) data fits in a leaf and the configured maximum inline
+ * size.
+ */
+ if (size < i_size_read(&inode->vfs_inode) ||
+ size > fs_info->sectorsize ||
data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
- (!compressed_size &&
- (actual_end & (fs_info->sectorsize - 1)) == 0) ||
- end + 1 < isize ||
- data_len > fs_info->max_inline) {
+ data_len > fs_info->max_inline)
return 1;
- }
path = btrfs_alloc_path();
if (!path)
@@ -381,30 +386,20 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start,
trans->block_rsv = &inode->block_rsv;
drop_args.path = path;
- drop_args.start = start;
- drop_args.end = aligned_end;
+ drop_args.start = 0;
+ drop_args.end = fs_info->sectorsize;
drop_args.drop_cache = true;
drop_args.replace_extent = true;
-
- if (compressed_size && compressed_pages)
- drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(
- compressed_size);
- else
- drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(
- inline_len);
-
+ drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(data_len);
ret = btrfs_drop_extents(trans, root, inode, &drop_args);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
- if (isize > actual_end)
- inline_len = min_t(u64, isize, actual_end);
- ret = insert_inline_extent(trans, path, drop_args.extent_inserted,
- root, &inode->vfs_inode, start,
- inline_len, compressed_size,
- compress_type, compressed_pages);
+ ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted,
+ size, compressed_size, compress_type,
+ compressed_pages, update_i_size);
if (ret && ret != -ENOSPC) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -413,7 +408,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start,
goto out;
}
- btrfs_update_inode_bytes(inode, inline_len, drop_args.bytes_found);
+ btrfs_update_inode_bytes(inode, size, drop_args.bytes_found);
ret = btrfs_update_inode(trans, root, inode);
if (ret && ret != -ENOSPC) {
btrfs_abort_transaction(trans, ret);
@@ -423,7 +418,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start,
goto out;
}
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
+ btrfs_set_inode_full_sync(inode);
out:
/*
* Don't forget to free the reserved space, as for inlined extent
@@ -560,12 +555,12 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
}
static inline void inode_should_defrag(struct btrfs_inode *inode,
- u64 start, u64 end, u64 num_bytes, u64 small_write)
+ u64 start, u64 end, u64 num_bytes, u32 small_write)
{
/* If this is a small write inside eof, kick off a defrag */
if (num_bytes < small_write &&
(start > 0 || end + 1 < inode->disk_i_size))
- btrfs_add_inode_defrag(NULL, inode);
+ btrfs_add_inode_defrag(NULL, inode, small_write);
}
/*
@@ -624,7 +619,6 @@ static noinline int compress_file_range(struct async_chunk *async_chunk)
again:
will_compress = 0;
nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
- BUILD_BUG_ON((BTRFS_MAX_COMPRESSED % PAGE_SIZE) != 0);
nr_pages = min_t(unsigned long, nr_pages,
BTRFS_MAX_COMPRESSED / PAGE_SIZE);
@@ -735,14 +729,15 @@ cont:
/* we didn't compress the entire range, try
* to make an uncompressed inline extent.
*/
- ret = cow_file_range_inline(BTRFS_I(inode), start, end,
+ ret = cow_file_range_inline(BTRFS_I(inode), actual_end,
0, BTRFS_COMPRESS_NONE,
- NULL);
+ NULL, false);
} else {
/* try making a compressed inline extent */
- ret = cow_file_range_inline(BTRFS_I(inode), start, end,
+ ret = cow_file_range_inline(BTRFS_I(inode), actual_end,
total_compressed,
- compress_type, pages);
+ compress_type, pages,
+ false);
}
if (ret <= 0) {
unsigned long clear_flags = EXTENT_DELALLOC |
@@ -981,11 +976,14 @@ static int submit_one_async_extent(struct btrfs_inode *inode,
}
free_extent_map(em);
- ret = btrfs_add_ordered_extent_compress(inode, start, /* file_offset */
- ins.objectid, /* disk_bytenr */
- async_extent->ram_size, /* num_bytes */
- ins.offset, /* disk_num_bytes */
- async_extent->compress_type);
+ ret = btrfs_add_ordered_extent(inode, start, /* file_offset */
+ async_extent->ram_size, /* num_bytes */
+ async_extent->ram_size, /* ram_bytes */
+ ins.objectid, /* disk_bytenr */
+ ins.offset, /* disk_num_bytes */
+ 0, /* offset */
+ 1 << BTRFS_ORDERED_COMPRESSED,
+ async_extent->compress_type);
if (ret) {
btrfs_drop_extent_cache(inode, start, end, 0);
goto out_free_reserve;
@@ -1003,7 +1001,7 @@ static int submit_one_async_extent(struct btrfs_inode *inode,
async_extent->pages, /* compressed_pages */
async_extent->nr_pages,
async_chunk->write_flags,
- async_chunk->blkcg_css)) {
+ async_chunk->blkcg_css, true)) {
const u64 start = async_extent->start;
const u64 end = start + async_extent->ram_size - 1;
@@ -1152,9 +1150,12 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
* So here we skip inline extent creation completely.
*/
if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
+ u64 actual_end = min_t(u64, i_size_read(&inode->vfs_inode),
+ end + 1);
+
/* lets try to make an inline extent */
- ret = cow_file_range_inline(inode, start, end, 0,
- BTRFS_COMPRESS_NONE, NULL);
+ ret = cow_file_range_inline(inode, actual_end, 0,
+ BTRFS_COMPRESS_NONE, NULL, false);
if (ret == 0) {
/*
* We use DO_ACCOUNTING here because we need the
@@ -1234,9 +1235,10 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
}
free_extent_map(em);
- ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
- ram_size, cur_alloc_size,
- BTRFS_ORDERED_REGULAR);
+ ret = btrfs_add_ordered_extent(inode, start, ram_size, ram_size,
+ ins.objectid, cur_alloc_size, 0,
+ 1 << BTRFS_ORDERED_REGULAR,
+ BTRFS_COMPRESS_NONE);
if (ret)
goto out_drop_extent_cache;
@@ -1895,10 +1897,11 @@ out_check:
goto error;
}
free_extent_map(em);
- ret = btrfs_add_ordered_extent(inode, cur_offset,
- disk_bytenr, num_bytes,
- num_bytes,
- BTRFS_ORDERED_PREALLOC);
+ ret = btrfs_add_ordered_extent(inode,
+ cur_offset, num_bytes, num_bytes,
+ disk_bytenr, num_bytes, 0,
+ 1 << BTRFS_ORDERED_PREALLOC,
+ BTRFS_COMPRESS_NONE);
if (ret) {
btrfs_drop_extent_cache(inode, cur_offset,
cur_offset + num_bytes - 1,
@@ -1907,9 +1910,11 @@ out_check:
}
} else {
ret = btrfs_add_ordered_extent(inode, cur_offset,
+ num_bytes, num_bytes,
disk_bytenr, num_bytes,
- num_bytes,
- BTRFS_ORDERED_NOCOW);
+ 0,
+ 1 << BTRFS_ORDERED_NOCOW,
+ BTRFS_COMPRESS_NONE);
if (ret)
goto error;
}
@@ -2310,7 +2315,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
static blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio,
u64 dio_file_offset)
{
- return btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0);
+ return btrfs_csum_one_bio(BTRFS_I(inode), bio, (u64)-1, false);
}
/*
@@ -2538,10 +2543,15 @@ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
goto out;
if (bio_flags & EXTENT_BIO_COMPRESSED) {
+ /*
+ * btrfs_submit_compressed_read will handle completing
+ * the bio if there were any errors, so just return
+ * here.
+ */
ret = btrfs_submit_compressed_read(inode, bio,
mirror_num,
bio_flags);
- goto out;
+ goto out_no_endio;
} else {
/*
* Lookup bio sums does extra checks around whether we
@@ -2562,7 +2572,7 @@ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
0, btrfs_submit_bio_start);
goto out;
} else if (!skip_sum) {
- ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0);
+ ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, (u64)-1, false);
if (ret)
goto out;
}
@@ -2575,6 +2585,7 @@ out:
bio->bi_status = ret;
bio_endio(bio);
}
+out_no_endio:
return ret;
}
@@ -2870,6 +2881,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_key ins;
u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi);
u64 disk_bytenr = btrfs_stack_file_extent_disk_bytenr(stack_fi);
+ u64 offset = btrfs_stack_file_extent_offset(stack_fi);
u64 num_bytes = btrfs_stack_file_extent_num_bytes(stack_fi);
u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(stack_fi);
struct btrfs_drop_extents_args drop_args = { 0 };
@@ -2944,7 +2956,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
goto out;
ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode),
- file_pos, qgroup_reserved, &ins);
+ file_pos - offset,
+ qgroup_reserved, &ins);
out:
btrfs_free_path(path);
@@ -2970,20 +2983,20 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_ordered_extent *oe)
{
struct btrfs_file_extent_item stack_fi;
- u64 logical_len;
bool update_inode_bytes;
+ u64 num_bytes = oe->num_bytes;
+ u64 ram_bytes = oe->ram_bytes;
memset(&stack_fi, 0, sizeof(stack_fi));
btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG);
btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, oe->disk_bytenr);
btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi,
oe->disk_num_bytes);
+ btrfs_set_stack_file_extent_offset(&stack_fi, oe->offset);
if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags))
- logical_len = oe->truncated_len;
- else
- logical_len = oe->num_bytes;
- btrfs_set_stack_file_extent_num_bytes(&stack_fi, logical_len);
- btrfs_set_stack_file_extent_ram_bytes(&stack_fi, logical_len);
+ num_bytes = ram_bytes = oe->truncated_len;
+ btrfs_set_stack_file_extent_num_bytes(&stack_fi, num_bytes);
+ btrfs_set_stack_file_extent_ram_bytes(&stack_fi, ram_bytes);
btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type);
/* Encryption and other encoding is reserved and all 0 */
@@ -2994,6 +3007,7 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
* except if the ordered extent was truncated.
*/
update_inode_bytes = test_bit(BTRFS_ORDERED_DIRECT, &oe->flags) ||
+ test_bit(BTRFS_ORDERED_ENCODED, &oe->flags) ||
test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags);
return insert_reserved_file_extent(trans, BTRFS_I(oe->inode),
@@ -3028,7 +3042,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
!test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
- !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags))
+ !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags) &&
+ !test_bit(BTRFS_ORDERED_ENCODED, &ordered_extent->flags))
clear_bits |= EXTENT_DELALLOC_NEW;
freespace_inode = btrfs_is_free_space_inode(inode);
@@ -4062,7 +4077,8 @@ int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
struct btrfs_inode *dir,
struct btrfs_inode *inode,
- const char *name, int name_len)
+ const char *name, int name_len,
+ struct btrfs_rename_ctx *rename_ctx)
{
struct btrfs_root *root = dir->root;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -4118,15 +4134,27 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
goto err;
}
skip_backref:
+ if (rename_ctx)
+ rename_ctx->index = index;
+
ret = btrfs_delete_delayed_dir_index(trans, dir, index);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto err;
}
- btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode,
- dir_ino);
- btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, index);
+ /*
+ * If we are in a rename context, we don't need to update anything in the
+ * log. That will be done later during the rename by btrfs_log_new_name().
+ * Besides that, doing it here would only cause extra unnecessary btree
+ * operations on the log tree, increasing latency for applications.
+ */
+ if (!rename_ctx) {
+ btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode,
+ dir_ino);
+ btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir,
+ index);
+ }
/*
* If we have a pending delayed iput we could end up with the final iput
@@ -4158,7 +4186,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
const char *name, int name_len)
{
int ret;
- ret = __btrfs_unlink_inode(trans, dir, inode, name, name_len);
+ ret = __btrfs_unlink_inode(trans, dir, inode, name, name_len, NULL);
if (!ret) {
drop_nlink(&inode->vfs_inode);
ret = btrfs_update_inode(trans, inode->root, inode);
@@ -4565,14 +4593,21 @@ out_up_write:
static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
{
struct inode *inode = d_inode(dentry);
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
int err = 0;
struct btrfs_trans_handle *trans;
u64 last_unlink_trans;
if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
return -ENOTEMPTY;
- if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID)
+ if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID) {
+ if (unlikely(btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))) {
+ btrfs_err(fs_info,
+ "extent tree v2 doesn't support snapshot deletion yet");
+ return -EOPNOTSUPP;
+ }
return btrfs_delete_subvolume(dir, dentry);
+ }
trans = __unlink_start_trans(dir);
if (IS_ERR(trans))
@@ -4611,7 +4646,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
}
out:
btrfs_end_transaction(trans);
- btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info);
+ btrfs_btree_balance_dirty(fs_info);
return err;
}
@@ -4664,7 +4699,7 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
goto out;
}
}
- ret = btrfs_delalloc_reserve_metadata(inode, blocksize);
+ ret = btrfs_delalloc_reserve_metadata(inode, blocksize, blocksize);
if (ret < 0) {
if (!only_release_metadata)
btrfs_free_reserved_data_space(inode, data_reserved,
@@ -4876,8 +4911,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
cur_offset + hole_size - 1, 0);
hole_em = alloc_extent_map();
if (!hole_em) {
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &inode->runtime_flags);
+ btrfs_set_inode_full_sync(inode);
goto next;
}
hole_em->start = cur_offset;
@@ -5046,16 +5080,17 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr
}
/*
- * While truncating the inode pages during eviction, we get the VFS calling
- * btrfs_invalidatepage() against each page of the inode. This is slow because
- * the calls to btrfs_invalidatepage() result in a huge amount of calls to
- * lock_extent_bits() and clear_extent_bit(), which keep merging and splitting
- * extent_state structures over and over, wasting lots of time.
+ * While truncating the inode pages during eviction, we get the VFS
+ * calling btrfs_invalidate_folio() against each folio of the inode. This
+ * is slow because the calls to btrfs_invalidate_folio() result in a
+ * huge amount of calls to lock_extent_bits() and clear_extent_bit(),
+ * which keep merging and splitting extent_state structures over and over,
+ * wasting lots of time.
*
- * Therefore if the inode is being evicted, let btrfs_invalidatepage() skip all
- * those expensive operations on a per page basis and do only the ordered io
- * finishing, while we release here the extent_map and extent_state structures,
- * without the excessive merging and splitting.
+ * Therefore if the inode is being evicted, let btrfs_invalidate_folio()
+ * skip all those expensive operations on a per folio basis and do only
+ * the ordered io finishing, while we release here the extent_map and
+ * extent_state structures, without the excessive merging and splitting.
*/
static void evict_inode_truncate_pages(struct inode *inode)
{
@@ -5121,7 +5156,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
* If still has DELALLOC flag, the extent didn't reach disk,
* and its reserved space won't be freed by delayed_ref.
* So we need to free its reserved space here.
- * (Refer to comment in btrfs_invalidatepage, case 2)
+ * (Refer to comment in btrfs_invalidate_folio, case 2)
*
* Note, end is the bytenr of last byte, so we need + 1 here.
*/
@@ -5584,21 +5619,17 @@ static struct inode *new_simple_dir(struct super_block *s,
return inode;
}
+static_assert(BTRFS_FT_UNKNOWN == FT_UNKNOWN);
+static_assert(BTRFS_FT_REG_FILE == FT_REG_FILE);
+static_assert(BTRFS_FT_DIR == FT_DIR);
+static_assert(BTRFS_FT_CHRDEV == FT_CHRDEV);
+static_assert(BTRFS_FT_BLKDEV == FT_BLKDEV);
+static_assert(BTRFS_FT_FIFO == FT_FIFO);
+static_assert(BTRFS_FT_SOCK == FT_SOCK);
+static_assert(BTRFS_FT_SYMLINK == FT_SYMLINK);
+
static inline u8 btrfs_inode_type(struct inode *inode)
{
- /*
- * Compile-time asserts that generic FT_* types still match
- * BTRFS_FT_* types
- */
- BUILD_BUG_ON(BTRFS_FT_UNKNOWN != FT_UNKNOWN);
- BUILD_BUG_ON(BTRFS_FT_REG_FILE != FT_REG_FILE);
- BUILD_BUG_ON(BTRFS_FT_DIR != FT_DIR);
- BUILD_BUG_ON(BTRFS_FT_CHRDEV != FT_CHRDEV);
- BUILD_BUG_ON(BTRFS_FT_BLKDEV != FT_BLKDEV);
- BUILD_BUG_ON(BTRFS_FT_FIFO != FT_FIFO);
- BUILD_BUG_ON(BTRFS_FT_SOCK != FT_SOCK);
- BUILD_BUG_ON(BTRFS_FT_SYMLINK != FT_SYMLINK);
-
return fs_umode_to_ftype(inode->i_mode);
}
@@ -5971,14 +6002,8 @@ static int btrfs_set_inode_index_count(struct btrfs_inode *inode)
goto out;
ret = 0;
- /*
- * MAGIC NUMBER EXPLANATION:
- * since we search a directory based on f_pos we have to start at 2
- * since '.' and '..' have f_pos of 0 and 1 respectively, so everybody
- * else has to start at 2
- */
if (path->slots[0] == 0) {
- inode->index_cnt = 2;
+ inode->index_cnt = BTRFS_DIR_START_INDEX;
goto out;
}
@@ -5989,7 +6014,7 @@ static int btrfs_set_inode_index_count(struct btrfs_inode *inode)
if (found_key.objectid != btrfs_ino(inode) ||
found_key.type != BTRFS_DIR_INDEX_KEY) {
- inode->index_cnt = 2;
+ inode->index_cnt = BTRFS_DIR_START_INDEX;
goto out;
}
@@ -6140,7 +6165,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
* sync since it will be a full sync anyway and this will blow away the
* old info in the log.
*/
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
+ btrfs_set_inode_full_sync(BTRFS_I(inode));
key[0].objectid = objectid;
key[0].type = BTRFS_INODE_ITEM_KEY;
@@ -6537,7 +6562,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
goto fail;
}
d_instantiate(dentry, inode);
- btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent);
+ btrfs_log_new_name(trans, old_dentry, NULL, 0, parent);
}
fail:
@@ -7040,8 +7065,11 @@ static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
if (IS_ERR(em))
goto out;
}
- ret = btrfs_add_ordered_extent_dio(inode, start, block_start, len,
- block_len, type);
+ ret = btrfs_add_ordered_extent(inode, start, len, len, block_start,
+ block_len, 0,
+ (1 << type) |
+ (1 << BTRFS_ORDERED_DIRECT),
+ BTRFS_COMPRESS_NONE);
if (ret) {
if (em) {
free_extent_map(em);
@@ -7441,7 +7469,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
struct extent_map *em2;
/* We can NOCOW, so only need to reserve metadata space. */
- ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len);
+ ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len);
if (ret < 0) {
/* Our caller expects us to free the input extent map. */
free_extent_map(em);
@@ -7600,6 +7628,34 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
}
len = min(len, em->len - (start - em->start));
+
+ /*
+ * If we have a NOWAIT request and the range contains multiple extents
+ * (or a mix of extents and holes), then we return -EAGAIN to make the
+ * caller fallback to a context where it can do a blocking (without
+ * NOWAIT) request. This way we avoid doing partial IO and returning
+ * success to the caller, which is not optimal for writes and for reads
+ * it can result in unexpected behaviour for an application.
+ *
+ * When doing a read, because we use IOMAP_DIO_PARTIAL when calling
+ * iomap_dio_rw(), we can end up returning less data than what the caller
+ * asked for, resulting in an unexpected, and incorrect, short read.
+ * That is, the caller asked to read N bytes and we return less than that,
+ * which is wrong unless we are crossing EOF. This happens if we get a
+ * page fault error when trying to fault in pages for the buffer that is
+ * associated with the struct iov_iter passed to iomap_dio_rw(), and we
+ * have previously submitted bios for other extents in the range, in
+ * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of
+ * those bios have completed by the time we get the page fault error,
+ * which we return back to our caller - we should only return EIOCBQUEUED
+ * after we have submitted bios for all the extents in the range.
+ */
+ if ((flags & IOMAP_NOWAIT) && len < length) {
+ free_extent_map(em);
+ ret = -EAGAIN;
+ goto unlock_err;
+ }
+
if (write) {
ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
start, len);
@@ -7803,7 +7859,7 @@ static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode,
struct bio *bio,
u64 dio_file_offset)
{
- return btrfs_csum_one_bio(BTRFS_I(inode), bio, dio_file_offset, 1);
+ return btrfs_csum_one_bio(BTRFS_I(inode), bio, dio_file_offset, false);
}
static void btrfs_end_dio_bio(struct bio *bio)
@@ -7860,7 +7916,7 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
* If we aren't doing async submit, calculate the csum of the
* bio now.
*/
- ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, 1);
+ ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, false);
if (ret)
goto err;
} else {
@@ -8076,8 +8132,13 @@ int btrfs_readpage(struct file *file, struct page *page)
btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL);
- if (bio_ctrl.bio)
- ret = submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags);
+ if (bio_ctrl.bio) {
+ int ret2;
+
+ ret2 = submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags);
+ if (ret == 0)
+ ret = ret2;
+ }
return ret;
}
@@ -8118,8 +8179,8 @@ static void btrfs_readahead(struct readahead_control *rac)
}
/*
- * For releasepage() and invalidatepage() we have a race window where
- * end_page_writeback() is called but the subpage spinlock is not yet released.
+ * For releasepage() and invalidate_folio() we have a race window where
+ * folio_end_writeback() is called but the subpage spinlock is not yet released.
* If we continue to release/invalidate the page, we could cause use-after-free
* for subpage spinlock. So this function is to spin and wait for subpage
* spinlock.
@@ -8195,48 +8256,48 @@ static int btrfs_migratepage(struct address_space *mapping,
}
#endif
-static void btrfs_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
+static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length)
{
- struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
+ struct btrfs_inode *inode = BTRFS_I(folio->mapping->host);
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct extent_io_tree *tree = &inode->io_tree;
struct extent_state *cached_state = NULL;
- u64 page_start = page_offset(page);
- u64 page_end = page_start + PAGE_SIZE - 1;
+ u64 page_start = folio_pos(folio);
+ u64 page_end = page_start + folio_size(folio) - 1;
u64 cur;
int inode_evicting = inode->vfs_inode.i_state & I_FREEING;
/*
- * We have page locked so no new ordered extent can be created on this
- * page, nor bio can be submitted for this page.
+ * We have the folio locked, so no new ordered extent can be created on
+ * this folio, nor can a bio be submitted for it.
*
- * But already submitted bio can still be finished on this page.
- * Furthermore, endio function won't skip page which has Ordered
+ * But already submitted bio can still be finished on this folio.
+ * Furthermore, endio function won't skip folio which has Ordered
* (Private2) already cleared, so it's possible for endio and
- * invalidatepage to do the same ordered extent accounting twice
- * on one page.
+ * invalidate_folio to do the same ordered extent accounting twice
+ * on one folio.
*
* So here we wait for any submitted bios to finish, so that we won't
- * do double ordered extent accounting on the same page.
+ * do double ordered extent accounting on the same folio.
*/
- wait_on_page_writeback(page);
- wait_subpage_spinlock(page);
+ folio_wait_writeback(folio);
+ wait_subpage_spinlock(&folio->page);
/*
* For subpage case, we have call sites like
* btrfs_punch_hole_lock_range() which passes range not aligned to
* sectorsize.
- * If the range doesn't cover the full page, we don't need to and
- * shouldn't clear page extent mapped, as page->private can still
+ * If the range doesn't cover the full folio, we don't need to and
+ * shouldn't clear page extent mapped, as folio->private can still
* record subpage dirty bits for other part of the range.
*
- * For cases that can invalidate the full even the range doesn't
- * cover the full page, like invalidating the last page, we're
+ * For cases that invalidate the full folio even if the range doesn't
+ * cover the full folio, like invalidating the last folio, we're
* still safe to wait for ordered extent to finish.
*/
if (!(offset == 0 && length == PAGE_SIZE)) {
- btrfs_releasepage(page, GFP_NOFS);
+ btrfs_releasepage(&folio->page, GFP_NOFS);
return;
}
@@ -8277,7 +8338,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
page_end);
ASSERT(range_end + 1 - cur < U32_MAX);
range_len = range_end + 1 - cur;
- if (!btrfs_page_test_ordered(fs_info, page, cur, range_len)) {
+ if (!btrfs_page_test_ordered(fs_info, &folio->page, cur, range_len)) {
/*
* If Ordered (Private2) is cleared, it means endio has
* already been executed for the range.
@@ -8287,7 +8348,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
delete_states = false;
goto next;
}
- btrfs_page_clear_ordered(fs_info, page, cur, range_len);
+ btrfs_page_clear_ordered(fs_info, &folio->page, cur, range_len);
/*
* IO on this page will never be started, so we need to account
@@ -8357,11 +8418,11 @@ next:
* should not have Ordered (Private2) anymore, or the above iteration
* did something wrong.
*/
- ASSERT(!PageOrdered(page));
- btrfs_page_clear_checked(fs_info, page, page_offset(page), PAGE_SIZE);
+ ASSERT(!folio_test_ordered(folio));
+ btrfs_page_clear_checked(fs_info, &folio->page, folio_pos(folio), folio_size(folio));
if (!inode_evicting)
- __btrfs_releasepage(page, GFP_NOFS);
- clear_page_extent_mapped(page);
+ __btrfs_releasepage(&folio->page, GFP_NOFS);
+ clear_page_extent_mapped(&folio->page);
}
/*
@@ -8706,7 +8767,7 @@ out:
* extents beyond i_size to drop.
*/
if (control.extents_found > 0)
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
+ btrfs_set_inode_full_sync(BTRFS_I(inode));
return ret;
}
@@ -8759,7 +8820,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
struct btrfs_inode *ei;
struct inode *inode;
- ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_KERNEL);
+ ei = alloc_inode_sb(sb, btrfs_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
@@ -9002,14 +9063,14 @@ static int btrfs_rename_exchange(struct inode *old_dir,
struct inode *new_inode = new_dentry->d_inode;
struct inode *old_inode = old_dentry->d_inode;
struct timespec64 ctime = current_time(old_inode);
+ struct btrfs_rename_ctx old_rename_ctx;
+ struct btrfs_rename_ctx new_rename_ctx;
u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
u64 new_ino = btrfs_ino(BTRFS_I(new_inode));
u64 old_idx = 0;
u64 new_idx = 0;
int ret;
int ret2;
- bool root_log_pinned = false;
- bool dest_log_pinned = false;
bool need_abort = false;
/*
@@ -9112,29 +9173,6 @@ static int btrfs_rename_exchange(struct inode *old_dir,
BTRFS_I(new_inode), 1);
}
- /*
- * Now pin the logs of the roots. We do it to ensure that no other task
- * can sync the logs while we are in progress with the rename, because
- * that could result in an inconsistency in case any of the inodes that
- * are part of this rename operation were logged before.
- *
- * We pin the logs even if at this precise moment none of the inodes was
- * logged before. This is because right after we checked for that, some
- * other task fsyncing some other inode not involved with this rename
- * operation could log that one of our inodes exists.
- *
- * We don't need to pin the logs before the above calls to
- * btrfs_insert_inode_ref(), since those don't ever need to change a log.
- */
- if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
- btrfs_pin_log_trans(root);
- root_log_pinned = true;
- }
- if (new_ino != BTRFS_FIRST_FREE_OBJECTID) {
- btrfs_pin_log_trans(dest);
- dest_log_pinned = true;
- }
-
/* src is a subvolume */
if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
@@ -9142,7 +9180,8 @@ static int btrfs_rename_exchange(struct inode *old_dir,
ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
BTRFS_I(old_dentry->d_inode),
old_dentry->d_name.name,
- old_dentry->d_name.len);
+ old_dentry->d_name.len,
+ &old_rename_ctx);
if (!ret)
ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode));
}
@@ -9158,7 +9197,8 @@ static int btrfs_rename_exchange(struct inode *old_dir,
ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir),
BTRFS_I(new_dentry->d_inode),
new_dentry->d_name.name,
- new_dentry->d_name.len);
+ new_dentry->d_name.len,
+ &new_rename_ctx);
if (!ret)
ret = btrfs_update_inode(trans, dest, BTRFS_I(new_inode));
}
@@ -9188,46 +9228,31 @@ static int btrfs_rename_exchange(struct inode *old_dir,
if (new_inode->i_nlink == 1)
BTRFS_I(new_inode)->dir_index = new_idx;
- if (root_log_pinned) {
- btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir),
- new_dentry->d_parent);
- btrfs_end_log_trans(root);
- root_log_pinned = false;
- }
- if (dest_log_pinned) {
- btrfs_log_new_name(trans, BTRFS_I(new_inode), BTRFS_I(new_dir),
- old_dentry->d_parent);
- btrfs_end_log_trans(dest);
- dest_log_pinned = false;
- }
-out_fail:
/*
- * If we have pinned a log and an error happened, we unpin tasks
- * trying to sync the log and force them to fallback to a transaction
- * commit if the log currently contains any of the inodes involved in
- * this rename operation (to ensure we do not persist a log with an
- * inconsistent state for any of these inodes or leading to any
- * inconsistencies when replayed). If the transaction was aborted, the
- * abortion reason is propagated to userspace when attempting to commit
- * the transaction. If the log does not contain any of these inodes, we
- * allow the tasks to sync it.
+ * Now pin the logs of the roots. We do it to ensure that no other task
+ * can sync the logs while we are in progress with the rename, because
+ * that could result in an inconsistency in case any of the inodes that
+ * are part of this rename operation were logged before.
*/
- if (ret && (root_log_pinned || dest_log_pinned)) {
- if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) ||
- btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) ||
- btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) ||
- btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation))
- btrfs_set_log_full_commit(trans);
+ if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
+ btrfs_pin_log_trans(root);
+ if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
+ btrfs_pin_log_trans(dest);
- if (root_log_pinned) {
- btrfs_end_log_trans(root);
- root_log_pinned = false;
- }
- if (dest_log_pinned) {
- btrfs_end_log_trans(dest);
- dest_log_pinned = false;
- }
- }
+ /* Do the log updates for all inodes. */
+ if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
+ btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
+ old_rename_ctx.index, new_dentry->d_parent);
+ if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
+ btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir),
+ new_rename_ctx.index, old_dentry->d_parent);
+
+ /* Now unpin the logs. */
+ if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
+ btrfs_end_log_trans(root);
+ if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
+ btrfs_end_log_trans(dest);
+out_fail:
ret2 = btrfs_end_transaction(trans);
ret = ret ? ret : ret2;
out_notrans:
@@ -9302,11 +9327,11 @@ static int btrfs_rename(struct user_namespace *mnt_userns,
struct btrfs_root *dest = BTRFS_I(new_dir)->root;
struct inode *new_inode = d_inode(new_dentry);
struct inode *old_inode = d_inode(old_dentry);
+ struct btrfs_rename_ctx rename_ctx;
u64 index = 0;
int ret;
int ret2;
u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
- bool log_pinned = false;
if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
return -EPERM;
@@ -9411,29 +9436,11 @@ static int btrfs_rename(struct user_namespace *mnt_userns,
if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
} else {
- /*
- * Now pin the log. We do it to ensure that no other task can
- * sync the log while we are in progress with the rename, as
- * that could result in an inconsistency in case any of the
- * inodes that are part of this rename operation were logged
- * before.
- *
- * We pin the log even if at this precise moment none of the
- * inodes was logged before. This is because right after we
- * checked for that, some other task fsyncing some other inode
- * not involved with this rename operation could log that one of
- * our inodes exists.
- *
- * We don't need to pin the logs before the above call to
- * btrfs_insert_inode_ref(), since that does not need to change
- * a log.
- */
- btrfs_pin_log_trans(root);
- log_pinned = true;
ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
BTRFS_I(d_inode(old_dentry)),
old_dentry->d_name.name,
- old_dentry->d_name.len);
+ old_dentry->d_name.len,
+ &rename_ctx);
if (!ret)
ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode));
}
@@ -9475,12 +9482,9 @@ static int btrfs_rename(struct user_namespace *mnt_userns,
if (old_inode->i_nlink == 1)
BTRFS_I(old_inode)->dir_index = index;
- if (log_pinned) {
- btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir),
- new_dentry->d_parent);
- btrfs_end_log_trans(root);
- log_pinned = false;
- }
+ if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
+ btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
+ rename_ctx.index, new_dentry->d_parent);
if (flags & RENAME_WHITEOUT) {
ret = btrfs_whiteout_for_rename(trans, root, mnt_userns,
@@ -9492,28 +9496,6 @@ static int btrfs_rename(struct user_namespace *mnt_userns,
}
}
out_fail:
- /*
- * If we have pinned the log and an error happened, we unpin tasks
- * trying to sync the log and force them to fallback to a transaction
- * commit if the log currently contains any of the inodes involved in
- * this rename operation (to ensure we do not persist a log with an
- * inconsistent state for any of these inodes or leading to any
- * inconsistencies when replayed). If the transaction was aborted, the
- * abortion reason is propagated to userspace when attempting to commit
- * the transaction. If the log does not contain any of these inodes, we
- * allow the tasks to sync it.
- */
- if (ret && log_pinned) {
- if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) ||
- btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) ||
- btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) ||
- (new_inode &&
- btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation)))
- btrfs_set_log_full_commit(trans);
-
- btrfs_end_log_trans(root);
- log_pinned = false;
- }
ret2 = btrfs_end_transaction(trans);
ret = ret ? ret : ret2;
out_notrans:
@@ -9993,8 +9975,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
em = alloc_extent_map();
if (!em) {
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &BTRFS_I(inode)->runtime_flags);
+ btrfs_set_inode_full_sync(BTRFS_I(inode));
goto next;
}
@@ -10076,11 +10057,6 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
min_size, actual_len, alloc_hint, trans);
}
-static int btrfs_set_page_dirty(struct page *page)
-{
- return __set_page_dirty_nobuffers(page);
-}
-
static int btrfs_permission(struct user_namespace *mnt_userns,
struct inode *inode, int mask)
{
@@ -10182,6 +10158,747 @@ void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end)
}
}
+/*
+ * Translate an extent compression type (BTRFS_COMPRESS_*) into the value
+ * reported through the encoded I/O interface (BTRFS_ENCODED_IO_COMPRESSION_*).
+ *
+ * Returns the encoded I/O compression value on success, -EINVAL when the
+ * extent is LZO and the filesystem sector size is outside [4K, 64K], and
+ * -EUCLEAN when @compress_type is none of the known types.
+ */
+static int btrfs_encoded_io_compression_from_extent(
+				struct btrfs_fs_info *fs_info,
+				int compress_type)
+{
+	if (compress_type == BTRFS_COMPRESS_NONE)
+		return BTRFS_ENCODED_IO_COMPRESSION_NONE;
+
+	if (compress_type == BTRFS_COMPRESS_ZLIB)
+		return BTRFS_ENCODED_IO_COMPRESSION_ZLIB;
+
+	if (compress_type == BTRFS_COMPRESS_ZSTD)
+		return BTRFS_ENCODED_IO_COMPRESSION_ZSTD;
+
+	/* Anything that is not LZO at this point is an unknown type. */
+	if (compress_type != BTRFS_COMPRESS_LZO)
+		return -EUCLEAN;
+
+	/*
+	 * The LZO format depends on the sector size. 64K is the maximum
+	 * sector size that we support.
+	 */
+	if (fs_info->sectorsize < SZ_4K || fs_info->sectorsize > SZ_64K)
+		return -EINVAL;
+
+	/* Offset from the 4K (2^12) variant by the sector size exponent. */
+	return BTRFS_ENCODED_IO_COMPRESSION_LZO_4K +
+	       (fs_info->sectorsize_bits - 12);
+}
+
+/*
+ * Serve an encoded read out of an inline file extent.
+ *
+ * The file extent item at @extent_start is looked up, the relevant bytes are
+ * copied from the leaf into a temporary buffer, and only then are the extent
+ * range [@start, @lockend] and the shared inode lock dropped (with *@unlocked
+ * set to true) before the buffer is copied to @iter.
+ *
+ * For a compressed inline extent the whole inline payload is returned and
+ * -ENOBUFS is returned if @count is too small for it. For an uncompressed
+ * one, at most @count bytes starting at iocb->ki_pos are returned.
+ *
+ * Fills in @encoded->len, ->compression, ->unencoded_len and, for compressed
+ * extents, ->unencoded_offset.
+ *
+ * Returns the number of bytes copied to @iter, or a negative errno.
+ */
+static ssize_t btrfs_encoded_read_inline(
+				struct kiocb *iocb,
+				struct iov_iter *iter, u64 start,
+				u64 lockend,
+				struct extent_state **cached_state,
+				u64 extent_start, size_t count,
+				struct btrfs_ioctl_encoded_io_args *encoded,
+				bool *unlocked)
+{
+	struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
+	struct btrfs_root *root = inode->root;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct extent_io_tree *io_tree = &inode->io_tree;
+	struct btrfs_path *path;
+	struct extent_buffer *eb;
+	struct btrfs_file_extent_item *fi;
+	unsigned long data_off;
+	u64 ram_bytes;
+	u64 data_end;
+	void *buf;
+	ssize_t ret;
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
+				       extent_start, 0);
+	/* A positive result is reported as -EIO: the extent item disappeared? */
+	if (ret > 0)
+		ret = -EIO;
+	if (ret)
+		goto out;
+
+	eb = path->nodes[0];
+	fi = btrfs_item_ptr(eb, path->slots[0], struct btrfs_file_extent_item);
+	ram_bytes = btrfs_file_extent_ram_bytes(eb, fi);
+	data_off = btrfs_file_extent_inline_start(fi);
+
+	/* Never report data past the end of the extent or past i_size. */
+	data_end = min_t(u64, extent_start + ram_bytes,
+			 inode->vfs_inode.i_size);
+	encoded->len = data_end - iocb->ki_pos;
+
+	ret = btrfs_encoded_io_compression_from_extent(fs_info,
+				btrfs_file_extent_compression(eb, fi));
+	if (ret < 0)
+		goto out;
+	encoded->compression = ret;
+
+	if (!encoded->compression) {
+		/* Plain data: hand back only the requested slice. */
+		count = min_t(u64, count, encoded->len);
+		encoded->len = count;
+		encoded->unencoded_len = count;
+		data_off += iocb->ki_pos - extent_start;
+	} else {
+		/* Compressed data: the caller must take the whole payload. */
+		size_t inline_size;
+
+		inline_size = btrfs_file_extent_inline_item_len(eb,
+								path->slots[0]);
+		if (inline_size > count) {
+			ret = -ENOBUFS;
+			goto out;
+		}
+		count = inline_size;
+		encoded->unencoded_len = ram_bytes;
+		encoded->unencoded_offset = iocb->ki_pos - extent_start;
+	}
+
+	buf = kmalloc(count, GFP_NOFS);
+	if (!buf) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/* Snapshot the data, then drop every lock before touching @iter. */
+	read_extent_buffer(eb, buf, data_off, count);
+	btrfs_release_path(path);
+	unlock_extent_cached(io_tree, start, lockend, cached_state);
+	btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
+	*unlocked = true;
+
+	if (copy_to_iter(buf, count, iter) != count)
+		ret = -EFAULT;
+	else
+		ret = count;
+	kfree(buf);
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * State shared between the submitter of an encoded read and the endio handler
+ * of every bio issued for it.
+ *
+ * The structure lives on the stack of
+ * btrfs_encoded_read_regular_fill_pages(), so nothing may touch it once the
+ * submitter has been allowed to return. A completion (rather than a bare wait
+ * queue plus a lockless condition check) is used for that reason: the waiter
+ * cannot return from wait_for_completion_io() until complete() has dropped
+ * the completion's internal lock, so the last endio never touches a dead
+ * stack frame.
+ */
+struct btrfs_encoded_read_private {
+	struct btrfs_inode *inode;
+	/* File offset passed to the checksum verification of each bio. */
+	u64 file_offset;
+	/* Signalled by the endio handler that drops @pending to zero. */
+	struct completion done;
+	/* One count held by the submitter plus one per submitted bio. */
+	atomic_t pending;
+	/* Error status recorded by the submitter or by an endio handler. */
+	blk_status_t status;
+	/* Set when the inode has BTRFS_INODE_NODATASUM. */
+	bool skip_csum;
+};
+
+/*
+ * Submit one read bio on behalf of an encoded read.
+ *
+ * Unless checksums are skipped, the checksums for the bio are looked up
+ * first. The pending count of the private structure is raised just before
+ * the bio is mapped and dropped again if mapping fails.
+ *
+ * Returns BLK_STS_OK on success. On failure the bio has not been released;
+ * that is left to the caller.
+ */
+static blk_status_t submit_encoded_read_bio(struct btrfs_inode *inode,
+					    struct bio *bio, int mirror_num)
+{
+	struct btrfs_encoded_read_private *priv = bio->bi_private;
+	struct btrfs_bio *bbio = btrfs_bio(bio);
+	struct btrfs_fs_info *fs_info = inode->root->fs_info;
+	blk_status_t ret;
+
+	if (!priv->skip_csum) {
+		ret = btrfs_lookup_bio_sums(&inode->vfs_inode, bio, NULL);
+		if (ret)
+			return ret;
+	}
+
+	ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
+	if (ret) {
+		btrfs_bio_free_csum(bbio);
+		return ret;
+	}
+
+	atomic_inc(&priv->pending);
+	ret = btrfs_map_bio(fs_info, bio, mirror_num);
+	if (ret) {
+		atomic_dec(&priv->pending);
+		btrfs_bio_free_csum(bbio);
+	}
+	return ret;
+}
+
+/*
+ * Verify the data checksums of a completed encoded read bio, one sector at a
+ * time.
+ *
+ * Returns the bio's own status if it failed or if checksums are skipped,
+ * BLK_STS_IOERR on the first sector whose checksum does not match, and
+ * BLK_STS_OK otherwise.
+ */
+static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio)
+{
+	const bool uptodate = (bbio->bio.bi_status == BLK_STS_OK);
+	struct btrfs_encoded_read_private *priv = bbio->bio.bi_private;
+	struct btrfs_inode *inode = priv->inode;
+	struct btrfs_fs_info *fs_info = inode->root->fs_info;
+	u32 sectorsize = fs_info->sectorsize;
+	struct bio_vec *bvec;
+	struct bvec_iter_all iter_all;
+	u64 start = priv->file_offset;
+	u32 bio_offset = 0;
+
+	if (priv->skip_csum || !uptodate)
+		return bbio->bio.bi_status;
+
+	bio_for_each_segment_all(bvec, &bbio->bio, iter_all) {
+		unsigned int i, nr_sectors, pgoff;
+
+		nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len);
+		pgoff = bvec->bv_offset;
+		for (i = 0; i < nr_sectors; i++) {
+			ASSERT(pgoff < PAGE_SIZE);
+			if (check_data_csum(&inode->vfs_inode, bbio, bio_offset,
+					    bvec->bv_page, pgoff, start))
+				return BLK_STS_IOERR;
+			start += sectorsize;
+			bio_offset += sectorsize;
+			pgoff += sectorsize;
+		}
+	}
+	return BLK_STS_OK;
+}
+
+/*
+ * End I/O handler for encoded read bios.
+ *
+ * Records a failure status in the shared private structure, drops this bio's
+ * pending count and, if it was the last one, signals the submitter. The
+ * private structure must not be touched after complete() because it lives on
+ * the submitter's stack.
+ */
+static void btrfs_encoded_read_endio(struct bio *bio)
+{
+	struct btrfs_encoded_read_private *priv = bio->bi_private;
+	struct btrfs_bio *bbio = btrfs_bio(bio);
+	blk_status_t status;
+
+	status = btrfs_encoded_read_verify_csum(bbio);
+	if (status) {
+		/*
+		 * The memory barrier implied by the atomic_dec_and_test() here
+		 * pairs with the memory barrier implied by the
+		 * atomic_dec_and_test() or wait_for_completion_io() in
+		 * btrfs_encoded_read_regular_fill_pages() to ensure that this
+		 * write is observed before the load of status in
+		 * btrfs_encoded_read_regular_fill_pages().
+		 */
+		WRITE_ONCE(priv->status, status);
+	}
+	if (atomic_dec_and_test(&priv->pending))
+		complete(&priv->done);
+	/* Only bio-owned state may be used from here on, never @priv. */
+	btrfs_bio_free_csum(bbio);
+	bio_put(bio);
+}
+
+/*
+ * Read @disk_io_size bytes starting at @disk_bytenr into @pages and wait for
+ * all of the I/O to finish.
+ *
+ * @inode:        inode the data belongs to
+ * @file_offset:  file offset recorded for checksum verification
+ * @disk_bytenr:  logical address of the first byte to read
+ * @disk_io_size: number of bytes to read
+ * @pages:        destination pages, one per PAGE_SIZE of @disk_io_size
+ *
+ * Bios are split at the limits reported by btrfs_get_io_geometry() and
+ * whenever a page cannot be added to the current bio.
+ *
+ * Returns 0 on success or a negative errno derived from the recorded bio
+ * status.
+ */
+static int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
+						 u64 file_offset,
+						 u64 disk_bytenr,
+						 u64 disk_io_size,
+						 struct page **pages)
+{
+	struct btrfs_fs_info *fs_info = inode->root->fs_info;
+	struct btrfs_encoded_read_private priv = {
+		.inode = inode,
+		.file_offset = file_offset,
+		.pending = ATOMIC_INIT(1),
+		.skip_csum = (inode->flags & BTRFS_INODE_NODATASUM),
+	};
+	unsigned long i = 0;
+	u64 cur = 0;
+	int ret;
+
+	init_completion(&priv.done);
+	/*
+	 * Submit bios for the extent, splitting due to bio or stripe limits as
+	 * necessary.
+	 */
+	while (cur < disk_io_size) {
+		struct extent_map *em;
+		struct btrfs_io_geometry geom;
+		struct bio *bio = NULL;
+		u64 remaining;
+
+		em = btrfs_get_chunk_map(fs_info, disk_bytenr + cur,
+					 disk_io_size - cur);
+		if (IS_ERR(em)) {
+			ret = PTR_ERR(em);
+		} else {
+			ret = btrfs_get_io_geometry(fs_info, em, BTRFS_MAP_READ,
+						    disk_bytenr + cur, &geom);
+			free_extent_map(em);
+		}
+		if (ret) {
+			WRITE_ONCE(priv.status, errno_to_blk_status(ret));
+			break;
+		}
+		remaining = min(geom.len, disk_io_size - cur);
+		/*
+		 * Keep looping while there is data left for this geometry or
+		 * a partially filled bio that still has to be submitted.
+		 */
+		while (bio || remaining) {
+			size_t bytes = min_t(u64, remaining, PAGE_SIZE);
+
+			if (!bio) {
+				bio = btrfs_bio_alloc(BIO_MAX_VECS);
+				bio->bi_iter.bi_sector =
+					(disk_bytenr + cur) >> SECTOR_SHIFT;
+				bio->bi_end_io = btrfs_encoded_read_endio;
+				bio->bi_private = &priv;
+				bio->bi_opf = REQ_OP_READ;
+			}
+
+			/* Nothing left to add, or the bio is full: submit. */
+			if (!bytes ||
+			    bio_add_page(bio, pages[i], bytes, 0) < bytes) {
+				blk_status_t status;
+
+				status = submit_encoded_read_bio(inode, bio, 0);
+				if (status) {
+					WRITE_ONCE(priv.status, status);
+					bio_put(bio);
+					goto out;
+				}
+				bio = NULL;
+				continue;
+			}
+
+			i++;
+			cur += bytes;
+			remaining -= bytes;
+		}
+	}
+
+out:
+	/*
+	 * Drop the submitter's count. If bios are still in flight, sleep until
+	 * the last endio calls complete(); unlike a lockless wait-queue
+	 * condition check, this cannot return while that endio is still using
+	 * priv.
+	 */
+	if (!atomic_dec_and_test(&priv.pending))
+		wait_for_completion_io(&priv.done);
+	/* See btrfs_encoded_read_endio() for ordering. */
+	return blk_status_to_errno(READ_ONCE(priv.status));
+}
+
+/*
+ * Serve an encoded read from a regular (non-inline) extent.
+ *
+ * Temporary pages covering @disk_io_size bytes are allocated and filled from
+ * disk. After that the extent range [@start, @lockend] and the shared inode
+ * lock are released (with *@unlocked set to true) and @count bytes are copied
+ * to @iter. For compressed data the copy starts at the beginning of the first
+ * page; otherwise it starts at the offset of iocb->ki_pos relative to @start.
+ *
+ * Returns @count on success, -ENOMEM if a page or the page array cannot be
+ * allocated, -EFAULT if the copy to @iter comes up short, or the error from
+ * reading the pages.
+ */
+static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
+					  struct iov_iter *iter,
+					  u64 start, u64 lockend,
+					  struct extent_state **cached_state,
+					  u64 disk_bytenr, u64 disk_io_size,
+					  size_t count, bool compressed,
+					  bool *unlocked)
+{
+	struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
+	struct extent_io_tree *io_tree = &inode->io_tree;
+	unsigned long nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE);
+	struct page **pages;
+	unsigned long idx;
+	size_t pg_off;
+	u64 skip;
+	u64 copied;
+	ssize_t ret;
+
+	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
+	if (!pages)
+		return -ENOMEM;
+
+	for (idx = 0; idx < nr_pages; idx++) {
+		pages[idx] = alloc_page(GFP_NOFS);
+		if (!pages[idx]) {
+			ret = -ENOMEM;
+			goto out;
+		}
+	}
+
+	ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr,
+						    disk_io_size, pages);
+	if (ret)
+		goto out;
+
+	/* The data is in private pages now, so the locks can go. */
+	unlock_extent_cached(io_tree, start, lockend, cached_state);
+	btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
+	*unlocked = true;
+
+	/* Compressed data is returned whole; plain data from ki_pos onwards. */
+	skip = compressed ? 0 : iocb->ki_pos - start;
+	idx = skip >> PAGE_SHIFT;
+	pg_off = skip & (PAGE_SIZE - 1);
+
+	for (copied = 0; copied < count; idx++, pg_off = 0) {
+		size_t bytes = min_t(size_t, count - copied,
+				     PAGE_SIZE - pg_off);
+
+		if (copy_page_to_iter(pages[idx], pg_off, bytes,
+				      iter) != bytes) {
+			ret = -EFAULT;
+			goto out;
+		}
+		copied += bytes;
+	}
+	ret = count;
+out:
+	/* Slots past a failed allocation are still NULL from kcalloc(). */
+	for (idx = 0; idx < nr_pages; idx++) {
+		if (pages[idx])
+			__free_page(pages[idx]);
+	}
+	kfree(pages);
+	return ret;
+}
+
+/*
+ * Read the extent containing iocb->ki_pos in its encoded (on-disk) form.
+ *
+ * @iocb:    supplies the file and the read position (ki_pos)
+ * @iter:    destination for the data; its size is the caller's buffer size
+ * @encoded: output; len, unencoded_len, unencoded_offset and compression are
+ *           filled in depending on the kind of extent found
+ *
+ * The inode is locked shared and the range starting at the sector-aligned
+ * position, BTRFS_MAX_UNCOMPRESSED bytes long, is locked in the io tree once
+ * no ordered extent remains in it. Depending on the extent map found:
+ *  - inline extent: handled by btrfs_encoded_read_inline()
+ *  - hole or prealloc extent: zeroes are written to @iter
+ *  - compressed extent: the whole compressed extent is returned, or -ENOBUFS
+ *    if @iter is too small for it
+ *  - anything else: plain data is read, limited to the locked range
+ *
+ * Both locks are released either by the helper that copies to @iter (which
+ * then sets "unlocked") or at the exit labels.
+ *
+ * Returns 0 when ki_pos is at or beyond i_size, otherwise the value returned
+ * by the chosen path (the number of bytes copied to @iter on success) or a
+ * negative errno. iocb->ki_pos is advanced by encoded->len when the "out"
+ * label is reached with a non-negative result.
+ */
+ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
+ struct btrfs_ioctl_encoded_io_args *encoded)
+{
+ struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct extent_io_tree *io_tree = &inode->io_tree;
+ ssize_t ret;
+ size_t count = iov_iter_count(iter);
+ u64 start, lockend, disk_bytenr, disk_io_size;
+ struct extent_state *cached_state = NULL;
+ struct extent_map *em;
+ bool unlocked = false;
+
+ file_accessed(iocb->ki_filp);
+
+ btrfs_inode_lock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
+
+ if (iocb->ki_pos >= inode->vfs_inode.i_size) {
+ btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
+ return 0;
+ }
+ start = ALIGN_DOWN(iocb->ki_pos, fs_info->sectorsize);
+ /*
+ * We don't know how long the extent containing iocb->ki_pos is, but if
+ * it's compressed we know that it won't be longer than this.
+ */
+ lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;
+
+ /*
+ * Wait for ordered extents in the range, lock it, and retry if another
+ * ordered extent showed up between the wait and the lock.
+ */
+ for (;;) {
+ struct btrfs_ordered_extent *ordered;
+
+ ret = btrfs_wait_ordered_range(&inode->vfs_inode, start,
+ lockend - start + 1);
+ if (ret)
+ goto out_unlock_inode;
+ lock_extent_bits(io_tree, start, lockend, &cached_state);
+ ordered = btrfs_lookup_ordered_range(inode, start,
+ lockend - start + 1);
+ if (!ordered)
+ break;
+ btrfs_put_ordered_extent(ordered);
+ unlock_extent_cached(io_tree, start, lockend, &cached_state);
+ cond_resched();
+ }
+
+ em = btrfs_get_extent(inode, NULL, 0, start, lockend - start + 1);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto out_unlock_extent;
+ }
+
+ if (em->block_start == EXTENT_MAP_INLINE) {
+ u64 extent_start = em->start;
+
+ /*
+ * For inline extents we get everything we need out of the
+ * extent item.
+ */
+ free_extent_map(em);
+ em = NULL;
+ ret = btrfs_encoded_read_inline(iocb, iter, start, lockend,
+ &cached_state, extent_start,
+ count, encoded, &unlocked);
+ goto out;
+ }
+
+ /*
+ * We only want to return up to EOF even if the extent extends beyond
+ * that.
+ */
+ encoded->len = min_t(u64, extent_map_end(em),
+ inode->vfs_inode.i_size) - iocb->ki_pos;
+ if (em->block_start == EXTENT_MAP_HOLE ||
+ test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+ /* Nothing to read from disk; disk_bytenr doubles as a marker. */
+ disk_bytenr = EXTENT_MAP_HOLE;
+ count = min_t(u64, count, encoded->len);
+ encoded->len = count;
+ encoded->unencoded_len = count;
+ } else if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+ disk_bytenr = em->block_start;
+ /*
+ * Bail if the buffer isn't large enough to return the whole
+ * compressed extent.
+ */
+ if (em->block_len > count) {
+ ret = -ENOBUFS;
+ goto out_em;
+ }
+ disk_io_size = count = em->block_len;
+ encoded->unencoded_len = em->ram_bytes;
+ encoded->unencoded_offset = iocb->ki_pos - em->orig_start;
+ ret = btrfs_encoded_io_compression_from_extent(fs_info,
+ em->compress_type);
+ if (ret < 0)
+ goto out_em;
+ encoded->compression = ret;
+ } else {
+ disk_bytenr = em->block_start + (start - em->start);
+ if (encoded->len > count)
+ encoded->len = count;
+ /*
+ * Don't read beyond what we locked. This also limits the page
+ * allocations that we'll do.
+ */
+ disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start;
+ count = start + disk_io_size - iocb->ki_pos;
+ encoded->len = count;
+ encoded->unencoded_len = count;
+ disk_io_size = ALIGN(disk_io_size, fs_info->sectorsize);
+ }
+ /* em is set to NULL so the free_extent_map() at out_em gets NULL. */
+ free_extent_map(em);
+ em = NULL;
+
+ if (disk_bytenr == EXTENT_MAP_HOLE) {
+ unlock_extent_cached(io_tree, start, lockend, &cached_state);
+ btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
+ unlocked = true;
+ ret = iov_iter_zero(count, iter);
+ if (ret != count)
+ ret = -EFAULT;
+ } else {
+ ret = btrfs_encoded_read_regular(iocb, iter, start, lockend,
+ &cached_state, disk_bytenr,
+ disk_io_size, count,
+ encoded->compression,
+ &unlocked);
+ }
+
+out:
+ if (ret >= 0)
+ iocb->ki_pos += encoded->len;
+out_em:
+ free_extent_map(em);
+out_unlock_extent:
+ if (!unlocked)
+ unlock_extent_cached(io_tree, start, lockend, &cached_state);
+out_unlock_inode:
+ if (!unlocked)
+ btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
+ return ret;
+}
+
+/*
+ * Write already-compressed data from @from as one extent at iocb->ki_pos.
+ *
+ * @iocb:    supplies the file and the write position (ki_pos)
+ * @from:    the compressed bytes to store
+ * @encoded: describes the data: compression, encryption, len, unencoded_len
+ *           and unencoded_offset
+ *
+ * Requests are rejected with -EINVAL when any of these holds:
+ *  - the compression is not ZLIB, ZSTD or the LZO variant matching the
+ *    filesystem sector size
+ *  - any encryption is requested
+ *  - the sizes are out of bounds or the data is not smaller than its
+ *    unencoded length
+ *  - the start, end (unless at or beyond i_size) or unencoded offset is not
+ *    sector-aligned
+ *
+ * The data is copied into freshly allocated pages, zero-padded to a sector
+ * boundary. An inline extent is tried first when the write starts at offset 0
+ * and covers the whole unencoded data; otherwise an extent is reserved, an
+ * ordered extent is added and the pages are submitted as a compressed write.
+ *
+ * Returns the number of bytes consumed from @from on success, in which case
+ * iocb->ki_pos is advanced by encoded->len, or a negative errno. Failures
+ * before the page array is allocated return directly; later ones unwind
+ * through the exit labels in reverse order of acquisition.
+ */
+ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
+ const struct btrfs_ioctl_encoded_io_args *encoded)
+{
+ struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct extent_io_tree *io_tree = &inode->io_tree;
+ struct extent_changeset *data_reserved = NULL;
+ struct extent_state *cached_state = NULL;
+ int compression;
+ size_t orig_count;
+ u64 start, end;
+ u64 num_bytes, ram_bytes, disk_num_bytes;
+ unsigned long nr_pages, i;
+ struct page **pages;
+ struct btrfs_key ins;
+ bool extent_reserved = false;
+ struct extent_map *em;
+ ssize_t ret;
+
+ /* Uncompressed data has no case here and is rejected by the default. */
+ switch (encoded->compression) {
+ case BTRFS_ENCODED_IO_COMPRESSION_ZLIB:
+ compression = BTRFS_COMPRESS_ZLIB;
+ break;
+ case BTRFS_ENCODED_IO_COMPRESSION_ZSTD:
+ compression = BTRFS_COMPRESS_ZSTD;
+ break;
+ case BTRFS_ENCODED_IO_COMPRESSION_LZO_4K:
+ case BTRFS_ENCODED_IO_COMPRESSION_LZO_8K:
+ case BTRFS_ENCODED_IO_COMPRESSION_LZO_16K:
+ case BTRFS_ENCODED_IO_COMPRESSION_LZO_32K:
+ case BTRFS_ENCODED_IO_COMPRESSION_LZO_64K:
+ /* The sector size must match for LZO. */
+ if (encoded->compression -
+ BTRFS_ENCODED_IO_COMPRESSION_LZO_4K + 12 !=
+ fs_info->sectorsize_bits)
+ return -EINVAL;
+ compression = BTRFS_COMPRESS_LZO;
+ break;
+ default:
+ return -EINVAL;
+ }
+ if (encoded->encryption != BTRFS_ENCODED_IO_ENCRYPTION_NONE)
+ return -EINVAL;
+
+ orig_count = iov_iter_count(from);
+
+ /* The extent size must be sane. */
+ if (encoded->unencoded_len > BTRFS_MAX_UNCOMPRESSED ||
+ orig_count > BTRFS_MAX_COMPRESSED || orig_count == 0)
+ return -EINVAL;
+
+ /*
+ * The compressed data must be smaller than the decompressed data.
+ *
+ * It's of course possible for data to compress to larger or the same
+ * size, but the buffered I/O path falls back to no compression for such
+ * data, and we don't want to break any assumptions by creating these
+ * extents.
+ *
+ * Note that this is less strict than the current check we have that the
+ * compressed data must be at least one sector smaller than the
+ * decompressed data. We only want to enforce the weaker requirement
+ * from old kernels that it is at least one byte smaller.
+ */
+ if (orig_count >= encoded->unencoded_len)
+ return -EINVAL;
+
+ /* The extent must start on a sector boundary. */
+ start = iocb->ki_pos;
+ if (!IS_ALIGNED(start, fs_info->sectorsize))
+ return -EINVAL;
+
+ /*
+ * The extent must end on a sector boundary. However, we allow a write
+ * which ends at or extends i_size to have an unaligned length; we round
+ * up the extent size and set i_size to the unaligned end.
+ */
+ if (start + encoded->len < inode->vfs_inode.i_size &&
+ !IS_ALIGNED(start + encoded->len, fs_info->sectorsize))
+ return -EINVAL;
+
+ /* Finally, the offset in the unencoded data must be sector-aligned. */
+ if (!IS_ALIGNED(encoded->unencoded_offset, fs_info->sectorsize))
+ return -EINVAL;
+
+ /*
+ * num_bytes is the sector-aligned length in the file, ram_bytes the
+ * sector-aligned unencoded length, and end the last file byte covered.
+ */
+ num_bytes = ALIGN(encoded->len, fs_info->sectorsize);
+ ram_bytes = ALIGN(encoded->unencoded_len, fs_info->sectorsize);
+ end = start + num_bytes - 1;
+
+ /*
+ * If the extent cannot be inline, the compressed data on disk must be
+ * sector-aligned. For convenience, we extend it with zeroes if it
+ * isn't.
+ */
+ disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize);
+ nr_pages = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE);
+ pages = kvcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL_ACCOUNT);
+ if (!pages)
+ return -ENOMEM;
+ /* Copy the caller's data in, zero-filling the tail of each short page. */
+ for (i = 0; i < nr_pages; i++) {
+ size_t bytes = min_t(size_t, PAGE_SIZE, iov_iter_count(from));
+ char *kaddr;
+
+ pages[i] = alloc_page(GFP_KERNEL_ACCOUNT);
+ if (!pages[i]) {
+ ret = -ENOMEM;
+ goto out_pages;
+ }
+ kaddr = kmap(pages[i]);
+ if (copy_from_iter(kaddr, bytes, from) != bytes) {
+ kunmap(pages[i]);
+ ret = -EFAULT;
+ goto out_pages;
+ }
+ if (bytes < PAGE_SIZE)
+ memset(kaddr + bytes, 0, PAGE_SIZE - bytes);
+ kunmap(pages[i]);
+ }
+
+ /*
+ * Wait for ordered extents and invalidate the page cache for the range,
+ * then lock it; retry if an ordered extent or a cached page is found
+ * once the lock is held.
+ */
+ for (;;) {
+ struct btrfs_ordered_extent *ordered;
+
+ ret = btrfs_wait_ordered_range(&inode->vfs_inode, start, num_bytes);
+ if (ret)
+ goto out_pages;
+ ret = invalidate_inode_pages2_range(inode->vfs_inode.i_mapping,
+ start >> PAGE_SHIFT,
+ end >> PAGE_SHIFT);
+ if (ret)
+ goto out_pages;
+ lock_extent_bits(io_tree, start, end, &cached_state);
+ ordered = btrfs_lookup_ordered_range(inode, start, num_bytes);
+ if (!ordered &&
+ !filemap_range_has_page(inode->vfs_inode.i_mapping, start, end))
+ break;
+ if (ordered)
+ btrfs_put_ordered_extent(ordered);
+ unlock_extent_cached(io_tree, start, end, &cached_state);
+ cond_resched();
+ }
+
+ /*
+ * We don't use the higher-level delalloc space functions because our
+ * num_bytes and disk_num_bytes are different.
+ */
+ ret = btrfs_alloc_data_chunk_ondemand(inode, disk_num_bytes);
+ if (ret)
+ goto out_unlock;
+ ret = btrfs_qgroup_reserve_data(inode, &data_reserved, start, num_bytes);
+ if (ret)
+ goto out_free_data_space;
+ ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, disk_num_bytes);
+ if (ret)
+ goto out_qgroup_free_data;
+
+ /* Try an inline extent first. */
+ if (start == 0 && encoded->unencoded_len == encoded->len &&
+ encoded->unencoded_offset == 0) {
+ ret = cow_file_range_inline(inode, encoded->len, orig_count,
+ compression, pages, true);
+ /*
+ * Zero is success and a negative value is an error; both leave
+ * through the release path. A positive value falls through to
+ * the regular extent path below.
+ */
+ if (ret <= 0) {
+ if (ret == 0)
+ ret = orig_count;
+ goto out_delalloc_release;
+ }
+ }
+
+ ret = btrfs_reserve_extent(root, disk_num_bytes, disk_num_bytes,
+ disk_num_bytes, 0, 0, &ins, 1, 1);
+ if (ret)
+ goto out_delalloc_release;
+ extent_reserved = true;
+
+ em = create_io_em(inode, start, num_bytes,
+ start - encoded->unencoded_offset, ins.objectid,
+ ins.offset, ins.offset, ram_bytes, compression,
+ BTRFS_ORDERED_COMPRESSED);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto out_free_reserved;
+ }
+ free_extent_map(em);
+
+ ret = btrfs_add_ordered_extent(inode, start, num_bytes, ram_bytes,
+ ins.objectid, ins.offset,
+ encoded->unencoded_offset,
+ (1 << BTRFS_ORDERED_ENCODED) |
+ (1 << BTRFS_ORDERED_COMPRESSED),
+ compression);
+ if (ret) {
+ btrfs_drop_extent_cache(inode, start, end, 0);
+ goto out_free_reserved;
+ }
+ btrfs_dec_block_group_reservations(fs_info, ins.objectid);
+
+ /* Extend i_size to the (possibly unaligned) end of the written data. */
+ if (start + encoded->len > inode->vfs_inode.i_size)
+ i_size_write(&inode->vfs_inode, start + encoded->len);
+
+ unlock_extent_cached(io_tree, start, end, &cached_state);
+
+ btrfs_delalloc_release_extents(inode, num_bytes);
+
+ if (btrfs_submit_compressed_write(inode, start, num_bytes, ins.objectid,
+ ins.offset, pages, nr_pages, 0, NULL,
+ false)) {
+ btrfs_writepage_endio_finish_ordered(inode, pages[0], start, end, 0);
+ ret = -EIO;
+ goto out_pages;
+ }
+ /* Success: jump past out_pages, so the pages are not freed here. */
+ ret = orig_count;
+ goto out;
+
+out_free_reserved:
+ btrfs_dec_block_group_reservations(fs_info, ins.objectid);
+ btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
+out_delalloc_release:
+ btrfs_delalloc_release_extents(inode, num_bytes);
+ btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0);
+out_qgroup_free_data:
+ if (ret < 0)
+ btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes);
+out_free_data_space:
+ /*
+ * If btrfs_reserve_extent() succeeded, then we already decremented
+ * bytes_may_use.
+ */
+ if (!extent_reserved)
+ btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes);
+out_unlock:
+ unlock_extent_cached(io_tree, start, end, &cached_state);
+out_pages:
+ /* Slots that were never allocated are still NULL from kvcalloc(). */
+ for (i = 0; i < nr_pages; i++) {
+ if (pages[i])
+ __free_page(pages[i]);
+ }
+ kvfree(pages);
+out:
+ if (ret >= 0)
+ iocb->ki_pos += encoded->len;
+ return ret;
+}
+
#ifdef CONFIG_SWAP
/*
* Add an entry indicating a block group or device which is pinned by a
@@ -10638,12 +11355,12 @@ static const struct address_space_operations btrfs_aops = {
.writepages = btrfs_writepages,
.readahead = btrfs_readahead,
.direct_IO = noop_direct_IO,
- .invalidatepage = btrfs_invalidatepage,
+ .invalidate_folio = btrfs_invalidate_folio,
.releasepage = btrfs_releasepage,
#ifdef CONFIG_MIGRATION
.migratepage = btrfs_migratepage,
#endif
- .set_page_dirty = btrfs_set_page_dirty,
+ .dirty_folio = filemap_dirty_folio,
.error_remove_page = generic_error_remove_page,
.swap_activate = btrfs_swap_activate,
.swap_deactivate = btrfs_swap_deactivate,