From beaa51b36012fad5a4d3c18b88a617aea7a9b96d Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Thu, 4 Apr 2024 12:32:53 -0400 Subject: blk-iocost: avoid out of bounds shift UBSAN catches undefined behavior in blk-iocost, where sometimes iocg->delay is shifted right by a number that is too large, resulting in undefined behavior on some architectures. [ 186.556576] ------------[ cut here ]------------ UBSAN: shift-out-of-bounds in block/blk-iocost.c:1366:23 shift exponent 64 is too large for 64-bit type 'u64' (aka 'unsigned long long') CPU: 16 PID: 0 Comm: swapper/16 Tainted: G S E N 6.9.0-0_fbk700_debug_rc2_kbuilder_0_gc85af715cac0 #1 Hardware name: Quanta Twin Lakes MP/Twin Lakes Passive MP, BIOS F09_3A23 12/08/2020 Call Trace: dump_stack_lvl+0x8f/0xe0 __ubsan_handle_shift_out_of_bounds+0x22c/0x280 iocg_kick_delay+0x30b/0x310 ioc_timer_fn+0x2fb/0x1f80 __run_timer_base+0x1b6/0x250 ... Avoid that undefined behavior by simply taking the "delay = 0" branch if the shift is too large. I am not sure what the symptoms of an undefined value delay will be, but I suspect it could be more than a little annoying to debug. Signed-off-by: Rik van Riel Cc: Tejun Heo Cc: Josef Bacik Cc: Jens Axboe Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20240404123253.0f58010f@imladris.surriel.com Signed-off-by: Jens Axboe --- block/blk-iocost.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 9a85bfbbc45a..baa20c85799d 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -1347,7 +1347,7 @@ static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; struct blkcg_gq *blkg = iocg_to_blkg(iocg); - u64 tdelta, delay, new_delay; + u64 tdelta, delay, new_delay, shift; s64 vover, vover_pct; u32 hwa; @@ -1362,8 +1362,9 @@ static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now) /* calculate the current delay in effect - 1/2 every second */ tdelta = now->now - iocg->delay_at; - if (iocg->delay) - delay = iocg->delay >> div64_u64(tdelta, USEC_PER_SEC); + shift = div64_u64(tdelta, USEC_PER_SEC); + if (iocg->delay && shift < BITS_PER_LONG) + delay = iocg->delay >> shift; else delay = 0; -- cgit v1.2.3 From 8b8ace080319a866f5dfe9da8e665ae51d971c54 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 7 Apr 2024 20:59:10 +0800 Subject: block: fix q->blkg_list corruption during disk rebind Multiple gendisk instances can allocated/added for single request queue in case of disk rebind. blkg may still stay in q->blkg_list when calling blkcg_init_disk() for rebind, then q->blkg_list becomes corrupted. Fix the list corruption issue by: - add blkg_init_queue() to initialize q->blkg_list & q->blkcg_mutex only - move calling blkg_init_queue() into blk_alloc_queue() The list corruption should be started since commit f1c006f1c685 ("blk-cgroup: synchronize pd_free_fn() from blkg_free_workfn() and blkcg_deactivate_policy()") which delays removing blkg from q->blkg_list into blkg_free_workfn(). Fixes: f1c006f1c685 ("blk-cgroup: synchronize pd_free_fn() from blkg_free_workfn() and blkcg_deactivate_policy()") Fixes: 1059699f87eb ("block: move blkcg initialization/destroy into disk allocation/release handler") Cc: Yu Kuai Cc: Tejun Heo Signed-off-by: Ming Lei Reviewed-by: Yu Kuai Link: https://lore.kernel.org/r/20240407125910.4053377-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 9 ++++++--- block/blk-cgroup.h | 2 ++ block/blk-core.c | 2 ++ 3 files changed, 10 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index bdbb557feb5a..059467086b13 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1409,6 +1409,12 @@ static int blkcg_css_online(struct cgroup_subsys_state *css) return 0; } +void blkg_init_queue(struct request_queue *q) +{ + INIT_LIST_HEAD(&q->blkg_list); + mutex_init(&q->blkcg_mutex); +} + int blkcg_init_disk(struct gendisk *disk) { struct request_queue *q = disk->queue; @@ -1416,9 +1422,6 @@ int blkcg_init_disk(struct gendisk *disk) bool preloaded; int ret; - INIT_LIST_HEAD(&q->blkg_list); - mutex_init(&q->blkcg_mutex); - new_blkg = blkg_alloc(&blkcg_root, disk, GFP_KERNEL); if (!new_blkg) return -ENOMEM; diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 78b74106bf10..90b3959d88cf 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -189,6 +189,7 @@ struct blkcg_policy { extern struct blkcg blkcg_root; extern bool blkcg_debug_stats; +void blkg_init_queue(struct request_queue *q); int blkcg_init_disk(struct gendisk *disk); void blkcg_exit_disk(struct gendisk *disk); @@ -482,6 +483,7 @@ struct blkcg { }; static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } +static inline void blkg_init_queue(struct request_queue *q) { } static inline int blkcg_init_disk(struct gendisk *disk) { return 0; } static inline void blkcg_exit_disk(struct gendisk *disk) { } static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; } diff --git a/block/blk-core.c b/block/blk-core.c index a16b5abdbbf5..3a6f5603fb44 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -442,6 +442,8 @@ struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id) init_waitqueue_head(&q->mq_freeze_wq); mutex_init(&q->mq_freeze_lock); + blkg_init_queue(q); + /* * Init percpu_ref in atomic mode so that it's faster to shutdown. * See blk_register_queue() for details. -- cgit v1.2.3 From b561ea56a26415bf44ce8ca6a8e625c7c390f1ea Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 7 Apr 2024 21:19:31 +0800 Subject: block: allow device to have both virt_boundary_mask and max segment size When one stacking device is over one device with virt_boundary_mask and another one with max segment size, the stacking device have both limits set. This way is allowed before d690cb8ae14b ("block: add an API to atomically update queue limits"). Relax the limit so that we won't break such kind of stacking setting. Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218687 Reported-by: janpieter.sollie@edpnet.be Fixes: d690cb8ae14b ("block: add an API to atomically update queue limits") Link: https://lore.kernel.org/linux-block/ZfGl8HzUpiOxCLm3@fedora/ Cc: Christoph Hellwig Cc: Mike Snitzer Cc: dm-devel@lists.linux.dev Cc: Song Liu Cc: linux-raid@vger.kernel.org Signed-off-by: Ming Lei Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20240407131931.4055231-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-settings.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index cdbaef159c4b..d2731843f2fc 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -182,17 +182,13 @@ static int blk_validate_limits(struct queue_limits *lim) return -EINVAL; /* - * Devices that require a virtual boundary do not support scatter/gather - * I/O natively, but instead require a descriptor list entry for each - * page (which might not be identical to the Linux PAGE_SIZE). Because - * of that they are not limited by our notion of "segment size". + * Stacking device may have both virtual boundary and max segment + * size limit, so allow this setting now, and long-term the two + * might need to move out of stacking limits since we have immutable + * bvec and lower layer bio splitting is supposed to handle the two + * correctly. */ - if (lim->virt_boundary_mask) { - if (WARN_ON_ONCE(lim->max_segment_size && - lim->max_segment_size != UINT_MAX)) - return -EINVAL; - lim->max_segment_size = UINT_MAX; - } else { + if (!lim->virt_boundary_mask) { /* * The maximum segment size has an odd historic 64k default that * drivers probably should override. Just like the I/O size we -- cgit v1.2.3 From 3ec4848913d695245716ea45ca4872d9dff097a5 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 11 Apr 2024 11:23:48 +0800 Subject: block: fix that blk_time_get_ns() doesn't update time after schedule While monitoring the throttle time of IO from iocost, it's found that such time is always zero after the io_schedule() from ioc_rqos_throttle, for example, with the following debug patch: + printk("%s-%d: %s enter %llu\n", current->comm, current->pid, __func__, blk_time_get_ns()); while (true) { set_current_state(TASK_UNINTERRUPTIBLE); if (wait.committed) break; io_schedule(); } + printk("%s-%d: %s exit %llu\n", current->comm, current->pid, __func__, blk_time_get_ns()); It can be observerd that blk_time_get_ns() always return the same time: [ 1068.096579] fio-1268: ioc_rqos_throttle enter 1067901962288 [ 1068.272587] fio-1268: ioc_rqos_throttle exit 1067901962288 [ 1068.274389] fio-1268: ioc_rqos_throttle enter 1067901962288 [ 1068.472690] fio-1268: ioc_rqos_throttle exit 1067901962288 [ 1068.474485] fio-1268: ioc_rqos_throttle enter 1067901962288 [ 1068.672656] fio-1268: ioc_rqos_throttle exit 1067901962288 [ 1068.674451] fio-1268: ioc_rqos_throttle enter 1067901962288 [ 1068.872655] fio-1268: ioc_rqos_throttle exit 1067901962288 And I think the root cause is that 'PF_BLOCK_TS' is always cleared by blk_flush_plug() before scheduel(), hence blk_plug_invalidate_ts() will never be called: blk_time_get_ns plug->cur_ktime = ktime_get_ns(); current->flags |= PF_BLOCK_TS; io_schedule: io_schedule_prepare blk_flush_plug __blk_flush_plug /* the flag is cleared, while time is not */ current->flags &= ~PF_BLOCK_TS; schedule sched_update_worker /* the flag is not set, hence plug->cur_ktime is not cleared */ if (tsk->flags & PF_BLOCK_TS) blk_plug_invalidate_ts() blk_time_get_ns /* got the time stashed before schedule */ return plug->cur_ktime; Fix the problem by clearing cached time in __blk_flush_plug(). Fixes: 06b23f92af87 ("block: update cached timestamp post schedule/preemption") Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20240411032349.3051233-2-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 3a6f5603fb44..b795ac177281 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1197,6 +1197,7 @@ void __blk_flush_plug(struct blk_plug *plug, bool from_schedule) if (unlikely(!rq_list_empty(plug->cached_rq))) blk_mq_free_plug_rqs(plug); + plug->cur_ktime = 0; current->flags &= ~PF_BLOCK_TS; } -- cgit v1.2.3