From 07a1bc5c14c9ef6401b21c1873c6c087075ff292 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 14 Jan 2026 11:28:02 -0800 Subject: [PATCH 1/8] block: Fix an error path in disk_update_zone_resources() Any queue_limits_start_update() call must be followed either by a queue_limits_commit_update() call or by a queue_limits_cancel_update() call. Make sure that the error path near the start of disk_update_zone_resources() follows this requirement. Remove the "goto unfreeze" statement from that error path to make the code easier to verify. This was detected by annotating the queue_limits_*() calls with Clang thread-safety attributes and by building the kernel with thread-safety checking enabled. Without this patch and with thread-safety checking enabled, the following error is reported: block/blk-zoned.c:2020:1: error: mutex 'disk->queue->limits_lock' is not held on every path through here [-Werror,-Wthread-safety-analysis] 2020 | } | ^ block/blk-zoned.c:1959:8: note: mutex acquired here 1959 | lim = queue_limits_start_update(q); | ^ Cc: Damien Le Moal Cc: Christoph Hellwig Fixes: bba4322e3f30 ("block: freeze queue when updating zone resources") Signed-off-by: Bart Van Assche Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20260114192803.4171847-3-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/blk-zoned.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 1c54678fae6b..8000c94690ee 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -1957,6 +1957,7 @@ static int disk_update_zone_resources(struct gendisk *disk, disk->nr_zones = args->nr_zones; if (args->nr_conv_zones >= disk->nr_zones) { + queue_limits_cancel_update(q); pr_warn("%s: Invalid number of conventional zones %u / %u\n", disk->disk_name, args->nr_conv_zones, disk->nr_zones); ret = -ENODEV; From 47bdf1d29caec7207b7f112230055db36602dfc0 Mon Sep 17 00:00:00 2001 From: Seamus Connor Date: Wed, 14 Jan 2026 18:59:52 -0800 Subject: [PATCH 2/8] ublk: fix ublksrv pid handling for pid namespaces MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When ublksrv runs inside a pid namespace, START/END_RECOVERY compared the stored init-ns tgid against the userspace pid (getpid vnr), so the check failed and control ops could not proceed. Compare against the caller’s init-ns tgid and store that value, then translate it back to the caller’s pid namespace when reporting GET_DEV_INFO so ublk list shows a sensible pid. Testing: start/recover in a pid namespace; `ublk list` shows reasonable pid values in init, child, and sibling namespaces. Fixes: c2c8089f325e ("ublk: validate ublk server pid") Signed-off-by: Seamus Connor Reviewed-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 39 ++++++++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index f6e5a0766721..cd1e84653002 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -2885,6 +2885,15 @@ static struct ublk_device *ublk_get_device_from_id(int idx) return ub; } +static bool ublk_validate_user_pid(struct ublk_device *ub, pid_t ublksrv_pid) +{ + rcu_read_lock(); + ublksrv_pid = pid_nr(find_vpid(ublksrv_pid)); + rcu_read_unlock(); + + return ub->ublksrv_tgid == ublksrv_pid; +} + static int ublk_ctrl_start_dev(struct ublk_device *ub, const struct ublksrv_ctrl_cmd *header) { @@ -2953,7 +2962,7 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, if (wait_for_completion_interruptible(&ub->completion) != 0) return -EINTR; - if (ub->ublksrv_tgid != ublksrv_pid) + if (!ublk_validate_user_pid(ub, ublksrv_pid)) return -EINVAL; mutex_lock(&ub->mutex); @@ -2972,7 +2981,7 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, disk->fops = &ub_fops; disk->private_data = ub; - ub->dev_info.ublksrv_pid = ublksrv_pid; + ub->dev_info.ublksrv_pid = ub->ublksrv_tgid; ub->ub_disk = disk; ublk_apply_params(ub); @@ -3320,12 +3329,32 @@ static int ublk_ctrl_stop_dev(struct ublk_device *ub) static int ublk_ctrl_get_dev_info(struct ublk_device *ub, const struct ublksrv_ctrl_cmd *header) { + struct task_struct *p; + struct pid *pid; + struct ublksrv_ctrl_dev_info dev_info; + pid_t init_ublksrv_tgid = ub->dev_info.ublksrv_pid; void __user *argp = (void __user *)(unsigned long)header->addr; if (header->len < sizeof(struct ublksrv_ctrl_dev_info) || !header->addr) return -EINVAL; - if (copy_to_user(argp, &ub->dev_info, sizeof(ub->dev_info))) + memcpy(&dev_info, &ub->dev_info, sizeof(dev_info)); + dev_info.ublksrv_pid = -1; + + if (init_ublksrv_tgid > 0) { + rcu_read_lock(); + pid = find_pid_ns(init_ublksrv_tgid, &init_pid_ns); + p = pid_task(pid, PIDTYPE_TGID); + if (p) { + int vnr = task_tgid_vnr(p); + + if (vnr) + dev_info.ublksrv_pid = vnr; + } + rcu_read_unlock(); + } + + if (copy_to_user(argp, &dev_info, sizeof(dev_info))) return -EFAULT; return 0; @@ -3470,7 +3499,7 @@ static int ublk_ctrl_end_recovery(struct ublk_device *ub, pr_devel("%s: All FETCH_REQs received, dev id %d\n", __func__, header->dev_id); - if (ub->ublksrv_tgid != ublksrv_pid) + if (!ublk_validate_user_pid(ub, ublksrv_pid)) return -EINVAL; mutex_lock(&ub->mutex); @@ -3481,7 +3510,7 @@ static int ublk_ctrl_end_recovery(struct ublk_device *ub, ret = -EBUSY; goto out_unlock; } - ub->dev_info.ublksrv_pid = ublksrv_pid; + ub->dev_info.ublksrv_pid = ub->ublksrv_tgid; ub->dev_info.state = UBLK_S_DEV_LIVE; pr_devel("%s: new ublksrv_pid %d, dev id %d\n", __func__, ublksrv_pid, header->dev_id); From f5f2bad67a45cd1ef6f5b727da104694a81b3666 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 21 Jan 2026 08:31:49 +0100 Subject: [PATCH 3/8] block: make the new blkzoned UAPI constants discoverable The Linux 6.19 merge window added the new BLKREPORTZONESV2 ioctl, and with it the new BLK_ZONE_REP_CACHED and BLK_ZONE_COND_ACTIVE constants. The two constants are defined as part of enums, which makes it very painful for userspace to discover if they are present in the installed system headers. Use the #define to the same name trick to make them trivially discoverable using CPP directives. Fixes: 0bf0e2e46668 ("block: track zone conditions") Fixes: b30ffcdc0c15 ("block: introduce BLKREPORTZONESV2 ioctl") Reported-by: Andrey Albershteyn Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/uapi/linux/blkzoned.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h index e33f02703350..663836120966 100644 --- a/include/uapi/linux/blkzoned.h +++ b/include/uapi/linux/blkzoned.h @@ -81,7 +81,8 @@ enum blk_zone_cond { BLK_ZONE_COND_FULL = 0xE, BLK_ZONE_COND_OFFLINE = 0xF, - BLK_ZONE_COND_ACTIVE = 0xFF, + BLK_ZONE_COND_ACTIVE = 0xFF, /* added in Linux 6.19 */ +#define BLK_ZONE_COND_ACTIVE BLK_ZONE_COND_ACTIVE }; /** @@ -100,7 +101,8 @@ enum blk_zone_report_flags { BLK_ZONE_REP_CAPACITY = (1U << 0), /* Input flags */ - BLK_ZONE_REP_CACHED = (1U << 31), + BLK_ZONE_REP_CACHED = (1U << 31), /* added in Linux 6.19 */ +#define BLK_ZONE_REP_CACHED BLK_ZONE_REP_CACHED }; /** From 75aad5ffe099a1b1a342257236dc260493917ed2 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 13 Jan 2026 16:58:00 +0800 Subject: [PATCH 4/8] selftests/ublk: fix IO thread idle check Include cmd_inflight in ublk_thread_is_done() check. Without this, the thread may exit before all FETCH commands are completed, which may cause device deletion to hang. Fixes: 6aecda00b7d1 ("selftests: ublk: add kernel selftests for ublk") Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/kublk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index 185ba553686a..f52431fe9b6c 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -753,7 +753,7 @@ static int ublk_thread_is_idle(struct ublk_thread *t) static int ublk_thread_is_done(struct ublk_thread *t) { - return (t->state & UBLKS_T_STOPPING) && ublk_thread_is_idle(t); + return (t->state & UBLKS_T_STOPPING) && ublk_thread_is_idle(t) && !t->cmd_inflight; } static inline void ublksrv_handle_tgt_cqe(struct ublk_thread *t, From 23e62cf75518825aac12e9a22bdc40f062428898 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 13 Jan 2026 16:58:01 +0800 Subject: [PATCH 5/8] selftests/ublk: fix error handling for starting device Fix error handling in ublk_start_daemon() when start_dev fails: 1. Call ublk_ctrl_stop_dev() to cancel inflight uring_cmd before cleanup. Without this, the device deletion may hang waiting for I/O completion that will never happen. 2. Add fail_start label so that pthread_join() is called on the error path. This ensures proper thread cleanup when startup fails. Fixes: 6aecda00b7d1 ("selftests: ublk: add kernel selftests for ublk") Signed-off-by: Ming Lei Reviewed-by: Caleb Sander Mateos Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/kublk.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index f52431fe9b6c..65f59e7b6972 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -1054,7 +1054,9 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) } if (ret < 0) { ublk_err("%s: ublk_ctrl_start_dev failed: %d\n", __func__, ret); - goto fail; + /* stop device so that inflight uring_cmd can be cancelled */ + ublk_ctrl_stop_dev(dev); + goto fail_start; } ublk_ctrl_get_info(dev); @@ -1062,7 +1064,7 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) ublk_ctrl_dump(dev); else ublk_send_dev_event(ctx, dev, dev->dev_info.dev_id); - +fail_start: /* wait until we are terminated */ for (i = 0; i < dev->nthreads; i++) pthread_join(tinfo[i].thread, &thread_ret); From e7e1cc18f120a415646be12470169a978a1adcd9 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 13 Jan 2026 16:58:02 +0800 Subject: [PATCH 6/8] selftests/ublk: fix garbage output in foreground mode Initialize _evtfd to -1 in struct dev_ctx to prevent garbage output when running kublk in foreground mode. Without this, _evtfd is zero-initialized to 0 (stdin), and ublk_send_dev_event() writes binary data to stdin which appears as garbage on the terminal. Also fix debug message format string. Fixes: 6aecda00b7d1 ("selftests: ublk: add kernel selftests for ublk") Signed-off-by: Ming Lei Reviewed-by: Caleb Sander Mateos Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/kublk.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index 65f59e7b6972..f197ad9cc262 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -1274,7 +1274,7 @@ static int __cmd_dev_add(const struct dev_ctx *ctx) } ret = ublk_start_daemon(ctx, dev); - ublk_dbg(UBLK_DBG_DEV, "%s: daemon exit %d\b", ret); + ublk_dbg(UBLK_DBG_DEV, "%s: daemon exit %d\n", __func__, ret); if (ret < 0) ublk_ctrl_del_dev(dev); @@ -1620,6 +1620,7 @@ int main(int argc, char *argv[]) int option_idx, opt; const char *cmd = argv[1]; struct dev_ctx ctx = { + ._evtfd = -1, .queue_depth = 128, .nr_hw_queues = 2, .dev_id = -1, From 046be7e5967ef80547f7fd8a399e932f5338d5d4 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 22 Jan 2026 12:28:58 +0800 Subject: [PATCH 7/8] blk-mq: use BLK_POLL_ONESHOT for synchronous poll completion blk_execute_rq() with polling is used in kernel code paths such as NVMe controller connect. The aggressive spinning in blk_hctx_poll() can prevent the completion task from getting a chance to run, causing a lockup. The spinning with cpu_relax() doesn't yield CPU, so need_resched() only becomes true on timer tick. This causes unnecessary spinning while the completion task is already waiting to run. Before commit f22ecf9c14c1, the loop would exit early because task_is_running() was always true. After that commit removed the check, the loop now spins until need_resched(). Fix this by using BLK_POLL_ONESHOT in blk_rq_poll_completion(). This causes blk_hctx_poll() to poll once and return immediately, letting the outer loop's cond_resched() yield CPU so the completion task can run. Fixes: f22ecf9c14c1 ("blk-mq: delete task running check in blk_hctx_poll()") Cc: Diangang Li Cc: Fengnan Chang Reported-by: Yi Zhang Signed-off-by: Ming Lei Tested-by: Yi Zhang Signed-off-by: Jens Axboe --- block/blk-mq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index a29d8ac9d3e3..968699277c3d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1480,7 +1480,7 @@ EXPORT_SYMBOL_GPL(blk_rq_is_poll); static void blk_rq_poll_completion(struct request *rq, struct completion *wait) { do { - blk_hctx_poll(rq->q, rq->mq_hctx, NULL, 0); + blk_hctx_poll(rq->q, rq->mq_hctx, NULL, BLK_POLL_ONESHOT); cond_resched(); } while (!completion_done(wait)); } From 3ef825dfd4e487d6f92b23ee2df2455814583ef4 Mon Sep 17 00:00:00 2001 From: Shida Zhang Date: Thu, 22 Jan 2026 14:13:21 +0800 Subject: [PATCH 8/8] bcache: use bio cloning for detached device requests Previously, bcache hijacked the bi_end_io and bi_private fields of the incoming bio when the backing device was in a detached state. This is fragile and breaks if the bio is needed to be processed by other layers. This patch transitions to using a cloned bio embedded within a private structure. This ensures the original bio's metadata remains untouched. Fixes: 53280e398471 ("bcache: fix improper use of bi_end_io") Co-developed-by: Christoph Hellwig Signed-off-by: Christoph Hellwig Signed-off-by: Shida Zhang Acked-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/bcache.h | 9 +++++ drivers/md/bcache/request.c | 81 +++++++++++++++++-------------------- drivers/md/bcache/super.c | 12 +++++- 3 files changed, 55 insertions(+), 47 deletions(-) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 8ccacba85547..ec9ff9715081 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -273,6 +273,8 @@ struct bcache_device { struct bio_set bio_split; + struct bio_set bio_detached; + unsigned int data_csum:1; int (*cache_miss)(struct btree *b, struct search *s, @@ -753,6 +755,13 @@ struct bbio { struct bio bio; }; +struct detached_dev_io_private { + struct bcache_device *d; + unsigned long start_time; + struct bio *orig_bio; + struct bio bio; +}; + #define BTREE_PRIO USHRT_MAX #define INITIAL_PRIO 32768U diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 82fdea7dea7a..a02aecac05cd 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -1077,68 +1077,58 @@ static CLOSURE_CALLBACK(cached_dev_nodata) continue_at(cl, cached_dev_bio_complete, NULL); } -struct detached_dev_io_private { - struct bcache_device *d; - unsigned long start_time; - bio_end_io_t *bi_end_io; - void *bi_private; - struct block_device *orig_bdev; -}; - static void detached_dev_end_io(struct bio *bio) { - struct detached_dev_io_private *ddip; - - ddip = bio->bi_private; - bio->bi_end_io = ddip->bi_end_io; - bio->bi_private = ddip->bi_private; + struct detached_dev_io_private *ddip = + container_of(bio, struct detached_dev_io_private, bio); + struct bio *orig_bio = ddip->orig_bio; /* Count on the bcache device */ - bio_end_io_acct_remapped(bio, ddip->start_time, ddip->orig_bdev); + bio_end_io_acct(orig_bio, ddip->start_time); if (bio->bi_status) { - struct cached_dev *dc = container_of(ddip->d, - struct cached_dev, disk); + struct cached_dev *dc = bio->bi_private; + /* should count I/O error for backing device here */ bch_count_backing_io_errors(dc, bio); + orig_bio->bi_status = bio->bi_status; } - kfree(ddip); - bio_endio(bio); + bio_put(bio); + bio_endio(orig_bio); } -static void detached_dev_do_request(struct bcache_device *d, struct bio *bio, - struct block_device *orig_bdev, unsigned long start_time) +static void detached_dev_do_request(struct bcache_device *d, + struct bio *orig_bio, unsigned long start_time) { struct detached_dev_io_private *ddip; struct cached_dev *dc = container_of(d, struct cached_dev, disk); + struct bio *clone_bio; - /* - * no need to call closure_get(&dc->disk.cl), - * because upper layer had already opened bcache device, - * which would call closure_get(&dc->disk.cl) - */ - ddip = kzalloc(sizeof(struct detached_dev_io_private), GFP_NOIO); - if (!ddip) { - bio->bi_status = BLK_STS_RESOURCE; - bio_endio(bio); + if (bio_op(orig_bio) == REQ_OP_DISCARD && + !bdev_max_discard_sectors(dc->bdev)) { + bio_endio(orig_bio); return; } - ddip->d = d; - /* Count on the bcache device */ - ddip->orig_bdev = orig_bdev; - ddip->start_time = start_time; - ddip->bi_end_io = bio->bi_end_io; - ddip->bi_private = bio->bi_private; - bio->bi_end_io = detached_dev_end_io; - bio->bi_private = ddip; + clone_bio = bio_alloc_clone(dc->bdev, orig_bio, GFP_NOIO, + &d->bio_detached); + if (!clone_bio) { + orig_bio->bi_status = BLK_STS_RESOURCE; + bio_endio(orig_bio); + return; + } - if ((bio_op(bio) == REQ_OP_DISCARD) && - !bdev_max_discard_sectors(dc->bdev)) - detached_dev_end_io(bio); - else - submit_bio_noacct(bio); + ddip = container_of(clone_bio, struct detached_dev_io_private, bio); + /* Count on the bcache device */ + ddip->d = d; + ddip->start_time = start_time; + ddip->orig_bio = orig_bio; + + clone_bio->bi_end_io = detached_dev_end_io; + clone_bio->bi_private = dc; + + submit_bio_noacct(clone_bio); } static void quit_max_writeback_rate(struct cache_set *c, @@ -1214,10 +1204,10 @@ void cached_dev_submit_bio(struct bio *bio) start_time = bio_start_io_acct(bio); - bio_set_dev(bio, dc->bdev); bio->bi_iter.bi_sector += dc->sb.data_offset; if (cached_dev_get(dc)) { + bio_set_dev(bio, dc->bdev); s = search_alloc(bio, d, orig_bdev, start_time); trace_bcache_request_start(s->d, bio); @@ -1237,9 +1227,10 @@ void cached_dev_submit_bio(struct bio *bio) else cached_dev_read(dc, s); } - } else + } else { /* I/O request sent to backing device */ - detached_dev_do_request(d, bio, orig_bdev, start_time); + detached_dev_do_request(d, bio, start_time); + } } static int cached_dev_ioctl(struct bcache_device *d, blk_mode_t mode, diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index c17d4517af22..238d12ffdae8 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -887,6 +887,7 @@ static void bcache_device_free(struct bcache_device *d) } bioset_exit(&d->bio_split); + bioset_exit(&d->bio_detached); kvfree(d->full_dirty_stripes); kvfree(d->stripe_sectors_dirty); @@ -949,6 +950,11 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER)) goto out_ida_remove; + if (bioset_init(&d->bio_detached, 4, + offsetof(struct detached_dev_io_private, bio), + BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER)) + goto out_bioset_split_exit; + if (lim.logical_block_size > PAGE_SIZE && cached_bdev) { /* * This should only happen with BCACHE_SB_VERSION_BDEV. @@ -964,7 +970,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, d->disk = blk_alloc_disk(&lim, NUMA_NO_NODE); if (IS_ERR(d->disk)) - goto out_bioset_exit; + goto out_bioset_detach_exit; set_capacity(d->disk, sectors); snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", idx); @@ -976,7 +982,9 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, d->disk->private_data = d; return 0; -out_bioset_exit: +out_bioset_detach_exit: + bioset_exit(&d->bio_detached); +out_bioset_split_exit: bioset_exit(&d->bio_split); out_ida_remove: ida_free(&bcache_device_idx, idx);