From 2ebc8d600fb907fa6b1e7095c0b6d84fc47e91ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20B=C3=B6hmwalder?= Date: Thu, 5 Feb 2026 18:39:29 +0100 Subject: [PATCH 01/11] drbd: always set BLK_FEAT_STABLE_WRITES MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DRBD requires stable pages because it may read the same bio data multiple times for local disk I/O and network transmission, and in some cases for calculating checksums. The BLK_FEAT_STABLE_WRITES flag is set when the device is first created, but blk_set_stacking_limits() clears it whenever a backing device is attached. In some cases the flag may be inherited from the backing device, but we want it to be enabled at all times. Unconditionally re-enable BLK_FEAT_STABLE_WRITES in drbd_reconsider_queue_parameters() after the queue parameter negotiations. Also, document why we want this flag enabled in the first place. Fixes: 1a02f3a73f8c ("block: move the stable_writes flag to queue_limits") Signed-off-by: Christoph Böhmwalder Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_main.c | 3 --- drivers/block/drbd/drbd_nl.c | 20 +++++++++++++++++++- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index c73376886e7a..1f6ac9202b66 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2659,9 +2659,6 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig * connect. 
*/ .max_hw_sectors = DRBD_MAX_BIO_SIZE_SAFE >> 8, - .features = BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | - BLK_FEAT_ROTATIONAL | - BLK_FEAT_STABLE_WRITES, }; device = minor_to_device(minor); diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 91f3b8afb63c..b502038be0a9 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -1296,6 +1296,8 @@ void drbd_reconsider_queue_parameters(struct drbd_device *device, lim.max_segments = drbd_backing_dev_max_segments(device); } else { lim.max_segments = BLK_MAX_SEGMENTS; + lim.features = BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | + BLK_FEAT_ROTATIONAL | BLK_FEAT_STABLE_WRITES; } lim.max_hw_sectors = new >> SECTOR_SHIFT; @@ -1318,8 +1320,24 @@ void drbd_reconsider_queue_parameters(struct drbd_device *device, lim.max_hw_discard_sectors = 0; } - if (bdev) + if (bdev) { blk_stack_limits(&lim, &b->limits, 0); + /* + * blk_set_stacking_limits() cleared the features, and + * blk_stack_limits() may or may not have inherited + * BLK_FEAT_STABLE_WRITES from the backing device. + * + * DRBD always requires stable writes because: + * 1. The same bio data is read for both local disk I/O and + * network transmission. If the page changes mid-flight, + * the local and remote copies could diverge. + * 2. When data integrity is enabled, DRBD calculates a + * checksum before sending the data. If the page changes + * between checksum calculation and transmission, the + * receiver will detect a checksum mismatch. + */ + lim.features |= BLK_FEAT_STABLE_WRITES; + } /* * If we can handle "zeroes" efficiently on the protocol, we want to do From 5b88af7113feba2f0ae3402bb57cb5c94eea7dc3 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 10 Feb 2026 11:36:17 -0500 Subject: [PATCH 02/11] block: allow IOC_PR_READ_* ioctls with BLK_OPEN_READ The recently added IOC_PR_READ_* ioctls require the same BLK_OPEN_WRITE permission as the older persistent reservation ioctls. 
This has the drawback that udev triggers when the file descriptor is closed, resulting in unnecessary activity like scanning partitions even though these read-only ioctls do not modify the device. Change IOC_PR_READ_KEYS and IOC_PR_READ_RESERVATION to require BLK_OPEN_READ. This prevents unnecessary activity every time `blkpr --read-keys` or `blkpr --read-reservation` is invoked by shell scripts, for example. It is safe to reduce the permission requirement from BLK_OPEN_WRITE to BLK_OPEN_READ since these two ioctls do not modify the persistent reservation state. Userspace cannot use the information fetched by these ioctls to make changes to the device unless it later opens the device with BLK_OPEN_WRITE. Fixes: 3e2cb9ee76c2 ("block: add IOC_PR_READ_RESERVATION ioctl") Fixes: 22a1ffea5f80 ("block: add IOC_PR_READ_KEYS ioctl") Cc: Christoph Hellwig Cc: Martin Wilck Cc: Benjamin Marzinski Suggested-by: Hannes Reinecke Signed-off-by: Stefan Hajnoczi Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/ioctl.c | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/block/ioctl.c b/block/ioctl.c index fd48f82f9f03..0b04661ac809 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -318,7 +318,13 @@ int blkdev_compat_ptr_ioctl(struct block_device *bdev, blk_mode_t mode, EXPORT_SYMBOL(blkdev_compat_ptr_ioctl); #endif -static bool blkdev_pr_allowed(struct block_device *bdev, blk_mode_t mode) +enum pr_direction { + PR_IN, /* read from device */ + PR_OUT, /* write to device */ +}; + +static bool blkdev_pr_allowed(struct block_device *bdev, blk_mode_t mode, + enum pr_direction dir) { /* no sense to make reservations for partitions */ if (bdev_is_partition(bdev)) @@ -326,11 +332,17 @@ static bool blkdev_pr_allowed(struct block_device *bdev, blk_mode_t mode) if (capable(CAP_SYS_ADMIN)) return true; + /* - * Only allow unprivileged reservations if the file descriptor is open - * for writing. 
+ * Only allow unprivileged reservation _out_ commands if the file + * descriptor is open for writing. Allow reservation _in_ commands if + * the file descriptor is open for reading since they do not modify the + * device. */ - return mode & BLK_OPEN_WRITE; + if (dir == PR_IN) + return mode & BLK_OPEN_READ; + else + return mode & BLK_OPEN_WRITE; } static int blkdev_pr_register(struct block_device *bdev, blk_mode_t mode, @@ -339,7 +351,7 @@ static int blkdev_pr_register(struct block_device *bdev, blk_mode_t mode, const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; struct pr_registration reg; - if (!blkdev_pr_allowed(bdev, mode)) + if (!blkdev_pr_allowed(bdev, mode, PR_OUT)) return -EPERM; if (!ops || !ops->pr_register) return -EOPNOTSUPP; @@ -357,7 +369,7 @@ static int blkdev_pr_reserve(struct block_device *bdev, blk_mode_t mode, const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; struct pr_reservation rsv; - if (!blkdev_pr_allowed(bdev, mode)) + if (!blkdev_pr_allowed(bdev, mode, PR_OUT)) return -EPERM; if (!ops || !ops->pr_reserve) return -EOPNOTSUPP; @@ -375,7 +387,7 @@ static int blkdev_pr_release(struct block_device *bdev, blk_mode_t mode, const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; struct pr_reservation rsv; - if (!blkdev_pr_allowed(bdev, mode)) + if (!blkdev_pr_allowed(bdev, mode, PR_OUT)) return -EPERM; if (!ops || !ops->pr_release) return -EOPNOTSUPP; @@ -393,7 +405,7 @@ static int blkdev_pr_preempt(struct block_device *bdev, blk_mode_t mode, const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; struct pr_preempt p; - if (!blkdev_pr_allowed(bdev, mode)) + if (!blkdev_pr_allowed(bdev, mode, PR_OUT)) return -EPERM; if (!ops || !ops->pr_preempt) return -EOPNOTSUPP; @@ -411,7 +423,7 @@ static int blkdev_pr_clear(struct block_device *bdev, blk_mode_t mode, const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; struct pr_clear c; - if (!blkdev_pr_allowed(bdev, mode)) + if (!blkdev_pr_allowed(bdev, mode, PR_OUT)) return -EPERM; if (!ops || 
!ops->pr_clear) return -EOPNOTSUPP; @@ -434,7 +446,7 @@ static int blkdev_pr_read_keys(struct block_device *bdev, blk_mode_t mode, size_t keys_copy_len; int ret; - if (!blkdev_pr_allowed(bdev, mode)) + if (!blkdev_pr_allowed(bdev, mode, PR_IN)) return -EPERM; if (!ops || !ops->pr_read_keys) return -EOPNOTSUPP; @@ -486,7 +498,7 @@ static int blkdev_pr_read_reservation(struct block_device *bdev, struct pr_read_reservation out = {}; int ret; - if (!blkdev_pr_allowed(bdev, mode)) + if (!blkdev_pr_allowed(bdev, mode, PR_IN)) return -EPERM; if (!ops || !ops->pr_read_reservation) return -EOPNOTSUPP; From 5991bfa3f88ec8d67fa3f552c19c39ff37a4e67b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 12 Feb 2026 04:07:41 -0700 Subject: [PATCH 03/11] block: fix folio leak in bio_iov_iter_bounce_read() If iov_iter_extract_bvecs() returns an error or zero bytes extracted, then the folio allocated is leaked on return. Ensure it's put before returning. Fixes: 8dd5e7c75d7b ("block: add helpers to bounce buffer an iov_iter into bios") Signed-off-by: Jens Axboe --- block/bio.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/block/bio.c b/block/bio.c index b291b9aaeee1..8203bb7455a9 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1382,8 +1382,10 @@ static int bio_iov_iter_bounce_read(struct bio *bio, struct iov_iter *iter) ret = iov_iter_extract_bvecs(iter, bio->bi_io_vec + 1, len, &bio->bi_vcnt, bio->bi_max_vecs - 1, 0); if (ret <= 0) { - if (!bio->bi_vcnt) + if (!bio->bi_vcnt) { + folio_put(folio); return ret; + } break; } len -= ret; From 81e7223b1a2d63b655ee72577c8579f968d037e3 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 11 Feb 2026 12:49:44 -0800 Subject: [PATCH 04/11] block: fix partial IOVA mapping cleanup in blk_rq_dma_map_iova When dma_iova_link() fails partway through mapping a request's bvec list, the function breaks out of the loop without cleaning up already mapped segments. 
Similarly, if dma_iova_sync() fails after linking all segments, no cleanup is performed. This leaves partial IOVA mappings in place. The completion path attempts to unmap the full expected size via dma_iova_destroy() or nvme_unmap_data(), but only a partial size was actually mapped, leading to incorrect unmap operations. Add an out_unlink error path that calls dma_iova_destroy() to clean up partial mappings before returning failure. The dma_iova_destroy() function handles both partial unlink and IOVA space freeing. It correctly handles the mapped_len == 0 case (first dma_iova_link() failure) by only freeing the IOVA allocation without attempting to unmap. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq-dma.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c index 3c87779cdc19..bfdb9ed70741 100644 --- a/block/blk-mq-dma.c +++ b/block/blk-mq-dma.c @@ -121,17 +121,20 @@ static bool blk_rq_dma_map_iova(struct request *req, struct device *dma_dev, error = dma_iova_link(dma_dev, state, vec->paddr, mapped, vec->len, dir, attrs); if (error) - break; + goto out_unlink; mapped += vec->len; } while (blk_map_iter_next(req, &iter->iter, vec)); error = dma_iova_sync(dma_dev, state, 0, mapped); - if (error) { - iter->status = errno_to_blk_status(error); - return false; - } + if (error) + goto out_unlink; return true; + +out_unlink: + dma_iova_destroy(dma_dev, state, mapped, dir, attrs); + iter->status = errno_to_blk_status(error); + return false; } static inline void blk_rq_map_iter_init(struct request *rq, From 699fcfb6cb80a9df67fd2086a1c930d196d709f2 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 11 Feb 2026 12:44:36 -0800 Subject: [PATCH 05/11] md: ignore discard return value __blkdev_issue_discard() always returns 0, making all error checking at call sites dead code. 
Simplify md to only check !discard_bio by ignoring the __blkdev_issue_discard() return value. Reviewed-by: Martin K. Petersen Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Signed-off-by: Chaitanya Kulkarni Signed-off-by: Jens Axboe --- drivers/md/md.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 59cd303548de..89c9e63a9139 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -9179,8 +9179,8 @@ void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, { struct bio *discard_bio = NULL; - if (__blkdev_issue_discard(rdev->bdev, start, size, GFP_NOIO, - &discard_bio) || !discard_bio) + __blkdev_issue_discard(rdev->bdev, start, size, GFP_NOIO, &discard_bio); + if (!discard_bio) return; bio_chain(discard_bio, bio); From 38d12f15c4772b5383b1249b2afb0d206a430f0f Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 11 Feb 2026 12:44:37 -0800 Subject: [PATCH 06/11] nvmet: ignore discard return value __blkdev_issue_discard() always returns 0, making the error checking in nvmet_bdev_discard_range() dead code. Kill the function nvmet_bdev_discard_range() and call __blkdev_issue_discard() directly from nvmet_bdev_execute_discard(), since no error handling is needed anymore for the __blkdev_issue_discard() call. Reviewed-by: Martin K. 
Petersen Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Signed-off-by: Chaitanya Kulkarni Signed-off-by: Jens Axboe --- drivers/nvme/target/io-cmd-bdev.c | 28 +++++++--------------------- 1 file changed, 7 insertions(+), 21 deletions(-) diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index 0103815542d4..f15d1c213bc6 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -363,29 +363,14 @@ u16 nvmet_bdev_flush(struct nvmet_req *req) return 0; } -static u16 nvmet_bdev_discard_range(struct nvmet_req *req, - struct nvme_dsm_range *range, struct bio **bio) -{ - struct nvmet_ns *ns = req->ns; - int ret; - - ret = __blkdev_issue_discard(ns->bdev, - nvmet_lba_to_sect(ns, range->slba), - le32_to_cpu(range->nlb) << (ns->blksize_shift - 9), - GFP_KERNEL, bio); - if (ret && ret != -EOPNOTSUPP) { - req->error_slba = le64_to_cpu(range->slba); - return errno_to_nvme_status(req, ret); - } - return NVME_SC_SUCCESS; -} - static void nvmet_bdev_execute_discard(struct nvmet_req *req) { + struct nvmet_ns *ns = req->ns; struct nvme_dsm_range range; struct bio *bio = NULL; + sector_t nr_sects; int i; - u16 status; + u16 status = NVME_SC_SUCCESS; for (i = 0; i <= le32_to_cpu(req->cmd->dsm.nr); i++) { status = nvmet_copy_from_sgl(req, i * sizeof(range), &range, @@ -393,9 +378,10 @@ static void nvmet_bdev_execute_discard(struct nvmet_req *req) if (status) break; - status = nvmet_bdev_discard_range(req, &range, &bio); - if (status) - break; + nr_sects = le32_to_cpu(range.nlb) << (ns->blksize_shift - 9); + __blkdev_issue_discard(ns->bdev, + nvmet_lba_to_sect(ns, range.slba), nr_sects, + GFP_KERNEL, &bio); } if (bio) { From 453daece381e60df20da16c49ccc6a9bc5c6515a Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 11 Feb 2026 12:44:38 -0800 Subject: [PATCH 07/11] block: change return type to void Now that all the callers of __blkdev_issue_discard() have been changed to ignore its return value, 
change its return type from int to void. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-lib.c | 3 +-- include/linux/blkdev.h | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/block/blk-lib.c b/block/blk-lib.c index 0be3acdc3eb5..3213afc7f0d5 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -60,7 +60,7 @@ struct bio *blk_alloc_discard_bio(struct block_device *bdev, return bio; } -int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, +void __blkdev_issue_discard(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct bio **biop) { struct bio *bio; @@ -68,7 +68,6 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, while ((bio = blk_alloc_discard_bio(bdev, &sector, &nr_sects, gfp_mask))) *biop = bio_chain_and_submit(*biop, bio); - return 0; } EXPORT_SYMBOL(__blkdev_issue_discard); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 99ef8cd7673c..d463b9b5a0a5 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1259,7 +1259,7 @@ extern void blk_io_schedule(void); int blkdev_issue_discard(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask); -int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, +void __blkdev_issue_discard(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct bio **biop); int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp); From 11506b3c233fcead6eba2842d17ec29c84f550d4 Mon Sep 17 00:00:00 2001 From: Andreas Hindborg Date: Sat, 14 Feb 2026 10:12:54 +0100 Subject: [PATCH 08/11] block: update docs for bio and bvec_iter The documentation for bio and bvec_iter refers to a vector named bvl_vec. This does not exist. Update the documentation comment with correct use. Also update documentation comments for remaining fields of `bvec_iter` to improve readability. 
The fields of `bvec_iter` use a mix of tabs and spaces for indentation. While at it, change them all to tabs, which are most prevalent in this struct definition. Signed-off-by: Andreas Hindborg Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 8 +++++++- include/linux/bvec.h | 25 +++++++++++++++++++------ 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index d59553324a84..397602606f74 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -273,7 +273,13 @@ struct bio { * Everything starting with bi_max_vecs will be preserved by bio_reset() */ - unsigned short bi_max_vecs; /* max bvl_vecs we can hold */ + /* + * Number of elements in `bi_io_vec` that were allocated for this bio. + * Only used by the bio submitter to make `bio_add_page` fail once full + * and to free the `bi_io_vec` allocation. Must not be used in drivers + * and does not hold a useful value for cloned bios. + */ + unsigned short bi_max_vecs; atomic_t __bi_cnt; /* pin count */ diff --git a/include/linux/bvec.h b/include/linux/bvec.h index 3fc0efa0825b..06fb60471aaf 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -75,14 +75,27 @@ static inline void bvec_set_virt(struct bio_vec *bv, void *vaddr, } struct bvec_iter { - sector_t bi_sector; /* device address in 512 byte - sectors */ - unsigned int bi_size; /* residual I/O count */ + /* + * Current device address in 512 byte sectors. Only updated by the bio + * iter wrappers and not the bvec iterator helpers themselves. + */ + sector_t bi_sector; - unsigned int bi_idx; /* current index into bvl_vec */ + /* + * Remaining size in bytes. + */ + unsigned int bi_size; - unsigned int bi_bvec_done; /* number of bytes completed in - current bvec */ + /* + * Current index into the bvec array. This indexes into `bi_io_vec` when + * iterating a bvec array that is part of a `bio`. 
+ */ + unsigned int bi_idx; + + /* + * Current offset in the bvec entry pointed to by `bi_idx`. + */ + unsigned int bi_bvec_done; } __packed __aligned(4); struct bvec_iter_all { From 4c431a76a288ce958aaa114d8ea6fc0968942832 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 15 Feb 2026 21:29:41 -0800 Subject: [PATCH 09/11] block: fix enum descriptions kernel-doc Fix all kernel-doc warnings in blk_types.h: Warning: include/linux/blk_types.h:371 Enum value 'REQ_OP_READ' not described in enum 'req_op' Warning: include/linux/blk_types.h:371 Enum value 'REQ_OP_WRITE' not described in enum 'req_op' Warning: include/linux/blk_types.h:371 Enum value 'REQ_OP_FLUSH' not described in enum 'req_op' Warning: include/linux/blk_types.h:371 Enum value 'REQ_OP_DISCARD' not described in enum 'req_op' Warning: include/linux/blk_types.h:371 Enum value 'REQ_OP_SECURE_ERASE' not described in enum 'req_op' Warning: include/linux/blk_types.h:371 Enum value 'REQ_OP_ZONE_APPEND' not described in enum 'req_op' Warning: include/linux/blk_types.h:371 Enum value 'REQ_OP_WRITE_ZEROES' not described in enum 'req_op' Warning: include/linux/blk_types.h:371 Enum value 'REQ_OP_ZONE_OPEN' not described in enum 'req_op' Warning: include/linux/blk_types.h:371 Enum value 'REQ_OP_ZONE_CLOSE' not described in enum 'req_op' Warning: include/linux/blk_types.h:371 Enum value 'REQ_OP_ZONE_FINISH' not described in enum 'req_op' Warning: include/linux/blk_types.h:371 Enum value 'REQ_OP_ZONE_RESET' not described in enum 'req_op' Warning: include/linux/blk_types.h:371 Enum value 'REQ_OP_ZONE_RESET_ALL' not described in enum 'req_op' Warning: include/linux/blk_types.h:371 Enum value 'REQ_OP_DRV_IN' not described in enum 'req_op' Warning: include/linux/blk_types.h:371 Enum value 'REQ_OP_DRV_OUT' not described in enum 'req_op' Warning: include/linux/blk_types.h:371 Enum value 'REQ_OP_LAST' not described in enum 'req_op' Signed-off-by: Randy Dunlap Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 25 
+++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 397602606f74..8808ee76e73c 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -345,32 +345,33 @@ typedef __u32 __bitwise blk_mq_req_flags_t; * meaning. */ enum req_op { - /* read sectors from the device */ + /** @REQ_OP_READ: read sectors from the device */ REQ_OP_READ = (__force blk_opf_t)0, - /* write sectors to the device */ + /** @REQ_OP_WRITE: write sectors to the device */ REQ_OP_WRITE = (__force blk_opf_t)1, - /* flush the volatile write cache */ + /** @REQ_OP_FLUSH: flush the volatile write cache */ REQ_OP_FLUSH = (__force blk_opf_t)2, - /* discard sectors */ + /** @REQ_OP_DISCARD: discard sectors */ REQ_OP_DISCARD = (__force blk_opf_t)3, - /* securely erase sectors */ + /** @REQ_OP_SECURE_ERASE: securely erase sectors */ REQ_OP_SECURE_ERASE = (__force blk_opf_t)5, - /* write data at the current zone write pointer */ + /** @REQ_OP_ZONE_APPEND: write data at the current zone write pointer */ REQ_OP_ZONE_APPEND = (__force blk_opf_t)7, - /* write the zero filled sector many times */ + /** @REQ_OP_WRITE_ZEROES: write the zero filled sector many times */ REQ_OP_WRITE_ZEROES = (__force blk_opf_t)9, - /* Open a zone */ + /** @REQ_OP_ZONE_OPEN: Open a zone */ REQ_OP_ZONE_OPEN = (__force blk_opf_t)11, - /* Close a zone */ + /** @REQ_OP_ZONE_CLOSE: Close a zone */ REQ_OP_ZONE_CLOSE = (__force blk_opf_t)13, - /* Transition a zone to full */ + /** @REQ_OP_ZONE_FINISH: Transition a zone to full */ REQ_OP_ZONE_FINISH = (__force blk_opf_t)15, - /* reset a zone write pointer */ + /** @REQ_OP_ZONE_RESET: reset a zone write pointer */ REQ_OP_ZONE_RESET = (__force blk_opf_t)17, - /* reset all the zone present on the device */ + /** @REQ_OP_ZONE_RESET_ALL: reset all the zone present on the device */ REQ_OP_ZONE_RESET_ALL = (__force blk_opf_t)19, /* Driver private requests */ + /* private: */ REQ_OP_DRV_IN = 
(__force blk_opf_t)34, REQ_OP_DRV_OUT = (__force blk_opf_t)35, From 3678a334a55e869b413e2fb4824e92200b149d73 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 15 Feb 2026 21:29:51 -0800 Subject: [PATCH 10/11] blk-stat: convert struct blk_stat_callback to kernel-doc Most of struct blk_stat_callback documentation is already in kernel-doc format. Convert the remaining struct members to kernel-doc to avoid kernel-doc warnings: Warning: block/blk-stat.h:62 struct member 'list' not described in 'blk_stat_callback' Warning: block/blk-stat.h:62 struct member 'timer_fn' not described in 'blk_stat_callback' Warning: block/blk-stat.h:62 struct member 'rcu' not described in 'blk_stat_callback' Warning: block/blk-stat.h:133 No description found for return value of 'blk_stat_is_active' Signed-off-by: Randy Dunlap Reviewed-by: Chaitanya Kulkarni Signed-off-by: Jens Axboe --- block/blk-stat.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/block/blk-stat.h b/block/blk-stat.h index 9e05bf18d1be..cc5b66e7ee60 100644 --- a/block/blk-stat.h +++ b/block/blk-stat.h @@ -17,7 +17,7 @@ * timer fires, @cpu_stat is flushed to @stat and @timer_fn is invoked. */ struct blk_stat_callback { - /* + /** * @list: RCU list of callbacks for a &struct request_queue. */ struct list_head list; @@ -50,7 +50,7 @@ struct blk_stat_callback { struct blk_rq_stat *stat; /** - * @fn: Callback function. + * @timer_fn: Callback function. */ void (*timer_fn)(struct blk_stat_callback *); @@ -59,6 +59,9 @@ struct blk_stat_callback { */ void *data; + /** + * @rcu: rcu list head + */ struct rcu_head rcu; }; @@ -126,6 +129,8 @@ void blk_stat_free_callback(struct blk_stat_callback *cb); * blk_stat_is_active() - Check if a block statistics callback is currently * gathering statistics. * @cb: The callback. + * + * Returns: %true iff the callback is active. 
*/ static inline bool blk_stat_is_active(struct blk_stat_callback *cb) { From dfe48ea179733be948c432f6af2fc3913cf5dd28 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Sat, 14 Feb 2026 13:43:50 +0800 Subject: [PATCH 11/11] blk-mq: use NOIO context to prevent deadlock during debugfs creation Creating debugfs entries can trigger fs reclaim, which can enter back into the block layer request_queue. This can cause deadlock if the queue is frozen. Previously, a WARN_ON_ONCE check was used in debugfs_create_files() to detect this condition, but it was racy since the queue can be frozen from another context at any time. Introduce blk_debugfs_lock()/blk_debugfs_unlock() helpers that combine the debugfs_mutex with memalloc_noio_save()/restore() to prevent fs reclaim from triggering block I/O. Also add blk_debugfs_lock_nomemsave() and blk_debugfs_unlock_nomemrestore() variants for callers that don't need NOIO protection (e.g., debugfs removal or read-only operations). Replace all raw debugfs_mutex lock/unlock pairs with these helpers, using the _nomemsave/_nomemrestore variants where appropriate. 
Reported-by: Yi Zhang Closes: https://lore.kernel.org/all/CAHj4cs9gNKEYAPagD9JADfO5UH+OiCr4P7OO2wjpfOYeM-RV=A@mail.gmail.com/ Reported-by: Shinichiro Kawasaki Closes: https://lore.kernel.org/all/aYWQR7CtYdk3K39g@shinmob/ Suggested-by: Christoph Hellwig Signed-off-by: Yu Kuai Reviewed-by: Nilay Shroff Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 10 +++------- block/blk-mq-sched.c | 9 +++++---- block/blk-sysfs.c | 9 +++++---- block/blk-wbt.c | 10 ++++++---- block/blk.h | 31 +++++++++++++++++++++++++++++++ kernel/trace/blktrace.c | 38 +++++++++++++++++++++----------------- 6 files changed, 71 insertions(+), 36 deletions(-) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index faeaa1fc86a7..28167c9baa55 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -613,11 +613,6 @@ static void debugfs_create_files(struct request_queue *q, struct dentry *parent, const struct blk_mq_debugfs_attr *attr) { lockdep_assert_held(&q->debugfs_mutex); - /* - * Creating new debugfs entries with queue freezed has the risk of - * deadlock. - */ - WARN_ON_ONCE(q->mq_freeze_depth != 0); /* * debugfs_mutex should not be nested under other locks that can be * grabbed while queue is frozen. 
@@ -693,12 +688,13 @@ void blk_mq_debugfs_unregister_hctx(struct blk_mq_hw_ctx *hctx) void blk_mq_debugfs_register_hctxs(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; + unsigned int memflags; unsigned long i; - mutex_lock(&q->debugfs_mutex); + memflags = blk_debugfs_lock(q); queue_for_each_hw_ctx(q, hctx, i) blk_mq_debugfs_register_hctx(q, hctx); - mutex_unlock(&q->debugfs_mutex); + blk_debugfs_unlock(q, memflags); } void blk_mq_debugfs_unregister_hctxs(struct request_queue *q) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index e26898128a7e..97c3c8f45a9b 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -390,13 +390,14 @@ static void blk_mq_sched_tags_teardown(struct request_queue *q, unsigned int fla void blk_mq_sched_reg_debugfs(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; + unsigned int memflags; unsigned long i; - mutex_lock(&q->debugfs_mutex); + memflags = blk_debugfs_lock(q); blk_mq_debugfs_register_sched(q); queue_for_each_hw_ctx(q, hctx, i) blk_mq_debugfs_register_sched_hctx(q, hctx); - mutex_unlock(&q->debugfs_mutex); + blk_debugfs_unlock(q, memflags); } void blk_mq_sched_unreg_debugfs(struct request_queue *q) @@ -404,11 +405,11 @@ void blk_mq_sched_unreg_debugfs(struct request_queue *q) struct blk_mq_hw_ctx *hctx; unsigned long i; - mutex_lock(&q->debugfs_mutex); + blk_debugfs_lock_nomemsave(q); queue_for_each_hw_ctx(q, hctx, i) blk_mq_debugfs_unregister_sched_hctx(hctx); blk_mq_debugfs_unregister_sched(q); - mutex_unlock(&q->debugfs_mutex); + blk_debugfs_unlock_nomemrestore(q); } void blk_mq_free_sched_tags(struct elevator_tags *et, diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 003aa684e854..f3b1968c80ce 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -892,13 +892,13 @@ static void blk_debugfs_remove(struct gendisk *disk) { struct request_queue *q = disk->queue; - mutex_lock(&q->debugfs_mutex); + blk_debugfs_lock_nomemsave(q); blk_trace_shutdown(q); 
debugfs_remove_recursive(q->debugfs_dir); q->debugfs_dir = NULL; q->sched_debugfs_dir = NULL; q->rqos_debugfs_dir = NULL; - mutex_unlock(&q->debugfs_mutex); + blk_debugfs_unlock_nomemrestore(q); } /** @@ -908,6 +908,7 @@ static void blk_debugfs_remove(struct gendisk *disk) int blk_register_queue(struct gendisk *disk) { struct request_queue *q = disk->queue; + unsigned int memflags; int ret; ret = kobject_add(&disk->queue_kobj, &disk_to_dev(disk)->kobj, "queue"); @@ -921,11 +922,11 @@ int blk_register_queue(struct gendisk *disk) } mutex_lock(&q->sysfs_lock); - mutex_lock(&q->debugfs_mutex); + memflags = blk_debugfs_lock(q); q->debugfs_dir = debugfs_create_dir(disk->disk_name, blk_debugfs_root); if (queue_is_mq(q)) blk_mq_debugfs_register(q); - mutex_unlock(&q->debugfs_mutex); + blk_debugfs_unlock(q, memflags); ret = disk_register_independent_access_ranges(disk); if (ret) diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 1415f2bf8611..6dba71e87387 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -776,6 +776,7 @@ void wbt_init_enable_default(struct gendisk *disk) { struct request_queue *q = disk->queue; struct rq_wb *rwb; + unsigned int memflags; if (!__wbt_enable_default(disk)) return; @@ -789,9 +790,9 @@ void wbt_init_enable_default(struct gendisk *disk) return; } - mutex_lock(&q->debugfs_mutex); + memflags = blk_debugfs_lock(q); blk_mq_debugfs_register_rq_qos(q); - mutex_unlock(&q->debugfs_mutex); + blk_debugfs_unlock(q, memflags); } static u64 wbt_default_latency_nsec(struct request_queue *q) @@ -1015,9 +1016,10 @@ int wbt_set_lat(struct gendisk *disk, s64 val) blk_mq_unquiesce_queue(q); out: blk_mq_unfreeze_queue(q, memflags); - mutex_lock(&q->debugfs_mutex); + + memflags = blk_debugfs_lock(q); blk_mq_debugfs_register_rq_qos(q); - mutex_unlock(&q->debugfs_mutex); + blk_debugfs_unlock(q, memflags); return ret; } diff --git a/block/blk.h b/block/blk.h index a6b1de509733..f6053e9dd2aa 100644 --- a/block/blk.h +++ b/block/blk.h @@ -729,4 +729,35 @@ static 
inline void blk_unfreeze_release_lock(struct request_queue *q) } #endif +/* + * debugfs directory and file creation can trigger fs reclaim, which can enter + * back into the block layer request_queue. This can cause deadlock if the + * queue is frozen. Use NOIO context together with debugfs_mutex to prevent fs + * reclaim from triggering block I/O. + */ +static inline void blk_debugfs_lock_nomemsave(struct request_queue *q) +{ + mutex_lock(&q->debugfs_mutex); +} + +static inline void blk_debugfs_unlock_nomemrestore(struct request_queue *q) +{ + mutex_unlock(&q->debugfs_mutex); +} + +static inline unsigned int __must_check blk_debugfs_lock(struct request_queue *q) +{ + unsigned int memflags = memalloc_noio_save(); + + blk_debugfs_lock_nomemsave(q); + return memflags; +} + +static inline void blk_debugfs_unlock(struct request_queue *q, + unsigned int memflags) +{ + blk_debugfs_unlock_nomemrestore(q); + memalloc_noio_restore(memflags); +} + #endif /* BLK_INTERNAL_H */ diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index c4db5c2e7103..a3d8a68f8683 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -559,9 +559,9 @@ int blk_trace_remove(struct request_queue *q) { int ret; - mutex_lock(&q->debugfs_mutex); + blk_debugfs_lock_nomemsave(q); ret = __blk_trace_remove(q); - mutex_unlock(&q->debugfs_mutex); + blk_debugfs_unlock_nomemrestore(q); return ret; } @@ -767,6 +767,7 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, struct blk_user_trace_setup2 buts2; struct blk_user_trace_setup buts; struct blk_trace *bt; + unsigned int memflags; int ret; ret = copy_from_user(&buts, arg, sizeof(buts)); @@ -785,16 +786,16 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, .pid = buts.pid, }; - mutex_lock(&q->debugfs_mutex); + memflags = blk_debugfs_lock(q); bt = blk_trace_setup_prepare(q, name, dev, buts.buf_size, buts.buf_nr, bdev); if (IS_ERR(bt)) { - mutex_unlock(&q->debugfs_mutex); + blk_debugfs_unlock(q, 
memflags); return PTR_ERR(bt); } blk_trace_setup_finalize(q, name, 1, bt, &buts2); strscpy(buts.name, buts2.name, BLKTRACE_BDEV_SIZE); - mutex_unlock(&q->debugfs_mutex); + blk_debugfs_unlock(q, memflags); if (copy_to_user(arg, &buts, sizeof(buts))) { blk_trace_remove(q); @@ -809,6 +810,7 @@ static int blk_trace_setup2(struct request_queue *q, char *name, dev_t dev, { struct blk_user_trace_setup2 buts2; struct blk_trace *bt; + unsigned int memflags; if (copy_from_user(&buts2, arg, sizeof(buts2))) return -EFAULT; @@ -819,15 +821,15 @@ static int blk_trace_setup2(struct request_queue *q, char *name, dev_t dev, if (buts2.flags != 0) return -EINVAL; - mutex_lock(&q->debugfs_mutex); + memflags = blk_debugfs_lock(q); bt = blk_trace_setup_prepare(q, name, dev, buts2.buf_size, buts2.buf_nr, bdev); if (IS_ERR(bt)) { - mutex_unlock(&q->debugfs_mutex); + blk_debugfs_unlock(q, memflags); return PTR_ERR(bt); } blk_trace_setup_finalize(q, name, 2, bt, &buts2); - mutex_unlock(&q->debugfs_mutex); + blk_debugfs_unlock(q, memflags); if (copy_to_user(arg, &buts2, sizeof(buts2))) { blk_trace_remove(q); @@ -844,6 +846,7 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name, struct blk_user_trace_setup2 buts2; struct compat_blk_user_trace_setup cbuts; struct blk_trace *bt; + unsigned int memflags; if (copy_from_user(&cbuts, arg, sizeof(cbuts))) return -EFAULT; @@ -860,15 +863,15 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name, .pid = cbuts.pid, }; - mutex_lock(&q->debugfs_mutex); + memflags = blk_debugfs_lock(q); bt = blk_trace_setup_prepare(q, name, dev, buts2.buf_size, buts2.buf_nr, bdev); if (IS_ERR(bt)) { - mutex_unlock(&q->debugfs_mutex); + blk_debugfs_unlock(q, memflags); return PTR_ERR(bt); } blk_trace_setup_finalize(q, name, 1, bt, &buts2); - mutex_unlock(&q->debugfs_mutex); + blk_debugfs_unlock(q, memflags); if (copy_to_user(arg, &buts2.name, ARRAY_SIZE(buts2.name))) { blk_trace_remove(q); @@ -898,9 +901,9 @@ int 
blk_trace_startstop(struct request_queue *q, int start) { int ret; - mutex_lock(&q->debugfs_mutex); + blk_debugfs_lock_nomemsave(q); ret = __blk_trace_startstop(q, start); - mutex_unlock(&q->debugfs_mutex); + blk_debugfs_unlock_nomemrestore(q); return ret; } @@ -2020,7 +2023,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, struct blk_trace *bt; ssize_t ret = -ENXIO; - mutex_lock(&q->debugfs_mutex); + blk_debugfs_lock_nomemsave(q); bt = rcu_dereference_protected(q->blk_trace, lockdep_is_held(&q->debugfs_mutex)); @@ -2041,7 +2044,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, ret = sprintf(buf, "%llu\n", bt->end_lba); out_unlock_bdev: - mutex_unlock(&q->debugfs_mutex); + blk_debugfs_unlock_nomemrestore(q); return ret; } @@ -2052,6 +2055,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, struct block_device *bdev = dev_to_bdev(dev); struct request_queue *q = bdev_get_queue(bdev); struct blk_trace *bt; + unsigned int memflags; u64 value; ssize_t ret = -EINVAL; @@ -2071,7 +2075,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, goto out; } - mutex_lock(&q->debugfs_mutex); + memflags = blk_debugfs_lock(q); bt = rcu_dereference_protected(q->blk_trace, lockdep_is_held(&q->debugfs_mutex)); @@ -2106,7 +2110,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, } out_unlock_bdev: - mutex_unlock(&q->debugfs_mutex); + blk_debugfs_unlock(q, memflags); out: return ret ? ret : count; }