From 0f5bb0cfb0b40a31d2fe146ecbef5727690fa547 Mon Sep 17 00:00:00 2001 From: David Laight Date: Wed, 19 Nov 2025 22:41:26 +0000 Subject: [PATCH 01/28] fs: use min() or umin() instead of min_t() min_t(unsigned int, a, b) casts an 'unsigned long' to 'unsigned int'. Use min(a, b) instead as it promotes any 'unsigned int' to 'unsigned long' and so cannot discard significant bits. A couple of places need umin() because of loops like: nfolios = DIV_ROUND_UP(ret + start, PAGE_SIZE); for (i = 0; i < nfolios; i++) { struct folio *folio = page_folio(pages[i]); ... unsigned int len = umin(ret, PAGE_SIZE - start); ... ret -= len; ... } where the compiler doesn't track things well enough to know that 'ret' is never negative. The alternate loop: for (i = 0; ret > 0; i++) { struct folio *folio = page_folio(pages[i]); ... unsigned int len = min(ret, PAGE_SIZE - start); ... ret -= len; ... } would be equivalent and doesn't need 'nfolios'. Most of the 'unsigned long' actually come from PAGE_SIZE. Detected by an extra check added to min_t(). 
Signed-off-by: David Laight Link: https://patch.msgid.link/20251119224140.8616-31-david.laight.linux@gmail.com Signed-off-by: Christian Brauner --- fs/buffer.c | 2 +- fs/exec.c | 2 +- fs/ext4/mballoc.c | 3 +-- fs/ext4/resize.c | 2 +- fs/ext4/super.c | 2 +- fs/fat/dir.c | 4 ++-- fs/fat/file.c | 3 +-- fs/fuse/dev.c | 2 +- fs/fuse/file.c | 8 +++----- fs/splice.c | 2 +- 10 files changed, 13 insertions(+), 17 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index 838c0c571022..c6f4660f92df 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2354,7 +2354,7 @@ bool block_is_partially_uptodate(struct folio *folio, size_t from, size_t count) if (!head) return false; blocksize = head->b_size; - to = min_t(unsigned, folio_size(folio) - from, count); + to = min(folio_size(folio) - from, count); to = from + to; if (from < blocksize && to > folio_size(folio) - blocksize) return false; diff --git a/fs/exec.c b/fs/exec.c index 9d5ebc9d15b0..d0606e53376f 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -555,7 +555,7 @@ int copy_string_kernel(const char *arg, struct linux_binprm *bprm) return -E2BIG; while (len > 0) { - unsigned int bytes_to_copy = min_t(unsigned int, len, + unsigned int bytes_to_copy = min(len, min_not_zero(offset_in_page(pos), PAGE_SIZE)); struct page *page; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 56d50fd3310b..e817a758801d 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4276,8 +4276,7 @@ void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block, * get the corresponding group metadata to work with. * For this we have goto again loop. 
*/ - thisgrp_len = min_t(unsigned int, (unsigned int)len, - EXT4_BLOCKS_PER_GROUP(sb) - EXT4_C2B(sbi, blkoff)); + thisgrp_len = min(len, EXT4_BLOCKS_PER_GROUP(sb) - EXT4_C2B(sbi, blkoff)); clen = EXT4_NUM_B2C(sbi, thisgrp_len); if (!ext4_sb_block_valid(sb, NULL, block, thisgrp_len)) { diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 050f26168d97..76842f0957b5 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1479,7 +1479,7 @@ static void ext4_update_super(struct super_block *sb, /* Update the global fs size fields */ sbi->s_groups_count += flex_gd->count; - sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count, + sbi->s_blockfile_groups = min(sbi->s_groups_count, (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); /* Update the reserved block counts only once the new group is diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 87205660c5d0..79762c3e0dff 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4832,7 +4832,7 @@ static int ext4_check_geometry(struct super_block *sb, return -EINVAL; } sbi->s_groups_count = blocks_count; - sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count, + sbi->s_blockfile_groups = min(sbi->s_groups_count, (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); if (((u64)sbi->s_groups_count * sbi->s_inodes_per_group) != le32_to_cpu(es->s_inodes_count)) { diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 92b091783966..8375e7fbc1a5 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -1353,7 +1353,7 @@ found: /* Fill the long name slots. */ for (i = 0; i < long_bhs; i++) { - int copy = min_t(int, sb->s_blocksize - offset, size); + int copy = umin(sb->s_blocksize - offset, size); memcpy(bhs[i]->b_data + offset, slots, copy); mark_buffer_dirty_inode(bhs[i], dir); offset = 0; @@ -1364,7 +1364,7 @@ found: err = fat_sync_bhs(bhs, long_bhs); if (!err && i < nr_bhs) { /* Fill the short name slot. 
*/ - int copy = min_t(int, sb->s_blocksize - offset, size); + int copy = umin(sb->s_blocksize - offset, size); memcpy(bhs[i]->b_data + offset, slots, copy); mark_buffer_dirty_inode(bhs[i], dir); if (IS_DIRSYNC(dir)) diff --git a/fs/fat/file.c b/fs/fat/file.c index 4fc49a614fb8..f48435e586c7 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -140,8 +140,7 @@ static int fat_ioctl_fitrim(struct inode *inode, unsigned long arg) if (copy_from_user(&range, user_range, sizeof(range))) return -EFAULT; - range.minlen = max_t(unsigned int, range.minlen, - bdev_discard_granularity(sb->s_bdev)); + range.minlen = max(range.minlen, bdev_discard_granularity(sb->s_bdev)); err = fat_trim_fs(inode, &range); if (err < 0) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 6d59cbc877c6..a30c8b57d478 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1813,7 +1813,7 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size, goto out_iput; folio_offset = ((index - folio->index) << PAGE_SHIFT) + offset; - nr_bytes = min_t(unsigned, num, folio_size(folio) - folio_offset); + nr_bytes = min(num, folio_size(folio) - folio_offset); nr_pages = (offset + nr_bytes + PAGE_SIZE - 1) >> PAGE_SHIFT; err = fuse_copy_folio(cs, &folio, folio_offset, nr_bytes, 0); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 01bc894e9c2b..4f71eb5a9bac 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1323,10 +1323,8 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, static inline unsigned int fuse_wr_pages(loff_t pos, size_t len, unsigned int max_pages) { - return min_t(unsigned int, - ((pos + len - 1) >> PAGE_SHIFT) - - (pos >> PAGE_SHIFT) + 1, - max_pages); + return min(((pos + len - 1) >> PAGE_SHIFT) - (pos >> PAGE_SHIFT) + 1, + max_pages); } static ssize_t fuse_perform_write(struct kiocb *iocb, struct iov_iter *ii) @@ -1607,7 +1605,7 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, struct folio *folio = page_folio(pages[i]); unsigned int offset = 
start + (folio_page_idx(folio, pages[i]) << PAGE_SHIFT); - unsigned int len = min_t(unsigned int, ret, PAGE_SIZE - start); + unsigned int len = umin(ret, PAGE_SIZE - start); ap->descs[ap->num_folios].offset = offset; ap->descs[ap->num_folios].length = len; diff --git a/fs/splice.c b/fs/splice.c index d338fe56b50b..5fb07c01936f 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1467,7 +1467,7 @@ static ssize_t iter_to_pipe(struct iov_iter *from, n = DIV_ROUND_UP(left + start, PAGE_SIZE); for (i = 0; i < n; i++) { - int size = min_t(int, left, PAGE_SIZE - start); + int size = umin(left, PAGE_SIZE - start); buf.page = pages[i]; buf.offset = start; From 5854fc6391e9d67c9ebfb4cb618406b5a372db6b Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 3 Dec 2025 10:55:08 +0100 Subject: [PATCH 02/28] fs: annotate cdev_lock with __cacheline_aligned_in_smp No need for the crapper to be susceptible to false-sharing. Signed-off-by: Mateusz Guzik Link: https://patch.msgid.link/20251203095508.291073-1-mjguzik@gmail.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/char_dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/char_dev.c b/fs/char_dev.c index c2ddb998f3c9..84a5a0699373 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -343,7 +343,7 @@ void __unregister_chrdev(unsigned int major, unsigned int baseminor, kfree(cd); } -static DEFINE_SPINLOCK(cdev_lock); +static __cacheline_aligned_in_smp DEFINE_SPINLOCK(cdev_lock); static struct kobject *cdev_get(struct cdev *p) { From c0aac5975bafc86f6817b14e9f71dcb5064a9183 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 3 Dec 2025 10:28:50 +0100 Subject: [PATCH 03/28] ns: pad refcount Note no effort is made to make sure structs embedding the namespace are themselves aligned, so this is not guaranteed to eliminate cacheline bouncing due to refcount management. 
Signed-off-by: Mateusz Guzik Link: https://patch.msgid.link/20251203092851.287617-2-mjguzik@gmail.com Signed-off-by: Christian Brauner --- include/linux/ns/ns_common_types.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/linux/ns/ns_common_types.h b/include/linux/ns/ns_common_types.h index b332b019b29c..0014fbc1c626 100644 --- a/include/linux/ns/ns_common_types.h +++ b/include/linux/ns/ns_common_types.h @@ -108,11 +108,13 @@ extern const struct proc_ns_operations utsns_operations; * @ns_tree: namespace tree nodes and active reference count */ struct ns_common { + struct { + refcount_t __ns_ref; /* do not use directly */ + } ____cacheline_aligned_in_smp; u32 ns_type; struct dentry *stashed; const struct proc_ns_operations *ops; unsigned int inum; - refcount_t __ns_ref; /* do not use directly */ union { struct ns_tree; struct rcu_head ns_rcu; From 1fa4e69a54a250fa17d2afd9c5b54a59329033c1 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 3 Dec 2025 10:48:36 +0100 Subject: [PATCH 04/28] filelock: use a consume fence in locks_inode_context() Matches the idiom of storing a pointer with a release fence and safely getting the content with a consume fence after. Eliminates an actual fence on some archs. Reviewed-by: Jeff Layton Signed-off-by: Mateusz Guzik Link: https://patch.msgid.link/20251203094837.290654-1-mjguzik@gmail.com Signed-off-by: Christian Brauner --- include/linux/filelock.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/include/linux/filelock.h b/include/linux/filelock.h index 54b824c05299..dc15f5427680 100644 --- a/include/linux/filelock.h +++ b/include/linux/filelock.h @@ -241,7 +241,10 @@ bool locks_owner_has_blockers(struct file_lock_context *flctx, static inline struct file_lock_context * locks_inode_context(const struct inode *inode) { - return smp_load_acquire(&inode->i_flctx); + /* + * Paired with the fence in locks_get_lock_context(). 
+ */ + return READ_ONCE(inode->i_flctx); } #else /* !CONFIG_FILE_LOCKING */ From 6d864a1b182532e7570383af8825fa4ddcd24243 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 3 Dec 2025 10:28:51 +0100 Subject: [PATCH 05/28] pid: only take pidmap_lock once on alloc When spawning and killing threads in separate processes in parallel the primary bottleneck on the stock kernel is pidmap_lock, largely because of a back-to-back acquire in the common case. This aspect is fixed with the patch. Performance improvement varies between reboots. When benchmarking with 20 processes creating and killing threads in a loop, the unpatched baseline hovers around 465k ops/s, while patched is anything between ~510k ops/s and ~560k depending on false-sharing (which I only minimally sanitized). So this is at least 10% if you are unlucky. The change also facilitated some cosmetic fixes. It has an unintentional side effect of no longer issuing spurious idr_preload() around idr_replace(). Signed-off-by: Mateusz Guzik Link: https://patch.msgid.link/20251203092851.287617-3-mjguzik@gmail.com Reviewed-by: Oleg Nesterov Signed-off-by: Christian Brauner --- kernel/pid.c | 131 +++++++++++++++++++++++++++++++++------------------ 1 file changed, 85 insertions(+), 46 deletions(-) diff --git a/kernel/pid.c b/kernel/pid.c index a31771bc89c1..f45ae56db7da 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -159,58 +159,86 @@ void free_pids(struct pid **pids) free_pid(pids[tmp]); } -struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, - size_t set_tid_size) +struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid, + size_t arg_set_tid_size) { + int set_tid[MAX_PID_NS_LEVEL + 1] = {}; + int pid_max[MAX_PID_NS_LEVEL + 1] = {}; struct pid *pid; enum pid_type type; int i, nr; struct pid_namespace *tmp; struct upid *upid; int retval = -ENOMEM; + bool retried_preload; /* - * set_tid_size contains the size of the set_tid array. 
Starting at + * arg_set_tid_size contains the size of the arg_set_tid array. Starting at * the most nested currently active PID namespace it tells alloc_pid() * which PID to set for a process in that most nested PID namespace - * up to set_tid_size PID namespaces. It does not have to set the PID - * for a process in all nested PID namespaces but set_tid_size must + * up to arg_set_tid_size PID namespaces. It does not have to set the PID + * for a process in all nested PID namespaces but arg_set_tid_size must * never be greater than the current ns->level + 1. */ - if (set_tid_size > ns->level + 1) + if (arg_set_tid_size > ns->level + 1) return ERR_PTR(-EINVAL); + /* + * Prep before we take locks: + * + * 1. allocate and fill in pid struct + */ pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL); if (!pid) return ERR_PTR(retval); - tmp = ns; + get_pid_ns(ns); pid->level = ns->level; + refcount_set(&pid->count, 1); + spin_lock_init(&pid->lock); + for (type = 0; type < PIDTYPE_MAX; ++type) + INIT_HLIST_HEAD(&pid->tasks[type]); + init_waitqueue_head(&pid->wait_pidfd); + INIT_HLIST_HEAD(&pid->inodes); - for (i = ns->level; i >= 0; i--) { - int tid = 0; - int pid_max = READ_ONCE(tmp->pid_max); + /* + * 2. perm check checkpoint_restore_ns_capable() + * + * This stores found pid_max to make sure the used value is the same should + * later code need it. + */ + for (tmp = ns, i = ns->level; i >= 0; i--) { + pid_max[ns->level - i] = READ_ONCE(tmp->pid_max); - if (set_tid_size) { - tid = set_tid[ns->level - i]; + if (arg_set_tid_size) { + int tid = set_tid[ns->level - i] = arg_set_tid[ns->level - i]; retval = -EINVAL; - if (tid < 1 || tid >= pid_max) - goto out_free; + if (tid < 1 || tid >= pid_max[ns->level - i]) + goto out_abort; /* * Also fail if a PID != 1 is requested and * no PID 1 exists. 
*/ if (tid != 1 && !tmp->child_reaper) - goto out_free; + goto out_abort; retval = -EPERM; if (!checkpoint_restore_ns_capable(tmp->user_ns)) - goto out_free; - set_tid_size--; + goto out_abort; + arg_set_tid_size--; } - idr_preload(GFP_KERNEL); - spin_lock(&pidmap_lock); + tmp = tmp->parent; + } + + /* + * Prep is done, id allocation goes here: + */ + retried_preload = false; + idr_preload(GFP_KERNEL); + spin_lock(&pidmap_lock); + for (tmp = ns, i = ns->level; i >= 0;) { + int tid = set_tid[ns->level - i]; if (tid) { nr = idr_alloc(&tmp->idr, NULL, tid, @@ -220,6 +248,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, * alreay in use. Return EEXIST in that case. */ if (nr == -ENOSPC) + nr = -EEXIST; } else { int pid_min = 1; @@ -235,19 +264,42 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, * a partially initialized PID (see below). */ nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min, - pid_max, GFP_ATOMIC); + pid_max[ns->level - i], GFP_ATOMIC); + if (nr == -ENOSPC) + nr = -EAGAIN; } - spin_unlock(&pidmap_lock); - idr_preload_end(); - if (nr < 0) { - retval = (nr == -ENOSPC) ? -EAGAIN : nr; + if (unlikely(nr < 0)) { + /* + * Preload more memory if idr_alloc{,cyclic} failed with -ENOMEM. + * + * The IDR API only allows us to preload memory for one call, while we may end + * up doing several under pidmap_lock with GFP_ATOMIC. The situation may be + * salvageable with GFP_KERNEL. But make sure to not loop indefinitely if preload + * did not help (the routine unfortunately returns void, so we have no idea + * if it got anywhere). + * + * The lock can be safely dropped and picked up as historically pid allocation + * for different namespaces was *not* atomic -- we try to hold on to it the + * entire time only for performance reasons. 
+ */ + if (nr == -ENOMEM && !retried_preload) { + spin_unlock(&pidmap_lock); + idr_preload_end(); + retried_preload = true; + idr_preload(GFP_KERNEL); + spin_lock(&pidmap_lock); + continue; + } + retval = nr; goto out_free; } pid->numbers[i].nr = nr; pid->numbers[i].ns = tmp; tmp = tmp->parent; + i--; + retried_preload = false; } /* @@ -257,25 +309,15 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, * is what we have exposed to userspace for a long time and it is * documented behavior for pid namespaces. So we can't easily * change it even if there were an error code better suited. + * + * This can't be done earlier because we need to preserve other + * error conditions. */ retval = -ENOMEM; - - get_pid_ns(ns); - refcount_set(&pid->count, 1); - spin_lock_init(&pid->lock); - for (type = 0; type < PIDTYPE_MAX; ++type) - INIT_HLIST_HEAD(&pid->tasks[type]); - - init_waitqueue_head(&pid->wait_pidfd); - INIT_HLIST_HEAD(&pid->inodes); - - upid = pid->numbers + ns->level; - idr_preload(GFP_KERNEL); - spin_lock(&pidmap_lock); - if (!(ns->pid_allocated & PIDNS_ADDING)) - goto out_unlock; + if (unlikely(!(ns->pid_allocated & PIDNS_ADDING))) + goto out_free; pidfs_add_pid(pid); - for ( ; upid >= pid->numbers; --upid) { + for (upid = pid->numbers + ns->level; upid >= pid->numbers; --upid) { /* Make the PID visible to find_pid_ns. 
*/ idr_replace(&upid->ns->idr, pid, upid->nr); upid->ns->pid_allocated++; @@ -286,13 +328,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, return pid; -out_unlock: - spin_unlock(&pidmap_lock); - idr_preload_end(); - put_pid_ns(ns); - out_free: - spin_lock(&pidmap_lock); while (++i <= ns->level) { upid = pid->numbers + i; idr_remove(&upid->ns->idr, upid->nr); @@ -303,7 +339,10 @@ out_free: idr_set_cursor(&ns->idr, 0); spin_unlock(&pidmap_lock); + idr_preload_end(); +out_abort: + put_pid_ns(ns); kmem_cache_free(ns->pid_cachep, pid); return ERR_PTR(retval); } From 887e97745ec336c2f49b6c0af3c4cc00a5df3211 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 3 Dec 2025 10:48:37 +0100 Subject: [PATCH 06/28] fs: track the inode having file locks with a flag in ->i_opflags Opening and closing an inode dirties the ->i_readcount field. Depending on the alignment of the inode, it may happen to false-share with other fields loaded for both operations to various extent. This notably concerns the ->i_flctx field. Since most inodes don't have the field populated, this bit can be managed with a flag in ->i_opflags instead which bypasses the problem. Here are results I obtained while opening a file read-only in a loop with 24 cores doing the work on Sapphire Rapids. Utilizing the flag as opposed to reading ->i_flctx field was toggled at runtime as the benchmark was running, to make sure both results come from the same alignment. before: 3233740 after: 3373346 (+4%) before: 3284313 after: 3518711 (+7%) before: 3505545 after: 4092806 (+16%) Or to put it differently, this varies wildly depending on how (un)lucky you get. The primary bottleneck before and after is the avoidable lockref trip in do_dentry_open(). 
Reviewed-by: Jeff Layton Signed-off-by: Mateusz Guzik Link: https://patch.msgid.link/20251203094837.290654-2-mjguzik@gmail.com Signed-off-by: Christian Brauner --- fs/locks.c | 14 ++++++++++++-- include/linux/filelock.h | 15 +++++++++++---- include/linux/fs.h | 1 + 3 files changed, 24 insertions(+), 6 deletions(-) diff --git a/fs/locks.c b/fs/locks.c index 9f565802a88c..7a63fa3ca9b4 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -178,7 +178,6 @@ locks_get_lock_context(struct inode *inode, int type) { struct file_lock_context *ctx; - /* paired with cmpxchg() below */ ctx = locks_inode_context(inode); if (likely(ctx) || type == F_UNLCK) goto out; @@ -196,7 +195,18 @@ locks_get_lock_context(struct inode *inode, int type) * Assign the pointer if it's not already assigned. If it is, then * free the context we just allocated. */ - if (cmpxchg(&inode->i_flctx, NULL, ctx)) { + spin_lock(&inode->i_lock); + if (!(inode->i_opflags & IOP_FLCTX)) { + VFS_BUG_ON_INODE(inode->i_flctx, inode); + WRITE_ONCE(inode->i_flctx, ctx); + /* + * Paired with locks_inode_context(). + */ + smp_store_release(&inode->i_opflags, inode->i_opflags | IOP_FLCTX); + spin_unlock(&inode->i_lock); + } else { + VFS_BUG_ON_INODE(!inode->i_flctx, inode); + spin_unlock(&inode->i_lock); kmem_cache_free(flctx_cache, ctx); ctx = locks_inode_context(inode); } diff --git a/include/linux/filelock.h b/include/linux/filelock.h index dc15f5427680..4a8912b9653e 100644 --- a/include/linux/filelock.h +++ b/include/linux/filelock.h @@ -242,8 +242,12 @@ static inline struct file_lock_context * locks_inode_context(const struct inode *inode) { /* - * Paired with the fence in locks_get_lock_context(). + * Paired with smp_store_release in locks_get_lock_context(). + * + * Ensures ->i_flctx will be visible if we spotted the flag. 
*/ + if (likely(!(smp_load_acquire(&inode->i_opflags) & IOP_FLCTX))) + return NULL; return READ_ONCE(inode->i_flctx); } @@ -471,7 +475,7 @@ static inline int break_lease(struct inode *inode, unsigned int mode) * could end up racing with tasks trying to set a new lease on this * file. */ - flctx = READ_ONCE(inode->i_flctx); + flctx = locks_inode_context(inode); if (!flctx) return 0; smp_mb(); @@ -490,7 +494,7 @@ static inline int break_deleg(struct inode *inode, unsigned int flags) * could end up racing with tasks trying to set a new lease on this * file. */ - flctx = READ_ONCE(inode->i_flctx); + flctx = locks_inode_context(inode); if (!flctx) return 0; smp_mb(); @@ -535,8 +539,11 @@ static inline int break_deleg_wait(struct delegated_inode *di) static inline int break_layout(struct inode *inode, bool wait) { + struct file_lock_context *flctx; + smp_mb(); - if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease)) { + flctx = locks_inode_context(inode); + if (flctx && !list_empty_careful(&flctx->flc_lease)) { unsigned int flags = LEASE_BREAK_LAYOUT; if (!wait) diff --git a/include/linux/fs.h b/include/linux/fs.h index 04ceeca12a0d..094b0adcb035 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -631,6 +631,7 @@ is_uncached_acl(struct posix_acl *acl) #define IOP_MGTIME 0x0020 #define IOP_CACHED_LINK 0x0040 #define IOP_FASTPERM_MAY_EXEC 0x0080 +#define IOP_FLCTX 0x0100 /* * Inode state bits. Protected by inode->i_lock From b68f91ef3b3fe82ad78c417de71b675699a8467c Mon Sep 17 00:00:00 2001 From: Deepakkumar Karn Date: Thu, 11 Dec 2025 18:42:11 +0530 Subject: [PATCH 07/28] fs/buffer: add alert in try_to_free_buffers() for folios without buffers try_to_free_buffers() can be called on folios with no buffers attached when filemap_release_folio() is invoked on a folio belonging to a mapping with AS_RELEASE_ALWAYS set but no release_folio operation defined. 
In such cases, folio_needs_release() returns true because of the AS_RELEASE_ALWAYS flag, but the folio has no private buffer data. This causes try_to_free_buffers() to call drop_buffers() on a folio with no buffers, leading to a null pointer dereference. Adding a check in try_to_free_buffers() to return early if the folio has no buffers attached, with WARN_ON_ONCE() to alert about the misconfiguration. This provides defensive hardening. Signed-off-by: Deepakkumar Karn Link: https://patch.msgid.link/20251211131211.308021-1-dkarn@redhat.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/buffer.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/buffer.c b/fs/buffer.c index c6f4660f92df..fd53b806ab7e 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2948,6 +2948,10 @@ bool try_to_free_buffers(struct folio *folio) if (folio_test_writeback(folio)) return false; + /* Misconfigured folio check */ + if (WARN_ON_ONCE(!folio_buffers(folio))) + return true; + if (mapping == NULL) { /* can this still happen? */ ret = drop_buffers(folio, &buffers_to_free); goto out; From 63ad216fbfe2240da67233e0a0d10af8a12f7bde Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Thu, 18 Dec 2025 12:21:45 +0100 Subject: [PATCH 08/28] fs: Replace simple_strtoul with kstrtoul in set_ihash_entries Replace simple_strtoul() with the recommended kstrtoul() for parsing the 'ihash_entries=' boot parameter. Check the return value of kstrtoul() and reject invalid values. This adds error handling while preserving behavior for existing valid values, and removes use of the deprecated simple_strtoul() helper. 
Signed-off-by: Thorsten Blum Link: https://patch.msgid.link/20251218112144.225301-2-thorsten.blum@linux.dev Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/inode.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index 521383223d8a..a6df537eb856 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -2531,10 +2531,7 @@ static void __wait_on_freeing_inode(struct inode *inode, bool is_inode_hash_lock static __initdata unsigned long ihash_entries; static int __init set_ihash_entries(char *str) { - if (!str) - return 0; - ihash_entries = simple_strtoul(str, &str, 0); - return 1; + return kstrtoul(str, 0, &ihash_entries) == 0; } __setup("ihash_entries=", set_ihash_entries); From b29a0a37f46bbfd2a36eff73eb66249d7baaf71a Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Tue, 16 Dec 2025 15:52:37 +0100 Subject: [PATCH 09/28] dcache: Replace simple_strtoul with kstrtoul in set_dhash_entries Replace simple_strtoul() with the recommended kstrtoul() for parsing the 'dhash_entries=' boot parameter. Check the return value of kstrtoul() and reject invalid values. This adds error handling while preserving behavior for existing values, and removes use of the deprecated simple_strtoul() helper. 
Signed-off-by: Thorsten Blum Link: https://patch.msgid.link/20251216145236.44520-2-thorsten.blum@linux.dev Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/dcache.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index dc2fff4811d1..ec275f4fd81c 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -3227,10 +3227,7 @@ EXPORT_SYMBOL(d_parent_ino); static __initdata unsigned long dhash_entries; static int __init set_dhash_entries(char *str) { - if (!str) - return 0; - dhash_entries = simple_strtoul(str, &str, 0); - return 1; + return kstrtoul(str, 0, &dhash_entries) == 0; } __setup("dhash_entries=", set_dhash_entries); From 3f320e5c2eca158e3b5dc2e633694ee7f348d970 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Sun, 14 Dec 2025 16:31:42 +0100 Subject: [PATCH 10/28] namespace: Replace simple_strtoul with kstrtoul to parse boot params Replace simple_strtoul() with the recommended kstrtoul() for parsing the 'mhash_entries=' and 'mphash_entries=' boot parameters. Check the return value of kstrtoul() and reject invalid values. This adds error handling while preserving behavior for existing values, and removes use of the deprecated simple_strtoul() helper. 
Signed-off-by: Thorsten Blum Link: https://patch.msgid.link/20251214153141.218953-2-thorsten.blum@linux.dev Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/namespace.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index c58674a20cad..a548369ddb9c 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -49,20 +49,14 @@ static unsigned int mp_hash_shift __ro_after_init; static __initdata unsigned long mhash_entries; static int __init set_mhash_entries(char *str) { - if (!str) - return 0; - mhash_entries = simple_strtoul(str, &str, 0); - return 1; + return kstrtoul(str, 0, &mhash_entries) == 0; } __setup("mhash_entries=", set_mhash_entries); static __initdata unsigned long mphash_entries; static int __init set_mphash_entries(char *str) { - if (!str) - return 0; - mphash_entries = simple_strtoul(str, &str, 0); - return 1; + return kstrtoul(str, 0, &mphash_entries) == 0; } __setup("mphash_entries=", set_mphash_entries); From 3685744afa4a2e65a4a509f1b782af98e929b83f Mon Sep 17 00:00:00 2001 From: chen zhang Date: Mon, 15 Dec 2025 19:15:00 +0800 Subject: [PATCH 11/28] chardev: Switch to guard(mutex) and __free(kfree) Instead of using the 'goto label; mutex_unlock()' pattern use 'guard(mutex)' which will release the mutex when it goes out of scope. Use the __free(kfree) cleanup to replace instances of manually calling kfree(). Also make some code path simplifications that this allows. 
Signed-off-by: chen zhang Link: https://patch.msgid.link/20251215111500.159243-1-chenzhang@kylinos.cn Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/char_dev.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/fs/char_dev.c b/fs/char_dev.c index 84a5a0699373..bf7b32650e54 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -97,7 +98,8 @@ static struct char_device_struct * __register_chrdev_region(unsigned int major, unsigned int baseminor, int minorct, const char *name) { - struct char_device_struct *cd, *curr, *prev = NULL; + struct char_device_struct *cd __free(kfree) = NULL; + struct char_device_struct *curr, *prev = NULL; int ret; int i; @@ -117,14 +119,14 @@ __register_chrdev_region(unsigned int major, unsigned int baseminor, if (cd == NULL) return ERR_PTR(-ENOMEM); - mutex_lock(&chrdevs_lock); + guard(mutex)(&chrdevs_lock); if (major == 0) { ret = find_dynamic_major(); if (ret < 0) { pr_err("CHRDEV \"%s\" dynamic allocation region is full\n", name); - goto out; + return ERR_PTR(ret); } major = ret; } @@ -144,7 +146,7 @@ __register_chrdev_region(unsigned int major, unsigned int baseminor, if (curr->baseminor >= baseminor + minorct) break; - goto out; + return ERR_PTR(ret); } cd->major = major; @@ -160,12 +162,7 @@ __register_chrdev_region(unsigned int major, unsigned int baseminor, prev->next = cd; } - mutex_unlock(&chrdevs_lock); - return cd; -out: - mutex_unlock(&chrdevs_lock); - kfree(cd); - return ERR_PTR(ret); + return_ptr(cd); } static struct char_device_struct * From 0f166bf1d6d82701cc1d94445cc2a9107d1790df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 23 Dec 2025 08:00:39 +0100 Subject: [PATCH 12/28] select: store end_time as timespec64 in restart block MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Storing the end time seconds as 'unsigned long' can lead to 
truncation on 32-bit architectures if assigned from the 64-bit timespec64::tv_sec. As the select() core uses timespec64 consistently, also use that in the restart block. This also allows the simplification of the accessors. Signed-off-by: Thomas Weißschuh Link: https://patch.msgid.link/20251223-restart-block-expiration-v2-1-8e33e5df7359@linutronix.de Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/select.c | 12 ++++-------- include/linux/restart_block.h | 4 ++-- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/fs/select.c b/fs/select.c index 65019b8ba3f7..78a1508c84d3 100644 --- a/fs/select.c +++ b/fs/select.c @@ -1038,14 +1038,11 @@ static long do_restart_poll(struct restart_block *restart_block) { struct pollfd __user *ufds = restart_block->poll.ufds; int nfds = restart_block->poll.nfds; - struct timespec64 *to = NULL, end_time; + struct timespec64 *to = NULL; int ret; - if (restart_block->poll.has_timeout) { - end_time.tv_sec = restart_block->poll.tv_sec; - end_time.tv_nsec = restart_block->poll.tv_nsec; - to = &end_time; - } + if (restart_block->poll.has_timeout) + to = &restart_block->poll.end_time; ret = do_sys_poll(ufds, nfds, to); @@ -1077,8 +1074,7 @@ SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds, restart_block->poll.nfds = nfds; if (timeout_msecs >= 0) { - restart_block->poll.tv_sec = end_time.tv_sec; - restart_block->poll.tv_nsec = end_time.tv_nsec; + restart_block->poll.end_time = end_time; restart_block->poll.has_timeout = 1; } else restart_block->poll.has_timeout = 0; diff --git a/include/linux/restart_block.h b/include/linux/restart_block.h index 67d2bf579942..9b262109726d 100644 --- a/include/linux/restart_block.h +++ b/include/linux/restart_block.h @@ -6,6 +6,7 @@ #define __LINUX_RESTART_BLOCK_H #include +#include #include struct __kernel_timespec; @@ -50,8 +51,7 @@ struct restart_block { struct pollfd __user *ufds; int nfds; int has_timeout; - unsigned long tv_sec; - unsigned long tv_nsec; + 
struct timespec64 end_time; } poll; }; }; From 729d015ab230d1d6debd69744c6e0fb70c16a779 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Mon, 29 Dec 2025 13:57:51 +0100 Subject: [PATCH 13/28] fs: only assert on LOOKUP_RCU when built with CONFIG_DEBUG_VFS Calls to the 2 modified routines are explicitly gated with checks for the flag, so there is no use for this in production kernels. Signed-off-by: Mateusz Guzik Link: https://patch.msgid.link/20251229125751.826050-1-mjguzik@gmail.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/namei.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index bf0f66f0e9b9..280c69e8fb99 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -881,7 +881,7 @@ static bool try_to_unlazy(struct nameidata *nd) { struct dentry *parent = nd->path.dentry; - BUG_ON(!(nd->flags & LOOKUP_RCU)); + VFS_BUG_ON(!(nd->flags & LOOKUP_RCU)); if (unlikely(nd->depth && !legitimize_links(nd))) goto out1; @@ -916,7 +916,8 @@ out: static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry) { int res; - BUG_ON(!(nd->flags & LOOKUP_RCU)); + + VFS_BUG_ON(!(nd->flags & LOOKUP_RCU)); if (unlikely(nd->depth && !legitimize_links(nd))) goto out2; From a6b9f5b2f04bd7809cd72c5d33af944758c00ab1 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 5 Jan 2026 07:10:27 -0800 Subject: [PATCH 14/28] fs/namei: Remove redundant DCACHE_MANAGED_DENTRY check in __follow_mount_rcu The check for DCACHE_MANAGED_DENTRY at the start of __follow_mount_rcu() is redundant because the only caller (handle_mounts) already verifies d_managed(dentry) before calling this function, so dentry in __follow_mount_rcu() always has DCACHE_MANAGED_DENTRY set. This early-out optimization never fires in practice - but it is marked as likely(). This was detected with branch profiling, which shows 100% misprediction in this likely.
Remove the whole if clause instead of removing the likely, given we know for sure that dentry is not DCACHE_MANAGED_DENTRY. Signed-off-by: Breno Leitao Link: https://patch.msgid.link/20260105-dcache-v1-1-f0d904b4a7c2@debian.org Acked-by: Al Viro Signed-off-by: Christian Brauner --- fs/namei.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 280c69e8fb99..e0d5ebdf43a3 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1624,9 +1624,6 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path) struct dentry *dentry = path->dentry; unsigned int flags = dentry->d_flags; - if (likely(!(flags & DCACHE_MANAGED_DENTRY))) - return true; - if (unlikely(nd->flags & LOOKUP_NO_XDEV)) return false; From b0f5804b41789f99ecf303a48fc0266dc3e24b0e Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Fri, 19 Dec 2025 09:46:19 +0700 Subject: [PATCH 15/28] fs: Describe @isnew parameter in ilookup5_nowait() Sphinx reports kernel-doc warning: WARNING: ./fs/inode.c:1607 function parameter 'isnew' not described in 'ilookup5_nowait' Describe the parameter. Fixes: a27628f4363435 ("fs: rework I_NEW handling to operate without fences") Signed-off-by: Bagas Sanjaya Link: https://patch.msgid.link/20251219024620.22880-2-bagasdotme@gmail.com Reviewed-by: Jeff Layton Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/inode.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/inode.c b/fs/inode.c index a6df537eb856..b986da098e87 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1593,6 +1593,9 @@ EXPORT_SYMBOL(igrab); * @hashval: hash value (usually inode number) to search for * @test: callback used for comparisons between inodes * @data: opaque data pointer to pass to @test + * @isnew: return argument telling whether I_NEW was set when + * the inode was found in hash (the caller needs to + * wait for I_NEW to clear) * * Search for the inode specified by @hashval and @data in the inode cache. 
* If the inode is in the cache, the inode is returned with an incremented From ba4c74f80ef39bb5a387dd3b13422199515efec0 Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Fri, 19 Dec 2025 09:46:20 +0700 Subject: [PATCH 16/28] VFS: fix __start_dirop() kernel-doc warnings Sphinx reports kernel-doc warnings: WARNING: ./fs/namei.c:2853 function parameter 'state' not described in '__start_dirop' WARNING: ./fs/namei.c:2853 expecting prototype for start_dirop(). Prototype was for __start_dirop() instead Fix them up. Fixes: ff7c4ea11a05c8 ("VFS: add start_creating_killable() and start_removing_killable()") Signed-off-by: Bagas Sanjaya Link: https://patch.msgid.link/20251219024620.22880-3-bagasdotme@gmail.com Reviewed-by: Jeff Layton Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/namei.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/namei.c b/fs/namei.c index e0d5ebdf43a3..4e3a5fd370a8 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2834,10 +2834,11 @@ static int filename_parentat(int dfd, struct filename *name, } /** - * start_dirop - begin a create or remove dirop, performing locking and lookup + * __start_dirop - begin a create or remove dirop, performing locking and lookup + * @parent: the dentry of the parent in which the operation will occur * @name: a qstr holding the name within that parent * @lookup_flags: intent and other lookup flags. + * @state: task state bitmask * * The lookup is performed and necessary locks are taken so that, on success, * the returned dentry can be operated on safely. From 6784f274722559c0cdaaa418bc8b7b1d61c314f9 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Wed, 7 Jan 2026 06:06:36 -0800 Subject: [PATCH 17/28] device_cgroup: remove branch hint after code refactor commit 4ef4ac360101 ("device_cgroup: avoid access to ->i_rdev in the common case in devcgroup_inode_permission()") reordered the checks in devcgroup_inode_permission() to check the inode mode before checking i_rdev, for better cache behavior.
However, the likely() annotation on the i_rdev check was not updated to reflect the new code flow. Originally, when i_rdev was checked first, likely(!inode->i_rdev) made sense because most inodes were(?) regular files/directories, thus i_rdev == 0. After the reorder, by the time we reach the i_rdev check, we have already confirmed the inode IS a block or character device. Block and character special files are precisely defined by having a device number (i_rdev), so !inode->i_rdev is now the rare edge case, not the common case. Branch profiling confirmed this is 100% mispredicted: correct incorrect % Function File Line ------- --------- - -------- ---- ---- 0 2631904 100 devcgroup_inode_permission device_cgroup.h 24 Remove likely() to avoid giving the wrong hint to the CPU. Fixes: 4ef4ac360101 ("device_cgroup: avoid access to ->i_rdev in the common case in devcgroup_inode_permission()") Signed-off-by: Breno Leitao Link: https://patch.msgid.link/20260107-likely_device-v1-1-0c55f83a7e47@debian.org Reviewed-by: Mateusz Guzik Signed-off-by: Christian Brauner --- include/linux/device_cgroup.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/device_cgroup.h b/include/linux/device_cgroup.h index 0864773a57e8..822085bc2d20 100644 --- a/include/linux/device_cgroup.h +++ b/include/linux/device_cgroup.h @@ -21,7 +21,7 @@ static inline int devcgroup_inode_permission(struct inode *inode, int mask) if (likely(!S_ISBLK(inode->i_mode) && !S_ISCHR(inode->i_mode))) return 0; - if (likely(!inode->i_rdev)) + if (!inode->i_rdev) return 0; if (S_ISBLK(inode->i_mode)) From 46329a9dd74bd12e92fb7cc8afe70dad32875758 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 6 Jan 2026 09:38:22 -0500 Subject: [PATCH 18/28] acct(2): begin the deprecation of legacy BSD process accounting As Christian points out [1], even though it's privileged, this interface has a lot of footguns. There are better options these days (e.g. 
eBPF), so it would be good to start discouraging its use and mark it as deprecated. [1]: https://lore.kernel.org/linux-fsdevel/20250212-giert-spannend-8893f1eaba7d@brauner/ Signed-off-by: Jeff Layton Link: https://patch.msgid.link/20260106-bsd-acct-v1-1-d15564b52c83@kernel.org Signed-off-by: Christian Brauner --- init/Kconfig | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index fa79feb8fe57..160c1c4ef253 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -624,8 +624,9 @@ config SCHED_HW_PRESSURE arch_update_hw_pressure() and arch_scale_thermal_pressure(). config BSD_PROCESS_ACCT - bool "BSD Process Accounting" + bool "BSD Process Accounting (DEPRECATED)" depends on MULTIUSER + default n help If you say Y here, a user level program will be able to instruct the kernel (via a special system call) to write process accounting @@ -635,7 +636,9 @@ config BSD_PROCESS_ACCT command name, memory usage, controlling terminal etc. (the complete list is in the struct acct in ). It is up to the user level program to do useful things with this - information. This is generally a good idea, so say Y. + information. This mechanism is antiquated and has significant + scalability issues. You probably want to use eBPF instead. Say + N unless you really need this. config BSD_PROCESS_ACCT_V3 bool "BSD Process Accounting version 3 file format" From 5e7fa6bfa9b5ced6868fc652d5c40fe0eac154d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Mon, 12 Jan 2026 22:51:24 -0300 Subject: [PATCH 19/28] exportfs: Fix kernel-doc output for get_name() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Without a space between %NAME_MAX and the plus sign, kernel-doc will output ``NAME_MAX``+1, which escapes the last backtick and makes Sphinx format a much larger string as monospaced text.
Signed-off-by: André Almeida Link: https://patch.msgid.link/20260112-tonyk-fs_uuid-v1-1-acc1889de772@igalia.com Reviewed-by: Chuck Lever Signed-off-by: Christian Brauner --- include/linux/exportfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index f0cf2714ec52..599ea86363e1 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -234,7 +234,7 @@ struct handle_to_path_ctx { * get_name: * @get_name should find a name for the given @child in the given @parent * directory. The name should be stored in the @name (with the - * understanding that it is already pointing to a %NAME_MAX+1 sized + * understanding that it is already pointing to a %NAME_MAX + 1 sized * buffer. get_name() should return %0 on success, a negative error code * or error. @get_name will be called without @parent->i_rwsem held. * From fc76b5968a435894062ad4160c2e81c32cc4972e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Mon, 12 Jan 2026 22:51:25 -0300 Subject: [PATCH 20/28] exportfs: Mark struct export_operations functions at kernel-doc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adding a `@` before the function names makes them recognizable as kernel-docs, so they get correctly rendered in the documentation. Even if they are already marked with `@` in the short one-line summary, the kernel-docs will correctly favor the more detailed definition here.
Signed-off-by: André Almeida Link: https://patch.msgid.link/20260112-tonyk-fs_uuid-v1-2-acc1889de772@igalia.com Reviewed-by: Chuck Lever Signed-off-by: Christian Brauner --- include/linux/exportfs.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index 599ea86363e1..bed370b9f906 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -203,7 +203,7 @@ struct handle_to_path_ctx { * See Documentation/filesystems/nfs/exporting.rst for details on how to use * this interface correctly. * - * encode_fh: + * @encode_fh: * @encode_fh should store in the file handle fragment @fh (using at most * @max_len bytes) information that can be used by @decode_fh to recover the * file referred to by the &struct dentry @de. If @flag has CONNECTABLE bit @@ -215,7 +215,7 @@ struct handle_to_path_ctx { * greater than @max_len*4 bytes). On error @max_len contains the minimum * size(in 4 byte unit) needed to encode the file handle. * - * fh_to_dentry: + * @fh_to_dentry: * @fh_to_dentry is given a &struct super_block (@sb) and a file handle * fragment (@fh, @fh_len). It should return a &struct dentry which refers * to the same file that the file handle fragment refers to. If it cannot, @@ -227,29 +227,29 @@ struct handle_to_path_ctx { * created with d_alloc_root. The caller can then find any other extant * dentries by following the d_alias links. * - * fh_to_parent: + * @fh_to_parent: * Same as @fh_to_dentry, except that it returns a pointer to the parent * dentry if it was encoded into the filehandle fragment by @encode_fh. * - * get_name: + * @get_name: * @get_name should find a name for the given @child in the given @parent * directory. The name should be stored in the @name (with the * understanding that it is already pointing to a %NAME_MAX + 1 sized * buffer. get_name() should return %0 on success, a negative error code * or error. 
@get_name will be called without @parent->i_rwsem held. * - * get_parent: + * @get_parent: * @get_parent should find the parent directory for the given @child which * is also a directory. In the event that it cannot be found, or storage * space cannot be allocated, a %ERR_PTR should be returned. * - * permission: + * @permission: * Allow filesystems to specify a custom permission function. * - * open: + * @open: * Allow filesystems to specify a custom open function. * - * commit_metadata: + * @commit_metadata: * @commit_metadata should commit metadata changes to stable storage. * * Locking rules: From 7a6f811e2c06d656996776771f0498df129a0cc2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Mon, 12 Jan 2026 22:51:26 -0300 Subject: [PATCH 21/28] exportfs: Complete kernel-doc for struct export_operations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Write down the missing members definitions for struct export_operations, using as a reference the commit messages that created the members. Signed-off-by: André Almeida Link: https://patch.msgid.link/20260112-tonyk-fs_uuid-v1-3-acc1889de772@igalia.com Reviewed-by: Chuck Lever Signed-off-by: Christian Brauner --- include/linux/exportfs.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index bed370b9f906..262e24d83313 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -201,7 +201,7 @@ struct handle_to_path_ctx { * @commit_metadata: commit metadata changes to stable storage * * See Documentation/filesystems/nfs/exporting.rst for details on how to use - * this interface correctly. + * this interface correctly and the definition of the flags. 
* * @encode_fh: * @encode_fh should store in the file handle fragment @fh (using at most @@ -252,6 +252,19 @@ struct handle_to_path_ctx { * @commit_metadata: * @commit_metadata should commit metadata changes to stable storage. * + * @get_uuid: + * Get a filesystem unique signature exposed to clients. + * + * @map_blocks: + * Map and, if necessary, allocate blocks for a layout. + * + * @commit_blocks: + * Commit blocks in a layout once the client is done with them. + * + * @flags: + * Allows the filesystem to communicate to nfsd that it may want to do things + * differently when dealing with it. + * * Locking rules: * get_parent is called with child->d_inode->i_rwsem down * get_name is not (which is possibly inconsistent) From 1219e0feaefc9697f738b223540e8e8906291cb3 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 9 Jan 2026 22:15:36 +0100 Subject: [PATCH 22/28] fs: move initializing f_mode before file_ref_init() The comment above file_ref_init() says: "We're SLAB_TYPESAFE_BY_RCU so initialize f_ref last." but file_set_fsnotify_mode() was added after file_ref_init(). Move it right after setting f_mode, where it makes more sense. Fixes: 711f9b8fbe4f4 ("fsnotify: disable pre-content and permission events by default") Signed-off-by: Amir Goldstein Link: https://patch.msgid.link/20260109211536.3565697-1-amir73il@gmail.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/file_table.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/file_table.c b/fs/file_table.c index cd4a3db4659a..34244fccf2ed 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -176,6 +176,11 @@ static int init_file(struct file *f, int flags, const struct cred *cred) f->f_flags = flags; f->f_mode = OPEN_FMODE(flags); + /* + * Disable permission and pre-content events for all files by default. + * They may be enabled later by fsnotify_open_perm_and_set_mode(). 
+ */ + file_set_fsnotify_mode(f, FMODE_NONOTIFY_PERM); f->f_op = NULL; f->f_mapping = NULL; @@ -197,11 +202,6 @@ static int init_file(struct file *f, int flags, const struct cred *cred) * refcount bumps we should reinitialize the reused file first. */ file_ref_init(&f->f_ref, 1); - /* - * Disable permission and pre-content events for all files by default. - * They may be enabled later by fsnotify_open_perm_and_set_mode(). - */ - file_set_fsnotify_mode(f, FMODE_NONOTIFY_PERM); return 0; } From f9a6a3fec23a852851049847f2ba3be6eb6eb0b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Mon, 12 Jan 2026 22:51:27 -0300 Subject: [PATCH 23/28] docs: exportfs: Use source code struct documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of duplicating struct export_operations documentation in both ReST file and in the C source code, just use the kernel-doc in the docs. While here, make the sentence preceding the paragraph less redundant. Signed-off-by: André Almeida Link: https://patch.msgid.link/20260112-tonyk-fs_uuid-v1-4-acc1889de772@igalia.com Reviewed-by: Chuck Lever Signed-off-by: Christian Brauner --- Documentation/filesystems/nfs/exporting.rst | 40 +++------------------ 1 file changed, 4 insertions(+), 36 deletions(-) diff --git a/Documentation/filesystems/nfs/exporting.rst b/Documentation/filesystems/nfs/exporting.rst index de64d2d002a2..a01d9b9b5bc3 100644 --- a/Documentation/filesystems/nfs/exporting.rst +++ b/Documentation/filesystems/nfs/exporting.rst @@ -119,43 +119,11 @@ For a filesystem to be exportable it must: A file system implementation declares that instances of the filesystem are exportable by setting the s_export_op field in the struct -super_block. This field must point to a "struct export_operations" -struct which has the following members: +super_block. 
This field must point to a struct export_operations +which has the following members: - encode_fh (mandatory) - Takes a dentry and creates a filehandle fragment which may later be used - to find or create a dentry for the same object. - - fh_to_dentry (mandatory) - Given a filehandle fragment, this should find the implied object and - create a dentry for it (possibly with d_obtain_alias). - - fh_to_parent (optional but strongly recommended) - Given a filehandle fragment, this should find the parent of the - implied object and create a dentry for it (possibly with - d_obtain_alias). May fail if the filehandle fragment is too small. - - get_parent (optional but strongly recommended) - When given a dentry for a directory, this should return a dentry for - the parent. Quite possibly the parent dentry will have been allocated - by d_alloc_anon. The default get_parent function just returns an error - so any filehandle lookup that requires finding a parent will fail. - ->lookup("..") is *not* used as a default as it can leave ".." entries - in the dcache which are too messy to work with. - - get_name (optional) - When given a parent dentry and a child dentry, this should find a name - in the directory identified by the parent dentry, which leads to the - object identified by the child dentry. If no get_name function is - supplied, a default implementation is provided which uses vfs_readdir - to find potential names, and matches inode numbers to find the correct - match. - - flags - Some filesystems may need to be handled differently than others. The - export_operations struct also includes a flags field that allows the - filesystem to communicate such information to nfsd. See the Export - Operations Flags section below for more explanation. +.. kernel-doc:: include/linux/exportfs.h + :identifiers: struct export_operations A filehandle fragment consists of an array of 1 or more 4byte words, together with a one byte "type". 
From 589cff4975afe1a4eaaa1d961652f50b1628d78d Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Thu, 8 Jan 2026 11:58:56 +0000 Subject: [PATCH 24/28] fs: add for 'init_fs' The init_fs symbol is defined in but was not included in fs/fs_struct.c so fix by adding the include. Fixes the following sparse warning: fs/fs_struct.c:150:18: warning: symbol 'init_fs' was not declared. Should it be static? Fixes: 3e93cd671813e ("Take fs_struct handling to new file") Signed-off-by: Ben Dooks Link: https://patch.msgid.link/20260108115856.238027-1-ben.dooks@codethink.co.uk Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/fs_struct.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/fs_struct.c b/fs/fs_struct.c index b8c46c5a38a0..394875d06fd6 100644 --- a/fs/fs_struct.c +++ b/fs/fs_struct.c @@ -6,6 +6,7 @@ #include #include #include +#include #include "internal.h" /* From 7c0225003317bd6e2107784a83c7099de8b2b28c Mon Sep 17 00:00:00 2001 From: Yuto Ohnuki Date: Mon, 12 Jan 2026 18:14:43 +0000 Subject: [PATCH 25/28] fs: improve dump_inode() to safely access inode fields Use get_kernel_nofault() to safely access inode and related structures (superblock, file_system_type) to avoid crashing when the inode pointer is invalid. This allows the same pattern as dump_mapping(). Note: The original access method for i_state and i_count is preserved, as get_kernel_nofault() is unnecessary once the inode structure is verified accessible. 
Reviewed-by: Jan Kara Signed-off-by: Yuto Ohnuki Link: https://patch.msgid.link/20260112181443.81286-1-ytohnuki@amazon.com Reviewed-by: Mateusz Guzik Signed-off-by: Christian Brauner --- fs/inode.c | 47 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 34 insertions(+), 13 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index b986da098e87..d317637dc3b4 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -2984,24 +2984,45 @@ umode_t mode_strip_sgid(struct mnt_idmap *idmap, EXPORT_SYMBOL(mode_strip_sgid); #ifdef CONFIG_DEBUG_VFS -/* - * Dump an inode. +/** + * dump_inode - dump an inode. + * @inode: inode to dump + * @reason: reason for dumping * - * TODO: add a proper inode dumping routine, this is a stub to get debug off the - * ground. - * - * TODO: handle getting to fs type with get_kernel_nofault()? - * See dump_mapping() above. + * If inode is an invalid pointer, we don't want to crash accessing it, + * so probe everything depending on it carefully with get_kernel_nofault(). 
*/ void dump_inode(struct inode *inode, const char *reason) { - struct super_block *sb = inode->i_sb; + struct super_block *sb; + struct file_system_type *s_type; + const char *fs_name_ptr; + char fs_name[32] = {}; + umode_t mode; + unsigned short opflags; + unsigned int flags; + unsigned int state; + int count; - pr_warn("%s encountered for inode %px\n" - "fs %s mode %ho opflags 0x%hx flags 0x%x state 0x%x count %d\n", - reason, inode, sb->s_type->name, inode->i_mode, inode->i_opflags, - inode->i_flags, inode_state_read_once(inode), atomic_read(&inode->i_count)); + if (get_kernel_nofault(sb, &inode->i_sb) || + get_kernel_nofault(mode, &inode->i_mode) || + get_kernel_nofault(opflags, &inode->i_opflags) || + get_kernel_nofault(flags, &inode->i_flags)) { + pr_warn("%s: unreadable inode:%px\n", reason, inode); + return; + } + + state = inode_state_read_once(inode); + count = atomic_read(&inode->i_count); + + if (!sb || + get_kernel_nofault(s_type, &sb->s_type) || !s_type || + get_kernel_nofault(fs_name_ptr, &s_type->name) || !fs_name_ptr || + strncpy_from_kernel_nofault(fs_name, fs_name_ptr, sizeof(fs_name) - 1) < 0) + strscpy(fs_name, ""); + + pr_warn("%s: inode:%px fs:%s mode:%ho opflags:%#x flags:%#x state:%#x count:%d\n", + reason, inode, fs_name, mode, opflags, flags, state, count); } - EXPORT_SYMBOL(dump_inode); #endif From aaf76839616a3cff7bfff6a888e1762bc1d0c235 Mon Sep 17 00:00:00 2001 From: David Disseldorp Date: Thu, 15 Jan 2026 00:50:52 +1100 Subject: [PATCH 26/28] initramfs_test: kunit test for cpio.filesize > PATH_MAX initramfs unpack skips over cpio entries where namesize > PATH_MAX, instead of returning an error. Add coverage for this behaviour. 
Signed-off-by: David Disseldorp Link: https://patch.msgid.link/20260114135051.4943-2-ddiss@suse.de Signed-off-by: Christian Brauner --- init/initramfs_test.c | 48 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/init/initramfs_test.c b/init/initramfs_test.c index 5d2db455e60c..beb6e3cf7808 100644 --- a/init/initramfs_test.c +++ b/init/initramfs_test.c @@ -447,6 +447,53 @@ out: kfree(tbufs); } +static void __init initramfs_test_fname_path_max(struct kunit *test) +{ + char *err; + size_t len; + struct kstat st0, st1; + char fdata[] = "this file data will not be unpacked"; + struct test_fname_path_max { + char fname_oversize[PATH_MAX + 1]; + char fname_ok[PATH_MAX]; + char cpio_src[(CPIO_HDRLEN + PATH_MAX + 3 + sizeof(fdata)) * 2]; + } *tbufs = kzalloc(sizeof(struct test_fname_path_max), GFP_KERNEL); + struct initramfs_test_cpio c[] = { { + .magic = "070701", + .ino = 1, + .mode = S_IFDIR | 0777, + .nlink = 1, + .namesize = sizeof(tbufs->fname_oversize), + .fname = tbufs->fname_oversize, + .filesize = sizeof(fdata), + .data = fdata, + }, { + .magic = "070701", + .ino = 2, + .mode = S_IFDIR | 0777, + .nlink = 1, + .namesize = sizeof(tbufs->fname_ok), + .fname = tbufs->fname_ok, + } }; + + memset(tbufs->fname_oversize, '/', sizeof(tbufs->fname_oversize) - 1); + memset(tbufs->fname_ok, '/', sizeof(tbufs->fname_ok) - 1); + memcpy(tbufs->fname_oversize, "fname_oversize", + sizeof("fname_oversize") - 1); + memcpy(tbufs->fname_ok, "fname_ok", sizeof("fname_ok") - 1); + len = fill_cpio(c, ARRAY_SIZE(c), tbufs->cpio_src); + + /* unpack skips over fname_oversize instead of returning an error */ + err = unpack_to_rootfs(tbufs->cpio_src, len); + KUNIT_EXPECT_NULL(test, err); + + KUNIT_EXPECT_EQ(test, init_stat("fname_oversize", &st0, 0), -ENOENT); + KUNIT_EXPECT_EQ(test, init_stat("fname_ok", &st1, 0), 0); + KUNIT_EXPECT_EQ(test, init_rmdir("fname_ok"), 0); + + kfree(tbufs); +} + /* * The kunit_case/_suite struct cannot be marked as 
__initdata as this will be * used in debugfs to retrieve results after test has run. @@ -459,6 +506,7 @@ static struct kunit_case __refdata initramfs_test_cases[] = { KUNIT_CASE(initramfs_test_hardlink), KUNIT_CASE(initramfs_test_many), KUNIT_CASE(initramfs_test_fname_pad), + KUNIT_CASE(initramfs_test_fname_path_max), {}, }; From 88ec797c468097a8ce97694ed11ea9c982598ec0 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 14 Jan 2026 10:47:16 +0100 Subject: [PATCH 27/28] fs: make insert_inode_locked() wait for inode destruction This is the only routine which skipped instead of waiting. The current behavior is arguably a bug as it results in a corner case where the inode hash can have *two* matching inodes, one of which is on its way out. Ironing out this difference is an incremental step towards sanitizing the API. Signed-off-by: Mateusz Guzik Link: https://patch.msgid.link/20260114094717.236202-1-mjguzik@gmail.com Signed-off-by: Christian Brauner --- fs/inode.c | 41 ++++++++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index d317637dc3b4..de398c63fb72 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1028,19 +1028,20 @@ long prune_icache_sb(struct super_block *sb, struct shrink_control *sc) return freed; } -static void __wait_on_freeing_inode(struct inode *inode, bool is_inode_hash_locked); +static void __wait_on_freeing_inode(struct inode *inode, bool hash_locked, bool rcu_locked); + /* * Called with the inode lock held.
*/ static struct inode *find_inode(struct super_block *sb, struct hlist_head *head, int (*test)(struct inode *, void *), - void *data, bool is_inode_hash_locked, + void *data, bool hash_locked, bool *isnew) { struct inode *inode = NULL; - if (is_inode_hash_locked) + if (hash_locked) lockdep_assert_held(&inode_hash_lock); else lockdep_assert_not_held(&inode_hash_lock); @@ -1054,7 +1055,7 @@ repeat: continue; spin_lock(&inode->i_lock); if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE)) { - __wait_on_freeing_inode(inode, is_inode_hash_locked); + __wait_on_freeing_inode(inode, hash_locked, true); goto repeat; } if (unlikely(inode_state_read(inode) & I_CREATING)) { @@ -1078,11 +1079,11 @@ repeat: */ static struct inode *find_inode_fast(struct super_block *sb, struct hlist_head *head, unsigned long ino, - bool is_inode_hash_locked, bool *isnew) + bool hash_locked, bool *isnew) { struct inode *inode = NULL; - if (is_inode_hash_locked) + if (hash_locked) lockdep_assert_held(&inode_hash_lock); else lockdep_assert_not_held(&inode_hash_lock); @@ -1096,7 +1097,7 @@ repeat: continue; spin_lock(&inode->i_lock); if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE)) { - __wait_on_freeing_inode(inode, is_inode_hash_locked); + __wait_on_freeing_inode(inode, hash_locked, true); goto repeat; } if (unlikely(inode_state_read(inode) & I_CREATING)) { @@ -1832,16 +1833,13 @@ int insert_inode_locked(struct inode *inode) while (1) { struct inode *old = NULL; spin_lock(&inode_hash_lock); +repeat: hlist_for_each_entry(old, head, i_hash) { if (old->i_ino != ino) continue; if (old->i_sb != sb) continue; spin_lock(&old->i_lock); - if (inode_state_read(old) & (I_FREEING | I_WILL_FREE)) { - spin_unlock(&old->i_lock); - continue; - } break; } if (likely(!old)) { @@ -1852,6 +1850,11 @@ int insert_inode_locked(struct inode *inode) spin_unlock(&inode_hash_lock); return 0; } + if (inode_state_read(old) & (I_FREEING | I_WILL_FREE)) { + __wait_on_freeing_inode(old, true, false); + old = NULL; + 
goto repeat; + } if (unlikely(inode_state_read(old) & I_CREATING)) { spin_unlock(&old->i_lock); spin_unlock(&inode_hash_lock); @@ -2504,16 +2507,18 @@ EXPORT_SYMBOL(inode_needs_sync); * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list * will DTRT. */ -static void __wait_on_freeing_inode(struct inode *inode, bool is_inode_hash_locked) +static void __wait_on_freeing_inode(struct inode *inode, bool hash_locked, bool rcu_locked) { struct wait_bit_queue_entry wqe; struct wait_queue_head *wq_head; + VFS_BUG_ON(!hash_locked && !rcu_locked); + /* * Handle racing against evict(), see that routine for more details. */ if (unlikely(inode_unhashed(inode))) { - WARN_ON(is_inode_hash_locked); + WARN_ON(hash_locked); spin_unlock(&inode->i_lock); return; } @@ -2521,14 +2526,16 @@ static void __wait_on_freeing_inode(struct inode *inode, bool is_inode_hash_lock wq_head = inode_bit_waitqueue(&wqe, inode, __I_NEW); prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE); spin_unlock(&inode->i_lock); - rcu_read_unlock(); - if (is_inode_hash_locked) + if (rcu_locked) + rcu_read_unlock(); + if (hash_locked) spin_unlock(&inode_hash_lock); schedule(); finish_wait(wq_head, &wqe.wq_entry); - if (is_inode_hash_locked) + if (hash_locked) spin_lock(&inode_hash_lock); - rcu_read_lock(); + if (rcu_locked) + rcu_read_lock(); } static __initdata unsigned long ihash_entries; From 6cbfdf89470ef3c2110f376a507d135e7a7a7378 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 15 Jan 2026 13:23:40 +0100 Subject: [PATCH 28/28] posix_acl: make posix_acl_to_xattr() alloc the buffer Without exception all callers do that. So move the allocation into the helper. This reduces boilerplate and removes unnecessary error checking.
Signed-off-by: Miklos Szeredi Link: https://patch.msgid.link/20260115122341.556026-1-mszeredi@redhat.com Signed-off-by: Christian Brauner --- fs/9p/acl.c | 16 ++--------- fs/btrfs/acl.c | 10 ++----- fs/ceph/acl.c | 50 +++++++++++++++------------------ fs/fuse/acl.c | 12 +++----- fs/gfs2/acl.c | 13 ++------- fs/jfs/acl.c | 9 ++---- fs/ntfs3/xattr.c | 6 +--- fs/orangefs/acl.c | 8 +----- fs/posix_acl.c | 21 +++++++------- include/linux/posix_acl_xattr.h | 5 ++-- 10 files changed, 53 insertions(+), 97 deletions(-) diff --git a/fs/9p/acl.c b/fs/9p/acl.c index 633da5e37299..ae7e7cf7523a 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -167,17 +167,11 @@ int v9fs_iop_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, if (retval) goto err_out; - size = posix_acl_xattr_size(acl->a_count); - - value = kzalloc(size, GFP_NOFS); + value = posix_acl_to_xattr(&init_user_ns, acl, &size, GFP_NOFS); if (!value) { retval = -ENOMEM; goto err_out; } - - retval = posix_acl_to_xattr(&init_user_ns, acl, value, size); - if (retval < 0) - goto err_out; } /* @@ -257,13 +251,10 @@ static int v9fs_set_acl(struct p9_fid *fid, int type, struct posix_acl *acl) return 0; /* Set a setxattr request to server */ - size = posix_acl_xattr_size(acl->a_count); - buffer = kmalloc(size, GFP_KERNEL); + buffer = posix_acl_to_xattr(&init_user_ns, acl, &size, GFP_KERNEL); if (!buffer) return -ENOMEM; - retval = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - if (retval < 0) - goto err_free_out; + switch (type) { case ACL_TYPE_ACCESS: name = XATTR_NAME_POSIX_ACL_ACCESS; @@ -275,7 +266,6 @@ static int v9fs_set_acl(struct p9_fid *fid, int type, struct posix_acl *acl) BUG(); } retval = v9fs_fid_xattr_set(fid, name, buffer, size, 0); -err_free_out: kfree(buffer); return retval; } diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index c336e2ab7f8a..e55b686fe1ab 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -57,7 +57,8 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu) int 
__btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode, struct posix_acl *acl, int type) { - int ret, size = 0; + int ret; + size_t size = 0; const char *name; char AUTO_KFREE(value); @@ -77,20 +78,15 @@ int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode, if (acl) { unsigned int nofs_flag; - size = posix_acl_xattr_size(acl->a_count); /* * We're holding a transaction handle, so use a NOFS memory * allocation context to avoid deadlock if reclaim happens. */ nofs_flag = memalloc_nofs_save(); - value = kmalloc(size, GFP_KERNEL); + value = posix_acl_to_xattr(&init_user_ns, acl, &size, GFP_KERNEL); memalloc_nofs_restore(nofs_flag); if (!value) return -ENOMEM; - - ret = posix_acl_to_xattr(&init_user_ns, acl, value, size); - if (ret < 0) - return ret; } if (trans) diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c index 1564eacc253d..85d3dd48b167 100644 --- a/fs/ceph/acl.c +++ b/fs/ceph/acl.c @@ -90,7 +90,8 @@ retry: int ceph_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { - int ret = 0, size = 0; + int ret = 0; + size_t size = 0; const char *name = NULL; char *value = NULL; struct iattr newattrs; @@ -126,16 +127,11 @@ int ceph_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, } if (acl) { - size = posix_acl_xattr_size(acl->a_count); - value = kmalloc(size, GFP_NOFS); + value = posix_acl_to_xattr(&init_user_ns, acl, &size, GFP_NOFS); if (!value) { ret = -ENOMEM; goto out; } - - ret = posix_acl_to_xattr(&init_user_ns, acl, value, size); - if (ret < 0) - goto out_free; } if (new_mode != old_mode) { @@ -172,7 +168,7 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode, struct posix_acl *acl, *default_acl; size_t val_size1 = 0, val_size2 = 0; struct ceph_pagelist *pagelist = NULL; - void *tmp_buf = NULL; + void *tmp_buf1 = NULL, *tmp_buf2 = NULL; int err; err = posix_acl_create(dir, mode, &default_acl, &acl); @@ -192,15 +188,7 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode, if 
(!default_acl && !acl) return 0; - if (acl) - val_size1 = posix_acl_xattr_size(acl->a_count); - if (default_acl) - val_size2 = posix_acl_xattr_size(default_acl->a_count); - err = -ENOMEM; - tmp_buf = kmalloc(max(val_size1, val_size2), GFP_KERNEL); - if (!tmp_buf) - goto out_err; pagelist = ceph_pagelist_alloc(GFP_KERNEL); if (!pagelist) goto out_err; @@ -213,34 +201,39 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode, if (acl) { size_t len = strlen(XATTR_NAME_POSIX_ACL_ACCESS); + + err = -ENOMEM; + tmp_buf1 = posix_acl_to_xattr(&init_user_ns, acl, + &val_size1, GFP_KERNEL); + if (!tmp_buf1) + goto out_err; err = ceph_pagelist_reserve(pagelist, len + val_size1 + 8); if (err) goto out_err; ceph_pagelist_encode_string(pagelist, XATTR_NAME_POSIX_ACL_ACCESS, len); - err = posix_acl_to_xattr(&init_user_ns, acl, - tmp_buf, val_size1); - if (err < 0) - goto out_err; ceph_pagelist_encode_32(pagelist, val_size1); - ceph_pagelist_append(pagelist, tmp_buf, val_size1); + ceph_pagelist_append(pagelist, tmp_buf1, val_size1); } if (default_acl) { size_t len = strlen(XATTR_NAME_POSIX_ACL_DEFAULT); + + err = -ENOMEM; + tmp_buf2 = posix_acl_to_xattr(&init_user_ns, default_acl, + &val_size2, GFP_KERNEL); + if (!tmp_buf2) + goto out_err; err = ceph_pagelist_reserve(pagelist, len + val_size2 + 8); if (err) goto out_err; ceph_pagelist_encode_string(pagelist, XATTR_NAME_POSIX_ACL_DEFAULT, len); - err = posix_acl_to_xattr(&init_user_ns, default_acl, - tmp_buf, val_size2); - if (err < 0) - goto out_err; ceph_pagelist_encode_32(pagelist, val_size2); - ceph_pagelist_append(pagelist, tmp_buf, val_size2); + ceph_pagelist_append(pagelist, tmp_buf2, val_size2); } - kfree(tmp_buf); + kfree(tmp_buf1); + kfree(tmp_buf2); as_ctx->acl = acl; as_ctx->default_acl = default_acl; @@ -250,7 +243,8 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode, out_err: posix_acl_release(acl); posix_acl_release(default_acl); - kfree(tmp_buf); + kfree(tmp_buf1); + kfree(tmp_buf2); if (pagelist) 
ceph_pagelist_release(pagelist); return err; diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c index 8f484b105f13..cbde6ac1add3 100644 --- a/fs/fuse/acl.c +++ b/fs/fuse/acl.c @@ -122,20 +122,16 @@ int fuse_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, * them to be refreshed the next time they are used, * and it also updates i_ctime. */ - size_t size = posix_acl_xattr_size(acl->a_count); + size_t size; void *value; - if (size > PAGE_SIZE) - return -E2BIG; - - value = kmalloc(size, GFP_KERNEL); + value = posix_acl_to_xattr(fc->user_ns, acl, &size, GFP_KERNEL); if (!value) return -ENOMEM; - ret = posix_acl_to_xattr(fc->user_ns, acl, value, size); - if (ret < 0) { + if (size > PAGE_SIZE) { kfree(value); - return ret; + return -E2BIG; } /* diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 443640e6fb9c..a5b60778b91c 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -83,21 +83,14 @@ struct posix_acl *gfs2_get_acl(struct inode *inode, int type, bool rcu) int __gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) { int error; - size_t len; - char *data; + size_t len = 0; + char *data = NULL; const char *name = gfs2_acl_name(type); if (acl) { - len = posix_acl_xattr_size(acl->a_count); - data = kmalloc(len, GFP_NOFS); + data = posix_acl_to_xattr(&init_user_ns, acl, &len, GFP_NOFS); if (data == NULL) return -ENOMEM; - error = posix_acl_to_xattr(&init_user_ns, acl, data, len); - if (error < 0) - goto out; - } else { - data = NULL; - len = 0; } error = __gfs2_xattr_set(inode, name, data, len, 0, GFS2_EATYPE_SYS); diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index 1de3602c98de..16b71a23ff1e 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -61,7 +61,7 @@ static int __jfs_set_acl(tid_t tid, struct inode *inode, int type, { char *ea_name; int rc; - int size = 0; + size_t size = 0; char *value = NULL; switch (type) { @@ -76,16 +76,11 @@ static int __jfs_set_acl(tid_t tid, struct inode *inode, int type, } if (acl) { - size = posix_acl_xattr_size(acl->a_count); - value = 
kmalloc(size, GFP_KERNEL); + value = posix_acl_to_xattr(&init_user_ns, acl, &size, GFP_KERNEL); if (!value) return -ENOMEM; - rc = posix_acl_to_xattr(&init_user_ns, acl, value, size); - if (rc < 0) - goto out; } rc = __jfs_setxattr(tid, inode, ea_name, value, size, 0); -out: kfree(value); if (!rc) diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c index c93df55e98d0..37a69a75ce68 100644 --- a/fs/ntfs3/xattr.c +++ b/fs/ntfs3/xattr.c @@ -641,13 +641,9 @@ static noinline int ntfs_set_acl_ex(struct mnt_idmap *idmap, value = NULL; flags = XATTR_REPLACE; } else { - size = posix_acl_xattr_size(acl->a_count); - value = kmalloc(size, GFP_NOFS); + value = posix_acl_to_xattr(&init_user_ns, acl, &size, GFP_NOFS); if (!value) return -ENOMEM; - err = posix_acl_to_xattr(&init_user_ns, acl, value, size); - if (err < 0) - goto out; flags = 0; } diff --git a/fs/orangefs/acl.c b/fs/orangefs/acl.c index 5aefb705bcc8..a01ef0c1b1bf 100644 --- a/fs/orangefs/acl.c +++ b/fs/orangefs/acl.c @@ -90,14 +90,9 @@ int __orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type) type); if (acl) { - size = posix_acl_xattr_size(acl->a_count); - value = kmalloc(size, GFP_KERNEL); + value = posix_acl_to_xattr(&init_user_ns, acl, &size, GFP_KERNEL); if (!value) return -ENOMEM; - - error = posix_acl_to_xattr(&init_user_ns, acl, value, size); - if (error < 0) - goto out; } gossip_debug(GOSSIP_ACL_DEBUG, @@ -111,7 +106,6 @@ int __orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type) */ error = orangefs_inode_setxattr(inode, name, value, size, 0); -out: kfree(value); if (!error) set_cached_acl(inode, type, acl); diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 768f027c1428..4ef6f9d2b8d6 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -829,19 +829,19 @@ EXPORT_SYMBOL (posix_acl_from_xattr); /* * Convert from in-memory to extended attribute representation. 
*/ -int +void * posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl, - void *buffer, size_t size) + size_t *sizep, gfp_t gfp) { - struct posix_acl_xattr_header *ext_acl = buffer; + struct posix_acl_xattr_header *ext_acl; struct posix_acl_xattr_entry *ext_entry; - int real_size, n; + size_t size; + int n; - real_size = posix_acl_xattr_size(acl->a_count); - if (!buffer) - return real_size; - if (real_size > size) - return -ERANGE; + size = posix_acl_xattr_size(acl->a_count); + ext_acl = kmalloc(size, gfp); + if (!ext_acl) + return NULL; ext_entry = (void *)(ext_acl + 1); ext_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION); @@ -864,7 +864,8 @@ posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl, break; } } - return real_size; + *sizep = size; + return ext_acl; } EXPORT_SYMBOL (posix_acl_to_xattr); diff --git a/include/linux/posix_acl_xattr.h b/include/linux/posix_acl_xattr.h index e86f3b731da2..9e1892525eac 100644 --- a/include/linux/posix_acl_xattr.h +++ b/include/linux/posix_acl_xattr.h @@ -44,8 +44,9 @@ posix_acl_from_xattr(struct user_namespace *user_ns, const void *value, } #endif -int posix_acl_to_xattr(struct user_namespace *user_ns, - const struct posix_acl *acl, void *buffer, size_t size); +extern void *posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl, + size_t *sizep, gfp_t gfp); + static inline const char *posix_acl_xattr_name(int type) { switch (type) {