From b7737c38e7cb611c2fbd87af3b09afeb92c96fe7 Mon Sep 17 00:00:00 2001
From: Kevin Brodsky
Date: Wed, 19 Nov 2025 13:00:16 +0000
Subject: [PATCH 1/5] arm64: mm: Simplify check in arch_kfence_init_pool()

TL;DR: checking force_pte_mapping() in arch_kfence_init_pool() is
sufficient.

Commit ce2b3a50ad92 ("arm64: mm: Don't sleep in
split_kernel_leaf_mapping() when in atomic context") recently added an
arm64 implementation of arch_kfence_init_pool() to ensure that the
KFENCE pool is PTE-mapped. Assuming that the pool was not initialised
early, block splitting is necessary if the linear mapping is not fully
PTE-mapped, in other words if force_pte_mapping() is false.

arch_kfence_init_pool() currently makes another check: whether
BBML2-noabort is supported, i.e. whether we are *able* to split block
mappings. This check is however unnecessary, because force_pte_mapping()
is always true if KFENCE is enabled and BBML2-noabort is not supported.
This must be the case by design, since KFENCE requires PTE-mapped pages
in all cases. We can therefore remove that check.

The situation is different in split_kernel_leaf_mapping(), as that
function is called unconditionally regardless of the configuration. If
BBML2-noabort is not supported, it cannot do anything and bails out. If
force_pte_mapping() is true, there is nothing to do and it also bails
out, but these are independent checks.

Commit 53357f14f924 ("arm64: mm: Tidy up force_pte_mapping()") grouped
these checks into a helper, split_leaf_mapping_possible(). This isn't so
helpful, as only split_kernel_leaf_mapping() should make both checks.
Revert the parts of that commit that introduced the helper,
reintroducing the more accurate comments in split_kernel_leaf_mapping().

Signed-off-by: Kevin Brodsky
Reviewed-by: Ryan Roberts
Signed-off-by: Catalin Marinas
---
 arch/arm64/mm/mmu.c | 33 ++++++++++++++++-----------------
 1 file changed, 16 insertions(+), 17 deletions(-)

diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 9ae7ce00a7ef..8e1d80a7033e 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -767,18 +767,6 @@ static inline bool force_pte_mapping(void)
 	return rodata_full || arm64_kfence_can_set_direct_map() || is_realm_world();
 }
 
-static inline bool split_leaf_mapping_possible(void)
-{
-	/*
-	 * !BBML2_NOABORT systems should never run into scenarios where we would
-	 * have to split. So exit early and let calling code detect it and raise
-	 * a warning.
-	 */
-	if (!system_supports_bbml2_noabort())
-		return false;
-	return !force_pte_mapping();
-}
-
 static DEFINE_MUTEX(pgtable_split_lock);
 
 int split_kernel_leaf_mapping(unsigned long start, unsigned long end)
@@ -786,11 +774,22 @@ int split_kernel_leaf_mapping(unsigned long start, unsigned long end)
 {
 	int ret;
 
 	/*
-	 * Exit early if the region is within a pte-mapped area or if we can't
-	 * split. For the latter case, the permission change code will raise a
-	 * warning if not already pte-mapped.
+	 * !BBML2_NOABORT systems should not be trying to change permissions on
+	 * anything that is not pte-mapped in the first place. Just return early
+	 * and let the permission change code raise a warning if not already
+	 * pte-mapped.
 	 */
-	if (!split_leaf_mapping_possible() || is_kfence_address((void *)start))
+	if (!system_supports_bbml2_noabort())
+		return 0;
+
+	/*
+	 * If the region is within a pte-mapped area, there is no need to try to
+	 * split. Additionally, CONFIG_DEBUG_PAGEALLOC and CONFIG_KFENCE may
+	 * change permissions from atomic context so for those cases (which are
+	 * always pte-mapped), we must not go any further because taking the
+	 * mutex below may sleep.
+	 */
+	if (force_pte_mapping() || is_kfence_address((void *)start))
 		return 0;
 
 	/*
@@ -1089,7 +1088,7 @@ bool arch_kfence_init_pool(void)
 	int ret;
 
 	/* Exit early if we know the linear map is already pte-mapped. */
-	if (!split_leaf_mapping_possible())
+	if (force_pte_mapping())
 		return true;
 
 	/* Kfence pool is already pte-mapped for the early init case. */

From eb972eab0794dedeef5b3b1845e5f9a78793f184 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel
Date: Sat, 6 Dec 2025 20:01:16 +0100
Subject: [PATCH 2/5] lkdtm/bugs: Add cases for BUG and PANIC occurring in hardirq context

Add lkdtm cases to trigger a BUG() or panic() from hardirq context.
This is useful for testing pstore behavior when it is invoked from such
contexts.

Reviewed-by: Kees Cook
Signed-off-by: Ard Biesheuvel
Signed-off-by: Catalin Marinas
---
 drivers/misc/lkdtm/bugs.c               | 53 +++++++++++++++++++++++++
 tools/testing/selftests/lkdtm/tests.txt |  2 +
 2 files changed, 55 insertions(+)

diff --git a/drivers/misc/lkdtm/bugs.c b/drivers/misc/lkdtm/bugs.c
index 376047beea3d..fa05d77acb55 100644
--- a/drivers/misc/lkdtm/bugs.c
+++ b/drivers/misc/lkdtm/bugs.c
@@ -8,6 +8,7 @@
 #include "lkdtm.h"
 #include 
 #include 
+#include <linux/hrtimer.h>
 #include 
 #include 
 #include 
@@ -100,11 +101,61 @@ static void lkdtm_PANIC_STOP_IRQOFF(void)
 	stop_machine(panic_stop_irqoff_fn, &v, cpu_online_mask);
 }
 
+static bool wait_for_panic;
+
+static enum hrtimer_restart panic_in_hardirq(struct hrtimer *timer)
+{
+	panic("from hard IRQ context");
+
+	wait_for_panic = false;
+	return HRTIMER_NORESTART;
+}
+
+static void lkdtm_PANIC_IN_HARDIRQ(void)
+{
+	struct hrtimer timer;
+
+	wait_for_panic = true;
+	hrtimer_setup_on_stack(&timer, panic_in_hardirq,
+			       CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
+	hrtimer_start(&timer, us_to_ktime(100), HRTIMER_MODE_REL_HARD);
+
+	while (wait_for_panic)
+		;
+
+	hrtimer_cancel(&timer);
+}
+
 static void lkdtm_BUG(void)
 {
 	BUG();
 }
 
+static bool wait_for_bug;
+
+static enum hrtimer_restart bug_in_hardirq(struct hrtimer *timer)
+{
+	BUG();
+
+	wait_for_bug = false;
+	return HRTIMER_NORESTART;
+}
+
+static void lkdtm_BUG_IN_HARDIRQ(void)
+{
+	struct hrtimer timer;
+
+	wait_for_bug = true;
+	hrtimer_setup_on_stack(&timer, bug_in_hardirq,
+			       CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
+	hrtimer_start(&timer, us_to_ktime(100), HRTIMER_MODE_REL_HARD);
+
+	while (wait_for_bug)
+		;
+
+	hrtimer_cancel(&timer);
+}
+
 static int warn_counter;
 
 static void lkdtm_WARNING(void)
@@ -696,7 +747,9 @@ static noinline void lkdtm_CORRUPT_PAC(void)
 static struct crashtype crashtypes[] = {
 	CRASHTYPE(PANIC),
 	CRASHTYPE(PANIC_STOP_IRQOFF),
+	CRASHTYPE(PANIC_IN_HARDIRQ),
 	CRASHTYPE(BUG),
+	CRASHTYPE(BUG_IN_HARDIRQ),
 	CRASHTYPE(WARNING),
 	CRASHTYPE(WARNING_MESSAGE),
 	CRASHTYPE(EXCEPTION),
diff --git a/tools/testing/selftests/lkdtm/tests.txt b/tools/testing/selftests/lkdtm/tests.txt
index cff124c1eddd..67cd53715d93 100644
--- a/tools/testing/selftests/lkdtm/tests.txt
+++ b/tools/testing/selftests/lkdtm/tests.txt
@@ -1,6 +1,8 @@
 #PANIC
 #PANIC_STOP_IRQOFF Crashes entire system
+#PANIC_IN_HARDIRQ Crashes entire system
 BUG kernel BUG at
+#BUG_IN_HARDIRQ Crashes entire system
 WARNING WARNING:
 WARNING_MESSAGE message trigger
 EXCEPTION
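
For reference, the crash types added above are exercised through LKDTM's
debugfs interface: writing a crash type name to the DIRECT file fires it
immediately. Both new cases are excluded from the default selftest run by the
leading "#" in tests.txt precisely because they take the whole system down. A
minimal user-space sketch, assuming CONFIG_LKDTM is enabled and debugfs is
mounted at the usual /sys/kernel/debug:

#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* LKDTM's direct-trigger file. */
	const char path[] = "/sys/kernel/debug/provoke-crash/DIRECT";
	/* Substitute "BUG_IN_HARDIRQ" to exercise the BUG() variant. */
	const char cmd[] = "PANIC_IN_HARDIRQ";
	int fd = open(path, O_WRONLY);

	if (fd < 0)
		return 1;

	/* Writing the crash type name triggers it; expect the machine to panic. */
	write(fd, cmd, strlen(cmd));
	close(fd);
	return 0;
}
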
From 63de2b3859ba1def9f43ed0a9c25a68810208e5c Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel
Date: Sat, 6 Dec 2025 20:01:17 +0100
Subject: [PATCH 3/5] arm64/efi: Remove unneeded SVE/SME fallback preserve/restore handling

Since commit 7137a203b251 ("arm64/fpsimd: Permit kernel mode NEON with
IRQs off"), the only condition under which the fallback path is taken
for FP/SIMD preserve/restore across an EFI runtime call is when it is
called from hardirq or NMI context.

In practice, this only happens when the EFI pstore driver is called to
dump the kernel log buffer into an EFI variable under a panic, oops or
emergency_restart() condition, and none of these can be expected to
result in a return to user space for the task in question.

This means that the existing EFI-specific logic for preserving and
restoring SVE/SME state is pointless, and can be removed. Instead, kill
the task, so that an exceedingly unlikely inadvertent return to user
space does not proceed with a corrupted FP/SIMD state.

Also, retain the preserve and restore of the base FP/SIMD state, as
that might belong to kernel mode use of FP/SIMD. (Note that EFI runtime
calls are never invoked reentrantly, even in this case, and so any
interrupted kernel mode FP/SIMD usage will be unrelated to EFI.)

Signed-off-by: Ard Biesheuvel
Signed-off-by: Catalin Marinas
---
 arch/arm64/kernel/fpsimd.c | 130 ++++++-------------------------------
 1 file changed, 20 insertions(+), 110 deletions(-)

diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index c154f72634e0..9de1d8a604cb 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -180,13 +180,6 @@ static inline void set_sve_default_vl(int val)
 	set_default_vl(ARM64_VEC_SVE, val);
 }
 
-static u8 *efi_sve_state;
-
-#else /* ! CONFIG_ARM64_SVE */
-
-/* Dummy declaration for code that will be optimised out: */
-extern u8 *efi_sve_state;
-
 #endif /* ! CONFIG_ARM64_SVE */
 
 #ifdef CONFIG_ARM64_SME
@@ -1095,36 +1088,6 @@ int vec_verify_vq_map(enum vec_type type)
 	return 0;
 }
 
-static void __init sve_efi_setup(void)
-{
-	int max_vl = 0;
-	int i;
-
-	if (!IS_ENABLED(CONFIG_EFI))
-		return;
-
-	for (i = 0; i < ARRAY_SIZE(vl_info); i++)
-		max_vl = max(vl_info[i].max_vl, max_vl);
-
-	/*
-	 * alloc_percpu() warns and prints a backtrace if this goes wrong.
-	 * This is evidence of a crippled system and we are returning void,
-	 * so no attempt is made to handle this situation here.
-	 */
-	if (!sve_vl_valid(max_vl))
-		goto fail;
-
-	efi_sve_state = kmalloc(SVE_SIG_REGS_SIZE(sve_vq_from_vl(max_vl)),
-				GFP_KERNEL);
-	if (!efi_sve_state)
-		goto fail;
-
-	return;
-
-fail:
-	panic("Cannot allocate memory for EFI SVE save/restore");
-}
-
 void cpu_enable_sve(const struct arm64_cpu_capabilities *__always_unused p)
 {
 	write_sysreg(read_sysreg(CPACR_EL1) | CPACR_EL1_ZEN_EL1EN, CPACR_EL1);
@@ -1185,8 +1148,6 @@ void __init sve_setup(void)
 	if (sve_max_virtualisable_vl() < sve_max_vl())
 		pr_warn("%s: unvirtualisable vector lengths present\n",
 			info->name);
-
-	sve_efi_setup();
 }
 
 /*
@@ -1947,9 +1908,6 @@ EXPORT_SYMBOL_GPL(kernel_neon_end);
 
 #ifdef CONFIG_EFI
 
 static struct user_fpsimd_state efi_fpsimd_state;
-static bool efi_fpsimd_state_used;
-static bool efi_sve_state_used;
-static bool efi_sm_state;
 
 /*
  * EFI runtime services support functions
@@ -1976,43 +1934,26 @@ void __efi_fpsimd_begin(void)
 	if (may_use_simd()) {
 		kernel_neon_begin(&efi_fpsimd_state);
 	} else {
-		WARN_ON(preemptible());
-
 		/*
-		 * If !efi_sve_state, SVE can't be in use yet and doesn't need
-		 * preserving:
+		 * We are running in hardirq or NMI context, and the only
+		 * legitimate case where this might happen is when EFI pstore
+		 * is attempting to record the system's dying gasps into EFI
+		 * variables. This could be due to an oops, a panic or a call
+		 * to emergency_restart(), and in none of those cases can we
+		 * expect the current task to ever return to user space again,
+		 * or the kernel to resume any normal execution, for that
+		 * matter (an oops in hardirq context triggers a panic too).
+		 *
+		 * Therefore, there is no point in attempting to preserve any
+		 * SVE/SME state here. On the off chance that we might have
+		 * ended up here for a different reason inadvertently, kill the
+		 * task and preserve/restore the base FP/SIMD state, which
+		 * might belong to kernel mode FP/SIMD.
 		 */
-		if (system_supports_sve() && efi_sve_state != NULL) {
-			bool ffr = true;
-			u64 svcr;
-
-			efi_sve_state_used = true;
-
-			if (system_supports_sme()) {
-				svcr = read_sysreg_s(SYS_SVCR);
-
-				efi_sm_state = svcr & SVCR_SM_MASK;
-
-				/*
-				 * Unless we have FA64 FFR does not
-				 * exist in streaming mode.
-				 */
-				if (!system_supports_fa64())
-					ffr = !(svcr & SVCR_SM_MASK);
-			}
-
-			sve_save_state(efi_sve_state + sve_ffr_offset(sve_max_vl()),
-				       &efi_fpsimd_state.fpsr, ffr);
-
-			if (system_supports_sme())
-				sysreg_clear_set_s(SYS_SVCR,
-						   SVCR_SM_MASK, 0);
-
-		} else {
-			fpsimd_save_state(&efi_fpsimd_state);
-		}
-
-		efi_fpsimd_state_used = true;
+		pr_warn_ratelimited("Calling EFI runtime from %s context\n",
+				    in_nmi() ? "NMI" : "hardirq");
+		force_signal_inject(SIGKILL, SI_KERNEL, 0, 0);
+		fpsimd_save_state(&efi_fpsimd_state);
 	}
 }
@@ -2024,41 +1965,10 @@ void __efi_fpsimd_end(void)
 	if (!system_supports_fpsimd())
 		return;
 
-	if (!efi_fpsimd_state_used) {
+	if (may_use_simd()) {
 		kernel_neon_end(&efi_fpsimd_state);
 	} else {
-		if (system_supports_sve() && efi_sve_state_used) {
-			bool ffr = true;
-
-			/*
-			 * Restore streaming mode; EFI calls are
-			 * normal function calls so should not return in
-			 * streaming mode.
-			 */
-			if (system_supports_sme()) {
-				if (efi_sm_state) {
-					sysreg_clear_set_s(SYS_SVCR,
-							   0,
-							   SVCR_SM_MASK);
-
-					/*
-					 * Unless we have FA64 FFR does not
-					 * exist in streaming mode.
-					 */
-					if (!system_supports_fa64())
-						ffr = false;
-				}
-			}
-
-			sve_load_state(efi_sve_state + sve_ffr_offset(sve_max_vl()),
-				       &efi_fpsimd_state.fpsr, ffr);
-
-			efi_sve_state_used = false;
-		} else {
-			fpsimd_load_state(&efi_fpsimd_state);
-		}
-
-		efi_fpsimd_state_used = false;
+		fpsimd_load_state(&efi_fpsimd_state);
 	}
 }

From 98a97bf41528ef738b06eb07ec2b2eb1cfde6ce6 Mon Sep 17 00:00:00 2001
From: Mark Brown
Date: Sat, 29 Nov 2025 00:48:45 +0000
Subject: [PATCH 4/5] arm64/gcs: Flush the GCS locking state on exec

When we exec a new task, we forget to flush the set of locked GCS mode
bits. Since we do flush the rest of the state, this means that if GCS
is locked the new task will be unable to enable GCS; it will be locked
as disabled. Add the expected flush.

Fixes: fc84bc5378a8 ("arm64/gcs: Context switch GCS state for EL0")
Cc:  # 6.13.x
Reported-by: Yury Khrustalev
Signed-off-by: Mark Brown
Tested-by: Yury Khrustalev
Signed-off-by: Catalin Marinas
---
 arch/arm64/kernel/process.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index fba7ca102a8c..489554931231 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -292,6 +292,7 @@ static void flush_gcs(void)
 	current->thread.gcs_base = 0;
 	current->thread.gcs_size = 0;
 	current->thread.gcs_el0_mode = 0;
+	current->thread.gcs_el0_locked = 0;
 	write_sysreg_s(GCSCRE0_EL1_nTR, SYS_GCSCRE0_EL1);
 	write_sysreg_s(0, SYS_GCSPR_EL0);
 }

From f4ea8e05f2a857d5447c25f7daf00807d38b307d Mon Sep 17 00:00:00 2001
From: Catalin Marinas
Date: Fri, 19 Dec 2025 15:09:09 +0000
Subject: [PATCH 5/5] lkdtm/bugs: Do not confuse clang/objtool with busy wait loops

Since commit eb972eab0794 ("lkdtm/bugs: Add cases for BUG and PANIC
occurring in hardirq context"), building with clang for x86_64 results
in the following warnings:

vmlinux.o: warning: objtool: lkdtm_PANIC_IN_HARDIRQ(): unexpected end of section .text.lkdtm_PANIC_IN_HARDIRQ
vmlinux.o: warning: objtool: lkdtm_BUG_IN_HARDIRQ(): unexpected end of section .text.lkdtm_BUG_IN_HARDIRQ

caused by busy "while (wait_for_...);" loops. Add READ_ONCE() and
cpu_relax() to better indicate the intention and avoid any unwanted
compiler optimisations.

Fixes: eb972eab0794 ("lkdtm/bugs: Add cases for BUG and PANIC occurring in hardirq context")
Reported-by: kernel test robot
Closes: https://lore.kernel.org/oe-kbuild-all/202512190111.jxFSqxUH-lkp@intel.com/
Signed-off-by: Catalin Marinas
---
 drivers/misc/lkdtm/bugs.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/misc/lkdtm/bugs.c b/drivers/misc/lkdtm/bugs.c
index fa05d77acb55..502059078b45 100644
--- a/drivers/misc/lkdtm/bugs.c
+++ b/drivers/misc/lkdtm/bugs.c
@@ -120,8 +120,8 @@ static void lkdtm_PANIC_IN_HARDIRQ(void)
 			       CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
 	hrtimer_start(&timer, us_to_ktime(100), HRTIMER_MODE_REL_HARD);
 
-	while (wait_for_panic)
-		;
+	while (READ_ONCE(wait_for_panic))
+		cpu_relax();
 
 	hrtimer_cancel(&timer);
 }
@@ -150,8 +150,8 @@ static void lkdtm_BUG_IN_HARDIRQ(void)
 			       CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
 	hrtimer_start(&timer, us_to_ktime(100), HRTIMER_MODE_REL_HARD);
 
-	while (wait_for_bug)
-		;
+	while (READ_ONCE(wait_for_bug))
+		cpu_relax();
 
 	hrtimer_cancel(&timer);
 }
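
To see why the annotation is needed: without a volatile access the compiler
may load the flag once, prove the loop body has no side effects and collapse
the wait into an unconditional branch-to-self, leaving the code after the
loop unreachable, which is what objtool reports as an unexpected end of
section. A self-contained user-space analogue of the idiom (a sketch:
READ_ONCE() is open-coded here with a volatile cast and GNU C typeof rather
than taken from the kernel headers, and cpu_relax() is represented by an
empty loop body):

#include <stdbool.h>

/* Open-coded stand-in for the kernel's READ_ONCE(). */
#define READ_ONCE(x) (*(const volatile typeof(x) *)&(x))

static bool wait_for_event = true;	/* cleared elsewhere, e.g. by a handler */

void busy_wait(void)
{
	/*
	 * A plain `while (wait_for_event);` lets the compiler read the flag
	 * once and spin forever; the volatile access forces a fresh load on
	 * every iteration. The kernel version additionally calls cpu_relax()
	 * in the loop body as a CPU-friendly pause/yield hint.
	 */
	while (READ_ONCE(wait_for_event))
		;
}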