Landlock fix for v7.0-rc6

-----BEGIN PGP SIGNATURE-----
 
 iIYEABYKAC4WIQSVyBthFV4iTW/VU1/l49DojIL20gUCacVk0xAcbWljQGRpZ2lr
 b2QubmV0AAoJEOXj0OiMgvbS0v4A/joA39PP40bpHZorGYVgHyEZZgCgGicffmYd
 TnvlvawOAPoDc6h1HwkcOonhYgvEe29JPIBrEFOCNBZsGTntvN29Ag==
 =T4m+
 -----END PGP SIGNATURE-----

Merge tag 'landlock-7.0-rc6' of git://git.kernel.org/pub/scm/linux/kernel/git/mic/linux

Pull Landlock fixes from Mickaël Salaün:
 "This mainly fixes Landlock TSYNC issues related to interrupts and
  unexpected task exit.

  Other fixes touch documentation and sample, and a new test extends
  coverage"

* tag 'landlock-7.0-rc6' of git://git.kernel.org/pub/scm/linux/kernel/git/mic/linux:
  landlock: Expand restrict flags example for ABI version 8
  selftests/landlock: Test tsync interruption and cancellation paths
  landlock: Clean up interrupted thread logic in TSYNC
  landlock: Serialize TSYNC thread restriction
  samples/landlock: Bump ABI version to 8
  landlock: Improve TSYNC types
  landlock: Fully release unused TSYNC work entries
  landlock: Fix formatting
This commit is contained in:
Linus Torvalds 2026-03-26 12:03:37 -07:00
commit 25b69ebe28
6 changed files with 191 additions and 34 deletions

View File

@ -8,7 +8,7 @@ Landlock: unprivileged access control
===================================== =====================================
:Author: Mickaël Salaün :Author: Mickaël Salaün
:Date: January 2026 :Date: March 2026
The goal of Landlock is to enable restriction of ambient rights (e.g. global The goal of Landlock is to enable restriction of ambient rights (e.g. global
filesystem or network access) for a set of processes. Because Landlock filesystem or network access) for a set of processes. Because Landlock
@ -197,12 +197,27 @@ similar backwards compatibility check is needed for the restrict flags
.. code-block:: c .. code-block:: c
__u32 restrict_flags = LANDLOCK_RESTRICT_SELF_LOG_NEW_EXEC_ON; __u32 restrict_flags =
if (abi < 7) { LANDLOCK_RESTRICT_SELF_LOG_NEW_EXEC_ON |
/* Clear logging flags unsupported before ABI 7. */ LANDLOCK_RESTRICT_SELF_TSYNC;
switch (abi) {
case 1 ... 6:
/* Removes logging flags for ABI < 7 */
restrict_flags &= ~(LANDLOCK_RESTRICT_SELF_LOG_SAME_EXEC_OFF | restrict_flags &= ~(LANDLOCK_RESTRICT_SELF_LOG_SAME_EXEC_OFF |
LANDLOCK_RESTRICT_SELF_LOG_NEW_EXEC_ON | LANDLOCK_RESTRICT_SELF_LOG_NEW_EXEC_ON |
LANDLOCK_RESTRICT_SELF_LOG_SUBDOMAINS_OFF); LANDLOCK_RESTRICT_SELF_LOG_SUBDOMAINS_OFF);
__attribute__((fallthrough));
case 7:
/*
* Removes multithreaded enforcement flag for ABI < 8
*
* WARNING: Without this flag, calling landlock_restrict_self(2) is
* only equivalent if the calling process is single-threaded. Below
* ABI v8 (and as of ABI v8, when not using this flag), a Landlock
* policy would only be enforced for the calling thread and its
* children (and not for all threads, including parents and siblings).
*/
restrict_flags &= ~LANDLOCK_RESTRICT_SELF_TSYNC;
} }
The next step is to restrict the current thread from gaining more privileges The next step is to restrict the current thread from gaining more privileges

View File

@ -299,7 +299,7 @@ out_unset:
/* clang-format on */ /* clang-format on */
#define LANDLOCK_ABI_LAST 7 #define LANDLOCK_ABI_LAST 8
#define XSTR(s) #s #define XSTR(s) #s
#define STR(s) XSTR(s) #define STR(s) XSTR(s)
@ -436,7 +436,8 @@ int main(const int argc, char *const argv[], char *const *const envp)
/* Removes LANDLOCK_RESTRICT_SELF_LOG_NEW_EXEC_ON for ABI < 7 */ /* Removes LANDLOCK_RESTRICT_SELF_LOG_NEW_EXEC_ON for ABI < 7 */
supported_restrict_flags &= supported_restrict_flags &=
~LANDLOCK_RESTRICT_SELF_LOG_NEW_EXEC_ON; ~LANDLOCK_RESTRICT_SELF_LOG_NEW_EXEC_ON;
__attribute__((fallthrough));
case 7:
/* Must be printed for any ABI < LANDLOCK_ABI_LAST. */ /* Must be printed for any ABI < LANDLOCK_ABI_LAST. */
fprintf(stderr, fprintf(stderr,
"Hint: You should update the running kernel " "Hint: You should update the running kernel "

View File

@ -94,8 +94,7 @@ static struct landlock_details *get_current_details(void)
* allocate with GFP_KERNEL_ACCOUNT because it is independent from the * allocate with GFP_KERNEL_ACCOUNT because it is independent from the
* caller. * caller.
*/ */
details = details = kzalloc_flex(*details, exe_path, path_size);
kzalloc_flex(*details, exe_path, path_size);
if (!details) if (!details)
return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM);

View File

@ -32,9 +32,8 @@ static struct landlock_ruleset *create_ruleset(const u32 num_layers)
{ {
struct landlock_ruleset *new_ruleset; struct landlock_ruleset *new_ruleset;
new_ruleset = new_ruleset = kzalloc_flex(*new_ruleset, access_masks, num_layers,
kzalloc_flex(*new_ruleset, access_masks, num_layers, GFP_KERNEL_ACCOUNT);
GFP_KERNEL_ACCOUNT);
if (!new_ruleset) if (!new_ruleset)
return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM);
refcount_set(&new_ruleset->usage, 1); refcount_set(&new_ruleset->usage, 1);
@ -559,8 +558,8 @@ landlock_merge_ruleset(struct landlock_ruleset *const parent,
if (IS_ERR(new_dom)) if (IS_ERR(new_dom))
return new_dom; return new_dom;
new_dom->hierarchy = kzalloc_obj(*new_dom->hierarchy, new_dom->hierarchy =
GFP_KERNEL_ACCOUNT); kzalloc_obj(*new_dom->hierarchy, GFP_KERNEL_ACCOUNT);
if (!new_dom->hierarchy) if (!new_dom->hierarchy)
return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM);

View File

@ -203,6 +203,40 @@ static struct tsync_work *tsync_works_provide(struct tsync_works *s,
return ctx; return ctx;
} }
/**
* tsync_works_trim - Put the last tsync_work element
*
* @s: TSYNC works to trim.
*
* Put the last task and decrement the size of @s.
*
 * This helper does not cancel a running task, but just resets the last element
* to zero.
*/
static void tsync_works_trim(struct tsync_works *s)
{
struct tsync_work *ctx;
if (WARN_ON_ONCE(s->size <= 0))
return;
ctx = s->works[s->size - 1];
/*
* For consistency, remove the task from ctx so that it does not look like
* we handed it a task_work.
*/
put_task_struct(ctx->task);
*ctx = (typeof(*ctx)){};
/*
* Cancel the tsync_works_provide() change to recycle the reserved memory
* for the next thread, if any. This also ensures that cancel_tsync_works()
* and tsync_works_release() do not see any NULL task pointers.
*/
s->size--;
}
/* /*
* tsync_works_grow_by - preallocates space for n more contexts in s * tsync_works_grow_by - preallocates space for n more contexts in s
* *
@ -256,13 +290,14 @@ static int tsync_works_grow_by(struct tsync_works *s, size_t n, gfp_t flags)
* tsync_works_contains - checks for presence of task in s * tsync_works_contains - checks for presence of task in s
*/ */
static bool tsync_works_contains_task(const struct tsync_works *s, static bool tsync_works_contains_task(const struct tsync_works *s,
struct task_struct *task) const struct task_struct *task)
{ {
size_t i; size_t i;
for (i = 0; i < s->size; i++) for (i = 0; i < s->size; i++)
if (s->works[i]->task == task) if (s->works[i]->task == task)
return true; return true;
return false; return false;
} }
@ -276,7 +311,7 @@ static void tsync_works_release(struct tsync_works *s)
size_t i; size_t i;
for (i = 0; i < s->size; i++) { for (i = 0; i < s->size; i++) {
if (!s->works[i]->task) if (WARN_ON_ONCE(!s->works[i]->task))
continue; continue;
put_task_struct(s->works[i]->task); put_task_struct(s->works[i]->task);
@ -284,6 +319,7 @@ static void tsync_works_release(struct tsync_works *s)
for (i = 0; i < s->capacity; i++) for (i = 0; i < s->capacity; i++)
kfree(s->works[i]); kfree(s->works[i]);
kfree(s->works); kfree(s->works);
s->works = NULL; s->works = NULL;
s->size = 0; s->size = 0;
@ -295,7 +331,7 @@ static void tsync_works_release(struct tsync_works *s)
*/ */
static size_t count_additional_threads(const struct tsync_works *works) static size_t count_additional_threads(const struct tsync_works *works)
{ {
struct task_struct *thread, *caller; const struct task_struct *caller, *thread;
size_t n = 0; size_t n = 0;
caller = current; caller = current;
@ -334,7 +370,8 @@ static bool schedule_task_work(struct tsync_works *works,
struct tsync_shared_context *shared_ctx) struct tsync_shared_context *shared_ctx)
{ {
int err; int err;
struct task_struct *thread, *caller; const struct task_struct *caller;
struct task_struct *thread;
struct tsync_work *ctx; struct tsync_work *ctx;
bool found_more_threads = false; bool found_more_threads = false;
@ -379,16 +416,14 @@ static bool schedule_task_work(struct tsync_works *works,
init_task_work(&ctx->work, restrict_one_thread_callback); init_task_work(&ctx->work, restrict_one_thread_callback);
err = task_work_add(thread, &ctx->work, TWA_SIGNAL); err = task_work_add(thread, &ctx->work, TWA_SIGNAL);
if (err) { if (unlikely(err)) {
/* /*
* task_work_add() only fails if the task is about to exit. We * task_work_add() only fails if the task is about to exit. We
* checked that earlier, but it can happen as a race. Resume * checked that earlier, but it can happen as a race. Resume
* without setting an error, as the task is probably gone in the * without setting an error, as the task is probably gone in the
* next loop iteration. For consistency, remove the task from ctx * next loop iteration.
* so that it does not look like we handed it a task_work.
*/ */
put_task_struct(ctx->task); tsync_works_trim(works);
ctx->task = NULL;
atomic_dec(&shared_ctx->num_preparing); atomic_dec(&shared_ctx->num_preparing);
atomic_dec(&shared_ctx->num_unfinished); atomic_dec(&shared_ctx->num_unfinished);
@ -406,12 +441,15 @@ static bool schedule_task_work(struct tsync_works *works,
 * shared_ctx->num_preparing and shared_ctx->num_unfinished and mark the two * completions if needed, as if the task was never scheduled.
* completions if needed, as if the task was never scheduled. * completions if needed, as if the task was never scheduled.
*/ */
static void cancel_tsync_works(struct tsync_works *works, static void cancel_tsync_works(const struct tsync_works *works,
struct tsync_shared_context *shared_ctx) struct tsync_shared_context *shared_ctx)
{ {
int i; size_t i;
for (i = 0; i < works->size; i++) { for (i = 0; i < works->size; i++) {
if (WARN_ON_ONCE(!works->works[i]->task))
continue;
if (!task_work_cancel(works->works[i]->task, if (!task_work_cancel(works->works[i]->task,
&works->works[i]->work)) &works->works[i]->work))
continue; continue;
@ -447,6 +485,16 @@ int landlock_restrict_sibling_threads(const struct cred *old_cred,
shared_ctx.new_cred = new_cred; shared_ctx.new_cred = new_cred;
shared_ctx.set_no_new_privs = task_no_new_privs(current); shared_ctx.set_no_new_privs = task_no_new_privs(current);
/*
* Serialize concurrent TSYNC operations to prevent deadlocks when
* multiple threads call landlock_restrict_self() simultaneously.
* If the lock is already held, we gracefully yield by restarting the
* syscall. This allows the current thread to process pending
* task_works before retrying.
*/
if (!down_write_trylock(&current->signal->exec_update_lock))
return restart_syscall();
/* /*
* We schedule a pseudo-signal task_work for each of the calling task's * We schedule a pseudo-signal task_work for each of the calling task's
* sibling threads. In the task work, each thread: * sibling threads. In the task work, each thread:
@ -527,24 +575,30 @@ int landlock_restrict_sibling_threads(const struct cred *old_cred,
-ERESTARTNOINTR); -ERESTARTNOINTR);
/* /*
* Cancel task works for tasks that did not start running yet, * Opportunistic improvement: try to cancel task
* and decrement all_prepared and num_unfinished accordingly. * works for tasks that did not start running
* yet. We do not have a guarantee that it
* cancels any of the enqueued task works
* because task_work_run() might already have
* dequeued them.
*/ */
cancel_tsync_works(&works, &shared_ctx); cancel_tsync_works(&works, &shared_ctx);
/* /*
* The remaining task works have started running, so waiting for * Break the loop with error. The cleanup code
* their completion will finish. * after the loop unblocks the remaining
* task_works.
*/ */
wait_for_completion(&shared_ctx.all_prepared); break;
} }
} }
} while (found_more_threads && } while (found_more_threads &&
!atomic_read(&shared_ctx.preparation_error)); !atomic_read(&shared_ctx.preparation_error));
/* /*
* We now have all sibling threads blocking and in "prepared" state in the * We now have either (a) all sibling threads blocking and in "prepared"
* task work. Ask all threads to commit. * state in the task work, or (b) the preparation error is set. Ask all
* threads to commit (or abort).
*/ */
complete_all(&shared_ctx.ready_to_commit); complete_all(&shared_ctx.ready_to_commit);
@ -556,6 +610,6 @@ int landlock_restrict_sibling_threads(const struct cred *old_cred,
wait_for_completion(&shared_ctx.all_finished); wait_for_completion(&shared_ctx.all_finished);
tsync_works_release(&works); tsync_works_release(&works);
up_write(&current->signal->exec_update_lock);
return atomic_read(&shared_ctx.preparation_error); return atomic_read(&shared_ctx.preparation_error);
} }

View File

@ -6,9 +6,10 @@
*/ */
#define _GNU_SOURCE #define _GNU_SOURCE
#include <pthread.h>
#include <sys/prctl.h>
#include <linux/landlock.h> #include <linux/landlock.h>
#include <pthread.h>
#include <signal.h>
#include <sys/prctl.h>
#include "common.h" #include "common.h"
@ -158,4 +159,92 @@ TEST(competing_enablement)
EXPECT_EQ(0, close(ruleset_fd)); EXPECT_EQ(0, close(ruleset_fd));
} }
static void signal_nop_handler(int sig)
{
}
struct signaler_data {
pthread_t target;
volatile bool stop;
};
static void *signaler_thread(void *data)
{
struct signaler_data *sd = data;
while (!sd->stop)
pthread_kill(sd->target, SIGUSR1);
return NULL;
}
/*
* Number of idle sibling threads. This must be large enough that even on
* machines with many cores, the sibling threads cannot all complete their
* credential preparation in a single parallel wave, otherwise the signaler
* thread has no window to interrupt wait_for_completion_interruptible().
* 200 threads on a 64-core machine yields ~3 serialized waves, giving the
* tight signal loop enough time to land an interruption.
*/
#define NUM_IDLE_THREADS 200
/*
* Exercises the tsync interruption and cancellation paths in tsync.c.
*
* When a signal interrupts the calling thread while it waits for sibling
* threads to finish their credential preparation
* (wait_for_completion_interruptible in landlock_restrict_sibling_threads),
* the kernel sets ERESTARTNOINTR, cancels queued task works that have not
* started yet (cancel_tsync_works), then waits for the remaining works to
* finish. On the error return, syscalls.c aborts the prepared credentials.
* The kernel automatically restarts the syscall, so userspace sees success.
*/
TEST(tsync_interrupt)
{
size_t i;
pthread_t threads[NUM_IDLE_THREADS];
pthread_t signaler;
struct signaler_data sd;
struct sigaction sa = {};
const int ruleset_fd = create_ruleset(_metadata);
disable_caps(_metadata);
/* Install a no-op SIGUSR1 handler so the signal does not kill us. */
sa.sa_handler = signal_nop_handler;
sigemptyset(&sa.sa_mask);
ASSERT_EQ(0, sigaction(SIGUSR1, &sa, NULL));
ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0));
for (i = 0; i < NUM_IDLE_THREADS; i++)
ASSERT_EQ(0, pthread_create(&threads[i], NULL, idle, NULL));
/*
* Start a signaler thread that continuously sends SIGUSR1 to the
* calling thread. This maximizes the chance of interrupting
* wait_for_completion_interruptible() in the kernel's tsync path.
*/
sd.target = pthread_self();
sd.stop = false;
ASSERT_EQ(0, pthread_create(&signaler, NULL, signaler_thread, &sd));
/*
* The syscall may be interrupted and transparently restarted by the
* kernel (ERESTARTNOINTR). From userspace, it should always succeed.
*/
EXPECT_EQ(0, landlock_restrict_self(ruleset_fd,
LANDLOCK_RESTRICT_SELF_TSYNC));
sd.stop = true;
ASSERT_EQ(0, pthread_join(signaler, NULL));
for (i = 0; i < NUM_IDLE_THREADS; i++) {
ASSERT_EQ(0, pthread_cancel(threads[i]));
ASSERT_EQ(0, pthread_join(threads[i], NULL));
}
EXPECT_EQ(0, close(ruleset_fd));
}
TEST_HARNESS_MAIN TEST_HARNESS_MAIN