Landlock fix for v7.0-rc6
-----BEGIN PGP SIGNATURE----- iIYEABYKAC4WIQSVyBthFV4iTW/VU1/l49DojIL20gUCacVk0xAcbWljQGRpZ2lr b2QubmV0AAoJEOXj0OiMgvbS0v4A/joA39PP40bpHZorGYVgHyEZZgCgGicffmYd TnvlvawOAPoDc6h1HwkcOonhYgvEe29JPIBrEFOCNBZsGTntvN29Ag== =T4m+ -----END PGP SIGNATURE----- Merge tag 'landlock-7.0-rc6' of git://git.kernel.org/pub/scm/linux/kernel/git/mic/linux Pull Landlock fixes from Mickaël Salaün: "This mainly fixes Landlock TSYNC issues related to interrupts and unexpected task exit. Other fixes touch documentation and sample, and a new test extends coverage" * tag 'landlock-7.0-rc6' of git://git.kernel.org/pub/scm/linux/kernel/git/mic/linux: landlock: Expand restrict flags example for ABI version 8 selftests/landlock: Test tsync interruption and cancellation paths landlock: Clean up interrupted thread logic in TSYNC landlock: Serialize TSYNC thread restriction samples/landlock: Bump ABI version to 8 landlock: Improve TSYNC types landlock: Fully release unused TSYNC work entries landlock: Fix formatting
This commit is contained in:
commit
25b69ebe28
|
|
@ -8,7 +8,7 @@ Landlock: unprivileged access control
|
|||
=====================================
|
||||
|
||||
:Author: Mickaël Salaün
|
||||
:Date: January 2026
|
||||
:Date: March 2026
|
||||
|
||||
The goal of Landlock is to enable restriction of ambient rights (e.g. global
|
||||
filesystem or network access) for a set of processes. Because Landlock
|
||||
|
|
@ -197,12 +197,27 @@ similar backwards compatibility check is needed for the restrict flags
|
|||
|
||||
.. code-block:: c
|
||||
|
||||
__u32 restrict_flags = LANDLOCK_RESTRICT_SELF_LOG_NEW_EXEC_ON;
|
||||
if (abi < 7) {
|
||||
/* Clear logging flags unsupported before ABI 7. */
|
||||
__u32 restrict_flags =
|
||||
LANDLOCK_RESTRICT_SELF_LOG_NEW_EXEC_ON |
|
||||
LANDLOCK_RESTRICT_SELF_TSYNC;
|
||||
switch (abi) {
|
||||
case 1 ... 6:
|
||||
/* Removes logging flags for ABI < 7 */
|
||||
restrict_flags &= ~(LANDLOCK_RESTRICT_SELF_LOG_SAME_EXEC_OFF |
|
||||
LANDLOCK_RESTRICT_SELF_LOG_NEW_EXEC_ON |
|
||||
LANDLOCK_RESTRICT_SELF_LOG_SUBDOMAINS_OFF);
|
||||
__attribute__((fallthrough));
|
||||
case 7:
|
||||
/*
|
||||
* Removes multithreaded enforcement flag for ABI < 8
|
||||
*
|
||||
* WARNING: Without this flag, calling landlock_restrict_self(2) is
|
||||
* only equivalent if the calling process is single-threaded. Below
|
||||
* ABI v8 (and as of ABI v8, when not using this flag), a Landlock
|
||||
* policy would only be enforced for the calling thread and its
|
||||
* children (and not for all threads, including parents and siblings).
|
||||
*/
|
||||
restrict_flags &= ~LANDLOCK_RESTRICT_SELF_TSYNC;
|
||||
}
|
||||
|
||||
The next step is to restrict the current thread from gaining more privileges
|
||||
|
|
|
|||
|
|
@ -299,7 +299,7 @@ out_unset:
|
|||
|
||||
/* clang-format on */
|
||||
|
||||
#define LANDLOCK_ABI_LAST 7
|
||||
#define LANDLOCK_ABI_LAST 8
|
||||
|
||||
#define XSTR(s) #s
|
||||
#define STR(s) XSTR(s)
|
||||
|
|
@ -436,7 +436,8 @@ int main(const int argc, char *const argv[], char *const *const envp)
|
|||
/* Removes LANDLOCK_RESTRICT_SELF_LOG_NEW_EXEC_ON for ABI < 7 */
|
||||
supported_restrict_flags &=
|
||||
~LANDLOCK_RESTRICT_SELF_LOG_NEW_EXEC_ON;
|
||||
|
||||
__attribute__((fallthrough));
|
||||
case 7:
|
||||
/* Must be printed for any ABI < LANDLOCK_ABI_LAST. */
|
||||
fprintf(stderr,
|
||||
"Hint: You should update the running kernel "
|
||||
|
|
|
|||
|
|
@ -94,8 +94,7 @@ static struct landlock_details *get_current_details(void)
|
|||
* allocate with GFP_KERNEL_ACCOUNT because it is independent from the
|
||||
* caller.
|
||||
*/
|
||||
details =
|
||||
kzalloc_flex(*details, exe_path, path_size);
|
||||
details = kzalloc_flex(*details, exe_path, path_size);
|
||||
if (!details)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
|
|
|
|||
|
|
@ -32,8 +32,7 @@ static struct landlock_ruleset *create_ruleset(const u32 num_layers)
|
|||
{
|
||||
struct landlock_ruleset *new_ruleset;
|
||||
|
||||
new_ruleset =
|
||||
kzalloc_flex(*new_ruleset, access_masks, num_layers,
|
||||
new_ruleset = kzalloc_flex(*new_ruleset, access_masks, num_layers,
|
||||
GFP_KERNEL_ACCOUNT);
|
||||
if (!new_ruleset)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
|
@ -559,8 +558,8 @@ landlock_merge_ruleset(struct landlock_ruleset *const parent,
|
|||
if (IS_ERR(new_dom))
|
||||
return new_dom;
|
||||
|
||||
new_dom->hierarchy = kzalloc_obj(*new_dom->hierarchy,
|
||||
GFP_KERNEL_ACCOUNT);
|
||||
new_dom->hierarchy =
|
||||
kzalloc_obj(*new_dom->hierarchy, GFP_KERNEL_ACCOUNT);
|
||||
if (!new_dom->hierarchy)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
|
|
|
|||
|
|
@ -203,6 +203,40 @@ static struct tsync_work *tsync_works_provide(struct tsync_works *s,
|
|||
return ctx;
|
||||
}
|
||||
|
||||
/**
|
||||
* tsync_works_trim - Put the last tsync_work element
|
||||
*
|
||||
* @s: TSYNC works to trim.
|
||||
*
|
||||
* Put the last task and decrement the size of @s.
|
||||
*
|
||||
* This helper does not cancel a running task, but just resets the last element
|
||||
* to zero.
|
||||
*/
|
||||
static void tsync_works_trim(struct tsync_works *s)
|
||||
{
|
||||
struct tsync_work *ctx;
|
||||
|
||||
if (WARN_ON_ONCE(s->size <= 0))
|
||||
return;
|
||||
|
||||
ctx = s->works[s->size - 1];
|
||||
|
||||
/*
|
||||
* For consistency, remove the task from ctx so that it does not look like
|
||||
* we handed it a task_work.
|
||||
*/
|
||||
put_task_struct(ctx->task);
|
||||
*ctx = (typeof(*ctx)){};
|
||||
|
||||
/*
|
||||
* Cancel the tsync_works_provide() change to recycle the reserved memory
|
||||
* for the next thread, if any. This also ensures that cancel_tsync_works()
|
||||
* and tsync_works_release() do not see any NULL task pointers.
|
||||
*/
|
||||
s->size--;
|
||||
}
|
||||
|
||||
/*
|
||||
* tsync_works_grow_by - preallocates space for n more contexts in s
|
||||
*
|
||||
|
|
@ -256,13 +290,14 @@ static int tsync_works_grow_by(struct tsync_works *s, size_t n, gfp_t flags)
|
|||
* tsync_works_contains_task - checks for presence of task in s
|
||||
*/
|
||||
static bool tsync_works_contains_task(const struct tsync_works *s,
|
||||
struct task_struct *task)
|
||||
const struct task_struct *task)
|
||||
{
|
||||
size_t i;
|
||||
|
||||
for (i = 0; i < s->size; i++)
|
||||
if (s->works[i]->task == task)
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
|
@ -276,7 +311,7 @@ static void tsync_works_release(struct tsync_works *s)
|
|||
size_t i;
|
||||
|
||||
for (i = 0; i < s->size; i++) {
|
||||
if (!s->works[i]->task)
|
||||
if (WARN_ON_ONCE(!s->works[i]->task))
|
||||
continue;
|
||||
|
||||
put_task_struct(s->works[i]->task);
|
||||
|
|
@ -284,6 +319,7 @@ static void tsync_works_release(struct tsync_works *s)
|
|||
|
||||
for (i = 0; i < s->capacity; i++)
|
||||
kfree(s->works[i]);
|
||||
|
||||
kfree(s->works);
|
||||
s->works = NULL;
|
||||
s->size = 0;
|
||||
|
|
@ -295,7 +331,7 @@ static void tsync_works_release(struct tsync_works *s)
|
|||
*/
|
||||
static size_t count_additional_threads(const struct tsync_works *works)
|
||||
{
|
||||
struct task_struct *thread, *caller;
|
||||
const struct task_struct *caller, *thread;
|
||||
size_t n = 0;
|
||||
|
||||
caller = current;
|
||||
|
|
@ -334,7 +370,8 @@ static bool schedule_task_work(struct tsync_works *works,
|
|||
struct tsync_shared_context *shared_ctx)
|
||||
{
|
||||
int err;
|
||||
struct task_struct *thread, *caller;
|
||||
const struct task_struct *caller;
|
||||
struct task_struct *thread;
|
||||
struct tsync_work *ctx;
|
||||
bool found_more_threads = false;
|
||||
|
||||
|
|
@ -379,16 +416,14 @@ static bool schedule_task_work(struct tsync_works *works,
|
|||
|
||||
init_task_work(&ctx->work, restrict_one_thread_callback);
|
||||
err = task_work_add(thread, &ctx->work, TWA_SIGNAL);
|
||||
if (err) {
|
||||
if (unlikely(err)) {
|
||||
/*
|
||||
* task_work_add() only fails if the task is about to exit. We
|
||||
* checked that earlier, but it can happen as a race. Resume
|
||||
* without setting an error, as the task is probably gone in the
|
||||
* next loop iteration. For consistency, remove the task from ctx
|
||||
* so that it does not look like we handed it a task_work.
|
||||
* next loop iteration.
|
||||
*/
|
||||
put_task_struct(ctx->task);
|
||||
ctx->task = NULL;
|
||||
tsync_works_trim(works);
|
||||
|
||||
atomic_dec(&shared_ctx->num_preparing);
|
||||
atomic_dec(&shared_ctx->num_unfinished);
|
||||
|
|
@ -406,12 +441,15 @@ static bool schedule_task_work(struct tsync_works *works,
|
|||
* shared_ctx->num_preparing and shared_ctx->num_unfinished and mark the two
|
||||
* completions if needed, as if the task was never scheduled.
|
||||
*/
|
||||
static void cancel_tsync_works(struct tsync_works *works,
|
||||
static void cancel_tsync_works(const struct tsync_works *works,
|
||||
struct tsync_shared_context *shared_ctx)
|
||||
{
|
||||
int i;
|
||||
size_t i;
|
||||
|
||||
for (i = 0; i < works->size; i++) {
|
||||
if (WARN_ON_ONCE(!works->works[i]->task))
|
||||
continue;
|
||||
|
||||
if (!task_work_cancel(works->works[i]->task,
|
||||
&works->works[i]->work))
|
||||
continue;
|
||||
|
|
@ -447,6 +485,16 @@ int landlock_restrict_sibling_threads(const struct cred *old_cred,
|
|||
shared_ctx.new_cred = new_cred;
|
||||
shared_ctx.set_no_new_privs = task_no_new_privs(current);
|
||||
|
||||
/*
|
||||
* Serialize concurrent TSYNC operations to prevent deadlocks when
|
||||
* multiple threads call landlock_restrict_self() simultaneously.
|
||||
* If the lock is already held, we gracefully yield by restarting the
|
||||
* syscall. This allows the current thread to process pending
|
||||
* task_works before retrying.
|
||||
*/
|
||||
if (!down_write_trylock(&current->signal->exec_update_lock))
|
||||
return restart_syscall();
|
||||
|
||||
/*
|
||||
* We schedule a pseudo-signal task_work for each of the calling task's
|
||||
* sibling threads. In the task work, each thread:
|
||||
|
|
@ -527,24 +575,30 @@ int landlock_restrict_sibling_threads(const struct cred *old_cred,
|
|||
-ERESTARTNOINTR);
|
||||
|
||||
/*
|
||||
* Cancel task works for tasks that did not start running yet,
|
||||
* and decrement all_prepared and num_unfinished accordingly.
|
||||
* Opportunistic improvement: try to cancel task
|
||||
* works for tasks that did not start running
|
||||
* yet. We do not have a guarantee that it
|
||||
* cancels any of the enqueued task works
|
||||
* because task_work_run() might already have
|
||||
* dequeued them.
|
||||
*/
|
||||
cancel_tsync_works(&works, &shared_ctx);
|
||||
|
||||
/*
|
||||
* The remaining task works have started running, so waiting for
|
||||
* their completion will finish.
|
||||
* Break the loop with error. The cleanup code
|
||||
* after the loop unblocks the remaining
|
||||
* task_works.
|
||||
*/
|
||||
wait_for_completion(&shared_ctx.all_prepared);
|
||||
break;
|
||||
}
|
||||
}
|
||||
} while (found_more_threads &&
|
||||
!atomic_read(&shared_ctx.preparation_error));
|
||||
|
||||
/*
|
||||
* We now have all sibling threads blocking and in "prepared" state in the
|
||||
* task work. Ask all threads to commit.
|
||||
* We now have either (a) all sibling threads blocking and in "prepared"
|
||||
* state in the task work, or (b) the preparation error is set. Ask all
|
||||
* threads to commit (or abort).
|
||||
*/
|
||||
complete_all(&shared_ctx.ready_to_commit);
|
||||
|
||||
|
|
@ -556,6 +610,6 @@ int landlock_restrict_sibling_threads(const struct cred *old_cred,
|
|||
wait_for_completion(&shared_ctx.all_finished);
|
||||
|
||||
tsync_works_release(&works);
|
||||
|
||||
up_write(&current->signal->exec_update_lock);
|
||||
return atomic_read(&shared_ctx.preparation_error);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -6,9 +6,10 @@
|
|||
*/
|
||||
|
||||
#define _GNU_SOURCE
|
||||
#include <pthread.h>
|
||||
#include <sys/prctl.h>
|
||||
#include <linux/landlock.h>
|
||||
#include <pthread.h>
|
||||
#include <signal.h>
|
||||
#include <sys/prctl.h>
|
||||
|
||||
#include "common.h"
|
||||
|
||||
|
|
@ -158,4 +159,92 @@ TEST(competing_enablement)
|
|||
EXPECT_EQ(0, close(ruleset_fd));
|
||||
}
|
||||
|
||||
static void signal_nop_handler(int sig)
|
||||
{
|
||||
}
|
||||
|
||||
struct signaler_data {
|
||||
pthread_t target;
|
||||
volatile bool stop;
|
||||
};
|
||||
|
||||
static void *signaler_thread(void *data)
|
||||
{
|
||||
struct signaler_data *sd = data;
|
||||
|
||||
while (!sd->stop)
|
||||
pthread_kill(sd->target, SIGUSR1);
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Number of idle sibling threads. This must be large enough that even on
|
||||
* machines with many cores, the sibling threads cannot all complete their
|
||||
* credential preparation in a single parallel wave, otherwise the signaler
|
||||
* thread has no window to interrupt wait_for_completion_interruptible().
|
||||
* 200 threads on a 64-core machine yields ~3 serialized waves, giving the
|
||||
* tight signal loop enough time to land an interruption.
|
||||
*/
|
||||
#define NUM_IDLE_THREADS 200
|
||||
|
||||
/*
|
||||
* Exercises the tsync interruption and cancellation paths in tsync.c.
|
||||
*
|
||||
* When a signal interrupts the calling thread while it waits for sibling
|
||||
* threads to finish their credential preparation
|
||||
* (wait_for_completion_interruptible in landlock_restrict_sibling_threads),
|
||||
* the kernel sets ERESTARTNOINTR, cancels queued task works that have not
|
||||
* started yet (cancel_tsync_works), then waits for the remaining works to
|
||||
* finish. On the error return, syscalls.c aborts the prepared credentials.
|
||||
* The kernel automatically restarts the syscall, so userspace sees success.
|
||||
*/
|
||||
TEST(tsync_interrupt)
|
||||
{
|
||||
size_t i;
|
||||
pthread_t threads[NUM_IDLE_THREADS];
|
||||
pthread_t signaler;
|
||||
struct signaler_data sd;
|
||||
struct sigaction sa = {};
|
||||
const int ruleset_fd = create_ruleset(_metadata);
|
||||
|
||||
disable_caps(_metadata);
|
||||
|
||||
/* Install a no-op SIGUSR1 handler so the signal does not kill us. */
|
||||
sa.sa_handler = signal_nop_handler;
|
||||
sigemptyset(&sa.sa_mask);
|
||||
ASSERT_EQ(0, sigaction(SIGUSR1, &sa, NULL));
|
||||
|
||||
ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0));
|
||||
|
||||
for (i = 0; i < NUM_IDLE_THREADS; i++)
|
||||
ASSERT_EQ(0, pthread_create(&threads[i], NULL, idle, NULL));
|
||||
|
||||
/*
|
||||
* Start a signaler thread that continuously sends SIGUSR1 to the
|
||||
* calling thread. This maximizes the chance of interrupting
|
||||
* wait_for_completion_interruptible() in the kernel's tsync path.
|
||||
*/
|
||||
sd.target = pthread_self();
|
||||
sd.stop = false;
|
||||
ASSERT_EQ(0, pthread_create(&signaler, NULL, signaler_thread, &sd));
|
||||
|
||||
/*
|
||||
* The syscall may be interrupted and transparently restarted by the
|
||||
* kernel (ERESTARTNOINTR). From userspace, it should always succeed.
|
||||
*/
|
||||
EXPECT_EQ(0, landlock_restrict_self(ruleset_fd,
|
||||
LANDLOCK_RESTRICT_SELF_TSYNC));
|
||||
|
||||
sd.stop = true;
|
||||
ASSERT_EQ(0, pthread_join(signaler, NULL));
|
||||
|
||||
for (i = 0; i < NUM_IDLE_THREADS; i++) {
|
||||
ASSERT_EQ(0, pthread_cancel(threads[i]));
|
||||
ASSERT_EQ(0, pthread_join(threads[i], NULL));
|
||||
}
|
||||
|
||||
EXPECT_EQ(0, close(ruleset_fd));
|
||||
}
|
||||
|
||||
TEST_HARNESS_MAIN
|
||||
|
|
|
|||
Loading…
Reference in New Issue