cgroup: Wait for dying tasks to leave on rmdir
a72f73c4dd ("cgroup: Don't expose dead tasks in cgroup") hid PF_EXITING tasks from cgroup.procs so that systemd doesn't see tasks that have already been reaped via waitpid(). However, the populated counter (nr_populated_csets) is only decremented when the task later passes through cgroup_task_dead() in finish_task_switch(). This means cgroup.procs can appear empty while the cgroup is still populated, causing rmdir to fail with -EBUSY. Fix this by making cgroup_rmdir() wait for dying tasks to fully leave. If the cgroup is populated but all remaining tasks have PF_EXITING set (the task iterator returns none due to the existing filter), wait for a kick from cgroup_task_dead() and retry. The wait is brief as tasks are removed from the cgroup's css_set between the PF_EXITING assertion in do_exit() and cgroup_task_dead() in finish_task_switch(). v2: the cgroup_is_populated() true to false transition happens under css_set_lock not cgroup_mutex, so retest under css_set_lock before sleeping to avoid missed wakeups (Sebastian). Fixes: a72f73c4dd ("cgroup: Don't expose dead tasks in cgroup") Reported-by: kernel test robot <oliver.sang@intel.com> Closes: https://lore.kernel.org/oe-lkp/202603222104.2c81684e-lkp@intel.com Reported-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> Cc: Bert Karwatzki <spasswolf@web.de> Cc: Michal Koutny <mkoutny@suse.com> Cc: cgroups@vger.kernel.org
This commit is contained in:
parent
a72f73c4dd
commit
1b164b876c
|
|
@ -609,6 +609,9 @@ struct cgroup {
|
|||
/* used to wait for offlining of csses */
|
||||
wait_queue_head_t offline_waitq;
|
||||
|
||||
/* used by cgroup_rmdir() to wait for dying tasks to leave */
|
||||
wait_queue_head_t dying_populated_waitq;
|
||||
|
||||
/* used to schedule release agent */
|
||||
struct work_struct release_agent_work;
|
||||
|
||||
|
|
|
|||
|
|
@ -2126,6 +2126,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
|
|||
#endif
|
||||
|
||||
init_waitqueue_head(&cgrp->offline_waitq);
|
||||
init_waitqueue_head(&cgrp->dying_populated_waitq);
|
||||
INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
|
||||
}
|
||||
|
||||
|
|
@ -6224,6 +6225,76 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
|
|||
return 0;
|
||||
};
|
||||
|
||||
/**
|
||||
* cgroup_drain_dying - wait for dying tasks to leave before rmdir
|
||||
* @cgrp: the cgroup being removed
|
||||
*
|
||||
* The PF_EXITING filter in css_task_iter_advance() hides exiting tasks from
|
||||
* cgroup.procs so that userspace (e.g. systemd) doesn't see tasks that have
|
||||
* already been reaped via waitpid(). However, the populated counter
|
||||
* (nr_populated_csets) is only decremented when the task later passes through
|
||||
* cgroup_task_dead() in finish_task_switch(). This creates a window where
|
||||
* cgroup.procs appears empty but cgroup_is_populated() is still true, causing
|
||||
* rmdir to fail with -EBUSY.
|
||||
*
|
||||
* This function bridges that gap. If the cgroup is populated but all remaining
|
||||
* tasks have PF_EXITING set, we wait for cgroup_task_dead() to process them.
|
||||
* Tasks are removed from the cgroup's css_set in cgroup_task_dead() called from
|
||||
* finish_task_switch(). As the window between PF_EXITING and cgroup_task_dead()
|
||||
* is short, the number of PF_EXITING tasks on the list is small and the wait
|
||||
* is brief.
|
||||
*
|
||||
* Each cgroup_task_dead() kicks the waitqueue via cset->cgrp_links, and we
|
||||
* retry the full check from scratch.
|
||||
*
|
||||
* Must be called with cgroup_mutex held.
|
||||
*/
|
||||
static int cgroup_drain_dying(struct cgroup *cgrp)
|
||||
__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
|
||||
{
|
||||
struct css_task_iter it;
|
||||
struct task_struct *task;
|
||||
DEFINE_WAIT(wait);
|
||||
|
||||
lockdep_assert_held(&cgroup_mutex);
|
||||
retry:
|
||||
/* not populated at all - nothing to drain, rmdir may proceed */
if (!cgroup_is_populated(cgrp))
|
||||
return 0;
|
||||
|
||||
/* Same iterator as cgroup.threads - if any task is visible, it's busy */
|
||||
css_task_iter_start(&cgrp->self, 0, &it);
|
||||
task = css_task_iter_next(&it);
|
||||
css_task_iter_end(&it);
|
||||
|
||||
/*
 * A task survived the iterator's PF_EXITING filter, i.e. a live
 * task is still attached - rmdir must fail.
 */
if (task)
|
||||
return -EBUSY;
|
||||
|
||||
/*
|
||||
* All remaining tasks are PF_EXITING and will pass through
|
||||
* cgroup_task_dead() shortly. Wait for a kick and retry.
|
||||
*
|
||||
* cgroup_is_populated() can't transition from false to true while
|
||||
* we're holding cgroup_mutex, but the true to false transition
|
||||
* happens under css_set_lock (via cgroup_task_dead()). We must
|
||||
* retest and prepare_to_wait() under css_set_lock. Otherwise, the
|
||||
* transition can happen between our first test and
|
||||
* prepare_to_wait(), and we sleep with no one to wake us.
|
||||
*/
|
||||
spin_lock_irq(&css_set_lock);
|
||||
if (!cgroup_is_populated(cgrp)) {
|
||||
spin_unlock_irq(&css_set_lock);
|
||||
return 0;
|
||||
}
|
||||
prepare_to_wait(&cgrp->dying_populated_waitq, &wait,
|
||||
TASK_UNINTERRUPTIBLE);
|
||||
/*
 * Queued on the waitqueue with task state set; a wake_up() from
 * cgroup_task_dead() from here on makes schedule() return promptly.
 * Drop cgroup_mutex so we don't block other cgroup operations while
 * sleeping.
 */
spin_unlock_irq(&css_set_lock);
|
||||
mutex_unlock(&cgroup_mutex);
|
||||
schedule();
|
||||
finish_wait(&cgrp->dying_populated_waitq, &wait);
|
||||
mutex_lock(&cgroup_mutex);
|
||||
/* kicked by cgroup_task_dead() - re-evaluate everything from scratch */
goto retry;
|
||||
}
|
||||
|
||||
int cgroup_rmdir(struct kernfs_node *kn)
|
||||
{
|
||||
struct cgroup *cgrp;
|
||||
|
|
@ -6233,9 +6304,12 @@ int cgroup_rmdir(struct kernfs_node *kn)
|
|||
if (!cgrp)
|
||||
return 0;
|
||||
|
||||
ret = cgroup_destroy_locked(cgrp);
|
||||
if (!ret)
|
||||
TRACE_CGROUP_PATH(rmdir, cgrp);
|
||||
ret = cgroup_drain_dying(cgrp);
|
||||
if (!ret) {
|
||||
ret = cgroup_destroy_locked(cgrp);
|
||||
if (!ret)
|
||||
TRACE_CGROUP_PATH(rmdir, cgrp);
|
||||
}
|
||||
|
||||
cgroup_kn_unlock(kn);
|
||||
return ret;
|
||||
|
|
@ -6995,6 +7069,7 @@ void cgroup_task_exit(struct task_struct *tsk)
|
|||
|
||||
static void do_cgroup_task_dead(struct task_struct *tsk)
|
||||
{
|
||||
struct cgrp_cset_link *link;
|
||||
struct css_set *cset;
|
||||
unsigned long flags;
|
||||
|
||||
|
|
@ -7008,6 +7083,11 @@ static void do_cgroup_task_dead(struct task_struct *tsk)
|
|||
if (thread_group_leader(tsk) && atomic_read(&tsk->signal->live))
|
||||
list_add_tail(&tsk->cg_list, &cset->dying_tasks);
|
||||
|
||||
/* kick cgroup_drain_dying() waiters, see cgroup_rmdir() */
|
||||
list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
|
||||
if (waitqueue_active(&link->cgrp->dying_populated_waitq))
|
||||
wake_up(&link->cgrp->dying_populated_waitq);
|
||||
|
||||
if (dl_task(tsk))
|
||||
dec_dl_tasks_cs(tsk);
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue