1178 lines
28 KiB
C
1178 lines
28 KiB
C
// SPDX-License-Identifier: GPL-2.0-or-later
|
|
/*
|
|
* Copyright (c) 2024-2026 Oracle. All Rights Reserved.
|
|
* Author: Darrick J. Wong <djwong@kernel.org>
|
|
*/
|
|
#include "xfs.h"
|
|
#include "xfs_fs.h"
|
|
#include "xfs_shared.h"
|
|
#include "xfs_format.h"
|
|
#include "xfs_log_format.h"
|
|
#include "xfs_trans_resv.h"
|
|
#include "xfs_mount.h"
|
|
#include "xfs_inode.h"
|
|
#include "xfs_trace.h"
|
|
#include "xfs_ag.h"
|
|
#include "xfs_btree.h"
|
|
#include "xfs_da_format.h"
|
|
#include "xfs_da_btree.h"
|
|
#include "xfs_quota_defs.h"
|
|
#include "xfs_rtgroup.h"
|
|
#include "xfs_health.h"
|
|
#include "xfs_healthmon.h"
|
|
#include "xfs_fsops.h"
|
|
#include "xfs_notify_failure.h"
|
|
#include "xfs_file.h"
|
|
|
|
#include <linux/anon_inodes.h>
|
|
#include <linux/eventpoll.h>
|
|
#include <linux/poll.h>
|
|
#include <linux/fserror.h>
|
|
|
|
/*
|
|
* Live Health Monitoring
|
|
* ======================
|
|
*
|
|
* Autonomous self-healing of XFS filesystems requires a means for the kernel
|
|
* to send filesystem health events to a monitoring daemon in userspace. To
|
|
* accomplish this, we establish a thread_with_file kthread object to handle
|
|
* translating internal events about filesystem health into a format that can
|
|
* be parsed easily by userspace. When those internal events occur, the core
|
|
* filesystem code calls this health monitor to convey the events to userspace.
|
|
* Userspace reads events from the file descriptor returned by the ioctl.
|
|
*
|
|
* The healthmon abstraction has a weak reference to the host filesystem mount
|
|
* so that the queueing and processing of the events do not pin the mount and
|
|
* cannot slow down the main filesystem. The healthmon object can exist past
|
|
* the end of the filesystem mount.
|
|
*/
|
|
|
|
/*
 * Value held in xfs_healthmon::mount_cookie when the monitor is not (or no
 * longer) attached to a mount.
 */
#define DETACHED_MOUNT_COOKIE		((uintptr_t)0)

/*
 * Upper bound on the number of event objects allowed to sit in the queue:
 * as many as fit in 32K.
 */
#define XFS_HEALTHMON_MAX_EVENTS \
	(SZ_32K / sizeof(struct xfs_healthmon_event))

/* Upper bound on the size of the output buffer used by read_iter. */
#define XFS_HEALTHMON_MAX_OUTBUF	SZ_64K

/* Spinlock for atomically updating the xfs_mount <-> xfs_healthmon pointers. */
static DEFINE_SPINLOCK(xfs_healthmon_lock);
|
|
|
|
/* Grab a reference to the healthmon object for a given mount, if any. */
|
|
static struct xfs_healthmon *
|
|
xfs_healthmon_get(
|
|
struct xfs_mount *mp)
|
|
{
|
|
struct xfs_healthmon *hm;
|
|
|
|
rcu_read_lock();
|
|
hm = mp->m_healthmon;
|
|
if (hm && !refcount_inc_not_zero(&hm->ref))
|
|
hm = NULL;
|
|
rcu_read_unlock();
|
|
|
|
return hm;
|
|
}
|
|
|
|
/*
|
|
* Release the reference to a healthmon object. If there are no more holders,
|
|
* free the health monitor after an RCU grace period to eliminate possibility
|
|
* of races with xfs_healthmon_get.
|
|
*/
|
|
static void
|
|
xfs_healthmon_put(
|
|
struct xfs_healthmon *hm)
|
|
{
|
|
if (refcount_dec_and_test(&hm->ref)) {
|
|
struct xfs_healthmon_event *event;
|
|
struct xfs_healthmon_event *next = hm->first_event;
|
|
|
|
while ((event = next) != NULL) {
|
|
trace_xfs_healthmon_drop(hm, event);
|
|
next = event->next;
|
|
kfree(event);
|
|
}
|
|
|
|
kfree(hm->unmount_event);
|
|
kfree(hm->buffer);
|
|
mutex_destroy(&hm->lock);
|
|
kfree_rcu_mightsleep(hm);
|
|
}
|
|
}
|
|
|
|
/* Attach a health monitor to an xfs_mount. Only one allowed at a time. */
|
|
STATIC int
|
|
xfs_healthmon_attach(
|
|
struct xfs_mount *mp,
|
|
struct xfs_healthmon *hm)
|
|
{
|
|
spin_lock(&xfs_healthmon_lock);
|
|
if (mp->m_healthmon != NULL) {
|
|
spin_unlock(&xfs_healthmon_lock);
|
|
return -EEXIST;
|
|
}
|
|
|
|
refcount_inc(&hm->ref);
|
|
mp->m_healthmon = hm;
|
|
hm->mount_cookie = (uintptr_t)mp->m_super;
|
|
spin_unlock(&xfs_healthmon_lock);
|
|
|
|
return 0;
|
|
}
|
|
|
|
/* Detach a xfs mount from a specific healthmon instance. */
|
|
STATIC void
|
|
xfs_healthmon_detach(
|
|
struct xfs_healthmon *hm)
|
|
{
|
|
spin_lock(&xfs_healthmon_lock);
|
|
if (hm->mount_cookie == DETACHED_MOUNT_COOKIE) {
|
|
spin_unlock(&xfs_healthmon_lock);
|
|
return;
|
|
}
|
|
|
|
XFS_M((struct super_block *)hm->mount_cookie)->m_healthmon = NULL;
|
|
hm->mount_cookie = DETACHED_MOUNT_COOKIE;
|
|
spin_unlock(&xfs_healthmon_lock);
|
|
|
|
trace_xfs_healthmon_detach(hm);
|
|
xfs_healthmon_put(hm);
|
|
}
|
|
|
|
static inline void xfs_healthmon_bump_events(struct xfs_healthmon *hm)
|
|
{
|
|
hm->events++;
|
|
hm->total_events++;
|
|
}
|
|
|
|
static inline void xfs_healthmon_bump_lost(struct xfs_healthmon *hm)
|
|
{
|
|
hm->lost_prev_event++;
|
|
hm->total_lost++;
|
|
}
|
|
|
|
/*
|
|
* If possible, merge a new event into an existing event. Returns whether or
|
|
* not it merged anything.
|
|
*/
|
|
static bool
|
|
xfs_healthmon_merge_events(
|
|
struct xfs_healthmon_event *existing,
|
|
const struct xfs_healthmon_event *new)
|
|
{
|
|
if (!existing)
|
|
return false;
|
|
|
|
/* type and domain must match to merge events */
|
|
if (existing->type != new->type ||
|
|
existing->domain != new->domain)
|
|
return false;
|
|
|
|
switch (existing->type) {
|
|
case XFS_HEALTHMON_RUNNING:
|
|
case XFS_HEALTHMON_UNMOUNT:
|
|
/* should only ever be one of these events anyway */
|
|
return false;
|
|
|
|
case XFS_HEALTHMON_LOST:
|
|
existing->lostcount += new->lostcount;
|
|
return true;
|
|
|
|
case XFS_HEALTHMON_SICK:
|
|
case XFS_HEALTHMON_CORRUPT:
|
|
case XFS_HEALTHMON_HEALTHY:
|
|
switch (existing->domain) {
|
|
case XFS_HEALTHMON_FS:
|
|
existing->fsmask |= new->fsmask;
|
|
return true;
|
|
case XFS_HEALTHMON_AG:
|
|
case XFS_HEALTHMON_RTGROUP:
|
|
if (existing->group == new->group){
|
|
existing->grpmask |= new->grpmask;
|
|
return true;
|
|
}
|
|
return false;
|
|
case XFS_HEALTHMON_INODE:
|
|
if (existing->ino == new->ino &&
|
|
existing->gen == new->gen) {
|
|
existing->imask |= new->imask;
|
|
return true;
|
|
}
|
|
return false;
|
|
default:
|
|
ASSERT(0);
|
|
return false;
|
|
}
|
|
return false;
|
|
|
|
case XFS_HEALTHMON_SHUTDOWN:
|
|
/* yes, we can race to shutdown */
|
|
existing->flags |= new->flags;
|
|
return true;
|
|
|
|
case XFS_HEALTHMON_MEDIA_ERROR:
|
|
/* physically adjacent errors can merge */
|
|
if (existing->daddr + existing->bbcount == new->daddr) {
|
|
existing->bbcount += new->bbcount;
|
|
return true;
|
|
}
|
|
if (new->daddr + new->bbcount == existing->daddr) {
|
|
existing->daddr = new->daddr;
|
|
existing->bbcount += new->bbcount;
|
|
return true;
|
|
}
|
|
return false;
|
|
|
|
case XFS_HEALTHMON_BUFREAD:
|
|
case XFS_HEALTHMON_BUFWRITE:
|
|
case XFS_HEALTHMON_DIOREAD:
|
|
case XFS_HEALTHMON_DIOWRITE:
|
|
case XFS_HEALTHMON_DATALOST:
|
|
/* logically adjacent file ranges can merge */
|
|
if (existing->fino != new->fino || existing->fgen != new->fgen)
|
|
return false;
|
|
|
|
if (existing->fpos + existing->flen == new->fpos) {
|
|
existing->flen += new->flen;
|
|
return true;
|
|
}
|
|
|
|
if (new->fpos + new->flen == existing->fpos) {
|
|
existing->fpos = new->fpos;
|
|
existing->flen += new->flen;
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
/* Insert an event onto the start of the queue. */
|
|
static inline void
|
|
__xfs_healthmon_insert(
|
|
struct xfs_healthmon *hm,
|
|
struct xfs_healthmon_event *event)
|
|
{
|
|
struct timespec64 now;
|
|
|
|
ktime_get_coarse_real_ts64(&now);
|
|
event->time_ns = (now.tv_sec * NSEC_PER_SEC) + now.tv_nsec;
|
|
|
|
event->next = hm->first_event;
|
|
if (!hm->first_event)
|
|
hm->first_event = event;
|
|
if (!hm->last_event)
|
|
hm->last_event = event;
|
|
xfs_healthmon_bump_events(hm);
|
|
wake_up(&hm->wait);
|
|
|
|
trace_xfs_healthmon_insert(hm, event);
|
|
}
|
|
|
|
/* Push an event onto the end of the queue. */
|
|
static inline void
|
|
__xfs_healthmon_push(
|
|
struct xfs_healthmon *hm,
|
|
struct xfs_healthmon_event *event)
|
|
{
|
|
struct timespec64 now;
|
|
|
|
ktime_get_coarse_real_ts64(&now);
|
|
event->time_ns = (now.tv_sec * NSEC_PER_SEC) + now.tv_nsec;
|
|
|
|
if (!hm->first_event)
|
|
hm->first_event = event;
|
|
if (hm->last_event)
|
|
hm->last_event->next = event;
|
|
hm->last_event = event;
|
|
event->next = NULL;
|
|
xfs_healthmon_bump_events(hm);
|
|
wake_up(&hm->wait);
|
|
|
|
trace_xfs_healthmon_push(hm, event);
|
|
}
|
|
|
|
/* Deal with any previously lost events */
|
|
static int
|
|
xfs_healthmon_clear_lost_prev(
|
|
struct xfs_healthmon *hm)
|
|
{
|
|
struct xfs_healthmon_event lost_event = {
|
|
.type = XFS_HEALTHMON_LOST,
|
|
.domain = XFS_HEALTHMON_MOUNT,
|
|
.lostcount = hm->lost_prev_event,
|
|
};
|
|
struct xfs_healthmon_event *event = NULL;
|
|
|
|
if (xfs_healthmon_merge_events(hm->last_event, &lost_event)) {
|
|
trace_xfs_healthmon_merge(hm, hm->last_event);
|
|
wake_up(&hm->wait);
|
|
goto cleared;
|
|
}
|
|
|
|
if (hm->events < XFS_HEALTHMON_MAX_EVENTS)
|
|
event = kmemdup(&lost_event, sizeof(struct xfs_healthmon_event),
|
|
GFP_NOFS);
|
|
if (!event)
|
|
return -ENOMEM;
|
|
|
|
__xfs_healthmon_push(hm, event);
|
|
cleared:
|
|
hm->lost_prev_event = 0;
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* Push an event onto the end of the list after dealing with lost events and
|
|
* possibly full queues.
|
|
*/
|
|
STATIC int
|
|
xfs_healthmon_push(
|
|
struct xfs_healthmon *hm,
|
|
const struct xfs_healthmon_event *template)
|
|
{
|
|
struct xfs_healthmon_event *event = NULL;
|
|
int error = 0;
|
|
|
|
/*
|
|
* Locklessly check if the health monitor has already detached from the
|
|
* mount. If so, ignore the event. If we race with deactivation,
|
|
* we'll queue the event but never send it.
|
|
*/
|
|
if (hm->mount_cookie == DETACHED_MOUNT_COOKIE)
|
|
return -ESHUTDOWN;
|
|
|
|
mutex_lock(&hm->lock);
|
|
|
|
/* Report previously lost events before we do anything else */
|
|
if (hm->lost_prev_event) {
|
|
error = xfs_healthmon_clear_lost_prev(hm);
|
|
if (error)
|
|
goto out_unlock;
|
|
}
|
|
|
|
/* Try to merge with the newest event */
|
|
if (xfs_healthmon_merge_events(hm->last_event, template)) {
|
|
trace_xfs_healthmon_merge(hm, hm->last_event);
|
|
wake_up(&hm->wait);
|
|
goto out_unlock;
|
|
}
|
|
|
|
/* Only create a heap event object if we're not already at capacity. */
|
|
if (hm->events < XFS_HEALTHMON_MAX_EVENTS)
|
|
event = kmemdup(template, sizeof(struct xfs_healthmon_event),
|
|
GFP_NOFS);
|
|
if (!event) {
|
|
/* No memory means we lose the event */
|
|
trace_xfs_healthmon_lost_event(hm);
|
|
xfs_healthmon_bump_lost(hm);
|
|
error = -ENOMEM;
|
|
goto out_unlock;
|
|
}
|
|
|
|
__xfs_healthmon_push(hm, event);
|
|
|
|
out_unlock:
|
|
mutex_unlock(&hm->lock);
|
|
return error;
|
|
}
|
|
|
|
/*
|
|
* Report that the filesystem is being unmounted, then detach the xfs mount
|
|
* from this healthmon instance.
|
|
*/
|
|
void
|
|
xfs_healthmon_unmount(
|
|
struct xfs_mount *mp)
|
|
{
|
|
struct xfs_healthmon *hm = xfs_healthmon_get(mp);
|
|
|
|
if (!hm)
|
|
return;
|
|
|
|
trace_xfs_healthmon_report_unmount(hm);
|
|
|
|
/*
|
|
* Insert the unmount notification at the start of the event queue so
|
|
* that userspace knows the filesystem went away as soon as possible.
|
|
* There's nothing actionable for userspace after an unmount. Once
|
|
* we've inserted the unmount event, hm no longer owns that event.
|
|
*/
|
|
__xfs_healthmon_insert(hm, hm->unmount_event);
|
|
hm->unmount_event = NULL;
|
|
|
|
xfs_healthmon_detach(hm);
|
|
xfs_healthmon_put(hm);
|
|
}
|
|
|
|
/* Compute the reporting mask for non-unmount metadata health events. */
|
|
static inline unsigned int
|
|
metadata_event_mask(
|
|
struct xfs_healthmon *hm,
|
|
enum xfs_healthmon_type type,
|
|
unsigned int old_mask,
|
|
unsigned int new_mask)
|
|
{
|
|
/* If we want all events, return all events. */
|
|
if (hm->verbose)
|
|
return new_mask;
|
|
|
|
switch (type) {
|
|
case XFS_HEALTHMON_SICK:
|
|
/* Always report runtime corruptions */
|
|
return new_mask;
|
|
case XFS_HEALTHMON_CORRUPT:
|
|
/* Only report new fsck errors */
|
|
return new_mask & ~old_mask;
|
|
case XFS_HEALTHMON_HEALTHY:
|
|
/* Only report healthy metadata that got fixed */
|
|
return new_mask & old_mask;
|
|
default:
|
|
ASSERT(0);
|
|
break;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
/* Report XFS_FS_SICK_* events to healthmon */
|
|
void
|
|
xfs_healthmon_report_fs(
|
|
struct xfs_mount *mp,
|
|
enum xfs_healthmon_type type,
|
|
unsigned int old_mask,
|
|
unsigned int new_mask)
|
|
{
|
|
struct xfs_healthmon_event event = {
|
|
.type = type,
|
|
.domain = XFS_HEALTHMON_FS,
|
|
};
|
|
struct xfs_healthmon *hm = xfs_healthmon_get(mp);
|
|
|
|
if (!hm)
|
|
return;
|
|
|
|
event.fsmask = metadata_event_mask(hm, type, old_mask, new_mask) &
|
|
~XFS_SICK_FS_SECONDARY;
|
|
trace_xfs_healthmon_report_fs(hm, old_mask, new_mask, &event);
|
|
|
|
if (event.fsmask)
|
|
xfs_healthmon_push(hm, &event);
|
|
|
|
xfs_healthmon_put(hm);
|
|
}
|
|
|
|
/* Report XFS_SICK_(AG|RG)* flags to healthmon */
|
|
void
|
|
xfs_healthmon_report_group(
|
|
struct xfs_group *xg,
|
|
enum xfs_healthmon_type type,
|
|
unsigned int old_mask,
|
|
unsigned int new_mask)
|
|
{
|
|
struct xfs_healthmon_event event = {
|
|
.type = type,
|
|
.group = xg->xg_gno,
|
|
};
|
|
struct xfs_healthmon *hm = xfs_healthmon_get(xg->xg_mount);
|
|
|
|
if (!hm)
|
|
return;
|
|
|
|
switch (xg->xg_type) {
|
|
case XG_TYPE_RTG:
|
|
event.domain = XFS_HEALTHMON_RTGROUP;
|
|
event.grpmask = metadata_event_mask(hm, type, old_mask,
|
|
new_mask) &
|
|
~XFS_SICK_RG_SECONDARY;
|
|
break;
|
|
case XG_TYPE_AG:
|
|
event.domain = XFS_HEALTHMON_AG;
|
|
event.grpmask = metadata_event_mask(hm, type, old_mask,
|
|
new_mask) &
|
|
~XFS_SICK_AG_SECONDARY;
|
|
break;
|
|
default:
|
|
ASSERT(0);
|
|
break;
|
|
}
|
|
|
|
trace_xfs_healthmon_report_group(hm, old_mask, new_mask, &event);
|
|
|
|
if (event.grpmask)
|
|
xfs_healthmon_push(hm, &event);
|
|
|
|
xfs_healthmon_put(hm);
|
|
}
|
|
|
|
/* Report XFS_SICK_INO_* flags to healthmon */
|
|
void
|
|
xfs_healthmon_report_inode(
|
|
struct xfs_inode *ip,
|
|
enum xfs_healthmon_type type,
|
|
unsigned int old_mask,
|
|
unsigned int new_mask)
|
|
{
|
|
struct xfs_healthmon_event event = {
|
|
.type = type,
|
|
.domain = XFS_HEALTHMON_INODE,
|
|
.ino = ip->i_ino,
|
|
.gen = VFS_I(ip)->i_generation,
|
|
};
|
|
struct xfs_healthmon *hm = xfs_healthmon_get(ip->i_mount);
|
|
|
|
if (!hm)
|
|
return;
|
|
|
|
event.imask = metadata_event_mask(hm, type, old_mask, new_mask) &
|
|
~XFS_SICK_INO_SECONDARY;
|
|
trace_xfs_healthmon_report_inode(hm, old_mask, event.imask, &event);
|
|
|
|
if (event.imask)
|
|
xfs_healthmon_push(hm, &event);
|
|
|
|
xfs_healthmon_put(hm);
|
|
}
|
|
|
|
/* Add a shutdown event to the reporting queue. */
|
|
void
|
|
xfs_healthmon_report_shutdown(
|
|
struct xfs_mount *mp,
|
|
uint32_t flags)
|
|
{
|
|
struct xfs_healthmon_event event = {
|
|
.type = XFS_HEALTHMON_SHUTDOWN,
|
|
.domain = XFS_HEALTHMON_MOUNT,
|
|
.flags = flags,
|
|
};
|
|
struct xfs_healthmon *hm = xfs_healthmon_get(mp);
|
|
|
|
if (!hm)
|
|
return;
|
|
|
|
trace_xfs_healthmon_report_shutdown(hm, flags);
|
|
|
|
xfs_healthmon_push(hm, &event);
|
|
xfs_healthmon_put(hm);
|
|
}
|
|
|
|
static inline enum xfs_healthmon_domain
|
|
media_error_domain(
|
|
enum xfs_device fdev)
|
|
{
|
|
switch (fdev) {
|
|
case XFS_DEV_DATA:
|
|
return XFS_HEALTHMON_DATADEV;
|
|
case XFS_DEV_LOG:
|
|
return XFS_HEALTHMON_LOGDEV;
|
|
case XFS_DEV_RT:
|
|
return XFS_HEALTHMON_RTDEV;
|
|
}
|
|
|
|
ASSERT(0);
|
|
return 0;
|
|
}
|
|
|
|
/* Add a media error event to the reporting queue. */
|
|
void
|
|
xfs_healthmon_report_media(
|
|
struct xfs_mount *mp,
|
|
enum xfs_device fdev,
|
|
xfs_daddr_t daddr,
|
|
uint64_t bbcount)
|
|
{
|
|
struct xfs_healthmon_event event = {
|
|
.type = XFS_HEALTHMON_MEDIA_ERROR,
|
|
.domain = media_error_domain(fdev),
|
|
.daddr = daddr,
|
|
.bbcount = bbcount,
|
|
};
|
|
struct xfs_healthmon *hm = xfs_healthmon_get(mp);
|
|
|
|
if (!hm)
|
|
return;
|
|
|
|
trace_xfs_healthmon_report_media(hm, fdev, &event);
|
|
|
|
xfs_healthmon_push(hm, &event);
|
|
xfs_healthmon_put(hm);
|
|
}
|
|
|
|
static inline enum xfs_healthmon_type file_ioerr_type(enum fserror_type action)
|
|
{
|
|
switch (action) {
|
|
case FSERR_BUFFERED_READ:
|
|
return XFS_HEALTHMON_BUFREAD;
|
|
case FSERR_BUFFERED_WRITE:
|
|
return XFS_HEALTHMON_BUFWRITE;
|
|
case FSERR_DIRECTIO_READ:
|
|
return XFS_HEALTHMON_DIOREAD;
|
|
case FSERR_DIRECTIO_WRITE:
|
|
return XFS_HEALTHMON_DIOWRITE;
|
|
case FSERR_DATA_LOST:
|
|
return XFS_HEALTHMON_DATALOST;
|
|
case FSERR_METADATA:
|
|
/* filtered out by xfs_fs_report_error */
|
|
break;
|
|
}
|
|
|
|
ASSERT(0);
|
|
return -1;
|
|
}
|
|
|
|
/* Add a file io error event to the reporting queue. */
|
|
void
|
|
xfs_healthmon_report_file_ioerror(
|
|
struct xfs_inode *ip,
|
|
const struct fserror_event *p)
|
|
{
|
|
struct xfs_healthmon_event event = {
|
|
.type = file_ioerr_type(p->type),
|
|
.domain = XFS_HEALTHMON_FILERANGE,
|
|
.fino = ip->i_ino,
|
|
.fgen = VFS_I(ip)->i_generation,
|
|
.fpos = p->pos,
|
|
.flen = p->len,
|
|
/* send positive error number to userspace */
|
|
.error = -p->error,
|
|
};
|
|
struct xfs_healthmon *hm = xfs_healthmon_get(ip->i_mount);
|
|
|
|
if (!hm)
|
|
return;
|
|
|
|
trace_xfs_healthmon_report_file_ioerror(hm, p);
|
|
|
|
xfs_healthmon_push(hm, &event);
|
|
xfs_healthmon_put(hm);
|
|
}
|
|
|
|
static inline void
|
|
xfs_healthmon_reset_outbuf(
|
|
struct xfs_healthmon *hm)
|
|
{
|
|
hm->buftail = 0;
|
|
hm->bufhead = 0;
|
|
}
|
|
|
|
/* One entry of a table that translates bits of one flag word into another. */
struct flags_map {
	/* bit(s) to look for in the input flags */
	unsigned int		in_mask;
	/* bit(s) to set in the output when any in_mask bit is present */
	unsigned int		out_mask;
};
|
|
|
|
static const struct flags_map shutdown_map[] = {
|
|
{ SHUTDOWN_META_IO_ERROR, XFS_HEALTH_SHUTDOWN_META_IO_ERROR },
|
|
{ SHUTDOWN_LOG_IO_ERROR, XFS_HEALTH_SHUTDOWN_LOG_IO_ERROR },
|
|
{ SHUTDOWN_FORCE_UMOUNT, XFS_HEALTH_SHUTDOWN_FORCE_UMOUNT },
|
|
{ SHUTDOWN_CORRUPT_INCORE, XFS_HEALTH_SHUTDOWN_CORRUPT_INCORE },
|
|
{ SHUTDOWN_CORRUPT_ONDISK, XFS_HEALTH_SHUTDOWN_CORRUPT_ONDISK },
|
|
{ SHUTDOWN_DEVICE_REMOVED, XFS_HEALTH_SHUTDOWN_DEVICE_REMOVED },
|
|
};
|
|
|
|
static inline unsigned int
|
|
__map_flags(
|
|
const struct flags_map *map,
|
|
size_t array_len,
|
|
unsigned int flags)
|
|
{
|
|
const struct flags_map *m;
|
|
unsigned int ret = 0;
|
|
|
|
for (m = map; m < map + array_len; m++) {
|
|
if (flags & m->in_mask)
|
|
ret |= m->out_mask;
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
/* Translate flags through a flags_map array; @map must be a real array. */
#define map_flags(map, flags) \
	__map_flags((map), ARRAY_SIZE(map), (flags))
|
|
|
|
static inline unsigned int shutdown_mask(unsigned int in)
|
|
{
|
|
return map_flags(shutdown_map, in);
|
|
}
|
|
|
|
static const unsigned int domain_map[] = {
|
|
[XFS_HEALTHMON_MOUNT] = XFS_HEALTH_MONITOR_DOMAIN_MOUNT,
|
|
[XFS_HEALTHMON_FS] = XFS_HEALTH_MONITOR_DOMAIN_FS,
|
|
[XFS_HEALTHMON_AG] = XFS_HEALTH_MONITOR_DOMAIN_AG,
|
|
[XFS_HEALTHMON_INODE] = XFS_HEALTH_MONITOR_DOMAIN_INODE,
|
|
[XFS_HEALTHMON_RTGROUP] = XFS_HEALTH_MONITOR_DOMAIN_RTGROUP,
|
|
[XFS_HEALTHMON_DATADEV] = XFS_HEALTH_MONITOR_DOMAIN_DATADEV,
|
|
[XFS_HEALTHMON_RTDEV] = XFS_HEALTH_MONITOR_DOMAIN_RTDEV,
|
|
[XFS_HEALTHMON_LOGDEV] = XFS_HEALTH_MONITOR_DOMAIN_LOGDEV,
|
|
[XFS_HEALTHMON_FILERANGE] = XFS_HEALTH_MONITOR_DOMAIN_FILERANGE,
|
|
};
|
|
|
|
static const unsigned int type_map[] = {
|
|
[XFS_HEALTHMON_RUNNING] = XFS_HEALTH_MONITOR_TYPE_RUNNING,
|
|
[XFS_HEALTHMON_LOST] = XFS_HEALTH_MONITOR_TYPE_LOST,
|
|
[XFS_HEALTHMON_SICK] = XFS_HEALTH_MONITOR_TYPE_SICK,
|
|
[XFS_HEALTHMON_CORRUPT] = XFS_HEALTH_MONITOR_TYPE_CORRUPT,
|
|
[XFS_HEALTHMON_HEALTHY] = XFS_HEALTH_MONITOR_TYPE_HEALTHY,
|
|
[XFS_HEALTHMON_UNMOUNT] = XFS_HEALTH_MONITOR_TYPE_UNMOUNT,
|
|
[XFS_HEALTHMON_SHUTDOWN] = XFS_HEALTH_MONITOR_TYPE_SHUTDOWN,
|
|
[XFS_HEALTHMON_MEDIA_ERROR] = XFS_HEALTH_MONITOR_TYPE_MEDIA_ERROR,
|
|
[XFS_HEALTHMON_BUFREAD] = XFS_HEALTH_MONITOR_TYPE_BUFREAD,
|
|
[XFS_HEALTHMON_BUFWRITE] = XFS_HEALTH_MONITOR_TYPE_BUFWRITE,
|
|
[XFS_HEALTHMON_DIOREAD] = XFS_HEALTH_MONITOR_TYPE_DIOREAD,
|
|
[XFS_HEALTHMON_DIOWRITE] = XFS_HEALTH_MONITOR_TYPE_DIOWRITE,
|
|
[XFS_HEALTHMON_DATALOST] = XFS_HEALTH_MONITOR_TYPE_DATALOST,
|
|
};
|
|
|
|
/* Render event as a V0 structure */
|
|
STATIC int
|
|
xfs_healthmon_format_v0(
|
|
struct xfs_healthmon *hm,
|
|
const struct xfs_healthmon_event *event)
|
|
{
|
|
struct xfs_health_monitor_event hme = {
|
|
.time_ns = event->time_ns,
|
|
};
|
|
|
|
trace_xfs_healthmon_format(hm, event);
|
|
|
|
if (event->domain < 0 || event->domain >= ARRAY_SIZE(domain_map) ||
|
|
event->type < 0 || event->type >= ARRAY_SIZE(type_map))
|
|
return -EFSCORRUPTED;
|
|
|
|
hme.domain = domain_map[event->domain];
|
|
hme.type = type_map[event->type];
|
|
|
|
/* fill in the event-specific details */
|
|
switch (event->domain) {
|
|
case XFS_HEALTHMON_MOUNT:
|
|
switch (event->type) {
|
|
case XFS_HEALTHMON_LOST:
|
|
hme.e.lost.count = event->lostcount;
|
|
break;
|
|
case XFS_HEALTHMON_SHUTDOWN:
|
|
hme.e.shutdown.reasons = shutdown_mask(event->flags);
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
break;
|
|
case XFS_HEALTHMON_FS:
|
|
hme.e.fs.mask = xfs_healthmon_fs_mask(event->fsmask);
|
|
break;
|
|
case XFS_HEALTHMON_RTGROUP:
|
|
hme.e.group.mask = xfs_healthmon_rtgroup_mask(event->grpmask);
|
|
hme.e.group.gno = event->group;
|
|
break;
|
|
case XFS_HEALTHMON_AG:
|
|
hme.e.group.mask = xfs_healthmon_perag_mask(event->grpmask);
|
|
hme.e.group.gno = event->group;
|
|
break;
|
|
case XFS_HEALTHMON_INODE:
|
|
hme.e.inode.mask = xfs_healthmon_inode_mask(event->imask);
|
|
hme.e.inode.ino = event->ino;
|
|
hme.e.inode.gen = event->gen;
|
|
break;
|
|
case XFS_HEALTHMON_DATADEV:
|
|
case XFS_HEALTHMON_LOGDEV:
|
|
case XFS_HEALTHMON_RTDEV:
|
|
hme.e.media.daddr = event->daddr;
|
|
hme.e.media.bbcount = event->bbcount;
|
|
break;
|
|
case XFS_HEALTHMON_FILERANGE:
|
|
hme.e.filerange.ino = event->fino;
|
|
hme.e.filerange.gen = event->fgen;
|
|
hme.e.filerange.pos = event->fpos;
|
|
hme.e.filerange.len = event->flen;
|
|
hme.e.filerange.error = abs(event->error);
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
|
|
ASSERT(hm->bufhead + sizeof(hme) <= hm->bufsize);
|
|
|
|
/* copy formatted object to the outbuf */
|
|
if (hm->bufhead + sizeof(hme) <= hm->bufsize) {
|
|
memcpy(hm->buffer + hm->bufhead, &hme, sizeof(hme));
|
|
hm->bufhead += sizeof(hme);
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
/* How many bytes are waiting in the outbuf to be copied? */
|
|
static inline size_t
|
|
xfs_healthmon_outbuf_bytes(
|
|
struct xfs_healthmon *hm)
|
|
{
|
|
if (hm->bufhead > hm->buftail)
|
|
return hm->bufhead - hm->buftail;
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* Do we have something for userspace to read? This can mean unmount events,
|
|
* events pending in the queue, or pending bytes in the outbuf.
|
|
*/
|
|
static inline bool
|
|
xfs_healthmon_has_eventdata(
|
|
struct xfs_healthmon *hm)
|
|
{
|
|
/*
|
|
* If the health monitor is already detached from the xfs_mount, we
|
|
* want reads to return 0 bytes even if there are no events, because
|
|
* userspace interprets that as EOF. If we race with deactivation,
|
|
* read_iter will take the necessary locks to discover that there are
|
|
* no events to send.
|
|
*/
|
|
if (hm->mount_cookie == DETACHED_MOUNT_COOKIE)
|
|
return true;
|
|
|
|
/*
|
|
* Either there are events waiting to be formatted into the buffer, or
|
|
* there's unread bytes in the buffer.
|
|
*/
|
|
return hm->events > 0 || xfs_healthmon_outbuf_bytes(hm) > 0;
|
|
}
|
|
|
|
/* Try to copy the rest of the outbuf to the iov iter. */
|
|
STATIC ssize_t
|
|
xfs_healthmon_copybuf(
|
|
struct xfs_healthmon *hm,
|
|
struct iov_iter *to)
|
|
{
|
|
size_t to_copy;
|
|
size_t w = 0;
|
|
|
|
trace_xfs_healthmon_copybuf(hm, to);
|
|
|
|
to_copy = xfs_healthmon_outbuf_bytes(hm);
|
|
if (to_copy) {
|
|
w = copy_to_iter(hm->buffer + hm->buftail, to_copy, to);
|
|
if (!w)
|
|
return -EFAULT;
|
|
|
|
hm->buftail += w;
|
|
}
|
|
|
|
/*
|
|
* Nothing left to copy? Reset the output buffer cursors to the start
|
|
* since there's no live data in the buffer.
|
|
*/
|
|
if (xfs_healthmon_outbuf_bytes(hm) == 0)
|
|
xfs_healthmon_reset_outbuf(hm);
|
|
return w;
|
|
}
|
|
|
|
/*
|
|
* Return a health monitoring event for formatting into the output buffer if
|
|
* there's enough space in the outbuf and an event waiting for us. Caller
|
|
* must hold i_rwsem on the healthmon file.
|
|
*/
|
|
static inline struct xfs_healthmon_event *
|
|
xfs_healthmon_format_pop(
|
|
struct xfs_healthmon *hm)
|
|
{
|
|
struct xfs_healthmon_event *event;
|
|
|
|
if (hm->bufhead + sizeof(*event) > hm->bufsize)
|
|
return NULL;
|
|
|
|
mutex_lock(&hm->lock);
|
|
event = hm->first_event;
|
|
if (event) {
|
|
if (hm->last_event == event)
|
|
hm->last_event = NULL;
|
|
hm->first_event = event->next;
|
|
hm->events--;
|
|
|
|
trace_xfs_healthmon_pop(hm, event);
|
|
}
|
|
mutex_unlock(&hm->lock);
|
|
return event;
|
|
}
|
|
|
|
/* Allocate formatting buffer */
|
|
STATIC int
|
|
xfs_healthmon_alloc_outbuf(
|
|
struct xfs_healthmon *hm,
|
|
size_t user_bufsize)
|
|
{
|
|
void *outbuf;
|
|
size_t bufsize =
|
|
min(XFS_HEALTHMON_MAX_OUTBUF, max(PAGE_SIZE, user_bufsize));
|
|
|
|
outbuf = kzalloc(bufsize, GFP_KERNEL);
|
|
if (!outbuf) {
|
|
if (bufsize == PAGE_SIZE)
|
|
return -ENOMEM;
|
|
|
|
bufsize = PAGE_SIZE;
|
|
outbuf = kzalloc(bufsize, GFP_KERNEL);
|
|
if (!outbuf)
|
|
return -ENOMEM;
|
|
}
|
|
|
|
hm->buffer = outbuf;
|
|
hm->bufsize = bufsize;
|
|
hm->bufhead = 0;
|
|
hm->buftail = 0;
|
|
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* Convey queued event data to userspace. First copy any remaining bytes in
|
|
* the outbuf, then format the oldest event into the outbuf and copy that too.
|
|
*/
|
|
STATIC ssize_t
|
|
xfs_healthmon_read_iter(
|
|
struct kiocb *iocb,
|
|
struct iov_iter *to)
|
|
{
|
|
struct file *file = iocb->ki_filp;
|
|
struct inode *inode = file_inode(file);
|
|
struct xfs_healthmon *hm = file->private_data;
|
|
struct xfs_healthmon_event *event;
|
|
size_t copied = 0;
|
|
ssize_t ret = 0;
|
|
|
|
if (file->f_flags & O_NONBLOCK) {
|
|
if (!xfs_healthmon_has_eventdata(hm) || !inode_trylock(inode))
|
|
return -EAGAIN;
|
|
} else {
|
|
ret = wait_event_interruptible(hm->wait,
|
|
xfs_healthmon_has_eventdata(hm));
|
|
if (ret)
|
|
return ret;
|
|
|
|
inode_lock(inode);
|
|
}
|
|
|
|
if (hm->bufsize == 0) {
|
|
ret = xfs_healthmon_alloc_outbuf(hm, iov_iter_count(to));
|
|
if (ret)
|
|
goto out_unlock;
|
|
}
|
|
|
|
trace_xfs_healthmon_read_start(hm);
|
|
|
|
/*
|
|
* If there's anything left in the output buffer, copy that before
|
|
* formatting more events.
|
|
*/
|
|
ret = xfs_healthmon_copybuf(hm, to);
|
|
if (ret < 0)
|
|
goto out_unlock;
|
|
copied += ret;
|
|
|
|
while (iov_iter_count(to) > 0) {
|
|
/* Format the next events into the outbuf until it's full. */
|
|
while ((event = xfs_healthmon_format_pop(hm)) != NULL) {
|
|
ret = xfs_healthmon_format_v0(hm, event);
|
|
kfree(event);
|
|
if (ret)
|
|
goto out_unlock;
|
|
}
|
|
|
|
/* Copy anything formatted into outbuf to userspace */
|
|
ret = xfs_healthmon_copybuf(hm, to);
|
|
if (ret <= 0)
|
|
break;
|
|
|
|
copied += ret;
|
|
}
|
|
|
|
out_unlock:
|
|
trace_xfs_healthmon_read_finish(hm);
|
|
inode_unlock(inode);
|
|
return copied ?: ret;
|
|
}
|
|
|
|
/* Poll for available events. */
|
|
STATIC __poll_t
|
|
xfs_healthmon_poll(
|
|
struct file *file,
|
|
struct poll_table_struct *wait)
|
|
{
|
|
struct xfs_healthmon *hm = file->private_data;
|
|
__poll_t mask = 0;
|
|
|
|
poll_wait(file, &hm->wait, wait);
|
|
|
|
if (xfs_healthmon_has_eventdata(hm))
|
|
mask |= EPOLLIN;
|
|
return mask;
|
|
}
|
|
|
|
/* Free the health monitoring information. */
|
|
STATIC int
|
|
xfs_healthmon_release(
|
|
struct inode *inode,
|
|
struct file *file)
|
|
{
|
|
struct xfs_healthmon *hm = file->private_data;
|
|
|
|
trace_xfs_healthmon_release(hm);
|
|
|
|
/*
|
|
* We might be closing the healthmon file before the filesystem
|
|
* unmounts, because userspace processes can terminate at any time and
|
|
* for any reason. Null out xfs_mount::m_healthmon so that another
|
|
* process can create another health monitor file.
|
|
*/
|
|
xfs_healthmon_detach(hm);
|
|
|
|
/*
|
|
* Wake up any readers that might be left. There shouldn't be any
|
|
* because the only users of the waiter are read and poll.
|
|
*/
|
|
wake_up_all(&hm->wait);
|
|
|
|
xfs_healthmon_put(hm);
|
|
return 0;
|
|
}
|
|
|
|
/* Validate ioctl parameters. */
|
|
static inline bool
|
|
xfs_healthmon_validate(
|
|
const struct xfs_health_monitor *hmo)
|
|
{
|
|
if (hmo->flags & ~XFS_HEALTH_MONITOR_ALL)
|
|
return false;
|
|
if (hmo->format != XFS_HEALTH_MONITOR_FMT_V0)
|
|
return false;
|
|
if (memchr_inv(&hmo->pad, 0, sizeof(hmo->pad)))
|
|
return false;
|
|
return true;
|
|
}
|
|
|
|
/* Emit some data about the health monitoring fd. */
|
|
static void
|
|
xfs_healthmon_show_fdinfo(
|
|
struct seq_file *m,
|
|
struct file *file)
|
|
{
|
|
struct xfs_healthmon *hm = file->private_data;
|
|
|
|
mutex_lock(&hm->lock);
|
|
seq_printf(m, "state:\t%s\ndev:\t%d:%d\nformat:\tv0\nevents:\t%llu\nlost:\t%llu\n",
|
|
hm->mount_cookie == DETACHED_MOUNT_COOKIE ?
|
|
"dead" : "alive",
|
|
MAJOR(hm->dev), MINOR(hm->dev),
|
|
hm->total_events,
|
|
hm->total_lost);
|
|
mutex_unlock(&hm->lock);
|
|
}
|
|
|
|
static const struct file_operations xfs_healthmon_fops = {
|
|
.owner = THIS_MODULE,
|
|
.show_fdinfo = xfs_healthmon_show_fdinfo,
|
|
.read_iter = xfs_healthmon_read_iter,
|
|
.poll = xfs_healthmon_poll,
|
|
.release = xfs_healthmon_release,
|
|
};
|
|
|
|
/*
|
|
* Create a health monitoring file. Returns an index to the fd table or a
|
|
* negative errno.
|
|
*/
|
|
long
|
|
xfs_ioc_health_monitor(
|
|
struct file *file,
|
|
struct xfs_health_monitor __user *arg)
|
|
{
|
|
struct xfs_health_monitor hmo;
|
|
struct xfs_healthmon_event *running_event;
|
|
struct xfs_healthmon *hm;
|
|
struct xfs_inode *ip = XFS_I(file_inode(file));
|
|
struct xfs_mount *mp = ip->i_mount;
|
|
int ret;
|
|
|
|
/*
|
|
* The only intended user of the health monitoring system should be the
|
|
* xfs_healer daemon running on behalf of the whole filesystem in the
|
|
* initial user namespace. IOWs, we don't allow unprivileged userspace
|
|
* (they can use fsnotify) nor do we allow containers.
|
|
*/
|
|
if (!capable(CAP_SYS_ADMIN))
|
|
return -EPERM;
|
|
if (ip->i_ino != mp->m_sb.sb_rootino)
|
|
return -EPERM;
|
|
if (current_user_ns() != &init_user_ns)
|
|
return -EPERM;
|
|
|
|
if (copy_from_user(&hmo, arg, sizeof(hmo)))
|
|
return -EFAULT;
|
|
|
|
if (!xfs_healthmon_validate(&hmo))
|
|
return -EINVAL;
|
|
|
|
hm = kzalloc(sizeof(*hm), GFP_KERNEL);
|
|
if (!hm)
|
|
return -ENOMEM;
|
|
hm->dev = mp->m_super->s_dev;
|
|
refcount_set(&hm->ref, 1);
|
|
|
|
mutex_init(&hm->lock);
|
|
init_waitqueue_head(&hm->wait);
|
|
|
|
if (hmo.flags & XFS_HEALTH_MONITOR_VERBOSE)
|
|
hm->verbose = true;
|
|
|
|
/* Queue up the first event that lets the client know we're running. */
|
|
running_event = kzalloc(sizeof(struct xfs_healthmon_event), GFP_NOFS);
|
|
if (!running_event) {
|
|
ret = -ENOMEM;
|
|
goto out_hm;
|
|
}
|
|
running_event->type = XFS_HEALTHMON_RUNNING;
|
|
running_event->domain = XFS_HEALTHMON_MOUNT;
|
|
__xfs_healthmon_insert(hm, running_event);
|
|
|
|
/*
|
|
* Preallocate the unmount event so that we can't fail to notify the
|
|
* filesystem later. This is key for triggering fast exit of the
|
|
* xfs_healer daemon.
|
|
*/
|
|
hm->unmount_event = kzalloc(sizeof(struct xfs_healthmon_event),
|
|
GFP_NOFS);
|
|
if (!hm->unmount_event) {
|
|
ret = -ENOMEM;
|
|
goto out_hm;
|
|
}
|
|
hm->unmount_event->type = XFS_HEALTHMON_UNMOUNT;
|
|
hm->unmount_event->domain = XFS_HEALTHMON_MOUNT;
|
|
|
|
/*
|
|
* Try to attach this health monitor to the xfs_mount. The monitor is
|
|
* considered live and will receive events if this succeeds.
|
|
*/
|
|
ret = xfs_healthmon_attach(mp, hm);
|
|
if (ret)
|
|
goto out_hm;
|
|
|
|
/*
|
|
* Create the anonymous file and install a fd for it. If it succeeds,
|
|
* the file owns hm and can go away at any time, so we must not access
|
|
* it again. This must go last because we can't undo a fd table
|
|
* installation.
|
|
*/
|
|
ret = anon_inode_getfd("xfs_healthmon", &xfs_healthmon_fops, hm,
|
|
O_CLOEXEC | O_RDONLY);
|
|
if (ret < 0)
|
|
goto out_mp;
|
|
|
|
trace_xfs_healthmon_create(mp->m_super->s_dev, hmo.flags, hmo.format);
|
|
|
|
return ret;
|
|
|
|
out_mp:
|
|
xfs_healthmon_detach(hm);
|
|
out_hm:
|
|
ASSERT(refcount_read(&hm->ref) == 1);
|
|
xfs_healthmon_put(hm);
|
|
return ret;
|
|
}
|