net: track pfmemalloc drops via SKB_DROP_REASON_PFMEMALLOC

Add a new SKB drop reason (SKB_DROP_REASON_PFMEMALLOC) to track packets
dropped due to memory pressure. In production environments, we have
observed memory exhaustion via stack traces from the memory layer, but the
resulting packet drops were not tracked by the SKB drop reason
infrastructure.

While most network code paths now properly report pfmemalloc drops, some
protocol-specific socket implementations still use sk_filter() without
drop reason tracking:
- Bluetooth L2CAP sockets
- CAIF sockets
- IUCV sockets
- Netlink sockets
- SCTP sockets
- Unix domain sockets

These remaining cases are less common paths and can be converted in a
follow-up patch if needed; a sketch of such a conversion follows below.
Even without them, this change significantly improves observability into
memory pressure drops in the network stack, especially for key protocols
like TCP and UDP, helping diagnose problems in production.
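
For illustration, converting one of the call sites listed above would follow
the same pattern this patch applies to TCP and UDP. A minimal sketch (the
function below is hypothetical, not one of the actual call sites;
sk_filter_reason(), sk_skb_reason_drop() and the drop reason values are the
real symbols this series uses):

	/* Hypothetical caller: report why the filter dropped the skb
	 * instead of discarding the reason.
	 */
	static int proto_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
	{
		enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;

		if (sk_filter_reason(sk, skb, &reason)) {
			/* reason now distinguishes PFMEMALLOC drops from
			 * SOCKET_FILTER and security-hook rejections.
			 */
			sk_skb_reason_drop(sk, skb, reason);
			return NET_RX_DROP;
		}
		return __sock_queue_rcv_skb(sk, skb);
	}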

Reported-by: Matt Fleming <mfleming@cloudflare.com>
Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>
Link: https://patch.msgid.link/175268316579.2407873.11634752355644843509.stgit@firesoul
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
12 files changed, 75 insertions(+), 44 deletions(-)

drivers/net/tun.c
@@ -1002,8 +1002,8 @@ static unsigned int run_ebpf_filter(struct tun_struct *tun,
 /* Net device start xmit */
 static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
 {
+	enum skb_drop_reason drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
 	struct tun_struct *tun = netdev_priv(dev);
-	enum skb_drop_reason drop_reason;
 	int txq = skb->queue_mapping;
 	struct netdev_queue *queue;
 	struct tun_file *tfile;
@@ -1032,10 +1032,8 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
 	}
 
 	if (tfile->socket.sk->sk_filter &&
-	    sk_filter(tfile->socket.sk, skb)) {
-		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
+	    sk_filter_reason(tfile->socket.sk, skb, &drop_reason))
 		goto drop;
-	}
 
 	len = run_ebpf_filter(tun, skb, len);
 	if (len == 0) {

include/linux/filter.h
@@ -1073,10 +1073,20 @@ bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr)
 	return set_memory_rox((unsigned long)hdr, hdr->size >> PAGE_SHIFT);
 }
 
-int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap);
+int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap,
+		       enum skb_drop_reason *reason);
+
 static inline int sk_filter(struct sock *sk, struct sk_buff *skb)
 {
-	return sk_filter_trim_cap(sk, skb, 1);
+	enum skb_drop_reason ignore_reason;
+
+	return sk_filter_trim_cap(sk, skb, 1, &ignore_reason);
+}
+
+static inline int sk_filter_reason(struct sock *sk, struct sk_buff *skb,
+				   enum skb_drop_reason *reason)
+{
+	return sk_filter_trim_cap(sk, skb, 1, reason);
 }
 
 struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err);

include/net/dropreason-core.h
@@ -125,6 +125,7 @@
 	FN(CAN_RX_INVALID_FRAME)	\
 	FN(CANFD_RX_INVALID_FRAME)	\
 	FN(CANXL_RX_INVALID_FRAME)	\
+	FN(PFMEMALLOC)			\
 	FNe(MAX)
 
 /**
@@ -598,6 +599,11 @@ enum skb_drop_reason {
 	 * non conform CAN-XL frame (or device is unable to receive CAN frames)
 	 */
 	SKB_DROP_REASON_CANXL_RX_INVALID_FRAME,
+	/**
+	 * @SKB_DROP_REASON_PFMEMALLOC: packet allocated from memory reserve
+	 * reached a path or socket not eligible for use of memory reserves
+	 */
+	SKB_DROP_REASON_PFMEMALLOC,
 	/**
 	 * @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which
 	 * shouldn't be used as a real 'reason' - only for tracing code gen
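
The FN()/FNe() list above is expanded twice: once into enum skb_drop_reason
and once into the string table that names each reason in skb:kfree_skb
tracepoint output. A simplified sketch of the second expansion (the real
table is built in net/core/skbuff.c and differs in detail):

	#define FN(reason) [SKB_DROP_REASON_##reason] = #reason,
	static const char * const drop_reasons[] = {
		DEFINE_DROP_REASON(FN, FN)
	};
	#undef FN

	/* drop_reasons[SKB_DROP_REASON_PFMEMALLOC] == "PFMEMALLOC", so the
	 * new reason appears by name in tracepoint and drop monitor output.
	 */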

include/net/tcp.h
@@ -1559,7 +1559,7 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
 		     enum skb_drop_reason *reason);
 
-int tcp_filter(struct sock *sk, struct sk_buff *skb);
+int tcp_filter(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason);
 void tcp_set_state(struct sock *sk, int state);
 void tcp_done(struct sock *sk);
 int tcp_abort(struct sock *sk, int err);

net/core/dev.c
@@ -5749,6 +5749,7 @@ static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
 static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
 				    struct packet_type **ppt_prev)
 {
+	enum skb_drop_reason drop_reason = SKB_DROP_REASON_UNHANDLED_PROTO;
 	struct packet_type *ptype, *pt_prev;
 	rx_handler_func_t *rx_handler;
 	struct sk_buff *skb = *pskb;
@@ -5840,8 +5841,10 @@ skip_taps:
 #endif
 	skb_reset_redirect(skb);
 skip_classify:
-	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
+	if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) {
+		drop_reason = SKB_DROP_REASON_PFMEMALLOC;
 		goto drop;
+	}
 
 	if (skb_vlan_tag_present(skb)) {
 		if (pt_prev) {
@@ -5946,7 +5949,8 @@ drop:
 		dev_core_stats_rx_dropped_inc(skb->dev);
 	else
 		dev_core_stats_rx_nohandler_inc(skb->dev);
-	kfree_skb_reason(skb, SKB_DROP_REASON_UNHANDLED_PROTO);
+
+	kfree_skb_reason(skb, drop_reason);
 	/* Jamal, now you will not able to escape explaining
 	 * me how you were going to use this. :-)
 	 */

net/core/filter.c
@@ -122,6 +122,7 @@ EXPORT_SYMBOL_GPL(copy_bpf_fprog_from_user);
  *	@sk: sock associated with &sk_buff
  *	@skb: buffer to filter
  *	@cap: limit on how short the eBPF program may trim the packet
+ *	@reason: record drop reason on errors (negative return value)
  *
  * Run the eBPF program and then cut skb->data to correct size returned by
  * the program. If pkt_len is 0 we toss packet. If skb->len is smaller
@@ -130,7 +131,8 @@ EXPORT_SYMBOL_GPL(copy_bpf_fprog_from_user);
  * be accepted or -EPERM if the packet should be tossed.
  *
  */
-int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
+int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb,
+		       unsigned int cap, enum skb_drop_reason *reason)
 {
 	int err;
 	struct sk_filter *filter;
@@ -142,15 +144,20 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
 	 */
 	if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) {
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP);
+		*reason = SKB_DROP_REASON_PFMEMALLOC;
 		return -ENOMEM;
 	}
 	err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb);
-	if (err)
+	if (err) {
+		*reason = SKB_DROP_REASON_SOCKET_FILTER;
 		return err;
+	}
 
 	err = security_sock_rcv_skb(sk, skb);
-	if (err)
+	if (err) {
+		*reason = SKB_DROP_REASON_SECURITY_HOOK;
 		return err;
+	}
 
 	rcu_read_lock();
 	filter = rcu_dereference(sk->sk_filter);
@@ -162,6 +169,8 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
 		pkt_len = bpf_prog_run_save_cb(filter->prog, skb);
 		skb->sk = save_sk;
 		err = pkt_len ? pskb_trim(skb, max(cap, pkt_len)) : -EPERM;
+		if (err)
+			*reason = SKB_DROP_REASON_SOCKET_FILTER;
 	}
 	rcu_read_unlock();
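
Note that *reason is written only on the failure paths above; on success it
is left untouched. Callers therefore initialize their local variable, or
read it only when the return value is non-zero, e.g. (illustrative, not
part of this diff):

	enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;

	if (sk_filter_trim_cap(sk, skb, 1, &reason)) {
		/* reason is meaningful only on this error path */
		sk_skb_reason_drop(sk, skb, reason);
		return NET_RX_DROP;
	}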

net/core/sock.c
@@ -526,11 +526,10 @@ int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
 	enum skb_drop_reason drop_reason;
 	int err;
 
-	err = sk_filter(sk, skb);
-	if (err) {
-		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
+	err = sk_filter_reason(sk, skb, &drop_reason);
+	if (err)
 		goto out;
-	}
+
 	err = __sock_queue_rcv_skb(sk, skb);
 	switch (err) {
 	case -ENOMEM:
@@ -553,15 +552,18 @@ EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
 		     const int nested, unsigned int trim_cap, bool refcounted)
 {
+	enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
 	int rc = NET_RX_SUCCESS;
+	int err;
 
-	if (sk_filter_trim_cap(sk, skb, trim_cap))
+	if (sk_filter_trim_cap(sk, skb, trim_cap, &reason))
 		goto discard_and_relse;
 
 	skb->dev = NULL;
 
 	if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) {
 		atomic_inc(&sk->sk_drops);
+		reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
 		goto discard_and_relse;
 	}
 	if (nested)
@@ -577,8 +579,12 @@ int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
 		rc = sk_backlog_rcv(sk, skb);
 
 		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
-	} else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
+	} else if ((err = sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf)))) {
 		bh_unlock_sock(sk);
+		if (err == -ENOMEM)
+			reason = SKB_DROP_REASON_PFMEMALLOC;
+		if (err == -ENOBUFS)
+			reason = SKB_DROP_REASON_SOCKET_BACKLOG;
 		atomic_inc(&sk->sk_drops);
 		goto discard_and_relse;
 	}
@@ -589,7 +595,7 @@ out:
 	sock_put(sk);
 	return rc;
 discard_and_relse:
-	kfree_skb(skb);
+	sk_skb_reason_drop(sk, skb, reason);
 	goto out;
 }
 EXPORT_SYMBOL(__sk_receive_skb);
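
The -ENOMEM vs. -ENOBUFS distinction above mirrors the two failure modes of
sk_add_backlog(), paraphrased here from include/net/sock.h (simplified;
details may vary across kernel versions):

	static inline __must_check int sk_add_backlog(struct sock *sk, struct sk_buff *skb,
						      unsigned int limit)
	{
		if (sk_rcvqueues_full(sk, limit))
			return -ENOBUFS;	/* backlog full -> SOCKET_BACKLOG */

		/* pfmemalloc skb on a socket not entitled to memory
		 * reserves -> PFMEMALLOC
		 */
		if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC))
			return -ENOMEM;

		__sk_add_backlog(sk, skb);
		sk->sk_backlog.len += skb->truesize;
		return 0;
	}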

net/ipv4/tcp_ipv4.c
@@ -2026,6 +2026,7 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
 	u32 gso_size;
 	u64 limit;
 	int delta;
+	int err;
 
 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
 	 * we can fix skb->truesize to its real value to avoid future drops.
@@ -2136,21 +2137,27 @@ no_coalesce:
 	limit = min_t(u64, limit, UINT_MAX);
 
-	if (unlikely(sk_add_backlog(sk, skb, limit))) {
+	err = sk_add_backlog(sk, skb, limit);
+	if (unlikely(err)) {
 		bh_unlock_sock(sk);
-		*reason = SKB_DROP_REASON_SOCKET_BACKLOG;
-		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
+
+		if (err == -ENOMEM) {
+			*reason = SKB_DROP_REASON_PFMEMALLOC;
+			__NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP);
+		} else {
+			*reason = SKB_DROP_REASON_SOCKET_BACKLOG;
+			__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
+		}
 		return true;
 	}
 	return false;
 }
 EXPORT_IPV6_MOD(tcp_add_backlog);
 
-int tcp_filter(struct sock *sk, struct sk_buff *skb)
+int tcp_filter(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason)
 {
 	struct tcphdr *th = (struct tcphdr *)skb->data;
 
-	return sk_filter_trim_cap(sk, skb, th->doff * 4);
+	return sk_filter_trim_cap(sk, skb, th->doff * 4, reason);
 }
 EXPORT_IPV6_MOD(tcp_filter);
 
@@ -2277,14 +2284,12 @@ lookup:
 		}
 		refcounted = true;
 		nsk = NULL;
-		if (!tcp_filter(sk, skb)) {
+		if (!tcp_filter(sk, skb, &drop_reason)) {
 			th = (const struct tcphdr *)skb->data;
 			iph = ip_hdr(skb);
 			tcp_v4_fill_cb(skb, iph, th);
 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen,
 					    &drop_reason);
-		} else {
-			drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
 		}
 		if (!nsk) {
 			reqsk_put(req);
@@ -2340,10 +2345,9 @@ process:
 
 	nf_reset_ct(skb);
 
-	if (tcp_filter(sk, skb)) {
-		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
+	if (tcp_filter(sk, skb, &drop_reason))
 		goto discard_and_relse;
-	}
+
 	th = (const struct tcphdr *)skb->data;
 	iph = ip_hdr(skb);
 	tcp_v4_fill_cb(skb, iph, th);
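
With this hunk, a TCP backlog drop caused by memory pressure increments
PFMemallocDrop rather than TCPBacklogDrop, so the two conditions can be
told apart in /proc/net/netstat (TcpExt). Excerpt of the MIB name mapping,
paraphrased from net/ipv4/proc.c with neighboring entries omitted:

	SNMP_MIB_ITEM("PFMemallocDrop", LINUX_MIB_PFMEMALLOCDROP),
	SNMP_MIB_ITEM("TCPBacklogDrop", LINUX_MIB_TCPBACKLOGDROP),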

net/ipv4/udp.c
@@ -2347,7 +2347,7 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
  */
 static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
 {
-	int drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
+	enum skb_drop_reason drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
 	struct udp_sock *up = udp_sk(sk);
 	int is_udplite = IS_UDPLITE(sk);
 
@@ -2436,10 +2436,8 @@ static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
 	    udp_lib_checksum_complete(skb))
 		goto csum_error;
 
-	if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr))) {
-		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
+	if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr), &drop_reason))
 		goto drop;
-	}
 
 	udp_csum_pull_header(skb);

net/ipv6/tcp_ipv6.c
@@ -1834,14 +1834,12 @@ lookup:
 		}
 		refcounted = true;
 		nsk = NULL;
-		if (!tcp_filter(sk, skb)) {
+		if (!tcp_filter(sk, skb, &drop_reason)) {
 			th = (const struct tcphdr *)skb->data;
 			hdr = ipv6_hdr(skb);
 			tcp_v6_fill_cb(skb, hdr, th);
 			nsk = tcp_check_req(sk, skb, req, false, &req_stolen,
 					    &drop_reason);
-		} else {
-			drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
 		}
 		if (!nsk) {
 			reqsk_put(req);
@@ -1897,10 +1895,9 @@ process:
 
 	nf_reset_ct(skb);
 
-	if (tcp_filter(sk, skb)) {
-		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
+	if (tcp_filter(sk, skb, &drop_reason))
 		goto discard_and_relse;
-	}
+
 	th = (const struct tcphdr *)skb->data;
 	hdr = ipv6_hdr(skb);
 	tcp_v6_fill_cb(skb, hdr, th);

net/ipv6/udp.c
@@ -894,10 +894,8 @@ static int udpv6_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
 	    udp_lib_checksum_complete(skb))
 		goto csum_error;
 
-	if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr))) {
-		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
+	if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr), &drop_reason))
 		goto drop;
-	}
 
 	udp_csum_pull_header(skb);

net/rose/rose_in.c
@@ -101,6 +101,7 @@ static int rose_state2_machine(struct sock *sk, struct sk_buff *skb, int framety
  */
 static int rose_state3_machine(struct sock *sk, struct sk_buff *skb, int frametype, int ns, int nr, int q, int d, int m)
 {
+	enum skb_drop_reason dr; /* ignored */
 	struct rose_sock *rose = rose_sk(sk);
 	int queued = 0;
 
@@ -162,7 +163,7 @@ static int rose_state3_machine(struct sock *sk, struct sk_buff *skb, int framety
 		rose_frames_acked(sk, nr);
 		if (ns == rose->vr) {
 			rose_start_idletimer(sk);
-			if (sk_filter_trim_cap(sk, skb, ROSE_MIN_LEN) == 0 &&
+			if (!sk_filter_trim_cap(sk, skb, ROSE_MIN_LEN, &dr) &&
 			    __sock_queue_rcv_skb(sk, skb) == 0) {
 				rose->vr = (rose->vr + 1) % ROSE_MODULUS;
 				queued = 1;