Skip to content

Commit 3d25fa2

Browse files
mrpre
authored and gregkh committed
bpf, sockmap: Fix panic when calling skb_linearize
[ Upstream commit 5ca2e29 ] The panic can be reproduced by executing the command: ./bench sockmap -c 2 -p 1 -a --rx-verdict-ingress --rx-strp 100000 Then a kernel panic was captured: ''' [ 657.460555] kernel BUG at net/core/skbuff.c:2178! [ 657.462680] Tainted: [W]=WARN [ 657.463287] Workqueue: events sk_psock_backlog ... [ 657.469610] <TASK> [ 657.469738] ? die+0x36/0x90 [ 657.469916] ? do_trap+0x1d0/0x270 [ 657.470118] ? pskb_expand_head+0x612/0xf40 [ 657.470376] ? pskb_expand_head+0x612/0xf40 [ 657.470620] ? do_error_trap+0xa3/0x170 [ 657.470846] ? pskb_expand_head+0x612/0xf40 [ 657.471092] ? handle_invalid_op+0x2c/0x40 [ 657.471335] ? pskb_expand_head+0x612/0xf40 [ 657.471579] ? exc_invalid_op+0x2d/0x40 [ 657.471805] ? asm_exc_invalid_op+0x1a/0x20 [ 657.472052] ? pskb_expand_head+0xd1/0xf40 [ 657.472292] ? pskb_expand_head+0x612/0xf40 [ 657.472540] ? lock_acquire+0x18f/0x4e0 [ 657.472766] ? find_held_lock+0x2d/0x110 [ 657.472999] ? __pfx_pskb_expand_head+0x10/0x10 [ 657.473263] ? __kmalloc_cache_noprof+0x5b/0x470 [ 657.473537] ? __pfx___lock_release.isra.0+0x10/0x10 [ 657.473826] __pskb_pull_tail+0xfd/0x1d20 [ 657.474062] ? __kasan_slab_alloc+0x4e/0x90 [ 657.474707] sk_psock_skb_ingress_enqueue+0x3bf/0x510 [ 657.475392] ? __kasan_kmalloc+0xaa/0xb0 [ 657.476010] sk_psock_backlog+0x5cf/0xd70 [ 657.476637] process_one_work+0x858/0x1a20 ''' The panic originates from the assertion BUG_ON(skb_shared(skb)) in skb_linearize(). A previous commit(see Fixes tag) introduced skb_get() to avoid race conditions between skb operations in the backlog and skb release in the recvmsg path. However, this caused the panic to always occur when skb_linearize is executed. The "--rx-strp 100000" parameter forces the RX path to use the strparser module which aggregates data until it reaches 100KB before calling sockmap logic. The 100KB payload exceeds MAX_MSG_FRAGS, triggering skb_linearize. To fix this issue, just move skb_get into sk_psock_skb_ingress_enqueue. 
'''
sk_psock_backlog:
    sk_psock_handle_skb
        skb_get(skb) <== we move it into 'sk_psock_skb_ingress_enqueue'
        sk_psock_skb_ingress ----------------.
                                             v
        sk_psock_skb_ingress_self --> sk_psock_skb_ingress_enqueue
sk_psock_verdict_apply -----------------^        skb_linearize
'''
Note that for the verdict_apply path, the skb_get operation is unnecessary, so we add a 'take_ref' param to control its behavior. Fixes: a454d84 ("bpf, sockmap: Fix skb refcnt race after locking changes") Signed-off-by: Jiayuan Chen <[email protected]> Link: https://lore.kernel.org/r/[email protected] Signed-off-by: Alexei Starovoitov <[email protected]> Signed-off-by: Sasha Levin <[email protected]>
1 parent 3a8e680 commit 3d25fa2

File tree

1 file changed

+16
-15
lines changed

1 file changed

+16
-15
lines changed

net/core/skmsg.c

Lines changed: 16 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -529,16 +529,22 @@ static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb,
529529
u32 off, u32 len,
530530
struct sk_psock *psock,
531531
struct sock *sk,
532-
struct sk_msg *msg)
532+
struct sk_msg *msg,
533+
bool take_ref)
533534
{
534535
int num_sge, copied;
535536

537+
/* skb_to_sgvec will fail when the total number of fragments in
538+
* frag_list and frags exceeds MAX_MSG_FRAGS. For example, the
539+
* caller may aggregate multiple skbs.
540+
*/
536541
num_sge = skb_to_sgvec(skb, msg->sg.data, off, len);
537542
if (num_sge < 0) {
538543
/* skb linearize may fail with ENOMEM, but lets simply try again
539544
* later if this happens. Under memory pressure we don't want to
540545
* drop the skb. We need to linearize the skb so that the mapping
541546
* in skb_to_sgvec can not error.
547+
* Note that skb_linearize requires the skb not to be shared.
542548
*/
543549
if (skb_linearize(skb))
544550
return -EAGAIN;
@@ -555,15 +561,15 @@ static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb,
555561
msg->sg.start = 0;
556562
msg->sg.size = copied;
557563
msg->sg.end = num_sge;
558-
msg->skb = skb;
564+
msg->skb = take_ref ? skb_get(skb) : skb;
559565

560566
sk_psock_queue_msg(psock, msg);
561567
sk_psock_data_ready(sk, psock);
562568
return copied;
563569
}
564570

565571
static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb,
566-
u32 off, u32 len);
572+
u32 off, u32 len, bool take_ref);
567573

568574
static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb,
569575
u32 off, u32 len)
@@ -577,7 +583,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb,
577583
* correctly.
578584
*/
579585
if (unlikely(skb->sk == sk))
580-
return sk_psock_skb_ingress_self(psock, skb, off, len);
586+
return sk_psock_skb_ingress_self(psock, skb, off, len, true);
581587
msg = sk_psock_create_ingress_msg(sk, skb);
582588
if (!msg)
583589
return -EAGAIN;
@@ -589,7 +595,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb,
589595
* into user buffers.
590596
*/
591597
skb_set_owner_r(skb, sk);
592-
err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg);
598+
err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg, true);
593599
if (err < 0)
594600
kfree(msg);
595601
return err;
@@ -600,7 +606,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb,
600606
* because the skb is already accounted for here.
601607
*/
602608
static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb,
603-
u32 off, u32 len)
609+
u32 off, u32 len, bool take_ref)
604610
{
605611
struct sk_msg *msg = alloc_sk_msg(GFP_ATOMIC);
606612
struct sock *sk = psock->sk;
@@ -609,7 +615,7 @@ static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb
609615
if (unlikely(!msg))
610616
return -EAGAIN;
611617
skb_set_owner_r(skb, sk);
612-
err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg);
618+
err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg, take_ref);
613619
if (err < 0)
614620
kfree(msg);
615621
return err;
@@ -618,18 +624,13 @@ static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb
618624
static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
619625
u32 off, u32 len, bool ingress)
620626
{
621-
int err = 0;
622-
623627
if (!ingress) {
624628
if (!sock_writeable(psock->sk))
625629
return -EAGAIN;
626630
return skb_send_sock(psock->sk, skb, off, len);
627631
}
628-
skb_get(skb);
629-
err = sk_psock_skb_ingress(psock, skb, off, len);
630-
if (err < 0)
631-
kfree_skb(skb);
632-
return err;
632+
633+
return sk_psock_skb_ingress(psock, skb, off, len);
633634
}
634635

635636
static void sk_psock_skb_state(struct sk_psock *psock,
@@ -1017,7 +1018,7 @@ static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb,
10171018
off = stm->offset;
10181019
len = stm->full_len;
10191020
}
1020-
err = sk_psock_skb_ingress_self(psock, skb, off, len);
1021+
err = sk_psock_skb_ingress_self(psock, skb, off, len, false);
10211022
}
10221023
if (err < 0) {
10231024
spin_lock_bh(&psock->ingress_lock);

0 commit comments

Comments
 (0)