
Commit 690d43d

Author: Alexei Starovoitov (committed)
Merge branch 'bpf-sockmap-fix-data-loss-and-panic-issues'
Jiayuan Chen says:

====================
bpf, sockmap: Fix data loss and panic issues

I was writing a benchmark based on sockmap + TCP and discovered several
issues:

1. When EAGAIN occurs, the direction of the skb is incorrect, causing
   data loss on retry.
2. When sending partial data, the offset is not recorded, so duplicate
   data is sent on retry.
3. An unexpected BUG_ON() in skb_linearize is triggered.
4. The memory of psock->ingress_skb is not limited by the socket buffer
   or memcg.

Issues 1, 2, and 3 are described in each patch's commit message. This
patchset does not cover issue 4, as it is difficult to handle in
practice, and I am still working on it. Here is a brief description of
the issue:

When using sockmap for skb/stream redirect, if the receiving end does
not perform read operations, all data is buffered in ingress_skb. For
example:

'''
// set memory limit to 5000M
cgcreate -g memory:myGroup
cgset -r memory.max="5000M" myGroup

// start benchmark and prevent the consumer from reading
cgexec -g "memory:myGroup" ./bench sockmap -c 2 -p 1 -a --rx-verdict-ingress --delay-consumer=-1 -d 100
Iter 0 ( 29.179us): Send Speed 2668.548 MB/s (20360.406 calls/s), ... Rcv Speed 0.000 MB/s ( 0.000 calls/s)
Iter 1 ( -7.237us): Send Speed 2694.467 MB/s (20557.149 calls/s), ... Rcv Speed 0.000 MB/s ( 0.000 calls/s)
Iter 2 ( -1.918us): Send Speed 2693.404 MB/s (20548.039 calls/s), ... Rcv Speed 0.000 MB/s ( 0.000 calls/s)
Iter 3 ( -0.684us): Send Speed 2693.138 MB/s (20548.014 calls/s), ... Rcv Speed 0.000 MB/s ( 0.000 calls/s)
Iter 4 (  7.879us): Send Speed 2698.620 MB/s (20588.838 calls/s), ... Rcv Speed 0.000 MB/s ( 0.000 calls/s)
Iter 5 ( -3.224us): Send Speed 2696.553 MB/s (20573.066 calls/s), ... Rcv Speed 0.000 MB/s ( 0.000 calls/s)
Iter 6 ( -5.409us): Send Speed 2699.705 MB/s (20597.111 calls/s), ... Rcv Speed 0.000 MB/s ( 0.000 calls/s)
Iter 7 ( -0.439us): Send Speed 2699.691 MB/s (20597.009 calls/s), ... Rcv Speed 0.000 MB/s ( 0.000 calls/s)
...

// memory usage is not limited
cat /proc/slabinfo | grep skb
skbuff_small_head   11824024 11824024 704 46 8 : tunables 0 0 0 : slabdata 257044 257044 0
skbuff_fclone_cache 11822080 11822080 512 32 4 : tunables 0 0 0 : slabdata 369440 369440 0
'''

Thus, a single socket in a large file upload/download workload can
consume all of the OS's memory. We must charge the skb memory to
psock->sk, and if we do not want to lose skbs, we also need to feed the
error back to read_sock/read_skb when the enqueue operation on
psock->ingress_skb fails.

---

Another stability-related patch of mine also needs maintainer review:
https://lore.kernel.org/bpf/[email protected]/T/#t
====================

Link: https://patch.msgid.link/[email protected]
Signed-off-by: Alexei Starovoitov <[email protected]>
2 parents 727d00a + 7b2fa44 · commit 690d43d

5 files changed (+697, -21 lines)

net/core/skmsg.c

Lines changed: 27 additions & 21 deletions
@@ -530,16 +530,22 @@ static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb,
 				       u32 off, u32 len,
 				       struct sk_psock *psock,
 				       struct sock *sk,
-				       struct sk_msg *msg)
+				       struct sk_msg *msg,
+				       bool take_ref)
 {
 	int num_sge, copied;
 
+	/* skb_to_sgvec will fail when the total number of fragments in
+	 * frag_list and frags exceeds MAX_MSG_FRAGS. For example, the
+	 * caller may aggregate multiple skbs.
+	 */
 	num_sge = skb_to_sgvec(skb, msg->sg.data, off, len);
 	if (num_sge < 0) {
 		/* skb linearize may fail with ENOMEM, but lets simply try again
 		 * later if this happens. Under memory pressure we don't want to
 		 * drop the skb. We need to linearize the skb so that the mapping
 		 * in skb_to_sgvec can not error.
+		 * Note that skb_linearize requires the skb not to be shared.
 		 */
 		if (skb_linearize(skb))
 			return -EAGAIN;
@@ -556,15 +562,15 @@ static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb,
 	msg->sg.start = 0;
 	msg->sg.size = copied;
 	msg->sg.end = num_sge;
-	msg->skb = skb;
+	msg->skb = take_ref ? skb_get(skb) : skb;
 
 	sk_psock_queue_msg(psock, msg);
 	sk_psock_data_ready(sk, psock);
 	return copied;
 }
 
 static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb,
-				     u32 off, u32 len);
+				     u32 off, u32 len, bool take_ref);
 
 static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb,
 				u32 off, u32 len)
@@ -578,7 +584,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb,
 	 * correctly.
 	 */
 	if (unlikely(skb->sk == sk))
-		return sk_psock_skb_ingress_self(psock, skb, off, len);
+		return sk_psock_skb_ingress_self(psock, skb, off, len, true);
 	msg = sk_psock_create_ingress_msg(sk, skb);
 	if (!msg)
 		return -EAGAIN;
@@ -590,7 +596,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb,
 	 * into user buffers.
 	 */
 	skb_set_owner_r(skb, sk);
-	err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg);
+	err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg, true);
 	if (err < 0)
 		kfree(msg);
 	return err;
@@ -601,7 +607,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb,
  * because the skb is already accounted for here.
  */
 static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb,
-				     u32 off, u32 len)
+				     u32 off, u32 len, bool take_ref)
 {
 	struct sk_msg *msg = alloc_sk_msg(GFP_ATOMIC);
 	struct sock *sk = psock->sk;
@@ -610,7 +616,7 @@ static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb
 	if (unlikely(!msg))
 		return -EAGAIN;
 	skb_set_owner_r(skb, sk);
-	err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg);
+	err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg, take_ref);
 	if (err < 0)
 		kfree(msg);
 	return err;
@@ -619,18 +625,13 @@ static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb
 static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
 			       u32 off, u32 len, bool ingress)
 {
-	int err = 0;
-
 	if (!ingress) {
 		if (!sock_writeable(psock->sk))
 			return -EAGAIN;
 		return skb_send_sock(psock->sk, skb, off, len);
 	}
-	skb_get(skb);
-	err = sk_psock_skb_ingress(psock, skb, off, len);
-	if (err < 0)
-		kfree_skb(skb);
-	return err;
+
+	return sk_psock_skb_ingress(psock, skb, off, len);
 }
 
 static void sk_psock_skb_state(struct sk_psock *psock,
@@ -656,11 +657,6 @@ static void sk_psock_backlog(struct work_struct *work)
 	int ret;
 
 	mutex_lock(&psock->work_mutex);
-	if (unlikely(state->len)) {
-		len = state->len;
-		off = state->off;
-	}
-
 	while ((skb = skb_peek(&psock->ingress_skb))) {
 		len = skb->len;
 		off = 0;
@@ -670,6 +666,13 @@ static void sk_psock_backlog(struct work_struct *work)
 			off = stm->offset;
 			len = stm->full_len;
 		}
+
+		/* Resume processing from previous partial state */
+		if (unlikely(state->len)) {
+			len = state->len;
+			off = state->off;
+		}
+
 		ingress = skb_bpf_ingress(skb);
 		skb_bpf_redirect_clear(skb);
 		do {
@@ -680,7 +683,8 @@ static void sk_psock_backlog(struct work_struct *work)
 			if (ret <= 0) {
 				if (ret == -EAGAIN) {
 					sk_psock_skb_state(psock, state, len, off);
-
+					/* Restore redir info we cleared before */
+					skb_bpf_set_redir(skb, psock->sk, ingress);
 					/* Delay slightly to prioritize any
 					 * other work that might be here.
 					 */
@@ -697,6 +701,8 @@ static void sk_psock_backlog(struct work_struct *work)
 			len -= ret;
 		} while (len);
 
+		/* The entire skb sent, clear state */
+		sk_psock_skb_state(psock, state, 0, 0);
 		skb = skb_dequeue(&psock->ingress_skb);
 		kfree_skb(skb);
 	}
@@ -1014,7 +1020,7 @@ static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb,
 			off = stm->offset;
 			len = stm->full_len;
 		}
-		err = sk_psock_skb_ingress_self(psock, skb, off, len);
+		err = sk_psock_skb_ingress_self(psock, skb, off, len, false);
 	}
 	if (err < 0) {
 		spin_lock_bh(&psock->ingress_lock);

tools/testing/selftests/bpf/Makefile

Lines changed: 2 additions & 0 deletions
@@ -811,6 +811,7 @@ $(OUTPUT)/bench_local_storage_create.o: $(OUTPUT)/bench_local_storage_create.ske
 $(OUTPUT)/bench_bpf_hashmap_lookup.o: $(OUTPUT)/bpf_hashmap_lookup.skel.h
 $(OUTPUT)/bench_htab_mem.o: $(OUTPUT)/htab_mem_bench.skel.h
 $(OUTPUT)/bench_bpf_crypto.o: $(OUTPUT)/crypto_bench.skel.h
+$(OUTPUT)/bench_sockmap.o: $(OUTPUT)/bench_sockmap_prog.skel.h
 $(OUTPUT)/bench.o: bench.h testing_helpers.h $(BPFOBJ)
 $(OUTPUT)/bench: LDLIBS += -lm
 $(OUTPUT)/bench: $(OUTPUT)/bench.o \
@@ -831,6 +832,7 @@ $(OUTPUT)/bench: $(OUTPUT)/bench.o \
 		 $(OUTPUT)/bench_local_storage_create.o \
 		 $(OUTPUT)/bench_htab_mem.o \
 		 $(OUTPUT)/bench_bpf_crypto.o \
+		 $(OUTPUT)/bench_sockmap.o \
 		 #
 	$(call msg,BINARY,,$@)
 	$(Q)$(CC) $(CFLAGS) $(LDFLAGS) $(filter %.a %.o,$^) $(LDLIBS) -o $@

tools/testing/selftests/bpf/bench.c

Lines changed: 4 additions & 0 deletions
@@ -283,6 +283,7 @@ extern struct argp bench_local_storage_create_argp;
 extern struct argp bench_htab_mem_argp;
 extern struct argp bench_trigger_batch_argp;
 extern struct argp bench_crypto_argp;
+extern struct argp bench_sockmap_argp;
 
 static const struct argp_child bench_parsers[] = {
 	{ &bench_ringbufs_argp, 0, "Ring buffers benchmark", 0 },
@@ -297,6 +298,7 @@ static const struct argp_child bench_parsers[] = {
 	{ &bench_htab_mem_argp, 0, "hash map memory benchmark", 0 },
 	{ &bench_trigger_batch_argp, 0, "BPF triggering benchmark", 0 },
 	{ &bench_crypto_argp, 0, "bpf crypto benchmark", 0 },
+	{ &bench_sockmap_argp, 0, "bpf sockmap benchmark", 0 },
 	{},
 };
 
@@ -549,6 +551,7 @@ extern const struct bench bench_local_storage_create;
 extern const struct bench bench_htab_mem;
 extern const struct bench bench_crypto_encrypt;
 extern const struct bench bench_crypto_decrypt;
+extern const struct bench bench_sockmap;
 
 static const struct bench *benchs[] = {
 	&bench_count_global,
@@ -609,6 +612,7 @@ static const struct bench *benchs[] = {
 	&bench_htab_mem,
 	&bench_crypto_encrypt,
 	&bench_crypto_decrypt,
+	&bench_sockmap,
 };
 
 static void find_benchmark(void)
