Skip to content

Commit dff6bc5

Browse files
committed
netstacklat: Exclude TCP reads for HOL blocked segments
The 'tcp-socket-read' hook currently reports the latency for the skb containing the last TCP segment read from the socket. However, this segment might have been head of line (HOL) blocked by a previous missing segment. In this case, netstacklat's reported latency will include HOL blocking periods that are dependent on external factors (such as network packet loss and network latency, which impact retransmission time). As netstacklat is primarily intended to identify issues within the local host (in the network stack or receiving applications), by default filter out any socket reads where the last read SKB might have experienced HOL-blocking. Add the new -y/--include-tcp-hol-delay option to retain the old behavior of reporting latency for all reads, including those that are HOL-blocked. This may be useful in some scenarios when you still want to be aware of latency issues caused by HOL-blocking, even though they are caused by external components. For example, in a data center context where you have full control over the network, it may still be relevant to monitor HOL-blocking caused by the network. To exclude HOL-blocked reads, detect if any new ooo-segments have arrived by checking for differences in the number of ooo-packets in tcp_sock->rcv_ooopack. If any new ooo-segments have arrived, exclude the latency sample from the current read and set a limit for the next safe sequence number to read, where the current ooo-packets must have been passed so segments can no longer be HOL-blocked. If there are skbs in the ooo-queue, set the limit to the end of the ooo-queue. Otherwise, set the limit to the current rcv_nxt (as if the ooo-queue is empty the detected ooo-segments must already have been merged into the receive queue and rcv_nxt must have advanced past them). If the read is past the safe sequence limit and no new ooo-segments have arrived, it's safe to start including the latency samples again. 
For sockets where some ooo-segments have been observed, keep the ooo-range state in socket storage (BPF_MAP_TYPE_SK_STORAGE). Skip protecting this state with a spin-lock, as it should only be concurrently accessed if there are concurrent reads on the same TCP socket, which is assumed to be very rare as applications attempting that cannot know which part of the data each of their concurrent reads will get. There are some scenarios that may cause this ooo-filtering to fail. - If multiple reads are done to the socket concurrently, we may not correctly track the last read byte. The kernel does not keep a lock on the TCP socket at the time our hooked function tcp_recv_timestamp() runs. If two reads are done in parallel, it's therefore possible that for both reads we will check the last read byte (tcp_sock.copied_seq) after the second read has updated it. We may then incorrectly conclude that the first read was ahead of the ooo-range when it was not, and record its latency when we should have excluded it. In practice I believe this issue should be quite rare, as most applications will probably not attempt to perform multiple concurrent reads to a single connected TCP socket in parallel (as then you cannot know which part of the payload the parallel reads will return). - As tcp_recv_timestamp() runs outside of the socket lock, the various state members we access may concurrently be updated as we're attempting to read them. An especially problematic one is tcp_sock.ooo_last_skb, which keeps a pointer to an SKB that is only valid while the ooo-queue is non-empty. It is possible that between our check for if the ooo-queue is non-empty and following the ooo_last_skb pointer, the ooo-queue is cleared and the ooo_last_skb pointer may end up pointing towards a freed SKB. If the socket members we access are updated before or while we read them, it can break the filtering in numerous ways, e.g. result in including samples that should have been excluded (due to e.g. 
copied_seq being updated before our read) or excluding a large amount of valid samples (due to e.g. setting a sequence limit based on garbage in a freed SKB). Signed-off-by: Simon Sundberg <[email protected]>
1 parent cc17c68 commit dff6bc5

File tree

5 files changed

+358
-12
lines changed

5 files changed

+358
-12
lines changed

headers/vmlinux/vmlinux_common.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,10 @@ struct list_head {
1313
struct list_head *prev;
1414
};
1515

16+
struct rb_root {
17+
struct rb_node *rb_node;
18+
};
19+
1620
struct rb_node {
1721
long unsigned int __rb_parent_color;
1822
struct rb_node *rb_right;

headers/vmlinux/vmlinux_net.h

Lines changed: 208 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,35 @@ struct sk_buff {
161161
struct skb_ext *extensions;
162162
};
163163

164+
struct tcp_skb_cb {
165+
__u32 seq;
166+
__u32 end_seq;
167+
union {
168+
struct {
169+
u16 tcp_gso_segs;
170+
u16 tcp_gso_size;
171+
};
172+
};
173+
__u8 tcp_flags;
174+
__u8 sacked;
175+
__u8 ip_dsfield;
176+
__u8 txstamp_ack : 1;
177+
__u8 eor : 1;
178+
__u8 has_rxtstamp : 1;
179+
__u8 unused : 5;
180+
__u32 ack_seq;
181+
union {
182+
struct {
183+
__u32 is_app_limited : 1;
184+
__u32 delivered_ce : 20;
185+
__u32 unused : 11;
186+
__u32 delivered;
187+
u64 first_tx_mstamp;
188+
u64 delivered_mstamp;
189+
} tx;
190+
};
191+
};
192+
164193
struct nf_conn {
165194
unsigned long status;
166195
};
@@ -202,4 +231,183 @@ struct sock {
202231
u32 sk_rx_dst_cookie;
203232
};
204233

234+
struct inet_sock {
235+
struct sock sk;
236+
};
237+
238+
struct inet_connection_sock {
239+
struct inet_sock icsk_inet;
240+
};
241+
242+
struct tcp_sock {
243+
struct inet_connection_sock inet_conn;
244+
__u8 __cacheline_group_begin__tcp_sock_read_tx[0];
245+
u32 max_window;
246+
u32 rcv_ssthresh;
247+
u32 reordering;
248+
u32 notsent_lowat;
249+
u16 gso_segs;
250+
struct sk_buff *lost_skb_hint;
251+
struct sk_buff *retransmit_skb_hint;
252+
__u8 __cacheline_group_end__tcp_sock_read_tx[0];
253+
__u8 __cacheline_group_begin__tcp_sock_read_txrx[0];
254+
u32 tsoffset;
255+
u32 snd_wnd;
256+
u32 mss_cache;
257+
u32 snd_cwnd;
258+
u32 prr_out;
259+
u32 lost_out;
260+
u32 sacked_out;
261+
u16 tcp_header_len;
262+
u8 scaling_ratio;
263+
u8 chrono_type: 2;
264+
u8 repair: 1;
265+
u8 tcp_usec_ts: 1;
266+
u8 is_sack_reneg: 1;
267+
u8 is_cwnd_limited: 1;
268+
__u8 __cacheline_group_end__tcp_sock_read_txrx[0];
269+
__u8 __cacheline_group_begin__tcp_sock_read_rx[0];
270+
u32 copied_seq;
271+
u32 rcv_tstamp;
272+
u32 snd_wl1;
273+
u32 tlp_high_seq;
274+
u32 rttvar_us;
275+
u32 retrans_out;
276+
u16 advmss;
277+
u16 urg_data;
278+
u32 lost;
279+
/* struct minmax rtt_min; */
280+
struct rb_root out_of_order_queue;
281+
u32 snd_ssthresh;
282+
u8 recvmsg_inq: 1;
283+
__u8 __cacheline_group_end__tcp_sock_read_rx[0];
284+
long: 0;
285+
__u8 __cacheline_group_begin__tcp_sock_write_tx[0];
286+
u32 segs_out;
287+
u32 data_segs_out;
288+
u64 bytes_sent;
289+
u32 snd_sml;
290+
u32 chrono_start;
291+
u32 chrono_stat[3];
292+
u32 write_seq;
293+
u32 pushed_seq;
294+
u32 lsndtime;
295+
u32 mdev_us;
296+
u32 rtt_seq;
297+
u64 tcp_wstamp_ns;
298+
struct list_head tsorted_sent_queue;
299+
struct sk_buff *highest_sack;
300+
u8 ecn_flags;
301+
__u8 __cacheline_group_end__tcp_sock_write_tx[0];
302+
__u8 __cacheline_group_begin__tcp_sock_write_txrx[0];
303+
__be32 pred_flags;
304+
u64 tcp_clock_cache;
305+
u64 tcp_mstamp;
306+
u32 rcv_nxt;
307+
u32 snd_nxt;
308+
u32 snd_una;
309+
u32 window_clamp;
310+
u32 srtt_us;
311+
u32 packets_out;
312+
u32 snd_up;
313+
u32 delivered;
314+
u32 delivered_ce;
315+
u32 app_limited;
316+
u32 rcv_wnd;
317+
/* struct tcp_options_received rx_opt; */
318+
u8 nonagle: 4;
319+
u8 rate_app_limited: 1;
320+
__u8 __cacheline_group_end__tcp_sock_write_txrx[0];
321+
long: 0;
322+
__u8 __cacheline_group_begin__tcp_sock_write_rx[0];
323+
u64 bytes_received;
324+
u32 segs_in;
325+
u32 data_segs_in;
326+
u32 rcv_wup;
327+
u32 max_packets_out;
328+
u32 cwnd_usage_seq;
329+
u32 rate_delivered;
330+
u32 rate_interval_us;
331+
u32 rcv_rtt_last_tsecr;
332+
u64 first_tx_mstamp;
333+
u64 delivered_mstamp;
334+
u64 bytes_acked;
335+
struct {
336+
u32 rtt_us;
337+
u32 seq;
338+
u64 time;
339+
} rcv_rtt_est;
340+
struct {
341+
u32 space;
342+
u32 seq;
343+
u64 time;
344+
} rcvq_space;
345+
__u8 __cacheline_group_end__tcp_sock_write_rx[0];
346+
u32 dsack_dups;
347+
u32 compressed_ack_rcv_nxt;
348+
struct list_head tsq_node;
349+
/* struct tcp_rack rack; */
350+
u8 compressed_ack;
351+
u8 dup_ack_counter: 2;
352+
u8 tlp_retrans: 1;
353+
u8 unused: 5;
354+
u8 thin_lto: 1;
355+
u8 fastopen_connect: 1;
356+
u8 fastopen_no_cookie: 1;
357+
u8 fastopen_client_fail: 2;
358+
u8 frto: 1;
359+
u8 repair_queue;
360+
u8 save_syn: 2;
361+
u8 syn_data: 1;
362+
u8 syn_fastopen: 1;
363+
u8 syn_fastopen_exp: 1;
364+
u8 syn_fastopen_ch: 1;
365+
u8 syn_data_acked: 1;
366+
u8 keepalive_probes;
367+
u32 tcp_tx_delay;
368+
u32 mdev_max_us;
369+
u32 reord_seen;
370+
u32 snd_cwnd_cnt;
371+
u32 snd_cwnd_clamp;
372+
u32 snd_cwnd_used;
373+
u32 snd_cwnd_stamp;
374+
u32 prior_cwnd;
375+
u32 prr_delivered;
376+
u32 last_oow_ack_time;
377+
/* struct hrtimer pacing_timer; */
378+
/* struct hrtimer compressed_ack_timer; */
379+
struct sk_buff *ooo_last_skb;
380+
/* struct tcp_sack_block duplicate_sack[1]; */
381+
/* struct tcp_sack_block selective_acks[4]; */
382+
/* struct tcp_sack_block recv_sack_cache[4]; */
383+
int lost_cnt_hint;
384+
u32 prior_ssthresh;
385+
u32 high_seq;
386+
u32 retrans_stamp;
387+
u32 undo_marker;
388+
int undo_retrans;
389+
u64 bytes_retrans;
390+
u32 total_retrans;
391+
u32 rto_stamp;
392+
u16 total_rto;
393+
u16 total_rto_recoveries;
394+
u32 total_rto_time;
395+
u32 urg_seq;
396+
unsigned int keepalive_time;
397+
unsigned int keepalive_intvl;
398+
int linger2;
399+
u8 bpf_sock_ops_cb_flags;
400+
u8 bpf_chg_cc_inprogress: 1;
401+
u16 timeout_rehash;
402+
u32 rcv_ooopack;
403+
struct {
404+
u32 probe_seq_start;
405+
u32 probe_seq_end;
406+
} mtu_probe;
407+
u32 plb_rehash;
408+
u32 mtu_info;
409+
bool is_mptcp;
410+
};
411+
412+
205413
#endif /* __VMLINUX_NET_H__ */

0 commit comments

Comments
 (0)