Add netstacklat tool #125

Open · wants to merge 8 commits into main

1 change: 1 addition & 0 deletions netstacklat/.gitignore
@@ -0,0 +1 @@
netstacklat
14 changes: 14 additions & 0 deletions netstacklat/Makefile
@@ -0,0 +1,14 @@
# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)

USER_TARGETS := netstacklat
BPF_TARGETS := netstacklat.bpf
BPF_SKEL_OBJ := netstacklat.bpf.o

EXTRA_CFLAGS += -DBPF_EXAMPLES
EXTRA_DEPS += netstacklat.h bits.bpf.h maps.bpf.h
LDLIBS += -lm

LIB_DIR = ../lib

include $(LIB_DIR)/common.mk

36 changes: 36 additions & 0 deletions netstacklat/README.md
@@ -0,0 +1,36 @@
# Netstacklat - Monitor latency within the network stack
Netstacklat is a simple tool for monitoring latency within the Linux
network stack for ingress traffic. The tool relies on the kernel
timestamping received packets (`SOF_TIMESTAMPING_RX_SOFTWARE`),
specifically setting `sk_buff->tstamp`, and then reports when packets
arrive at various hooks relative to this timestamp, i.e. the latency
between the packet being timestamped and reaching each hook.
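
A minimal sketch of how software RX timestamping can be requested from
user space via the `SO_TIMESTAMPING` socket option (illustrative only,
and not necessarily how netstacklat itself enables it):

```c
#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/net_tstamp.h>

int main(void)
{
	int flags = SOF_TIMESTAMPING_RX_SOFTWARE | SOF_TIMESTAMPING_SOFTWARE;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0) {
		perror("socket");
		return 1;
	}

	/* Ask the kernel for software RX timestamps, which makes it set
	 * sk_buff->tstamp on received packets. */
	if (setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &flags, sizeof(flags)) < 0) {
		perror("setsockopt(SO_TIMESTAMPING)");
		close(fd);
		return 1;
	}

	pause(); /* keep the socket (and timestamping) alive while measuring */
	close(fd);
	return 0;
}
```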

The tool is based on the following bpftrace script from Jesper
Dangaard Brouer:
```console
sudo bpftrace -e '
kfunc:tcp_v4_do_rcv,
kfunc:tcp_data_queue,
kfunc:udp_queue_rcv_one_skb
{
$tai_offset=37000000000;
$now=nsecs(tai)-$tai_offset; @cnt[probe]=count(); @total[probe]=count();
$ts=args->skb->tstamp; $delta=$now-(uint64)$ts;
@hist_ns[probe]=hist($delta);
@stats[probe]=stats($delta);
//printf("now:%llu - ts:%llu = delta:%llu\n", $now, $ts, $delta);
}
interval:s:10 {time("\n%H:%M:%S\n");
print(@cnt); clear(@cnt);
print(@total);
print(@stats);
print(@hist_ns);
}'
```

The eBPF part of the tool (`netstacklat.bpf.c`) is designed to be
compatible with
[ebpf_exporter](https://github.com/cloudflare/ebpf_exporter), so that
the data can easily be exported to Prometheus.
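
Since the histograms are kept in per-CPU array maps, any consumer
(netstacklat's own loader or ebpf_exporter) needs to sum the per-CPU
values for each bucket. A minimal user-space sketch using libbpf (the
function name and the `map_fd`/`bucket` parameters are illustrative,
not part of this PR):

```c
#include <bpf/bpf.h>
#include <bpf/libbpf.h>

/* Sum one histogram bucket of a BPF_MAP_TYPE_PERCPU_ARRAY map across
 * all possible CPUs. Returns 0 on any error (sketch-level handling). */
static __u64 read_bucket(int map_fd, __u32 bucket)
{
	int i, ncpus = libbpf_num_possible_cpus();
	__u64 sum = 0;

	if (ncpus <= 0)
		return 0;

	__u64 values[ncpus]; /* lookup fills one value per possible CPU */

	if (bpf_map_lookup_elem(map_fd, &bucket, values))
		return 0;

	for (i = 0; i < ncpus; i++)
		sum += values[i];
	return sum;
}
```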
29 changes: 29 additions & 0 deletions netstacklat/bits.bpf.h
@@ -0,0 +1,29 @@
/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
/* From https://github.com/iovisor/bcc/blob/v0.25.0/libbpf-tools/bits.bpf.h */

#ifndef __BITS_BPF_H
#define __BITS_BPF_H

static __always_inline u64 log2(u32 v)
{
u32 shift, r;

r = (v > 0xFFFF) << 4; v >>= r;
shift = (v > 0xFF) << 3; v >>= shift; r |= shift;
shift = (v > 0xF) << 2; v >>= shift; r |= shift;
shift = (v > 0x3) << 1; v >>= shift; r |= shift;
r |= (v >> 1);

return r;
}

static __always_inline u64 log2l(u64 v)
{
u32 hi = v >> 32;
if (hi)
return log2(hi) + 32;
else
return log2(v);
}

#endif /* __BITS_BPF_H */
84 changes: 84 additions & 0 deletions netstacklat/maps.bpf.h
@@ -0,0 +1,84 @@
/* SPDX-License-Identifier: MIT */
/* From https://github.com/cloudflare/ebpf_exporter/blob/99d2752f9e0a095b57f53e5da6856ad143b0e443/examples/maps.bpf.h */
Member:

Okay, so a bunch of macros that don't properly delimit their contents, even going so far as having later macros directly refer to the expanded values of earlier ones? Yikes, no thanks! Please get rid of these and reimplement them using proper inline functions, or at least macros that are sane :)

Contributor Author:

Hmm, well, for just using it here in bpf-examples it would be easy to implement these as functions (not sure if I even need to mark them as inline; I generally leave that decision to the compiler).

However, if the goal is to be compatible with ebpf-exporter, maybe it makes more sense to try and fix this upstream (and then import the fixed version of these)? For that it will probably still have to be macros, as the type of the key is unknown; it's just assumed to be a struct that has a bucket member (which they simply access with .bucket). I'm not very experienced in writing anything but the most trivial macros, so I might have to bounce this back and forth a few times before I get it right if we go down this path.

It could of course still be handled with functions that accept a void pointer to the key, as long as we decide that the bucket member has to be the first member. However, ebpf-exporter currently seems to assume that the bucket is the last member of the key (I ran into some issues when I tried placing something after the bucket member), so that would likely require changes to some internal aspects of ebpf-exporter as well as updating all of their examples.
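
A rough sketch of that function-based alternative, assuming the bucket member is placed first in the key struct so it can be reached through a void pointer (names are illustrative, and the zero-init path is skipped by assuming a preallocated array map):

```c
/* Illustrative only: relies on log2l() from bits.bpf.h and the usual
 * <bpf/bpf_helpers.h> definitions; assumes bucket is the FIRST member
 * of the key struct. */
static __always_inline void
increment_exp2_histogram_fn(void *map, void *key, u64 increment,
			    u32 max_bucket)
{
	u32 *bucket = key; /* bucket assumed to be the first key member */
	u64 *count;

	/* Count the sample in its (clamped) log2 bucket */
	*bucket = log2l(increment);
	if (*bucket > max_bucket)
		*bucket = max_bucket;

	count = bpf_map_lookup_elem(map, key);
	if (count)
		__sync_fetch_and_add(count, 1);

	/* Keep a running sum in the extra bucket after max_bucket */
	if (increment > 0) {
		*bucket = max_bucket + 1;
		count = bpf_map_lookup_elem(map, key);
		if (count)
			__sync_fetch_and_add(count, increment);
	}
}
```

With the key layout used in this PR (`struct hist_key { u32 bucket; }`) the first-member assumption already holds.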


#include "bits.bpf.h"

#define lookup_or_zero_init_key(map, key, into) \
u64 zero = 0; \
\
into = bpf_map_lookup_elem(map, key); \
if (!into) { \
bpf_map_update_elem(map, key, &zero, BPF_NOEXIST); \
into = bpf_map_lookup_elem(map, key); \
if (!into) { \
return 0; \
} \
}

#define increment_variant(map, key, increment, variant) \
u64 *count; \
\
lookup_or_zero_init_key(map, key, count); \
\
variant; \
\
return *count;

static inline int increment_map(void *map, void *key, u64 increment)
{
increment_variant(map, key, increment, __sync_fetch_and_add(count, increment));
}

static inline int increment_map_nosync(void *map, void *key, u64 increment)
{
increment_variant(map, key, increment, *count += increment);
}

// Arrays are always preallocated, so this only fails if the key is missing
#define read_array_ptr(map, key, into) \
into = bpf_map_lookup_elem(map, key); \
if (!into) { \
return 0; \
}

#define _increment_histogram(map, key, increment, max_bucket, increment_fn) \
if (key.bucket > max_bucket) { \
key.bucket = max_bucket; \
} \
\
increment_fn(map, &key, 1); \
\
if (increment > 0) { \
key.bucket = max_bucket + 1; \
increment_fn(map, &key, increment); \
}

#define _increment_ex2_histogram(map, key, increment, max_bucket, increment_fn) \
key.bucket = log2l(increment); \
\
if (key.bucket > max_bucket) { \
key.bucket = max_bucket; \
} \
\
_increment_histogram(map, key, increment, max_bucket, increment_fn);

#define increment_exp2_histogram(map, key, increment, max_bucket) \
_increment_ex2_histogram(map, key, increment, max_bucket, increment_map)

#define increment_exp2_histogram_nosync(map, key, increment, max_bucket) \
_increment_ex2_histogram(map, key, increment, max_bucket, increment_map_nosync)

#define _increment_exp2zero_histogram(map, key, increment, max_bucket, increment_fn) \
if (increment == 0) { \
key.bucket = 0; \
} else { \
key.bucket = log2l(increment) + 1; \
} \
\
_increment_histogram(map, key, increment, max_bucket, increment_fn);

#define increment_exp2zero_histogram(map, key, increment, max_bucket) \
_increment_exp2zero_histogram(map, key, increment, max_bucket, increment_map)

#define increment_exp2zero_histogram_nosync(map, key, increment, max_bucket) \
_increment_exp2zero_histogram(map, key, increment, max_bucket, increment_map_nosync)
100 changes: 100 additions & 0 deletions netstacklat/netstacklat.bpf.c
@@ -0,0 +1,100 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifdef BPF_EXAMPLES
Member:

I think this is probably going a bit too far in the compatibility. Let's just drop this commit for now, and circle back to it once we have an idea for how things will look wrt bpf-exporter.

#include "vmlinux_local.h"
#include <linux/bpf.h>
#else
#include <vmlinux.h>
#endif

#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>

#include "netstacklat.h"
#include "maps.bpf.h"

char LICENSE[] SEC("license") = "GPL";

volatile const signed long long TAI_OFFSET = (37LL * NS_PER_S);

/* Helpers in maps.bpf.h require any histogram key to be a struct with a bucket member */
struct hist_key {
u32 bucket;
};

struct {
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
__uint(max_entries, HIST_NBINS);
__type(key, u32);
__type(value, u64);
} tcp_v4_do_rcv_hist SEC(".maps");

struct {
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
__uint(max_entries, HIST_NBINS);
__type(key, u32);
__type(value, u64);
} tcp_data_queue_hist SEC(".maps");

struct {
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
__uint(max_entries, HIST_NBINS);
__type(key, u32);
__type(value, u64);
} udp_queue_rcv_hist SEC(".maps");

static void *hook_to_histmap(enum netstacklat_hook hook)
{
switch (hook) {
case NETSTACKLAT_HOOK_TCP_V4_DO_RCV:
return &tcp_v4_do_rcv_hist;
case NETSTACKLAT_HOOK_TCP_DATA_QUEUE:
return &tcp_data_queue_hist;
case NETSTACKLAT_HOOK_UDP_QUEUE_RCV_ONE:
return &udp_queue_rcv_hist;
default:
return NULL;
}
}

static void record_current_netstacklat(struct sk_buff *skb,
enum netstacklat_hook hook)
{
ktime_t delta_ns, skb_tstamp;
struct hist_key key;

if (!skb)
return;

skb_tstamp = BPF_CORE_READ(skb, tstamp);
if (skb_tstamp == 0)
return;

delta_ns = bpf_ktime_get_tai_ns() - TAI_OFFSET - skb_tstamp;
if (delta_ns < 0)
return;

increment_exp2_histogram_nosync(hook_to_histmap(hook), key, delta_ns,
HIST_MAX_LATENCY_SLOT);
}

SEC("fentry/tcp_v4_do_rcv")
int BPF_PROG(netstacklat_tcp_v4_do_rcv, struct sock *sk, struct sk_buff *skb)
{
record_current_netstacklat(skb, NETSTACKLAT_HOOK_TCP_V4_DO_RCV);
return 0;
}

SEC("fentry/tcp_data_queue")
int BPF_PROG(netstacklat_tcp_data_queue, struct sock *sk, struct sk_buff *skb)
{
record_current_netstacklat(skb, NETSTACKLAT_HOOK_TCP_DATA_QUEUE);
return 0;
}

SEC("fentry/udp_queue_rcv_one_skb")
int BPF_PROG(netstacklat_udp_queue_rcv, struct sock *sk, struct sk_buff *skb)
{
record_current_netstacklat(skb, NETSTACKLAT_HOOK_UDP_QUEUE_RCV_ONE);
return 0;
}