Add netstacklat tool #125

Open · wants to merge 8 commits into main

1 change: 1 addition & 0 deletions netstacklat/.gitignore
@@ -0,0 +1 @@
netstacklat
14 changes: 14 additions & 0 deletions netstacklat/Makefile
@@ -0,0 +1,14 @@
# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)

USER_TARGETS := netstacklat
BPF_TARGETS := netstacklat.bpf
BPF_SKEL_OBJ := netstacklat.bpf.o

EXTRA_CFLAGS += -DBPF_EXAMPLES
EXTRA_DEPS += netstacklat.h bits.bpf.h maps.bpf.h
LDLIBS += -lm

LIB_DIR = ../lib

include $(LIB_DIR)/common.mk

36 changes: 36 additions & 0 deletions netstacklat/README.md
@@ -0,0 +1,36 @@
# Netstacklat - Monitor latency within the network stack
Netstacklat is a simple tool for monitoring latency within the Linux
network stack for ingress traffic. The tool relies on the kernel
timestamping received packets (`SOF_TIMESTAMPING_RX_SOFTWARE`),
specifically setting `sk_buff->tstamp`, and then reports when packets
arrive at various hooks relative to this timestamp, i.e. the latency
between the packet being timestamped and reaching each hook.
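
A minimal sketch of how software RX timestamping can be requested from
user space via the `SO_TIMESTAMPING` socket option (illustrative only,
and not necessarily how netstacklat itself enables it):

```c
#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/net_tstamp.h>

int main(void)
{
	int flags = SOF_TIMESTAMPING_RX_SOFTWARE | SOF_TIMESTAMPING_SOFTWARE;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0) {
		perror("socket");
		return 1;
	}

	/* Ask the kernel for software RX timestamps, which makes it set
	 * sk_buff->tstamp on received packets. */
	if (setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &flags, sizeof(flags)) < 0) {
		perror("setsockopt(SO_TIMESTAMPING)");
		close(fd);
		return 1;
	}

	pause(); /* keep the socket (and timestamping) alive while measuring */
	close(fd);
	return 0;
}
```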

The tool is based on the following bpftrace script from Jesper
Dangaard Brouer:
```console
sudo bpftrace -e '
kfunc:tcp_v4_do_rcv,
kfunc:tcp_data_queue,
kfunc:udp_queue_rcv_one_skb
{
$tai_offset=37000000000;
$now=nsecs(tai)-$tai_offset; @cnt[probe]=count(); @total[probe]=count();
$ts=args->skb->tstamp; $delta=$now-(uint64)$ts;
@hist_ns[probe]=hist($delta);
@stats[probe]=stats($delta);
//printf("now:%llu - ts:%llu = delta:%llu\n", $now, $ts, $delta);
}
interval:s:10 {time("\n%H:%M:%S\n");
print(@cnt); clear(@cnt);
print(@total);
print(@stats);
print(@hist_ns);
}'
```

The eBPF part of the tool (`netstacklat.bpf.c`) is designed to be
compatible with
[ebpf_exporter](https://github.com/cloudflare/ebpf_exporter), so that
the data can easily be exported to Prometheus.
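
Since the histograms are kept in per-CPU array maps, any consumer
(netstacklat's own loader or ebpf_exporter) needs to sum the per-CPU
values for each bucket. A minimal user-space sketch using libbpf (the
function name and the `map_fd`/`bucket` parameters are illustrative,
not part of this PR):

```c
#include <bpf/bpf.h>
#include <bpf/libbpf.h>

/* Sum one histogram bucket of a BPF_MAP_TYPE_PERCPU_ARRAY map across
 * all possible CPUs. Returns 0 on any error (sketch-level handling). */
static __u64 read_bucket(int map_fd, __u32 bucket)
{
	int i, ncpus = libbpf_num_possible_cpus();
	__u64 sum = 0;

	if (ncpus <= 0)
		return 0;

	__u64 values[ncpus]; /* lookup fills one value per possible CPU */

	if (bpf_map_lookup_elem(map_fd, &bucket, values))
		return 0;

	for (i = 0; i < ncpus; i++)
		sum += values[i];
	return sum;
}
```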
29 changes: 29 additions & 0 deletions netstacklat/bits.bpf.h
@@ -0,0 +1,29 @@
/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
/* From https://github.com/iovisor/bcc/blob/v0.25.0/libbpf-tools/bits.bpf.h */

#ifndef __BITS_BPF_H
#define __BITS_BPF_H

static __always_inline u64 log2(u32 v)
{
u32 shift, r;

r = (v > 0xFFFF) << 4; v >>= r;
shift = (v > 0xFF) << 3; v >>= shift; r |= shift;
shift = (v > 0xF) << 2; v >>= shift; r |= shift;
shift = (v > 0x3) << 1; v >>= shift; r |= shift;
r |= (v >> 1);

return r;
}

static __always_inline u64 log2l(u64 v)
{
u32 hi = v >> 32;
if (hi)
return log2(hi) + 32;
else
return log2(v);
}

#endif /* __BITS_BPF_H */
84 changes: 84 additions & 0 deletions netstacklat/maps.bpf.h
@@ -0,0 +1,84 @@
/* SPDX-License-Identifier: MIT */
/* From https://github.com/cloudflare/ebpf_exporter/blob/99d2752f9e0a095b57f53e5da6856ad143b0e443/examples/maps.bpf.h */
Member:

Okay, so a bunch of macros that don't properly delimit their contents, even going so far as having later macros directly refer to the expanded values of earlier ones? Yikes, no thanks! Please get rid of these and reimplement them using proper inline functions, or at least macros that are sane :)

Contributor Author:

Hmm, well, for just using it here in bpf-examples it would be easy to implement these as functions (not sure if I even need to mark them as inline; I generally leave that decision to the compiler).

However, if the goal is to be compatible with ebpf-exporter, maybe it makes more sense to try and fix this upstream (and then import the fixed version of these)? For that it will probably still have to be macros, as the type of the key is unknown; it's just assumed to be a struct that has a bucket member (which they simply access with .bucket). I'm not very experienced in writing anything but the most trivial macros, so I might have to bounce this back and forth a few times before I get it right if we go down this path.

It could of course still be handled with functions that accept a void pointer to the key, as long as we decide that the bucket member has to be the first member. However, ebpf-exporter currently seems to assume that the bucket is the last member of the key (I ran into some issues when I tried placing something after the bucket member), so that would likely require changes to some internal aspects of ebpf-exporter as well as updating all of their examples.
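
A rough sketch of that function-based alternative, assuming the bucket member is placed first in the key struct so it can be reached through a void pointer (names are illustrative, and the zero-init path is skipped by assuming a preallocated array map):

```c
/* Illustrative only: relies on log2l() from bits.bpf.h and the usual
 * <bpf/bpf_helpers.h> definitions; assumes bucket is the FIRST member
 * of the key struct. */
static __always_inline void
increment_exp2_histogram_fn(void *map, void *key, u64 increment,
			    u32 max_bucket)
{
	u32 *bucket = key; /* bucket assumed to be the first key member */
	u64 *count;

	/* Count the sample in its (clamped) log2 bucket */
	*bucket = log2l(increment);
	if (*bucket > max_bucket)
		*bucket = max_bucket;

	count = bpf_map_lookup_elem(map, key);
	if (count)
		__sync_fetch_and_add(count, 1);

	/* Keep a running sum in the extra bucket after max_bucket */
	if (increment > 0) {
		*bucket = max_bucket + 1;
		count = bpf_map_lookup_elem(map, key);
		if (count)
			__sync_fetch_and_add(count, increment);
	}
}
```

With the key layout used in this PR (`struct hist_key { u32 bucket; }`) the first-member assumption already holds.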


#include "bits.bpf.h"

#define lookup_or_zero_init_key(map, key, into) \
u64 zero = 0; \
\
into = bpf_map_lookup_elem(map, key); \
if (!into) { \
bpf_map_update_elem(map, key, &zero, BPF_NOEXIST); \
into = bpf_map_lookup_elem(map, key); \
if (!into) { \
return 0; \
} \
}

#define increment_variant(map, key, increment, variant) \
u64 *count; \
\
lookup_or_zero_init_key(map, key, count); \
\
variant; \
\
return *count;

static inline int increment_map(void *map, void *key, u64 increment)
{
increment_variant(map, key, increment, __sync_fetch_and_add(count, increment));
}

static inline int increment_map_nosync(void *map, void *key, u64 increment)
{
increment_variant(map, key, increment, *count += increment);
}

// Arrays are always preallocated, so this only fails if the key is missing
#define read_array_ptr(map, key, into) \
into = bpf_map_lookup_elem(map, key); \
if (!into) { \
return 0; \
}

#define _increment_histogram(map, key, increment, max_bucket, increment_fn) \
if (key.bucket > max_bucket) { \
key.bucket = max_bucket; \
} \
\
increment_fn(map, &key, 1); \
\
if (increment > 0) { \
key.bucket = max_bucket + 1; \
increment_fn(map, &key, increment); \
}

#define _increment_ex2_histogram(map, key, increment, max_bucket, increment_fn) \
key.bucket = log2l(increment); \
\
if (key.bucket > max_bucket) { \
key.bucket = max_bucket; \
} \
\
_increment_histogram(map, key, increment, max_bucket, increment_fn);

#define increment_exp2_histogram(map, key, increment, max_bucket) \
_increment_ex2_histogram(map, key, increment, max_bucket, increment_map)

#define increment_exp2_histogram_nosync(map, key, increment, max_bucket) \
_increment_ex2_histogram(map, key, increment, max_bucket, increment_map_nosync)

#define _increment_exp2zero_histogram(map, key, increment, max_bucket, increment_fn) \
if (increment == 0) { \
key.bucket = 0; \
} else { \
key.bucket = log2l(increment) + 1; \
} \
\
_increment_histogram(map, key, increment, max_bucket, increment_fn);

#define increment_exp2zero_histogram(map, key, increment, max_bucket) \
_increment_exp2zero_histogram(map, key, increment, max_bucket, increment_map)

#define increment_exp2zero_histogram_nosync(map, key, increment, max_bucket) \
_increment_exp2zero_histogram(map, key, increment, max_bucket, increment_map_nosync)
100 changes: 100 additions & 0 deletions netstacklat/netstacklat.bpf.c
@@ -0,0 +1,100 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifdef BPF_EXAMPLES
Member:

I think this is probably going a bit too far in the compatibility. Let's just drop this commit for now, and circle back to it once we have an idea for how things will look wrt bpf-exporter.

#include "vmlinux_local.h"
#include <linux/bpf.h>
#else
#include <vmlinux.h>
#endif

#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>

#include "netstacklat.h"
#include "maps.bpf.h"

char LICENSE[] SEC("license") = "GPL";

volatile const signed long long TAI_OFFSET = (37LL * NS_PER_S);

/* Helpers in maps.bpf.h require any histogram key to be a struct with a bucket member */
struct hist_key {
u32 bucket;
};

struct {
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
__uint(max_entries, HIST_NBINS);
__type(key, u32);
__type(value, u64);
} tcp_v4_do_rcv_hist SEC(".maps");

struct {
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
__uint(max_entries, HIST_NBINS);
__type(key, u32);
__type(value, u64);
} tcp_data_queue_hist SEC(".maps");

struct {
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
__uint(max_entries, HIST_NBINS);
__type(key, u32);
__type(value, u64);
} udp_queue_rcv_hist SEC(".maps");

static void *hook_to_histmap(enum netstacklat_hook hook)
{
switch (hook) {
case NETSTACKLAT_HOOK_TCP_V4_DO_RCV:
return &tcp_v4_do_rcv_hist;
case NETSTACKLAT_HOOK_TCP_DATA_QUEUE:
return &tcp_data_queue_hist;
case NETSTACKLAT_HOOK_UDP_QUEUE_RCV_ONE:
return &udp_queue_rcv_hist;
default:
return NULL;
}
}

static void record_current_netstacklat(struct sk_buff *skb,
enum netstacklat_hook hook)
{
ktime_t delta_ns, skb_tstamp;
struct hist_key key;

if (!skb)
return;

skb_tstamp = BPF_CORE_READ(skb, tstamp);
if (skb_tstamp == 0)
return;

delta_ns = bpf_ktime_get_tai_ns() - TAI_OFFSET - skb_tstamp;
if (delta_ns < 0)
return;

increment_exp2_histogram_nosync(hook_to_histmap(hook), key, delta_ns,
HIST_MAX_LATENCY_SLOT);
}

SEC("fentry/tcp_v4_do_rcv")
int BPF_PROG(netstacklat_tcp_v4_do_rcv, struct sock *sk, struct sk_buff *skb)
{
record_current_netstacklat(skb, NETSTACKLAT_HOOK_TCP_V4_DO_RCV);
return 0;
}

SEC("fentry/tcp_data_queue")
int BPF_PROG(netstacklat_tcp_data_queue, struct sock *sk, struct sk_buff *skb)
{
record_current_netstacklat(skb, NETSTACKLAT_HOOK_TCP_DATA_QUEUE);
return 0;
}

SEC("fentry/udp_queue_rcv_one_skb")
int BPF_PROG(netstacklat_udp_queue_rcv, struct sock *sk, struct sk_buff *skb)
{
record_current_netstacklat(skb, NETSTACKLAT_HOOK_UDP_QUEUE_RCV_ONE);
return 0;
}