Skip to content

Commit 2f9bf92

Browse files
committed
mctp: Add retry for one-time peer property queries on timeout
The function `query_peer_properties()` is called once during peer initialization to query basic information after the EID becomes routable. To improve reliability, this change adds a retry mechanism when the query fails with `-ETIMEDOUT`. Since these queries are one-time initialization steps, a single successful attempt is sufficient, and retrying enhances stability under transient MCTP bus contention or multi-master timing issues. Testing: add stress test for peer initialization under multi-master ``` while true; do echo "Restarting mctpd.service..." systemctl restart mctpd.service # Wait a few seconds to allow service to initialize sleep 20 done ``` After the 30 loops, the script checks mctpd.service journal for expected retry messages to verify robustness under transient MCTP bus contention. ``` root@bmc:~# journalctl -xeu mctpd.service | grep Retrying Oct 29 00:35:21 bmc mctpd[31801]: mctpd: Retrying to get endpoint types for peer eid 10 net 1 phys physaddr if 4 hw len 1 0x20 state 1. Attempt 1 Oct 29 00:39:00 bmc mctpd[32065]: mctpd: Retrying to get endpoint types for peer eid 10 net 1 phys physaddr if 4 hw len 1 0x20 state 1. Attempt 1 Oct 29 00:39:01 bmc mctpd[32065]: mctpd: Retrying to get endpoint types for peer eid 10 net 1 phys physaddr if 4 hw len 1 0x20 state 1. Attempt 2 Oct 29 00:45:08 bmc mctpd[32360]: mctpd: Retrying to get endpoint types for peer eid 10 net 1 phys physaddr if 4 hw len 1 0x20 state 1. Attempt 1 ``` Signed-off-by: Daniel Hsu <[email protected]>
1 parent 7ea8652 commit 2f9bf92

File tree

3 files changed

+102
-14
lines changed

3 files changed

+102
-14
lines changed

src/mctpd.c

Lines changed: 49 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2927,23 +2927,59 @@ static int method_learn_endpoint(sd_bus_message *call, void *data,
29272927
// and routable.
29282928
static int query_peer_properties(struct peer *peer)
29292929
{
2930+
const unsigned int max_retries = 4;
29302931
int rc;
29312932

2932-
rc = query_get_peer_msgtypes(peer);
2933-
if (rc < 0) {
2934-
// Warn here, it's a mandatory command code.
2935-
// It might be too noisy if some devices don't implement it.
2936-
warnx("Error getting endpoint types for %s. Ignoring error %d %s",
2937-
peer_tostr(peer), rc, strerror(-rc));
2938-
rc = 0;
2933+
for (unsigned int i = 0; i < max_retries; i++) {
2934+
rc = query_get_peer_msgtypes(peer);
2935+
2936+
// Success
2937+
if (rc == 0)
2938+
break;
2939+
2940+
// On timeout, retry
2941+
if (rc == -ETIMEDOUT) {
2942+
if (peer->ctx->verbose)
2943+
warnx("Retrying to get endpoint types for %s. Attempt %u",
2944+
peer_tostr(peer), i + 1);
2945+
rc = 0;
2946+
continue;
2947+
}
2948+
2949+
// On other errors, warn and ignore
2950+
if (rc < 0) {
2951+
if (peer->ctx->verbose)
2952+
warnx("Error getting endpoint types for %s. Ignoring error %d %s",
2953+
peer_tostr(peer), -rc, strerror(-rc));
2954+
rc = 0;
2955+
break;
2956+
}
29392957
}
29402958

2941-
rc = query_get_peer_uuid(peer);
2942-
if (rc < 0) {
2943-
if (peer->ctx->verbose)
2944-
warnx("Error getting UUID for %s. Ignoring error %d %s",
2945-
peer_tostr(peer), rc, strerror(-rc));
2946-
rc = 0;
2959+
for (unsigned int i = 0; i < max_retries; i++) {
2960+
rc = query_get_peer_uuid(peer);
2961+
2962+
// Success
2963+
if (rc == 0)
2964+
break;
2965+
2966+
// On timeout, retry
2967+
if (rc == -ETIMEDOUT) {
2968+
if (peer->ctx->verbose)
2969+
warnx("Retrying to get peer UUID for %s. Attempt %u",
2970+
peer_tostr(peer), i + 1);
2971+
rc = 0;
2972+
continue;
2973+
}
2974+
2975+
// On other errors, warn and ignore
2976+
if (rc < 0) {
2977+
if (peer->ctx->verbose)
2978+
warnx("Error getting UUID for %s. Ignoring error %d %s",
2979+
peer_tostr(peer), -rc, strerror(-rc));
2980+
rc = 0;
2981+
break;
2982+
}
29472983
}
29482984

29492985
// TODO: emit property changed? Though currently they are all const.

tests/mctpenv/__init__.py

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -318,14 +318,16 @@ def to_buf(self):
318318
return bytes([flags, self.cmd]) + self.data
319319

320320
class Endpoint:
321-
def __init__(self, iface, lladdr, ep_uuid = None, eid = 0, types = None):
321+
def __init__(self, iface, lladdr, ep_uuid = None, eid = 0, types = None, timeout_opcodes = set(), retry_count = 2):
322322
self.iface = iface
323323
self.lladdr = lladdr
324324
self.uuid = ep_uuid or uuid.uuid1()
325325
self.eid = eid
326326
self.types = types or [0]
327327
self.bridged_eps = []
328328
self.allocated_pool = None # or (start, size)
329+
self.timeout_opcodes = timeout_opcodes
330+
self.retry_count = retry_count
329331

330332
# keyed by (type, type-specific-instance)
331333
self.commands = {}
@@ -368,6 +370,11 @@ async def handle_mctp_control(self, sock, addr, data):
368370
# Use IID from request, zero Rq and D bits
369371
hdr = [iid, opcode]
370372

373+
if opcode in self.timeout_opcodes:
374+
if self.retry_count > 0:
375+
self.retry_count -= 1
376+
return
377+
371378
if opcode == 1:
372379
# Set Endpoint ID
373380
(op, eid) = data[2:]
@@ -435,6 +442,9 @@ async def send_control(self, sock, cmd):
435442

436443
return await cmd.wait()
437444

445+
def response_timeout_control(self, opcode):
446+
self.timeout_opcodes.add(opcode)
447+
438448
class Network:
439449
def __init__(self):
440450
self.endpoints = []
@@ -1197,13 +1207,18 @@ async def handle_control(self, nursery):
11971207
else:
11981208
print(f"unknown op {op}")
11991209

1210+
import subprocess
1211+
12001212
class MctpdWrapper(MctpProcessWrapper):
12011213
def __init__(self, bus, sysnet, binary=None, config=None):
12021214
super().__init__(sysnet)
12031215
self.bus = bus
12041216
self.binary = binary or './test-mctpd'
12051217
self.config = config
12061218

1219+
self.stdout_logs = []
1220+
self.stderr_logs = []
1221+
12071222
async def start_mctpd(self, nursery):
12081223
nursery.start_soon(self.handle_control, nursery)
12091224
(send_chan, self.proc_rc_recv_chan) = trio.open_memory_channel(1)
@@ -1253,10 +1268,13 @@ def name_owner_changed(name, new_owner, old_owner):
12531268
config_file = None
12541269
command = [self.binary, '-v']
12551270

1271+
import subprocess
12561272
proc = await trio.lowlevel.open_process(
12571273
command = command,
12581274
pass_fds = (1, 2, self.sock_remote.fileno()),
12591275
env = env,
1276+
stdout = subprocess.PIPE,
1277+
stderr = subprocess.PIPE,
12601278
)
12611279
self.sock_remote.close()
12621280

@@ -1269,6 +1287,13 @@ def name_owner_changed(name, new_owner, old_owner):
12691287
# process after the test has run.
12701288
task_status.started(proc)
12711289

1290+
async def read_stream(stream, storage):
1291+
async for data in stream:
1292+
storage.append(data.decode(errors="replace"))
1293+
1294+
nursery.start_soon(read_stream, proc.stdout, self.stdout_logs)
1295+
nursery.start_soon(read_stream, proc.stderr, self.stderr_logs)
1296+
12721297
proc_rc = await proc.wait()
12731298

12741299
if config_file:

tests/test_mctpd.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1376,3 +1376,30 @@ async def test_register_vdm_type_support_errors(dbus, mctpd):
13761376
with pytest.raises(asyncdbus.errors.DBusError) as ex:
13771377
await mctp.call_register_vdm_type_support(0x00, v_type, 0x0001)
13781378
assert str(ex.value) == "VDM type already registered"
1379+
1380+
async def test_query_peer_properties_retry_timeout(nursery, dbus, sysnet):
1381+
1382+
# activate mctpd
1383+
mctpd = MctpdWrapper(dbus, sysnet)
1384+
await mctpd.start_mctpd(nursery)
1385+
1386+
iface = mctpd.system.interfaces[0]
1387+
ep = mctpd.network.endpoints[0]
1388+
1389+
mctp = await mctpd_mctp_iface_obj(dbus, iface)
1390+
1391+
# add a bridged endpoint to ep
1392+
fake_ep = Endpoint(iface, b'\x12\x34', types=[0, 2])
1393+
fake_ep.response_timeout_control(5)
1394+
ep.add_bridged_ep(fake_ep)
1395+
mctpd.network.add_endpoint(fake_ep)
1396+
1397+
# call assign_endpoint on ep, which will allocate a pool for fake_ep
1398+
mctp = await mctpd_mctp_iface_obj(dbus, iface)
1399+
await mctp.call_setup_endpoint(ep.lladdr)
1400+
1401+
assert any("Retrying to get endpoint types" in l for l in mctpd.stderr_logs)
1402+
1403+
# exit mctpd
1404+
res = await mctpd.stop_mctpd()
1405+
assert res == 0

0 commit comments

Comments
 (0)