@@ -318,18 +318,23 @@ async fn retry_write_different_mongos() {
318
318
return ;
319
319
}
320
320
321
- // NOTE: This test places all failpoints on a single mongos server to avoid flakiness caused by
322
- // incomplete server discovery.
323
- //
324
- // In MongoDB versions 4.2 and 4.4, the SDAM process can be slow or non-deterministic,
325
- // especially immediately after creating the cluster. The driver may not have sent "hello"
326
- // messages to all connected servers yet, which means some mongos instances may still be in
327
- // the "Unknown" state and not selectable for retryable writes.
328
- //
329
- // This caused test failures because the retry logic expected to find a second eligible server,
330
- // but the driver was unaware of its existence. By placing all failpoints on a single mongos
331
- // host, we ensure that server selection and retries happen within a single fully discovered
332
- // router, avoiding issues caused by prematurely filtered or undiscovered servers.
321
+ // NOTE: This test uses a single client to set failpoints on each mongos and run the insert
322
+ // operation. This avoids flakiness caused by a race between server discovery and server
323
+ // selection.
324
+
325
+ // When a client is first created, it initializes its view of the topology with all configured
326
+ // mongos addresses, but marks each as Unknown until it completes the server discovery process
327
+ // by sending and receiving "hello" messages. Unknown servers are not eligible for server
328
+ // selection.
329
+
330
+ // Previously, we created a new client for each call to `enable_fail_point` and for the insert
331
+ // operation. Each new client restarted the discovery process, and sometimes had not yet marked
332
+ // both mongos servers as usable, leading to test failures when the retry logic couldn't find
333
+ // a second eligible server.
334
+
335
+ // By reusing a single client, each `enable_fail_point` call forces discovery to complete for
336
+ // the corresponding mongos. As a result, when the insert operation runs, the client has a
337
+ // fully discovered topology and can reliably select between both servers.
333
338
client_options. hosts . drain ( 2 ..) ;
334
339
client_options. retry_writes = Some ( true ) ;
335
340
let hosts = client_options. hosts . clone ( ) ;
0 commit comments