|
1 | 1 | package modmanager
|
2 | 2 |
|
3 | 3 | import (
|
| 4 | + "bytes" |
4 | 5 | "context"
|
5 | 6 | "encoding/json"
|
6 | 7 | "errors"
|
@@ -28,6 +29,7 @@ import (
|
28 | 29 | "go.viam.com/rdk/components/generic"
|
29 | 30 | "go.viam.com/rdk/components/motor"
|
30 | 31 | "go.viam.com/rdk/config"
|
| 32 | + "go.viam.com/rdk/ftdc" |
31 | 33 | "go.viam.com/rdk/logging"
|
32 | 34 | modlib "go.viam.com/rdk/module"
|
33 | 35 | modmanageroptions "go.viam.com/rdk/module/modmanager/options"
|
@@ -1379,6 +1381,91 @@ func TestBadModuleFailsFast(t *testing.T) {
|
1379 | 1381 | test.That(t, err.Error(), test.ShouldContainSubstring, "module test-module exited too quickly after attempted startup")
|
1380 | 1382 | }
|
1381 | 1383 |
|
| 1384 | +// TestFTDCAfterModuleCrash is to give confidence that the FTDC sections devoted to tracking module |
| 1385 | +// process information (e.g: CPU usage) is in sync with the Process IDs (PIDs) that are actually |
| 1386 | +// running. |
| 1387 | +func TestFTDCAfterModuleCrash(t *testing.T) { |
| 1388 | + logger := logging.NewTestLogger(t) |
| 1389 | + modCfgs := []config.Module{ |
| 1390 | + { |
| 1391 | + Name: "test-module", |
| 1392 | + // testmodule2 has a `kill_module` DoCommand to force a module/process crash. |
| 1393 | + ExePath: rtestutils.BuildTempModule(t, "module/testmodule2"), |
| 1394 | + Type: config.ModuleTypeLocal, |
| 1395 | + }, |
| 1396 | + } |
| 1397 | + |
| 1398 | + ctx := context.Background() |
| 1399 | + parentAddr := setupSocketWithRobot(t) |
| 1400 | + opts := modmanageroptions.Options{UntrustedEnv: false} |
| 1401 | + |
| 1402 | + // Start up a mod manager with FTDC enabled. We will inspect the FTDC output for the |
| 1403 | + // `ElapsedTimeSecs` to assert the "pid tracking code" is working correctly. |
| 1404 | + ftdcData := bytes.NewBuffer(nil) |
| 1405 | + opts.FTDC = ftdc.NewWithWriter(ftdcData, logger) |
| 1406 | + // Normally a test would explicitly call `constructDatum` to control/guarantee FTDC gets |
| 1407 | + // data. But as a short-cut to avoid exposing methods that are currently private, we just run |
| 1408 | + // FTDC in the background. And sleep long enough between testing events (killing the module) to |
| 1409 | + // assert the right behavior. |
| 1410 | + opts.FTDC.Start() |
| 1411 | + |
| 1412 | + // Set up a mod manager. Currently there are zero modules running. |
| 1413 | + mgr := setupModManager(t, ctx, parentAddr, logger, opts) |
| 1414 | + |
| 1415 | + // Add a module, this will register an FTDC "section" for that module process. |
| 1416 | + err := mgr.Add(ctx, modCfgs...) |
| 1417 | + test.That(t, err, test.ShouldBeNil) |
| 1418 | + |
| 1419 | + // Add a resource -- this is simply to invoke the `kill_module` command. |
| 1420 | + res, err := mgr.AddResource(ctx, resource.Config{ |
| 1421 | + Name: "foo", |
| 1422 | + API: generic.API, |
| 1423 | + Model: resource.NewModel("rdk", "test", "helper2"), |
| 1424 | + }, nil) |
| 1425 | + test.That(t, err, test.ShouldBeNil) |
| 1426 | + test.That(t, mgr.IsModularResource(generic.Named("foo")), test.ShouldBeTrue) |
| 1427 | + |
| 1428 | + // Kill the module a few times for good measure. |
| 1429 | + for idx := 0; idx < 3; idx++ { |
| 1430 | + _, _ = res.DoCommand(ctx, map[string]interface{}{"command": "kill_module"}) |
| 1431 | + |
| 1432 | + // FTDC is running in the background with a one second interval. So we sleep for two seconds |
| 1433 | + // and cross our fingers we don't get a poor scheduler execution. The assertions are |
| 1434 | + // intentionally weak to minimize the risk of false positives (a test failure with correct |
| 1435 | + // production code). |
| 1436 | + time.Sleep(2 * time.Second) |
| 1437 | + } |
| 1438 | + |
| 1439 | + mgr.Close(ctx) |
| 1440 | + opts.FTDC.StopAndJoin(ctx) |
| 1441 | + |
| 1442 | + datums, err := ftdc.Parse(ftdcData) |
| 1443 | + test.That(t, err, test.ShouldBeNil) |
| 1444 | + logger.Info("Num ftdc datums: ", len(datums)) |
| 1445 | + |
| 1446 | + // Keep count of the number of `ElapsedTimeSecs` readings we encounter. It is a testing bug if |
| 1447 | + // we don't see any process FTDC metrics for the module. |
| 1448 | + numModuleElapsedTimeMetricsSeen := 0 |
| 1449 | + for _, datum := range datums { |
| 1450 | + for _, reading := range datum.Readings { |
| 1451 | + if reading.MetricName == "proc.modules.test-module.ElapsedTimeSecs" { |
| 1452 | + logger.Infow("Reading", "timestamp", datum.Time, "elapsedTimeSecs", reading.Value) |
| 1453 | + numModuleElapsedTimeMetricsSeen++ |
| 1454 | + // Dan: I don't have a good reason to believe that we can't (legitimately) observe |
| 1455 | + // an `ElapsedTimeSecs` of 0 here. It's more likely we'd see a 0 because we queried |
| 1456 | + // a bad PID. |
| 1457 | + // |
| 1458 | + // If my assumption is wrong and we get a false positive here, we can reevaluate the |
| 1459 | + // options for making a more robust test. |
| 1460 | + test.That(t, reading.Value, test.ShouldBeGreaterThan, 0) |
| 1461 | + } |
| 1462 | + } |
| 1463 | + } |
| 1464 | + |
| 1465 | + // Assert that we saw at least one datapoint before considering the test a success. |
| 1466 | + test.That(t, numModuleElapsedTimeMetricsSeen, test.ShouldBeGreaterThan, 0) |
| 1467 | +} |
| 1468 | + |
1382 | 1469 | func TestModularDiscoverFunc(t *testing.T) {
|
1383 | 1470 | ctx := context.Background()
|
1384 | 1471 | logger := logging.NewTestLogger(t)
|
|
0 commit comments