Skip to content

Commit d2b9c7e

Browse files
authored
RSDK-9706: Fix FTDC tracking of module restarts/crashes. (#4695)
1 parent 91b9240 commit d2b9c7e

File tree

3 files changed

+100
-2
lines changed

3 files changed

+100
-2
lines changed

ftdc/cmd/parser.go

+3
Original file line numberDiff line numberDiff line change
@@ -441,6 +441,9 @@ func main() {
441441
nolintPrintln("reset range")
442442
nolintPrintln("- Unset any prior range. \"zoom out to full\"")
443443
nolintPrintln()
444+
nolintPrintln("r, refresh")
445+
nolintPrintln("- Regenerate the plot.png image. Useful when a current viam-server is running.")
446+
nolintPrintln()
444447
nolintPrintln("`quit` or Ctrl-d to exit")
445448
case strings.HasPrefix(cmd, "range "):
446449
pieces := strings.SplitN(cmd, " ", 3)

module/modmanager/manager.go

+10-2
Original file line numberDiff line numberDiff line change
@@ -867,6 +867,10 @@ func (mgr *Manager) newOnUnexpectedExitHandler(mod *module) func(exitCode int) b
867867
"error", err)
868868
}
869869

870+
if mgr.ftdc != nil {
871+
mgr.ftdc.Remove(mod.getFTDCName())
872+
}
873+
870874
// If attemptRestart returns any orphaned resource names, restart failed,
871875
// and we should remove orphaned resources. Since we handle process
872876
// restarting ourselves, return false here so goutils knows not to attempt
@@ -1226,7 +1230,7 @@ func (m *module) stopProcess() error {
12261230
// of metrics will be reported. Therefore it is safe to continue monitoring the module process
12271231
// while it's in shutdown.
12281232
if m.ftdc != nil {
1229-
m.ftdc.Remove(m.process.ID())
1233+
m.ftdc.Remove(m.getFTDCName())
12301234
}
12311235
}()
12321236

@@ -1348,6 +1352,10 @@ func (m *module) getFullEnvironment(viamHomeDir string) map[string]string {
13481352
return getFullEnvironment(m.cfg, m.dataDir, viamHomeDir)
13491353
}
13501354

1355+
func (m *module) getFTDCName() string {
1356+
return fmt.Sprintf("proc.modules.%s", m.process.ID())
1357+
}
1358+
13511359
func (m *module) registerProcessWithFTDC() {
13521360
if m.ftdc == nil {
13531361
return
@@ -1365,7 +1373,7 @@ func (m *module) registerProcessWithFTDC() {
13651373
return
13661374
}
13671375

1368-
m.ftdc.Add(fmt.Sprintf("proc.modules.%s", m.process.ID()), statser)
1376+
m.ftdc.Add(m.getFTDCName(), statser)
13691377
}
13701378

13711379
func getFullEnvironment(

module/modmanager/manager_test.go

+87
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package modmanager
22

33
import (
4+
"bytes"
45
"context"
56
"encoding/json"
67
"errors"
@@ -28,6 +29,7 @@ import (
2829
"go.viam.com/rdk/components/generic"
2930
"go.viam.com/rdk/components/motor"
3031
"go.viam.com/rdk/config"
32+
"go.viam.com/rdk/ftdc"
3133
"go.viam.com/rdk/logging"
3234
modlib "go.viam.com/rdk/module"
3335
modmanageroptions "go.viam.com/rdk/module/modmanager/options"
@@ -1379,6 +1381,91 @@ func TestBadModuleFailsFast(t *testing.T) {
13791381
test.That(t, err.Error(), test.ShouldContainSubstring, "module test-module exited too quickly after attempted startup")
13801382
}
13811383

1384+
// TestFTDCAfterModuleCrash is to give confidence that the FTDC sections devoted to tracking module
1385+
// process information (e.g: CPU usage) is in sync with the Process IDs (PIDs) that are actually
1386+
// running.
1387+
func TestFTDCAfterModuleCrash(t *testing.T) {
1388+
logger := logging.NewTestLogger(t)
1389+
modCfgs := []config.Module{
1390+
{
1391+
Name: "test-module",
1392+
// testmodule2 has a `kill_module` DoCommand to force a module/process crash.
1393+
ExePath: rtestutils.BuildTempModule(t, "module/testmodule2"),
1394+
Type: config.ModuleTypeLocal,
1395+
},
1396+
}
1397+
1398+
ctx := context.Background()
1399+
parentAddr := setupSocketWithRobot(t)
1400+
opts := modmanageroptions.Options{UntrustedEnv: false}
1401+
1402+
// Start up a mod manager with FTDC enabled. We will inspect the FTDC output for the
1403+
// `ElapsedTimeSecs` to assert the "pid tracking code" is working correctly.
1404+
ftdcData := bytes.NewBuffer(nil)
1405+
opts.FTDC = ftdc.NewWithWriter(ftdcData, logger)
1406+
// Normally a test would explicitly call `constructDatum` to control/guarantee FTDC gets
1407+
// data. But as a short-cut to avoid exposing methods that are currently private, we just run
1408+
// FTDC in the background. And sleep long enough between testing events (killing the module) to
1409+
// assert the right behavior.
1410+
opts.FTDC.Start()
1411+
1412+
// Set up a mod manager. Currently there are zero modules running.
1413+
mgr := setupModManager(t, ctx, parentAddr, logger, opts)
1414+
1415+
// Add a module, this will register an FTDC "section" for that module process.
1416+
err := mgr.Add(ctx, modCfgs...)
1417+
test.That(t, err, test.ShouldBeNil)
1418+
1419+
// Add a resource -- this is simply to invoke the `kill_module` command.
1420+
res, err := mgr.AddResource(ctx, resource.Config{
1421+
Name: "foo",
1422+
API: generic.API,
1423+
Model: resource.NewModel("rdk", "test", "helper2"),
1424+
}, nil)
1425+
test.That(t, err, test.ShouldBeNil)
1426+
test.That(t, mgr.IsModularResource(generic.Named("foo")), test.ShouldBeTrue)
1427+
1428+
// Kill the module a few times for good measure.
1429+
for idx := 0; idx < 3; idx++ {
1430+
_, _ = res.DoCommand(ctx, map[string]interface{}{"command": "kill_module"})
1431+
1432+
// FTDC is running in the background with a one second interval. So we sleep for two seconds
1433+
// and cross our fingers we don't get a poor scheduler execution. The assertions are
1434+
// intentionally weak to minimize the risk of false positives (a test failure with correct
1435+
// production code).
1436+
time.Sleep(2 * time.Second)
1437+
}
1438+
1439+
mgr.Close(ctx)
1440+
opts.FTDC.StopAndJoin(ctx)
1441+
1442+
datums, err := ftdc.Parse(ftdcData)
1443+
test.That(t, err, test.ShouldBeNil)
1444+
logger.Info("Num ftdc datums: ", len(datums))
1445+
1446+
// Keep count of the number of `ElapsedTimeSecs` readings we encounter. It is a testing bug if
1447+
// we don't see any process FTDC metrics for the module.
1448+
numModuleElapsedTimeMetricsSeen := 0
1449+
for _, datum := range datums {
1450+
for _, reading := range datum.Readings {
1451+
if reading.MetricName == "proc.modules.test-module.ElapsedTimeSecs" {
1452+
logger.Infow("Reading", "timestamp", datum.Time, "elapsedTimeSecs", reading.Value)
1453+
numModuleElapsedTimeMetricsSeen++
1454+
// Dan: I don't have a good reason to believe that we can't (legitimately) observe
1455+
// an `ElapsedTimeSecs` of 0 here. It's more likely we'd see a 0 because we queried
1456+
// a bad PID.
1457+
//
1458+
// If my assumption is wrong and we get a false positive here, we can reevaluate the
1459+
// options for making a more robust test.
1460+
test.That(t, reading.Value, test.ShouldBeGreaterThan, 0)
1461+
}
1462+
}
1463+
}
1464+
1465+
// Assert that we saw at least one datapoint before considering the test a success.
1466+
test.That(t, numModuleElapsedTimeMetricsSeen, test.ShouldBeGreaterThan, 0)
1467+
}
1468+
13821469
func TestModularDiscoverFunc(t *testing.T) {
13831470
ctx := context.Background()
13841471
logger := logging.NewTestLogger(t)

0 commit comments

Comments
 (0)