Skip to content

Commit 0a82858

Browse files
committed
[core] improvements in error reporting during env deployment and configuration
- "task '/opt/o2/bin/o2-readout-exe' on alio2-cr1-mvs11 (id 2sBwA3Z8yWU, name alio2-cr1-hv-gw01.cern.ch:/opt/git/ControlWorkflows/tasks/readout@12b11ac4bb652e1835e3e94806a688c951691d5f#2sBwA3Z8yWU) failed with error..." displayed by COG becomes "task 'readout' on alio2-cr1-mvs11 (id 2sBwA3Z8yWU) failed with error...". I believe the removed information is not useful for typical operators. Details can be always found in logs. - "MesosCommand MesosCommand_Transition timed out for task 2sBwYFZ82wn" becomes "MesosCommand_Transition timed out for task 2sBwYFZ82wn" because by convention, all MesosCommands have "MesosCommand" in their names. - If a transition request times out, more accurate error is reported in COG. "nil response" becomes "CONFIGURE could not complete for critical tasks, errors: task 'readout' on alio2-cr1-mvs11 (id 2sBwA3Z8yWU) failed with error: MesosCommand_Transition timed out for task 2sBwA3Z8yWU". - In case a task transition times out, any other errors which happened at the same time (e.g. a task crashed during transition) are not omitted anymore in COG. - "nil response" in task/manager.go becomes "no response from Mesos to CONFIGURE transition request within 120s timeout", but it will not be typically printed for tasks timing out during a transition. - "nil response" in commandqueue.go becomes "did not receive neither response nor error for MesosCommand_Transition" One could consider further simplification of some of these messages, but perhaps let's see how this goes.
1 parent 3830420 commit 0a82858

File tree

4 files changed

+11
-13
lines changed

4 files changed

+11
-13
lines changed

core/controlcommands/commandqueue.go

+7-3
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ func (m *CommandQueue) Start() {
101101
}
102102
if err == nil && response == nil {
103103
log.WithField("partition", entry.cmd.GetEnvironmentId().String()).
104-
Error("nil response")
104+
Errorf("did not receive neither response nor error for %s", entry.cmd.GetName())
105105
}
106106

107107
entry.callback <- response
@@ -198,6 +198,11 @@ func (m *CommandQueue) commit(command MesosCommand) (response MesosCommandRespon
198198
// Wait for goroutines to finish
199199
for i := 0; i < len(command.targets()); i++ {
200200
respSemaphore := <-semaphore
201+
// for the sake of better error propagation, we treat a lack of response as a response with error,
202+
// even though it's not technically the same. it can be surely done better, but it would require a larger refactoring.
203+
if respSemaphore.err != nil && respSemaphore.response == nil {
204+
respSemaphore.response = NewMesosCommandResponse(command, respSemaphore.err)
205+
}
201206
responses[respSemaphore.receiver] = respSemaphore.response
202207
if respSemaphore.err != nil {
203208
sendErrorList = append(sendErrorList, respSemaphore.err)
@@ -215,12 +220,11 @@ func (m *CommandQueue) commit(command MesosCommand) (response MesosCommandRespon
215220
}
216221
return
217222
}(), "\n"))
218-
return
219223
}
220224
response = consolidateResponses(command, responses)
221225

222226
log.WithField("partition", command.GetEnvironmentId().String()).
223227
Debug("responses consolidated, CommandQueue commit done")
224228

225-
return response, nil
229+
return response, err
226230
}

core/controlcommands/mesoscommandservent.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ func (s *Servent) RunCommand(cmd MesosCommand, receiver MesosCommandTarget) (Mes
144144
// By the time we get here, ProcessResponse should have already added a Response to the
145145
// pending call, and removed it from servent.pending.
146146
case <-time.After(cmd.GetResponseTimeout()):
147-
call.Error = fmt.Errorf("MesosCommand %s timed out for task %s", cmd.GetName(), receiver.TaskId.Value)
147+
call.Error = fmt.Errorf("%s timed out for task %s", cmd.GetName(), receiver.TaskId.Value)
148148

149149
log.WithPrefix("servent").
150150
WithField("partition", cmd.GetEnvironmentId().String()).

core/task/manager.go

+2-9
Original file line numberDiff line numberDiff line change
@@ -754,7 +754,7 @@ func (m *Manager) configureTasks(envId uid.ID, tasks Tasks) error {
754754
close(notify)
755755

756756
if response == nil {
757-
return errors.New("nil response")
757+
return fmt.Errorf("no response from Mesos to CONFIGURE transition request within %ds timeout", int(cmd.ResponseTimeout.Seconds()))
758758
}
759759

760760
if response.IsMultiResponse() {
@@ -765,14 +765,7 @@ func (m *Manager) configureTasks(envId uid.ID, tasks Tasks) error {
765765
task := m.GetTask(k.TaskId.Value)
766766
var taskDescription string
767767
if task != nil {
768-
tci := task.GetTaskCommandInfo()
769-
tciValue := "unknown command"
770-
if tci.Value != nil {
771-
tciValue = *tci.Value
772-
}
773-
774-
taskDescription = fmt.Sprintf("task '%s' on %s (id %s, name %s) failed with error: %s", tciValue, task.GetHostname(), task.GetTaskId(), task.GetName(), v.Error())
775-
768+
taskDescription = fmt.Sprintf("task '%s' on %s (id %s) failed with error: %s", task.GetParent().GetName(), task.GetHostname(), task.GetTaskId(), v.Error())
776769
} else {
777770
taskDescription = fmt.Sprintf("unknown task (id %s) failed with error: %s", k.TaskId.Value, v.Error())
778771
}

core/task/task.go

+1
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ type parentRole interface {
7676
ConsolidatedVarStack() (varStack map[string]string, err error)
7777
CollectInboundChannels() []channel.Inbound
7878
SendEvent(event.Event)
79+
GetName() string
7980
}
8081

8182
type Traits struct {

0 commit comments

Comments
 (0)