Skip to content

Commit 72f1268

Browse files
committed
Make tc health summary more informative and include worker manager info
1 parent 5aa25f6 commit 72f1268

2 files changed

Lines changed: 52 additions & 12 deletions

File tree

.claude/skills/debug-tc-logs/tc-health-summary.sh

Lines changed: 51 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
SINCE="${SINCE:-12h}"
77
ENVS="${ENVS:-community-tc fx-ci}"
8-
LIMIT=200
8+
LIMIT=500
99
TMPDIR_BASE=$(mktemp -d)
1010
trap 'rm -rf "$TMPDIR_BASE"' EXIT
1111

@@ -39,19 +39,32 @@ query_env() {
3939
--since "$SINCE" --limit "$LIMIT" \
4040
> "$dir/api_403.jsonl" &
4141

42+
query -e "$env" --type worker-requested --since "$SINCE" --limit "$LIMIT" \
43+
> "$dir/worker_requested.jsonl" &
44+
4245
query -e "$env" --type worker-stopped --since "$SINCE" --limit "$LIMIT" \
4346
> "$dir/worker_stopped.jsonl" &
4447

45-
query -e "$env" \
48+
# Claim-expired tasks (task-exception from claim-resolver)
49+
query -e "$env" --type task-exception \
4650
--filter 'jsonPayload.Logger="taskcluster.queue.claim-resolver"' \
4751
--since "$SINCE" --limit "$LIMIT" \
4852
> "$dir/claim_expired.jsonl" &
4953

50-
query -e "$env" \
54+
# Deadline-exceeded tasks (task-exception from deadline-resolver)
55+
query -e "$env" --type task-exception \
5156
--filter 'jsonPayload.Logger="taskcluster.queue.deadline-resolver"' \
5257
--since "$SINCE" --limit "$LIMIT" \
5358
> "$dir/deadline_exceeded.jsonl" &
5459

60+
# Worker-manager periodic loop failures
61+
for loop in workerScanner workerScannerAzure provisioner; do
62+
query -e "$env" --type monitor.periodic \
63+
--where "name=$loop" --where 'status!=success' \
64+
--since "$SINCE" --limit "$LIMIT" \
65+
> "$dir/periodic_${loop}.jsonl" &
66+
done
67+
5568
wait
5669
}
5770

@@ -112,21 +125,29 @@ summarize_env() {
112125
local nce; nce=$(count "$dir/claim_expired.jsonl")
113126
echo ""
114127
echo "CLAIM-EXPIRED ($nce tasks)"
115-
# taskQueueId not present in resolver logs; show taskId sample instead
116128
if [[ $nce -gt 0 ]]; then
117-
echo " Sample task IDs:"
118-
jq -r '.taskId // "?"' "$dir/claim_expired.jsonl" 2>/dev/null | head -5 \
119-
| awk '{print " " $0}' || true
129+
echo " By task queue:"
130+
top_n "$dir/claim_expired.jsonl" '.taskQueueId // "unknown"'
120131
fi
121132

122133
# Deadline-exceeded
123134
local nde; nde=$(count "$dir/deadline_exceeded.jsonl")
124135
echo ""
125136
echo "DEADLINE-EXCEEDED ($nde tasks)"
126137
if [[ $nde -gt 0 ]]; then
127-
echo " Sample task IDs:"
128-
jq -r '.taskId // "?"' "$dir/deadline_exceeded.jsonl" 2>/dev/null | head -5 \
129-
| awk '{print " " $0}' || true
138+
echo " By task queue:"
139+
top_n "$dir/deadline_exceeded.jsonl" '.taskQueueId // "unknown"'
140+
fi
141+
142+
# Workers requested
143+
local nwr; nwr=$(count "$dir/worker_requested.jsonl")
144+
echo ""
145+
echo "WORKERS REQUESTED ($nwr total)"
146+
if [[ $nwr -gt 0 ]]; then
147+
echo " By pool (top 8):"
148+
top_n "$dir/worker_requested.jsonl" '.workerPoolId // "unknown"' 8
149+
echo " By provider:"
150+
top_n "$dir/worker_requested.jsonl" '.providerId // "unknown"'
130151
fi
131152

132153
# Worker stopped
@@ -140,10 +161,29 @@ summarize_env() {
140161
top_n "$dir/worker_stopped.jsonl" '.reason // "(no reason)"' 8
141162
fi
142163

164+
# Worker-manager periodic loop failures
165+
echo ""
166+
echo "PERIODIC LOOP FAILURES"
167+
local any_periodic_fail=0
168+
for loop in workerScanner workerScannerAzure provisioner; do
169+
local f="$dir/periodic_${loop}.jsonl"
170+
local nfail; nfail=$(count "$f")
171+
if [[ $nfail -eq 0 ]]; then continue; fi
172+
any_periodic_fail=1
173+
echo " $loop: ${nfail} failures"
174+
echo " By status:"
175+
top_n "$f" '.status // "unknown"'
176+
echo " Max duration:"
177+
local max_dur; max_dur=$(jq -r '.duration // 0' "$f" 2>/dev/null | sort -n | tail -1)
178+
echo " ${max_dur}s"
179+
done
180+
[[ $any_periodic_fail -eq 0 ]] && echo " ✓ All loops healthy (no non-success runs)"
181+
143182
# Limit warnings
144183
echo ""
145184
local warned=0
146-
for f in errors api_500 api_403 claim_expired deadline_exceeded worker_stopped; do
185+
for f in errors api_500 api_403 claim_expired deadline_exceeded worker_requested worker_stopped \
186+
periodic_workerScanner periodic_workerScannerAzure periodic_provisioner; do
147187
local c; c=$(count "$dir/$f.jsonl")
148188
if [[ "$c" -ge "$LIMIT" ]]; then
149189
echo "$f hit limit ($LIMIT) — results truncated, tighten filters or reduce --since"

cmd/query.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -221,7 +221,7 @@ func runQuery(cmd *cobra.Command, args []string) error {
221221
}
222222

223223
// Add message field when --filter is used or no types were specified
224-
if (queryFilter != "" || len(fieldNames) < 3 ) && !slices.Contains(fieldNames, "message") {
224+
if (queryFilter != "" || len(types) == 0 ) && !slices.Contains(fieldNames, "message") {
225225
fieldNames = append(fieldNames, "message")
226226
}
227227

0 commit comments

Comments
 (0)