Skip to content

Commit df67f03

Browse files
committed
Add health summary script
1 parent 387f16d commit df67f03

2 files changed

Lines changed: 182 additions & 0 deletions

File tree

.claude/skills/debug-tc-logs/SKILL.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,17 @@ user_invocable: true
88

99
You are debugging Taskcluster by querying task status/logs via the `taskcluster` CLI and GCP Cloud Logging via `tc-logview`. Follow this protocol strictly.
1010

11+
## 0. Health Summary Script
12+
13+
For broad "what happened?" investigations across environments, use the pre-built aggregation script before running manual queries:
14+
15+
```bash
16+
bash ~/.claude/skills/debug-tc-logs/tc-health-summary.sh --since 12h
17+
# or: --since 6h, --envs "fx-ci", --envs "community-tc fx-ci staging", etc.
18+
```
19+
20+
The script runs all key queries in parallel (errors, 500s, 403s, claim-expired, deadline-exceeded, worker-stopped) for each environment and prints a structured summary. Use it as a starting point — it will reveal which areas need deeper follow-up queries.
21+
1122
## 1. Prerequisites
1223

1324
Before running any query, verify:
Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
#!/usr/bin/env bash
# tc-health-summary.sh — aggregate Taskcluster health signals across environments
# Usage: tc-health-summary.sh [--since 12h] [--envs "community-tc fx-ci"]
# Runs queries in parallel per environment and prints a structured summary.
#
# SINCE and ENVS may also be supplied through the environment; command-line
# flags take precedence over them.

SINCE="${SINCE:-12h}"
ENVS="${ENVS:-community-tc fx-ci}"
LIMIT=200

# Scratch directory for the per-environment result files; removed on exit.
TMPDIR_BASE=$(mktemp -d) || { echo "mktemp -d failed" >&2; exit 1; }
trap 'rm -rf -- "$TMPDIR_BASE"' EXIT

# Parse command-line flags into the SINCE / ENVS globals.
# Arguments: the script's positional parameters
# Exits 1 (message on stderr) on an unknown flag or a flag without a value.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --since | --envs)
        # `shift 2` with only one parameter left fails WITHOUT shifting, so
        # a trailing flag with no value would otherwise loop forever.
        if [[ $# -lt 2 ]]; then
          echo "Missing value for $1" >&2
          exit 1
        fi
        if [[ "$1" == "--since" ]]; then
          SINCE="$2"
        else
          ENVS="$2"
        fi
        shift 2
        ;;
      *)
        echo "Unknown flag: $1" >&2
        exit 1
        ;;
    esac
  done
}

parse_args "$@"
19+
20+
# Helper: run tc-logview and emit only JSON lines
# Arguments: passed through to `tc-logview query`, followed by --json
# Outputs:   stdout - every output line that begins with "{"
#            stderr - a warning when tc-logview itself exits non-zero
# Returns:   always 0, so one failed query never aborts the whole summary
query() {
  local -a stage_status

  # The tool's own stderr is discarded on purpose (best-effort query); its
  # exit status is still inspected below so that a failure is not silently
  # reported as "no results".
  tc-logview query "$@" --json 2>/dev/null | grep '^{'
  stage_status=("${PIPESTATUS[@]}")

  if [[ "${stage_status[0]}" -ne 0 ]]; then
    printf 'warning: tc-logview query failed (exit %s): %s\n' \
      "${stage_status[0]}" "$*" >&2
  fi
  return 0
}
24+
25+
# Launch every health query for one environment concurrently, then block
# until all of them have finished.
# Globals:   TMPDIR_BASE, SINCE, LIMIT (read)
# Arguments: $1 - environment name handed to `query -e`
# Outputs:   one .jsonl file per signal under "$TMPDIR_BASE/<env>/"
query_env() {
  local environment="$1"
  local out_dir="$TMPDIR_BASE/$environment"
  # Flag groups shared by all six queries; argument order is kept as
  # "-e <env>", then the query-specific selector, then the time window.
  local -a target=(-e "$environment")
  local -a window=(--since "$SINCE" --limit "$LIMIT")

  mkdir -p "$out_dir"

  query "${target[@]}" --type monitor.error "${window[@]}" \
    > "$out_dir/errors.jsonl" &

  query "${target[@]}" --type monitor.apiMethod --where 'statusCode="500"' \
    "${window[@]}" \
    > "$out_dir/api_500.jsonl" &

  query "${target[@]}" --type monitor.apiMethod --where 'statusCode="403"' \
    "${window[@]}" \
    > "$out_dir/api_403.jsonl" &

  query "${target[@]}" --type worker-stopped "${window[@]}" \
    > "$out_dir/worker_stopped.jsonl" &

  query "${target[@]}" \
    --filter 'jsonPayload.Logger="taskcluster.queue.claim-resolver"' \
    "${window[@]}" \
    > "$out_dir/claim_expired.jsonl" &

  query "${target[@]}" \
    --filter 'jsonPayload.Logger="taskcluster.queue.deadline-resolver"' \
    "${window[@]}" \
    > "$out_dir/deadline_exceeded.jsonl" &

  # Barrier: return only once every background query above has exited.
  wait
}
57+
58+
# Print the number of lines in a results file.
# Arguments: $1 - path to the file (optional)
# Outputs:   exactly one integer on stdout; 0 when the path is missing,
#            empty, not a regular file, or unreadable
count() {
  local f="${1:-}"
  local n=0

  if [[ -f "$f" && -s "$f" && -r "$f" ]]; then
    n=$(wc -l < "$f") || n=0
    # Some wc implementations pad the number with blanks.
    n="${n//[[:space:]]/}"
  fi

  printf '%s\n' "${n:-0}"
}
62+
63+
# Top N: print the most frequent keys extracted from a JSON-lines file.
# Arguments: $1 - path to the .jsonl file
#            $2 - jq expression run with `jq -r` on each record; each output
#                 line is treated as one key
#            $3 - how many keys to show (default 5)
# Outputs:   " <count> × <key>" lines, most frequent first, or " (none)"
#            when the file is missing or empty
# Returns:   0 (jq failures are deliberately ignored: best-effort summary)
top_n() {
  local file="$1" expr="$2" n="${3:-5}"

  if [[ ! -s "$file" ]]; then
    echo " (none)"
    return
  fi

  # Strip the `uniq -c` count prefix with sub() on the whole record rather
  # than assigning $1="": rebuilding $0 would collapse runs of whitespace
  # inside the key itself.
  jq -r "$expr" "$file" 2>/dev/null \
    | sort | uniq -c | sort -rn | head -n "$n" \
    | awk '{cnt=$1; sub(/^[[:space:]]*[0-9]+ ?/, ""); printf " %4s × %s\n", cnt, $0}' \
    || true
}
72+
73+
# Print up to five taskId values from a results file, one per line.
# Arguments: $1 - path to the .jsonl file
_sample_task_ids() {
  echo " Sample task IDs:"
  jq -r '.taskId // "?"' "$1" 2>/dev/null | head -n 5 \
    | awk '{print " " $0}' || true
}

# Print the report for one environment from its collected result files.
# Globals:   TMPDIR_BASE, SINCE, LIMIT (read)
# Arguments: $1 - environment name; files are read from "$TMPDIR_BASE/$1/"
# Outputs:   the report on stdout
# Returns:   0, whether or not any limit warning was printed
summarize_env() {
  local env="$1"
  local dir="$TMPDIR_BASE/$env"
  local name total warned=0

  echo ""
  echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
  echo " $env (--since $SINCE)"
  echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"

  # Errors — group by "service | first-line-of-message"
  local err_total
  err_total=$(count "$dir/errors.jsonl")
  echo ""
  echo "ERRORS ($err_total total)"
  if [[ "$err_total" -gt 0 ]]; then
    echo " By type:"
    top_n "$dir/errors.jsonl" \
      '[.Service // "?", ((.message // .name // "?") | split("\n")[0])] | join(" | ")'
    echo " By hour (UTC):"
    top_n "$dir/errors.jsonl" \
      '(.ts | split("T")[1] | split(":")[0]) + "h"' 24
  fi

  # API 500s
  local n500
  n500=$(count "$dir/api_500.jsonl")
  echo ""
  echo "API 500s ($n500 total)"
  if [[ "$n500" -gt 0 ]]; then
    top_n "$dir/api_500.jsonl" '.name // "unknown"'
  fi

  # API 403s
  local n403
  n403=$(count "$dir/api_403.jsonl")
  echo ""
  echo "API 403s ($n403 total)"
  if [[ "$n403" -gt 0 ]]; then
    top_n "$dir/api_403.jsonl" '.name // "unknown"'
  fi

  # Claim-expired
  local nce
  nce=$(count "$dir/claim_expired.jsonl")
  echo ""
  echo "CLAIM-EXPIRED ($nce tasks)"
  # taskQueueId not present in resolver logs; show taskId sample instead
  if [[ "$nce" -gt 0 ]]; then
    _sample_task_ids "$dir/claim_expired.jsonl"
  fi

  # Deadline-exceeded
  local nde
  nde=$(count "$dir/deadline_exceeded.jsonl")
  echo ""
  echo "DEADLINE-EXCEEDED ($nde tasks)"
  if [[ "$nde" -gt 0 ]]; then
    _sample_task_ids "$dir/deadline_exceeded.jsonl"
  fi

  # Worker stopped
  local nws
  nws=$(count "$dir/worker_stopped.jsonl")
  echo ""
  echo "WORKERS STOPPED ($nws total)"
  if [[ "$nws" -gt 0 ]]; then
    echo " By pool (top 8):"
    top_n "$dir/worker_stopped.jsonl" '.workerPoolId // "unknown"' 8
    echo " By reason:"
    top_n "$dir/worker_stopped.jsonl" '.reason // "(no reason)"' 8
  fi

  # Limit warnings: a file holding LIMIT lines means the query was capped.
  echo ""
  for name in errors api_500 api_403 claim_expired deadline_exceeded worker_stopped; do
    total=$(count "$dir/$name.jsonl")
    if [[ "$total" -ge "$LIMIT" ]]; then
      echo "$name hit limit ($LIMIT) — results truncated, tighten filters or reduce --since"
      warned=1
    fi
  done
  # An explicit `if` (not `[[ … ]] && echo`) keeps the exit status at 0
  # even when warnings were printed.
  if [[ "$warned" -eq 0 ]]; then
    echo " ✓ No result limits hit"
  fi
  return 0
}
155+
156+
# ── Main ─────────────────────────────────────────────────────────────────────

# Split ENVS on whitespace into an array once. Unlike an unquoted `$ENVS`
# in a `for` list, this does not apply pathname expansion to the names.
# `read -d ''` reads to end of input and therefore returns non-zero; that is
# expected and ignored.
env_list=()
IFS=$' \t\n' read -r -d '' -a env_list <<<"$ENVS" || true

if [[ ${#env_list[@]} -eq 0 ]]; then
  echo "No environments given (use --envs \"community-tc fx-ci\")" >&2
  exit 1
fi

echo "Querying: $ENVS (--since $SINCE, limit $LIMIT per query) ..."

# Fan out: one background job per environment, then wait for all of them.
for env in "${env_list[@]}"; do
  query_env "$env" &
done
wait

echo "Done. Summarizing..."

# Reports are printed sequentially, in the order the environments were given.
for env in "${env_list[@]}"; do
  summarize_env "$env"
done

echo ""

0 commit comments

Comments
 (0)