|
| 1 | +#!/usr/bin/env bash |
| 2 | +# tc-health-summary.sh — aggregate Taskcluster health signals across environments |
| 3 | +# Usage: tc-health-summary.sh [--since 12h] [--envs "community-tc fx-ci"] |
| 4 | +# Runs queries in parallel per environment and prints a structured summary. |
| 5 | + |
# Defaults. SINCE and ENVS may be pre-set in the environment; both can also be
# overridden on the command line with --since / --envs.
SINCE="${SINCE:-12h}"
ENVS="${ENVS:-community-tc fx-ci}"
# Passed as --limit to every query and used as the truncation-warning threshold.
LIMIT=200

# Scratch directory holding one sub-directory of *.jsonl result files per
# environment. Abort if it cannot be created: with an empty TMPDIR_BASE the
# result paths would resolve to "/<env>/..." instead of a private temp dir.
TMPDIR_BASE=$(mktemp -d) || {
  echo "tc-health-summary: failed to create temporary directory" >&2
  exit 1
}
# Remove the scratch directory on every exit path.
trap 'rm -rf -- "$TMPDIR_BASE"' EXIT
| 11 | + |
# Parse command-line flags, overriding the SINCE / ENVS defaults.
#   --since VALUE   stored in SINCE
#   --envs  VALUE   stored in ENVS
# Prints a message to stderr and exits 1 on an unknown flag or on a flag that
# is given without a value.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --since | --envs)
        # A bare trailing flag would make `shift 2` fail WITHOUT shifting,
        # so the loop would spin forever; reject it explicitly instead.
        if [[ $# -lt 2 ]]; then
          echo "Missing value for $1" >&2
          exit 1
        fi
        if [[ "$1" == "--since" ]]; then
          SINCE="$2"
        else
          ENVS="$2"
        fi
        shift 2
        ;;
      *)
        echo "Unknown flag: $1" >&2
        exit 1
        ;;
    esac
  done
}
parse_args "$@"
| 19 | + |
# Helper: run `tc-logview query` with the given arguments plus --json and emit
# only the output lines that start with "{" on stdout.
# Arguments: passed through unchanged to `tc-logview query`
# Outputs:   matching lines on stdout; a one-line warning on stderr when
#            tc-logview itself exits non-zero, so a failed query is not
#            mistaken for "zero results"
# Returns:   always 0 (best effort; an empty result is not an error)
query() {
  local -a rc
  # tc-logview's own stderr is discarded, as before; only its exit status is used.
  tc-logview query "$@" --json 2>/dev/null | grep '^{'
  # Must be read immediately: any later command overwrites PIPESTATUS.
  rc=("${PIPESTATUS[@]}")
  if [[ "${rc[0]}" -ne 0 ]]; then
    echo "warning: tc-logview query failed (exit ${rc[0]}): $*" >&2
  fi
  return 0
}
| 24 | + |
# Run all queries for a single environment in parallel.
# Arguments: $1 - environment name, passed to every query via -e
# Globals:   TMPDIR_BASE, SINCE, LIMIT (read)
# Outputs:   one file per signal under "$TMPDIR_BASE/<env>/":
#            errors, api_500, api_403, worker_stopped, claim_expired,
#            deadline_exceeded (each with a .jsonl suffix)
# Returns:   1 if the result directory cannot be created, otherwise 0 once
#            every query started here has finished
query_env() {
  local env="$1"
  local dir="$TMPDIR_BASE/$env"
  local -a pids=()

  if ! mkdir -p -- "$dir"; then
    echo "query_env: cannot create result directory: $dir" >&2
    return 1
  fi

  query -e "$env" --type monitor.error --since "$SINCE" --limit "$LIMIT" \
    > "$dir/errors.jsonl" &
  pids+=("$!")

  query -e "$env" --type monitor.apiMethod --where 'statusCode="500"' \
    --since "$SINCE" --limit "$LIMIT" \
    > "$dir/api_500.jsonl" &
  pids+=("$!")

  query -e "$env" --type monitor.apiMethod --where 'statusCode="403"' \
    --since "$SINCE" --limit "$LIMIT" \
    > "$dir/api_403.jsonl" &
  pids+=("$!")

  query -e "$env" --type worker-stopped --since "$SINCE" --limit "$LIMIT" \
    > "$dir/worker_stopped.jsonl" &
  pids+=("$!")

  query -e "$env" \
    --filter 'jsonPayload.Logger="taskcluster.queue.claim-resolver"' \
    --since "$SINCE" --limit "$LIMIT" \
    > "$dir/claim_expired.jsonl" &
  pids+=("$!")

  query -e "$env" \
    --filter 'jsonPayload.Logger="taskcluster.queue.deadline-resolver"' \
    --since "$SINCE" --limit "$LIMIT" \
    > "$dir/deadline_exceeded.jsonl" &
  pids+=("$!")

  # Wait only for the jobs started above, so a foreground caller is not
  # blocked by unrelated background jobs of the same shell.
  wait "${pids[@]}"
  return 0
}
| 57 | + |
# Print the number of lines in a file.
# Arguments: $1 - path to the file (optional)
# Outputs:   a bare non-negative integer on stdout; 0 when the argument is
#            missing, or the file is missing, empty or unreadable
count() {
  local f="${1:-}" n=0
  if [[ -s "$f" ]]; then
    # 2>/dev/null comes first so a failing input redirection stays quiet too.
    n=$(wc -l 2>/dev/null < "$f") || n=0
    # Some wc implementations pad the number with blanks.
    n="${n//[[:space:]]/}"
    # Callers use the result in numeric tests, so never emit a non-number.
    [[ "$n" =~ ^[0-9]+$ ]] || n=0
  fi
  printf '%s\n' "$n"
}
| 62 | + |
# Top N: print the most frequent keys found in a JSON-lines file.
# Arguments: $1 - path to the .jsonl file
#            $2 - jq expression; it should yield one single-line string per
#                 record, since every output line of jq is counted as a key
#            $3 - how many entries to print (default 5)
# Outputs:   " (none)" when the file is missing or empty, otherwise one
#            "<count> × <key>" line per key, most frequent first
# Returns:   always 0; jq errors are suppressed (best effort)
top_n() {
  local file="$1" expr="$2" n="${3:-5}"
  if [[ ! -s "$file" ]]; then
    echo " (none)"
    return
  fi
  # Strip only the count prefix added by `uniq -c`. Assigning to $1 instead
  # would make awk rebuild the record and collapse whitespace inside the key.
  jq -r "$expr" "$file" 2>/dev/null \
    | sort | uniq -c | sort -rn | head -n "$n" \
    | awk '{
        cnt = $1
        sub(/^ *[0-9]+ /, "")
        printf " %4s × %s\n", cnt, $0
      }' \
    || true
}
| 72 | + |
# Print the health summary for one environment from the result files in
# "$TMPDIR_BASE/<env>/".
# Arguments: $1 - environment name
# Globals:   TMPDIR_BASE, SINCE, LIMIT (read)
# Outputs:   a sectioned report on stdout (errors, API 500s/403s,
#            claim-expired, deadline-exceeded, stopped workers), followed by
#            a warning for every result file whose line count reached LIMIT
# Returns:   always 0, whether or not a limit warning was printed
summarize_env() {
  local env="$1"
  local dir="$TMPDIR_BASE/$env"
  local err_total n500 n403 nce nde nws
  # f and c are loop-local; declared here so they do not leak into the caller.
  local f c warned=0

  echo ""
  echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
  echo " $env (--since $SINCE)"
  echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"

  # Errors — group by "service | first-line-of-message"
  err_total=$(count "$dir/errors.jsonl")
  echo ""
  echo "ERRORS ($err_total total)"
  if [[ "$err_total" -gt 0 ]]; then
    echo " By type:"
    top_n "$dir/errors.jsonl" \
      '[.Service // "?", ((.message // .name // "?") | split("\n")[0])] | join(" | ")'
    echo " By hour (UTC):"
    # Takes the text between "T" and the first ":" of .ts as the hour bucket.
    top_n "$dir/errors.jsonl" \
      '(.ts | split("T")[1] | split(":")[0]) + "h"' 24
  fi

  # API 500s
  n500=$(count "$dir/api_500.jsonl")
  echo ""
  echo "API 500s ($n500 total)"
  if [[ "$n500" -gt 0 ]]; then
    top_n "$dir/api_500.jsonl" '.name // "unknown"'
  fi

  # API 403s
  n403=$(count "$dir/api_403.jsonl")
  echo ""
  echo "API 403s ($n403 total)"
  if [[ "$n403" -gt 0 ]]; then
    top_n "$dir/api_403.jsonl" '.name // "unknown"'
  fi

  # Claim-expired
  nce=$(count "$dir/claim_expired.jsonl")
  echo ""
  echo "CLAIM-EXPIRED ($nce tasks)"
  # taskQueueId not present in resolver logs; show taskId sample instead
  if [[ "$nce" -gt 0 ]]; then
    echo " Sample task IDs:"
    jq -r '.taskId // "?"' "$dir/claim_expired.jsonl" 2>/dev/null | head -n 5 \
      | awk '{print " " $0}' || true
  fi

  # Deadline-exceeded
  nde=$(count "$dir/deadline_exceeded.jsonl")
  echo ""
  echo "DEADLINE-EXCEEDED ($nde tasks)"
  if [[ "$nde" -gt 0 ]]; then
    echo " Sample task IDs:"
    jq -r '.taskId // "?"' "$dir/deadline_exceeded.jsonl" 2>/dev/null | head -n 5 \
      | awk '{print " " $0}' || true
  fi

  # Worker stopped
  nws=$(count "$dir/worker_stopped.jsonl")
  echo ""
  echo "WORKERS STOPPED ($nws total)"
  if [[ "$nws" -gt 0 ]]; then
    echo " By pool (top 8):"
    top_n "$dir/worker_stopped.jsonl" '.workerPoolId // "unknown"' 8
    echo " By reason:"
    top_n "$dir/worker_stopped.jsonl" '.reason // "(no reason)"' 8
  fi

  # Limit warnings: a file holding LIMIT or more lines was probably truncated.
  echo ""
  for f in errors api_500 api_403 claim_expired deadline_exceeded worker_stopped; do
    c=$(count "$dir/$f.jsonl")
    if [[ "$c" -ge "$LIMIT" ]]; then
      echo " ⚠ $f hit limit ($LIMIT) — results truncated, tighten filters or reduce --since"
      warned=1
    fi
  done
  # A plain `if` keeps the function's exit status at 0 when warnings were shown.
  if [[ "$warned" -eq 0 ]]; then
    echo " ✓ No result limits hit"
  fi
  return 0
}
| 155 | + |
# ── Main ─────────────────────────────────────────────────────────────────────
# Query every environment in parallel, then print the summaries one after
# another in the order the environments were listed.

# Split ENVS on whitespace into an array. Unlike an unquoted $ENVS expansion,
# this never glob-expands a name against files in the current directory.
# `read -d ''` returns non-zero at end of input even though the array is
# filled, hence the `|| true`.
env_list=()
IFS=$' \t\n' read -r -d '' -a env_list <<< "$ENVS" || true

echo "Querying: $ENVS (--since $SINCE, limit $LIMIT per query) ..."

for env in "${env_list[@]}"; do
  query_env "$env" &
done
# Barrier: all result files must be complete before summarizing starts.
wait

echo "Done. Summarizing..."

for env in "${env_list[@]}"; do
  summarize_env "$env"
done

echo ""
0 commit comments