adarshxs HF Staff committed on
Commit
1d3e970
·
verified ·
1 Parent(s): 200869a

Upload folder using huggingface_hub

Browse files
Files changed (43) hide show
  1. .dockerignore +14 -0
  2. .env.example +19 -0
  3. .gitignore +11 -0
  4. Dockerfile +17 -0
  5. README.md +212 -12
  6. app.py +31 -0
  7. config/monitor.yaml +51 -0
  8. monitoring/docker-compose.yml +45 -0
  9. monitoring/github-actions-post-job.yml +28 -0
  10. monitoring/grafana/dashboards/build-duration-trends.json +474 -0
  11. monitoring/grafana/dashboards/build-failure-overview.json +500 -0
  12. monitoring/grafana/dashboards/build-matrix-overview.json +593 -0
  13. monitoring/grafana/provisioning/dashboards/dashboards.yml +11 -0
  14. monitoring/grafana/provisioning/datasources/prometheus.yml +10 -0
  15. monitoring/prometheus/prometheus.yml +18 -0
  16. monitoring/prometheus/rules/build-alerts.yml +20 -0
  17. requirements-dev.txt +3 -0
  18. requirements.txt +9 -0
  19. scripts/bootstrap_space.py +150 -0
  20. scripts/push_build_metrics.py +48 -0
  21. scripts/smoke_check.py +63 -0
  22. src/kc_monitor/__init__.py +5 -0
  23. src/kc_monitor/config.py +190 -0
  24. src/kc_monitor/github_client.py +456 -0
  25. src/kc_monitor/grafana.py +65 -0
  26. src/kc_monitor/kernel_index.py +108 -0
  27. src/kc_monitor/log_parser.py +216 -0
  28. src/kc_monitor/metrics_push.py +190 -0
  29. src/kc_monitor/models.py +342 -0
  30. src/kc_monitor/service.py +572 -0
  31. src/kc_monitor/stall_detector.py +48 -0
  32. src/kc_monitor/ui.py +1110 -0
  33. tests/conftest.py +10 -0
  34. tests/fixtures/active_build_job.json +45 -0
  35. tests/fixtures/build_release_run.json +19 -0
  36. tests/fixtures/failed_build_job.json +45 -0
  37. tests/fixtures/failed_build_run.json +19 -0
  38. tests/fixtures/manual_build_run.json +19 -0
  39. tests/fixtures/manual_upload_job.json +53 -0
  40. tests/test_grafana.py +44 -0
  41. tests/test_log_parser.py +52 -0
  42. tests/test_metrics_push.py +96 -0
  43. tests/test_service.py +152 -0
.dockerignore ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .git
2
+ .gitignore
3
+ .env
4
+ .venv
5
+ venv
6
+ __pycache__
7
+ .pytest_cache
8
+ .ruff_cache
9
+ *.pyc
10
+ *.pyo
11
+ *.pyd
12
+ .cursor
13
+ tests
14
+ requirements-dev.txt
.env.example ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ GITHUB_TOKEN=your_github_token_here
2
+ HF_TOKEN=your_huggingface_token_here
3
+ KCM_SPACE_ID=adarshxs/kernels-community-monitor
4
+ KCM_GITHUB_OWNER=huggingface
5
+ KCM_GITHUB_REPO=kernels-community
6
+ KCM_GITHUB_BRANCH=main
7
+ KCM_REFRESH_INTERVAL_SECONDS=300
8
+ KCM_WORKFLOW_RUN_PAGE_SIZE=100
9
+ KCM_WORKFLOW_RUN_PAGES=12
10
+ KCM_CRITICAL_KERNELS=flash-attn3,sgl-flash-attn3,flash-attn4,vllm-flash-attn3,deep-gemm
11
+ KCM_GRAFANA_BASE_URL=http://localhost:3000
12
+ KCM_GRAFANA_ORG_ID=1
13
+ KCM_GRAFANA_THEME=dark
14
+ KCM_GRAFANA_OVERVIEW_UID=kernels-build-matrix
15
+ KCM_GRAFANA_DURATION_UID=kernels-build-durations
16
+ KCM_GRAFANA_FAILURE_UID=kernels-build-failures
17
+ KCM_PROMETHEUS_BASE_URL=http://prometheus:9090
18
+ KCM_PUSHGATEWAY_URL=http://pushgateway:9091
19
+ KCM_PUSHGATEWAY_JOB_NAME=kernels-community-build-matrix
.gitignore ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __pycache__/
2
+ *.py[cod]
3
+ .env
4
+ .venv/
5
+ venv/
6
+ .pytest_cache/
7
+ .ruff_cache/
8
+ .mypy_cache/
9
+ build/
10
+ dist/
11
+ *.log
Dockerfile ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.11-slim
2
+
3
+ ENV PYTHONDONTWRITEBYTECODE=1 \
4
+ PYTHONUNBUFFERED=1 \
5
+ PORT=7860
6
+
7
+ WORKDIR /app
8
+
9
+ COPY requirements.txt /app/requirements.txt
10
+ RUN pip install --no-cache-dir --upgrade pip && \
11
+ pip install --no-cache-dir -r /app/requirements.txt
12
+
13
+ COPY . /app
14
+
15
+ EXPOSE 7860
16
+
17
+ CMD ["python", "app.py"]
README.md CHANGED
@@ -1,12 +1,212 @@
1
- ---
2
- title: Kernel Ci Monitor
3
- emoji: 🏃
4
- colorFrom: red
5
- colorTo: purple
6
- sdk: gradio
7
- sdk_version: 6.10.0
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Kernels Community Monitor
3
+ sdk: gradio
4
+ sdk_version: 6.10.0
5
+ python_version: 3.11
6
+ app_file: app.py
7
+ fullWidth: true
8
+ header: mini
9
+ suggested_hardware: cpu-basic
10
+ short_description: Live kernel build table plus optional Grafana metrics deck.
11
+ tags:
12
+ - monitoring
13
+ - github-actions
14
+ - kernels
15
+ - gradio
16
+ - grafana
17
+ ---
18
+
19
+ # Kernels Community Monitor
20
+
21
+ `Kernels Community Monitor` now does two things:
22
+
23
+ 1. Enumerates every kernel source dir in `huggingface/kernels-community`, scans the latest GitHub Actions runs, and renders a live per-kernel / per-variant status table with Actions links.
24
+ 2. Optionally embeds Grafana dashboards for longer-term metrics once a public Grafana endpoint is configured.
25
+
26
+ The app prefers the GitHub Actions REST API when it can, but it also has a public GitHub HTML fallback for workflow runs, job groups, and `build.toml` reads. That avoids the current `huggingface` org restriction that blocks classic PATs on some Actions endpoints.
27
+
28
+ The metrics path is still in the repo:
29
+
30
+ - GitHub Actions pushes per-matrix build metrics to Pushgateway.
31
+ - Prometheus scrapes Pushgateway and evaluates alert rules.
32
+ - Grafana owns the dashboards, filters, and time-series UI.
33
+ - This Hugging Face Space just presents clean links and embeds for those dashboards.
34
+
35
+ ## What Changed
36
+
37
+ The old zero-upstream-change monitor worked, but it had three hard limits:
38
+
39
+ - it depended on GitHub API polling and log scraping
40
+ - it could only infer matrix state indirectly
41
+ - it could not give you clean duration trends or robust alerting without more brittle parsing
42
+
43
+ This cutover replaces that with first-class metrics:
44
+
45
+ - `scripts/push_build_metrics.py` pushes the latest status, duration, and timestamp for each matrix combo.
46
+ - `monitoring/docker-compose.yml` provisions `prometheus`, `pushgateway`, and `grafana`.
47
+ - `monitoring/prometheus/rules/build-alerts.yml` alerts on failing or stale combos.
48
+ - `monitoring/grafana/dashboards/` provides ready dashboards with filters for kernel, backend, compute backend, CUDA, PyTorch, and Python.
49
+ - `src/kc_monitor/ui.py` renders the live kernel matrix table first, then the Grafana deck if configured.
50
+
51
+ ## Metrics Model
52
+
53
+ Each matrix combo is stored as a stable Pushgateway grouping key:
54
+
55
+ `kernel + backend + compute_backend + cuda_version + pytorch_version + python_version`
56
+
57
+ Each push updates these gauges:
58
+
59
+ - `kc_build_last_run_result_code`
60
+ - `kc_build_last_run_failed`
61
+ - `kc_build_last_run_duration_seconds`
62
+ - `kc_build_last_run_timestamp_seconds`
63
+ - `kc_build_last_run_info`
64
+
65
+ That gives you:
66
+
67
+ - current per-combo health
68
+ - duration history per combo
69
+ - stale build telemetry detection
70
+ - alert-friendly failure signals
71
+
72
+ ## Local Setup
73
+
74
+ Install deps:
75
+
76
+ ```bash
77
+ python -m venv .venv
78
+ . .venv/bin/activate
79
+ pip install -r requirements-dev.txt
80
+ ```
81
+
82
+ Windows PowerShell activation:
83
+
84
+ ```powershell
85
+ python -m venv .venv
86
+ .\.venv\Scripts\Activate.ps1
87
+ pip install -r requirements-dev.txt
88
+ ```
89
+
90
+ Create `.env` from `.env.example` and set at least:
91
+
92
+ ```env
93
+ HF_TOKEN=...
94
+ ```
95
+
96
+ If you want local Grafana too, set the Grafana base URL and bring up the local monitoring stack:
97
+
98
+ ```bash
99
+ KCM_GRAFANA_BASE_URL=http://localhost:3000
100
+ docker compose -f monitoring/docker-compose.yml up -d
101
+ ```
102
+
103
+ Run the app locally:
104
+
105
+ ```bash
106
+ python app.py
107
+ ```
108
+
109
+ Run the smoke check:
110
+
111
+ ```bash
112
+ python scripts/smoke_check.py
113
+ ```
114
+
115
+ Run tests:
116
+
117
+ ```bash
118
+ pytest
119
+ ```
120
+
121
+ ## GitHub Actions Step
122
+
123
+ The actual workflow YAMLs live in the `huggingface/kernels-community` repo, not here.
124
+
125
+ Use `monitoring/github-actions-post-job.yml` as the drop-in snippet. The important bit is:
126
+
127
+ ```yaml
128
+ - name: Record matrix job start time
129
+ shell: bash
130
+ run: echo "KCM_JOB_STARTED_AT=$(date +%s)" >> "$GITHUB_ENV"
131
+
132
+ - name: Push matrix build metrics
133
+ if: always()
134
+ shell: bash
135
+ env:
136
+ PUSHGATEWAY_URL: ${{ secrets.PUSHGATEWAY_URL }}
137
+ KCM_PUSHGATEWAY_JOB_NAME: kernels-community-build-matrix
138
+ KCM_JOB_STATUS: ${{ job.status }}
139
+ KCM_KERNEL: ${{ matrix.kernel }}
140
+ KCM_BACKEND: ${{ matrix.backend }}
141
+ KCM_COMPUTE_BACKEND: ${{ matrix.compute_backend }}
142
+ KCM_CUDA_VERSION: ${{ matrix.cuda }}
143
+ KCM_PYTORCH_VERSION: ${{ matrix.torch }}
144
+ KCM_PYTHON_VERSION: ${{ matrix.python }}
145
+ run: python scripts/push_build_metrics.py
146
+ ```
147
+
148
+ The emitter is intentionally low-cardinality: it tracks the latest state for each stable combo, which is what you want for Grafana filters and Prometheus alerts without Pushgateway turning into a per-run junk drawer.
149
+
150
+ ## Dashboards
151
+
152
+ Provisioned dashboards:
153
+
154
+ - `kernels-build-matrix`
155
+ - `kernels-build-durations`
156
+ - `kernels-build-failures`
157
+
158
+ All of them expose variables for:
159
+
160
+ - kernel
161
+ - backend
162
+ - compute backend
163
+ - CUDA version
164
+ - PyTorch version
165
+ - Python version
166
+
167
+ ## Alerting
168
+
169
+ Prometheus rules ship in `monitoring/prometheus/rules/build-alerts.yml`.
170
+
171
+ Current rules:
172
+
173
+ - `KernelsBuildMatrixComboFailing`
174
+ - `KernelsBuildMetricsStale`
175
+
176
+ You can route those through Alertmanager later, but the expression layer is already there.
177
+
178
+ ## Runtime Configuration
179
+
180
+ Main env/config knobs:
181
+
182
+ - `KCM_GRAFANA_BASE_URL`
183
+ - `KCM_GRAFANA_ORG_ID`
184
+ - `KCM_GRAFANA_THEME`
185
+ - `KCM_GRAFANA_OVERVIEW_UID`
186
+ - `KCM_GRAFANA_DURATION_UID`
187
+ - `KCM_GRAFANA_FAILURE_UID`
188
+ - `KCM_PROMETHEUS_BASE_URL`
189
+ - `KCM_PUSHGATEWAY_URL`
190
+ - `KCM_PUSHGATEWAY_JOB_NAME`
191
+
192
+ If `KCM_GRAFANA_BASE_URL` is not set, the Space still works: the live GitHub Actions table stays active and the Grafana section renders as a setup card instead of broken embeds.
193
+
194
+ The base YAML config lives at `config/monitor.yaml`. Environment variables override it at runtime.
195
+
196
+ ## Deploy To Hugging Face Space
197
+
198
+ This repo still includes a bootstrap script that creates or updates the Space and uploads the current folder.
199
+
200
+ ```bash
201
+ python scripts/bootstrap_space.py --space-id adarshxs/kernels-community-monitor
202
+ ```
203
+
204
+ What it does:
205
+
206
+ - creates the Space repo if it does not exist
207
+ - uploads this project as a Gradio Space
208
+ - writes the Grafana, Prometheus, and Pushgateway settings into Space variables
209
+
210
+ After upload, the expected Space URL is:
211
+
212
+ - `https://huggingface.co/spaces/adarshxs/kernels-community-monitor`
app.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ruff: noqa: E402
2
+ from __future__ import annotations
3
+
4
+ import os
5
+ import sys
6
+ from pathlib import Path
7
+
8
+
9
+ ROOT_DIR = Path(__file__).resolve().parent
10
+ SRC_DIR = ROOT_DIR / "src"
11
+ if str(SRC_DIR) not in sys.path:
12
+ sys.path.insert(0, str(SRC_DIR))
13
+
14
+ from kc_monitor.config import load_config
15
+ from kc_monitor.service import MonitorService
16
+ from kc_monitor.ui import CSS, PAGE_JS, THEME, build_dashboard
17
+
18
+
19
+ config = load_config(ROOT_DIR / "config" / "monitor.yaml")
20
+ service = MonitorService(config)
21
+ demo = build_dashboard(service, config)
22
+
23
+
24
+ if __name__ == "__main__":
25
+ demo.launch(
26
+ server_name="0.0.0.0",
27
+ server_port=int(os.getenv("PORT", "7860")),
28
+ theme=THEME,
29
+ css=CSS,
30
+ js=PAGE_JS,
31
+ )
config/monitor.yaml ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ github:
2
+ owner: huggingface
3
+ repo: kernels-community
4
+ branch: main
5
+ per_page: 100
6
+ request_timeout_seconds: 25
7
+ workflows:
8
+ - path: .github/workflows/build-release.yaml
9
+ label: Build Release
10
+ enabled: true
11
+ - path: .github/workflows/manual-build-upload.yaml
12
+ label: Manual Kernel Build
13
+ enabled: true
14
+
15
+ monitor:
16
+ refresh_interval_seconds: 300
17
+ snapshot_ttl_seconds: 240
18
+ workflow_run_page_size: 100
19
+ workflow_run_pages: 12
20
+ recent_completed_hours: 336
21
+ recent_limit: 30
22
+ completed_runs_per_workflow: 15
23
+ log_line_limit: 400
24
+ log_char_limit: 35000
25
+ detail_event_limit: 25
26
+ stall_without_log_minutes: 45
27
+ stall_active_phase_minutes: 180
28
+ critical_kernels:
29
+ - flash-attn3
30
+ - sgl-flash-attn3
31
+ - flash-attn4
32
+ - vllm-flash-attn3
33
+ - deep-gemm
34
+
35
+ grafana:
36
+ base_url: null
37
+ org_id: 1
38
+ theme: dark
39
+ default_from: now-30d
40
+ default_to: now
41
+ default_refresh: 5m
42
+ overview_dashboard_uid: kernels-build-matrix
43
+ duration_dashboard_uid: kernels-build-durations
44
+ failure_dashboard_uid: kernels-build-failures
45
+
46
+ prometheus:
47
+ base_url: null
48
+
49
+ pushgateway:
50
+ url: null
51
+ job_name: kernels-community-build-matrix
monitoring/docker-compose.yml ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ services:
2
+ prometheus:
3
+ image: prom/prometheus
4
+ command:
5
+ - --config.file=/etc/prometheus/prometheus.yml
6
+ - --web.enable-lifecycle
7
+ ports:
8
+ - "9090:9090"
9
+ volumes:
10
+ - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
11
+ - ./prometheus/rules:/etc/prometheus/rules:ro
12
+ - prometheus-data:/prometheus
13
+
14
+ pushgateway:
15
+ image: prom/pushgateway
16
+ command:
17
+ - --persistence.file=/data/pushgateway.db
18
+ ports:
19
+ - "9091:9091"
20
+ volumes:
21
+ - pushgateway-data:/data
22
+
23
+ grafana:
24
+ image: grafana/grafana-oss
25
+ depends_on:
26
+ - prometheus
27
+ environment:
28
+ GF_SECURITY_ADMIN_USER: ${GRAFANA_ADMIN_USER:-admin}
29
+ GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_ADMIN_PASSWORD:-admin}
30
+ GF_AUTH_ANONYMOUS_ENABLED: ${GRAFANA_ANONYMOUS_ENABLED:-true}
31
+ GF_AUTH_ANONYMOUS_ORG_ROLE: Viewer
32
+ GF_SECURITY_ALLOW_EMBEDDING: "true"
33
+ GF_DASHBOARDS_MIN_REFRESH_INTERVAL: 10s
34
+ GF_SERVER_ROOT_URL: ${GRAFANA_ROOT_URL:-http://localhost:3000}
35
+ ports:
36
+ - "3000:3000"
37
+ volumes:
38
+ - ./grafana/provisioning:/etc/grafana/provisioning:ro
39
+ - ./grafana/dashboards:/var/lib/grafana/dashboards:ro
40
+ - grafana-data:/var/lib/grafana
41
+
42
+ volumes:
43
+ prometheus-data:
44
+ pushgateway-data:
45
+ grafana-data:
monitoring/github-actions-post-job.yml ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Drop this into the kernels-community workflow repo.
2
+ #
3
+ # Example matrix fields expected below:
4
+ # matrix.kernel
5
+ # matrix.backend
6
+ # matrix.compute_backend
7
+ # matrix.cuda
8
+ # matrix.torch
9
+ # matrix.python
10
+
11
+ - name: Record matrix job start time
12
+ shell: bash
13
+ run: echo "KCM_JOB_STARTED_AT=$(date +%s)" >> "$GITHUB_ENV"
14
+
15
+ - name: Push matrix build metrics
16
+ if: always()
17
+ shell: bash
18
+ env:
19
+ PUSHGATEWAY_URL: ${{ secrets.PUSHGATEWAY_URL }}
20
+ KCM_PUSHGATEWAY_JOB_NAME: kernels-community-build-matrix
21
+ KCM_JOB_STATUS: ${{ job.status }}
22
+ KCM_KERNEL: ${{ matrix.kernel }}
23
+ KCM_BACKEND: ${{ matrix.backend }}
24
+ KCM_COMPUTE_BACKEND: ${{ matrix.compute_backend }}
25
+ KCM_CUDA_VERSION: ${{ matrix.cuda }}
26
+ KCM_PYTORCH_VERSION: ${{ matrix.torch }}
27
+ KCM_PYTHON_VERSION: ${{ matrix.python }}
28
+ run: python scripts/push_build_metrics.py
monitoring/grafana/dashboards/build-duration-trends.json ADDED
@@ -0,0 +1,474 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "annotations": {
3
+ "list": []
4
+ },
5
+ "editable": true,
6
+ "fiscalYearStartMonth": 0,
7
+ "graphTooltip": 1,
8
+ "links": [],
9
+ "panels": [
10
+ {
11
+ "datasource": {
12
+ "type": "prometheus",
13
+ "uid": "prometheus"
14
+ },
15
+ "fieldConfig": {
16
+ "defaults": {
17
+ "color": {
18
+ "mode": "thresholds"
19
+ },
20
+ "thresholds": {
21
+ "mode": "absolute",
22
+ "steps": [
23
+ {
24
+ "color": "green",
25
+ "value": null
26
+ }
27
+ ]
28
+ },
29
+ "unit": "s"
30
+ },
31
+ "overrides": []
32
+ },
33
+ "gridPos": {
34
+ "h": 5,
35
+ "w": 8,
36
+ "x": 0,
37
+ "y": 0
38
+ },
39
+ "id": 1,
40
+ "options": {
41
+ "colorMode": "value",
42
+ "graphMode": "none",
43
+ "justifyMode": "auto",
44
+ "orientation": "auto",
45
+ "reduceOptions": {
46
+ "calcs": [
47
+ "lastNotNull"
48
+ ],
49
+ "fields": "",
50
+ "values": false
51
+ },
52
+ "textMode": "value"
53
+ },
54
+ "targets": [
55
+ {
56
+ "datasource": {
57
+ "type": "prometheus",
58
+ "uid": "prometheus"
59
+ },
60
+ "expr": "avg(kc_build_last_run_duration_seconds{kernel=~\"${kernel:regex}\",backend=~\"${backend:regex}\",compute_backend=~\"${compute_backend:regex}\",cuda_version=~\"${cuda_version:regex}\",pytorch_version=~\"${pytorch_version:regex}\",python_version=~\"${python_version:regex}\"})",
61
+ "instant": true,
62
+ "refId": "A"
63
+ }
64
+ ],
65
+ "title": "Average current duration",
66
+ "type": "stat"
67
+ },
68
+ {
69
+ "datasource": {
70
+ "type": "prometheus",
71
+ "uid": "prometheus"
72
+ },
73
+ "fieldConfig": {
74
+ "defaults": {
75
+ "color": {
76
+ "mode": "thresholds"
77
+ },
78
+ "thresholds": {
79
+ "mode": "absolute",
80
+ "steps": [
81
+ {
82
+ "color": "green",
83
+ "value": null
84
+ }
85
+ ]
86
+ },
87
+ "unit": "s"
88
+ },
89
+ "overrides": []
90
+ },
91
+ "gridPos": {
92
+ "h": 5,
93
+ "w": 8,
94
+ "x": 8,
95
+ "y": 0
96
+ },
97
+ "id": 2,
98
+ "options": {
99
+ "colorMode": "value",
100
+ "graphMode": "none",
101
+ "justifyMode": "auto",
102
+ "orientation": "auto",
103
+ "reduceOptions": {
104
+ "calcs": [
105
+ "lastNotNull"
106
+ ],
107
+ "fields": "",
108
+ "values": false
109
+ },
110
+ "textMode": "value"
111
+ },
112
+ "targets": [
113
+ {
114
+ "datasource": {
115
+ "type": "prometheus",
116
+ "uid": "prometheus"
117
+ },
118
+ "expr": "max(kc_build_last_run_duration_seconds{kernel=~\"${kernel:regex}\",backend=~\"${backend:regex}\",compute_backend=~\"${compute_backend:regex}\",cuda_version=~\"${cuda_version:regex}\",pytorch_version=~\"${pytorch_version:regex}\",python_version=~\"${python_version:regex}\"})",
119
+ "instant": true,
120
+ "refId": "A"
121
+ }
122
+ ],
123
+ "title": "Slowest current combo",
124
+ "type": "stat"
125
+ },
126
+ {
127
+ "datasource": {
128
+ "type": "prometheus",
129
+ "uid": "prometheus"
130
+ },
131
+ "fieldConfig": {
132
+ "defaults": {
133
+ "color": {
134
+ "mode": "thresholds"
135
+ },
136
+ "thresholds": {
137
+ "mode": "absolute",
138
+ "steps": [
139
+ {
140
+ "color": "green",
141
+ "value": null
142
+ },
143
+ {
144
+ "color": "orange",
145
+ "value": 6
146
+ },
147
+ {
148
+ "color": "red",
149
+ "value": 24
150
+ }
151
+ ]
152
+ },
153
+ "unit": "h"
154
+ },
155
+ "overrides": []
156
+ },
157
+ "gridPos": {
158
+ "h": 5,
159
+ "w": 8,
160
+ "x": 16,
161
+ "y": 0
162
+ },
163
+ "id": 3,
164
+ "options": {
165
+ "colorMode": "value",
166
+ "graphMode": "none",
167
+ "justifyMode": "auto",
168
+ "orientation": "auto",
169
+ "reduceOptions": {
170
+ "calcs": [
171
+ "lastNotNull"
172
+ ],
173
+ "fields": "",
174
+ "values": false
175
+ },
176
+ "textMode": "value"
177
+ },
178
+ "targets": [
179
+ {
180
+ "datasource": {
181
+ "type": "prometheus",
182
+ "uid": "prometheus"
183
+ },
184
+ "expr": "avg((time() - kc_build_last_run_timestamp_seconds{kernel=~\"${kernel:regex}\",backend=~\"${backend:regex}\",compute_backend=~\"${compute_backend:regex}\",cuda_version=~\"${cuda_version:regex}\",pytorch_version=~\"${pytorch_version:regex}\",python_version=~\"${python_version:regex}\"}) / 3600)",
185
+ "instant": true,
186
+ "refId": "A"
187
+ }
188
+ ],
189
+ "title": "Average age of last sample",
190
+ "type": "stat"
191
+ },
192
+ {
193
+ "datasource": {
194
+ "type": "prometheus",
195
+ "uid": "prometheus"
196
+ },
197
+ "fieldConfig": {
198
+ "defaults": {
199
+ "color": {
200
+ "mode": "continuous-BlPu"
201
+ },
202
+ "unit": "s"
203
+ },
204
+ "overrides": []
205
+ },
206
+ "gridPos": {
207
+ "h": 8,
208
+ "w": 24,
209
+ "x": 0,
210
+ "y": 5
211
+ },
212
+ "id": 4,
213
+ "options": {
214
+ "legend": {
215
+ "displayMode": "table",
216
+ "placement": "bottom"
217
+ },
218
+ "tooltip": {
219
+ "mode": "multi"
220
+ }
221
+ },
222
+ "targets": [
223
+ {
224
+ "datasource": {
225
+ "type": "prometheus",
226
+ "uid": "prometheus"
227
+ },
228
+ "expr": "kc_build_last_run_duration_seconds{kernel=~\"${kernel:regex}\",backend=~\"${backend:regex}\",compute_backend=~\"${compute_backend:regex}\",cuda_version=~\"${cuda_version:regex}\",pytorch_version=~\"${pytorch_version:regex}\",python_version=~\"${python_version:regex}\"}",
229
+ "legendFormat": "{{kernel}} | {{backend}} | {{compute_backend}} | CUDA {{cuda_version}} | torch {{pytorch_version}} | py {{python_version}}",
230
+ "refId": "A"
231
+ }
232
+ ],
233
+ "title": "Duration trends by combo",
234
+ "type": "timeseries"
235
+ },
236
+ {
237
+ "datasource": {
238
+ "type": "prometheus",
239
+ "uid": "prometheus"
240
+ },
241
+ "fieldConfig": {
242
+ "defaults": {
243
+ "color": {
244
+ "mode": "continuous-GrYlRd"
245
+ },
246
+ "unit": "s"
247
+ },
248
+ "overrides": []
249
+ },
250
+ "gridPos": {
251
+ "h": 8,
252
+ "w": 24,
253
+ "x": 0,
254
+ "y": 13
255
+ },
256
+ "id": 5,
257
+ "options": {
258
+ "displayMode": "gradient",
259
+ "orientation": "horizontal",
260
+ "reduceOptions": {
261
+ "calcs": [
262
+ "lastNotNull"
263
+ ],
264
+ "fields": "",
265
+ "values": false
266
+ },
267
+ "showUnfilled": true
268
+ },
269
+ "targets": [
270
+ {
271
+ "datasource": {
272
+ "type": "prometheus",
273
+ "uid": "prometheus"
274
+ },
275
+ "expr": "kc_build_last_run_duration_seconds{kernel=~\"${kernel:regex}\",backend=~\"${backend:regex}\",compute_backend=~\"${compute_backend:regex}\",cuda_version=~\"${cuda_version:regex}\",pytorch_version=~\"${pytorch_version:regex}\",python_version=~\"${python_version:regex}\"}",
276
+ "instant": true,
277
+ "legendFormat": "{{kernel}} | {{backend}} | {{compute_backend}} | CUDA {{cuda_version}} | torch {{pytorch_version}} | py {{python_version}}",
278
+ "refId": "A"
279
+ }
280
+ ],
281
+ "title": "Current duration distribution",
282
+ "type": "bargauge"
283
+ }
284
+ ],
285
+ "refresh": "5m",
286
+ "schemaVersion": 39,
287
+ "style": "dark",
288
+ "tags": [
289
+ "kernels-community",
290
+ "ci",
291
+ "durations"
292
+ ],
293
+ "templating": {
294
+ "list": [
295
+ {
296
+ "current": {
297
+ "selected": true,
298
+ "text": [
299
+ "All"
300
+ ],
301
+ "value": [
302
+ "$__all"
303
+ ]
304
+ },
305
+ "datasource": {
306
+ "type": "prometheus",
307
+ "uid": "prometheus"
308
+ },
309
+ "definition": "label_values(kc_build_last_run_timestamp_seconds, kernel)",
310
+ "includeAll": true,
311
+ "label": "Kernel",
312
+ "multi": true,
313
+ "name": "kernel",
314
+ "options": [],
315
+ "query": {
316
+ "query": "label_values(kc_build_last_run_timestamp_seconds, kernel)",
317
+ "refId": "PrometheusVariableQueryEditor-kernel"
318
+ },
319
+ "refresh": 1,
320
+ "sort": 1,
321
+ "type": "query"
322
+ },
323
+ {
324
+ "current": {
325
+ "selected": true,
326
+ "text": [
327
+ "All"
328
+ ],
329
+ "value": [
330
+ "$__all"
331
+ ]
332
+ },
333
+ "datasource": {
334
+ "type": "prometheus",
335
+ "uid": "prometheus"
336
+ },
337
+ "definition": "label_values(kc_build_last_run_timestamp_seconds, backend)",
338
+ "includeAll": true,
339
+ "label": "Backend",
340
+ "multi": true,
341
+ "name": "backend",
342
+ "options": [],
343
+ "query": {
344
+ "query": "label_values(kc_build_last_run_timestamp_seconds, backend)",
345
+ "refId": "PrometheusVariableQueryEditor-backend"
346
+ },
347
+ "refresh": 1,
348
+ "sort": 1,
349
+ "type": "query"
350
+ },
351
+ {
352
+ "current": {
353
+ "selected": true,
354
+ "text": [
355
+ "All"
356
+ ],
357
+ "value": [
358
+ "$__all"
359
+ ]
360
+ },
361
+ "datasource": {
362
+ "type": "prometheus",
363
+ "uid": "prometheus"
364
+ },
365
+ "definition": "label_values(kc_build_last_run_timestamp_seconds, compute_backend)",
366
+ "includeAll": true,
367
+ "label": "Compute backend",
368
+ "multi": true,
369
+ "name": "compute_backend",
370
+ "options": [],
371
+ "query": {
372
+ "query": "label_values(kc_build_last_run_timestamp_seconds, compute_backend)",
373
+ "refId": "PrometheusVariableQueryEditor-compute_backend"
374
+ },
375
+ "refresh": 1,
376
+ "sort": 1,
377
+ "type": "query"
378
+ },
379
+ {
380
+ "current": {
381
+ "selected": true,
382
+ "text": [
383
+ "All"
384
+ ],
385
+ "value": [
386
+ "$__all"
387
+ ]
388
+ },
389
+ "datasource": {
390
+ "type": "prometheus",
391
+ "uid": "prometheus"
392
+ },
393
+ "definition": "label_values(kc_build_last_run_timestamp_seconds, cuda_version)",
394
+ "includeAll": true,
395
+ "label": "CUDA",
396
+ "multi": true,
397
+ "name": "cuda_version",
398
+ "options": [],
399
+ "query": {
400
+ "query": "label_values(kc_build_last_run_timestamp_seconds, cuda_version)",
401
+ "refId": "PrometheusVariableQueryEditor-cuda_version"
402
+ },
403
+ "refresh": 1,
404
+ "sort": 1,
405
+ "type": "query"
406
+ },
407
+ {
408
+ "current": {
409
+ "selected": true,
410
+ "text": [
411
+ "All"
412
+ ],
413
+ "value": [
414
+ "$__all"
415
+ ]
416
+ },
417
+ "datasource": {
418
+ "type": "prometheus",
419
+ "uid": "prometheus"
420
+ },
421
+ "definition": "label_values(kc_build_last_run_timestamp_seconds, pytorch_version)",
422
+ "includeAll": true,
423
+ "label": "PyTorch",
424
+ "multi": true,
425
+ "name": "pytorch_version",
426
+ "options": [],
427
+ "query": {
428
+ "query": "label_values(kc_build_last_run_timestamp_seconds, pytorch_version)",
429
+ "refId": "PrometheusVariableQueryEditor-pytorch_version"
430
+ },
431
+ "refresh": 1,
432
+ "sort": 1,
433
+ "type": "query"
434
+ },
435
+ {
436
+ "current": {
437
+ "selected": true,
438
+ "text": [
439
+ "All"
440
+ ],
441
+ "value": [
442
+ "$__all"
443
+ ]
444
+ },
445
+ "datasource": {
446
+ "type": "prometheus",
447
+ "uid": "prometheus"
448
+ },
449
+ "definition": "label_values(kc_build_last_run_timestamp_seconds, python_version)",
450
+ "includeAll": true,
451
+ "label": "Python",
452
+ "multi": true,
453
+ "name": "python_version",
454
+ "options": [],
455
+ "query": {
456
+ "query": "label_values(kc_build_last_run_timestamp_seconds, python_version)",
457
+ "refId": "PrometheusVariableQueryEditor-python_version"
458
+ },
459
+ "refresh": 1,
460
+ "sort": 1,
461
+ "type": "query"
462
+ }
463
+ ]
464
+ },
465
+ "time": {
466
+ "from": "now-30d",
467
+ "to": "now"
468
+ },
469
+ "timezone": "browser",
470
+ "title": "Kernels Build Duration Trends",
471
+ "uid": "kernels-build-durations",
472
+ "version": 1,
473
+ "weekStart": ""
474
+ }
monitoring/grafana/dashboards/build-failure-overview.json ADDED
@@ -0,0 +1,500 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "annotations": {
3
+ "list": []
4
+ },
5
+ "editable": true,
6
+ "fiscalYearStartMonth": 0,
7
+ "graphTooltip": 1,
8
+ "links": [],
9
+ "panels": [
10
+ {
11
+ "datasource": {
12
+ "type": "prometheus",
13
+ "uid": "prometheus"
14
+ },
15
+ "fieldConfig": {
16
+ "defaults": {
17
+ "color": {
18
+ "mode": "thresholds"
19
+ },
20
+ "thresholds": {
21
+ "mode": "absolute",
22
+ "steps": [
23
+ {
24
+ "color": "green",
25
+ "value": null
26
+ },
27
+ {
28
+ "color": "red",
29
+ "value": 1
30
+ }
31
+ ]
32
+ },
33
+ "unit": "none"
34
+ },
35
+ "overrides": []
36
+ },
37
+ "gridPos": {
38
+ "h": 5,
39
+ "w": 8,
40
+ "x": 0,
41
+ "y": 0
42
+ },
43
+ "id": 1,
44
+ "options": {
45
+ "colorMode": "value",
46
+ "graphMode": "none",
47
+ "justifyMode": "auto",
48
+ "orientation": "auto",
49
+ "reduceOptions": {
50
+ "calcs": [
51
+ "lastNotNull"
52
+ ],
53
+ "fields": "",
54
+ "values": false
55
+ },
56
+ "textMode": "value"
57
+ },
58
+ "targets": [
59
+ {
60
+ "datasource": {
61
+ "type": "prometheus",
62
+ "uid": "prometheus"
63
+ },
64
+ "expr": "sum(kc_build_last_run_failed{kernel=~\"${kernel:regex}\",backend=~\"${backend:regex}\",compute_backend=~\"${compute_backend:regex}\",cuda_version=~\"${cuda_version:regex}\",pytorch_version=~\"${pytorch_version:regex}\",python_version=~\"${python_version:regex}\"})",
65
+ "instant": true,
66
+ "refId": "A"
67
+ }
68
+ ],
69
+ "title": "Failing combos",
70
+ "type": "stat"
71
+ },
72
+ {
73
+ "datasource": {
74
+ "type": "prometheus",
75
+ "uid": "prometheus"
76
+ },
77
+ "fieldConfig": {
78
+ "defaults": {
79
+ "color": {
80
+ "mode": "thresholds"
81
+ },
82
+ "thresholds": {
83
+ "mode": "absolute",
84
+ "steps": [
85
+ {
86
+ "color": "green",
87
+ "value": null
88
+ }
89
+ ]
90
+ },
91
+ "unit": "none"
92
+ },
93
+ "overrides": []
94
+ },
95
+ "gridPos": {
96
+ "h": 5,
97
+ "w": 8,
98
+ "x": 8,
99
+ "y": 0
100
+ },
101
+ "id": 2,
102
+ "options": {
103
+ "colorMode": "value",
104
+ "graphMode": "none",
105
+ "justifyMode": "auto",
106
+ "orientation": "auto",
107
+ "reduceOptions": {
108
+ "calcs": [
109
+ "lastNotNull"
110
+ ],
111
+ "fields": "",
112
+ "values": false
113
+ },
114
+ "textMode": "value"
115
+ },
116
+ "targets": [
117
+ {
118
+ "datasource": {
119
+ "type": "prometheus",
120
+ "uid": "prometheus"
121
+ },
122
+ "expr": "count(kc_build_last_run_failed{kernel=~\"${kernel:regex}\",backend=~\"${backend:regex}\",compute_backend=~\"${compute_backend:regex}\",cuda_version=~\"${cuda_version:regex}\",pytorch_version=~\"${pytorch_version:regex}\",python_version=~\"${python_version:regex}\"} == 1)",
123
+ "instant": true,
124
+ "refId": "A"
125
+ }
126
+ ],
127
+ "title": "Alerting series",
128
+ "type": "stat"
129
+ },
130
+ {
131
+ "datasource": {
132
+ "type": "prometheus",
133
+ "uid": "prometheus"
134
+ },
135
+ "fieldConfig": {
136
+ "defaults": {
137
+ "color": {
138
+ "mode": "thresholds"
139
+ },
140
+ "thresholds": {
141
+ "mode": "absolute",
142
+ "steps": [
143
+ {
144
+ "color": "green",
145
+ "value": null
146
+ },
147
+ {
148
+ "color": "orange",
149
+ "value": 6
150
+ },
151
+ {
152
+ "color": "red",
153
+ "value": 24
154
+ }
155
+ ]
156
+ },
157
+ "unit": "h"
158
+ },
159
+ "overrides": []
160
+ },
161
+ "gridPos": {
162
+ "h": 5,
163
+ "w": 8,
164
+ "x": 16,
165
+ "y": 0
166
+ },
167
+ "id": 3,
168
+ "options": {
169
+ "colorMode": "value",
170
+ "graphMode": "none",
171
+ "justifyMode": "auto",
172
+ "orientation": "auto",
173
+ "reduceOptions": {
174
+ "calcs": [
175
+ "lastNotNull"
176
+ ],
177
+ "fields": "",
178
+ "values": false
179
+ },
180
+ "textMode": "value"
181
+ },
182
+ "targets": [
183
+ {
184
+ "datasource": {
185
+ "type": "prometheus",
186
+ "uid": "prometheus"
187
+ },
188
+ "expr": "max((time() - kc_build_last_run_timestamp_seconds{kernel=~\"${kernel:regex}\",backend=~\"${backend:regex}\",compute_backend=~\"${compute_backend:regex}\",cuda_version=~\"${cuda_version:regex}\",pytorch_version=~\"${pytorch_version:regex}\",python_version=~\"${python_version:regex}\"}) / 3600)",
189
+ "instant": true,
190
+ "refId": "A"
191
+ }
192
+ ],
193
+ "title": "Oldest sample age",
194
+ "type": "stat"
195
+ },
196
+ {
197
+ "datasource": {
198
+ "type": "prometheus",
199
+ "uid": "prometheus"
200
+ },
201
+ "fieldConfig": {
202
+ "defaults": {
203
+ "color": {
204
+ "mode": "palette-classic"
205
+ },
206
+ "custom": {
207
+ "axisBorderShow": false,
208
+ "axisCenteredZero": false,
209
+ "drawStyle": "line",
210
+ "fillOpacity": 18,
211
+ "lineInterpolation": "stepAfter",
212
+ "lineWidth": 2,
213
+ "pointSize": 4,
214
+ "showPoints": "never",
215
+ "spanNulls": true
216
+ },
217
+ "max": 1,
218
+ "min": 0,
219
+ "unit": "none"
220
+ },
221
+ "overrides": []
222
+ },
223
+ "gridPos": {
224
+ "h": 8,
225
+ "w": 24,
226
+ "x": 0,
227
+ "y": 5
228
+ },
229
+ "id": 4,
230
+ "options": {
231
+ "legend": {
232
+ "displayMode": "table",
233
+ "placement": "bottom"
234
+ },
235
+ "tooltip": {
236
+ "mode": "multi"
237
+ }
238
+ },
239
+ "targets": [
240
+ {
241
+ "datasource": {
242
+ "type": "prometheus",
243
+ "uid": "prometheus"
244
+ },
245
+ "expr": "kc_build_last_run_failed{kernel=~\"${kernel:regex}\",backend=~\"${backend:regex}\",compute_backend=~\"${compute_backend:regex}\",cuda_version=~\"${cuda_version:regex}\",pytorch_version=~\"${pytorch_version:regex}\",python_version=~\"${python_version:regex}\"}",
246
+ "legendFormat": "{{kernel}} | {{backend}} | {{compute_backend}} | CUDA {{cuda_version}} | torch {{pytorch_version}} | py {{python_version}}",
247
+ "refId": "A"
248
+ }
249
+ ],
250
+ "title": "Failure state by combo",
251
+ "type": "timeseries"
252
+ },
253
+ {
254
+ "datasource": {
255
+ "type": "prometheus",
256
+ "uid": "prometheus"
257
+ },
258
+ "fieldConfig": {
259
+ "defaults": {
260
+ "color": {
261
+ "mode": "palette-classic"
262
+ },
263
+ "custom": {
264
+ "axisBorderShow": false,
265
+ "axisCenteredZero": false,
266
+ "drawStyle": "line",
267
+ "fillOpacity": 20,
268
+ "lineInterpolation": "stepAfter",
269
+ "lineWidth": 2,
270
+ "pointSize": 4,
271
+ "showPoints": "never",
272
+ "spanNulls": true
273
+ },
274
+ "max": 3,
275
+ "min": 0,
276
+ "unit": "none"
277
+ },
278
+ "overrides": []
279
+ },
280
+ "gridPos": {
281
+ "h": 8,
282
+ "w": 24,
283
+ "x": 0,
284
+ "y": 13
285
+ },
286
+ "id": 5,
287
+ "options": {
288
+ "legend": {
289
+ "displayMode": "table",
290
+ "placement": "bottom"
291
+ },
292
+ "tooltip": {
293
+ "mode": "multi"
294
+ }
295
+ },
296
+ "targets": [
297
+ {
298
+ "datasource": {
299
+ "type": "prometheus",
300
+ "uid": "prometheus"
301
+ },
302
+ "expr": "kc_build_last_run_result_code{kernel=~\"${kernel:regex}\",backend=~\"${backend:regex}\",compute_backend=~\"${compute_backend:regex}\",cuda_version=~\"${cuda_version:regex}\",pytorch_version=~\"${pytorch_version:regex}\",python_version=~\"${python_version:regex}\"}",
303
+ "legendFormat": "{{kernel}} | {{backend}} | {{compute_backend}} | CUDA {{cuda_version}} | torch {{pytorch_version}} | py {{python_version}}",
304
+ "refId": "A"
305
+ }
306
+ ],
307
+ "title": "Result code over time",
308
+ "type": "timeseries"
309
+ }
310
+ ],
311
+ "refresh": "5m",
312
+ "schemaVersion": 39,
313
+ "style": "dark",
314
+ "tags": [
315
+ "kernels-community",
316
+ "ci",
317
+ "failures"
318
+ ],
319
+ "templating": {
320
+ "list": [
321
+ {
322
+ "current": {
323
+ "selected": true,
324
+ "text": [
325
+ "All"
326
+ ],
327
+ "value": [
328
+ "$__all"
329
+ ]
330
+ },
331
+ "datasource": {
332
+ "type": "prometheus",
333
+ "uid": "prometheus"
334
+ },
335
+ "definition": "label_values(kc_build_last_run_timestamp_seconds, kernel)",
336
+ "includeAll": true,
337
+ "label": "Kernel",
338
+ "multi": true,
339
+ "name": "kernel",
340
+ "options": [],
341
+ "query": {
342
+ "query": "label_values(kc_build_last_run_timestamp_seconds, kernel)",
343
+ "refId": "PrometheusVariableQueryEditor-kernel"
344
+ },
345
+ "refresh": 1,
346
+ "sort": 1,
347
+ "type": "query"
348
+ },
349
+ {
350
+ "current": {
351
+ "selected": true,
352
+ "text": [
353
+ "All"
354
+ ],
355
+ "value": [
356
+ "$__all"
357
+ ]
358
+ },
359
+ "datasource": {
360
+ "type": "prometheus",
361
+ "uid": "prometheus"
362
+ },
363
+ "definition": "label_values(kc_build_last_run_timestamp_seconds, backend)",
364
+ "includeAll": true,
365
+ "label": "Backend",
366
+ "multi": true,
367
+ "name": "backend",
368
+ "options": [],
369
+ "query": {
370
+ "query": "label_values(kc_build_last_run_timestamp_seconds, backend)",
371
+ "refId": "PrometheusVariableQueryEditor-backend"
372
+ },
373
+ "refresh": 1,
374
+ "sort": 1,
375
+ "type": "query"
376
+ },
377
+ {
378
+ "current": {
379
+ "selected": true,
380
+ "text": [
381
+ "All"
382
+ ],
383
+ "value": [
384
+ "$__all"
385
+ ]
386
+ },
387
+ "datasource": {
388
+ "type": "prometheus",
389
+ "uid": "prometheus"
390
+ },
391
+ "definition": "label_values(kc_build_last_run_timestamp_seconds, compute_backend)",
392
+ "includeAll": true,
393
+ "label": "Compute backend",
394
+ "multi": true,
395
+ "name": "compute_backend",
396
+ "options": [],
397
+ "query": {
398
+ "query": "label_values(kc_build_last_run_timestamp_seconds, compute_backend)",
399
+ "refId": "PrometheusVariableQueryEditor-compute_backend"
400
+ },
401
+ "refresh": 1,
402
+ "sort": 1,
403
+ "type": "query"
404
+ },
405
+ {
406
+ "current": {
407
+ "selected": true,
408
+ "text": [
409
+ "All"
410
+ ],
411
+ "value": [
412
+ "$__all"
413
+ ]
414
+ },
415
+ "datasource": {
416
+ "type": "prometheus",
417
+ "uid": "prometheus"
418
+ },
419
+ "definition": "label_values(kc_build_last_run_timestamp_seconds, cuda_version)",
420
+ "includeAll": true,
421
+ "label": "CUDA",
422
+ "multi": true,
423
+ "name": "cuda_version",
424
+ "options": [],
425
+ "query": {
426
+ "query": "label_values(kc_build_last_run_timestamp_seconds, cuda_version)",
427
+ "refId": "PrometheusVariableQueryEditor-cuda_version"
428
+ },
429
+ "refresh": 1,
430
+ "sort": 1,
431
+ "type": "query"
432
+ },
433
+ {
434
+ "current": {
435
+ "selected": true,
436
+ "text": [
437
+ "All"
438
+ ],
439
+ "value": [
440
+ "$__all"
441
+ ]
442
+ },
443
+ "datasource": {
444
+ "type": "prometheus",
445
+ "uid": "prometheus"
446
+ },
447
+ "definition": "label_values(kc_build_last_run_timestamp_seconds, pytorch_version)",
448
+ "includeAll": true,
449
+ "label": "PyTorch",
450
+ "multi": true,
451
+ "name": "pytorch_version",
452
+ "options": [],
453
+ "query": {
454
+ "query": "label_values(kc_build_last_run_timestamp_seconds, pytorch_version)",
455
+ "refId": "PrometheusVariableQueryEditor-pytorch_version"
456
+ },
457
+ "refresh": 1,
458
+ "sort": 1,
459
+ "type": "query"
460
+ },
461
+ {
462
+ "current": {
463
+ "selected": true,
464
+ "text": [
465
+ "All"
466
+ ],
467
+ "value": [
468
+ "$__all"
469
+ ]
470
+ },
471
+ "datasource": {
472
+ "type": "prometheus",
473
+ "uid": "prometheus"
474
+ },
475
+ "definition": "label_values(kc_build_last_run_timestamp_seconds, python_version)",
476
+ "includeAll": true,
477
+ "label": "Python",
478
+ "multi": true,
479
+ "name": "python_version",
480
+ "options": [],
481
+ "query": {
482
+ "query": "label_values(kc_build_last_run_timestamp_seconds, python_version)",
483
+ "refId": "PrometheusVariableQueryEditor-python_version"
484
+ },
485
+ "refresh": 1,
486
+ "sort": 1,
487
+ "type": "query"
488
+ }
489
+ ]
490
+ },
491
+ "time": {
492
+ "from": "now-30d",
493
+ "to": "now"
494
+ },
495
+ "timezone": "browser",
496
+ "title": "Kernels Build Failure Overview",
497
+ "uid": "kernels-build-failures",
498
+ "version": 1,
499
+ "weekStart": ""
500
+ }
monitoring/grafana/dashboards/build-matrix-overview.json ADDED
@@ -0,0 +1,593 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "annotations": {
3
+ "list": []
4
+ },
5
+ "editable": true,
6
+ "fiscalYearStartMonth": 0,
7
+ "graphTooltip": 1,
8
+ "links": [],
9
+ "panels": [
10
+ {
11
+ "datasource": {
12
+ "type": "prometheus",
13
+ "uid": "prometheus"
14
+ },
15
+ "fieldConfig": {
16
+ "defaults": {
17
+ "color": {
18
+ "mode": "thresholds"
19
+ },
20
+ "thresholds": {
21
+ "mode": "absolute",
22
+ "steps": [
23
+ {
24
+ "color": "green",
25
+ "value": null
26
+ }
27
+ ]
28
+ },
29
+ "unit": "none"
30
+ },
31
+ "overrides": []
32
+ },
33
+ "gridPos": {
34
+ "h": 5,
35
+ "w": 6,
36
+ "x": 0,
37
+ "y": 0
38
+ },
39
+ "id": 1,
40
+ "options": {
41
+ "colorMode": "value",
42
+ "graphMode": "none",
43
+ "justifyMode": "auto",
44
+ "orientation": "auto",
45
+ "reduceOptions": {
46
+ "calcs": [
47
+ "lastNotNull"
48
+ ],
49
+ "fields": "",
50
+ "values": false
51
+ },
52
+ "textMode": "value"
53
+ },
54
+ "targets": [
55
+ {
56
+ "datasource": {
57
+ "type": "prometheus",
58
+ "uid": "prometheus"
59
+ },
60
+ "expr": "count(kc_build_last_run_timestamp_seconds{kernel=~\"${kernel:regex}\",backend=~\"${backend:regex}\",compute_backend=~\"${compute_backend:regex}\",cuda_version=~\"${cuda_version:regex}\",pytorch_version=~\"${pytorch_version:regex}\",python_version=~\"${python_version:regex}\"})",
61
+ "instant": true,
62
+ "refId": "A"
63
+ }
64
+ ],
65
+ "title": "Tracked combos",
66
+ "type": "stat"
67
+ },
68
+ {
69
+ "datasource": {
70
+ "type": "prometheus",
71
+ "uid": "prometheus"
72
+ },
73
+ "fieldConfig": {
74
+ "defaults": {
75
+ "color": {
76
+ "mode": "thresholds"
77
+ },
78
+ "thresholds": {
79
+ "mode": "absolute",
80
+ "steps": [
81
+ {
82
+ "color": "green",
83
+ "value": null
84
+ },
85
+ {
86
+ "color": "red",
87
+ "value": 1
88
+ }
89
+ ]
90
+ },
91
+ "unit": "none"
92
+ },
93
+ "overrides": []
94
+ },
95
+ "gridPos": {
96
+ "h": 5,
97
+ "w": 6,
98
+ "x": 6,
99
+ "y": 0
100
+ },
101
+ "id": 2,
102
+ "options": {
103
+ "colorMode": "value",
104
+ "graphMode": "none",
105
+ "justifyMode": "auto",
106
+ "orientation": "auto",
107
+ "reduceOptions": {
108
+ "calcs": [
109
+ "lastNotNull"
110
+ ],
111
+ "fields": "",
112
+ "values": false
113
+ },
114
+ "textMode": "value"
115
+ },
116
+ "targets": [
117
+ {
118
+ "datasource": {
119
+ "type": "prometheus",
120
+ "uid": "prometheus"
121
+ },
122
+ "expr": "sum(kc_build_last_run_failed{kernel=~\"${kernel:regex}\",backend=~\"${backend:regex}\",compute_backend=~\"${compute_backend:regex}\",cuda_version=~\"${cuda_version:regex}\",pytorch_version=~\"${pytorch_version:regex}\",python_version=~\"${python_version:regex}\"})",
123
+ "instant": true,
124
+ "refId": "A"
125
+ }
126
+ ],
127
+ "title": "Failing combos",
128
+ "type": "stat"
129
+ },
130
+ {
131
+ "datasource": {
132
+ "type": "prometheus",
133
+ "uid": "prometheus"
134
+ },
135
+ "fieldConfig": {
136
+ "defaults": {
137
+ "color": {
138
+ "mode": "thresholds"
139
+ },
140
+ "thresholds": {
141
+ "mode": "absolute",
142
+ "steps": [
143
+ {
144
+ "color": "green",
145
+ "value": null
146
+ }
147
+ ]
148
+ },
149
+ "unit": "none"
150
+ },
151
+ "overrides": []
152
+ },
153
+ "gridPos": {
154
+ "h": 5,
155
+ "w": 6,
156
+ "x": 12,
157
+ "y": 0
158
+ },
159
+ "id": 3,
160
+ "options": {
161
+ "colorMode": "value",
162
+ "graphMode": "none",
163
+ "justifyMode": "auto",
164
+ "orientation": "auto",
165
+ "reduceOptions": {
166
+ "calcs": [
167
+ "lastNotNull"
168
+ ],
169
+ "fields": "",
170
+ "values": false
171
+ },
172
+ "textMode": "value"
173
+ },
174
+ "targets": [
175
+ {
176
+ "datasource": {
177
+ "type": "prometheus",
178
+ "uid": "prometheus"
179
+ },
180
+ "expr": "count(kc_build_last_run_timestamp_seconds{kernel=~\"${kernel:regex}\",backend=~\"${backend:regex}\",compute_backend=~\"${compute_backend:regex}\",cuda_version=~\"${cuda_version:regex}\",pytorch_version=~\"${pytorch_version:regex}\",python_version=~\"${python_version:regex}\"}) - sum(kc_build_last_run_failed{kernel=~\"${kernel:regex}\",backend=~\"${backend:regex}\",compute_backend=~\"${compute_backend:regex}\",cuda_version=~\"${cuda_version:regex}\",pytorch_version=~\"${pytorch_version:regex}\",python_version=~\"${python_version:regex}\"})",
181
+ "instant": true,
182
+ "refId": "A"
183
+ }
184
+ ],
185
+ "title": "Healthy combos",
186
+ "type": "stat"
187
+ },
188
+ {
189
+ "datasource": {
190
+ "type": "prometheus",
191
+ "uid": "prometheus"
192
+ },
193
+ "fieldConfig": {
194
+ "defaults": {
195
+ "color": {
196
+ "mode": "thresholds"
197
+ },
198
+ "thresholds": {
199
+ "mode": "absolute",
200
+ "steps": [
201
+ {
202
+ "color": "green",
203
+ "value": null
204
+ },
205
+ {
206
+ "color": "orange",
207
+ "value": 6
208
+ },
209
+ {
210
+ "color": "red",
211
+ "value": 24
212
+ }
213
+ ]
214
+ },
215
+ "unit": "h"
216
+ },
217
+ "overrides": []
218
+ },
219
+ "gridPos": {
220
+ "h": 5,
221
+ "w": 6,
222
+ "x": 18,
223
+ "y": 0
224
+ },
225
+ "id": 4,
226
+ "options": {
227
+ "colorMode": "value",
228
+ "graphMode": "none",
229
+ "justifyMode": "auto",
230
+ "orientation": "auto",
231
+ "reduceOptions": {
232
+ "calcs": [
233
+ "lastNotNull"
234
+ ],
235
+ "fields": "",
236
+ "values": false
237
+ },
238
+ "textMode": "value"
239
+ },
240
+ "targets": [
241
+ {
242
+ "datasource": {
243
+ "type": "prometheus",
244
+ "uid": "prometheus"
245
+ },
246
+ "expr": "max((time() - kc_build_last_run_timestamp_seconds{kernel=~\"${kernel:regex}\",backend=~\"${backend:regex}\",compute_backend=~\"${compute_backend:regex}\",cuda_version=~\"${cuda_version:regex}\",pytorch_version=~\"${pytorch_version:regex}\",python_version=~\"${python_version:regex}\"}) / 3600)",
247
+ "instant": true,
248
+ "refId": "A"
249
+ }
250
+ ],
251
+ "title": "Oldest metric age",
252
+ "type": "stat"
253
+ },
254
+ {
255
+ "datasource": {
256
+ "type": "prometheus",
257
+ "uid": "prometheus"
258
+ },
259
+ "fieldConfig": {
260
+ "defaults": {
261
+ "color": {
262
+ "mode": "continuous-GrYlRd"
263
+ },
264
+ "unit": "s"
265
+ },
266
+ "overrides": []
267
+ },
268
+ "gridPos": {
269
+ "h": 8,
270
+ "w": 8,
271
+ "x": 0,
272
+ "y": 5
273
+ },
274
+ "id": 5,
275
+ "options": {
276
+ "displayMode": "gradient",
277
+ "orientation": "horizontal",
278
+ "reduceOptions": {
279
+ "calcs": [
280
+ "lastNotNull"
281
+ ],
282
+ "fields": "",
283
+ "values": false
284
+ },
285
+ "showUnfilled": true
286
+ },
287
+ "targets": [
288
+ {
289
+ "datasource": {
290
+ "type": "prometheus",
291
+ "uid": "prometheus"
292
+ },
293
+ "expr": "kc_build_last_run_duration_seconds{kernel=~\"${kernel:regex}\",backend=~\"${backend:regex}\",compute_backend=~\"${compute_backend:regex}\",cuda_version=~\"${cuda_version:regex}\",pytorch_version=~\"${pytorch_version:regex}\",python_version=~\"${python_version:regex}\"}",
294
+ "instant": true,
295
+ "legendFormat": "{{kernel}} | {{backend}} | {{compute_backend}} | CUDA {{cuda_version}} | torch {{pytorch_version}} | py {{python_version}}",
296
+ "refId": "A"
297
+ }
298
+ ],
299
+ "title": "Current duration by combo",
300
+ "type": "bargauge"
301
+ },
302
+ {
303
+ "datasource": {
304
+ "type": "prometheus",
305
+ "uid": "prometheus"
306
+ },
307
+ "fieldConfig": {
308
+ "defaults": {
309
+ "color": {
310
+ "mode": "palette-classic"
311
+ },
312
+ "custom": {
313
+ "axisBorderShow": false,
314
+ "axisCenteredZero": false,
315
+ "drawStyle": "line",
316
+ "fillOpacity": 20,
317
+ "lineInterpolation": "stepAfter",
318
+ "lineWidth": 2,
319
+ "pointSize": 4,
320
+ "showPoints": "never",
321
+ "spanNulls": true
322
+ },
323
+ "max": 3,
324
+ "min": 0,
325
+ "unit": "none"
326
+ },
327
+ "overrides": []
328
+ },
329
+ "gridPos": {
330
+ "h": 8,
331
+ "w": 16,
332
+ "x": 8,
333
+ "y": 5
334
+ },
335
+ "id": 6,
336
+ "options": {
337
+ "legend": {
338
+ "displayMode": "list",
339
+ "placement": "bottom"
340
+ },
341
+ "tooltip": {
342
+ "mode": "multi"
343
+ }
344
+ },
345
+ "targets": [
346
+ {
347
+ "datasource": {
348
+ "type": "prometheus",
349
+ "uid": "prometheus"
350
+ },
351
+ "expr": "kc_build_last_run_result_code{kernel=~\"${kernel:regex}\",backend=~\"${backend:regex}\",compute_backend=~\"${compute_backend:regex}\",cuda_version=~\"${cuda_version:regex}\",pytorch_version=~\"${pytorch_version:regex}\",python_version=~\"${python_version:regex}\"}",
352
+ "legendFormat": "{{kernel}} | {{backend}} | {{compute_backend}} | CUDA {{cuda_version}} | torch {{pytorch_version}} | py {{python_version}}",
353
+ "refId": "A"
354
+ }
355
+ ],
356
+ "title": "Latest result code over time",
357
+ "type": "timeseries"
358
+ },
359
+ {
360
+ "datasource": {
361
+ "type": "prometheus",
362
+ "uid": "prometheus"
363
+ },
364
+ "fieldConfig": {
365
+ "defaults": {
366
+ "color": {
367
+ "mode": "continuous-BlYlRd"
368
+ },
369
+ "unit": "s"
370
+ },
371
+ "overrides": []
372
+ },
373
+ "gridPos": {
374
+ "h": 8,
375
+ "w": 24,
376
+ "x": 0,
377
+ "y": 13
378
+ },
379
+ "id": 7,
380
+ "options": {
381
+ "legend": {
382
+ "displayMode": "list",
383
+ "placement": "bottom"
384
+ },
385
+ "tooltip": {
386
+ "mode": "multi"
387
+ }
388
+ },
389
+ "targets": [
390
+ {
391
+ "datasource": {
392
+ "type": "prometheus",
393
+ "uid": "prometheus"
394
+ },
395
+ "expr": "kc_build_last_run_duration_seconds{kernel=~\"${kernel:regex}\",backend=~\"${backend:regex}\",compute_backend=~\"${compute_backend:regex}\",cuda_version=~\"${cuda_version:regex}\",pytorch_version=~\"${pytorch_version:regex}\",python_version=~\"${python_version:regex}\"}",
396
+ "legendFormat": "{{kernel}} | {{backend}} | {{compute_backend}} | CUDA {{cuda_version}} | torch {{pytorch_version}} | py {{python_version}}",
397
+ "refId": "A"
398
+ }
399
+ ],
400
+ "title": "Duration history",
401
+ "type": "timeseries"
402
+ }
403
+ ],
404
+ "refresh": "5m",
405
+ "schemaVersion": 39,
406
+ "style": "dark",
407
+ "tags": [
408
+ "kernels-community",
409
+ "ci",
410
+ "matrix"
411
+ ],
412
+ "templating": {
413
+ "list": [
414
+ {
415
+ "current": {
416
+ "selected": true,
417
+ "text": [
418
+ "All"
419
+ ],
420
+ "value": [
421
+ "$__all"
422
+ ]
423
+ },
424
+ "datasource": {
425
+ "type": "prometheus",
426
+ "uid": "prometheus"
427
+ },
428
+ "definition": "label_values(kc_build_last_run_timestamp_seconds, kernel)",
429
+ "includeAll": true,
430
+ "label": "Kernel",
431
+ "multi": true,
432
+ "name": "kernel",
433
+ "options": [],
434
+ "query": {
435
+ "query": "label_values(kc_build_last_run_timestamp_seconds, kernel)",
436
+ "refId": "PrometheusVariableQueryEditor-kernel"
437
+ },
438
+ "refresh": 1,
439
+ "sort": 1,
440
+ "type": "query"
441
+ },
442
+ {
443
+ "current": {
444
+ "selected": true,
445
+ "text": [
446
+ "All"
447
+ ],
448
+ "value": [
449
+ "$__all"
450
+ ]
451
+ },
452
+ "datasource": {
453
+ "type": "prometheus",
454
+ "uid": "prometheus"
455
+ },
456
+ "definition": "label_values(kc_build_last_run_timestamp_seconds, backend)",
457
+ "includeAll": true,
458
+ "label": "Backend",
459
+ "multi": true,
460
+ "name": "backend",
461
+ "options": [],
462
+ "query": {
463
+ "query": "label_values(kc_build_last_run_timestamp_seconds, backend)",
464
+ "refId": "PrometheusVariableQueryEditor-backend"
465
+ },
466
+ "refresh": 1,
467
+ "sort": 1,
468
+ "type": "query"
469
+ },
470
+ {
471
+ "current": {
472
+ "selected": true,
473
+ "text": [
474
+ "All"
475
+ ],
476
+ "value": [
477
+ "$__all"
478
+ ]
479
+ },
480
+ "datasource": {
481
+ "type": "prometheus",
482
+ "uid": "prometheus"
483
+ },
484
+ "definition": "label_values(kc_build_last_run_timestamp_seconds, compute_backend)",
485
+ "includeAll": true,
486
+ "label": "Compute backend",
487
+ "multi": true,
488
+ "name": "compute_backend",
489
+ "options": [],
490
+ "query": {
491
+ "query": "label_values(kc_build_last_run_timestamp_seconds, compute_backend)",
492
+ "refId": "PrometheusVariableQueryEditor-compute_backend"
493
+ },
494
+ "refresh": 1,
495
+ "sort": 1,
496
+ "type": "query"
497
+ },
498
+ {
499
+ "current": {
500
+ "selected": true,
501
+ "text": [
502
+ "All"
503
+ ],
504
+ "value": [
505
+ "$__all"
506
+ ]
507
+ },
508
+ "datasource": {
509
+ "type": "prometheus",
510
+ "uid": "prometheus"
511
+ },
512
+ "definition": "label_values(kc_build_last_run_timestamp_seconds, cuda_version)",
513
+ "includeAll": true,
514
+ "label": "CUDA",
515
+ "multi": true,
516
+ "name": "cuda_version",
517
+ "options": [],
518
+ "query": {
519
+ "query": "label_values(kc_build_last_run_timestamp_seconds, cuda_version)",
520
+ "refId": "PrometheusVariableQueryEditor-cuda_version"
521
+ },
522
+ "refresh": 1,
523
+ "sort": 1,
524
+ "type": "query"
525
+ },
526
+ {
527
+ "current": {
528
+ "selected": true,
529
+ "text": [
530
+ "All"
531
+ ],
532
+ "value": [
533
+ "$__all"
534
+ ]
535
+ },
536
+ "datasource": {
537
+ "type": "prometheus",
538
+ "uid": "prometheus"
539
+ },
540
+ "definition": "label_values(kc_build_last_run_timestamp_seconds, pytorch_version)",
541
+ "includeAll": true,
542
+ "label": "PyTorch",
543
+ "multi": true,
544
+ "name": "pytorch_version",
545
+ "options": [],
546
+ "query": {
547
+ "query": "label_values(kc_build_last_run_timestamp_seconds, pytorch_version)",
548
+ "refId": "PrometheusVariableQueryEditor-pytorch_version"
549
+ },
550
+ "refresh": 1,
551
+ "sort": 1,
552
+ "type": "query"
553
+ },
554
+ {
555
+ "current": {
556
+ "selected": true,
557
+ "text": [
558
+ "All"
559
+ ],
560
+ "value": [
561
+ "$__all"
562
+ ]
563
+ },
564
+ "datasource": {
565
+ "type": "prometheus",
566
+ "uid": "prometheus"
567
+ },
568
+ "definition": "label_values(kc_build_last_run_timestamp_seconds, python_version)",
569
+ "includeAll": true,
570
+ "label": "Python",
571
+ "multi": true,
572
+ "name": "python_version",
573
+ "options": [],
574
+ "query": {
575
+ "query": "label_values(kc_build_last_run_timestamp_seconds, python_version)",
576
+ "refId": "PrometheusVariableQueryEditor-python_version"
577
+ },
578
+ "refresh": 1,
579
+ "sort": 1,
580
+ "type": "query"
581
+ }
582
+ ]
583
+ },
584
+ "time": {
585
+ "from": "now-30d",
586
+ "to": "now"
587
+ },
588
+ "timezone": "browser",
589
+ "title": "Kernels Build Matrix Overview",
590
+ "uid": "kernels-build-matrix",
591
+ "version": 1,
592
+ "weekStart": ""
593
+ }
monitoring/grafana/provisioning/dashboards/dashboards.yml ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ apiVersion: 1
2
+
3
+ providers:
4
+ - name: kernels-community
5
+ orgId: 1
6
+ folder: Kernels Community
7
+ type: file
8
+ disableDeletion: false
9
+ editable: true
10
+ options:
11
+ path: /var/lib/grafana/dashboards
monitoring/grafana/provisioning/datasources/prometheus.yml ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ apiVersion: 1
2
+
3
+ datasources:
4
+ - name: Prometheus
5
+ uid: prometheus
6
+ type: prometheus
7
+ access: proxy
8
+ url: http://prometheus:9090
9
+ isDefault: true
10
+ editable: false
monitoring/prometheus/prometheus.yml ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ global:
2
+ scrape_interval: 15s
3
+ evaluation_interval: 15s
4
+
5
+ rule_files:
6
+ - /etc/prometheus/rules/*.yml
7
+
8
+ scrape_configs:
9
+ - job_name: prometheus
10
+ static_configs:
11
+ - targets:
12
+ - prometheus:9090
13
+
14
+ - job_name: pushgateway
15
+ honor_labels: true
16
+ static_configs:
17
+ - targets:
18
+ - pushgateway:9091
monitoring/prometheus/rules/build-alerts.yml ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ groups:
2
+ - name: kernels-community-build-alerts
3
+ rules:
4
+ - alert: KernelsBuildMatrixComboFailing
5
+ expr: kc_build_last_run_failed == 1
6
+ for: 10m
7
+ labels:
8
+ severity: warning
9
+ annotations:
10
+ summary: "Kernel build matrix combo failing"
11
+ description: "{{ $labels.kernel }} backend={{ $labels.backend }} compute={{ $labels.compute_backend }} cuda={{ $labels.cuda_version }} torch={{ $labels.pytorch_version }} python={{ $labels.python_version }} is currently failing."
12
+
13
+ - alert: KernelsBuildMetricsStale
14
+ expr: (time() - kc_build_last_run_timestamp_seconds) > 86400
15
+ for: 30m
16
+ labels:
17
+ severity: warning
18
+ annotations:
19
+ summary: "Kernel build metrics stale"
20
+ description: "{{ $labels.kernel }} backend={{ $labels.backend }} compute={{ $labels.compute_backend }} cuda={{ $labels.cuda_version }} torch={{ $labels.pytorch_version }} python={{ $labels.python_version }} has not pushed fresh metrics for more than 24 hours."
requirements-dev.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ -r requirements.txt
2
+ pytest>=8.3,<9
3
+ ruff>=0.11,<0.12
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ gradio>=6.10,<7
2
+ httpx>=0.27,<1
3
+ pydantic>=2.7,<3
4
+ PyYAML>=6.0,<7
5
+ cachetools>=5.3,<6
6
+ python-dateutil>=2.9,<3
7
+ python-dotenv>=1.0,<2
8
+ huggingface_hub>=0.30,<1
9
+ beautifulsoup4>=4.14,<5
scripts/bootstrap_space.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ruff: noqa: E402
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import os
6
+ import subprocess
7
+ import sys
8
+ from pathlib import Path
9
+
10
+ from dotenv import load_dotenv
11
+ from huggingface_hub import HfApi
12
+ from huggingface_hub.utils import get_token
13
+
14
+
15
# Resolve the repository root relative to this script's location.
ROOT_DIR = Path(__file__).resolve().parents[1]
SRC_DIR = ROOT_DIR / "src"
# Make the in-repo `kc_monitor` package importable when this script is run
# directly; the late import below is covered by the file-level `# ruff: noqa: E402`.
if str(SRC_DIR) not in sys.path:
    sys.path.insert(0, str(SRC_DIR))

from kc_monitor.config import load_config
21
+
22
+
23
+ def _cached_github_token() -> str | None:
24
+ try:
25
+ completed = subprocess.run(
26
+ ["gh", "auth", "token"],
27
+ capture_output=True,
28
+ text=True,
29
+ check=True,
30
+ )
31
+ except (OSError, subprocess.CalledProcessError):
32
+ return None
33
+ token = completed.stdout.strip()
34
+ return token or None
35
+
36
+
37
def build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the Space bootstrap script."""
    parser = argparse.ArgumentParser(description="Create or update the Kernels Community monitor Space.")
    parser.add_argument(
        "--space-id",
        default=os.getenv("KCM_SPACE_ID", "adarshxs/kernels-community-monitor"),
        help="Target Hugging Face Space repo ID.",
    )
    # All remaining options are boolean opt-in flags; declare them data-driven.
    boolean_flags = (
        ("--private", "Create the Space as private if it does not already exist."),
        ("--skip-secret", "Do not update the GITHUB_TOKEN Space secret."),
        ("--skip-variables", "Do not update Space variables/settings."),
        (
            "--create-pr",
            "Open a Hub pull request instead of pushing directly when write access is unavailable.",
        ),
    )
    for flag, help_text in boolean_flags:
        parser.add_argument(flag, action="store_true", help=help_text)
    return parser
65
+
66
+
67
def main() -> int:
    """Provision the Hugging Face Space hosting the monitor UI.

    Creates the Space if needed, syncs the GITHUB_TOKEN secret and the
    KCM_* runtime variables from ``config/monitor.yaml``, then uploads the
    working tree. Returns 0 on success (argparse errors exit via
    ``parser.error``).
    """
    load_dotenv()
    parser = build_parser()
    args = parser.parse_args()

    # Prefer explicit environment tokens, then fall back to locally cached logins.
    hf_token = os.getenv("HF_TOKEN") or get_token()
    github_token = os.getenv("GITHUB_TOKEN") or os.getenv("GH_TOKEN") or _cached_github_token()
    if not hf_token:
        parser.error("HF_TOKEN must be set in the environment or available from a local Hugging Face login.")

    config = load_config(ROOT_DIR / "config" / "monitor.yaml")
    api = HfApi(token=hf_token)

    # Idempotent: exist_ok=True turns re-runs into updates instead of errors.
    api.create_repo(
        repo_id=args.space_id,
        repo_type="space",
        space_sdk="gradio",
        private=args.private,
        exist_ok=True,
    )

    # The Space polls GitHub, so it needs the token stored as a Space secret.
    if github_token and not args.skip_secret:
        api.add_space_secret(repo_id=args.space_id, key="GITHUB_TOKEN", value=github_token)

    if not args.skip_variables:
        # Non-secret monitor settings mirrored from config/monitor.yaml.
        github_vars = {
            "KCM_GITHUB_OWNER": config.github.owner,
            "KCM_GITHUB_REPO": config.github.repo,
            "KCM_GITHUB_BRANCH": config.github.branch,
            "KCM_REFRESH_INTERVAL_SECONDS": str(config.monitor.refresh_interval_seconds),
            "KCM_WORKFLOW_RUN_PAGE_SIZE": str(config.monitor.workflow_run_page_size),
            "KCM_WORKFLOW_RUN_PAGES": str(config.monitor.workflow_run_pages),
        }
        if config.monitor.critical_kernels:
            github_vars["KCM_CRITICAL_KERNELS"] = ",".join(config.monitor.critical_kernels)
        for key, value in github_vars.items():
            # Skip unset config entries so the Space-side defaults apply.
            if value:
                api.add_space_variable(repo_id=args.space_id, key=key, value=value)

        grafana_vars = {
            "KCM_GRAFANA_BASE_URL": config.grafana.base_url,
            "KCM_GRAFANA_ORG_ID": str(config.grafana.org_id),
            "KCM_GRAFANA_THEME": config.grafana.theme,
            "KCM_GRAFANA_OVERVIEW_UID": config.grafana.overview_dashboard_uid,
            "KCM_GRAFANA_DURATION_UID": config.grafana.duration_dashboard_uid,
            "KCM_GRAFANA_FAILURE_UID": config.grafana.failure_dashboard_uid,
            "KCM_PROMETHEUS_BASE_URL": config.prometheus.base_url,
            "KCM_PUSHGATEWAY_URL": config.pushgateway.url,
            "KCM_PUSHGATEWAY_JOB_NAME": config.pushgateway.job_name,
        }
        for key, value in grafana_vars.items():
            if value:
                api.add_space_variable(repo_id=args.space_id, key=key, value=value)

    # Upload the repo, excluding local-only artifacts and the .env secrets file.
    api.upload_folder(
        repo_id=args.space_id,
        repo_type="space",
        folder_path=str(ROOT_DIR),
        create_pr=args.create_pr,
        ignore_patterns=[
            ".env",
            ".git",
            ".git/*",
            ".venv/*",
            "venv/*",
            "__pycache__/*",
            ".pytest_cache/*",
            ".ruff_cache/*",
            "*.log",
        ],
    )

    print(f"Space URL: https://huggingface.co/spaces/{args.space_id}")
    try:
        runtime = api.get_space_runtime(repo_id=args.space_id)
        print(f"Runtime stage: {runtime.stage}")
        print(f"Hardware: {runtime.hardware}")
    except Exception:
        # Best-effort status report; the Space may still be provisioning.
        print("Runtime not yet available (Space is provisioning).")
    return 0
147
+
148
+
149
if __name__ == "__main__":
    # SystemExit propagates main()'s integer return as the process exit code.
    raise SystemExit(main())
scripts/push_build_metrics.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ruff: noqa: E402
2
+ from __future__ import annotations
3
+
4
+ import os
5
+ import sys
6
+ from pathlib import Path
7
+
8
+ from dotenv import load_dotenv
9
+
10
+
11
+ ROOT_DIR = Path(__file__).resolve().parents[1]
12
+ SRC_DIR = ROOT_DIR / "src"
13
+ if str(SRC_DIR) not in sys.path:
14
+ sys.path.insert(0, str(SRC_DIR))
15
+
16
+ from kc_monitor.config import load_config
17
+ from kc_monitor.metrics_push import BuildMetricSample, push_build_metrics
18
+
19
+
20
def main() -> int:
    """Push one build-matrix metric sample to the configured Pushgateway.

    Reads the build outcome from environment variables, resolves the
    Pushgateway endpoint (environment wins over the YAML config), and exits
    with a clear error when no endpoint is configured at all.
    """
    load_dotenv()
    config = load_config(ROOT_DIR / "config" / "monitor.yaml")

    # Environment override takes precedence over the YAML-configured URL.
    gateway = os.getenv("PUSHGATEWAY_URL") or config.pushgateway.url
    if not gateway:
        raise SystemExit("Pushgateway URL is required via PUSHGATEWAY_URL or KCM_PUSHGATEWAY_URL.")

    job = os.getenv("KCM_PUSHGATEWAY_JOB_NAME") or config.pushgateway.job_name
    sample = BuildMetricSample.from_env(os.environ)
    destination = push_build_metrics(
        sample,
        pushgateway_url=gateway,
        job_name=job,
    )

    print(f"Pushed metrics to {destination}")
    print(f"Matrix combo: {sample.grouping_key}")
    print(
        "Outcome:"
        f" result={sample.result}"
        f" result_code={sample.result_code}"
        f" failed={sample.failed}"
        f" duration_seconds={sample.duration_seconds:.3f}"
    )
    return 0
45
+
46
+
47
+ if __name__ == "__main__":
48
+ raise SystemExit(main())
scripts/smoke_check.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ruff: noqa: E402
2
+ from __future__ import annotations
3
+
4
+ import sys
5
+ from pathlib import Path
6
+
7
+
8
+ ROOT_DIR = Path(__file__).resolve().parents[1]
9
+ SRC_DIR = ROOT_DIR / "src"
10
+ if str(SRC_DIR) not in sys.path:
11
+ sys.path.insert(0, str(SRC_DIR))
12
+
13
+ from kc_monitor.config import load_config
14
+ from kc_monitor.grafana import build_dashboard_url, dashboard_catalog
15
+ from kc_monitor.service import MonitorService
16
+
17
+
18
def main() -> int:
    """Run a one-shot health probe of the monitor stack and print a summary.

    Fetches a fresh snapshot, prints the first ten kernel rows, reports the
    Grafana/Prometheus/Pushgateway wiring, and lists the dashboard catalog.
    Returns 1 only when the snapshot produced errors *and* no rows at all.
    """
    config = load_config(ROOT_DIR / "config" / "monitor.yaml")
    service = MonitorService(config)
    try:
        snapshot = service.get_snapshot(force_refresh=True)
    finally:
        # Always release HTTP clients, even when the refresh failed.
        service.close()

    print(f"Generated at: {snapshot.generated_at.isoformat()}")
    print(
        "Summary:"
        f" tracked={snapshot.summary.tracked_kernels}"
        f" active={snapshot.summary.active_builds}"
        f" uploading={snapshot.summary.uploading_builds}"
        f" failed={snapshot.summary.failed_builds}"
    )
    for row in snapshot.kernel_rows[:10]:
        primary = row.primary_group
        run_url = primary.run.html_url if primary else "n/a"
        print(
            f"- {row.kernel_name:20}"
            f" status={row.row_status_label:10}"
            f" runs={row.recent_run_count:2}"
            f" run={run_url}"
        )

    print(f"Grafana enabled: {config.grafana.enabled}")
    print(f"Grafana base URL: {config.grafana.base_url or 'not configured'}")
    print(f"Prometheus base URL: {config.prometheus.base_url or 'not configured'}")
    print(f"Pushgateway URL: {config.pushgateway.url or 'not configured'}")

    for dashboard in dashboard_catalog(config.grafana):
        view_url = build_dashboard_url(config.grafana, dashboard.uid, embed=False) or "not configured"
        print(
            f"- {dashboard.title:18}"
            f" uid={dashboard.uid:24}"
            f" view={view_url}"
        )

    return 1 if snapshot.errors and not snapshot.kernel_rows else 0
60
+
61
+
62
+ if __name__ == "__main__":
63
+ raise SystemExit(main())
src/kc_monitor/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ """Kernels Community Monitor package."""
2
+
3
+ __all__ = ["__version__"]
4
+
5
+ __version__ = "0.2.0"
src/kc_monitor/config.py ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ from pathlib import Path
5
+ from typing import Any, Literal
6
+
7
+ import yaml
8
+ from dotenv import load_dotenv
9
+ from pydantic import BaseModel, ConfigDict, Field
10
+
11
+ from kc_monitor.models import WorkflowTarget
12
+
13
+
14
+ ROOT_DIR = Path(__file__).resolve().parents[2]
15
+ DEFAULT_CONFIG_PATH = ROOT_DIR / "config" / "monitor.yaml"
16
+
17
+
18
class GitHubSettings(BaseModel):
    """Connection and lookup settings for the monitored GitHub repository."""

    model_config = ConfigDict(extra="ignore")

    owner: str = "huggingface"
    repo: str = "kernels-community"
    branch: str = "main"
    per_page: int = 30
    request_timeout_seconds: float = 25.0
    user_agent: str = "kernels-community-monitor/0.1"
    token: str | None = None
    workflows: list[WorkflowTarget] = Field(default_factory=list)

    @property
    def repo_slug(self) -> str:
        """Canonical ``owner/repo`` identifier for the repository."""
        return f"{self.owner}/{self.repo}"
33
+
34
+
35
class MonitorSettings(BaseModel):
    """Tuning knobs for snapshot refresh, log trimming, and stall detection."""

    model_config = ConfigDict(extra="ignore")

    refresh_interval_seconds: int = 120
    snapshot_ttl_seconds: int = 45
    workflow_run_page_size: int = 100
    workflow_run_pages: int = 10
    recent_completed_hours: int = 72
    recent_limit: int = 40
    completed_runs_per_workflow: int = 3
    log_line_limit: int = 400
    log_char_limit: int = 35000
    detail_event_limit: int = 25
    stall_without_log_minutes: int = 45
    stall_active_phase_minutes: int = 180
    critical_kernels: list[str] = Field(default_factory=list)

    @property
    def critical_kernel_set(self) -> set[str]:
        """Critical kernel names, whitespace-trimmed with blanks dropped."""
        trimmed = (name.strip() for name in self.critical_kernels)
        return {name for name in trimmed if name}
55
+
56
+
57
class GrafanaSettings(BaseModel):
    """Grafana linking/embedding options; disabled until a base URL is set."""

    model_config = ConfigDict(extra="ignore")

    base_url: str | None = None
    org_id: int = 1
    theme: Literal["dark", "light"] = "dark"
    default_from: str = "now-30d"
    default_to: str = "now"
    default_refresh: str = "5m"
    overview_dashboard_uid: str = "kernels-build-matrix"
    duration_dashboard_uid: str = "kernels-build-durations"
    failure_dashboard_uid: str = "kernels-build-failures"

    @property
    def enabled(self) -> bool:
        """True once a non-empty base URL has been configured."""
        return self.base_url is not None and self.base_url != ""
73
+
74
+
75
class PrometheusSettings(BaseModel):
    """Location of the Prometheus instance backing the dashboards (optional)."""

    model_config = ConfigDict(extra="ignore")

    base_url: str | None = None
79
+
80
+
81
class PushgatewaySettings(BaseModel):
    """Prometheus Pushgateway endpoint and job name for build metrics."""

    model_config = ConfigDict(extra="ignore")

    url: str | None = None
    job_name: str = "kernels-community-build-matrix"
86
+
87
+
88
class AppConfig(BaseModel):
    """Top-level application configuration, one attribute per subsystem."""

    model_config = ConfigDict(extra="ignore")

    github: GitHubSettings = Field(default_factory=GitHubSettings)
    monitor: MonitorSettings = Field(default_factory=MonitorSettings)
    grafana: GrafanaSettings = Field(default_factory=GrafanaSettings)
    prometheus: PrometheusSettings = Field(default_factory=PrometheusSettings)
    pushgateway: PushgatewaySettings = Field(default_factory=PushgatewaySettings)

    @property
    def workflow_targets(self) -> list[WorkflowTarget]:
        """Only the workflows currently enabled for monitoring."""
        return [target for target in self.github.workflows if target.enabled]
100
+
101
+
102
+ def _deep_merge(base: dict[str, Any], updates: dict[str, Any]) -> dict[str, Any]:
103
+ merged = dict(base)
104
+ for key, value in updates.items():
105
+ if isinstance(value, dict) and isinstance(merged.get(key), dict):
106
+ merged[key] = _deep_merge(merged[key], value)
107
+ else:
108
+ merged[key] = value
109
+ return merged
110
+
111
+
112
def _load_yaml(path: Path) -> dict[str, Any]:
    """Parse *path* as YAML, returning {} for a missing or empty file."""
    if not path.exists():
        return {}
    loaded = yaml.safe_load(path.read_text(encoding="utf-8"))
    return loaded or {}
117
+
118
+
119
+ def _csv_env(name: str) -> list[str] | None:
120
+ raw = os.getenv(name)
121
+ if raw is None:
122
+ return None
123
+ return [item.strip() for item in raw.split(",") if item.strip()]
124
+
125
+
126
def _env_overrides() -> dict[str, Any]:
    """Collect environment-variable overrides grouped by config section.

    Unset variables are dropped, so the result can be deep-merged over the
    YAML config without clobbering file-provided values. ``GITHUB_TOKEN``
    (or ``GH_TOKEN``) feeds the GitHub token, and ``KCM_CRITICAL_KERNELS``
    is parsed as a comma-separated list.
    """
    sections: dict[str, dict[str, Any]] = {
        "github": {
            "owner": os.getenv("KCM_GITHUB_OWNER"),
            "repo": os.getenv("KCM_GITHUB_REPO"),
            "branch": os.getenv("KCM_GITHUB_BRANCH"),
            "token": os.getenv("GITHUB_TOKEN") or os.getenv("GH_TOKEN"),
        },
        "monitor": {
            "refresh_interval_seconds": os.getenv("KCM_REFRESH_INTERVAL_SECONDS"),
            "snapshot_ttl_seconds": os.getenv("KCM_SNAPSHOT_TTL_SECONDS"),
            "workflow_run_page_size": os.getenv("KCM_WORKFLOW_RUN_PAGE_SIZE"),
            "workflow_run_pages": os.getenv("KCM_WORKFLOW_RUN_PAGES"),
            "recent_completed_hours": os.getenv("KCM_RECENT_COMPLETED_HOURS"),
            "recent_limit": os.getenv("KCM_RECENT_LIMIT"),
            "completed_runs_per_workflow": os.getenv("KCM_COMPLETED_RUNS_PER_WORKFLOW"),
            "log_line_limit": os.getenv("KCM_LOG_LINE_LIMIT"),
            "log_char_limit": os.getenv("KCM_LOG_CHAR_LIMIT"),
            "detail_event_limit": os.getenv("KCM_DETAIL_EVENT_LIMIT"),
            "stall_without_log_minutes": os.getenv("KCM_STALL_WITHOUT_LOG_MINUTES"),
            "stall_active_phase_minutes": os.getenv("KCM_STALL_ACTIVE_PHASE_MINUTES"),
            "critical_kernels": _csv_env("KCM_CRITICAL_KERNELS"),
        },
        "grafana": {
            "base_url": os.getenv("KCM_GRAFANA_BASE_URL"),
            "org_id": os.getenv("KCM_GRAFANA_ORG_ID"),
            "theme": os.getenv("KCM_GRAFANA_THEME"),
            "default_from": os.getenv("KCM_GRAFANA_FROM"),
            "default_to": os.getenv("KCM_GRAFANA_TO"),
            "default_refresh": os.getenv("KCM_GRAFANA_REFRESH"),
            "overview_dashboard_uid": os.getenv("KCM_GRAFANA_OVERVIEW_UID"),
            "duration_dashboard_uid": os.getenv("KCM_GRAFANA_DURATION_UID"),
            "failure_dashboard_uid": os.getenv("KCM_GRAFANA_FAILURE_UID"),
        },
        "prometheus": {
            "base_url": os.getenv("KCM_PROMETHEUS_BASE_URL"),
        },
        "pushgateway": {
            "url": os.getenv("KCM_PUSHGATEWAY_URL"),
            "job_name": os.getenv("KCM_PUSHGATEWAY_JOB_NAME"),
        },
    }
    # One generic None-filter pass instead of five copy-pasted comprehensions.
    return {
        section: {key: value for key, value in values.items() if value is not None}
        for section, values in sections.items()
    }
183
+
184
+
185
def load_config(config_path: str | Path | None = None) -> AppConfig:
    """Build the application config from YAML plus environment overrides.

    ``.env`` is loaded first so environment overrides see dotenv-provided
    values; a falsy *config_path* falls back to the bundled default path.
    """
    load_dotenv()
    target = Path(config_path) if config_path else DEFAULT_CONFIG_PATH
    file_settings = _load_yaml(target)
    return AppConfig.model_validate(_deep_merge(file_settings, _env_overrides()))
src/kc_monitor/github_client.py ADDED
@@ -0,0 +1,456 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import base64
4
+ import html
5
+ import json
6
+ import re
7
+ import subprocess
8
+ from typing import Any
9
+
10
+ from bs4 import BeautifulSoup
11
+ import httpx
12
+
13
+ from kc_monitor.models import GitHubJob, GitHubRun, parse_github_datetime, utcnow
14
+
15
+
16
class GitHubActionsError(RuntimeError):
    """Signals an unexpected or failing response from the GitHub API."""
18
+
19
+
20
+ class GitHubActionsClient:
21
+ def __init__(
22
+ self,
23
+ owner: str,
24
+ repo: str,
25
+ token: str | None = None,
26
+ request_timeout_seconds: float = 25.0,
27
+ user_agent: str = "kernels-community-monitor/0.1",
28
+ ) -> None:
29
+ if not token:
30
+ token = self._token_from_gh_cli()
31
+ headers = {
32
+ "Accept": "application/vnd.github+json",
33
+ "User-Agent": user_agent,
34
+ "X-GitHub-Api-Version": "2022-11-28",
35
+ }
36
+ if token:
37
+ headers["Authorization"] = f"Bearer {token}"
38
+
39
+ self.owner = owner
40
+ self.repo = repo
41
+ self._client = httpx.Client(
42
+ base_url="https://api.github.com",
43
+ headers=headers,
44
+ timeout=request_timeout_seconds,
45
+ follow_redirects=False,
46
+ )
47
+ self._anonymous_client = httpx.Client(
48
+ base_url="https://api.github.com",
49
+ headers={
50
+ "Accept": "application/vnd.github+json",
51
+ "User-Agent": user_agent,
52
+ "X-GitHub-Api-Version": "2022-11-28",
53
+ },
54
+ timeout=request_timeout_seconds,
55
+ follow_redirects=False,
56
+ )
57
+ self._web_client = httpx.Client(
58
+ base_url="https://github.com",
59
+ headers={"User-Agent": user_agent},
60
+ timeout=request_timeout_seconds,
61
+ follow_redirects=True,
62
+ )
63
+ self._raw_client = httpx.Client(
64
+ base_url="https://raw.githubusercontent.com",
65
+ headers={"User-Agent": user_agent},
66
+ timeout=request_timeout_seconds,
67
+ follow_redirects=True,
68
+ )
69
+
70
+ @staticmethod
71
+ def _token_from_gh_cli() -> str | None:
72
+ try:
73
+ completed = subprocess.run(
74
+ ["gh", "auth", "token"],
75
+ capture_output=True,
76
+ text=True,
77
+ check=True,
78
+ )
79
+ except (OSError, subprocess.CalledProcessError):
80
+ return None
81
+ token = completed.stdout.strip()
82
+ return token or None
83
+
84
+ def close(self) -> None:
85
+ self._client.close()
86
+ self._anonymous_client.close()
87
+ self._web_client.close()
88
+ self._raw_client.close()
89
+
90
+ @staticmethod
91
+ def _is_classic_pat_forbidden(response: httpx.Response) -> bool:
92
+ return response.status_code == 403 and "forbids access via a personal access token (classic)" in response.text
93
+
94
+ def _request_with_fallback(self, method: str, path: str, **kwargs: Any) -> httpx.Response:
95
+ response = self._client.request(method, path, **kwargs)
96
+ if self._is_classic_pat_forbidden(response):
97
+ response = self._anonymous_client.request(method, path, **kwargs)
98
+ return response
99
+
100
+ def _request(self, method: str, path: str, **kwargs: Any) -> httpx.Response:
101
+ response = self._request_with_fallback(method, path, **kwargs)
102
+ if response.status_code >= 400:
103
+ raise GitHubActionsError(
104
+ f"GitHub API request failed for {path}: {response.status_code} {response.text}"
105
+ )
106
+ return response
107
+
108
+ @staticmethod
109
+ def _should_use_public_fallback(response: httpx.Response) -> bool:
110
+ text = response.text.lower()
111
+ return response.status_code in {403, 404, 429} or "rate limit exceeded" in text
112
+
113
+ @staticmethod
114
+ def _workflow_path(workflow_file: str) -> str:
115
+ if workflow_file.startswith(".github/workflows/"):
116
+ return workflow_file
117
+ return f".github/workflows/{workflow_file}"
118
+
119
+ @staticmethod
120
+ def _parse_run_state(aria_label: str) -> tuple[str, str | None]:
121
+ normalized = aria_label.lower()
122
+ if "completed successfully" in normalized:
123
+ return "completed", "success"
124
+ if "cancel" in normalized:
125
+ return "completed", "cancelled"
126
+ if "fail" in normalized:
127
+ return "completed", "failure"
128
+ if "queued" in normalized:
129
+ return "queued", None
130
+ if "in progress" in normalized or "running" in normalized:
131
+ return "in_progress", None
132
+ return "completed", None
133
+
134
+ def _list_workflow_runs_public(self, workflow_file: str, page: int = 1) -> list[GitHubRun]:
135
+ response = self._web_client.get(
136
+ f"/{self.owner}/{self.repo}/actions/workflows/{workflow_file}",
137
+ params={"page": page},
138
+ )
139
+ response.raise_for_status()
140
+ soup = BeautifulSoup(response.text, "html.parser")
141
+ rows = soup.find_all("div", class_="Box-row")
142
+ runs: list[GitHubRun] = []
143
+ run_prefix = f"/{self.owner}/{self.repo}/actions/runs/"
144
+ branch_prefix = f"/{self.owner}/{self.repo}/tree/refs/heads/"
145
+ pull_prefix = f"/{self.owner}/{self.repo}/pull/"
146
+ workflow_path = self._workflow_path(workflow_file)
147
+
148
+ for row in rows:
149
+ run_link = next(
150
+ (a for a in row.find_all("a") if (a.get("href") or "").startswith(run_prefix)),
151
+ None,
152
+ )
153
+ if not run_link:
154
+ continue
155
+
156
+ run_href = run_link.get("href") or ""
157
+ try:
158
+ run_id = int(run_href.rstrip("/").split("/")[-1])
159
+ except ValueError:
160
+ continue
161
+
162
+ display_title = run_link.get_text(" ", strip=True)
163
+ aria_label = run_link.get("aria-label") or ""
164
+ status, conclusion = self._parse_run_state(aria_label)
165
+ relative_time = row.find("relative-time")
166
+ timestamp = parse_github_datetime(relative_time.get("datetime")) if relative_time else None
167
+ branch_link = next(
168
+ (a for a in row.find_all("a") if (a.get("href") or "").startswith(branch_prefix)),
169
+ None,
170
+ )
171
+ actor_link = next(
172
+ (
173
+ a
174
+ for a in row.find_all("a")
175
+ if (href := a.get("href") or "")
176
+ and href.startswith("/")
177
+ and not href.startswith(run_prefix)
178
+ and not href.startswith(branch_prefix)
179
+ and not href.startswith(pull_prefix)
180
+ and href.count("/") == 1
181
+ ),
182
+ None,
183
+ )
184
+ workflow_name = row.find("span", class_="text-bold")
185
+ pull_link = next(
186
+ (a for a in row.find_all("a") if (a.get("href") or "").startswith(pull_prefix)),
187
+ None,
188
+ )
189
+ event = "pull_request" if pull_link else "workflow_dispatch"
190
+ head_branch = branch_link.get_text(" ", strip=True) if branch_link else ""
191
+ actor_login = actor_link.get_text(" ", strip=True) if actor_link else None
192
+ run_time = timestamp or utcnow()
193
+ runs.append(
194
+ GitHubRun(
195
+ id=run_id,
196
+ name=workflow_name.get_text(" ", strip=True) if workflow_name else workflow_file,
197
+ display_title=display_title,
198
+ path=workflow_path,
199
+ status=status,
200
+ conclusion=conclusion,
201
+ head_branch=head_branch,
202
+ head_sha="",
203
+ event=event,
204
+ html_url=f"https://github.com{run_href}",
205
+ jobs_url=f"https://api.github.com/repos/{self.owner}/{self.repo}/actions/runs/{run_id}/jobs",
206
+ created_at=run_time,
207
+ updated_at=run_time,
208
+ run_started_at=run_time,
209
+ actor_login=actor_login,
210
+ raw={"source": "public_html"},
211
+ )
212
+ )
213
+ return runs
214
+
215
+ @staticmethod
216
+ def _runner_group_from_job_name(job_name: str) -> str | None:
217
+ match = re.search(r"\(([^)]+)\)", job_name)
218
+ if not match:
219
+ return None
220
+ parts = [part.strip() for part in match.group(1).split(",") if part.strip()]
221
+ if len(parts) < 2:
222
+ return None
223
+ return parts[1]
224
+
225
+ def _list_jobs_public(self, run_id: int) -> list[GitHubJob]:
226
+ response = self._web_client.get(f"/{self.owner}/{self.repo}/actions/runs/{run_id}")
227
+ response.raise_for_status()
228
+ soup = BeautifulSoup(response.text, "html.parser")
229
+ scripts = [
230
+ script
231
+ for script in soup.find_all("script")
232
+ if script.get("data-target") == "react-partial.embeddedData"
233
+ ]
234
+ jobs_script = next(
235
+ (
236
+ script
237
+ for script in scripts
238
+ if (parent := script.find_parent("react-partial"))
239
+ and parent.get("partial-name") == "actions-run-jobs-list"
240
+ ),
241
+ None,
242
+ )
243
+ if jobs_script is None or not jobs_script.string:
244
+ raise GitHubActionsError(f"Could not locate jobs list for run {run_id} in the public page.")
245
+
246
+ embedded = json.loads(jobs_script.string)
247
+ props = embedded.get("props") or {}
248
+ fetch_url = props.get("jobGroupsFetchUrl")
249
+ if not fetch_url:
250
+ raise GitHubActionsError(f"Public run page for {run_id} did not expose job groups fetch URL.")
251
+
252
+ batch_response = self._web_client.get(
253
+ fetch_url,
254
+ params={"batch": 0, "size": 200},
255
+ headers={"X-Requested-With": "XMLHttpRequest"},
256
+ )
257
+ batch_response.raise_for_status()
258
+ payload = batch_response.json()
259
+ jobs: list[GitHubJob] = []
260
+ run_url = f"https://github.com/{self.owner}/{self.repo}/actions/runs/{run_id}"
261
+
262
+ for group in payload.get("jobGroups") or []:
263
+ non_nested = group.get("nonNested") or {}
264
+ for job_payload in non_nested.get("jobs") or []:
265
+ job_name = job_payload.get("displayName") or group.get("name") or ""
266
+ job_href = job_payload.get("href") or ""
267
+ jobs.append(
268
+ GitHubJob(
269
+ id=job_payload["id"],
270
+ run_id=run_id,
271
+ workflow_name="",
272
+ head_branch="",
273
+ run_url=run_url,
274
+ run_attempt=1,
275
+ head_sha="",
276
+ url="",
277
+ html_url=f"https://github.com{job_href}",
278
+ status=job_payload.get("status") or "unknown",
279
+ conclusion=job_payload.get("conclusion"),
280
+ created_at=utcnow(),
281
+ started_at=None,
282
+ completed_at=None,
283
+ name=job_name,
284
+ steps=[],
285
+ runner_group_name=self._runner_group_from_job_name(job_name),
286
+ )
287
+ )
288
+ return jobs
289
+
290
+ def _list_repo_tree_paths_public(self, ref: str = "main") -> list[str]:
291
+ response = self._web_client.get(f"/{self.owner}/{self.repo}/tree/{ref}")
292
+ response.raise_for_status()
293
+ soup = BeautifulSoup(response.text, "html.parser")
294
+ prefix = f"/{self.owner}/{self.repo}/tree/{ref}/"
295
+ candidates = sorted(
296
+ {
297
+ href.removeprefix(prefix).split("/", 1)[0]
298
+ for anchor in soup.find_all("a")
299
+ if (href := anchor.get("href") or "").startswith(prefix)
300
+ and "/" not in href.removeprefix(prefix)
301
+ }
302
+ )
303
+ paths: list[str] = []
304
+ for candidate in candidates:
305
+ if candidate.startswith("."):
306
+ continue
307
+ raw_response = self._raw_client.get(f"/{self.owner}/{self.repo}/{ref}/{candidate}/build.toml")
308
+ if raw_response.status_code == 200:
309
+ paths.append(f"{candidate}/build.toml")
310
+ return paths
311
+
312
+ def _get_file_text_public(self, path: str, ref: str | None = None) -> str | None:
313
+ target_ref = ref or "main"
314
+ response = self._raw_client.get(f"/{self.owner}/{self.repo}/{target_ref}/{path}")
315
+ if response.status_code == 404:
316
+ return None
317
+ response.raise_for_status()
318
+ return response.text
319
+
320
+ def list_runs(self, per_page: int = 30, page: int = 1) -> list[GitHubRun]:
321
+ response = self._request(
322
+ "GET",
323
+ f"/repos/{self.owner}/{self.repo}/actions/runs",
324
+ params={"per_page": per_page, "page": page},
325
+ )
326
+ payload = response.json()
327
+ return [GitHubRun.from_api(item) for item in payload.get("workflow_runs") or []]
328
+
329
+ def list_workflow_runs(
330
+ self,
331
+ workflow_file: str,
332
+ per_page: int = 30,
333
+ page: int = 1,
334
+ ) -> list[GitHubRun]:
335
+ response = self._request_with_fallback(
336
+ "GET",
337
+ f"/repos/{self.owner}/{self.repo}/actions/workflows/{workflow_file}/runs",
338
+ params={"per_page": per_page, "page": page},
339
+ )
340
+ if self._should_use_public_fallback(response):
341
+ return self._list_workflow_runs_public(workflow_file, page=page)
342
+ if response.status_code >= 400:
343
+ raise GitHubActionsError(
344
+ f"GitHub API request failed for /repos/{self.owner}/{self.repo}/actions/workflows/{workflow_file}/runs: "
345
+ f"{response.status_code} {response.text}"
346
+ )
347
+ payload = response.json()
348
+ return [GitHubRun.from_api(item) for item in payload.get("workflow_runs") or []]
349
+
350
+ def list_jobs(self, run_id: int) -> list[GitHubJob]:
351
+ response = self._request_with_fallback(
352
+ "GET",
353
+ f"/repos/{self.owner}/{self.repo}/actions/runs/{run_id}/jobs",
354
+ params={"per_page": 100},
355
+ )
356
+ if self._should_use_public_fallback(response):
357
+ return self._list_jobs_public(run_id)
358
+ if response.status_code >= 400:
359
+ raise GitHubActionsError(
360
+ f"GitHub API request failed for /repos/{self.owner}/{self.repo}/actions/runs/{run_id}/jobs: "
361
+ f"{response.status_code} {response.text}"
362
+ )
363
+ payload = response.json()
364
+ return [GitHubJob.from_api(item) for item in payload.get("jobs") or []]
365
+
366
+ def list_repo_tree_paths(self, ref: str = "main") -> list[str]:
367
+ response = self._request_with_fallback(
368
+ "GET",
369
+ f"/repos/{self.owner}/{self.repo}/git/trees/{ref}",
370
+ params={"recursive": 1},
371
+ )
372
+ if self._should_use_public_fallback(response):
373
+ return self._list_repo_tree_paths_public(ref=ref)
374
+ if response.status_code >= 400:
375
+ raise GitHubActionsError(
376
+ f"GitHub API request failed for /repos/{self.owner}/{self.repo}/git/trees/{ref}: "
377
+ f"{response.status_code} {response.text}"
378
+ )
379
+ payload = response.json()
380
+ return [item["path"] for item in payload.get("tree") or [] if item.get("path")]
381
+
382
+ def get_job_logs(
383
+ self,
384
+ job_id: int,
385
+ line_limit: int = 400,
386
+ char_limit: int = 35000,
387
+ job_html_url: str | None = None,
388
+ ) -> str | None:
389
+ response = self._request_with_fallback(
390
+ "GET",
391
+ f"/repos/{self.owner}/{self.repo}/actions/jobs/{job_id}/logs",
392
+ )
393
+
394
+ if response.status_code in {301, 302, 307, 308}:
395
+ location = response.headers.get("Location")
396
+ if not location:
397
+ return None
398
+ redirected = self._anonymous_client.get(location, follow_redirects=True)
399
+ if redirected.status_code in {404, 410}:
400
+ return None
401
+ redirected.raise_for_status()
402
+ text = redirected.text
403
+ elif response.status_code in {404, 410}:
404
+ return None
405
+ elif response.status_code == 403 and job_html_url:
406
+ text = self._fetch_public_job_page(job_html_url)
407
+ elif response.status_code >= 400:
408
+ raise GitHubActionsError(
409
+ f"GitHub API request failed for /repos/{self.owner}/{self.repo}/actions/jobs/{job_id}/logs: "
410
+ f"{response.status_code} {response.text}"
411
+ )
412
+ else:
413
+ text = response.text
414
+
415
+ if not text:
416
+ return None
417
+
418
+ lines = text.splitlines()
419
+ if line_limit and len(lines) > line_limit:
420
+ lines = lines[-line_limit:]
421
+ trimmed = "\n".join(lines)
422
+ if char_limit and len(trimmed) > char_limit:
423
+ trimmed = trimmed[-char_limit:]
424
+ return trimmed
425
+
426
+ def _fetch_public_job_page(self, job_html_url: str) -> str | None:
427
+ response = self._anonymous_client.get(job_html_url, follow_redirects=True)
428
+ response.raise_for_status()
429
+ text = response.text
430
+ text = re.sub(r"(?is)<script.*?</script>", " ", text)
431
+ text = re.sub(r"(?is)<style.*?</style>", " ", text)
432
+ text = re.sub(r"(?s)<[^>]+>", "\n", text)
433
+ text = html.unescape(text)
434
+ normalized_lines = [line.strip() for line in text.splitlines() if line.strip()]
435
+ return "\n".join(normalized_lines)
436
+
437
+ def get_file_text(self, path: str, ref: str | None = None) -> str | None:
438
+ params = {"ref": ref} if ref else None
439
+ response = self._request_with_fallback(
440
+ "GET",
441
+ f"/repos/{self.owner}/{self.repo}/contents/{path}",
442
+ params=params,
443
+ )
444
+ if self._should_use_public_fallback(response):
445
+ return self._get_file_text_public(path, ref=ref)
446
+ if response.status_code >= 400:
447
+ raise GitHubActionsError(
448
+ f"GitHub API request failed for /repos/{self.owner}/{self.repo}/contents/{path}: "
449
+ f"{response.status_code} {response.text}"
450
+ )
451
+ payload = response.json()
452
+ encoded = payload.get("content")
453
+ if not encoded:
454
+ return None
455
+ content = base64.b64decode(encoded)
456
+ return content.decode("utf-8", errors="replace")
src/kc_monitor/grafana.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from urllib.parse import urlencode
5
+
6
+ from kc_monitor.config import GrafanaSettings
7
+
8
+
9
+ @dataclass(frozen=True, slots=True)
10
+ class GrafanaDashboard:
11
+ key: str
12
+ title: str
13
+ description: str
14
+ uid: str
15
+ height: int
16
+
17
+
18
def dashboard_catalog(settings: GrafanaSettings) -> list[GrafanaDashboard]:
    """Return the monitor's Grafana dashboards in display order.

    UIDs come from *settings* so deployments can point at their own copies
    of the provisioned dashboards.
    """
    specs: list[tuple[str, str, str, str, int]] = [
        (
            "overview",
            "Matrix overview",
            "Latest outcome per build matrix combo, with fast filters across kernel, backend, CUDA, PyTorch, and Python.",
            settings.overview_dashboard_uid,
            420,
        ),
        (
            "durations",
            "Duration trends",
            "Compilation and upload duration trends, so regressions show up as rising wall time instead of surprise failures.",
            settings.duration_dashboard_uid,
            460,
        ),
        (
            "failures",
            "Failure overview",
            "Current failing combinations and stale metrics signals, tuned for alert-driven triage instead of log scraping.",
            settings.failure_dashboard_uid,
            420,
        ),
    ]
    return [
        GrafanaDashboard(key=key, title=title, description=description, uid=uid, height=height)
        for key, title, description, uid, height in specs
    ]
42
+
43
+
44
def build_dashboard_url(
    settings: GrafanaSettings,
    uid: str,
    *,
    embed: bool,
) -> str:
    """Compose a Grafana dashboard URL, or "" when no base URL is configured.

    With ``embed=True`` the kiosk query parameter is added so the dashboard
    renders chrome-free inside an iframe.
    """
    root = (settings.base_url or "").rstrip("/")
    if not root:
        return ""

    params: dict[str, object] = {
        "orgId": settings.org_id,
        "from": settings.default_from,
        "to": settings.default_to,
        "theme": settings.theme,
        "refresh": settings.default_refresh,
    }
    if embed:
        params["kiosk"] = "tv"
    return f"{root}/d/{uid}/_?{urlencode(params)}"
65
+
src/kc_monitor/kernel_index.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ import tomllib
5
+
6
+ from cachetools import TTLCache
7
+
8
+ from kc_monitor.github_client import GitHubActionsClient, GitHubActionsError
9
+ from kc_monitor.models import GitHubRun, KernelInfo
10
+
11
+
12
+ PR_TITLE_RE = re.compile(r"^\s*([A-Za-z0-9_-]+)\s*:")
13
+ MANUAL_BUILD_RE = re.compile(
14
+ r"Manual Kernel Build\s*/\s*([A-Za-z0-9_-]+)\s*/",
15
+ flags=re.IGNORECASE,
16
+ )
17
+
18
+
19
class KernelIndex:
    """Resolves kernel names to Hub metadata, backed by TTL caches.

    Kernel info is read from each kernel's ``build.toml`` in the monitored
    repo; failures degrade to a best-effort fallback that points at the
    conventional ``kernels-community/<name>`` Hub repo.
    """

    def __init__(
        self,
        client: GitHubActionsClient,
        branch: str = "main",
        cache_ttl_seconds: int = 900,
    ) -> None:
        self.client = client
        self.branch = branch
        # Per-kernel info cache plus a single-entry cache for the catalog.
        self._cache: TTLCache[str, KernelInfo] = TTLCache(maxsize=256, ttl=cache_ttl_seconds)
        self._catalog_cache: TTLCache[str, list[KernelInfo]] = TTLCache(maxsize=1, ttl=cache_ttl_seconds)

    @staticmethod
    def infer_kernel_name(run: GitHubRun) -> str | None:
        """Guess which kernel a run belongs to from its title or workflow name.

        Recognizes the ``<kernel>: ...`` PR-title convention first, then the
        ``Manual Kernel Build / <kernel> / ...`` display format.
        """
        for candidate in (run.display_title, run.name):
            if not candidate:
                continue
            pr_match = PR_TITLE_RE.match(candidate)
            if pr_match:
                return pr_match.group(1)
            manual_match = MANUAL_BUILD_RE.search(candidate)
            if manual_match:
                return manual_match.group(1)
        return None

    @staticmethod
    def _fallback_kernel_info(kernel_name: str) -> KernelInfo:
        """Minimal KernelInfo assuming the conventional Hub repo layout."""
        repo_id = f"kernels-community/{kernel_name}"
        return KernelInfo(
            kernel_name=kernel_name,
            repo_id=repo_id,
            hub_url=f"https://huggingface.co/{repo_id}",
        )

    def get_kernel_info(self, kernel_name: str) -> KernelInfo:
        """Return (and cache) the kernel's metadata parsed from build.toml.

        Fetch or parse failures cache and return the fallback info, so
        transient errors don't hammer the API on every call.
        """
        cached = self._cache.get(kernel_name)
        if cached is not None:
            return cached

        fallback = self._fallback_kernel_info(kernel_name)

        try:
            content = self.client.get_file_text(f"{kernel_name}/build.toml", ref=self.branch)
        except GitHubActionsError:
            content = None
        if not content:
            self._cache[kernel_name] = fallback
            return fallback

        try:
            manifest = tomllib.loads(content)
        except tomllib.TOMLDecodeError:
            self._cache[kernel_name] = fallback
            return fallback

        general = manifest.get("general") or {}
        repo_id = (general.get("hub") or {}).get("repo-id") or fallback.repo_id
        info = KernelInfo(
            kernel_name=general.get("name") or kernel_name,
            repo_id=repo_id,
            hub_url=f"https://huggingface.co/{repo_id}",
            version=general.get("version"),
            backends=list(general.get("backends") or []),
        )
        self._cache[kernel_name] = info
        return info

    def list_kernel_catalog(self) -> list[KernelInfo]:
        """List every kernel (directories holding a build.toml), TTL-cached.

        Returns an empty, uncached list when the repo tree cannot be read,
        so the next call retries.
        """
        cached = self._catalog_cache.get("catalog")
        if cached is not None:
            return cached

        try:
            paths = self.client.list_repo_tree_paths(ref=self.branch)
        except GitHubActionsError:
            return []

        names = sorted(
            {
                path.split("/", 1)[0]
                for path in paths
                if path.endswith("/build.toml") and path.count("/") == 1
            }
        )
        catalog = [self._cache.get(name) or self._fallback_kernel_info(name) for name in names]
        self._catalog_cache["catalog"] = catalog
        return catalog
src/kc_monitor/log_parser.py ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import re
4
+
5
+ from kc_monitor.models import (
6
+ FAILING_CONCLUSIONS,
7
+ GitHubJob,
8
+ GitHubJobStep,
9
+ GitHubRun,
10
+ ParsedJobState,
11
+ ParsedLogEvent,
12
+ parse_github_datetime,
13
+ )
14
+
15
+
16
+ PHASE_LABELS = {
17
+ "queued": "Queued",
18
+ "setup": "Setup",
19
+ "validating": "Validating",
20
+ "building": "Building",
21
+ "uploading": "Uploading",
22
+ "upload_complete": "Upload complete",
23
+ "testing": "Testing",
24
+ "completed": "Completed",
25
+ "failed": "Failed",
26
+ "cancelled": "Cancelled",
27
+ "stalled": "Stalled",
28
+ }
29
+
30
+ UPLOAD_LABELS = {
31
+ "not_started": "Not started",
32
+ "running": "Running",
33
+ "completed": "Completed",
34
+ "failed": "Failed",
35
+ "skipped": "Skipped",
36
+ }
37
+
38
+ STEP_PHASE_RULES: list[tuple[re.Pattern[str], str]] = [
39
+ (re.compile(r"Set up job|checkout|nix-installer|Nix info|cachix", re.IGNORECASE), "setup"),
40
+ (re.compile(r"Validate kernel directory", re.IGNORECASE), "validating"),
41
+ (re.compile(r"Build and upload kernel|Build kernel|Build and copy kernel", re.IGNORECASE), "building"),
42
+ (re.compile(r"Upload kernel|Upload v1 kernels to main|Upload ci-test closure", re.IGNORECASE), "uploading"),
43
+ (re.compile(r"Run GPU tests", re.IGNORECASE), "testing"),
44
+ ]
45
+
46
+ TIMESTAMP_RE = re.compile(r"(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?Z)")
47
+ REPO_ID_RE = re.compile(r"--repo-id(?:=|\s+)?\"?([A-Za-z0-9._-]+/[A-Za-z0-9._-]+)\"?")
48
+ UPLOAD_START_RE = re.compile(r"(kernels\s+--\s+upload|upload\s+--repo-id|Uploading\s+[A-Za-z0-9._-]+/[A-Za-z0-9._-]+)", re.IGNORECASE)
49
+ UPLOAD_SUCCESS_RE = re.compile(r"(Upload finished|Upload complete|Committed|commit created|pushed to hub)", re.IGNORECASE)
50
+ ERROR_RE = re.compile(r"(error:|Process completed with exit code|Traceback|FAILED|fatal:)", re.IGNORECASE)
51
+
52
+
53
+ def classify_step_name(step_name: str | None) -> str | None:
54
+ if not step_name:
55
+ return None
56
+ for pattern, phase in STEP_PHASE_RULES:
57
+ if pattern.search(step_name):
58
+ return phase
59
+ return None
60
+
61
+
62
+ def _interesting_category(line: str) -> str | None:
63
+ if ERROR_RE.search(line):
64
+ return "error"
65
+ if UPLOAD_START_RE.search(line) or "upload" in line.lower():
66
+ return "upload"
67
+ if "build-and-upload" in line or "build-and-copy" in line or "nix build" in line.lower():
68
+ return "build"
69
+ if "validate kernel directory" in line.lower():
70
+ return "validation"
71
+ return None
72
+
73
+
74
class JobLogParser:
    """Derive a high-level job state (phase, upload status, notable events)
    from a GitHub Actions job payload plus its raw log text."""

    def parse(
        self,
        run: GitHubRun,
        job: GitHubJob,
        log_text: str | None,
        event_limit: int = 20,
    ) -> ParsedJobState:
        """Summarize ``job`` and its log as a ParsedJobState.

        ``run`` is accepted for interface symmetry; the current heuristics
        use only the job payload and the log lines. ``event_limit`` caps the
        number of retained log events (most recent kept).
        """
        lines = log_text.splitlines() if log_text else []
        latest_log_at = self._latest_log_timestamp(lines)
        repo_id = self._extract_repo_id(lines)
        events = self._extract_events(lines, limit=event_limit)
        failure_excerpt = self._failure_excerpt(lines) if (job.conclusion or "") in FAILING_CONCLUSIONS else None

        active_step = job.active_step or job.last_step
        step_phase = classify_step_name(active_step.name if active_step else None)
        upload_status = self._upload_status(job, lines)
        phase, reason = self._phase_for_job(job, step_phase, upload_status, lines, active_step)

        return ParsedJobState(
            phase=phase,
            phase_label=PHASE_LABELS.get(phase, phase.title()),
            phase_reason=reason,
            upload_status=upload_status,
            upload_status_label=UPLOAD_LABELS[upload_status],
            repo_id=repo_id,
            latest_log_at=latest_log_at,
            active_step_name=active_step.name if active_step else None,
            active_step_started_at=active_step.started_at if active_step else None,
            events=events,
            failure_excerpt=failure_excerpt,
        )

    def _phase_for_job(
        self,
        job: GitHubJob,
        step_phase: str | None,
        upload_status: str,
        lines: list[str],
        active_step: GitHubJobStep | None,
    ) -> tuple[str, str]:
        """Pick (phase key, human reason) for the job from steps + log markers."""
        upload_started = any(UPLOAD_START_RE.search(line) for line in lines)
        combined_step = any("Build and upload kernel" in step.name for step in job.steps)

        if job.status != "completed":
            if upload_started or upload_status == "running":
                return "uploading", "Upload command detected in the active job log."
            if step_phase:
                return step_phase, f"Current GitHub Actions step: {active_step.name}."
            return "queued", "Job is queued or still waiting for the first step to start."

        conclusion = job.conclusion or "completed"
        if conclusion == "success":
            if upload_status == "completed" or (combined_step and upload_started):
                return "upload_complete", "Build finished and upload markers were detected."
            return "completed", "Job completed successfully."

        if conclusion == "cancelled":
            return "cancelled", "GitHub marked the job as cancelled."

        if upload_status == "failed":
            return "failed", "Job failed after upload started or inside an upload step."

        return "failed", "GitHub marked the job as failed."

    def _upload_status(self, job: GitHubJob, lines: list[str]) -> str:
        """Classify upload progress from dedicated upload steps, falling back
        to log markers for combined build-and-upload steps."""
        upload_steps = [step for step in job.steps if classify_step_name(step.name) == "uploading"]
        if any(step.is_running for step in upload_steps):
            return "running"
        if any((step.conclusion or "") == "success" for step in upload_steps):
            return "completed"
        if any((step.conclusion or "") in FAILING_CONCLUSIONS for step in upload_steps):
            return "failed"
        if upload_steps and all((step.conclusion or "") == "skipped" for step in upload_steps):
            return "skipped"

        upload_started = any(UPLOAD_START_RE.search(line) for line in lines)
        upload_succeeded = any(UPLOAD_SUCCESS_RE.search(line) for line in lines)
        combined_step_success = any(
            "Build and upload kernel" in step.name and (step.conclusion or "") == "success"
            for step in job.steps
        )

        if job.status != "completed":
            return "running" if upload_started else "not_started"

        # BUGFIX: the failure check must run before the success markers.
        # Previously the "completed" branch matched any job whose log merely
        # *started* an upload, which made the "failed" branch unreachable for
        # failed jobs.
        if upload_started and not upload_succeeded and (job.conclusion or "") in FAILING_CONCLUSIONS:
            return "failed"
        if upload_succeeded or upload_started or combined_step_success:
            return "completed"
        if (job.conclusion or "") == "cancelled":
            return "skipped"
        return "not_started"

    def _latest_log_timestamp(self, lines: list[str]) -> datetime | None:
        """Newest GitHub log timestamp found in ``lines``, or None."""
        timestamps = []
        for line in lines:
            match = TIMESTAMP_RE.search(line)
            if match:
                parsed = parse_github_datetime(match.group(1))
                if parsed:
                    timestamps.append(parsed)
        return max(timestamps) if timestamps else None

    def _extract_repo_id(self, lines: list[str]) -> str | None:
        """First `--repo-id owner/name` value seen in the log, or None."""
        for line in lines:
            match = REPO_ID_RE.search(line)
            if match:
                return match.group(1)
        return None

    def _extract_events(self, lines: list[str], limit: int) -> list[ParsedLogEvent]:
        """Collect categorized log events, keeping only the last ``limit``."""
        events: list[ParsedLogEvent] = []
        for index, line in enumerate(lines, start=1):
            category = _interesting_category(line)
            if not category:
                continue
            timestamp = None
            match = TIMESTAMP_RE.search(line)
            if match:
                timestamp = parse_github_datetime(match.group(1))
            events.append(
                ParsedLogEvent(
                    category=category,
                    message=line.strip(),
                    line_number=index,
                    timestamp=timestamp,
                )
            )
        return events[-limit:]

    def _failure_excerpt(self, lines: list[str]) -> str | None:
        """Short failure context: last 8 error lines, else last 10 non-empty lines."""
        if not lines:
            return None

        failure_lines = [line.strip() for line in lines if line.strip() and ERROR_RE.search(line)]
        if failure_lines:
            return "\n".join(failure_lines[-8:])

        non_empty = [line.strip() for line in lines if line.strip()]
        if not non_empty:
            return None
        return "\n".join(non_empty[-10:])
src/kc_monitor/metrics_push.py ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from datetime import datetime, timezone
5
+ import time
6
+ from typing import Mapping
7
+ from urllib.parse import quote
8
+
9
+ import httpx
10
+
11
+
12
+ GROUPING_LABEL_ORDER = (
13
+ "kernel",
14
+ "backend",
15
+ "compute_backend",
16
+ "cuda_version",
17
+ "pytorch_version",
18
+ "python_version",
19
+ )
20
+
21
+ METRIC_LABEL_ORDER = (
22
+ "repository",
23
+ "workflow",
24
+ "branch",
25
+ "job",
26
+ "runner_os",
27
+ "runner_arch",
28
+ )
29
+
30
+ RESULT_CODE_BY_STATUS = {
31
+ "success": 0,
32
+ "cancelled": 1,
33
+ "skipped": 1,
34
+ "neutral": 1,
35
+ "failure": 2,
36
+ "timed_out": 2,
37
+ "startup_failure": 2,
38
+ "action_required": 2,
39
+ }
40
+
41
+
42
+ def _coalesce(value: str | None, default: str = "unknown") -> str:
43
+ cleaned = (value or "").strip()
44
+ return cleaned or default
45
+
46
+
47
+ def _escape_label_value(value: str) -> str:
48
+ return value.replace("\\", "\\\\").replace("\n", "\\n").replace('"', '\\"')
49
+
50
+
51
+ def _parse_unix_or_iso(value: str) -> float:
52
+ raw = value.strip()
53
+ try:
54
+ return float(raw)
55
+ except ValueError:
56
+ normalized = raw.replace("Z", "+00:00")
57
+ return datetime.fromisoformat(normalized).astimezone(timezone.utc).timestamp()
58
+
59
+
60
+ def resolve_duration_seconds(env: Mapping[str, str], completed_at_seconds: float) -> float:
61
+ explicit_duration = env.get("KCM_BUILD_DURATION_SECONDS")
62
+ if explicit_duration:
63
+ return max(float(explicit_duration), 0.0)
64
+
65
+ started_at = env.get("KCM_JOB_STARTED_AT")
66
+ if not started_at:
67
+ return 0.0
68
+
69
+ started_at_seconds = _parse_unix_or_iso(started_at)
70
+ return max(completed_at_seconds - started_at_seconds, 0.0)
71
+
72
+
73
+ def result_code_for_status(status: str) -> int:
74
+ return RESULT_CODE_BY_STATUS.get(status.strip().lower(), 3)
75
+
76
+
77
@dataclass(frozen=True, slots=True)
class BuildMetricSample:
    """One build's worth of metrics, ready to be pushed to the Pushgateway."""

    grouping_key: dict[str, str]  # Pushgateway grouping labels (URL path)
    metric_labels: dict[str, str]  # labels attached to every metric line
    duration_seconds: float
    completed_at_seconds: int
    result_code: int  # 0 ok / 1 neutral / 2 failure / 3 unknown
    failed: int  # 1 iff result_code == 2
    result: str  # normalized conclusion string

    @classmethod
    def from_env(
        cls,
        env: Mapping[str, str],
        *,
        completed_at_seconds: int | None = None,
    ) -> "BuildMetricSample":
        """Assemble a sample from GitHub Actions / KCM environment variables.

        ``completed_at_seconds`` defaults to the current time.
        """
        completed_at = completed_at_seconds or int(time.time())
        result = _coalesce(env.get("KCM_JOB_STATUS") or env.get("JOB_STATUS")).lower()
        code = result_code_for_status(result)

        grouping_sources = {
            "kernel": "KCM_KERNEL",
            "backend": "KCM_BACKEND",
            "compute_backend": "KCM_COMPUTE_BACKEND",
            "cuda_version": "KCM_CUDA_VERSION",
            "pytorch_version": "KCM_PYTORCH_VERSION",
            "python_version": "KCM_PYTHON_VERSION",
        }
        grouping_key = {
            label: _coalesce(env.get(variable))
            for label, variable in grouping_sources.items()
        }

        metric_labels = {
            "repository": _coalesce(env.get("GITHUB_REPOSITORY")),
            "workflow": _coalesce(env.get("GITHUB_WORKFLOW")),
            # Branch name: push events set GITHUB_REF_NAME, PRs set
            # GITHUB_HEAD_REF; the raw ref is the last resort.
            "branch": _coalesce(
                env.get("GITHUB_REF_NAME")
                or env.get("GITHUB_HEAD_REF")
                or env.get("GITHUB_REF")
            ),
            "job": _coalesce(env.get("GITHUB_JOB")),
            "runner_os": _coalesce(env.get("RUNNER_OS")),
            "runner_arch": _coalesce(env.get("RUNNER_ARCH")),
        }

        return cls(
            grouping_key=grouping_key,
            metric_labels=metric_labels,
            duration_seconds=resolve_duration_seconds(env, completed_at),
            completed_at_seconds=completed_at,
            result_code=code,
            failed=1 if code == 2 else 0,
            result=result,
        )
127
+
128
+
129
def build_pushgateway_url(base_url: str, job_name: str, grouping_key: Mapping[str, str]) -> str:
    """Assemble the Pushgateway PUT URL with grouping labels in a fixed order.

    NOTE(review): values are percent-encoded with quote(safe=""); the
    Pushgateway requires base64 (`@base64`) encoding for values containing
    "/" — confirm grouping values never contain slashes.
    """
    segments = [base_url.rstrip("/"), "metrics", "job", quote(job_name, safe="")]
    for label in GROUPING_LABEL_ORDER:
        segments.append(quote(label, safe=""))
        segments.append(quote(grouping_key[label], safe=""))
    return "/".join(segments)
135
+
136
+
137
def format_prometheus_metrics(sample: BuildMetricSample) -> str:
    """Render the sample as Prometheus text-format gauges (single trailing newline)."""
    label_blob = ",".join(
        f'{key}="{_escape_label_value(sample.metric_labels[key])}"'
        for key in METRIC_LABEL_ORDER
    )
    # The info metric additionally carries the raw result string as a label.
    info_labels = f'{label_blob},result="{_escape_label_value(sample.result)}"'

    rows: list[str] = []
    for metric, rendered_value in (
        ("kc_build_last_run_result_code", f"{sample.result_code}"),
        ("kc_build_last_run_failed", f"{sample.failed}"),
        ("kc_build_last_run_duration_seconds", f"{sample.duration_seconds:.3f}"),
        ("kc_build_last_run_timestamp_seconds", f"{sample.completed_at_seconds}"),
    ):
        rows.append(f"# TYPE {metric} gauge")
        rows.append(f"{metric}{{{label_blob}}} {rendered_value}")
    rows.append("# TYPE kc_build_last_run_info gauge")
    rows.append(f"kc_build_last_run_info{{{info_labels}}} 1")
    return "\n".join(rows) + "\n"
160
+
161
+
162
def push_build_metrics(
    sample: BuildMetricSample,
    *,
    pushgateway_url: str,
    job_name: str,
    timeout_seconds: float = 10.0,
    max_attempts: int = 3,
) -> str:
    """PUT the sample to the Pushgateway, retrying with linear backoff.

    Returns the final URL on success; re-raises the last httpx error once
    ``max_attempts`` attempts have failed.
    """
    target = build_pushgateway_url(pushgateway_url, job_name, sample.grouping_key)
    body = format_prometheus_metrics(sample).encode("utf-8")
    headers = {"Content-Type": "text/plain; version=0.0.4; charset=utf-8"}

    with httpx.Client(timeout=timeout_seconds) as client:
        for attempt in range(1, max_attempts + 1):
            try:
                response = client.put(target, content=body, headers=headers)
                response.raise_for_status()
            except httpx.HTTPError:
                if attempt == max_attempts:
                    raise
                # Linear backoff: 0.5s, 1.0s, ...
                time.sleep(0.5 * attempt)
            else:
                return target
    return target
190
+
src/kc_monitor/models.py ADDED
@@ -0,0 +1,342 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass, field
4
+ from datetime import datetime, timezone
5
+ from typing import Any
6
+
7
+ from dateutil import parser as date_parser
8
+
9
+
10
# Job/step conclusions that the monitor treats as failures.
FAILING_CONCLUSIONS = {"failure", "timed_out", "cancelled", "startup_failure"}
11
+
12
+
13
def utcnow() -> datetime:
    """Return the current time as a timezone-aware UTC datetime."""
    return datetime.now(tz=timezone.utc)
15
+
16
+
17
def parse_github_datetime(value: str | None) -> datetime | None:
    """Parse a GitHub API ISO-8601 timestamp into aware UTC; None/empty stays None."""
    if value:
        return date_parser.isoparse(value).astimezone(timezone.utc)
    return None
21
+
22
+
23
+ @dataclass(slots=True)
24
+ class WorkflowTarget:
25
+ path: str
26
+ label: str
27
+ enabled: bool = True
28
+
29
+ @property
30
+ def basename(self) -> str:
31
+ return self.path.rsplit("/", 1)[-1]
32
+
33
+
34
@dataclass(slots=True)
class GitHubRun:
    """A GitHub Actions workflow run, normalized from the REST payload."""

    id: int
    name: str
    display_title: str
    path: str  # workflow file path, e.g. ".github/workflows/build.yml"
    status: str  # e.g. queued / in_progress / completed
    conclusion: str | None  # set only once the run has completed
    head_branch: str
    head_sha: str
    event: str  # trigger event (push, workflow_dispatch, ...)
    html_url: str
    jobs_url: str
    created_at: datetime
    updated_at: datetime
    run_started_at: datetime | None
    actor_login: str | None = None
    raw: dict[str, Any] = field(default_factory=dict)  # original payload kept for debugging

    @classmethod
    def from_api(cls, payload: dict[str, Any]) -> "GitHubRun":
        """Build a run from the raw REST payload, defaulting missing fields."""
        actor = payload.get("actor") or {}
        return cls(
            id=payload["id"],
            name=payload.get("name") or "",
            display_title=payload.get("display_title") or payload.get("name") or "",
            path=payload.get("path") or "",
            status=payload.get("status") or "unknown",
            conclusion=payload.get("conclusion"),
            head_branch=payload.get("head_branch") or "",
            head_sha=payload.get("head_sha") or "",
            event=payload.get("event") or "",
            html_url=payload.get("html_url") or "",
            jobs_url=payload.get("jobs_url") or "",
            # Missing timestamps fall back to "now" so sorting never breaks.
            created_at=parse_github_datetime(payload.get("created_at")) or utcnow(),
            updated_at=parse_github_datetime(payload.get("updated_at")) or utcnow(),
            run_started_at=parse_github_datetime(payload.get("run_started_at")),
            actor_login=actor.get("login"),
            raw=payload,
        )

    @property
    def is_active(self) -> bool:
        """True while the run has not completed."""
        return self.status != "completed"

    @property
    def sort_time(self) -> datetime:
        """Timestamp used for ordering: actual start when known, else creation."""
        return self.run_started_at or self.created_at
82
+
83
+
84
+ @dataclass(slots=True)
85
+ class GitHubJobStep:
86
+ name: str
87
+ status: str
88
+ conclusion: str | None
89
+ number: int
90
+ started_at: datetime | None
91
+ completed_at: datetime | None
92
+
93
+ @classmethod
94
+ def from_api(cls, payload: dict[str, Any]) -> "GitHubJobStep":
95
+ return cls(
96
+ name=payload.get("name") or "",
97
+ status=payload.get("status") or "unknown",
98
+ conclusion=payload.get("conclusion"),
99
+ number=payload.get("number") or 0,
100
+ started_at=parse_github_datetime(payload.get("started_at")),
101
+ completed_at=parse_github_datetime(payload.get("completed_at")),
102
+ )
103
+
104
+ @property
105
+ def is_running(self) -> bool:
106
+ return self.status != "completed"
107
+
108
+ @property
109
+ def is_failed(self) -> bool:
110
+ return (self.conclusion or "") in FAILING_CONCLUSIONS
111
+
112
+ @property
113
+ def duration_seconds(self) -> float | None:
114
+ if not self.started_at:
115
+ return None
116
+ end = self.completed_at or utcnow()
117
+ return max((end - self.started_at).total_seconds(), 0.0)
118
+
119
+
120
@dataclass(slots=True)
class GitHubJob:
    """A GitHub Actions job within a run, normalized from the REST payload."""

    id: int
    run_id: int
    workflow_name: str
    head_branch: str
    run_url: str
    run_attempt: int
    head_sha: str
    url: str  # API URL for this job
    html_url: str  # browser URL for this job
    status: str  # e.g. queued / in_progress / completed
    conclusion: str | None  # set only once the job has completed
    created_at: datetime
    started_at: datetime | None
    completed_at: datetime | None
    name: str
    steps: list[GitHubJobStep]
    runner_name: str | None = None
    runner_group_name: str | None = None

    @classmethod
    def from_api(cls, payload: dict[str, Any]) -> "GitHubJob":
        """Build a job (including its steps) from the raw REST payload."""
        steps = [GitHubJobStep.from_api(item) for item in payload.get("steps") or []]
        return cls(
            id=payload["id"],
            run_id=payload.get("run_id") or 0,
            workflow_name=payload.get("workflow_name") or "",
            head_branch=payload.get("head_branch") or "",
            run_url=payload.get("run_url") or "",
            run_attempt=payload.get("run_attempt") or 1,
            head_sha=payload.get("head_sha") or "",
            url=payload.get("url") or "",
            html_url=payload.get("html_url") or "",
            status=payload.get("status") or "unknown",
            conclusion=payload.get("conclusion"),
            # Missing creation time falls back to "now" so sorting never breaks.
            created_at=parse_github_datetime(payload.get("created_at")) or utcnow(),
            started_at=parse_github_datetime(payload.get("started_at")),
            completed_at=parse_github_datetime(payload.get("completed_at")),
            name=payload.get("name") or "",
            steps=steps,
            runner_name=payload.get("runner_name"),
            runner_group_name=payload.get("runner_group_name"),
        )

    @property
    def is_active(self) -> bool:
        """True while the job has not completed."""
        return self.status != "completed"

    @property
    def active_step(self) -> GitHubJobStep | None:
        """First step still running, or None when none are."""
        for step in self.steps:
            if step.is_running:
                return step
        return None

    @property
    def last_step(self) -> GitHubJobStep | None:
        """Final step in the list, or None when the job has no steps."""
        return self.steps[-1] if self.steps else None

    @property
    def duration_seconds(self) -> float | None:
        """Elapsed seconds (now() while running); None before the job starts."""
        if not self.started_at:
            return None
        end = self.completed_at or utcnow()
        return max((end - self.started_at).total_seconds(), 0.0)
186
+
187
+
188
@dataclass(slots=True)
class KernelInfo:
    """Metadata for a kernel, resolved from its build.toml or synthesized as a fallback."""

    kernel_name: str  # directory name of the kernel in the source repo
    repo_id: str  # Hugging Face Hub repo id, e.g. "kernels-community/<name>"
    hub_url: str  # full URL to the Hub repo
    version: int | None = None  # NOTE(review): filled from build.toml "version" — confirm it is an int
    backends: list[str] = field(default_factory=list)  # backends declared in build.toml
195
+
196
+
197
@dataclass(slots=True)
class ParsedLogEvent:
    """A single noteworthy log line surfaced by the log parser."""

    category: str  # "error" / "upload" / "build" / "validation"
    message: str  # the stripped log line
    line_number: int  # 1-based position within the log
    timestamp: datetime | None = None  # parsed from the line, when present
203
+
204
+
205
@dataclass(slots=True)
class ParsedJobState:
    """Summary of one job's log: derived phase, upload state and notable events."""

    phase: str  # machine phase key (a key of log_parser.PHASE_LABELS)
    phase_label: str  # human-readable phase
    phase_reason: str  # one-sentence explanation for the chosen phase
    upload_status: str  # not_started / running / completed / failed / skipped
    upload_status_label: str  # human-readable upload status
    repo_id: str | None  # Hub repo id extracted from the log, when present
    latest_log_at: datetime | None  # newest timestamp seen in the log
    active_step_name: str | None
    active_step_started_at: datetime | None
    events: list[ParsedLogEvent] = field(default_factory=list)
    failure_excerpt: str | None = None  # log tail, only set for failing jobs
218
+
219
+
220
@dataclass(slots=True)
class MonitorRecord:
    """One monitored (run, job) pair enriched with parsed log state."""

    key: str  # unique identifier for this record
    kernel_name: str
    critical: bool  # kernel is on the configured critical list
    kernel_info: KernelInfo
    workflow_name: str
    workflow_path: str
    run: GitHubRun
    job: GitHubJob
    phase: str  # machine phase key (a key of log_parser.PHASE_LABELS)
    phase_label: str
    phase_reason: str
    upload_status: str  # not_started / running / completed / failed / skipped
    upload_status_label: str
    arch: str
    runner_group: str | None
    suspected_stalled: bool
    stall_reason: str | None
    latest_signal_at: datetime | None  # most recent activity observed for the job
    events: list[ParsedLogEvent] = field(default_factory=list)
    failure_excerpt: str | None = None
    active_step_name: str | None = None
    active_step_started_at: datetime | None = None

    @property
    def is_active(self) -> bool:
        """Mirrors the underlying job's active state."""
        return self.job.is_active

    @property
    def started_at(self) -> datetime | None:
        """Job start time, falling back to the run's start time."""
        return self.job.started_at or self.run.run_started_at

    @property
    def completed_at(self) -> datetime | None:
        """Job completion time (None while still running)."""
        return self.job.completed_at

    @property
    def elapsed_seconds(self) -> float | None:
        """Seconds from start to completion, or to now() while running."""
        start = self.started_at
        if not start:
            return None
        end = self.completed_at or utcnow()
        return max((end - start).total_seconds(), 0.0)
264
+
265
+
266
@dataclass(slots=True)
class KernelRunGroup:
    """All monitored job records belonging to one workflow run of one kernel."""

    kernel_name: str
    run: GitHubRun
    workflow_name: str
    records: list[MonitorRecord]

    @property
    def is_active(self) -> bool:
        """True when at least one job in the run has not completed."""
        return any(item.is_active for item in self.records)

    @property
    def has_failure(self) -> bool:
        """True when any job concluded with a failing conclusion."""
        for item in self.records:
            if (item.job.conclusion or "") in FAILING_CONCLUSIONS:
                return True
        return False

    @property
    def has_stall(self) -> bool:
        """True when any record was flagged as suspected stalled."""
        return any(item.suspected_stalled for item in self.records)

    @property
    def has_uploading(self) -> bool:
        """True when any record is currently uploading."""
        return any(item.upload_status == "running" for item in self.records)

    @property
    def triggered_at(self) -> datetime:
        """When the run started, falling back to its creation time."""
        return self.run.run_started_at or self.run.created_at

    @property
    def latest_update_at(self) -> datetime:
        """Newest signal across records, else the run's updated_at."""
        signals = [item.latest_signal_at for item in self.records if item.latest_signal_at]
        return max(signals) if signals else self.run.updated_at
299
+
300
+
301
@dataclass(slots=True)
class KernelRow:
    """Dashboard table row summarizing one kernel."""

    kernel_name: str
    kernel_info: KernelInfo
    critical: bool
    current_group: KernelRunGroup | None  # currently active run, if any
    recent_groups: list[KernelRunGroup]  # presumably newest first — primary_group uses index 0
    row_status_kind: str
    row_status_label: str
    row_reason: str
    upload_label: str
    last_triggered_at: datetime | None

    @property
    def primary_group(self) -> KernelRunGroup | None:
        """The active group when present, otherwise the first recent group."""
        if self.current_group is not None:
            return self.current_group
        if self.recent_groups:
            return self.recent_groups[0]
        return None

    @property
    def recent_run_count(self) -> int:
        """Number of recent run groups tracked for this kernel."""
        return len(self.recent_groups)
323
+
324
+
325
@dataclass(slots=True)
class DashboardSummary:
    """Headline counters shown at the top of the dashboard."""

    tracked_kernels: int = 0
    active_builds: int = 0
    uploading_builds: int = 0
    stalled_builds: int = 0
    failed_builds: int = 0
    completed_uploads: int = 0
333
+
334
+
335
@dataclass(slots=True)
class DashboardSnapshot:
    """Complete dashboard state produced by one refresh cycle."""

    generated_at: datetime
    summary: DashboardSummary
    kernel_rows: list[KernelRow]
    active_records: list[MonitorRecord]  # records whose jobs are still running
    recent_records: list[MonitorRecord]  # most recent records, bounded by config
    errors: list[str] = field(default_factory=list)  # non-fatal collection errors
src/kc_monitor/service.py ADDED
@@ -0,0 +1,572 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from collections import defaultdict
4
+ import re
5
+ from datetime import timedelta
6
+
7
+ from cachetools import TTLCache
8
+
9
+ from kc_monitor.config import AppConfig
10
+ from kc_monitor.github_client import GitHubActionsClient
11
+ from kc_monitor.kernel_index import KernelIndex
12
+ from kc_monitor.log_parser import JobLogParser, classify_step_name
13
+ from kc_monitor.models import (
14
+ DashboardSnapshot,
15
+ DashboardSummary,
16
+ FAILING_CONCLUSIONS,
17
+ GitHubJob,
18
+ GitHubJobStep,
19
+ GitHubRun,
20
+ KernelInfo,
21
+ KernelRow,
22
+ KernelRunGroup,
23
+ MonitorRecord,
24
+ utcnow,
25
+ )
26
+ from kc_monitor.stall_detector import detect_stall
27
+
28
+
29
# Captures the first comma-delimited token inside parentheses — presumably the
# architecture segment of a job name like "build (x86_64-linux, ...)"; confirm
# against the workflow's job naming scheme.
ARCH_RE = re.compile(r"\(([^,]+),")
30
+
31
+
32
+ class MonitorService:
33
    def __init__(
        self,
        config: AppConfig,
        client: GitHubActionsClient | None = None,
        parser: JobLogParser | None = None,
        kernel_index: KernelIndex | None = None,
    ) -> None:
        """Wire up the monitor's collaborators; injected instances win over defaults."""
        self.config = config
        # Default client talks to the configured GitHub repository.
        self.client = client or GitHubActionsClient(
            owner=config.github.owner,
            repo=config.github.repo,
            token=config.github.token,
            request_timeout_seconds=config.github.request_timeout_seconds,
            user_agent=config.github.user_agent,
        )
        self.parser = parser or JobLogParser()
        self.kernel_index = kernel_index or KernelIndex(self.client, branch=config.github.branch)
        # Single-entry snapshot cache; a TTL floor of 5s guards against a
        # misconfigured zero/negative TTL.
        self._snapshot_cache: TTLCache[str, DashboardSnapshot] = TTLCache(
            maxsize=1,
            ttl=max(5, config.monitor.snapshot_ttl_seconds),
        )
        # Map workflow path -> display label, plus the set of tracked paths.
        self._workflow_labels = {
            workflow.path: workflow.label for workflow in config.workflow_targets
        }
        self._workflow_paths = set(self._workflow_labels)
58
+
59
    def close(self) -> None:
        """Release the underlying GitHub HTTP client."""
        self.client.close()
61
+
62
+ def get_snapshot(self, force_refresh: bool = False) -> DashboardSnapshot:
63
+ if not force_refresh and "snapshot" in self._snapshot_cache:
64
+ return self._snapshot_cache["snapshot"]
65
+
66
+ snapshot = self._build_snapshot()
67
+ self._snapshot_cache["snapshot"] = snapshot
68
+ return snapshot
69
+
70
+ def _build_snapshot(self) -> DashboardSnapshot:
71
+ errors: list[str] = []
72
+ records: list[MonitorRecord] = []
73
+
74
+ kernel_catalog = self.kernel_index.list_kernel_catalog()
75
+ catalog_names = {info.kernel_name for info in kernel_catalog}
76
+ selected_runs = self._collect_runs(catalog_names, errors)
77
+
78
+ if not selected_runs and not errors:
79
+ errors.append("No kernel runs returned from any tracked workflow.")
80
+
81
+ needs_job_detail: set[int] = {run.id for run in selected_runs}
82
+ for run in selected_runs:
83
+ if run.id in needs_job_detail:
84
+ try:
85
+ jobs = self.client.list_jobs(run.id)
86
+ except Exception as exc: # noqa: BLE001
87
+ errors.append(f"Run {run.id}: {exc}")
88
+ records.append(self._build_lightweight_record(run))
89
+ continue
90
+ for job in jobs:
91
+ try:
92
+ records.append(self._build_record(run, job))
93
+ except Exception as exc: # noqa: BLE001
94
+ errors.append(f"Job {job.id}: {exc}")
95
+ else:
96
+ records.append(self._build_lightweight_record(run))
97
+
98
+ records.sort(key=self._record_sort_key)
99
+ active_records = [record for record in records if record.is_active]
100
+ recent_records = records[: self.config.monitor.recent_limit]
101
+ kernel_rows = self._build_kernel_rows(records)
102
+
103
+ summary = DashboardSummary(
104
+ tracked_kernels=len(kernel_rows),
105
+ active_builds=sum(1 for row in kernel_rows if row.current_group is not None),
106
+ uploading_builds=sum(
107
+ 1 for row in kernel_rows if row.current_group is not None and row.current_group.has_uploading
108
+ ),
109
+ stalled_builds=sum(1 for row in kernel_rows if row.row_status_kind == "stalled"),
110
+ failed_builds=sum(
111
+ 1 for row in kernel_rows if any(group.has_failure for group in row.recent_groups)
112
+ ),
113
+ completed_uploads=sum(
114
+ 1
115
+ for row in kernel_rows
116
+ if any(
117
+ any(record.upload_status == "completed" for record in group.records)
118
+ for group in row.recent_groups
119
+ )
120
+ ),
121
+ )
122
+
123
+ return DashboardSnapshot(
124
+ generated_at=utcnow(),
125
+ summary=summary,
126
+ kernel_rows=kernel_rows,
127
+ active_records=active_records,
128
+ recent_records=recent_records,
129
+ errors=errors,
130
+ )
131
+
132
    def _collect_runs(
        self,
        catalog_names: set[str],
        errors: list[str],
    ) -> list[GitHubRun]:
        """Gather the runs to monitor: every active run plus, per workflow, the
        latest completed run for each kernel. Fetch errors are appended to
        ``errors`` and the affected workflow is skipped."""
        latest_by_workflow_kernel: dict[tuple[str, str], GitHubRun] = {}
        active_runs: dict[int, GitHubRun] = {}
        per_page = max(1, self.config.monitor.workflow_run_page_size)
        max_pages = max(1, self.config.monitor.workflow_run_pages)

        for workflow in self.config.workflow_targets:
            seen_for_workflow: set[str] = set()
            for page in range(1, max_pages + 1):
                try:
                    workflow_runs = self.client.list_workflow_runs(
                        workflow.basename,
                        per_page=per_page,
                        page=page,
                    )
                except Exception as exc:  # noqa: BLE001
                    errors.append(f"Workflow {workflow.label}: {exc}")
                    break

                if not workflow_runs:
                    break

                for run in workflow_runs:
                    # Drop runs whose workflow path is not tracked.
                    if run.path not in self._workflow_paths:
                        continue

                    kernel = KernelIndex.infer_kernel_name(run)
                    if not kernel:
                        continue
                    if catalog_names and kernel not in catalog_names:
                        continue

                    seen_for_workflow.add(kernel)
                    if run.is_active:
                        active_runs[run.id] = run
                        continue

                    # First completed run kept per (workflow, kernel) —
                    # assumes the API yields runs newest-first; confirm.
                    key = (workflow.path, kernel)
                    if key not in latest_by_workflow_kernel:
                        latest_by_workflow_kernel[key] = run

                # A short page means there are no further pages.
                if len(workflow_runs) < per_page:
                    break
                # Stop paging early once every catalog kernel has been seen.
                if catalog_names and seen_for_workflow >= catalog_names:
                    break

        selected = list(active_runs.values())
        selected.extend(latest_by_workflow_kernel.values())
        # Deduplicate by run id, then order active runs first, newest first.
        deduped = {run.id: run for run in selected}
        return sorted(deduped.values(), key=lambda run: (0 if run.is_active else 1, -run.sort_time.timestamp()))
186
+
187
+ def _filter_runs(self, runs: list[GitHubRun]) -> list[GitHubRun]:
188
+ now = utcnow()
189
+ cutoff = now - timedelta(hours=self.config.monitor.recent_completed_hours)
190
+ filtered: list[GitHubRun] = []
191
+ completed_counts: dict[str, int] = {}
192
+ for run in runs:
193
+ if run.path not in self._workflow_paths:
194
+ continue
195
+ if run.is_active:
196
+ filtered.append(run)
197
+ continue
198
+
199
+ if run.updated_at < cutoff:
200
+ continue
201
+
202
+ seen = completed_counts.get(run.path, 0)
203
+ if seen >= self.config.monitor.completed_runs_per_workflow:
204
+ continue
205
+
206
+ completed_counts[run.path] = seen + 1
207
+ filtered.append(run)
208
+ return filtered
209
+
210
+ def _build_lightweight_record(self, run: GitHubRun) -> MonitorRecord:
211
+ kernel_name = KernelIndex.infer_kernel_name(run) or "unknown"
212
+ kernel_info = self._kernel_info_for(kernel_name, None)
213
+ critical = kernel_name in self.config.monitor.critical_kernel_set
214
+ conclusion = run.conclusion or ""
215
+
216
+ if conclusion == "success":
217
+ phase, phase_label = "completed", "Completed"
218
+ elif conclusion == "failure":
219
+ phase, phase_label = "failed", "Failed"
220
+ elif conclusion == "cancelled":
221
+ phase, phase_label = "cancelled", "Cancelled"
222
+ elif run.is_active:
223
+ phase, phase_label = "running", "Running"
224
+ else:
225
+ phase, phase_label = "completed", conclusion.title() or "Done"
226
+
227
+ stub_job = GitHubJob(
228
+ id=0, run_id=run.id, workflow_name=run.name, head_branch=run.head_branch,
229
+ run_url=run.html_url, run_attempt=1, head_sha=run.head_sha, url="",
230
+ html_url=run.html_url, status=run.status, conclusion=run.conclusion,
231
+ created_at=run.created_at, started_at=run.run_started_at,
232
+ completed_at=run.updated_at, name=run.name, steps=[],
233
+ )
234
+
235
+ return MonitorRecord(
236
+ key=f"{run.id}:0",
237
+ kernel_name=kernel_name,
238
+ critical=critical,
239
+ kernel_info=kernel_info,
240
+ workflow_name=self._workflow_labels.get(run.path, run.name),
241
+ workflow_path=run.path,
242
+ run=run,
243
+ job=stub_job,
244
+ phase=phase,
245
+ phase_label=phase_label,
246
+ phase_reason=f"Run {conclusion or run.status} (summary only).",
247
+ upload_status="not_started",
248
+ upload_status_label="Unknown",
249
+ arch="all",
250
+ runner_group=None,
251
+ suspected_stalled=False,
252
+ stall_reason=None,
253
+ latest_signal_at=run.updated_at,
254
+ )
255
+
256
    def _build_record(self, run: GitHubRun, job: GitHubJob) -> MonitorRecord:
        """Build a fully-detailed record for one job of a workflow run.

        Downloads logs only for active or failing jobs, parses them for phase
        and upload progress, then runs stall detection on the assembled
        record.
        """
        job = self._normalize_job(run, job)
        log_text = None
        if self._should_fetch_logs(job):
            log_text = self.client.get_job_logs(
                job.id,
                line_limit=self.config.monitor.log_line_limit,
                char_limit=self.config.monitor.log_char_limit,
                job_html_url=job.html_url,
            )

        parsed = self.parser.parse(
            run,
            job,
            log_text,
            event_limit=self.config.monitor.detail_event_limit,
        )

        kernel_name = KernelIndex.infer_kernel_name(run) or "unknown"
        kernel_info = self._kernel_info_for(kernel_name, parsed.repo_id)
        # Freshest heartbeat available: log timestamp, then run update time,
        # then the job's start time.
        latest_signal_at = parsed.latest_log_at or run.updated_at or job.started_at
        critical = kernel_name in self.config.monitor.critical_kernel_set

        record = MonitorRecord(
            key=f"{run.id}:{job.id}",
            kernel_name=kernel_name,
            critical=critical,
            kernel_info=kernel_info,
            workflow_name=self._workflow_labels.get(run.path, run.name),
            workflow_path=run.path,
            run=run,
            job=job,
            phase=parsed.phase,
            phase_label=parsed.phase_label,
            phase_reason=parsed.phase_reason,
            upload_status=parsed.upload_status,
            upload_status_label=parsed.upload_status_label,
            arch=self._extract_arch(job.name),
            runner_group=job.runner_group_name,
            suspected_stalled=False,
            stall_reason=None,
            latest_signal_at=latest_signal_at,
            events=parsed.events,
            failure_excerpt=parsed.failure_excerpt,
            active_step_name=parsed.active_step_name,
            active_step_started_at=parsed.active_step_started_at,
        )
        # Stall detection inspects the whole record, so it runs after
        # construction and mutates the two stall fields in place.
        stalled, stall_reason = detect_stall(record, self.config.monitor)
        record.suspected_stalled = stalled
        record.stall_reason = stall_reason
        return record
307
+
308
+ @staticmethod
309
+ def _normalize_job(run: GitHubRun, job: GitHubJob) -> GitHubJob:
310
+ if job.steps:
311
+ return job
312
+
313
+ started_at = run.run_started_at or run.created_at
314
+ completed_at = None if job.is_active else run.updated_at
315
+ synthetic_steps: list[GitHubJobStep] = []
316
+
317
+ if run.path.endswith("build-release.yaml"):
318
+ synthetic_steps.append(
319
+ GitHubJobStep(
320
+ name="Build and upload kernel",
321
+ status=job.status,
322
+ conclusion=job.conclusion,
323
+ number=1,
324
+ started_at=started_at,
325
+ completed_at=completed_at,
326
+ )
327
+ )
328
+ if (job.conclusion or "") == "success":
329
+ synthetic_steps.append(
330
+ GitHubJobStep(
331
+ name="Upload v1 kernels to main",
332
+ status="completed",
333
+ conclusion="success",
334
+ number=2,
335
+ started_at=completed_at or started_at,
336
+ completed_at=completed_at,
337
+ )
338
+ )
339
+ elif run.path.endswith("manual-build-upload.yaml"):
340
+ synthetic_steps.append(
341
+ GitHubJobStep(
342
+ name="Build and copy kernel",
343
+ status=job.status,
344
+ conclusion=job.conclusion,
345
+ number=1,
346
+ started_at=started_at,
347
+ completed_at=completed_at,
348
+ )
349
+ )
350
+ if (job.conclusion or "") == "success":
351
+ synthetic_steps.append(
352
+ GitHubJobStep(
353
+ name="Upload kernel",
354
+ status="completed",
355
+ conclusion="success",
356
+ number=2,
357
+ started_at=completed_at or started_at,
358
+ completed_at=completed_at,
359
+ )
360
+ )
361
+
362
+ if synthetic_steps:
363
+ job.steps = synthetic_steps
364
+ return job
365
+
366
+ def _kernel_info_for(self, kernel_name: str, parsed_repo_id: str | None) -> KernelInfo:
367
+ if kernel_name == "unknown":
368
+ repo_id = parsed_repo_id or f"{self.config.github.owner}/{self.config.github.repo}"
369
+ return KernelInfo(
370
+ kernel_name=kernel_name,
371
+ repo_id=repo_id,
372
+ hub_url=f"https://huggingface.co/{repo_id}",
373
+ )
374
+
375
+ info = self.kernel_index.get_kernel_info(kernel_name)
376
+ if not parsed_repo_id or parsed_repo_id == info.repo_id:
377
+ return info
378
+
379
+ return KernelInfo(
380
+ kernel_name=info.kernel_name,
381
+ repo_id=parsed_repo_id,
382
+ hub_url=f"https://huggingface.co/{parsed_repo_id}",
383
+ version=info.version,
384
+ backends=info.backends,
385
+ )
386
+
387
+ def _should_fetch_logs(self, job: GitHubJob) -> bool:
388
+ if job.is_active:
389
+ return True
390
+ if (job.conclusion or "") in FAILING_CONCLUSIONS:
391
+ return True
392
+ return False
393
+
394
+ @staticmethod
395
+ def _extract_arch(job_name: str) -> str:
396
+ match = ARCH_RE.search(job_name)
397
+ if match:
398
+ return match.group(1).strip()
399
+ return "n/a"
400
+
401
+ @staticmethod
402
+ def _record_sort_key(record: MonitorRecord) -> tuple[int, int, float]:
403
+ started_at = record.started_at or utcnow()
404
+ return (
405
+ 0 if record.is_active else 1,
406
+ 0 if record.critical else 1,
407
+ -started_at.timestamp(),
408
+ )
409
+
410
    def _build_kernel_rows(self, records: list[MonitorRecord]) -> list[KernelRow]:
        """Aggregate per-job records into one dashboard row per kernel.

        Every catalog kernel gets a row even with no records (rendered idle);
        kernels that only appear in records are added too. Rows are sorted by
        urgency via ``_kernel_row_sort_key``.
        """
        grouped_records: dict[str, list[MonitorRecord]] = defaultdict(list)
        for record in records:
            grouped_records[record.kernel_name].append(record)

        # Seed with catalog metadata, then let record-derived info win —
        # it may carry a repo override parsed from job logs.
        info_map = {info.kernel_name: info for info in self.kernel_index.list_kernel_catalog()}
        for record in records:
            info_map[record.kernel_name] = record.kernel_info

        rows: list[KernelRow] = []
        for kernel_name, kernel_info in info_map.items():
            kernel_records = sorted(grouped_records.get(kernel_name, []), key=self._record_sort_key)
            recent_groups = self._group_kernel_runs(kernel_name, kernel_records)
            # The newest active group (if any) drives the row's live status.
            current_group = next((group for group in recent_groups if group.is_active), None)
            row_status_kind, row_status_label, row_reason, upload_label = self._summarize_kernel(
                current_group,
                recent_groups,
            )
            rows.append(
                KernelRow(
                    kernel_name=kernel_name,
                    kernel_info=kernel_info,
                    critical=kernel_name in self.config.monitor.critical_kernel_set,
                    current_group=current_group,
                    recent_groups=recent_groups,
                    row_status_kind=row_status_kind,
                    row_status_label=row_status_label,
                    row_reason=row_reason,
                    upload_label=upload_label,
                    last_triggered_at=recent_groups[0].triggered_at if recent_groups else None,
                )
            )

        rows.sort(key=self._kernel_row_sort_key)
        return rows
445
+
446
+ def _group_kernel_runs(
447
+ self,
448
+ kernel_name: str,
449
+ records: list[MonitorRecord],
450
+ ) -> list[KernelRunGroup]:
451
+ grouped: dict[int, list[MonitorRecord]] = defaultdict(list)
452
+ run_lookup: dict[int, GitHubRun] = {}
453
+ workflow_lookup: dict[int, str] = {}
454
+ for record in records:
455
+ grouped[record.run.id].append(record)
456
+ run_lookup[record.run.id] = record.run
457
+ workflow_lookup[record.run.id] = record.workflow_name
458
+
459
+ groups: list[KernelRunGroup] = []
460
+ for run_id, run_records in grouped.items():
461
+ sorted_records = sorted(
462
+ run_records,
463
+ key=lambda record: (
464
+ 0 if record.is_active else 1,
465
+ 0 if record.arch == "x86_64-linux" else 1,
466
+ record.arch,
467
+ ),
468
+ )
469
+ groups.append(
470
+ KernelRunGroup(
471
+ kernel_name=kernel_name,
472
+ run=run_lookup[run_id],
473
+ workflow_name=workflow_lookup[run_id],
474
+ records=sorted_records,
475
+ )
476
+ )
477
+
478
+ groups.sort(key=lambda group: -group.triggered_at.timestamp())
479
+ return groups
480
+
481
+ @staticmethod
482
+ def _summarize_kernel(
483
+ current_group: KernelRunGroup | None,
484
+ recent_groups: list[KernelRunGroup],
485
+ ) -> tuple[str, str, str, str]:
486
+ if current_group is not None:
487
+ if current_group.has_stall:
488
+ status_kind = "stalled"
489
+ status_label = "Stalled"
490
+ elif current_group.has_uploading:
491
+ status_kind = "uploading"
492
+ status_label = "Uploading"
493
+ else:
494
+ status_kind = "running"
495
+ status_label = "Running"
496
+ return (
497
+ status_kind,
498
+ status_label,
499
+ MonitorService._arch_summary(current_group.records),
500
+ MonitorService._upload_summary(current_group.records),
501
+ )
502
+
503
+ if not recent_groups:
504
+ return ("idle", "Idle", "No recent tracked CI run.", "No recent upload")
505
+
506
+ latest_group = recent_groups[0]
507
+ if latest_group.has_failure:
508
+ status_kind = "failed"
509
+ status_label = "Failed"
510
+ elif any(record.upload_status == "completed" for record in latest_group.records):
511
+ status_kind = "completed"
512
+ status_label = "Completed"
513
+ elif all((record.job.conclusion or "") == "cancelled" for record in latest_group.records):
514
+ status_kind = "cancelled"
515
+ status_label = "Cancelled"
516
+ else:
517
+ status_kind = "recent"
518
+ status_label = "Recent"
519
+
520
+ return (
521
+ status_kind,
522
+ status_label,
523
+ MonitorService._arch_summary(latest_group.records),
524
+ MonitorService._upload_summary(latest_group.records),
525
+ )
526
+
527
+ @staticmethod
528
+ def _arch_summary(records: list[MonitorRecord]) -> str:
529
+ if not records:
530
+ return "No job details."
531
+ return " | ".join(
532
+ f"{MonitorService._short_arch(record.arch)}: {record.phase_label}"
533
+ for record in records
534
+ )
535
+
536
+ @staticmethod
537
+ def _upload_summary(records: list[MonitorRecord]) -> str:
538
+ if not records:
539
+ return "No upload"
540
+ return " | ".join(
541
+ f"{MonitorService._short_arch(record.arch)}: {record.upload_status_label}"
542
+ for record in records
543
+ )
544
+
545
+ @staticmethod
546
+ def _short_arch(arch: str) -> str:
547
+ mapping = {
548
+ "x86_64-linux": "x86",
549
+ "aarch64-linux": "arm64",
550
+ "x86_64-darwin": "mac",
551
+ "aarch64-darwin": "mac-arm",
552
+ }
553
+ return mapping.get(arch, arch)
554
+
555
+ @staticmethod
556
+ def _kernel_row_sort_key(row: KernelRow) -> tuple[int, int, int, str]:
557
+ status_rank = {
558
+ "stalled": 0,
559
+ "uploading": 1,
560
+ "running": 2,
561
+ "failed": 3,
562
+ "completed": 4,
563
+ "cancelled": 5,
564
+ "recent": 6,
565
+ "idle": 7,
566
+ }
567
+ return (
568
+ status_rank.get(row.row_status_kind, 99),
569
+ 0 if row.critical else 1,
570
+ 0 if row.last_triggered_at else 1,
571
+ row.kernel_name,
572
+ )
src/kc_monitor/stall_detector.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from datetime import datetime, timedelta
4
+
5
+ from kc_monitor.config import MonitorSettings
6
+ from kc_monitor.models import MonitorRecord, utcnow
7
+
8
+
9
+ ACTIVE_PHASES = {"building", "uploading", "testing"}
10
+
11
+
12
+ def _format_duration(delta: timedelta) -> str:
13
+ total_seconds = int(delta.total_seconds())
14
+ if total_seconds < 60:
15
+ return f"{total_seconds}s"
16
+ if total_seconds < 3600:
17
+ return f"{total_seconds // 60}m"
18
+ hours, remainder = divmod(total_seconds, 3600)
19
+ minutes = remainder // 60
20
+ if minutes:
21
+ return f"{hours}h {minutes}m"
22
+ return f"{hours}h"
23
+
24
+
25
+ def detect_stall(
26
+ record: MonitorRecord,
27
+ settings: MonitorSettings,
28
+ now: datetime | None = None,
29
+ ) -> tuple[bool, str | None]:
30
+ if not record.is_active:
31
+ return False, None
32
+
33
+ if record.phase not in ACTIVE_PHASES:
34
+ return False, None
35
+
36
+ now = now or utcnow()
37
+ latest_signal = record.latest_signal_at or record.run.updated_at or record.started_at
38
+ if latest_signal:
39
+ silent_for = now - latest_signal
40
+ if silent_for >= timedelta(minutes=settings.stall_without_log_minutes):
41
+ return True, f"No fresh signal for { _format_duration(silent_for) }."
42
+
43
+ if record.active_step_started_at:
44
+ phase_duration = now - record.active_step_started_at
45
+ if phase_duration >= timedelta(minutes=settings.stall_active_phase_minutes):
46
+ return True, f"{record.phase_label} has been running for { _format_duration(phase_duration) }."
47
+
48
+ return False, None
src/kc_monitor/ui.py ADDED
@@ -0,0 +1,1110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import html
4
+ import re
5
+ from datetime import datetime, timezone
6
+
7
+ import gradio as gr
8
+
9
+ from kc_monitor.config import AppConfig
10
+ from kc_monitor.grafana import build_dashboard_url, dashboard_catalog
11
+ from kc_monitor.models import DashboardSnapshot, KernelRow, KernelRunGroup, MonitorRecord
12
+ from kc_monitor.service import MonitorService
13
+
14
+
15
# Matches a parenthesised variant suffix in a display name, e.g. "name (cuda)".
VARIANT_RE = re.compile(r"\(([^)]+)\)")

# Base Gradio theme; visual styling is carried by the custom CSS constant below.
THEME = gr.themes.Base()
18
+
19
+ PAGE_JS = """
20
+ function kcmBoot() {
21
+ if (window._kcmBooted) return;
22
+ window._kcmBooted = true;
23
+
24
+ function applyFilters() {
25
+ var search = document.querySelector('.kcm-search');
26
+ var status = document.querySelector('.kcm-status-filter');
27
+ var searchValue = search ? search.value.toLowerCase().trim() : '';
28
+ var statusValue = status ? status.value : 'all';
29
+
30
+ document.querySelectorAll('#kernelTable tbody tr[data-idx]').forEach(function(row) {
31
+ var kernel = (row.getAttribute('data-kernel') || '').toLowerCase();
32
+ var rowStatus = row.getAttribute('data-status') || 'all';
33
+ var workflow = (row.getAttribute('data-workflow') || '').toLowerCase();
34
+ var searchOk = !searchValue || kernel.indexOf(searchValue) >= 0 || workflow.indexOf(searchValue) >= 0;
35
+ var statusOk = statusValue === 'all' || rowStatus === statusValue;
36
+ row.style.display = searchOk && statusOk ? '' : 'none';
37
+ });
38
+ }
39
+
40
+ document.addEventListener('click', function(e) {
41
+ var row = e.target.closest('tr[data-idx]');
42
+ if (row && !e.target.closest('a')) {
43
+ var idx = row.getAttribute('data-idx');
44
+ var el = document.getElementById('modal-content-' + idx);
45
+ if (!el) return;
46
+ document.getElementById('kcmModal').innerHTML = el.innerHTML;
47
+ document.getElementById('kcmOverlay').classList.add('open');
48
+ document.body.style.overflow = 'hidden';
49
+ return;
50
+ }
51
+ if (e.target.closest('.kcm-modal-close') || e.target.id === 'kcmOverlay') {
52
+ document.getElementById('kcmOverlay').classList.remove('open');
53
+ document.body.style.overflow = '';
54
+ }
55
+ });
56
+
57
+ document.addEventListener('input', function(e) {
58
+ if (e.target.classList.contains('kcm-search')) applyFilters();
59
+ });
60
+
61
+ document.addEventListener('change', function(e) {
62
+ if (e.target.classList.contains('kcm-status-filter')) applyFilters();
63
+ });
64
+
65
+ document.addEventListener('keydown', function(e) {
66
+ if (e.key === 'Escape') {
67
+ document.getElementById('kcmOverlay').classList.remove('open');
68
+ document.body.style.overflow = '';
69
+ }
70
+ });
71
+
72
+ applyFilters();
73
+ }
74
+
75
+ kcmBoot();
76
+ new MutationObserver(function() {
77
+ window._kcmBooted = false;
78
+ kcmBoot();
79
+ }).observe(document.body, { childList: true, subtree: true });
80
+ """
81
+
82
+ CSS = """
83
+ :root {
84
+ --bg: #050711;
85
+ --surface: rgba(11, 16, 30, 0.92);
86
+ --surface-2: rgba(14, 22, 40, 0.94);
87
+ --surface-3: rgba(19, 29, 53, 0.98);
88
+ --surface-hover: rgba(121, 171, 255, 0.06);
89
+ --text: #f4f7ff;
90
+ --text-secondary: #98a7c4;
91
+ --text-tertiary: #6d7b98;
92
+ --accent: #86b0ff;
93
+ --accent-2: #6ff0c0;
94
+ --ok: #74efab;
95
+ --warn: #ffca6d;
96
+ --bad: #ff808e;
97
+ --border: rgba(255, 255, 255, 0.08);
98
+ --border-strong: rgba(255, 255, 255, 0.14);
99
+ --radius: 24px;
100
+ --radius-sm: 16px;
101
+ --shadow: 0 28px 90px rgba(0, 0, 0, 0.32);
102
+ }
103
+
104
+ *,
105
+ *::before,
106
+ *::after {
107
+ box-sizing: border-box;
108
+ }
109
+
110
+ body,
111
+ .gradio-container {
112
+ background:
113
+ radial-gradient(circle at 0% 0%, rgba(134, 176, 255, 0.18), transparent 28%),
114
+ radial-gradient(circle at 100% 0%, rgba(111, 240, 192, 0.10), transparent 30%),
115
+ radial-gradient(circle at 50% 100%, rgba(110, 130, 255, 0.08), transparent 40%),
116
+ #050711 !important;
117
+ color: var(--text);
118
+ font-family: "Inter", -apple-system, BlinkMacSystemFont, sans-serif;
119
+ }
120
+
121
+ a {
122
+ color: var(--accent);
123
+ text-decoration: none;
124
+ }
125
+
126
+ a:hover {
127
+ text-decoration: underline;
128
+ }
129
+
130
+ .kcm-shell {
131
+ max-width: 1540px;
132
+ margin: 0 auto;
133
+ padding: 18px 20px 28px;
134
+ }
135
+
136
+ .kcm-hero {
137
+ position: relative;
138
+ overflow: hidden;
139
+ background:
140
+ linear-gradient(135deg, rgba(134, 176, 255, 0.14), rgba(111, 240, 192, 0.06)),
141
+ var(--surface);
142
+ border: 1px solid var(--border);
143
+ border-radius: 30px;
144
+ padding: 30px 34px;
145
+ box-shadow: var(--shadow);
146
+ }
147
+
148
+ .kcm-hero::after {
149
+ content: "";
150
+ position: absolute;
151
+ inset: auto -80px -120px auto;
152
+ width: 320px;
153
+ height: 320px;
154
+ border-radius: 50%;
155
+ background: radial-gradient(circle, rgba(111, 240, 192, 0.16), transparent 62%);
156
+ pointer-events: none;
157
+ }
158
+
159
+ .kcm-eyebrow {
160
+ color: var(--accent-2);
161
+ font-size: 11px;
162
+ text-transform: uppercase;
163
+ letter-spacing: 0.16em;
164
+ }
165
+
166
+ .kcm-hero h1 {
167
+ margin: 10px 0 0;
168
+ font-size: 38px;
169
+ line-height: 1.05;
170
+ letter-spacing: -0.05em;
171
+ }
172
+
173
+ .kcm-hero p {
174
+ margin: 12px 0 0;
175
+ max-width: 1040px;
176
+ color: var(--text-secondary);
177
+ font-size: 15px;
178
+ line-height: 1.65;
179
+ }
180
+
181
+ .kcm-meta,
182
+ .kcm-stats,
183
+ .kcm-graphs {
184
+ display: grid;
185
+ gap: 12px;
186
+ }
187
+
188
+ .kcm-meta {
189
+ grid-template-columns: repeat(3, minmax(0, 1fr));
190
+ margin-top: 18px;
191
+ }
192
+
193
+ .kcm-stats {
194
+ grid-template-columns: repeat(5, minmax(0, 1fr));
195
+ margin-top: 18px;
196
+ }
197
+
198
+ .kcm-meta-card,
199
+ .kcm-stat,
200
+ .kcm-panel-link {
201
+ background: rgba(255, 255, 255, 0.04);
202
+ border: 1px solid var(--border);
203
+ border-radius: 20px;
204
+ padding: 16px 18px;
205
+ }
206
+
207
+ .kcm-meta-card-label,
208
+ .kcm-stat-label {
209
+ font-size: 11px;
210
+ text-transform: uppercase;
211
+ letter-spacing: 0.10em;
212
+ color: var(--text-tertiary);
213
+ }
214
+
215
+ .kcm-meta-card-value {
216
+ margin-top: 8px;
217
+ font-size: 14px;
218
+ color: var(--text-secondary);
219
+ word-break: break-word;
220
+ }
221
+
222
+ .kcm-stat-value {
223
+ margin-top: 8px;
224
+ font-size: 30px;
225
+ font-weight: 700;
226
+ letter-spacing: -0.03em;
227
+ }
228
+
229
+ .kcm-toolbar {
230
+ margin-top: 18px;
231
+ display: flex;
232
+ justify-content: space-between;
233
+ align-items: center;
234
+ gap: 14px;
235
+ }
236
+
237
+ .kcm-toolbar-left {
238
+ color: var(--text-tertiary);
239
+ font-size: 13px;
240
+ }
241
+
242
+ .kcm-toolbar-left code {
243
+ padding: 3px 8px;
244
+ background: rgba(255, 255, 255, 0.05);
245
+ border-radius: 999px;
246
+ color: var(--text-secondary);
247
+ }
248
+
249
+ .kcm-toolbar-right {
250
+ display: flex;
251
+ align-items: center;
252
+ gap: 10px;
253
+ }
254
+
255
+ .kcm-search,
256
+ .kcm-status-filter {
257
+ background: var(--surface-2);
258
+ border: 1px solid var(--border);
259
+ border-radius: 14px;
260
+ padding: 10px 14px;
261
+ color: var(--text);
262
+ font-size: 14px;
263
+ outline: none;
264
+ }
265
+
266
+ .kcm-search {
267
+ min-width: 260px;
268
+ }
269
+
270
+ .kcm-table-shell {
271
+ margin-top: 16px;
272
+ background: var(--surface);
273
+ border: 1px solid var(--border);
274
+ border-radius: 26px;
275
+ overflow: hidden;
276
+ box-shadow: var(--shadow);
277
+ }
278
+
279
+ .kcm-table-wrap {
280
+ overflow-x: auto;
281
+ }
282
+
283
+ .kcm-table {
284
+ width: 100%;
285
+ border-collapse: separate;
286
+ border-spacing: 0;
287
+ }
288
+
289
+ .kcm-table th {
290
+ position: sticky;
291
+ top: 0;
292
+ z-index: 2;
293
+ text-align: left;
294
+ padding: 14px 16px;
295
+ font-size: 11px;
296
+ text-transform: uppercase;
297
+ letter-spacing: 0.12em;
298
+ color: var(--text-tertiary);
299
+ background: rgba(7, 11, 23, 0.96);
300
+ border-bottom: 1px solid var(--border-strong);
301
+ }
302
+
303
+ .kcm-table td {
304
+ padding: 16px;
305
+ vertical-align: top;
306
+ border-bottom: 1px solid var(--border);
307
+ font-size: 14px;
308
+ }
309
+
310
+ .kcm-table tbody tr {
311
+ cursor: pointer;
312
+ transition: background 0.16s ease;
313
+ }
314
+
315
+ .kcm-table tbody tr:hover td {
316
+ background: var(--surface-hover);
317
+ }
318
+
319
+ .kcm-table tbody tr:last-child td {
320
+ border-bottom: none;
321
+ }
322
+
323
+ .kcm-kernel-name {
324
+ font-size: 16px;
325
+ font-weight: 700;
326
+ letter-spacing: -0.02em;
327
+ }
328
+
329
+ .kcm-kernel-meta,
330
+ .kcm-subtle,
331
+ .kcm-activity-sub {
332
+ margin-top: 4px;
333
+ color: var(--text-tertiary);
334
+ font-size: 12px;
335
+ line-height: 1.45;
336
+ }
337
+
338
+ .kcm-badges,
339
+ .kcm-variant-stack,
340
+ .kcm-actions {
341
+ display: flex;
342
+ flex-wrap: wrap;
343
+ gap: 8px;
344
+ }
345
+
346
+ .kcm-badge {
347
+ display: inline-flex;
348
+ align-items: center;
349
+ gap: 6px;
350
+ padding: 5px 10px;
351
+ border-radius: 999px;
352
+ font-size: 11px;
353
+ font-weight: 700;
354
+ white-space: nowrap;
355
+ border: 1px solid transparent;
356
+ }
357
+
358
+ .kcm-badge.ok {
359
+ color: var(--ok);
360
+ background: rgba(116, 239, 171, 0.10);
361
+ border-color: rgba(116, 239, 171, 0.14);
362
+ }
363
+
364
+ .kcm-badge.warn {
365
+ color: var(--warn);
366
+ background: rgba(255, 202, 109, 0.10);
367
+ border-color: rgba(255, 202, 109, 0.15);
368
+ }
369
+
370
+ .kcm-badge.bad {
371
+ color: var(--bad);
372
+ background: rgba(255, 128, 142, 0.10);
373
+ border-color: rgba(255, 128, 142, 0.14);
374
+ }
375
+
376
+ .kcm-badge.info {
377
+ color: var(--accent);
378
+ background: rgba(134, 176, 255, 0.12);
379
+ border-color: rgba(134, 176, 255, 0.16);
380
+ }
381
+
382
+ .kcm-badge.muted {
383
+ color: var(--text-tertiary);
384
+ background: rgba(255, 255, 255, 0.05);
385
+ border-color: rgba(255, 255, 255, 0.06);
386
+ }
387
+
388
+ .kcm-badge.critical {
389
+ color: var(--bad);
390
+ background: rgba(255, 128, 142, 0.10);
391
+ border-color: rgba(255, 128, 142, 0.14);
392
+ text-transform: uppercase;
393
+ letter-spacing: 0.12em;
394
+ }
395
+
396
+ .kcm-variant {
397
+ min-width: 180px;
398
+ padding: 10px 12px;
399
+ border-radius: 16px;
400
+ background: rgba(255, 255, 255, 0.04);
401
+ border: 1px solid var(--border);
402
+ }
403
+
404
+ .kcm-variant-head {
405
+ display: flex;
406
+ justify-content: space-between;
407
+ gap: 8px;
408
+ align-items: center;
409
+ }
410
+
411
+ .kcm-variant-name {
412
+ font-size: 12px;
413
+ font-weight: 700;
414
+ }
415
+
416
+ .kcm-variant-sub {
417
+ margin-top: 6px;
418
+ font-size: 11px;
419
+ color: var(--text-tertiary);
420
+ line-height: 1.45;
421
+ }
422
+
423
+ .kcm-action {
424
+ display: inline-flex;
425
+ align-items: center;
426
+ padding: 8px 12px;
427
+ border-radius: 12px;
428
+ background: rgba(255, 255, 255, 0.05);
429
+ border: 1px solid var(--border);
430
+ color: var(--text-secondary);
431
+ font-size: 12px;
432
+ font-weight: 600;
433
+ }
434
+
435
+ .kcm-action:hover {
436
+ text-decoration: none;
437
+ border-color: var(--border-strong);
438
+ color: var(--text);
439
+ }
440
+
441
+ .kcm-section {
442
+ margin-top: 22px;
443
+ }
444
+
445
+ .kcm-section-title {
446
+ margin: 0 0 12px;
447
+ font-size: 18px;
448
+ letter-spacing: -0.02em;
449
+ }
450
+
451
+ .kcm-graphs {
452
+ grid-template-columns: repeat(3, minmax(0, 1fr));
453
+ }
454
+
455
+ .kcm-panel-link {
456
+ transition: transform 0.15s ease, border-color 0.15s ease;
457
+ }
458
+
459
+ .kcm-panel-link:hover {
460
+ transform: translateY(-2px);
461
+ border-color: var(--border-strong);
462
+ text-decoration: none;
463
+ }
464
+
465
+ .kcm-panel-label {
466
+ color: var(--accent-2);
467
+ font-size: 11px;
468
+ text-transform: uppercase;
469
+ letter-spacing: 0.12em;
470
+ }
471
+
472
+ .kcm-panel-title {
473
+ margin-top: 8px;
474
+ font-size: 18px;
475
+ font-weight: 700;
476
+ }
477
+
478
+ .kcm-panel-copy {
479
+ margin-top: 8px;
480
+ color: var(--text-secondary);
481
+ font-size: 13px;
482
+ line-height: 1.55;
483
+ }
484
+
485
+ .kcm-frame {
486
+ margin-top: 16px;
487
+ background: var(--surface-3);
488
+ border: 1px solid var(--border);
489
+ border-radius: 24px;
490
+ overflow: hidden;
491
+ box-shadow: var(--shadow);
492
+ }
493
+
494
+ .kcm-frame-head {
495
+ padding: 14px 18px;
496
+ display: flex;
497
+ justify-content: space-between;
498
+ align-items: center;
499
+ gap: 12px;
500
+ border-bottom: 1px solid var(--border);
501
+ }
502
+
503
+ .kcm-frame-title {
504
+ font-size: 15px;
505
+ font-weight: 700;
506
+ }
507
+
508
+ .kcm-frame-copy {
509
+ font-size: 13px;
510
+ color: var(--text-secondary);
511
+ line-height: 1.45;
512
+ }
513
+
514
+ .kcm-open {
515
+ font-size: 12px;
516
+ font-weight: 700;
517
+ }
518
+
519
+ .kcm-frame iframe {
520
+ display: block;
521
+ width: 100%;
522
+ border: none;
523
+ background: #0b1020;
524
+ }
525
+
526
+ .kcm-overlay {
527
+ position: fixed;
528
+ inset: 0;
529
+ z-index: 9999;
530
+ display: none;
531
+ padding: 26px 16px;
532
+ overflow-y: auto;
533
+ background: rgba(4, 7, 16, 0.82);
534
+ backdrop-filter: blur(16px);
535
+ }
536
+
537
+ .kcm-overlay.open {
538
+ display: block;
539
+ }
540
+
541
+ .kcm-modal {
542
+ max-width: 1180px;
543
+ margin: 0 auto;
544
+ background: var(--surface-3);
545
+ border: 1px solid var(--border-strong);
546
+ border-radius: 28px;
547
+ overflow: hidden;
548
+ box-shadow: 0 40px 140px rgba(0, 0, 0, 0.42);
549
+ }
550
+
551
+ .kcm-modal-header {
552
+ padding: 24px 28px;
553
+ border-bottom: 1px solid var(--border);
554
+ display: flex;
555
+ justify-content: space-between;
556
+ align-items: flex-start;
557
+ gap: 20px;
558
+ }
559
+
560
+ .kcm-modal-header h2 {
561
+ margin: 0;
562
+ font-size: 28px;
563
+ letter-spacing: -0.04em;
564
+ }
565
+
566
+ .kcm-modal-header p {
567
+ margin: 8px 0 0;
568
+ color: var(--text-secondary);
569
+ font-size: 14px;
570
+ line-height: 1.55;
571
+ }
572
+
573
+ .kcm-modal-close {
574
+ padding: 9px 14px;
575
+ border-radius: 12px;
576
+ border: 1px solid var(--border);
577
+ background: rgba(255, 255, 255, 0.05);
578
+ color: var(--text-secondary);
579
+ cursor: pointer;
580
+ font-size: 12px;
581
+ font-weight: 700;
582
+ }
583
+
584
+ .kcm-modal-body {
585
+ padding: 24px 28px 30px;
586
+ }
587
+
588
+ .kcm-run-card {
589
+ margin-top: 14px;
590
+ background: rgba(255, 255, 255, 0.03);
591
+ border: 1px solid var(--border);
592
+ border-radius: 22px;
593
+ padding: 18px;
594
+ }
595
+
596
+ .kcm-run-card-head {
597
+ display: flex;
598
+ justify-content: space-between;
599
+ align-items: flex-start;
600
+ gap: 14px;
601
+ margin-bottom: 14px;
602
+ }
603
+
604
+ .kcm-run-card-title {
605
+ font-size: 16px;
606
+ font-weight: 700;
607
+ }
608
+
609
+ .kcm-run-card-meta {
610
+ margin-top: 6px;
611
+ color: var(--text-tertiary);
612
+ font-size: 12px;
613
+ line-height: 1.55;
614
+ }
615
+
616
+ .kcm-arch-grid {
617
+ display: grid;
618
+ grid-template-columns: repeat(auto-fit, minmax(260px, 1fr));
619
+ gap: 12px;
620
+ }
621
+
622
+ .kcm-arch-card {
623
+ background: rgba(255, 255, 255, 0.03);
624
+ border: 1px solid var(--border);
625
+ border-radius: 18px;
626
+ padding: 14px;
627
+ }
628
+
629
+ .kcm-arch-head {
630
+ display: flex;
631
+ justify-content: space-between;
632
+ align-items: center;
633
+ gap: 10px;
634
+ }
635
+
636
+ .kcm-arch-name {
637
+ font-size: 14px;
638
+ font-weight: 700;
639
+ }
640
+
641
+ .kcm-arch-detail {
642
+ margin-top: 8px;
643
+ font-size: 12px;
644
+ color: var(--text-secondary);
645
+ line-height: 1.55;
646
+ }
647
+
648
+ .kcm-failure-box {
649
+ margin-top: 10px;
650
+ padding: 10px 12px;
651
+ border-radius: 14px;
652
+ background: rgba(255, 128, 142, 0.08);
653
+ border: 1px solid rgba(255, 128, 142, 0.12);
654
+ color: var(--bad);
655
+ font-family: "JetBrains Mono", Consolas, monospace;
656
+ font-size: 12px;
657
+ white-space: pre-wrap;
658
+ max-height: 200px;
659
+ overflow-y: auto;
660
+ }
661
+
662
+ .kcm-empty {
663
+ padding: 16px 0;
664
+ color: var(--text-tertiary);
665
+ font-size: 14px;
666
+ }
667
+
668
+ @media (max-width: 1260px) {
669
+ .kcm-stats,
670
+ .kcm-meta,
671
+ .kcm-graphs {
672
+ grid-template-columns: repeat(2, minmax(0, 1fr));
673
+ }
674
+ }
675
+
676
+ @media (max-width: 900px) {
677
+ .kcm-stats,
678
+ .kcm-meta,
679
+ .kcm-graphs,
680
+ .kcm-arch-grid {
681
+ grid-template-columns: 1fr;
682
+ }
683
+
684
+ .kcm-toolbar,
685
+ .kcm-run-card-head,
686
+ .kcm-modal-header {
687
+ flex-direction: column;
688
+ align-items: stretch;
689
+ }
690
+
691
+ .kcm-search {
692
+ min-width: 0;
693
+ width: 100%;
694
+ }
695
+ }
696
+ """
697
+
698
+
699
+ def _dt(value: datetime | None) -> str:
700
+ if not value:
701
+ return "n/a"
702
+ return value.astimezone(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
703
+
704
+
705
+ def _short_dt(value: datetime | None) -> str:
706
+ if not value:
707
+ return "Never"
708
+ return value.astimezone(timezone.utc).strftime("%b %d, %H:%M")
709
+
710
+
711
+ def _badge(label: str, kind: str) -> str:
712
+ css = {
713
+ "completed": "ok",
714
+ "uploading": "warn",
715
+ "running": "info",
716
+ "recent": "info",
717
+ "failed": "bad",
718
+ "cancelled": "bad",
719
+ "stalled": "warn",
720
+ "idle": "muted",
721
+ "success": "ok",
722
+ "not_started": "muted",
723
+ "skipped": "muted",
724
+ }.get(kind, "info")
725
+ return f'<span class="kcm-badge {css}">{html.escape(label)}</span>'
726
+
727
+
728
+ def _short_arch(arch: str) -> str:
729
+ return {
730
+ "x86_64-linux": "x86_64-linux",
731
+ "aarch64-linux": "aarch64-linux",
732
+ "x86_64-darwin": "x86_64-darwin",
733
+ "aarch64-darwin": "aarch64-darwin",
734
+ }.get(arch, arch)
735
+
736
+
737
def _variant_label(record: MonitorRecord) -> str:
    """Derive a short human-readable label for a build variant (matrix entry).

    Falls through, in order: parsed matrix suffix from the job name, a
    "manual upload" tag for manual workflows, the record's own arch, and
    finally the raw job name.
    """
    # NOTE(review): assumes VARIANT_RE (defined earlier in this module) captures
    # the parenthesised matrix portion of a job name such as
    # "build-kernel (aarch64-linux, aws-r8g-8xl-plus-nix)" — confirm upstream.
    match = VARIANT_RE.search(record.job.name)
    if match:
        parts = [part.strip() for part in match.group(1).split(",") if part.strip()]
        if parts:
            # The first matrix entry is the architecture slug.
            parts[0] = _short_arch(parts[0])
            return " | ".join(parts)
    if record.workflow_name.lower().startswith("manual"):
        return "manual upload"
    if record.arch and record.arch not in {"all", "n/a"}:
        return _short_arch(record.arch)
    return record.job.name or "job"
749
+
750
+
751
def _variant_chip(record: MonitorRecord) -> str:
    """Render a small HTML chip summarising one variant's phase/upload/runner."""
    # A suspected stall overrides the phase colour so stuck jobs stand out.
    phase_kind = "stalled" if record.suspected_stalled else record.phase
    upload = _badge(record.upload_status_label, record.upload_status)
    return f"""
    <div class="kcm-variant">
      <div class="kcm-variant-head">
        <div class="kcm-variant-name">{html.escape(_variant_label(record))}</div>
        {_badge(record.phase_label, phase_kind)}
      </div>
      <div class="kcm-variant-sub">Upload {upload}</div>
      <div class="kcm-variant-sub">Runner {html.escape(record.runner_group or 'n/a')}</div>
    </div>
    """
764
+
765
+
766
def _group_badges(group: KernelRunGroup) -> str:
    """Build the space-separated badge strip summarising a run group's state."""
    if group.is_active:
        overall = _badge("Running", "running")
    elif group.has_failure:
        overall = _badge("Failed", "failed")
    else:
        overall = _badge("Completed", "completed")
    extras: list[str] = []
    if group.has_uploading:
        extras.append(_badge("Uploading", "uploading"))
    if group.has_stall:
        extras.append(_badge("Stalled", "stalled"))
    return " ".join([overall, *extras])
779
+
780
+
781
+ def _latest_group_for_workflow(row: KernelRow, workflow_path: str) -> KernelRunGroup | None:
782
+ return next((group for group in row.recent_groups if group.run.path == workflow_path), None)
783
+
784
+
785
def _workflow_cell(group: KernelRunGroup | None, empty_label: str) -> str:
    """Render one workflow column of the kernel table.

    Shows *empty_label* when no run group is available; otherwise the group's
    badge strip, run title, and a stack of per-variant chips.
    """
    if not group:
        return f'<div class="kcm-subtle">{html.escape(empty_label)}</div>'
    variant_stack = "".join(_variant_chip(record) for record in group.records)
    return f"""
    <div class="kcm-badges">{_group_badges(group)}</div>
    <div class="kcm-subtle">{html.escape(group.run.display_title or group.run.name)}</div>
    <div class="kcm-variant-stack" style="margin-top:10px">{variant_stack}</div>
    """
794
+
795
+
796
def _actions_cell(row: KernelRow, config: AppConfig) -> str:
    """Render the quick-link column: Actions run links plus an optional Grafana link."""
    actions: list[str] = []
    release_group = _latest_group_for_workflow(row, ".github/workflows/build-release.yaml")
    manual_group = _latest_group_for_workflow(row, ".github/workflows/manual-build-upload.yaml")
    if release_group:
        actions.append(
            f'<a class="kcm-action" href="{html.escape(release_group.run.html_url)}" target="_blank">Release run</a>'
        )
    if manual_group:
        actions.append(
            f'<a class="kcm-action" href="{html.escape(manual_group.run.html_url)}" target="_blank">Manual run</a>'
        )
    if config.grafana.enabled:
        # embed=False yields the full (non-kiosk) dashboard URL for a new tab.
        overview_url = build_dashboard_url(config.grafana, config.grafana.overview_dashboard_uid, embed=False)
        actions.append(
            f'<a class="kcm-action" href="{html.escape(overview_url)}" target="_blank">Grafana</a>'
        )
    return "".join(actions) or '<span class="kcm-subtle">No links</span>'
814
+
815
+
816
def _render_kernel_row(row: KernelRow, idx: int, config: AppConfig) -> str:
    """Render one <tr> of the kernel table.

    The data-* attributes (kernel name, status kind, workflow names) power the
    client-side search box and status filter.
    """
    release_group = _latest_group_for_workflow(row, ".github/workflows/build-release.yaml")
    manual_group = _latest_group_for_workflow(row, ".github/workflows/manual-build-upload.yaml")
    critical_tag = '<span class="kcm-badge critical">critical</span>' if row.critical else ""
    workflows_text = " / ".join(
        group.workflow_name for group in [release_group, manual_group] if group is not None
    )
    activity = row.primary_group
    activity_title = html.escape(activity.run.display_title or activity.run.name) if activity else "No tracked run yet"
    activity_sub = html.escape(_short_dt(row.last_triggered_at)) if row.last_triggered_at else "No activity"
    return f"""
    <tr
      data-idx="{idx}"
      data-kernel="{html.escape(row.kernel_name.lower())}"
      data-status="{html.escape(row.row_status_kind)}"
      data-workflow="{html.escape(workflows_text.lower())}"
    >
      <td style="min-width:220px">
        <div class="kcm-kernel-name">{html.escape(row.kernel_name)} {critical_tag}</div>
        <div class="kcm-kernel-meta">{html.escape(row.kernel_info.repo_id)}</div>
        <div class="kcm-kernel-meta">{html.escape(", ".join(row.kernel_info.backends) or "backend metadata unavailable")}</div>
      </td>
      <td style="min-width:360px">{_workflow_cell(release_group, "No release workflow run found in the scanned history.")}</td>
      <td style="min-width:280px">{_workflow_cell(manual_group, "No manual upload run found in the scanned history.")}</td>
      <td style="min-width:240px">
        <div class="kcm-badges">{_badge(row.row_status_label, row.row_status_kind)}</div>
        <div class="kcm-activity-sub">{activity_title}</div>
        <div class="kcm-activity-sub">{activity_sub}</div>
      </td>
      <td style="min-width:220px"><div class="kcm-actions">{_actions_cell(row, config)}</div></td>
    </tr>
    """
848
+
849
+
850
def _render_arch_card(record: MonitorRecord) -> str:
    """Render one per-architecture job card for the run detail modal."""
    # A suspected stall overrides the phase badge colour.
    phase_kind = "stalled" if record.suspected_stalled else record.phase
    stall_line = (
        f'<div class="kcm-arch-detail" style="color:var(--warn)">{html.escape(record.stall_reason or "")}</div>'
        if record.suspected_stalled
        else ""
    )
    failure = (
        f'<div class="kcm-failure-box">{html.escape(record.failure_excerpt)}</div>'
        if record.failure_excerpt
        else ""
    )
    return f"""
    <div class="kcm-arch-card">
      <div class="kcm-arch-head">
        <span class="kcm-arch-name">{html.escape(_variant_label(record))}</span>
        {_badge(record.phase_label, phase_kind)}
      </div>
      <div class="kcm-arch-detail">Upload { _badge(record.upload_status_label, record.upload_status) }</div>
      <div class="kcm-arch-detail">Runner {html.escape(record.runner_group or 'n/a')}</div>
      <div class="kcm-arch-detail">Started {_dt(record.started_at)} | Latest signal {_dt(record.latest_signal_at)}</div>
      <div class="kcm-arch-detail"><a href="{html.escape(record.job.html_url)}" target="_blank">Open job</a></div>
      {stall_line}
      {failure}
    </div>
    """
876
+
877
+
878
def _render_group(group: KernelRunGroup) -> str:
    """Render a full run card: run metadata header plus one card per architecture job."""
    arch_cards = "".join(_render_arch_card(record) for record in group.records)
    return f"""
    <div class="kcm-run-card">
      <div class="kcm-run-card-head">
        <div>
          <div class="kcm-run-card-title">{html.escape(group.run.display_title or group.run.name)}</div>
          <div class="kcm-run-card-meta">
            {html.escape(group.workflow_name)} | branch {html.escape(group.run.head_branch or 'n/a')} | actor {html.escape(group.run.actor_login or 'n/a')}<br>
            Triggered {_dt(group.triggered_at)}
          </div>
        </div>
        <div>
          <div class="kcm-badges">{_group_badges(group)}</div>
          <div class="kcm-run-card-meta" style="margin-top:8px">
            <a href="{html.escape(group.run.html_url)}" target="_blank">Open Actions run</a>
          </div>
        </div>
      </div>
      <div class="kcm-arch-grid">{arch_cards}</div>
    </div>
    """
900
+
901
+
902
def _render_hidden_modal(row: KernelRow, idx: int, config: AppConfig) -> str:
    """Render the hidden (display:none) modal content for one kernel row.

    The content is revealed client-side; *idx* ties it back to the table row
    via the element id "modal-content-{idx}".
    """
    release_group = _latest_group_for_workflow(row, ".github/workflows/build-release.yaml")
    manual_group = _latest_group_for_workflow(row, ".github/workflows/manual-build-upload.yaml")
    critical_tag = '<span class="kcm-badge critical">critical</span>' if row.critical else ""
    grafana_link = ""
    if config.grafana.enabled:
        grafana_url = build_dashboard_url(config.grafana, config.grafana.overview_dashboard_uid, embed=False)
        grafana_link = f'<a href="{html.escape(grafana_url)}" target="_blank" class="kcm-modal-close">Open Grafana</a>'

    # Sections, in priority order: latest release build, latest manual upload,
    # then up to eight recent tracked runs; an empty notice if none exist.
    sections = []
    if release_group:
        sections.append(f'<h3 class="kcm-section-title">Latest release build</h3>{_render_group(release_group)}')
    if manual_group:
        sections.append(f'<h3 class="kcm-section-title">Latest manual upload</h3>{_render_group(manual_group)}')
    if row.recent_groups:
        sections.append(
            "<h3 class=\"kcm-section-title\">Recent tracked runs</h3>"
            + "".join(_render_group(group) for group in row.recent_groups[:8])
        )
    if not sections:
        sections.append('<div class="kcm-empty">No tracked GitHub Actions runs found for this kernel yet.</div>')

    return f"""
    <div id="modal-content-{idx}" style="display:none">
      <div class="kcm-modal-header">
        <div>
          <h2>{html.escape(row.kernel_name)} {critical_tag}</h2>
          <p>{html.escape(row.kernel_info.repo_id)}</p>
          <p>{_badge(row.row_status_label, row.row_status_kind)} {html.escape(", ".join(row.kernel_info.backends) or "No backend metadata")}</p>
        </div>
        <div style="display:flex;gap:10px;flex-wrap:wrap">
          <a href="{html.escape(row.kernel_info.hub_url)}" target="_blank" class="kcm-modal-close">Open Hub repo</a>
          {grafana_link}
          <button class="kcm-modal-close">Close</button>
        </div>
      </div>
      <div class="kcm-modal-body">
        {"".join(sections)}
      </div>
    </div>
    """
943
+
944
+
945
def _render_graph_section(config: AppConfig) -> str:
    """Render the "Metrics + trends" section.

    When Grafana is not configured, returns setup instructions instead of
    dashboard links/embeds.
    """
    if not config.grafana.enabled:
        return """
    <section class="kcm-section">
      <h2 class="kcm-section-title">Metrics + trends</h2>
      <div class="kcm-panel-link">
        <div class="kcm-panel-label">Grafana not configured</div>
        <div class="kcm-panel-title">The live Actions table is active; the Grafana deck is ready to attach.</div>
        <div class="kcm-panel-copy">
          Set <code>KCM_GRAFANA_BASE_URL</code> on the Space once you have a public Grafana endpoint.
          The provisioning and Actions metrics emitter already live in <code>monitoring/</code> and
          <code>scripts/push_build_metrics.py</code>.
        </div>
      </div>
    </section>
    """
    dashboards = dashboard_catalog(config.grafana)
    # Card links open the dashboards in a new tab (embed=False).
    cards = "".join(
        f"""
        <a class="kcm-panel-link" href="{html.escape(build_dashboard_url(config.grafana, dashboard.uid, embed=False))}" target="_blank">
          <div class="kcm-panel-label">Grafana</div>
          <div class="kcm-panel-title">{html.escape(dashboard.title)}</div>
          <div class="kcm-panel-copy">{html.escape(dashboard.description)}</div>
        </a>
        """
        for dashboard in dashboards
    )
    # Iframe embeds use embed=True (kiosk mode) for in-page display.
    embeds = "".join(
        f"""
        <div class="kcm-frame">
          <div class="kcm-frame-head">
            <div>
              <div class="kcm-frame-title">{html.escape(dashboard.title)}</div>
              <div class="kcm-frame-copy">{html.escape(dashboard.description)}</div>
            </div>
            <a class="kcm-open" href="{html.escape(build_dashboard_url(config.grafana, dashboard.uid, embed=False))}" target="_blank">Open in Grafana</a>
          </div>
          <iframe src="{html.escape(build_dashboard_url(config.grafana, dashboard.uid, embed=True))}" height="{dashboard.height}" loading="lazy"></iframe>
        </div>
        """
        for dashboard in dashboards
    )
    return f"""
    <section class="kcm-section">
      <h2 class="kcm-section-title">Metrics + trends</h2>
      <div class="kcm-graphs">{cards}</div>
      {embeds}
    </section>
    """
994
+
995
+
996
def render_page(snapshot: DashboardSnapshot, config: AppConfig) -> str:
    """Render the full dashboard page HTML for one snapshot.

    Assembles the hero header (meta cards + summary stats), toolbar, kernel
    table, Grafana section, the modal overlay shell, and one hidden modal per
    kernel row.
    """
    summary = snapshot.summary
    meta_cards = "".join(
        [
            f'<div class="kcm-meta-card"><div class="kcm-meta-card-label">Source repo</div><div class="kcm-meta-card-value">{html.escape(config.github.repo_slug)}</div></div>',
            f'<div class="kcm-meta-card"><div class="kcm-meta-card-label">GitHub scans</div><div class="kcm-meta-card-value">{html.escape(str(config.monitor.workflow_run_pages))} pages x {html.escape(str(config.monitor.workflow_run_page_size))} runs</div></div>',
            f'<div class="kcm-meta-card"><div class="kcm-meta-card-label">Grafana</div><div class="kcm-meta-card-value">{html.escape(config.grafana.base_url or "not configured")}</div></div>',
        ]
    )
    stats = "".join(
        f'<div class="kcm-stat"><div class="kcm-stat-label">{label}</div><div class="kcm-stat-value">{value}</div></div>'
        for label, value in [
            ("Kernels", summary.tracked_kernels),
            ("Active", summary.active_builds),
            ("Uploading", summary.uploading_builds),
            ("Stalled", summary.stalled_builds),
            ("Failed", summary.failed_builds),
        ]
    )
    rows_html = "".join(_render_kernel_row(row, idx, config) for idx, row in enumerate(snapshot.kernel_rows))
    errors_html = ""
    if snapshot.errors:
        # Surface at most the first three scan errors inline in the toolbar.
        errors_html = f' | <span style="color:var(--bad)">{html.escape("; ".join(snapshot.errors[:3]))}</span>'

    return f"""
    <div class="kcm-shell">
      <section class="kcm-hero">
        <div class="kcm-eyebrow">Kernels community observatory</div>
        <h1>Kernel CI Command Center.</h1>
        <p>
          Every kernel source directory in <code>{html.escape(config.github.repo_slug)}</code> is enumerated from the repo tree,
          then matched to its latest release and manual-upload GitHub Actions runs. Variant-level job status stays visible, and
          Grafana handles the longer-term duration and failure telemetry.
        </p>
        <div class="kcm-meta">{meta_cards}</div>
        <div class="kcm-stats">{stats}</div>
      </section>

      <div class="kcm-toolbar">
        <div class="kcm-toolbar-left">
          Refreshed <code>{html.escape(_dt(snapshot.generated_at))}</code> | <code>{len(snapshot.kernel_rows)}</code> kernels{errors_html}
        </div>
        <div class="kcm-toolbar-right">
          <input class="kcm-search" type="text" placeholder="Filter kernel or workflow..." />
          <select class="kcm-status-filter">
            <option value="all">All states</option>
            <option value="running">Running</option>
            <option value="uploading">Uploading</option>
            <option value="stalled">Stalled</option>
            <option value="failed">Failed</option>
            <option value="completed">Completed</option>
            <option value="idle">Idle</option>
          </select>
        </div>
      </div>

      <section class="kcm-table-shell">
        <div class="kcm-table-wrap">
          <table class="kcm-table" id="kernelTable">
            <thead>
              <tr>
                <th>Kernel dir</th>
                <th>Latest release build</th>
                <th>Latest manual upload</th>
                <th>Latest activity</th>
                <th>Actions</th>
              </tr>
            </thead>
            <tbody>{rows_html}</tbody>
          </table>
        </div>
      </section>

      {_render_graph_section(config)}
    </div>
    <div class="kcm-overlay" id="kcmOverlay">
      <div class="kcm-modal" id="kcmModal"></div>
    </div>
    {"".join(_render_hidden_modal(row, idx, config) for idx, row in enumerate(snapshot.kernel_rows))}
    """
1076
+
1077
+
1078
# Placeholder page shown before the first snapshot has been fetched and rendered.
LOADING_HTML = """
<div class="kcm-shell">
  <section class="kcm-hero">
    <div class="kcm-eyebrow">Kernels community observatory</div>
    <h1>Booting the kernel CI command center...</h1>
    <p>The first load walks the kernel catalog and scans the latest GitHub Actions runs, so it can take a few seconds.</p>
  </section>
</div>
"""
1087
+
1088
+
1089
def build_dashboard(service: MonitorService, config: AppConfig) -> gr.Blocks:
    """Assemble the Gradio app: refresh button, rendered HTML page, auto-refresh timer."""
    with gr.Blocks() as demo:
        # First tick fires after a fixed 8 seconds so the initial data load
        # happens shortly after the placeholder page is shown; each handler
        # then re-arms the timer at the configured refresh interval.
        refresh_timer = gr.Timer(value=8, active=True)
        # Flips to True after the first successful render; later ticks may
        # serve the service-side cached snapshot.
        loaded_state = gr.State(False)

        with gr.Row():
            refresh_btn = gr.Button("Refresh now", variant="primary", scale=0, min_width=160)

        page_html = gr.HTML(value=LOADING_HTML)

        def refresh(_=None):
            # Manual refresh always bypasses the snapshot cache.
            snapshot = service.get_snapshot(force_refresh=True)
            return render_page(snapshot, config), True, gr.Timer(value=config.monitor.refresh_interval_seconds, active=True)

        def tick_refresh(loaded):
            # Only the very first tick (loaded=False) forces a refresh.
            snapshot = service.get_snapshot(force_refresh=not loaded)
            return render_page(snapshot, config), True, gr.Timer(value=config.monitor.refresh_interval_seconds, active=True)

        refresh_btn.click(refresh, outputs=[page_html, loaded_state, refresh_timer])
        refresh_timer.tick(tick_refresh, inputs=[loaded_state], outputs=[page_html, loaded_state, refresh_timer])

    return demo
tests/conftest.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
from __future__ import annotations

import sys
from pathlib import Path


# Tests import "kc_monitor" from the repository's src/ layout without
# installing the package, so prepend <repo root>/src to sys.path once.
ROOT_DIR = Path(__file__).resolve().parents[1]
SRC_DIR = ROOT_DIR / "src"
if str(SRC_DIR) not in sys.path:
    sys.path.insert(0, str(SRC_DIR))
tests/fixtures/active_build_job.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "id": 66947931664,
3
+ "run_id": 23049830725,
4
+ "workflow_name": "Build Release",
5
+ "head_branch": "tiny-build-fix",
6
+ "run_url": "https://api.github.com/repos/huggingface/kernels-community/actions/runs/23049830725",
7
+ "run_attempt": 1,
8
+ "head_sha": "ca745cc4e08039817fc47d780f7dd3126187a6d6",
9
+ "url": "https://api.github.com/repos/huggingface/kernels-community/actions/jobs/66947931664",
10
+ "html_url": "https://github.com/huggingface/kernels-community/actions/runs/23049830725/job/66947931664",
11
+ "status": "in_progress",
12
+ "conclusion": null,
13
+ "created_at": "2026-03-22T10:00:00Z",
14
+ "started_at": "2026-03-22T10:00:10Z",
15
+ "completed_at": null,
16
+ "name": "build-kernel (aarch64-linux, aws-r8g-8xl-plus-nix)",
17
+ "steps": [
18
+ {
19
+ "name": "Set up job",
20
+ "status": "completed",
21
+ "conclusion": "success",
22
+ "number": 1,
23
+ "started_at": "2026-03-22T10:00:10Z",
24
+ "completed_at": "2026-03-22T10:00:12Z"
25
+ },
26
+ {
27
+ "name": "Validate kernel directory",
28
+ "status": "completed",
29
+ "conclusion": "success",
30
+ "number": 6,
31
+ "started_at": "2026-03-22T10:00:30Z",
32
+ "completed_at": "2026-03-22T10:00:31Z"
33
+ },
34
+ {
35
+ "name": "Build and upload kernel",
36
+ "status": "in_progress",
37
+ "conclusion": null,
38
+ "number": 7,
39
+ "started_at": "2026-03-22T10:01:00Z",
40
+ "completed_at": null
41
+ }
42
+ ],
43
+ "runner_name": "aws-r8g-8xl-plus-nix-runner",
44
+ "runner_group_name": "aws-r8g-8xl-plus-nix"
45
+ }
tests/fixtures/build_release_run.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "id": 23049830725,
3
+ "name": "Build Release",
4
+ "display_title": "sgl-flash-attn3: upload path sanity",
5
+ "path": ".github/workflows/build-release.yaml",
6
+ "status": "in_progress",
7
+ "conclusion": null,
8
+ "head_branch": "tiny-build-fix",
9
+ "head_sha": "ca745cc4e08039817fc47d780f7dd3126187a6d6",
10
+ "event": "pull_request",
11
+ "html_url": "https://github.com/huggingface/kernels-community/actions/runs/23049830725",
12
+ "jobs_url": "https://api.github.com/repos/huggingface/kernels-community/actions/runs/23049830725/jobs",
13
+ "created_at": "2026-03-22T10:00:00Z",
14
+ "updated_at": "2026-03-22T14:20:00Z",
15
+ "run_started_at": "2026-03-22T10:00:00Z",
16
+ "actor": {
17
+ "login": "adarshxs"
18
+ }
19
+ }
tests/fixtures/failed_build_job.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "id": 66947931666,
3
+ "run_id": 23049830726,
4
+ "workflow_name": "Build Release",
5
+ "head_branch": "repo-id-bug",
6
+ "run_url": "https://api.github.com/repos/huggingface/kernels-community/actions/runs/23049830726",
7
+ "run_attempt": 1,
8
+ "head_sha": "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb",
9
+ "url": "https://api.github.com/repos/huggingface/kernels-community/actions/jobs/66947931666",
10
+ "html_url": "https://github.com/huggingface/kernels-community/actions/runs/23049830726/job/66947931666",
11
+ "status": "completed",
12
+ "conclusion": "failure",
13
+ "created_at": "2026-03-21T10:00:00Z",
14
+ "started_at": "2026-03-21T10:00:10Z",
15
+ "completed_at": "2026-03-21T10:26:08Z",
16
+ "name": "build-kernel (aarch64-linux, aws-r8g-8xl-plus-nix)",
17
+ "steps": [
18
+ {
19
+ "name": "Set up job",
20
+ "status": "completed",
21
+ "conclusion": "success",
22
+ "number": 1,
23
+ "started_at": "2026-03-21T10:00:10Z",
24
+ "completed_at": "2026-03-21T10:00:12Z"
25
+ },
26
+ {
27
+ "name": "Validate kernel directory",
28
+ "status": "completed",
29
+ "conclusion": "success",
30
+ "number": 6,
31
+ "started_at": "2026-03-21T10:00:30Z",
32
+ "completed_at": "2026-03-21T10:00:31Z"
33
+ },
34
+ {
35
+ "name": "Build and upload kernel",
36
+ "status": "completed",
37
+ "conclusion": "failure",
38
+ "number": 7,
39
+ "started_at": "2026-03-21T10:01:00Z",
40
+ "completed_at": "2026-03-21T10:26:08Z"
41
+ }
42
+ ],
43
+ "runner_name": "aws-r8g-8xl-plus-nix-runner",
44
+ "runner_group_name": "aws-r8g-8xl-plus-nix"
45
+ }
tests/fixtures/failed_build_run.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "id": 23049830726,
3
+ "name": "Build Release",
4
+ "display_title": "sgl-flash-attn3: repo id regression",
5
+ "path": ".github/workflows/build-release.yaml",
6
+ "status": "completed",
7
+ "conclusion": "failure",
8
+ "head_branch": "repo-id-bug",
9
+ "head_sha": "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb",
10
+ "event": "pull_request",
11
+ "html_url": "https://github.com/huggingface/kernels-community/actions/runs/23049830726",
12
+ "jobs_url": "https://api.github.com/repos/huggingface/kernels-community/actions/runs/23049830726/jobs",
13
+ "created_at": "2026-03-21T10:00:00Z",
14
+ "updated_at": "2026-03-21T10:26:08Z",
15
+ "run_started_at": "2026-03-21T10:00:00Z",
16
+ "actor": {
17
+ "login": "adarshxs"
18
+ }
19
+ }
tests/fixtures/manual_build_run.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "id": 23049830727,
3
+ "name": "Manual Kernel Build",
4
+ "display_title": "Manual Kernel Build / flash-attn3 / target=main / request=manual",
5
+ "path": ".github/workflows/manual-build-upload.yaml",
6
+ "status": "completed",
7
+ "conclusion": "success",
8
+ "head_branch": "manual-test",
9
+ "head_sha": "cccccccccccccccccccccccccccccccccccccccc",
10
+ "event": "workflow_dispatch",
11
+ "html_url": "https://github.com/huggingface/kernels-community/actions/runs/23049830727",
12
+ "jobs_url": "https://api.github.com/repos/huggingface/kernels-community/actions/runs/23049830727/jobs",
13
+ "created_at": "2026-03-21T14:00:00Z",
14
+ "updated_at": "2026-03-21T15:01:00Z",
15
+ "run_started_at": "2026-03-21T14:00:00Z",
16
+ "actor": {
17
+ "login": "adarshxs"
18
+ }
19
+ }
tests/fixtures/manual_upload_job.json ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "id": 66947931668,
3
+ "run_id": 23049830727,
4
+ "workflow_name": "Manual Kernel Build",
5
+ "head_branch": "manual-test",
6
+ "run_url": "https://api.github.com/repos/huggingface/kernels-community/actions/runs/23049830727",
7
+ "run_attempt": 1,
8
+ "head_sha": "cccccccccccccccccccccccccccccccccccccccc",
9
+ "url": "https://api.github.com/repos/huggingface/kernels-community/actions/jobs/66947931668",
10
+ "html_url": "https://github.com/huggingface/kernels-community/actions/runs/23049830727/job/66947931668",
11
+ "status": "completed",
12
+ "conclusion": "success",
13
+ "created_at": "2026-03-21T14:00:00Z",
14
+ "started_at": "2026-03-21T14:00:10Z",
15
+ "completed_at": "2026-03-21T15:01:00Z",
16
+ "name": "build-and-upload",
17
+ "steps": [
18
+ {
19
+ "name": "Set up job",
20
+ "status": "completed",
21
+ "conclusion": "success",
22
+ "number": 1,
23
+ "started_at": "2026-03-21T14:00:10Z",
24
+ "completed_at": "2026-03-21T14:00:12Z"
25
+ },
26
+ {
27
+ "name": "Validate kernel directory",
28
+ "status": "completed",
29
+ "conclusion": "success",
30
+ "number": 6,
31
+ "started_at": "2026-03-21T14:00:30Z",
32
+ "completed_at": "2026-03-21T14:00:31Z"
33
+ },
34
+ {
35
+ "name": "Build and copy kernel",
36
+ "status": "completed",
37
+ "conclusion": "success",
38
+ "number": 7,
39
+ "started_at": "2026-03-21T14:01:00Z",
40
+ "completed_at": "2026-03-21T14:45:00Z"
41
+ },
42
+ {
43
+ "name": "Upload kernel",
44
+ "status": "completed",
45
+ "conclusion": "success",
46
+ "number": 8,
47
+ "started_at": "2026-03-21T14:45:10Z",
48
+ "completed_at": "2026-03-21T15:01:00Z"
49
+ }
50
+ ],
51
+ "runner_name": "aws-highmemory-32-plus-nix-runner",
52
+ "runner_group_name": "aws-highmemory-32-plus-nix"
53
+ }
tests/test_grafana.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from kc_monitor.config import GrafanaSettings
4
+ from kc_monitor.grafana import build_dashboard_url, dashboard_catalog
5
+
6
+
7
def test_dashboard_catalog_uses_configured_uids() -> None:
    """The catalog must list dashboards in overview/durations/failures order with the configured UIDs."""
    settings = GrafanaSettings(
        base_url="https://grafana.example.com",
        overview_dashboard_uid="overview-uid",
        duration_dashboard_uid="durations-uid",
        failure_dashboard_uid="failures-uid",
    )

    dashboards = dashboard_catalog(settings)

    assert [dashboard.uid for dashboard in dashboards] == [
        "overview-uid",
        "durations-uid",
        "failures-uid",
    ]
22
+
23
+
24
def test_build_dashboard_url_supports_embed_mode() -> None:
    """embed=True must append kiosk=tv; both modes share the base query string.

    Also checks that a trailing slash on base_url is normalised away.
    """
    settings = GrafanaSettings(
        base_url="https://grafana.example.com/",
        org_id=7,
        theme="light",
        default_from="now-7d",
        default_to="now",
        default_refresh="30s",
    )

    embed_url = build_dashboard_url(settings, "overview-uid", embed=True)
    full_url = build_dashboard_url(settings, "overview-uid", embed=False)

    assert embed_url == (
        "https://grafana.example.com/d/overview-uid/_?"
        "orgId=7&from=now-7d&to=now&theme=light&refresh=30s&kiosk=tv"
    )
    assert full_url == (
        "https://grafana.example.com/d/overview-uid/_?"
        "orgId=7&from=now-7d&to=now&theme=light&refresh=30s"
    )
tests/test_log_parser.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from pathlib import Path
5
+
6
+ from kc_monitor.log_parser import JobLogParser
7
+ from kc_monitor.models import GitHubJob, GitHubRun
8
+
9
+
10
+ FIXTURES_DIR = Path(__file__).parent / "fixtures"
11
+
12
+
13
+ def load_json_fixture(name: str) -> dict:
14
+ return json.loads((FIXTURES_DIR / name).read_text(encoding="utf-8"))
15
+
16
+
17
+ def load_text_fixture(name: str) -> str:
18
+ return (FIXTURES_DIR / name).read_text(encoding="utf-8")
19
+
20
+
21
+ def test_parser_detects_upload_in_progress_from_combined_step() -> None:
22
+ run = GitHubRun.from_api(load_json_fixture("build_release_run.json"))
23
+ job = GitHubJob.from_api(load_json_fixture("active_build_job.json"))
24
+
25
+ parsed = JobLogParser().parse(run, job, load_text_fixture("running_build_upload.log"))
26
+
27
+ assert parsed.phase == "uploading"
28
+ assert parsed.upload_status == "running"
29
+ assert parsed.repo_id == "kernels-community/sgl-flash-attn3"
30
+ assert parsed.latest_log_at is not None
31
+
32
+
33
+ def test_parser_keeps_upload_not_started_when_build_fails_first() -> None:
34
+ run = GitHubRun.from_api(load_json_fixture("failed_build_run.json"))
35
+ job = GitHubJob.from_api(load_json_fixture("failed_build_job.json"))
36
+
37
+ parsed = JobLogParser().parse(run, job, load_text_fixture("failed_build.log"))
38
+
39
+ assert parsed.phase == "failed"
40
+ assert parsed.upload_status == "not_started"
41
+ assert "Mandatory repo-id is missing" in (parsed.failure_excerpt or "")
42
+
43
+
44
+ def test_parser_marks_manual_upload_as_completed() -> None:
45
+ run = GitHubRun.from_api(load_json_fixture("manual_build_run.json"))
46
+ job = GitHubJob.from_api(load_json_fixture("manual_upload_job.json"))
47
+
48
+ parsed = JobLogParser().parse(run, job, load_text_fixture("manual_upload_success.log"))
49
+
50
+ assert parsed.phase == "upload_complete"
51
+ assert parsed.upload_status == "completed"
52
+ assert parsed.repo_id == "kernels-community/flash-attn3"
tests/test_metrics_push.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from kc_monitor.metrics_push import (
4
+ BuildMetricSample,
5
+ build_pushgateway_url,
6
+ format_prometheus_metrics,
7
+ )
8
+
9
+
10
def test_build_metric_sample_uses_matrix_labels_and_duration() -> None:
    """from_env maps KCM_* matrix vars to grouping labels and derives duration."""
    env = {
        "KCM_JOB_STATUS": "failure",
        "KCM_JOB_STARTED_AT": "100",
        "KCM_KERNEL": "flash-attn3",
        "KCM_BACKEND": "cuda",
        "KCM_COMPUTE_BACKEND": "triton",
        "KCM_CUDA_VERSION": "12.4",
        "KCM_PYTORCH_VERSION": "2.5.1",
        "KCM_PYTHON_VERSION": "3.11",
        "GITHUB_REPOSITORY": "huggingface/kernels-community",
        "GITHUB_WORKFLOW": "Build Release",
        "GITHUB_REF_NAME": "main",
        "GITHUB_JOB": "build_kernel",
        "RUNNER_OS": "Linux",
        "RUNNER_ARCH": "X64",
    }

    sample = BuildMetricSample.from_env(env, completed_at_seconds=145)

    expected_grouping = {
        "kernel": "flash-attn3",
        "backend": "cuda",
        "compute_backend": "triton",
        "cuda_version": "12.4",
        "pytorch_version": "2.5.1",
        "python_version": "3.11",
    }
    assert sample.grouping_key == expected_grouping
    assert sample.metric_labels["repository"] == "huggingface/kernels-community"
    assert sample.result == "failure"
    assert sample.result_code == 2
    assert sample.failed == 1
    # completed_at_seconds (145) minus KCM_JOB_STARTED_AT (100).
    assert sample.duration_seconds == 45.0
44
+
45
+
46
def test_build_pushgateway_url_is_stable_per_matrix_combo() -> None:
    """Grouping-key labels are appended to the job URL in a fixed order."""
    grouping = {
        "kernel": "flash-attn3",
        "backend": "cuda",
        "compute_backend": "triton",
        "cuda_version": "12.4",
        "pytorch_version": "2.5.1",
        "python_version": "3.11",
    }

    url = build_pushgateway_url(
        "http://pushgateway:9091",
        "kernels-community-build-matrix",
        grouping,
    )

    expected = (
        "http://pushgateway:9091/metrics/job/kernels-community-build-matrix/"
        "kernel/flash-attn3/backend/cuda/compute_backend/triton/cuda_version/12.4/"
        "pytorch_version/2.5.1/python_version/3.11"
    )
    assert url == expected
65
+
66
+
67
def test_prometheus_payload_contains_expected_metrics() -> None:
    """The exposition payload carries every kc_build_* metric plus labels/values."""
    env = {
        "KCM_JOB_STATUS": "success",
        "KCM_BUILD_DURATION_SECONDS": "12.5",
        "KCM_KERNEL": "flash-attn3",
        "KCM_BACKEND": "cuda",
        "KCM_COMPUTE_BACKEND": "triton",
        "KCM_CUDA_VERSION": "12.4",
        "KCM_PYTORCH_VERSION": "2.5.1",
        "KCM_PYTHON_VERSION": "3.11",
        "GITHUB_REPOSITORY": "huggingface/kernels-community",
        "GITHUB_WORKFLOW": "Build Release",
        "GITHUB_REF_NAME": "main",
        "GITHUB_JOB": "build_kernel",
        "RUNNER_OS": "Linux",
        "RUNNER_ARCH": "X64",
    }
    sample = BuildMetricSample.from_env(env, completed_at_seconds=1700000000)

    payload = format_prometheus_metrics(sample)

    expected_fragments = (
        "kc_build_last_run_result_code",
        "kc_build_last_run_failed",
        "kc_build_last_run_duration_seconds",
        "kc_build_last_run_timestamp_seconds",
        'result="success"',
        "12.500",
        "1700000000",
    )
    for fragment in expected_fragments:
        assert fragment in payload
tests/test_service.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from pathlib import Path
5
+
6
+ from kc_monitor.config import AppConfig
7
+ from kc_monitor.models import GitHubJob, GitHubRun
8
+ from kc_monitor.service import MonitorService
9
+
10
+
11
+ FIXTURES_DIR = Path(__file__).parent / "fixtures"
12
+
13
+
14
def load_json_fixture(name: str) -> dict:
    """Parse and return the JSON fixture stored under *name*."""
    fixture_path = FIXTURES_DIR / name
    with fixture_path.open(encoding="utf-8") as handle:
        return json.load(handle)
16
+
17
+
18
def load_text_fixture(name: str) -> str:
    """Return the raw text content of the fixture file *name*."""
    fixture_path = FIXTURES_DIR / name
    return fixture_path.read_text(encoding="utf-8")
20
+
21
+
22
class FakeGitHubClient:
    """In-memory stand-in for the real GitHub client used by MonitorService.

    Serves canned workflow runs, jobs, logs, build.toml files, and repo tree
    paths loaded from the local JSON/text fixtures, so service tests run
    entirely offline while exercising the same client interface.
    """

    def __init__(self) -> None:
        # One run per fixture scenario: active build, failed build, manual upload.
        self.runs = [
            GitHubRun.from_api(load_json_fixture("build_release_run.json")),
            GitHubRun.from_api(load_json_fixture("failed_build_run.json")),
            GitHubRun.from_api(load_json_fixture("manual_build_run.json")),
        ]
        # run id -> jobs for that run; ids must match the run fixtures above.
        self.jobs = {
            23049830725: [GitHubJob.from_api(load_json_fixture("active_build_job.json"))],
            23049830726: [GitHubJob.from_api(load_json_fixture("failed_build_job.json"))],
            23049830727: [GitHubJob.from_api(load_json_fixture("manual_upload_job.json"))],
        }
        # job id -> raw log text returned by get_job_logs.
        self.logs = {
            66947931664: load_text_fixture("running_build_upload.log"),
            66947931666: load_text_fixture("failed_build.log"),
            66947931668: load_text_fixture("manual_upload_success.log"),
        }
        # repo path -> build.toml content served by get_file_text.
        self.build_toml = {
            "sgl-flash-attn3/build.toml": """
[general]
name = "sgl-flash-attn3"
version = 1
backends = ["cuda"]

[general.hub]
repo-id = "kernels-community/sgl-flash-attn3"
""".strip(),
            "flash-attn3/build.toml": """
[general]
name = "flash-attn3"
version = 1
backends = ["cuda"]

[general.hub]
repo-id = "kernels-community/flash-attn3"
""".strip(),
        }
        # deep-gemm appears in the tree but has no content entry above, so
        # get_file_text returns None for it.
        self.tree_paths = [
            "sgl-flash-attn3/build.toml",
            "flash-attn3/build.toml",
            "deep-gemm/build.toml",
        ]

    def close(self) -> None:
        # Nothing to release; present only to mirror the real client's interface.
        return None

    def list_runs(self, per_page: int = 30, page: int = 1) -> list[GitHubRun]:
        # Pagination is simplified: `page` is ignored, only per_page is honored.
        return self.runs[:per_page]

    def list_workflow_runs(
        self,
        workflow_file: str,
        per_page: int = 30,
        page: int = 1,
    ) -> list[GitHubRun]:
        # Filter by workflow path suffix, mimicking the per-workflow endpoint.
        return [r for r in self.runs if r.path.endswith(workflow_file)][:per_page]

    def list_jobs(self, run_id: int) -> list[GitHubJob]:
        # Raises KeyError for unknown run ids, surfacing bad fixture wiring.
        return self.jobs[run_id]

    def get_job_logs(
        self,
        job_id: int,
        line_limit: int = 400,
        char_limit: int = 35000,
        job_html_url: str | None = None,
    ) -> str:
        # Truncation limits are ignored; the fixture logs are already small.
        return self.logs[job_id]

    def get_file_text(self, path: str, ref: str | None = None) -> str | None:
        # None for paths without canned content (e.g. deep-gemm/build.toml).
        return self.build_toml.get(path)

    def list_repo_tree_paths(self, ref: str = "main") -> list[str]:
        return self.tree_paths
96
+
97
+
98
def test_service_builds_summary_and_records() -> None:
    """End-to-end snapshot assembly over the canned runs, jobs, and logs."""
    raw_config = {
        "github": {
            "owner": "huggingface",
            "repo": "kernels-community",
            "branch": "main",
            "per_page": 10,
            "workflows": [
                {
                    "path": ".github/workflows/build-release.yaml",
                    "label": "Build Release",
                    "enabled": True,
                },
                {
                    "path": ".github/workflows/manual-build-upload.yaml",
                    "label": "Manual Kernel Build",
                    "enabled": True,
                },
            ],
        },
        "monitor": {
            "recent_completed_hours": 400,
            "critical_kernels": ["flash-attn3", "sgl-flash-attn3"],
            "snapshot_ttl_seconds": 1,
        },
    }
    config = AppConfig.model_validate(raw_config)

    service = MonitorService(config, client=FakeGitHubClient())
    snapshot = service.get_snapshot(force_refresh=True)

    summary = snapshot.summary
    assert summary.active_builds == 1
    assert summary.completed_uploads == 1
    assert summary.failed_builds == 1
    assert summary.uploading_builds == 1
    assert summary.tracked_kernels == 3

    assert len(snapshot.active_records) == 1
    assert len(snapshot.kernel_rows) == 3

    active = snapshot.active_records[0]
    assert active.kernel_name == "sgl-flash-attn3"
    assert active.critical is True

    assert snapshot.kernel_rows[0].kernel_name == "sgl-flash-attn3"
    assert snapshot.kernel_rows[-1].kernel_name == "deep-gemm"

    upload_statuses = [record.upload_status for record in snapshot.recent_records]
    phases = [record.phase for record in snapshot.recent_records]
    assert "completed" in upload_statuses
    assert "failed" in phases
143
+
144
+
145
def test_service_normalizes_public_jobs_without_steps() -> None:
    """A job returned with no steps gets a single combined build/upload step."""
    workflow_run = GitHubRun.from_api(load_json_fixture("build_release_run.json"))
    stepless_job = GitHubJob.from_api(load_json_fixture("active_build_job.json"))
    stepless_job.steps = []

    normalized = MonitorService._normalize_job(workflow_run, stepless_job)

    step_names = [step.name for step in normalized.steps]
    assert step_names == ["Build and upload kernel"]