Upload folder using huggingface_hub
Browse files
- .dockerignore +14 -0
- .env.example +19 -0
- .gitignore +11 -0
- Dockerfile +17 -0
- README.md +212 -12
- app.py +31 -0
- config/monitor.yaml +51 -0
- monitoring/docker-compose.yml +45 -0
- monitoring/github-actions-post-job.yml +28 -0
- monitoring/grafana/dashboards/build-duration-trends.json +474 -0
- monitoring/grafana/dashboards/build-failure-overview.json +500 -0
- monitoring/grafana/dashboards/build-matrix-overview.json +593 -0
- monitoring/grafana/provisioning/dashboards/dashboards.yml +11 -0
- monitoring/grafana/provisioning/datasources/prometheus.yml +10 -0
- monitoring/prometheus/prometheus.yml +18 -0
- monitoring/prometheus/rules/build-alerts.yml +20 -0
- requirements-dev.txt +3 -0
- requirements.txt +9 -0
- scripts/bootstrap_space.py +150 -0
- scripts/push_build_metrics.py +48 -0
- scripts/smoke_check.py +63 -0
- src/kc_monitor/__init__.py +5 -0
- src/kc_monitor/config.py +190 -0
- src/kc_monitor/github_client.py +456 -0
- src/kc_monitor/grafana.py +65 -0
- src/kc_monitor/kernel_index.py +108 -0
- src/kc_monitor/log_parser.py +216 -0
- src/kc_monitor/metrics_push.py +190 -0
- src/kc_monitor/models.py +342 -0
- src/kc_monitor/service.py +572 -0
- src/kc_monitor/stall_detector.py +48 -0
- src/kc_monitor/ui.py +1110 -0
- tests/conftest.py +10 -0
- tests/fixtures/active_build_job.json +45 -0
- tests/fixtures/build_release_run.json +19 -0
- tests/fixtures/failed_build_job.json +45 -0
- tests/fixtures/failed_build_run.json +19 -0
- tests/fixtures/manual_build_run.json +19 -0
- tests/fixtures/manual_upload_job.json +53 -0
- tests/test_grafana.py +44 -0
- tests/test_log_parser.py +52 -0
- tests/test_metrics_push.py +96 -0
- tests/test_service.py +152 -0
.dockerignore
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.git
|
| 2 |
+
.gitignore
|
| 3 |
+
.env
|
| 4 |
+
.venv
|
| 5 |
+
venv
|
| 6 |
+
__pycache__
|
| 7 |
+
.pytest_cache
|
| 8 |
+
.ruff_cache
|
| 9 |
+
*.pyc
|
| 10 |
+
*.pyo
|
| 11 |
+
*.pyd
|
| 12 |
+
.cursor
|
| 13 |
+
tests
|
| 14 |
+
requirements-dev.txt
|
.env.example
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
GITHUB_TOKEN=your_github_token_here
|
| 2 |
+
HF_TOKEN=your_huggingface_token_here
|
| 3 |
+
KCM_SPACE_ID=adarshxs/kernels-community-monitor
|
| 4 |
+
KCM_GITHUB_OWNER=huggingface
|
| 5 |
+
KCM_GITHUB_REPO=kernels-community
|
| 6 |
+
KCM_GITHUB_BRANCH=main
|
| 7 |
+
KCM_REFRESH_INTERVAL_SECONDS=300
|
| 8 |
+
KCM_WORKFLOW_RUN_PAGE_SIZE=100
|
| 9 |
+
KCM_WORKFLOW_RUN_PAGES=12
|
| 10 |
+
KCM_CRITICAL_KERNELS=flash-attn3,sgl-flash-attn3,flash-attn4,vllm-flash-attn3,deep-gemm
|
| 11 |
+
KCM_GRAFANA_BASE_URL=http://localhost:3000
|
| 12 |
+
KCM_GRAFANA_ORG_ID=1
|
| 13 |
+
KCM_GRAFANA_THEME=dark
|
| 14 |
+
KCM_GRAFANA_OVERVIEW_UID=kernels-build-matrix
|
| 15 |
+
KCM_GRAFANA_DURATION_UID=kernels-build-durations
|
| 16 |
+
KCM_GRAFANA_FAILURE_UID=kernels-build-failures
|
| 17 |
+
KCM_PROMETHEUS_BASE_URL=http://prometheus:9090
|
| 18 |
+
KCM_PUSHGATEWAY_URL=http://pushgateway:9091
|
| 19 |
+
KCM_PUSHGATEWAY_JOB_NAME=kernels-community-build-matrix
|
.gitignore
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__/
|
| 2 |
+
*.py[cod]
|
| 3 |
+
.env
|
| 4 |
+
.venv/
|
| 5 |
+
venv/
|
| 6 |
+
.pytest_cache/
|
| 7 |
+
.ruff_cache/
|
| 8 |
+
.mypy_cache/
|
| 9 |
+
build/
|
| 10 |
+
dist/
|
| 11 |
+
*.log
|
Dockerfile
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim
|
| 2 |
+
|
| 3 |
+
ENV PYTHONDONTWRITEBYTECODE=1 \
|
| 4 |
+
PYTHONUNBUFFERED=1 \
|
| 5 |
+
PORT=7860
|
| 6 |
+
|
| 7 |
+
WORKDIR /app
|
| 8 |
+
|
| 9 |
+
COPY requirements.txt /app/requirements.txt
|
| 10 |
+
RUN pip install --no-cache-dir --upgrade pip && \
|
| 11 |
+
pip install --no-cache-dir -r /app/requirements.txt
|
| 12 |
+
|
| 13 |
+
COPY . /app
|
| 14 |
+
|
| 15 |
+
EXPOSE 7860
|
| 16 |
+
|
| 17 |
+
CMD ["python", "app.py"]
|
README.md
CHANGED
|
@@ -1,12 +1,212 @@
|
|
| 1 |
-
---
|
| 2 |
-
title:
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Kernels Community Monitor
|
| 3 |
+
sdk: gradio
|
| 4 |
+
sdk_version: 6.10.0
|
| 5 |
+
python_version: 3.11
|
| 6 |
+
app_file: app.py
|
| 7 |
+
fullWidth: true
|
| 8 |
+
header: mini
|
| 9 |
+
suggested_hardware: cpu-basic
|
| 10 |
+
short_description: Live kernel build table plus optional Grafana metrics deck.
|
| 11 |
+
tags:
|
| 12 |
+
- monitoring
|
| 13 |
+
- github-actions
|
| 14 |
+
- kernels
|
| 15 |
+
- gradio
|
| 16 |
+
- grafana
|
| 17 |
+
---
|
| 18 |
+
|
| 19 |
+
# Kernels Community Monitor
|
| 20 |
+
|
| 21 |
+
`Kernels Community Monitor` now does two things:
|
| 22 |
+
|
| 23 |
+
1. Enumerates every kernel source dir in `huggingface/kernels-community`, scans the latest GitHub Actions runs, and renders a live per-kernel / per-variant status table with Actions links.
|
| 24 |
+
2. Optionally embeds Grafana dashboards for longer-term metrics once a public Grafana endpoint is configured.
|
| 25 |
+
|
| 26 |
+
The app prefers the GitHub Actions REST API when it can, but it also has a public GitHub HTML fallback for workflow runs, job groups, and `build.toml` reads. That avoids the current `huggingface` org restriction that blocks classic PATs on some Actions endpoints.
|
| 27 |
+
|
| 28 |
+
The metrics path is still in the repo:
|
| 29 |
+
|
| 30 |
+
- GitHub Actions pushes per-matrix build metrics to Pushgateway.
|
| 31 |
+
- Prometheus scrapes Pushgateway and evaluates alert rules.
|
| 32 |
+
- Grafana owns the dashboards, filters, and time-series UI.
|
| 33 |
+
- This Hugging Face Space just presents clean links and embeds for those dashboards.
|
| 34 |
+
|
| 35 |
+
## What Changed
|
| 36 |
+
|
| 37 |
+
The old zero-upstream-change monitor worked, but it had three hard limits:
|
| 38 |
+
|
| 39 |
+
- it depended on GitHub API polling and log scraping
|
| 40 |
+
- it could only infer matrix state indirectly
|
| 41 |
+
- it could not give you clean duration trends or robust alerting without more brittle parsing
|
| 42 |
+
|
| 43 |
+
This cutover replaces that with first-class metrics:
|
| 44 |
+
|
| 45 |
+
- `scripts/push_build_metrics.py` pushes the latest status, duration, and timestamp for each matrix combo.
|
| 46 |
+
- `monitoring/docker-compose.yml` provisions `prometheus`, `pushgateway`, and `grafana`.
|
| 47 |
+
- `monitoring/prometheus/rules/build-alerts.yml` alerts on failing or stale combos.
|
| 48 |
+
- `monitoring/grafana/dashboards/` provides ready dashboards with filters for kernel, backend, compute backend, CUDA, PyTorch, and Python.
|
| 49 |
+
- `src/kc_monitor/ui.py` renders the live kernel matrix table first, then the Grafana deck if configured.
|
| 50 |
+
|
| 51 |
+
## Metrics Model
|
| 52 |
+
|
| 53 |
+
Each matrix combo is stored as a stable Pushgateway grouping key:
|
| 54 |
+
|
| 55 |
+
`kernel + backend + compute_backend + cuda_version + pytorch_version + python_version`
|
| 56 |
+
|
| 57 |
+
Each push updates these gauges:
|
| 58 |
+
|
| 59 |
+
- `kc_build_last_run_result_code`
|
| 60 |
+
- `kc_build_last_run_failed`
|
| 61 |
+
- `kc_build_last_run_duration_seconds`
|
| 62 |
+
- `kc_build_last_run_timestamp_seconds`
|
| 63 |
+
- `kc_build_last_run_info`
|
| 64 |
+
|
| 65 |
+
That gives you:
|
| 66 |
+
|
| 67 |
+
- current per-combo health
|
| 68 |
+
- duration history per combo
|
| 69 |
+
- stale build telemetry detection
|
| 70 |
+
- alert-friendly failure signals
|
| 71 |
+
|
| 72 |
+
## Local Setup
|
| 73 |
+
|
| 74 |
+
Install deps:
|
| 75 |
+
|
| 76 |
+
```bash
|
| 77 |
+
python -m venv .venv
|
| 78 |
+
. .venv/bin/activate
|
| 79 |
+
pip install -r requirements-dev.txt
|
| 80 |
+
```
|
| 81 |
+
|
| 82 |
+
Windows PowerShell activation:
|
| 83 |
+
|
| 84 |
+
```powershell
|
| 85 |
+
python -m venv .venv
|
| 86 |
+
.\.venv\Scripts\Activate.ps1
|
| 87 |
+
pip install -r requirements-dev.txt
|
| 88 |
+
```
|
| 89 |
+
|
| 90 |
+
Create `.env` from `.env.example` and set at least:
|
| 91 |
+
|
| 92 |
+
```env
|
| 93 |
+
HF_TOKEN=...
|
| 94 |
+
```
|
| 95 |
+
|
| 96 |
+
If you want local Grafana too, also set `KCM_GRAFANA_BASE_URL` (in `.env`, or exported in your shell) and start the monitoring stack:
|
| 97 |
+
|
| 98 |
+
```bash
|
| 99 |
+
KCM_GRAFANA_BASE_URL=http://localhost:3000
|
| 100 |
+
docker compose -f monitoring/docker-compose.yml up -d
|
| 101 |
+
```
|
| 102 |
+
|
| 103 |
+
Run the app locally:
|
| 104 |
+
|
| 105 |
+
```bash
|
| 106 |
+
python app.py
|
| 107 |
+
```
|
| 108 |
+
|
| 109 |
+
Run the smoke check:
|
| 110 |
+
|
| 111 |
+
```bash
|
| 112 |
+
python scripts/smoke_check.py
|
| 113 |
+
```
|
| 114 |
+
|
| 115 |
+
Run tests:
|
| 116 |
+
|
| 117 |
+
```bash
|
| 118 |
+
pytest
|
| 119 |
+
```
|
| 120 |
+
|
| 121 |
+
## GitHub Actions Step
|
| 122 |
+
|
| 123 |
+
The actual workflow YAMLs live in the `huggingface/kernels-community` repo, not here.
|
| 124 |
+
|
| 125 |
+
Use `monitoring/github-actions-post-job.yml` as the drop-in snippet. The important bit is:
|
| 126 |
+
|
| 127 |
+
```yaml
|
| 128 |
+
- name: Record matrix job start time
|
| 129 |
+
shell: bash
|
| 130 |
+
run: echo "KCM_JOB_STARTED_AT=$(date +%s)" >> "$GITHUB_ENV"
|
| 131 |
+
|
| 132 |
+
- name: Push matrix build metrics
|
| 133 |
+
if: always()
|
| 134 |
+
shell: bash
|
| 135 |
+
env:
|
| 136 |
+
PUSHGATEWAY_URL: ${{ secrets.PUSHGATEWAY_URL }}
|
| 137 |
+
KCM_PUSHGATEWAY_JOB_NAME: kernels-community-build-matrix
|
| 138 |
+
KCM_JOB_STATUS: ${{ job.status }}
|
| 139 |
+
KCM_KERNEL: ${{ matrix.kernel }}
|
| 140 |
+
KCM_BACKEND: ${{ matrix.backend }}
|
| 141 |
+
KCM_COMPUTE_BACKEND: ${{ matrix.compute_backend }}
|
| 142 |
+
KCM_CUDA_VERSION: ${{ matrix.cuda }}
|
| 143 |
+
KCM_PYTORCH_VERSION: ${{ matrix.torch }}
|
| 144 |
+
KCM_PYTHON_VERSION: ${{ matrix.python }}
|
| 145 |
+
run: python scripts/push_build_metrics.py
|
| 146 |
+
```
|
| 147 |
+
|
| 148 |
+
The emitter is intentionally low-cardinality: it tracks the latest state for each stable combo, which is what you want for Grafana filters and Prometheus alerts without Pushgateway turning into a per-run junk drawer.
|
| 149 |
+
|
| 150 |
+
## Dashboards
|
| 151 |
+
|
| 152 |
+
Provisioned dashboards:
|
| 153 |
+
|
| 154 |
+
- `kernels-build-matrix`
|
| 155 |
+
- `kernels-build-durations`
|
| 156 |
+
- `kernels-build-failures`
|
| 157 |
+
|
| 158 |
+
All of them expose variables for:
|
| 159 |
+
|
| 160 |
+
- kernel
|
| 161 |
+
- backend
|
| 162 |
+
- compute backend
|
| 163 |
+
- CUDA version
|
| 164 |
+
- PyTorch version
|
| 165 |
+
- Python version
|
| 166 |
+
|
| 167 |
+
## Alerting
|
| 168 |
+
|
| 169 |
+
Prometheus rules ship in `monitoring/prometheus/rules/build-alerts.yml`.
|
| 170 |
+
|
| 171 |
+
Current rules:
|
| 172 |
+
|
| 173 |
+
- `KernelsBuildMatrixComboFailing`
|
| 174 |
+
- `KernelsBuildMetricsStale`
|
| 175 |
+
|
| 176 |
+
You can route those through Alertmanager later, but the expression layer is already there.
|
| 177 |
+
|
| 178 |
+
## Runtime Configuration
|
| 179 |
+
|
| 180 |
+
Main env/config knobs:
|
| 181 |
+
|
| 182 |
+
- `KCM_GRAFANA_BASE_URL`
|
| 183 |
+
- `KCM_GRAFANA_ORG_ID`
|
| 184 |
+
- `KCM_GRAFANA_THEME`
|
| 185 |
+
- `KCM_GRAFANA_OVERVIEW_UID`
|
| 186 |
+
- `KCM_GRAFANA_DURATION_UID`
|
| 187 |
+
- `KCM_GRAFANA_FAILURE_UID`
|
| 188 |
+
- `KCM_PROMETHEUS_BASE_URL`
|
| 189 |
+
- `KCM_PUSHGATEWAY_URL`
|
| 190 |
+
- `KCM_PUSHGATEWAY_JOB_NAME`
|
| 191 |
+
|
| 192 |
+
If `KCM_GRAFANA_BASE_URL` is not set, the Space still works: the live GitHub Actions table stays active and the Grafana section renders as a setup card instead of broken embeds.
|
| 193 |
+
|
| 194 |
+
The base YAML config lives at `config/monitor.yaml`. Environment variables override it at runtime.
|
| 195 |
+
|
| 196 |
+
## Deploy To Hugging Face Space
|
| 197 |
+
|
| 198 |
+
This repo still includes a bootstrap script that creates or updates the Space and uploads the current folder.
|
| 199 |
+
|
| 200 |
+
```bash
|
| 201 |
+
python scripts/bootstrap_space.py --space-id adarshxs/kernels-community-monitor
|
| 202 |
+
```
|
| 203 |
+
|
| 204 |
+
What it does:
|
| 205 |
+
|
| 206 |
+
- creates the Space repo if it does not exist
|
| 207 |
+
- uploads this project as a Gradio Space
|
| 208 |
+
- writes the Grafana, Prometheus, and Pushgateway settings into Space variables
|
| 209 |
+
|
| 210 |
+
After upload, the expected Space URL is:
|
| 211 |
+
|
| 212 |
+
- `https://huggingface.co/spaces/adarshxs/kernels-community-monitor`
|
app.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ruff: noqa: E402
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import os
|
| 5 |
+
import sys
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
ROOT_DIR = Path(__file__).resolve().parent
|
| 10 |
+
SRC_DIR = ROOT_DIR / "src"
|
| 11 |
+
if str(SRC_DIR) not in sys.path:
|
| 12 |
+
sys.path.insert(0, str(SRC_DIR))
|
| 13 |
+
|
| 14 |
+
from kc_monitor.config import load_config
|
| 15 |
+
from kc_monitor.service import MonitorService
|
| 16 |
+
from kc_monitor.ui import CSS, PAGE_JS, THEME, build_dashboard
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
config = load_config(ROOT_DIR / "config" / "monitor.yaml")
|
| 20 |
+
service = MonitorService(config)
|
| 21 |
+
demo = build_dashboard(service, config)
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
if __name__ == "__main__":
|
| 25 |
+
demo.launch(
|
| 26 |
+
server_name="0.0.0.0",
|
| 27 |
+
server_port=int(os.getenv("PORT", "7860")),
|
| 28 |
+
theme=THEME,
|
| 29 |
+
css=CSS,
|
| 30 |
+
js=PAGE_JS,
|
| 31 |
+
)
|
config/monitor.yaml
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
github:
|
| 2 |
+
owner: huggingface
|
| 3 |
+
repo: kernels-community
|
| 4 |
+
branch: main
|
| 5 |
+
per_page: 100
|
| 6 |
+
request_timeout_seconds: 25
|
| 7 |
+
workflows:
|
| 8 |
+
- path: .github/workflows/build-release.yaml
|
| 9 |
+
label: Build Release
|
| 10 |
+
enabled: true
|
| 11 |
+
- path: .github/workflows/manual-build-upload.yaml
|
| 12 |
+
label: Manual Kernel Build
|
| 13 |
+
enabled: true
|
| 14 |
+
|
| 15 |
+
monitor:
|
| 16 |
+
refresh_interval_seconds: 300
|
| 17 |
+
snapshot_ttl_seconds: 240
|
| 18 |
+
workflow_run_page_size: 100
|
| 19 |
+
workflow_run_pages: 12
|
| 20 |
+
recent_completed_hours: 336
|
| 21 |
+
recent_limit: 30
|
| 22 |
+
completed_runs_per_workflow: 15
|
| 23 |
+
log_line_limit: 400
|
| 24 |
+
log_char_limit: 35000
|
| 25 |
+
detail_event_limit: 25
|
| 26 |
+
stall_without_log_minutes: 45
|
| 27 |
+
stall_active_phase_minutes: 180
|
| 28 |
+
critical_kernels:
|
| 29 |
+
- flash-attn3
|
| 30 |
+
- sgl-flash-attn3
|
| 31 |
+
- flash-attn4
|
| 32 |
+
- vllm-flash-attn3
|
| 33 |
+
- deep-gemm
|
| 34 |
+
|
| 35 |
+
grafana:
|
| 36 |
+
base_url: null
|
| 37 |
+
org_id: 1
|
| 38 |
+
theme: dark
|
| 39 |
+
default_from: now-30d
|
| 40 |
+
default_to: now
|
| 41 |
+
default_refresh: 5m
|
| 42 |
+
overview_dashboard_uid: kernels-build-matrix
|
| 43 |
+
duration_dashboard_uid: kernels-build-durations
|
| 44 |
+
failure_dashboard_uid: kernels-build-failures
|
| 45 |
+
|
| 46 |
+
prometheus:
|
| 47 |
+
base_url: null
|
| 48 |
+
|
| 49 |
+
pushgateway:
|
| 50 |
+
url: null
|
| 51 |
+
job_name: kernels-community-build-matrix
|
monitoring/docker-compose.yml
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
services:
|
| 2 |
+
prometheus:
|
| 3 |
+
image: prom/prometheus
|
| 4 |
+
command:
|
| 5 |
+
- --config.file=/etc/prometheus/prometheus.yml
|
| 6 |
+
- --web.enable-lifecycle
|
| 7 |
+
ports:
|
| 8 |
+
- "9090:9090"
|
| 9 |
+
volumes:
|
| 10 |
+
- ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
|
| 11 |
+
- ./prometheus/rules:/etc/prometheus/rules:ro
|
| 12 |
+
- prometheus-data:/prometheus
|
| 13 |
+
|
| 14 |
+
pushgateway:
|
| 15 |
+
image: prom/pushgateway
|
| 16 |
+
command:
|
| 17 |
+
- --persistence.file=/data/pushgateway.db
|
| 18 |
+
ports:
|
| 19 |
+
- "9091:9091"
|
| 20 |
+
volumes:
|
| 21 |
+
- pushgateway-data:/data
|
| 22 |
+
|
| 23 |
+
grafana:
|
| 24 |
+
image: grafana/grafana-oss
|
| 25 |
+
depends_on:
|
| 26 |
+
- prometheus
|
| 27 |
+
environment:
|
| 28 |
+
GF_SECURITY_ADMIN_USER: ${GRAFANA_ADMIN_USER:-admin}
|
| 29 |
+
GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_ADMIN_PASSWORD:-admin}
|
| 30 |
+
GF_AUTH_ANONYMOUS_ENABLED: ${GRAFANA_ANONYMOUS_ENABLED:-true}
|
| 31 |
+
GF_AUTH_ANONYMOUS_ORG_ROLE: Viewer
|
| 32 |
+
GF_SECURITY_ALLOW_EMBEDDING: "true"
|
| 33 |
+
GF_DASHBOARDS_MIN_REFRESH_INTERVAL: 10s
|
| 34 |
+
GF_SERVER_ROOT_URL: ${GRAFANA_ROOT_URL:-http://localhost:3000}
|
| 35 |
+
ports:
|
| 36 |
+
- "3000:3000"
|
| 37 |
+
volumes:
|
| 38 |
+
- ./grafana/provisioning:/etc/grafana/provisioning:ro
|
| 39 |
+
- ./grafana/dashboards:/var/lib/grafana/dashboards:ro
|
| 40 |
+
- grafana-data:/var/lib/grafana
|
| 41 |
+
|
| 42 |
+
volumes:
|
| 43 |
+
prometheus-data:
|
| 44 |
+
pushgateway-data:
|
| 45 |
+
grafana-data:
|
monitoring/github-actions-post-job.yml
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
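# Drop these steps into the matrix build job of a workflow in the kernels-community repo; the last step runs scripts/push_build_metrics.py, so that script must be available in the job's checkout.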
|
| 2 |
+
#
|
| 3 |
+
# Example matrix fields expected below:
|
| 4 |
+
# matrix.kernel
|
| 5 |
+
# matrix.backend
|
| 6 |
+
# matrix.compute_backend
|
| 7 |
+
# matrix.cuda
|
| 8 |
+
# matrix.torch
|
| 9 |
+
# matrix.python
|
| 10 |
+
|
| 11 |
+
- name: Record matrix job start time
|
| 12 |
+
shell: bash
|
| 13 |
+
run: echo "KCM_JOB_STARTED_AT=$(date +%s)" >> "$GITHUB_ENV"
|
| 14 |
+
|
| 15 |
+
- name: Push matrix build metrics
|
| 16 |
+
if: always()
|
| 17 |
+
shell: bash
|
| 18 |
+
env:
|
| 19 |
+
PUSHGATEWAY_URL: ${{ secrets.PUSHGATEWAY_URL }}
|
| 20 |
+
KCM_PUSHGATEWAY_JOB_NAME: kernels-community-build-matrix
|
| 21 |
+
KCM_JOB_STATUS: ${{ job.status }}
|
| 22 |
+
KCM_KERNEL: ${{ matrix.kernel }}
|
| 23 |
+
KCM_BACKEND: ${{ matrix.backend }}
|
| 24 |
+
KCM_COMPUTE_BACKEND: ${{ matrix.compute_backend }}
|
| 25 |
+
KCM_CUDA_VERSION: ${{ matrix.cuda }}
|
| 26 |
+
KCM_PYTORCH_VERSION: ${{ matrix.torch }}
|
| 27 |
+
KCM_PYTHON_VERSION: ${{ matrix.python }}
|
| 28 |
+
run: python scripts/push_build_metrics.py
|
monitoring/grafana/dashboards/build-duration-trends.json
ADDED
|
@@ -0,0 +1,474 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"annotations": {
|
| 3 |
+
"list": []
|
| 4 |
+
},
|
| 5 |
+
"editable": true,
|
| 6 |
+
"fiscalYearStartMonth": 0,
|
| 7 |
+
"graphTooltip": 1,
|
| 8 |
+
"links": [],
|
| 9 |
+
"panels": [
|
| 10 |
+
{
|
| 11 |
+
"datasource": {
|
| 12 |
+
"type": "prometheus",
|
| 13 |
+
"uid": "prometheus"
|
| 14 |
+
},
|
| 15 |
+
"fieldConfig": {
|
| 16 |
+
"defaults": {
|
| 17 |
+
"color": {
|
| 18 |
+
"mode": "thresholds"
|
| 19 |
+
},
|
| 20 |
+
"thresholds": {
|
| 21 |
+
"mode": "absolute",
|
| 22 |
+
"steps": [
|
| 23 |
+
{
|
| 24 |
+
"color": "green",
|
| 25 |
+
"value": null
|
| 26 |
+
}
|
| 27 |
+
]
|
| 28 |
+
},
|
| 29 |
+
"unit": "s"
|
| 30 |
+
},
|
| 31 |
+
"overrides": []
|
| 32 |
+
},
|
| 33 |
+
"gridPos": {
|
| 34 |
+
"h": 5,
|
| 35 |
+
"w": 8,
|
| 36 |
+
"x": 0,
|
| 37 |
+
"y": 0
|
| 38 |
+
},
|
| 39 |
+
"id": 1,
|
| 40 |
+
"options": {
|
| 41 |
+
"colorMode": "value",
|
| 42 |
+
"graphMode": "none",
|
| 43 |
+
"justifyMode": "auto",
|
| 44 |
+
"orientation": "auto",
|
| 45 |
+
"reduceOptions": {
|
| 46 |
+
"calcs": [
|
| 47 |
+
"lastNotNull"
|
| 48 |
+
],
|
| 49 |
+
"fields": "",
|
| 50 |
+
"values": false
|
| 51 |
+
},
|
| 52 |
+
"textMode": "value"
|
| 53 |
+
},
|
| 54 |
+
"targets": [
|
| 55 |
+
{
|
| 56 |
+
"datasource": {
|
| 57 |
+
"type": "prometheus",
|
| 58 |
+
"uid": "prometheus"
|
| 59 |
+
},
|
| 60 |
+
"expr": "avg(kc_build_last_run_duration_seconds{kernel=~\"${kernel:regex}\",backend=~\"${backend:regex}\",compute_backend=~\"${compute_backend:regex}\",cuda_version=~\"${cuda_version:regex}\",pytorch_version=~\"${pytorch_version:regex}\",python_version=~\"${python_version:regex}\"})",
|
| 61 |
+
"instant": true,
|
| 62 |
+
"refId": "A"
|
| 63 |
+
}
|
| 64 |
+
],
|
| 65 |
+
"title": "Average current duration",
|
| 66 |
+
"type": "stat"
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"datasource": {
|
| 70 |
+
"type": "prometheus",
|
| 71 |
+
"uid": "prometheus"
|
| 72 |
+
},
|
| 73 |
+
"fieldConfig": {
|
| 74 |
+
"defaults": {
|
| 75 |
+
"color": {
|
| 76 |
+
"mode": "thresholds"
|
| 77 |
+
},
|
| 78 |
+
"thresholds": {
|
| 79 |
+
"mode": "absolute",
|
| 80 |
+
"steps": [
|
| 81 |
+
{
|
| 82 |
+
"color": "green",
|
| 83 |
+
"value": null
|
| 84 |
+
}
|
| 85 |
+
]
|
| 86 |
+
},
|
| 87 |
+
"unit": "s"
|
| 88 |
+
},
|
| 89 |
+
"overrides": []
|
| 90 |
+
},
|
| 91 |
+
"gridPos": {
|
| 92 |
+
"h": 5,
|
| 93 |
+
"w": 8,
|
| 94 |
+
"x": 8,
|
| 95 |
+
"y": 0
|
| 96 |
+
},
|
| 97 |
+
"id": 2,
|
| 98 |
+
"options": {
|
| 99 |
+
"colorMode": "value",
|
| 100 |
+
"graphMode": "none",
|
| 101 |
+
"justifyMode": "auto",
|
| 102 |
+
"orientation": "auto",
|
| 103 |
+
"reduceOptions": {
|
| 104 |
+
"calcs": [
|
| 105 |
+
"lastNotNull"
|
| 106 |
+
],
|
| 107 |
+
"fields": "",
|
| 108 |
+
"values": false
|
| 109 |
+
},
|
| 110 |
+
"textMode": "value"
|
| 111 |
+
},
|
| 112 |
+
"targets": [
|
| 113 |
+
{
|
| 114 |
+
"datasource": {
|
| 115 |
+
"type": "prometheus",
|
| 116 |
+
"uid": "prometheus"
|
| 117 |
+
},
|
| 118 |
+
"expr": "max(kc_build_last_run_duration_seconds{kernel=~\"${kernel:regex}\",backend=~\"${backend:regex}\",compute_backend=~\"${compute_backend:regex}\",cuda_version=~\"${cuda_version:regex}\",pytorch_version=~\"${pytorch_version:regex}\",python_version=~\"${python_version:regex}\"})",
|
| 119 |
+
"instant": true,
|
| 120 |
+
"refId": "A"
|
| 121 |
+
}
|
| 122 |
+
],
|
| 123 |
+
"title": "Slowest current combo",
|
| 124 |
+
"type": "stat"
|
| 125 |
+
},
|
| 126 |
+
{
|
| 127 |
+
"datasource": {
|
| 128 |
+
"type": "prometheus",
|
| 129 |
+
"uid": "prometheus"
|
| 130 |
+
},
|
| 131 |
+
"fieldConfig": {
|
| 132 |
+
"defaults": {
|
| 133 |
+
"color": {
|
| 134 |
+
"mode": "thresholds"
|
| 135 |
+
},
|
| 136 |
+
"thresholds": {
|
| 137 |
+
"mode": "absolute",
|
| 138 |
+
"steps": [
|
| 139 |
+
{
|
| 140 |
+
"color": "green",
|
| 141 |
+
"value": null
|
| 142 |
+
},
|
| 143 |
+
{
|
| 144 |
+
"color": "orange",
|
| 145 |
+
"value": 6
|
| 146 |
+
},
|
| 147 |
+
{
|
| 148 |
+
"color": "red",
|
| 149 |
+
"value": 24
|
| 150 |
+
}
|
| 151 |
+
]
|
| 152 |
+
},
|
| 153 |
+
"unit": "h"
|
| 154 |
+
},
|
| 155 |
+
"overrides": []
|
| 156 |
+
},
|
| 157 |
+
"gridPos": {
|
| 158 |
+
"h": 5,
|
| 159 |
+
"w": 8,
|
| 160 |
+
"x": 16,
|
| 161 |
+
"y": 0
|
| 162 |
+
},
|
| 163 |
+
"id": 3,
|
| 164 |
+
"options": {
|
| 165 |
+
"colorMode": "value",
|
| 166 |
+
"graphMode": "none",
|
| 167 |
+
"justifyMode": "auto",
|
| 168 |
+
"orientation": "auto",
|
| 169 |
+
"reduceOptions": {
|
| 170 |
+
"calcs": [
|
| 171 |
+
"lastNotNull"
|
| 172 |
+
],
|
| 173 |
+
"fields": "",
|
| 174 |
+
"values": false
|
| 175 |
+
},
|
| 176 |
+
"textMode": "value"
|
| 177 |
+
},
|
| 178 |
+
"targets": [
|
| 179 |
+
{
|
| 180 |
+
"datasource": {
|
| 181 |
+
"type": "prometheus",
|
| 182 |
+
"uid": "prometheus"
|
| 183 |
+
},
|
| 184 |
+
"expr": "avg((time() - kc_build_last_run_timestamp_seconds{kernel=~\"${kernel:regex}\",backend=~\"${backend:regex}\",compute_backend=~\"${compute_backend:regex}\",cuda_version=~\"${cuda_version:regex}\",pytorch_version=~\"${pytorch_version:regex}\",python_version=~\"${python_version:regex}\"}) / 3600)",
|
| 185 |
+
"instant": true,
|
| 186 |
+
"refId": "A"
|
| 187 |
+
}
|
| 188 |
+
],
|
| 189 |
+
"title": "Average age of last sample",
|
| 190 |
+
"type": "stat"
|
| 191 |
+
},
|
| 192 |
+
{
|
| 193 |
+
"datasource": {
|
| 194 |
+
"type": "prometheus",
|
| 195 |
+
"uid": "prometheus"
|
| 196 |
+
},
|
| 197 |
+
"fieldConfig": {
|
| 198 |
+
"defaults": {
|
| 199 |
+
"color": {
|
| 200 |
+
"mode": "continuous-BlPu"
|
| 201 |
+
},
|
| 202 |
+
"unit": "s"
|
| 203 |
+
},
|
| 204 |
+
"overrides": []
|
| 205 |
+
},
|
| 206 |
+
"gridPos": {
|
| 207 |
+
"h": 8,
|
| 208 |
+
"w": 24,
|
| 209 |
+
"x": 0,
|
| 210 |
+
"y": 5
|
| 211 |
+
},
|
| 212 |
+
"id": 4,
|
| 213 |
+
"options": {
|
| 214 |
+
"legend": {
|
| 215 |
+
"displayMode": "table",
|
| 216 |
+
"placement": "bottom"
|
| 217 |
+
},
|
| 218 |
+
"tooltip": {
|
| 219 |
+
"mode": "multi"
|
| 220 |
+
}
|
| 221 |
+
},
|
| 222 |
+
"targets": [
|
| 223 |
+
{
|
| 224 |
+
"datasource": {
|
| 225 |
+
"type": "prometheus",
|
| 226 |
+
"uid": "prometheus"
|
| 227 |
+
},
|
| 228 |
+
"expr": "kc_build_last_run_duration_seconds{kernel=~\"${kernel:regex}\",backend=~\"${backend:regex}\",compute_backend=~\"${compute_backend:regex}\",cuda_version=~\"${cuda_version:regex}\",pytorch_version=~\"${pytorch_version:regex}\",python_version=~\"${python_version:regex}\"}",
|
| 229 |
+
"legendFormat": "{{kernel}} | {{backend}} | {{compute_backend}} | CUDA {{cuda_version}} | torch {{pytorch_version}} | py {{python_version}}",
|
| 230 |
+
"refId": "A"
|
| 231 |
+
}
|
| 232 |
+
],
|
| 233 |
+
"title": "Duration trends by combo",
|
| 234 |
+
"type": "timeseries"
|
| 235 |
+
},
|
| 236 |
+
{
|
| 237 |
+
"datasource": {
|
| 238 |
+
"type": "prometheus",
|
| 239 |
+
"uid": "prometheus"
|
| 240 |
+
},
|
| 241 |
+
"fieldConfig": {
|
| 242 |
+
"defaults": {
|
| 243 |
+
"color": {
|
| 244 |
+
"mode": "continuous-GrYlRd"
|
| 245 |
+
},
|
| 246 |
+
"unit": "s"
|
| 247 |
+
},
|
| 248 |
+
"overrides": []
|
| 249 |
+
},
|
| 250 |
+
"gridPos": {
|
| 251 |
+
"h": 8,
|
| 252 |
+
"w": 24,
|
| 253 |
+
"x": 0,
|
| 254 |
+
"y": 13
|
| 255 |
+
},
|
| 256 |
+
"id": 5,
|
| 257 |
+
"options": {
|
| 258 |
+
"displayMode": "gradient",
|
| 259 |
+
"orientation": "horizontal",
|
| 260 |
+
"reduceOptions": {
|
| 261 |
+
"calcs": [
|
| 262 |
+
"lastNotNull"
|
| 263 |
+
],
|
| 264 |
+
"fields": "",
|
| 265 |
+
"values": false
|
| 266 |
+
},
|
| 267 |
+
"showUnfilled": true
|
| 268 |
+
},
|
| 269 |
+
"targets": [
|
| 270 |
+
{
|
| 271 |
+
"datasource": {
|
| 272 |
+
"type": "prometheus",
|
| 273 |
+
"uid": "prometheus"
|
| 274 |
+
},
|
| 275 |
+
"expr": "kc_build_last_run_duration_seconds{kernel=~\"${kernel:regex}\",backend=~\"${backend:regex}\",compute_backend=~\"${compute_backend:regex}\",cuda_version=~\"${cuda_version:regex}\",pytorch_version=~\"${pytorch_version:regex}\",python_version=~\"${python_version:regex}\"}",
|
| 276 |
+
"instant": true,
|
| 277 |
+
"legendFormat": "{{kernel}} | {{backend}} | {{compute_backend}} | CUDA {{cuda_version}} | torch {{pytorch_version}} | py {{python_version}}",
|
| 278 |
+
"refId": "A"
|
| 279 |
+
}
|
| 280 |
+
],
|
| 281 |
+
"title": "Current duration distribution",
|
| 282 |
+
"type": "bargauge"
|
| 283 |
+
}
|
| 284 |
+
],
|
| 285 |
+
"refresh": "5m",
|
| 286 |
+
"schemaVersion": 39,
|
| 287 |
+
"style": "dark",
|
| 288 |
+
"tags": [
|
| 289 |
+
"kernels-community",
|
| 290 |
+
"ci",
|
| 291 |
+
"durations"
|
| 292 |
+
],
|
| 293 |
+
"templating": {
|
| 294 |
+
"list": [
|
| 295 |
+
{
|
| 296 |
+
"current": {
|
| 297 |
+
"selected": true,
|
| 298 |
+
"text": [
|
| 299 |
+
"All"
|
| 300 |
+
],
|
| 301 |
+
"value": [
|
| 302 |
+
"$__all"
|
| 303 |
+
]
|
| 304 |
+
},
|
| 305 |
+
"datasource": {
|
| 306 |
+
"type": "prometheus",
|
| 307 |
+
"uid": "prometheus"
|
| 308 |
+
},
|
| 309 |
+
"definition": "label_values(kc_build_last_run_timestamp_seconds, kernel)",
|
| 310 |
+
"includeAll": true,
|
| 311 |
+
"label": "Kernel",
|
| 312 |
+
"multi": true,
|
| 313 |
+
"name": "kernel",
|
| 314 |
+
"options": [],
|
| 315 |
+
"query": {
|
| 316 |
+
"query": "label_values(kc_build_last_run_timestamp_seconds, kernel)",
|
| 317 |
+
"refId": "PrometheusVariableQueryEditor-kernel"
|
| 318 |
+
},
|
| 319 |
+
"refresh": 1,
|
| 320 |
+
"sort": 1,
|
| 321 |
+
"type": "query"
|
| 322 |
+
},
|
| 323 |
+
{
|
| 324 |
+
"current": {
|
| 325 |
+
"selected": true,
|
| 326 |
+
"text": [
|
| 327 |
+
"All"
|
| 328 |
+
],
|
| 329 |
+
"value": [
|
| 330 |
+
"$__all"
|
| 331 |
+
]
|
| 332 |
+
},
|
| 333 |
+
"datasource": {
|
| 334 |
+
"type": "prometheus",
|
| 335 |
+
"uid": "prometheus"
|
| 336 |
+
},
|
| 337 |
+
"definition": "label_values(kc_build_last_run_timestamp_seconds, backend)",
|
| 338 |
+
"includeAll": true,
|
| 339 |
+
"label": "Backend",
|
| 340 |
+
"multi": true,
|
| 341 |
+
"name": "backend",
|
| 342 |
+
"options": [],
|
| 343 |
+
"query": {
|
| 344 |
+
"query": "label_values(kc_build_last_run_timestamp_seconds, backend)",
|
| 345 |
+
"refId": "PrometheusVariableQueryEditor-backend"
|
| 346 |
+
},
|
| 347 |
+
"refresh": 1,
|
| 348 |
+
"sort": 1,
|
| 349 |
+
"type": "query"
|
| 350 |
+
},
|
| 351 |
+
{
|
| 352 |
+
"current": {
|
| 353 |
+
"selected": true,
|
| 354 |
+
"text": [
|
| 355 |
+
"All"
|
| 356 |
+
],
|
| 357 |
+
"value": [
|
| 358 |
+
"$__all"
|
| 359 |
+
]
|
| 360 |
+
},
|
| 361 |
+
"datasource": {
|
| 362 |
+
"type": "prometheus",
|
| 363 |
+
"uid": "prometheus"
|
| 364 |
+
},
|
| 365 |
+
"definition": "label_values(kc_build_last_run_timestamp_seconds, compute_backend)",
|
| 366 |
+
"includeAll": true,
|
| 367 |
+
"label": "Compute backend",
|
| 368 |
+
"multi": true,
|
| 369 |
+
"name": "compute_backend",
|
| 370 |
+
"options": [],
|
| 371 |
+
"query": {
|
| 372 |
+
"query": "label_values(kc_build_last_run_timestamp_seconds, compute_backend)",
|
| 373 |
+
"refId": "PrometheusVariableQueryEditor-compute_backend"
|
| 374 |
+
},
|
| 375 |
+
"refresh": 1,
|
| 376 |
+
"sort": 1,
|
| 377 |
+
"type": "query"
|
| 378 |
+
},
|
| 379 |
+
{
|
| 380 |
+
"current": {
|
| 381 |
+
"selected": true,
|
| 382 |
+
"text": [
|
| 383 |
+
"All"
|
| 384 |
+
],
|
| 385 |
+
"value": [
|
| 386 |
+
"$__all"
|
| 387 |
+
]
|
| 388 |
+
},
|
| 389 |
+
"datasource": {
|
| 390 |
+
"type": "prometheus",
|
| 391 |
+
"uid": "prometheus"
|
| 392 |
+
},
|
| 393 |
+
"definition": "label_values(kc_build_last_run_timestamp_seconds, cuda_version)",
|
| 394 |
+
"includeAll": true,
|
| 395 |
+
"label": "CUDA",
|
| 396 |
+
"multi": true,
|
| 397 |
+
"name": "cuda_version",
|
| 398 |
+
"options": [],
|
| 399 |
+
"query": {
|
| 400 |
+
"query": "label_values(kc_build_last_run_timestamp_seconds, cuda_version)",
|
| 401 |
+
"refId": "PrometheusVariableQueryEditor-cuda_version"
|
| 402 |
+
},
|
| 403 |
+
"refresh": 1,
|
| 404 |
+
"sort": 1,
|
| 405 |
+
"type": "query"
|
| 406 |
+
},
|
| 407 |
+
{
|
| 408 |
+
"current": {
|
| 409 |
+
"selected": true,
|
| 410 |
+
"text": [
|
| 411 |
+
"All"
|
| 412 |
+
],
|
| 413 |
+
"value": [
|
| 414 |
+
"$__all"
|
| 415 |
+
]
|
| 416 |
+
},
|
| 417 |
+
"datasource": {
|
| 418 |
+
"type": "prometheus",
|
| 419 |
+
"uid": "prometheus"
|
| 420 |
+
},
|
| 421 |
+
"definition": "label_values(kc_build_last_run_timestamp_seconds, pytorch_version)",
|
| 422 |
+
"includeAll": true,
|
| 423 |
+
"label": "PyTorch",
|
| 424 |
+
"multi": true,
|
| 425 |
+
"name": "pytorch_version",
|
| 426 |
+
"options": [],
|
| 427 |
+
"query": {
|
| 428 |
+
"query": "label_values(kc_build_last_run_timestamp_seconds, pytorch_version)",
|
| 429 |
+
"refId": "PrometheusVariableQueryEditor-pytorch_version"
|
| 430 |
+
},
|
| 431 |
+
"refresh": 1,
|
| 432 |
+
"sort": 1,
|
| 433 |
+
"type": "query"
|
| 434 |
+
},
|
| 435 |
+
{
|
| 436 |
+
"current": {
|
| 437 |
+
"selected": true,
|
| 438 |
+
"text": [
|
| 439 |
+
"All"
|
| 440 |
+
],
|
| 441 |
+
"value": [
|
| 442 |
+
"$__all"
|
| 443 |
+
]
|
| 444 |
+
},
|
| 445 |
+
"datasource": {
|
| 446 |
+
"type": "prometheus",
|
| 447 |
+
"uid": "prometheus"
|
| 448 |
+
},
|
| 449 |
+
"definition": "label_values(kc_build_last_run_timestamp_seconds, python_version)",
|
| 450 |
+
"includeAll": true,
|
| 451 |
+
"label": "Python",
|
| 452 |
+
"multi": true,
|
| 453 |
+
"name": "python_version",
|
| 454 |
+
"options": [],
|
| 455 |
+
"query": {
|
| 456 |
+
"query": "label_values(kc_build_last_run_timestamp_seconds, python_version)",
|
| 457 |
+
"refId": "PrometheusVariableQueryEditor-python_version"
|
| 458 |
+
},
|
| 459 |
+
"refresh": 1,
|
| 460 |
+
"sort": 1,
|
| 461 |
+
"type": "query"
|
| 462 |
+
}
|
| 463 |
+
]
|
| 464 |
+
},
|
| 465 |
+
"time": {
|
| 466 |
+
"from": "now-30d",
|
| 467 |
+
"to": "now"
|
| 468 |
+
},
|
| 469 |
+
"timezone": "browser",
|
| 470 |
+
"title": "Kernels Build Duration Trends",
|
| 471 |
+
"uid": "kernels-build-durations",
|
| 472 |
+
"version": 1,
|
| 473 |
+
"weekStart": ""
|
| 474 |
+
}
|
monitoring/grafana/dashboards/build-failure-overview.json
ADDED
|
@@ -0,0 +1,500 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"annotations": {
|
| 3 |
+
"list": []
|
| 4 |
+
},
|
| 5 |
+
"editable": true,
|
| 6 |
+
"fiscalYearStartMonth": 0,
|
| 7 |
+
"graphTooltip": 1,
|
| 8 |
+
"links": [],
|
| 9 |
+
"panels": [
|
| 10 |
+
{
|
| 11 |
+
"datasource": {
|
| 12 |
+
"type": "prometheus",
|
| 13 |
+
"uid": "prometheus"
|
| 14 |
+
},
|
| 15 |
+
"fieldConfig": {
|
| 16 |
+
"defaults": {
|
| 17 |
+
"color": {
|
| 18 |
+
"mode": "thresholds"
|
| 19 |
+
},
|
| 20 |
+
"thresholds": {
|
| 21 |
+
"mode": "absolute",
|
| 22 |
+
"steps": [
|
| 23 |
+
{
|
| 24 |
+
"color": "green",
|
| 25 |
+
"value": null
|
| 26 |
+
},
|
| 27 |
+
{
|
| 28 |
+
"color": "red",
|
| 29 |
+
"value": 1
|
| 30 |
+
}
|
| 31 |
+
]
|
| 32 |
+
},
|
| 33 |
+
"unit": "none"
|
| 34 |
+
},
|
| 35 |
+
"overrides": []
|
| 36 |
+
},
|
| 37 |
+
"gridPos": {
|
| 38 |
+
"h": 5,
|
| 39 |
+
"w": 8,
|
| 40 |
+
"x": 0,
|
| 41 |
+
"y": 0
|
| 42 |
+
},
|
| 43 |
+
"id": 1,
|
| 44 |
+
"options": {
|
| 45 |
+
"colorMode": "value",
|
| 46 |
+
"graphMode": "none",
|
| 47 |
+
"justifyMode": "auto",
|
| 48 |
+
"orientation": "auto",
|
| 49 |
+
"reduceOptions": {
|
| 50 |
+
"calcs": [
|
| 51 |
+
"lastNotNull"
|
| 52 |
+
],
|
| 53 |
+
"fields": "",
|
| 54 |
+
"values": false
|
| 55 |
+
},
|
| 56 |
+
"textMode": "value"
|
| 57 |
+
},
|
| 58 |
+
"targets": [
|
| 59 |
+
{
|
| 60 |
+
"datasource": {
|
| 61 |
+
"type": "prometheus",
|
| 62 |
+
"uid": "prometheus"
|
| 63 |
+
},
|
| 64 |
+
"expr": "sum(kc_build_last_run_failed{kernel=~\"${kernel:regex}\",backend=~\"${backend:regex}\",compute_backend=~\"${compute_backend:regex}\",cuda_version=~\"${cuda_version:regex}\",pytorch_version=~\"${pytorch_version:regex}\",python_version=~\"${python_version:regex}\"})",
|
| 65 |
+
"instant": true,
|
| 66 |
+
"refId": "A"
|
| 67 |
+
}
|
| 68 |
+
],
|
| 69 |
+
"title": "Failing combos",
|
| 70 |
+
"type": "stat"
|
| 71 |
+
},
|
| 72 |
+
{
|
| 73 |
+
"datasource": {
|
| 74 |
+
"type": "prometheus",
|
| 75 |
+
"uid": "prometheus"
|
| 76 |
+
},
|
| 77 |
+
"fieldConfig": {
|
| 78 |
+
"defaults": {
|
| 79 |
+
"color": {
|
| 80 |
+
"mode": "thresholds"
|
| 81 |
+
},
|
| 82 |
+
"thresholds": {
|
| 83 |
+
"mode": "absolute",
|
| 84 |
+
"steps": [
|
| 85 |
+
{
|
| 86 |
+
"color": "green",
|
| 87 |
+
"value": null
|
| 88 |
+
}
|
| 89 |
+
]
|
| 90 |
+
},
|
| 91 |
+
"unit": "none"
|
| 92 |
+
},
|
| 93 |
+
"overrides": []
|
| 94 |
+
},
|
| 95 |
+
"gridPos": {
|
| 96 |
+
"h": 5,
|
| 97 |
+
"w": 8,
|
| 98 |
+
"x": 8,
|
| 99 |
+
"y": 0
|
| 100 |
+
},
|
| 101 |
+
"id": 2,
|
| 102 |
+
"options": {
|
| 103 |
+
"colorMode": "value",
|
| 104 |
+
"graphMode": "none",
|
| 105 |
+
"justifyMode": "auto",
|
| 106 |
+
"orientation": "auto",
|
| 107 |
+
"reduceOptions": {
|
| 108 |
+
"calcs": [
|
| 109 |
+
"lastNotNull"
|
| 110 |
+
],
|
| 111 |
+
"fields": "",
|
| 112 |
+
"values": false
|
| 113 |
+
},
|
| 114 |
+
"textMode": "value"
|
| 115 |
+
},
|
| 116 |
+
"targets": [
|
| 117 |
+
{
|
| 118 |
+
"datasource": {
|
| 119 |
+
"type": "prometheus",
|
| 120 |
+
"uid": "prometheus"
|
| 121 |
+
},
|
| 122 |
+
"expr": "count(kc_build_last_run_failed{kernel=~\"${kernel:regex}\",backend=~\"${backend:regex}\",compute_backend=~\"${compute_backend:regex}\",cuda_version=~\"${cuda_version:regex}\",pytorch_version=~\"${pytorch_version:regex}\",python_version=~\"${python_version:regex}\"} == 1)",
|
| 123 |
+
"instant": true,
|
| 124 |
+
"refId": "A"
|
| 125 |
+
}
|
| 126 |
+
],
|
| 127 |
+
"title": "Alerting series",
|
| 128 |
+
"type": "stat"
|
| 129 |
+
},
|
| 130 |
+
{
|
| 131 |
+
"datasource": {
|
| 132 |
+
"type": "prometheus",
|
| 133 |
+
"uid": "prometheus"
|
| 134 |
+
},
|
| 135 |
+
"fieldConfig": {
|
| 136 |
+
"defaults": {
|
| 137 |
+
"color": {
|
| 138 |
+
"mode": "thresholds"
|
| 139 |
+
},
|
| 140 |
+
"thresholds": {
|
| 141 |
+
"mode": "absolute",
|
| 142 |
+
"steps": [
|
| 143 |
+
{
|
| 144 |
+
"color": "green",
|
| 145 |
+
"value": null
|
| 146 |
+
},
|
| 147 |
+
{
|
| 148 |
+
"color": "orange",
|
| 149 |
+
"value": 6
|
| 150 |
+
},
|
| 151 |
+
{
|
| 152 |
+
"color": "red",
|
| 153 |
+
"value": 24
|
| 154 |
+
}
|
| 155 |
+
]
|
| 156 |
+
},
|
| 157 |
+
"unit": "h"
|
| 158 |
+
},
|
| 159 |
+
"overrides": []
|
| 160 |
+
},
|
| 161 |
+
"gridPos": {
|
| 162 |
+
"h": 5,
|
| 163 |
+
"w": 8,
|
| 164 |
+
"x": 16,
|
| 165 |
+
"y": 0
|
| 166 |
+
},
|
| 167 |
+
"id": 3,
|
| 168 |
+
"options": {
|
| 169 |
+
"colorMode": "value",
|
| 170 |
+
"graphMode": "none",
|
| 171 |
+
"justifyMode": "auto",
|
| 172 |
+
"orientation": "auto",
|
| 173 |
+
"reduceOptions": {
|
| 174 |
+
"calcs": [
|
| 175 |
+
"lastNotNull"
|
| 176 |
+
],
|
| 177 |
+
"fields": "",
|
| 178 |
+
"values": false
|
| 179 |
+
},
|
| 180 |
+
"textMode": "value"
|
| 181 |
+
},
|
| 182 |
+
"targets": [
|
| 183 |
+
{
|
| 184 |
+
"datasource": {
|
| 185 |
+
"type": "prometheus",
|
| 186 |
+
"uid": "prometheus"
|
| 187 |
+
},
|
| 188 |
+
"expr": "max((time() - kc_build_last_run_timestamp_seconds{kernel=~\"${kernel:regex}\",backend=~\"${backend:regex}\",compute_backend=~\"${compute_backend:regex}\",cuda_version=~\"${cuda_version:regex}\",pytorch_version=~\"${pytorch_version:regex}\",python_version=~\"${python_version:regex}\"}) / 3600)",
|
| 189 |
+
"instant": true,
|
| 190 |
+
"refId": "A"
|
| 191 |
+
}
|
| 192 |
+
],
|
| 193 |
+
"title": "Oldest sample age",
|
| 194 |
+
"type": "stat"
|
| 195 |
+
},
|
| 196 |
+
{
|
| 197 |
+
"datasource": {
|
| 198 |
+
"type": "prometheus",
|
| 199 |
+
"uid": "prometheus"
|
| 200 |
+
},
|
| 201 |
+
"fieldConfig": {
|
| 202 |
+
"defaults": {
|
| 203 |
+
"color": {
|
| 204 |
+
"mode": "palette-classic"
|
| 205 |
+
},
|
| 206 |
+
"custom": {
|
| 207 |
+
"axisBorderShow": false,
|
| 208 |
+
"axisCenteredZero": false,
|
| 209 |
+
"drawStyle": "line",
|
| 210 |
+
"fillOpacity": 18,
|
| 211 |
+
"lineInterpolation": "stepAfter",
|
| 212 |
+
"lineWidth": 2,
|
| 213 |
+
"pointSize": 4,
|
| 214 |
+
"showPoints": "never",
|
| 215 |
+
"spanNulls": true
|
| 216 |
+
},
|
| 217 |
+
"max": 1,
|
| 218 |
+
"min": 0,
|
| 219 |
+
"unit": "none"
|
| 220 |
+
},
|
| 221 |
+
"overrides": []
|
| 222 |
+
},
|
| 223 |
+
"gridPos": {
|
| 224 |
+
"h": 8,
|
| 225 |
+
"w": 24,
|
| 226 |
+
"x": 0,
|
| 227 |
+
"y": 5
|
| 228 |
+
},
|
| 229 |
+
"id": 4,
|
| 230 |
+
"options": {
|
| 231 |
+
"legend": {
|
| 232 |
+
"displayMode": "table",
|
| 233 |
+
"placement": "bottom"
|
| 234 |
+
},
|
| 235 |
+
"tooltip": {
|
| 236 |
+
"mode": "multi"
|
| 237 |
+
}
|
| 238 |
+
},
|
| 239 |
+
"targets": [
|
| 240 |
+
{
|
| 241 |
+
"datasource": {
|
| 242 |
+
"type": "prometheus",
|
| 243 |
+
"uid": "prometheus"
|
| 244 |
+
},
|
| 245 |
+
"expr": "kc_build_last_run_failed{kernel=~\"${kernel:regex}\",backend=~\"${backend:regex}\",compute_backend=~\"${compute_backend:regex}\",cuda_version=~\"${cuda_version:regex}\",pytorch_version=~\"${pytorch_version:regex}\",python_version=~\"${python_version:regex}\"}",
|
| 246 |
+
"legendFormat": "{{kernel}} | {{backend}} | {{compute_backend}} | CUDA {{cuda_version}} | torch {{pytorch_version}} | py {{python_version}}",
|
| 247 |
+
"refId": "A"
|
| 248 |
+
}
|
| 249 |
+
],
|
| 250 |
+
"title": "Failure state by combo",
|
| 251 |
+
"type": "timeseries"
|
| 252 |
+
},
|
| 253 |
+
{
|
| 254 |
+
"datasource": {
|
| 255 |
+
"type": "prometheus",
|
| 256 |
+
"uid": "prometheus"
|
| 257 |
+
},
|
| 258 |
+
"fieldConfig": {
|
| 259 |
+
"defaults": {
|
| 260 |
+
"color": {
|
| 261 |
+
"mode": "palette-classic"
|
| 262 |
+
},
|
| 263 |
+
"custom": {
|
| 264 |
+
"axisBorderShow": false,
|
| 265 |
+
"axisCenteredZero": false,
|
| 266 |
+
"drawStyle": "line",
|
| 267 |
+
"fillOpacity": 20,
|
| 268 |
+
"lineInterpolation": "stepAfter",
|
| 269 |
+
"lineWidth": 2,
|
| 270 |
+
"pointSize": 4,
|
| 271 |
+
"showPoints": "never",
|
| 272 |
+
"spanNulls": true
|
| 273 |
+
},
|
| 274 |
+
"max": 3,
|
| 275 |
+
"min": 0,
|
| 276 |
+
"unit": "none"
|
| 277 |
+
},
|
| 278 |
+
"overrides": []
|
| 279 |
+
},
|
| 280 |
+
"gridPos": {
|
| 281 |
+
"h": 8,
|
| 282 |
+
"w": 24,
|
| 283 |
+
"x": 0,
|
| 284 |
+
"y": 13
|
| 285 |
+
},
|
| 286 |
+
"id": 5,
|
| 287 |
+
"options": {
|
| 288 |
+
"legend": {
|
| 289 |
+
"displayMode": "table",
|
| 290 |
+
"placement": "bottom"
|
| 291 |
+
},
|
| 292 |
+
"tooltip": {
|
| 293 |
+
"mode": "multi"
|
| 294 |
+
}
|
| 295 |
+
},
|
| 296 |
+
"targets": [
|
| 297 |
+
{
|
| 298 |
+
"datasource": {
|
| 299 |
+
"type": "prometheus",
|
| 300 |
+
"uid": "prometheus"
|
| 301 |
+
},
|
| 302 |
+
"expr": "kc_build_last_run_result_code{kernel=~\"${kernel:regex}\",backend=~\"${backend:regex}\",compute_backend=~\"${compute_backend:regex}\",cuda_version=~\"${cuda_version:regex}\",pytorch_version=~\"${pytorch_version:regex}\",python_version=~\"${python_version:regex}\"}",
|
| 303 |
+
"legendFormat": "{{kernel}} | {{backend}} | {{compute_backend}} | CUDA {{cuda_version}} | torch {{pytorch_version}} | py {{python_version}}",
|
| 304 |
+
"refId": "A"
|
| 305 |
+
}
|
| 306 |
+
],
|
| 307 |
+
"title": "Result code over time",
|
| 308 |
+
"type": "timeseries"
|
| 309 |
+
}
|
| 310 |
+
],
|
| 311 |
+
"refresh": "5m",
|
| 312 |
+
"schemaVersion": 39,
|
| 313 |
+
"style": "dark",
|
| 314 |
+
"tags": [
|
| 315 |
+
"kernels-community",
|
| 316 |
+
"ci",
|
| 317 |
+
"failures"
|
| 318 |
+
],
|
| 319 |
+
"templating": {
|
| 320 |
+
"list": [
|
| 321 |
+
{
|
| 322 |
+
"current": {
|
| 323 |
+
"selected": true,
|
| 324 |
+
"text": [
|
| 325 |
+
"All"
|
| 326 |
+
],
|
| 327 |
+
"value": [
|
| 328 |
+
"$__all"
|
| 329 |
+
]
|
| 330 |
+
},
|
| 331 |
+
"datasource": {
|
| 332 |
+
"type": "prometheus",
|
| 333 |
+
"uid": "prometheus"
|
| 334 |
+
},
|
| 335 |
+
"definition": "label_values(kc_build_last_run_timestamp_seconds, kernel)",
|
| 336 |
+
"includeAll": true,
|
| 337 |
+
"label": "Kernel",
|
| 338 |
+
"multi": true,
|
| 339 |
+
"name": "kernel",
|
| 340 |
+
"options": [],
|
| 341 |
+
"query": {
|
| 342 |
+
"query": "label_values(kc_build_last_run_timestamp_seconds, kernel)",
|
| 343 |
+
"refId": "PrometheusVariableQueryEditor-kernel"
|
| 344 |
+
},
|
| 345 |
+
"refresh": 1,
|
| 346 |
+
"sort": 1,
|
| 347 |
+
"type": "query"
|
| 348 |
+
},
|
| 349 |
+
{
|
| 350 |
+
"current": {
|
| 351 |
+
"selected": true,
|
| 352 |
+
"text": [
|
| 353 |
+
"All"
|
| 354 |
+
],
|
| 355 |
+
"value": [
|
| 356 |
+
"$__all"
|
| 357 |
+
]
|
| 358 |
+
},
|
| 359 |
+
"datasource": {
|
| 360 |
+
"type": "prometheus",
|
| 361 |
+
"uid": "prometheus"
|
| 362 |
+
},
|
| 363 |
+
"definition": "label_values(kc_build_last_run_timestamp_seconds, backend)",
|
| 364 |
+
"includeAll": true,
|
| 365 |
+
"label": "Backend",
|
| 366 |
+
"multi": true,
|
| 367 |
+
"name": "backend",
|
| 368 |
+
"options": [],
|
| 369 |
+
"query": {
|
| 370 |
+
"query": "label_values(kc_build_last_run_timestamp_seconds, backend)",
|
| 371 |
+
"refId": "PrometheusVariableQueryEditor-backend"
|
| 372 |
+
},
|
| 373 |
+
"refresh": 1,
|
| 374 |
+
"sort": 1,
|
| 375 |
+
"type": "query"
|
| 376 |
+
},
|
| 377 |
+
{
|
| 378 |
+
"current": {
|
| 379 |
+
"selected": true,
|
| 380 |
+
"text": [
|
| 381 |
+
"All"
|
| 382 |
+
],
|
| 383 |
+
"value": [
|
| 384 |
+
"$__all"
|
| 385 |
+
]
|
| 386 |
+
},
|
| 387 |
+
"datasource": {
|
| 388 |
+
"type": "prometheus",
|
| 389 |
+
"uid": "prometheus"
|
| 390 |
+
},
|
| 391 |
+
"definition": "label_values(kc_build_last_run_timestamp_seconds, compute_backend)",
|
| 392 |
+
"includeAll": true,
|
| 393 |
+
"label": "Compute backend",
|
| 394 |
+
"multi": true,
|
| 395 |
+
"name": "compute_backend",
|
| 396 |
+
"options": [],
|
| 397 |
+
"query": {
|
| 398 |
+
"query": "label_values(kc_build_last_run_timestamp_seconds, compute_backend)",
|
| 399 |
+
"refId": "PrometheusVariableQueryEditor-compute_backend"
|
| 400 |
+
},
|
| 401 |
+
"refresh": 1,
|
| 402 |
+
"sort": 1,
|
| 403 |
+
"type": "query"
|
| 404 |
+
},
|
| 405 |
+
{
|
| 406 |
+
"current": {
|
| 407 |
+
"selected": true,
|
| 408 |
+
"text": [
|
| 409 |
+
"All"
|
| 410 |
+
],
|
| 411 |
+
"value": [
|
| 412 |
+
"$__all"
|
| 413 |
+
]
|
| 414 |
+
},
|
| 415 |
+
"datasource": {
|
| 416 |
+
"type": "prometheus",
|
| 417 |
+
"uid": "prometheus"
|
| 418 |
+
},
|
| 419 |
+
"definition": "label_values(kc_build_last_run_timestamp_seconds, cuda_version)",
|
| 420 |
+
"includeAll": true,
|
| 421 |
+
"label": "CUDA",
|
| 422 |
+
"multi": true,
|
| 423 |
+
"name": "cuda_version",
|
| 424 |
+
"options": [],
|
| 425 |
+
"query": {
|
| 426 |
+
"query": "label_values(kc_build_last_run_timestamp_seconds, cuda_version)",
|
| 427 |
+
"refId": "PrometheusVariableQueryEditor-cuda_version"
|
| 428 |
+
},
|
| 429 |
+
"refresh": 1,
|
| 430 |
+
"sort": 1,
|
| 431 |
+
"type": "query"
|
| 432 |
+
},
|
| 433 |
+
{
|
| 434 |
+
"current": {
|
| 435 |
+
"selected": true,
|
| 436 |
+
"text": [
|
| 437 |
+
"All"
|
| 438 |
+
],
|
| 439 |
+
"value": [
|
| 440 |
+
"$__all"
|
| 441 |
+
]
|
| 442 |
+
},
|
| 443 |
+
"datasource": {
|
| 444 |
+
"type": "prometheus",
|
| 445 |
+
"uid": "prometheus"
|
| 446 |
+
},
|
| 447 |
+
"definition": "label_values(kc_build_last_run_timestamp_seconds, pytorch_version)",
|
| 448 |
+
"includeAll": true,
|
| 449 |
+
"label": "PyTorch",
|
| 450 |
+
"multi": true,
|
| 451 |
+
"name": "pytorch_version",
|
| 452 |
+
"options": [],
|
| 453 |
+
"query": {
|
| 454 |
+
"query": "label_values(kc_build_last_run_timestamp_seconds, pytorch_version)",
|
| 455 |
+
"refId": "PrometheusVariableQueryEditor-pytorch_version"
|
| 456 |
+
},
|
| 457 |
+
"refresh": 1,
|
| 458 |
+
"sort": 1,
|
| 459 |
+
"type": "query"
|
| 460 |
+
},
|
| 461 |
+
{
|
| 462 |
+
"current": {
|
| 463 |
+
"selected": true,
|
| 464 |
+
"text": [
|
| 465 |
+
"All"
|
| 466 |
+
],
|
| 467 |
+
"value": [
|
| 468 |
+
"$__all"
|
| 469 |
+
]
|
| 470 |
+
},
|
| 471 |
+
"datasource": {
|
| 472 |
+
"type": "prometheus",
|
| 473 |
+
"uid": "prometheus"
|
| 474 |
+
},
|
| 475 |
+
"definition": "label_values(kc_build_last_run_timestamp_seconds, python_version)",
|
| 476 |
+
"includeAll": true,
|
| 477 |
+
"label": "Python",
|
| 478 |
+
"multi": true,
|
| 479 |
+
"name": "python_version",
|
| 480 |
+
"options": [],
|
| 481 |
+
"query": {
|
| 482 |
+
"query": "label_values(kc_build_last_run_timestamp_seconds, python_version)",
|
| 483 |
+
"refId": "PrometheusVariableQueryEditor-python_version"
|
| 484 |
+
},
|
| 485 |
+
"refresh": 1,
|
| 486 |
+
"sort": 1,
|
| 487 |
+
"type": "query"
|
| 488 |
+
}
|
| 489 |
+
]
|
| 490 |
+
},
|
| 491 |
+
"time": {
|
| 492 |
+
"from": "now-30d",
|
| 493 |
+
"to": "now"
|
| 494 |
+
},
|
| 495 |
+
"timezone": "browser",
|
| 496 |
+
"title": "Kernels Build Failure Overview",
|
| 497 |
+
"uid": "kernels-build-failures",
|
| 498 |
+
"version": 1,
|
| 499 |
+
"weekStart": ""
|
| 500 |
+
}
|
monitoring/grafana/dashboards/build-matrix-overview.json
ADDED
|
@@ -0,0 +1,593 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"annotations": {
|
| 3 |
+
"list": []
|
| 4 |
+
},
|
| 5 |
+
"editable": true,
|
| 6 |
+
"fiscalYearStartMonth": 0,
|
| 7 |
+
"graphTooltip": 1,
|
| 8 |
+
"links": [],
|
| 9 |
+
"panels": [
|
| 10 |
+
{
|
| 11 |
+
"datasource": {
|
| 12 |
+
"type": "prometheus",
|
| 13 |
+
"uid": "prometheus"
|
| 14 |
+
},
|
| 15 |
+
"fieldConfig": {
|
| 16 |
+
"defaults": {
|
| 17 |
+
"color": {
|
| 18 |
+
"mode": "thresholds"
|
| 19 |
+
},
|
| 20 |
+
"thresholds": {
|
| 21 |
+
"mode": "absolute",
|
| 22 |
+
"steps": [
|
| 23 |
+
{
|
| 24 |
+
"color": "green",
|
| 25 |
+
"value": null
|
| 26 |
+
}
|
| 27 |
+
]
|
| 28 |
+
},
|
| 29 |
+
"unit": "none"
|
| 30 |
+
},
|
| 31 |
+
"overrides": []
|
| 32 |
+
},
|
| 33 |
+
"gridPos": {
|
| 34 |
+
"h": 5,
|
| 35 |
+
"w": 6,
|
| 36 |
+
"x": 0,
|
| 37 |
+
"y": 0
|
| 38 |
+
},
|
| 39 |
+
"id": 1,
|
| 40 |
+
"options": {
|
| 41 |
+
"colorMode": "value",
|
| 42 |
+
"graphMode": "none",
|
| 43 |
+
"justifyMode": "auto",
|
| 44 |
+
"orientation": "auto",
|
| 45 |
+
"reduceOptions": {
|
| 46 |
+
"calcs": [
|
| 47 |
+
"lastNotNull"
|
| 48 |
+
],
|
| 49 |
+
"fields": "",
|
| 50 |
+
"values": false
|
| 51 |
+
},
|
| 52 |
+
"textMode": "value"
|
| 53 |
+
},
|
| 54 |
+
"targets": [
|
| 55 |
+
{
|
| 56 |
+
"datasource": {
|
| 57 |
+
"type": "prometheus",
|
| 58 |
+
"uid": "prometheus"
|
| 59 |
+
},
|
| 60 |
+
"expr": "count(kc_build_last_run_timestamp_seconds{kernel=~\"${kernel:regex}\",backend=~\"${backend:regex}\",compute_backend=~\"${compute_backend:regex}\",cuda_version=~\"${cuda_version:regex}\",pytorch_version=~\"${pytorch_version:regex}\",python_version=~\"${python_version:regex}\"})",
|
| 61 |
+
"instant": true,
|
| 62 |
+
"refId": "A"
|
| 63 |
+
}
|
| 64 |
+
],
|
| 65 |
+
"title": "Tracked combos",
|
| 66 |
+
"type": "stat"
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"datasource": {
|
| 70 |
+
"type": "prometheus",
|
| 71 |
+
"uid": "prometheus"
|
| 72 |
+
},
|
| 73 |
+
"fieldConfig": {
|
| 74 |
+
"defaults": {
|
| 75 |
+
"color": {
|
| 76 |
+
"mode": "thresholds"
|
| 77 |
+
},
|
| 78 |
+
"thresholds": {
|
| 79 |
+
"mode": "absolute",
|
| 80 |
+
"steps": [
|
| 81 |
+
{
|
| 82 |
+
"color": "green",
|
| 83 |
+
"value": null
|
| 84 |
+
},
|
| 85 |
+
{
|
| 86 |
+
"color": "red",
|
| 87 |
+
"value": 1
|
| 88 |
+
}
|
| 89 |
+
]
|
| 90 |
+
},
|
| 91 |
+
"unit": "none"
|
| 92 |
+
},
|
| 93 |
+
"overrides": []
|
| 94 |
+
},
|
| 95 |
+
"gridPos": {
|
| 96 |
+
"h": 5,
|
| 97 |
+
"w": 6,
|
| 98 |
+
"x": 6,
|
| 99 |
+
"y": 0
|
| 100 |
+
},
|
| 101 |
+
"id": 2,
|
| 102 |
+
"options": {
|
| 103 |
+
"colorMode": "value",
|
| 104 |
+
"graphMode": "none",
|
| 105 |
+
"justifyMode": "auto",
|
| 106 |
+
"orientation": "auto",
|
| 107 |
+
"reduceOptions": {
|
| 108 |
+
"calcs": [
|
| 109 |
+
"lastNotNull"
|
| 110 |
+
],
|
| 111 |
+
"fields": "",
|
| 112 |
+
"values": false
|
| 113 |
+
},
|
| 114 |
+
"textMode": "value"
|
| 115 |
+
},
|
| 116 |
+
"targets": [
|
| 117 |
+
{
|
| 118 |
+
"datasource": {
|
| 119 |
+
"type": "prometheus",
|
| 120 |
+
"uid": "prometheus"
|
| 121 |
+
},
|
| 122 |
+
"expr": "sum(kc_build_last_run_failed{kernel=~\"${kernel:regex}\",backend=~\"${backend:regex}\",compute_backend=~\"${compute_backend:regex}\",cuda_version=~\"${cuda_version:regex}\",pytorch_version=~\"${pytorch_version:regex}\",python_version=~\"${python_version:regex}\"})",
|
| 123 |
+
"instant": true,
|
| 124 |
+
"refId": "A"
|
| 125 |
+
}
|
| 126 |
+
],
|
| 127 |
+
"title": "Failing combos",
|
| 128 |
+
"type": "stat"
|
| 129 |
+
},
|
| 130 |
+
{
|
| 131 |
+
"datasource": {
|
| 132 |
+
"type": "prometheus",
|
| 133 |
+
"uid": "prometheus"
|
| 134 |
+
},
|
| 135 |
+
"fieldConfig": {
|
| 136 |
+
"defaults": {
|
| 137 |
+
"color": {
|
| 138 |
+
"mode": "thresholds"
|
| 139 |
+
},
|
| 140 |
+
"thresholds": {
|
| 141 |
+
"mode": "absolute",
|
| 142 |
+
"steps": [
|
| 143 |
+
{
|
| 144 |
+
"color": "green",
|
| 145 |
+
"value": null
|
| 146 |
+
}
|
| 147 |
+
]
|
| 148 |
+
},
|
| 149 |
+
"unit": "none"
|
| 150 |
+
},
|
| 151 |
+
"overrides": []
|
| 152 |
+
},
|
| 153 |
+
"gridPos": {
|
| 154 |
+
"h": 5,
|
| 155 |
+
"w": 6,
|
| 156 |
+
"x": 12,
|
| 157 |
+
"y": 0
|
| 158 |
+
},
|
| 159 |
+
"id": 3,
|
| 160 |
+
"options": {
|
| 161 |
+
"colorMode": "value",
|
| 162 |
+
"graphMode": "none",
|
| 163 |
+
"justifyMode": "auto",
|
| 164 |
+
"orientation": "auto",
|
| 165 |
+
"reduceOptions": {
|
| 166 |
+
"calcs": [
|
| 167 |
+
"lastNotNull"
|
| 168 |
+
],
|
| 169 |
+
"fields": "",
|
| 170 |
+
"values": false
|
| 171 |
+
},
|
| 172 |
+
"textMode": "value"
|
| 173 |
+
},
|
| 174 |
+
"targets": [
|
| 175 |
+
{
|
| 176 |
+
"datasource": {
|
| 177 |
+
"type": "prometheus",
|
| 178 |
+
"uid": "prometheus"
|
| 179 |
+
},
|
| 180 |
+
"expr": "count(kc_build_last_run_timestamp_seconds{kernel=~\"${kernel:regex}\",backend=~\"${backend:regex}\",compute_backend=~\"${compute_backend:regex}\",cuda_version=~\"${cuda_version:regex}\",pytorch_version=~\"${pytorch_version:regex}\",python_version=~\"${python_version:regex}\"}) - sum(kc_build_last_run_failed{kernel=~\"${kernel:regex}\",backend=~\"${backend:regex}\",compute_backend=~\"${compute_backend:regex}\",cuda_version=~\"${cuda_version:regex}\",pytorch_version=~\"${pytorch_version:regex}\",python_version=~\"${python_version:regex}\"})",
|
| 181 |
+
"instant": true,
|
| 182 |
+
"refId": "A"
|
| 183 |
+
}
|
| 184 |
+
],
|
| 185 |
+
"title": "Healthy combos",
|
| 186 |
+
"type": "stat"
|
| 187 |
+
},
|
| 188 |
+
{
|
| 189 |
+
"datasource": {
|
| 190 |
+
"type": "prometheus",
|
| 191 |
+
"uid": "prometheus"
|
| 192 |
+
},
|
| 193 |
+
"fieldConfig": {
|
| 194 |
+
"defaults": {
|
| 195 |
+
"color": {
|
| 196 |
+
"mode": "thresholds"
|
| 197 |
+
},
|
| 198 |
+
"thresholds": {
|
| 199 |
+
"mode": "absolute",
|
| 200 |
+
"steps": [
|
| 201 |
+
{
|
| 202 |
+
"color": "green",
|
| 203 |
+
"value": null
|
| 204 |
+
},
|
| 205 |
+
{
|
| 206 |
+
"color": "orange",
|
| 207 |
+
"value": 6
|
| 208 |
+
},
|
| 209 |
+
{
|
| 210 |
+
"color": "red",
|
| 211 |
+
"value": 24
|
| 212 |
+
}
|
| 213 |
+
]
|
| 214 |
+
},
|
| 215 |
+
"unit": "h"
|
| 216 |
+
},
|
| 217 |
+
"overrides": []
|
| 218 |
+
},
|
| 219 |
+
"gridPos": {
|
| 220 |
+
"h": 5,
|
| 221 |
+
"w": 6,
|
| 222 |
+
"x": 18,
|
| 223 |
+
"y": 0
|
| 224 |
+
},
|
| 225 |
+
"id": 4,
|
| 226 |
+
"options": {
|
| 227 |
+
"colorMode": "value",
|
| 228 |
+
"graphMode": "none",
|
| 229 |
+
"justifyMode": "auto",
|
| 230 |
+
"orientation": "auto",
|
| 231 |
+
"reduceOptions": {
|
| 232 |
+
"calcs": [
|
| 233 |
+
"lastNotNull"
|
| 234 |
+
],
|
| 235 |
+
"fields": "",
|
| 236 |
+
"values": false
|
| 237 |
+
},
|
| 238 |
+
"textMode": "value"
|
| 239 |
+
},
|
| 240 |
+
"targets": [
|
| 241 |
+
{
|
| 242 |
+
"datasource": {
|
| 243 |
+
"type": "prometheus",
|
| 244 |
+
"uid": "prometheus"
|
| 245 |
+
},
|
| 246 |
+
"expr": "max((time() - kc_build_last_run_timestamp_seconds{kernel=~\"${kernel:regex}\",backend=~\"${backend:regex}\",compute_backend=~\"${compute_backend:regex}\",cuda_version=~\"${cuda_version:regex}\",pytorch_version=~\"${pytorch_version:regex}\",python_version=~\"${python_version:regex}\"}) / 3600)",
|
| 247 |
+
"instant": true,
|
| 248 |
+
"refId": "A"
|
| 249 |
+
}
|
| 250 |
+
],
|
| 251 |
+
"title": "Oldest metric age",
|
| 252 |
+
"type": "stat"
|
| 253 |
+
},
|
| 254 |
+
{
|
| 255 |
+
"datasource": {
|
| 256 |
+
"type": "prometheus",
|
| 257 |
+
"uid": "prometheus"
|
| 258 |
+
},
|
| 259 |
+
"fieldConfig": {
|
| 260 |
+
"defaults": {
|
| 261 |
+
"color": {
|
| 262 |
+
"mode": "continuous-GrYlRd"
|
| 263 |
+
},
|
| 264 |
+
"unit": "s"
|
| 265 |
+
},
|
| 266 |
+
"overrides": []
|
| 267 |
+
},
|
| 268 |
+
"gridPos": {
|
| 269 |
+
"h": 8,
|
| 270 |
+
"w": 8,
|
| 271 |
+
"x": 0,
|
| 272 |
+
"y": 5
|
| 273 |
+
},
|
| 274 |
+
"id": 5,
|
| 275 |
+
"options": {
|
| 276 |
+
"displayMode": "gradient",
|
| 277 |
+
"orientation": "horizontal",
|
| 278 |
+
"reduceOptions": {
|
| 279 |
+
"calcs": [
|
| 280 |
+
"lastNotNull"
|
| 281 |
+
],
|
| 282 |
+
"fields": "",
|
| 283 |
+
"values": false
|
| 284 |
+
},
|
| 285 |
+
"showUnfilled": true
|
| 286 |
+
},
|
| 287 |
+
"targets": [
|
| 288 |
+
{
|
| 289 |
+
"datasource": {
|
| 290 |
+
"type": "prometheus",
|
| 291 |
+
"uid": "prometheus"
|
| 292 |
+
},
|
| 293 |
+
"expr": "kc_build_last_run_duration_seconds{kernel=~\"${kernel:regex}\",backend=~\"${backend:regex}\",compute_backend=~\"${compute_backend:regex}\",cuda_version=~\"${cuda_version:regex}\",pytorch_version=~\"${pytorch_version:regex}\",python_version=~\"${python_version:regex}\"}",
|
| 294 |
+
"instant": true,
|
| 295 |
+
"legendFormat": "{{kernel}} | {{backend}} | {{compute_backend}} | CUDA {{cuda_version}} | torch {{pytorch_version}} | py {{python_version}}",
|
| 296 |
+
"refId": "A"
|
| 297 |
+
}
|
| 298 |
+
],
|
| 299 |
+
"title": "Current duration by combo",
|
| 300 |
+
"type": "bargauge"
|
| 301 |
+
},
|
| 302 |
+
{
|
| 303 |
+
"datasource": {
|
| 304 |
+
"type": "prometheus",
|
| 305 |
+
"uid": "prometheus"
|
| 306 |
+
},
|
| 307 |
+
"fieldConfig": {
|
| 308 |
+
"defaults": {
|
| 309 |
+
"color": {
|
| 310 |
+
"mode": "palette-classic"
|
| 311 |
+
},
|
| 312 |
+
"custom": {
|
| 313 |
+
"axisBorderShow": false,
|
| 314 |
+
"axisCenteredZero": false,
|
| 315 |
+
"drawStyle": "line",
|
| 316 |
+
"fillOpacity": 20,
|
| 317 |
+
"lineInterpolation": "stepAfter",
|
| 318 |
+
"lineWidth": 2,
|
| 319 |
+
"pointSize": 4,
|
| 320 |
+
"showPoints": "never",
|
| 321 |
+
"spanNulls": true
|
| 322 |
+
},
|
| 323 |
+
"max": 3,
|
| 324 |
+
"min": 0,
|
| 325 |
+
"unit": "none"
|
| 326 |
+
},
|
| 327 |
+
"overrides": []
|
| 328 |
+
},
|
| 329 |
+
"gridPos": {
|
| 330 |
+
"h": 8,
|
| 331 |
+
"w": 16,
|
| 332 |
+
"x": 8,
|
| 333 |
+
"y": 5
|
| 334 |
+
},
|
| 335 |
+
"id": 6,
|
| 336 |
+
"options": {
|
| 337 |
+
"legend": {
|
| 338 |
+
"displayMode": "list",
|
| 339 |
+
"placement": "bottom"
|
| 340 |
+
},
|
| 341 |
+
"tooltip": {
|
| 342 |
+
"mode": "multi"
|
| 343 |
+
}
|
| 344 |
+
},
|
| 345 |
+
"targets": [
|
| 346 |
+
{
|
| 347 |
+
"datasource": {
|
| 348 |
+
"type": "prometheus",
|
| 349 |
+
"uid": "prometheus"
|
| 350 |
+
},
|
| 351 |
+
"expr": "kc_build_last_run_result_code{kernel=~\"${kernel:regex}\",backend=~\"${backend:regex}\",compute_backend=~\"${compute_backend:regex}\",cuda_version=~\"${cuda_version:regex}\",pytorch_version=~\"${pytorch_version:regex}\",python_version=~\"${python_version:regex}\"}",
|
| 352 |
+
"legendFormat": "{{kernel}} | {{backend}} | {{compute_backend}} | CUDA {{cuda_version}} | torch {{pytorch_version}} | py {{python_version}}",
|
| 353 |
+
"refId": "A"
|
| 354 |
+
}
|
| 355 |
+
],
|
| 356 |
+
"title": "Latest result code over time",
|
| 357 |
+
"type": "timeseries"
|
| 358 |
+
},
|
| 359 |
+
{
|
| 360 |
+
"datasource": {
|
| 361 |
+
"type": "prometheus",
|
| 362 |
+
"uid": "prometheus"
|
| 363 |
+
},
|
| 364 |
+
"fieldConfig": {
|
| 365 |
+
"defaults": {
|
| 366 |
+
"color": {
|
| 367 |
+
"mode": "continuous-BlYlRd"
|
| 368 |
+
},
|
| 369 |
+
"unit": "s"
|
| 370 |
+
},
|
| 371 |
+
"overrides": []
|
| 372 |
+
},
|
| 373 |
+
"gridPos": {
|
| 374 |
+
"h": 8,
|
| 375 |
+
"w": 24,
|
| 376 |
+
"x": 0,
|
| 377 |
+
"y": 13
|
| 378 |
+
},
|
| 379 |
+
"id": 7,
|
| 380 |
+
"options": {
|
| 381 |
+
"legend": {
|
| 382 |
+
"displayMode": "list",
|
| 383 |
+
"placement": "bottom"
|
| 384 |
+
},
|
| 385 |
+
"tooltip": {
|
| 386 |
+
"mode": "multi"
|
| 387 |
+
}
|
| 388 |
+
},
|
| 389 |
+
"targets": [
|
| 390 |
+
{
|
| 391 |
+
"datasource": {
|
| 392 |
+
"type": "prometheus",
|
| 393 |
+
"uid": "prometheus"
|
| 394 |
+
},
|
| 395 |
+
"expr": "kc_build_last_run_duration_seconds{kernel=~\"${kernel:regex}\",backend=~\"${backend:regex}\",compute_backend=~\"${compute_backend:regex}\",cuda_version=~\"${cuda_version:regex}\",pytorch_version=~\"${pytorch_version:regex}\",python_version=~\"${python_version:regex}\"}",
|
| 396 |
+
"legendFormat": "{{kernel}} | {{backend}} | {{compute_backend}} | CUDA {{cuda_version}} | torch {{pytorch_version}} | py {{python_version}}",
|
| 397 |
+
"refId": "A"
|
| 398 |
+
}
|
| 399 |
+
],
|
| 400 |
+
"title": "Duration history",
|
| 401 |
+
"type": "timeseries"
|
| 402 |
+
}
|
| 403 |
+
],
|
| 404 |
+
"refresh": "5m",
|
| 405 |
+
"schemaVersion": 39,
|
| 406 |
+
"style": "dark",
|
| 407 |
+
"tags": [
|
| 408 |
+
"kernels-community",
|
| 409 |
+
"ci",
|
| 410 |
+
"matrix"
|
| 411 |
+
],
|
| 412 |
+
"templating": {
|
| 413 |
+
"list": [
|
| 414 |
+
{
|
| 415 |
+
"current": {
|
| 416 |
+
"selected": true,
|
| 417 |
+
"text": [
|
| 418 |
+
"All"
|
| 419 |
+
],
|
| 420 |
+
"value": [
|
| 421 |
+
"$__all"
|
| 422 |
+
]
|
| 423 |
+
},
|
| 424 |
+
"datasource": {
|
| 425 |
+
"type": "prometheus",
|
| 426 |
+
"uid": "prometheus"
|
| 427 |
+
},
|
| 428 |
+
"definition": "label_values(kc_build_last_run_timestamp_seconds, kernel)",
|
| 429 |
+
"includeAll": true,
|
| 430 |
+
"label": "Kernel",
|
| 431 |
+
"multi": true,
|
| 432 |
+
"name": "kernel",
|
| 433 |
+
"options": [],
|
| 434 |
+
"query": {
|
| 435 |
+
"query": "label_values(kc_build_last_run_timestamp_seconds, kernel)",
|
| 436 |
+
"refId": "PrometheusVariableQueryEditor-kernel"
|
| 437 |
+
},
|
| 438 |
+
"refresh": 1,
|
| 439 |
+
"sort": 1,
|
| 440 |
+
"type": "query"
|
| 441 |
+
},
|
| 442 |
+
{
|
| 443 |
+
"current": {
|
| 444 |
+
"selected": true,
|
| 445 |
+
"text": [
|
| 446 |
+
"All"
|
| 447 |
+
],
|
| 448 |
+
"value": [
|
| 449 |
+
"$__all"
|
| 450 |
+
]
|
| 451 |
+
},
|
| 452 |
+
"datasource": {
|
| 453 |
+
"type": "prometheus",
|
| 454 |
+
"uid": "prometheus"
|
| 455 |
+
},
|
| 456 |
+
"definition": "label_values(kc_build_last_run_timestamp_seconds, backend)",
|
| 457 |
+
"includeAll": true,
|
| 458 |
+
"label": "Backend",
|
| 459 |
+
"multi": true,
|
| 460 |
+
"name": "backend",
|
| 461 |
+
"options": [],
|
| 462 |
+
"query": {
|
| 463 |
+
"query": "label_values(kc_build_last_run_timestamp_seconds, backend)",
|
| 464 |
+
"refId": "PrometheusVariableQueryEditor-backend"
|
| 465 |
+
},
|
| 466 |
+
"refresh": 1,
|
| 467 |
+
"sort": 1,
|
| 468 |
+
"type": "query"
|
| 469 |
+
},
|
| 470 |
+
{
|
| 471 |
+
"current": {
|
| 472 |
+
"selected": true,
|
| 473 |
+
"text": [
|
| 474 |
+
"All"
|
| 475 |
+
],
|
| 476 |
+
"value": [
|
| 477 |
+
"$__all"
|
| 478 |
+
]
|
| 479 |
+
},
|
| 480 |
+
"datasource": {
|
| 481 |
+
"type": "prometheus",
|
| 482 |
+
"uid": "prometheus"
|
| 483 |
+
},
|
| 484 |
+
"definition": "label_values(kc_build_last_run_timestamp_seconds, compute_backend)",
|
| 485 |
+
"includeAll": true,
|
| 486 |
+
"label": "Compute backend",
|
| 487 |
+
"multi": true,
|
| 488 |
+
"name": "compute_backend",
|
| 489 |
+
"options": [],
|
| 490 |
+
"query": {
|
| 491 |
+
"query": "label_values(kc_build_last_run_timestamp_seconds, compute_backend)",
|
| 492 |
+
"refId": "PrometheusVariableQueryEditor-compute_backend"
|
| 493 |
+
},
|
| 494 |
+
"refresh": 1,
|
| 495 |
+
"sort": 1,
|
| 496 |
+
"type": "query"
|
| 497 |
+
},
|
| 498 |
+
{
|
| 499 |
+
"current": {
|
| 500 |
+
"selected": true,
|
| 501 |
+
"text": [
|
| 502 |
+
"All"
|
| 503 |
+
],
|
| 504 |
+
"value": [
|
| 505 |
+
"$__all"
|
| 506 |
+
]
|
| 507 |
+
},
|
| 508 |
+
"datasource": {
|
| 509 |
+
"type": "prometheus",
|
| 510 |
+
"uid": "prometheus"
|
| 511 |
+
},
|
| 512 |
+
"definition": "label_values(kc_build_last_run_timestamp_seconds, cuda_version)",
|
| 513 |
+
"includeAll": true,
|
| 514 |
+
"label": "CUDA",
|
| 515 |
+
"multi": true,
|
| 516 |
+
"name": "cuda_version",
|
| 517 |
+
"options": [],
|
| 518 |
+
"query": {
|
| 519 |
+
"query": "label_values(kc_build_last_run_timestamp_seconds, cuda_version)",
|
| 520 |
+
"refId": "PrometheusVariableQueryEditor-cuda_version"
|
| 521 |
+
},
|
| 522 |
+
"refresh": 1,
|
| 523 |
+
"sort": 1,
|
| 524 |
+
"type": "query"
|
| 525 |
+
},
|
| 526 |
+
{
|
| 527 |
+
"current": {
|
| 528 |
+
"selected": true,
|
| 529 |
+
"text": [
|
| 530 |
+
"All"
|
| 531 |
+
],
|
| 532 |
+
"value": [
|
| 533 |
+
"$__all"
|
| 534 |
+
]
|
| 535 |
+
},
|
| 536 |
+
"datasource": {
|
| 537 |
+
"type": "prometheus",
|
| 538 |
+
"uid": "prometheus"
|
| 539 |
+
},
|
| 540 |
+
"definition": "label_values(kc_build_last_run_timestamp_seconds, pytorch_version)",
|
| 541 |
+
"includeAll": true,
|
| 542 |
+
"label": "PyTorch",
|
| 543 |
+
"multi": true,
|
| 544 |
+
"name": "pytorch_version",
|
| 545 |
+
"options": [],
|
| 546 |
+
"query": {
|
| 547 |
+
"query": "label_values(kc_build_last_run_timestamp_seconds, pytorch_version)",
|
| 548 |
+
"refId": "PrometheusVariableQueryEditor-pytorch_version"
|
| 549 |
+
},
|
| 550 |
+
"refresh": 1,
|
| 551 |
+
"sort": 1,
|
| 552 |
+
"type": "query"
|
| 553 |
+
},
|
| 554 |
+
{
|
| 555 |
+
"current": {
|
| 556 |
+
"selected": true,
|
| 557 |
+
"text": [
|
| 558 |
+
"All"
|
| 559 |
+
],
|
| 560 |
+
"value": [
|
| 561 |
+
"$__all"
|
| 562 |
+
]
|
| 563 |
+
},
|
| 564 |
+
"datasource": {
|
| 565 |
+
"type": "prometheus",
|
| 566 |
+
"uid": "prometheus"
|
| 567 |
+
},
|
| 568 |
+
"definition": "label_values(kc_build_last_run_timestamp_seconds, python_version)",
|
| 569 |
+
"includeAll": true,
|
| 570 |
+
"label": "Python",
|
| 571 |
+
"multi": true,
|
| 572 |
+
"name": "python_version",
|
| 573 |
+
"options": [],
|
| 574 |
+
"query": {
|
| 575 |
+
"query": "label_values(kc_build_last_run_timestamp_seconds, python_version)",
|
| 576 |
+
"refId": "PrometheusVariableQueryEditor-python_version"
|
| 577 |
+
},
|
| 578 |
+
"refresh": 1,
|
| 579 |
+
"sort": 1,
|
| 580 |
+
"type": "query"
|
| 581 |
+
}
|
| 582 |
+
]
|
| 583 |
+
},
|
| 584 |
+
"time": {
|
| 585 |
+
"from": "now-30d",
|
| 586 |
+
"to": "now"
|
| 587 |
+
},
|
| 588 |
+
"timezone": "browser",
|
| 589 |
+
"title": "Kernels Build Matrix Overview",
|
| 590 |
+
"uid": "kernels-build-matrix",
|
| 591 |
+
"version": 1,
|
| 592 |
+
"weekStart": ""
|
| 593 |
+
}
|
monitoring/grafana/provisioning/dashboards/dashboards.yml
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
apiVersion: 1
|
| 2 |
+
|
| 3 |
+
providers:
|
| 4 |
+
- name: kernels-community
|
| 5 |
+
orgId: 1
|
| 6 |
+
folder: Kernels Community
|
| 7 |
+
type: file
|
| 8 |
+
disableDeletion: false
|
| 9 |
+
editable: true
|
| 10 |
+
options:
|
| 11 |
+
path: /var/lib/grafana/dashboards
|
monitoring/grafana/provisioning/datasources/prometheus.yml
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
apiVersion: 1
|
| 2 |
+
|
| 3 |
+
datasources:
|
| 4 |
+
- name: Prometheus
|
| 5 |
+
uid: prometheus
|
| 6 |
+
type: prometheus
|
| 7 |
+
access: proxy
|
| 8 |
+
url: http://prometheus:9090
|
| 9 |
+
isDefault: true
|
| 10 |
+
editable: false
|
monitoring/prometheus/prometheus.yml
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
global:
|
| 2 |
+
scrape_interval: 15s
|
| 3 |
+
evaluation_interval: 15s
|
| 4 |
+
|
| 5 |
+
rule_files:
|
| 6 |
+
- /etc/prometheus/rules/*.yml
|
| 7 |
+
|
| 8 |
+
scrape_configs:
|
| 9 |
+
- job_name: prometheus
|
| 10 |
+
static_configs:
|
| 11 |
+
- targets:
|
| 12 |
+
- prometheus:9090
|
| 13 |
+
|
| 14 |
+
- job_name: pushgateway
|
| 15 |
+
honor_labels: true
|
| 16 |
+
static_configs:
|
| 17 |
+
- targets:
|
| 18 |
+
- pushgateway:9091
|
monitoring/prometheus/rules/build-alerts.yml
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
groups:
|
| 2 |
+
- name: kernels-community-build-alerts
|
| 3 |
+
rules:
|
| 4 |
+
- alert: KernelsBuildMatrixComboFailing
|
| 5 |
+
expr: kc_build_last_run_failed == 1
|
| 6 |
+
for: 10m
|
| 7 |
+
labels:
|
| 8 |
+
severity: warning
|
| 9 |
+
annotations:
|
| 10 |
+
summary: "Kernel build matrix combo failing"
|
| 11 |
+
description: "{{ $labels.kernel }} backend={{ $labels.backend }} compute={{ $labels.compute_backend }} cuda={{ $labels.cuda_version }} torch={{ $labels.pytorch_version }} python={{ $labels.python_version }} is currently failing."
|
| 12 |
+
|
| 13 |
+
- alert: KernelsBuildMetricsStale
|
| 14 |
+
expr: (time() - kc_build_last_run_timestamp_seconds) > 86400
|
| 15 |
+
for: 30m
|
| 16 |
+
labels:
|
| 17 |
+
severity: warning
|
| 18 |
+
annotations:
|
| 19 |
+
summary: "Kernel build metrics stale"
|
| 20 |
+
description: "{{ $labels.kernel }} backend={{ $labels.backend }} compute={{ $labels.compute_backend }} cuda={{ $labels.cuda_version }} torch={{ $labels.pytorch_version }} python={{ $labels.python_version }} has not pushed fresh metrics for more than 24 hours."
|
requirements-dev.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
-r requirements.txt
|
| 2 |
+
pytest>=8.3,<9
|
| 3 |
+
ruff>=0.11,<0.12
|
requirements.txt
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio>=6.10,<7
|
| 2 |
+
httpx>=0.27,<1
|
| 3 |
+
pydantic>=2.7,<3
|
| 4 |
+
PyYAML>=6.0,<7
|
| 5 |
+
cachetools>=5.3,<6
|
| 6 |
+
python-dateutil>=2.9,<3
|
| 7 |
+
python-dotenv>=1.0,<2
|
| 8 |
+
huggingface_hub>=0.30,<1
|
| 9 |
+
beautifulsoup4>=4.14,<5
|
scripts/bootstrap_space.py
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ruff: noqa: E402
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import argparse
|
| 5 |
+
import os
|
| 6 |
+
import subprocess
|
| 7 |
+
import sys
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
|
| 10 |
+
from dotenv import load_dotenv
|
| 11 |
+
from huggingface_hub import HfApi
|
| 12 |
+
from huggingface_hub.utils import get_token
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
# Locate the repository root relative to this script (one directory above the
# script's folder) and put the in-repo ``src`` directory at the front of
# ``sys.path`` so the ``kc_monitor`` import that follows can be resolved.
ROOT_DIR = Path(__file__).resolve().parents[1]
SRC_DIR = ROOT_DIR.joinpath("src")
if sys.path.count(str(SRC_DIR)) == 0:
    sys.path.insert(0, str(SRC_DIR))
|
| 19 |
+
|
| 20 |
+
from kc_monitor.config import load_config
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def _cached_github_token() -> str | None:
|
| 24 |
+
try:
|
| 25 |
+
completed = subprocess.run(
|
| 26 |
+
["gh", "auth", "token"],
|
| 27 |
+
capture_output=True,
|
| 28 |
+
text=True,
|
| 29 |
+
check=True,
|
| 30 |
+
)
|
| 31 |
+
except (OSError, subprocess.CalledProcessError):
|
| 32 |
+
return None
|
| 33 |
+
token = completed.stdout.strip()
|
| 34 |
+
return token or None
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the Space bootstrap script.

    Returns:
        A parser exposing ``--space-id`` (defaulting to the ``KCM_SPACE_ID``
        environment variable, then a hard-coded repo ID) and the boolean
        switches ``--private``, ``--skip-secret``, ``--skip-variables`` and
        ``--create-pr``.
    """
    cli = argparse.ArgumentParser(description="Create or update the Kernels Community monitor Space.")

    default_space_id = os.getenv("KCM_SPACE_ID", "adarshxs/kernels-community-monitor")
    cli.add_argument("--space-id", default=default_space_id, help="Target Hugging Face Space repo ID.")

    # Every remaining option is a plain on/off switch; register them in a loop.
    switches = (
        ("--private", "Create the Space as private if it does not already exist."),
        ("--skip-secret", "Do not update the GITHUB_TOKEN Space secret."),
        ("--skip-variables", "Do not update Space variables/settings."),
        (
            "--create-pr",
            "Open a Hub pull request instead of pushing directly when write access is unavailable.",
        ),
    )
    for flag, description in switches:
        cli.add_argument(flag, action="store_true", help=description)
    return cli
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def main() -> int:
    """Create or update the monitor Space on the Hugging Face Hub and upload this repo.

    Steps, in order: load ``.env``, parse CLI arguments, resolve the Hugging
    Face and GitHub tokens, create the Space if it does not exist
    (``exist_ok=True``), optionally store ``GITHUB_TOKEN`` as a Space secret,
    optionally copy configuration values into Space variables, upload the
    repository folder, and finally print the Space URL and runtime state.

    Returns:
        ``0`` on completion. A missing Hugging Face token is reported through
        ``parser.error``, which terminates the process instead of returning.
    """
    load_dotenv()
    parser = build_parser()
    args = parser.parse_args()

    hf_token = os.getenv("HF_TOKEN") or get_token()
    github_token = os.getenv("GITHUB_TOKEN") or os.getenv("GH_TOKEN") or _cached_github_token()
    if not hf_token:
        parser.error("HF_TOKEN must be set in the environment or available from a local Hugging Face login.")

    config = load_config(ROOT_DIR / "config" / "monitor.yaml")
    api = HfApi(token=hf_token)

    api.create_repo(
        repo_id=args.space_id,
        repo_type="space",
        space_sdk="gradio",
        private=args.private,
        exist_ok=True,
    )

    if github_token and not args.skip_secret:
        api.add_space_secret(repo_id=args.space_id, key="GITHUB_TOKEN", value=github_token)

    if not args.skip_variables:
        # Insertion order is preserved, so variables are pushed GitHub/monitor
        # settings first and Grafana/Prometheus/Pushgateway settings second.
        space_variables = {
            "KCM_GITHUB_OWNER": config.github.owner,
            "KCM_GITHUB_REPO": config.github.repo,
            "KCM_GITHUB_BRANCH": config.github.branch,
            "KCM_REFRESH_INTERVAL_SECONDS": str(config.monitor.refresh_interval_seconds),
            "KCM_WORKFLOW_RUN_PAGE_SIZE": str(config.monitor.workflow_run_page_size),
            "KCM_WORKFLOW_RUN_PAGES": str(config.monitor.workflow_run_pages),
        }
        if config.monitor.critical_kernels:
            space_variables["KCM_CRITICAL_KERNELS"] = ",".join(config.monitor.critical_kernels)
        space_variables.update(
            {
                "KCM_GRAFANA_BASE_URL": config.grafana.base_url,
                "KCM_GRAFANA_ORG_ID": str(config.grafana.org_id),
                "KCM_GRAFANA_THEME": config.grafana.theme,
                "KCM_GRAFANA_OVERVIEW_UID": config.grafana.overview_dashboard_uid,
                "KCM_GRAFANA_DURATION_UID": config.grafana.duration_dashboard_uid,
                "KCM_GRAFANA_FAILURE_UID": config.grafana.failure_dashboard_uid,
                "KCM_PROMETHEUS_BASE_URL": config.prometheus.base_url,
                "KCM_PUSHGATEWAY_URL": config.pushgateway.url,
                "KCM_PUSHGATEWAY_JOB_NAME": config.pushgateway.job_name,
            }
        )
        for key, value in space_variables.items():
            # Unset/empty settings (e.g. an unconfigured base URL) are not pushed.
            if value:
                api.add_space_variable(repo_id=args.space_id, key=key, value=value)

    api.upload_folder(
        repo_id=args.space_id,
        repo_type="space",
        folder_path=str(ROOT_DIR),
        create_pr=args.create_pr,
        ignore_patterns=[
            ".env",
            ".git",
            ".git/*",
            ".venv/*",
            "venv/*",
            "__pycache__/*",
            # Patterns are matched against the whole relative path, so the
            # root-anchored entry above misses bytecode caches in nested
            # packages (src/kc_monitor/__pycache__, tests/__pycache__, ...).
            "*/__pycache__/*",
            "*.pyc",
            ".pytest_cache/*",
            ".ruff_cache/*",
            "*.log",
        ],
    )

    print(f"Space URL: https://huggingface.co/spaces/{args.space_id}")
    try:
        runtime = api.get_space_runtime(repo_id=args.space_id)
        print(f"Runtime stage: {runtime.stage}")
        print(f"Hardware: {runtime.hardware}")
    except Exception:
        # Deliberate best-effort: the upload already succeeded, so a failed
        # runtime lookup is reported but must not fail the script.
        print("Runtime not yet available (Space is provisioning).")
    return 0
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
if __name__ == "__main__":
    # Use main()'s integer return value as the process exit status.
    sys.exit(main())
|
scripts/push_build_metrics.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ruff: noqa: E402
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import os
|
| 5 |
+
import sys
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
|
| 8 |
+
from dotenv import load_dotenv
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
# Locate the repository root relative to this script (one directory above the
# script's folder) and put the in-repo ``src`` directory at the front of
# ``sys.path`` so the ``kc_monitor`` imports that follow can be resolved.
ROOT_DIR = Path(__file__).resolve().parents[1]
SRC_DIR = ROOT_DIR.joinpath("src")
if sys.path.count(str(SRC_DIR)) == 0:
    sys.path.insert(0, str(SRC_DIR))
|
| 15 |
+
|
| 16 |
+
from kc_monitor.config import load_config
|
| 17 |
+
from kc_monitor.metrics_push import BuildMetricSample, push_build_metrics
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def main() -> int:
    """Push a single build-metric sample to a Prometheus Pushgateway.

    The Pushgateway URL comes from ``PUSHGATEWAY_URL`` and falls back to the
    loaded configuration; the job name comes from ``KCM_PUSHGATEWAY_JOB_NAME``
    with the same fallback. The sample itself is built by
    ``BuildMetricSample.from_env`` from the process environment.

    Returns:
        ``0`` after the sample has been pushed and a summary printed.

    Raises:
        SystemExit: If no Pushgateway URL could be determined.
    """
    load_dotenv()
    config = load_config(ROOT_DIR / "config" / "monitor.yaml")

    gateway = os.getenv("PUSHGATEWAY_URL")
    if not gateway:
        gateway = config.pushgateway.url
    if not gateway:
        raise SystemExit("Pushgateway URL is required via PUSHGATEWAY_URL or KCM_PUSHGATEWAY_URL.")

    job = os.getenv("KCM_PUSHGATEWAY_JOB_NAME")
    if not job:
        job = config.pushgateway.job_name

    sample = BuildMetricSample.from_env(os.environ)
    push_url = push_build_metrics(sample, pushgateway_url=gateway, job_name=job)

    print(f"Pushed metrics to {push_url}")
    print(f"Matrix combo: {sample.grouping_key}")
    outcome = (
        "Outcome:"
        f" result={sample.result}"
        f" result_code={sample.result_code}"
        f" failed={sample.failed}"
        f" duration_seconds={sample.duration_seconds:.3f}"
    )
    print(outcome)
    return 0
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
if __name__ == "__main__":
    # Use main()'s integer return value as the process exit status.
    sys.exit(main())
|
scripts/smoke_check.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ruff: noqa: E402
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import sys
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
# Locate the repository root relative to this script (one directory above the
# script's folder) and put the in-repo ``src`` directory at the front of
# ``sys.path`` so the ``kc_monitor`` imports that follow can be resolved.
ROOT_DIR = Path(__file__).resolve().parents[1]
SRC_DIR = ROOT_DIR.joinpath("src")
if sys.path.count(str(SRC_DIR)) == 0:
    sys.path.insert(0, str(SRC_DIR))
|
| 12 |
+
|
| 13 |
+
from kc_monitor.config import load_config
|
| 14 |
+
from kc_monitor.grafana import build_dashboard_url, dashboard_catalog
|
| 15 |
+
from kc_monitor.service import MonitorService
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def main() -> int:
    """Fetch one fresh monitor snapshot and print a human-readable report.

    The report contains the snapshot timestamp, summary counters, up to ten
    kernel rows, the configured Grafana/Prometheus/Pushgateway endpoints and
    one line per dashboard returned by ``dashboard_catalog``.

    Returns:
        ``1`` when the snapshot carries errors and no kernel rows at all,
        otherwise ``0``.
    """
    config = load_config(ROOT_DIR / "config" / "monitor.yaml")
    service = MonitorService(config)
    try:
        snapshot = service.get_snapshot(force_refresh=True)
    finally:
        # Always release the service, even when the snapshot fetch raises.
        service.close()

    print(f"Generated at: {snapshot.generated_at.isoformat()}")
    summary_line = (
        "Summary:"
        f" tracked={snapshot.summary.tracked_kernels}"
        f" active={snapshot.summary.active_builds}"
        f" uploading={snapshot.summary.uploading_builds}"
        f" failed={snapshot.summary.failed_builds}"
    )
    print(summary_line)

    for row in snapshot.kernel_rows[:10]:
        primary = row.primary_group
        if primary:
            run_url = primary.run.html_url
        else:
            run_url = "n/a"
        row_line = (
            f"- {row.kernel_name:20}"
            f" status={row.row_status_label:10}"
            f" runs={row.recent_run_count:2}"
            f" run={run_url}"
        )
        print(row_line)

    print(f"Grafana enabled: {config.grafana.enabled}")
    print(f"Grafana base URL: {config.grafana.base_url or 'not configured'}")
    print(f"Prometheus base URL: {config.prometheus.base_url or 'not configured'}")
    print(f"Pushgateway URL: {config.pushgateway.url or 'not configured'}")

    for dashboard in dashboard_catalog(config.grafana):
        view_url = build_dashboard_url(config.grafana, dashboard.uid, embed=False) or 'not configured'
        dashboard_line = (
            f"- {dashboard.title:18}"
            f" uid={dashboard.uid:24}"
            f" view={view_url}"
        )
        print(dashboard_line)

    nothing_usable = snapshot.errors and not snapshot.kernel_rows
    return 1 if nothing_usable else 0
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
if __name__ == "__main__":
    # Use main()'s integer return value as the process exit status.
    sys.exit(main())
|
src/kc_monitor/__init__.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Kernels Community Monitor package."""

# Version string of the package; it is the only name listed in ``__all__``.
__version__ = "0.2.0"

__all__ = ["__version__"]
|
src/kc_monitor/config.py
ADDED
|
@@ -0,0 +1,190 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from typing import Any, Literal
|
| 6 |
+
|
| 7 |
+
import yaml
|
| 8 |
+
from dotenv import load_dotenv
|
| 9 |
+
from pydantic import BaseModel, ConfigDict, Field
|
| 10 |
+
|
| 11 |
+
from kc_monitor.models import WorkflowTarget
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
# Repository root: two directories above the folder containing this module.
ROOT_DIR = Path(__file__).resolve().parents[2]
# Location of the monitor configuration file inside the repository.
DEFAULT_CONFIG_PATH = ROOT_DIR.joinpath("config", "monitor.yaml")
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class GitHubSettings(BaseModel):
    """GitHub-related settings: target repository, request options and workflows."""

    # Keys that are not declared below are ignored rather than rejected.
    model_config = ConfigDict(extra="ignore")

    # Coordinates of the repository.
    owner: str = "huggingface"
    repo: str = "kernels-community"
    branch: str = "main"

    # Request options.
    per_page: int = 30
    request_timeout_seconds: float = 25.0
    user_agent: str = "kernels-community-monitor/0.1"
    token: str | None = None

    # Workflow targets; ``AppConfig.workflow_targets`` keeps only enabled ones.
    workflows: list[WorkflowTarget] = Field(default_factory=list)

    @property
    def repo_slug(self) -> str:
        """Return the repository identifier in ``owner/repo`` form."""
        return "/".join((self.owner, self.repo))
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
class MonitorSettings(BaseModel):
    """Tunable limits and intervals for the monitor, plus the critical-kernel list."""

    # Keys that are not declared below are ignored rather than rejected.
    model_config = ConfigDict(extra="ignore")

    # Refresh and caching intervals, in seconds.
    refresh_interval_seconds: int = 120
    snapshot_ttl_seconds: int = 45

    # Workflow-run paging and recency limits.
    workflow_run_page_size: int = 100
    workflow_run_pages: int = 10
    recent_completed_hours: int = 72
    recent_limit: int = 40
    completed_runs_per_workflow: int = 3

    # Log and detail-view limits.
    log_line_limit: int = 400
    log_char_limit: int = 35000
    detail_event_limit: int = 25

    # Stall thresholds, in minutes.
    stall_without_log_minutes: int = 45
    stall_active_phase_minutes: int = 180

    critical_kernels: list[str] = Field(default_factory=list)

    @property
    def critical_kernel_set(self) -> set[str]:
        """Return the critical kernel names, whitespace-stripped, blanks dropped."""
        names: set[str] = set()
        for entry in self.critical_kernels:
            cleaned = entry.strip()
            if cleaned:
                names.add(cleaned)
        return names
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
class GrafanaSettings(BaseModel):
    """Grafana settings: base URL, display defaults and dashboard UIDs."""

    # Keys that are not declared below are ignored rather than rejected.
    model_config = ConfigDict(extra="ignore")

    # ``None`` (or an empty string) makes ``enabled`` evaluate to False.
    base_url: str | None = None
    org_id: int = 1
    theme: Literal["dark", "light"] = "dark"

    # Default time range and refresh interval.
    default_from: str = "now-30d"
    default_to: str = "now"
    default_refresh: str = "5m"

    # UIDs of the three dashboards.
    overview_dashboard_uid: str = "kernels-build-matrix"
    duration_dashboard_uid: str = "kernels-build-durations"
    failure_dashboard_uid: str = "kernels-build-failures"

    @property
    def enabled(self) -> bool:
        """Return ``True`` when a non-empty ``base_url`` is configured."""
        return True if self.base_url else False
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
class PrometheusSettings(BaseModel):
    """Prometheus settings; currently only the optional base URL."""

    # Keys that are not declared below are ignored rather than rejected.
    model_config = ConfigDict(extra="ignore")

    # Left as ``None`` when no Prometheus base URL is configured.
    base_url: str | None = None
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
class PushgatewaySettings(BaseModel):
    """Pushgateway settings: the optional URL and the job name."""

    # Keys that are not declared below are ignored rather than rejected.
    model_config = ConfigDict(extra="ignore")

    # Left as ``None`` when no Pushgateway URL is configured.
    url: str | None = None
    job_name: str = "kernels-community-build-matrix"
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
class AppConfig(BaseModel):
    """Top-level application configuration grouping every settings section."""

    # Keys that are not declared below are ignored rather than rejected.
    model_config = ConfigDict(extra="ignore")

    # Each section falls back to its own defaults when omitted.
    github: GitHubSettings = Field(default_factory=GitHubSettings)
    monitor: MonitorSettings = Field(default_factory=MonitorSettings)
    grafana: GrafanaSettings = Field(default_factory=GrafanaSettings)
    prometheus: PrometheusSettings = Field(default_factory=PrometheusSettings)
    pushgateway: PushgatewaySettings = Field(default_factory=PushgatewaySettings)

    @property
    def workflow_targets(self) -> list[WorkflowTarget]:
        """Return the configured GitHub workflows whose ``enabled`` flag is truthy."""
        active: list[WorkflowTarget] = []
        for target in self.github.workflows:
            if target.enabled:
                active.append(target)
        return active
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
def _deep_merge(base: dict[str, Any], updates: dict[str, Any]) -> dict[str, Any]:
|
| 103 |
+
merged = dict(base)
|
| 104 |
+
for key, value in updates.items():
|
| 105 |
+
if isinstance(value, dict) and isinstance(merged.get(key), dict):
|
| 106 |
+
merged[key] = _deep_merge(merged[key], value)
|
| 107 |
+
else:
|
| 108 |
+
merged[key] = value
|
| 109 |
+
return merged
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def _load_yaml(path: Path) -> dict[str, Any]:
|
| 113 |
+
if not path.exists():
|
| 114 |
+
return {}
|
| 115 |
+
with path.open("r", encoding="utf-8") as handle:
|
| 116 |
+
return yaml.safe_load(handle) or {}
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
def _csv_env(name: str) -> list[str] | None:
|
| 120 |
+
raw = os.getenv(name)
|
| 121 |
+
if raw is None:
|
| 122 |
+
return None
|
| 123 |
+
return [item.strip() for item in raw.split(",") if item.strip()]
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
def _env_overrides() -> dict[str, Any]:
    """Collect configuration overrides from environment variables.

    Returns:
        A dict with one sub-dict per configuration section (``github``,
        ``monitor``, ``grafana``, ``prometheus``, ``pushgateway``). Each
        sub-dict only contains the keys whose environment variable carries a
        value, so it can be deep-merged over the file configuration without
        clobbering settings that were not overridden.

    Blank or whitespace-only scalar variables (for example ``KCM_RECENT_LIMIT=``
    in a ``.env`` file) are treated as unset; passing them through would hand
    an empty string to fields that expect an integer or a literal and make
    validation fail. ``KCM_CRITICAL_KERNELS`` keeps its list semantics: when
    the variable is set, even to an empty string, the parsed list (possibly
    empty) is used as the override.
    """
    # section -> {config key: environment variable name}
    scalar_sources: dict[str, dict[str, str]] = {
        "github": {
            "owner": "KCM_GITHUB_OWNER",
            "repo": "KCM_GITHUB_REPO",
            "branch": "KCM_GITHUB_BRANCH",
        },
        "monitor": {
            "refresh_interval_seconds": "KCM_REFRESH_INTERVAL_SECONDS",
            "snapshot_ttl_seconds": "KCM_SNAPSHOT_TTL_SECONDS",
            "workflow_run_page_size": "KCM_WORKFLOW_RUN_PAGE_SIZE",
            "workflow_run_pages": "KCM_WORKFLOW_RUN_PAGES",
            "recent_completed_hours": "KCM_RECENT_COMPLETED_HOURS",
            "recent_limit": "KCM_RECENT_LIMIT",
            "completed_runs_per_workflow": "KCM_COMPLETED_RUNS_PER_WORKFLOW",
            "log_line_limit": "KCM_LOG_LINE_LIMIT",
            "log_char_limit": "KCM_LOG_CHAR_LIMIT",
            "detail_event_limit": "KCM_DETAIL_EVENT_LIMIT",
            "stall_without_log_minutes": "KCM_STALL_WITHOUT_LOG_MINUTES",
            "stall_active_phase_minutes": "KCM_STALL_ACTIVE_PHASE_MINUTES",
        },
        "grafana": {
            "base_url": "KCM_GRAFANA_BASE_URL",
            "org_id": "KCM_GRAFANA_ORG_ID",
            "theme": "KCM_GRAFANA_THEME",
            "default_from": "KCM_GRAFANA_FROM",
            "default_to": "KCM_GRAFANA_TO",
            "default_refresh": "KCM_GRAFANA_REFRESH",
            "overview_dashboard_uid": "KCM_GRAFANA_OVERVIEW_UID",
            "duration_dashboard_uid": "KCM_GRAFANA_DURATION_UID",
            "failure_dashboard_uid": "KCM_GRAFANA_FAILURE_UID",
        },
        "prometheus": {
            "base_url": "KCM_PROMETHEUS_BASE_URL",
        },
        "pushgateway": {
            "url": "KCM_PUSHGATEWAY_URL",
            "job_name": "KCM_PUSHGATEWAY_JOB_NAME",
        },
    }

    overrides: dict[str, Any] = {}
    for section, sources in scalar_sources.items():
        section_values: dict[str, Any] = {}
        for key, env_name in sources.items():
            raw = os.getenv(env_name)
            # A blank value means "not configured", same as a missing variable.
            if raw is not None and raw.strip():
                section_values[key] = raw
        overrides[section] = section_values

    # GITHUB_TOKEN wins over GH_TOKEN; an empty token is never an override.
    token = os.getenv("GITHUB_TOKEN") or os.getenv("GH_TOKEN")
    if token:
        overrides["github"]["token"] = token

    critical_kernels = _csv_env("KCM_CRITICAL_KERNELS")
    if critical_kernels is not None:
        overrides["monitor"]["critical_kernels"] = critical_kernels

    return overrides
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
def load_config(config_path: str | Path | None = None) -> AppConfig:
    """Build the application configuration from a YAML file and the environment.

    Loads variables from a ``.env`` file first, then reads the YAML file at
    ``config_path`` (``DEFAULT_CONFIG_PATH`` when no path is given), layers
    the environment overrides on top, and validates the merged mapping into
    an ``AppConfig``.
    """
    load_dotenv()
    yaml_path = DEFAULT_CONFIG_PATH if not config_path else Path(config_path)
    file_values = _load_yaml(yaml_path)
    env_values = _env_overrides()
    combined = _deep_merge(file_values, env_values)
    return AppConfig.model_validate(combined)
|
src/kc_monitor/github_client.py
ADDED
|
@@ -0,0 +1,456 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import base64
|
| 4 |
+
import html
|
| 5 |
+
import json
|
| 6 |
+
import re
|
| 7 |
+
import subprocess
|
| 8 |
+
from typing import Any
|
| 9 |
+
|
| 10 |
+
from bs4 import BeautifulSoup
|
| 11 |
+
import httpx
|
| 12 |
+
|
| 13 |
+
from kc_monitor.models import GitHubJob, GitHubRun, parse_github_datetime, utcnow
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class GitHubActionsError(RuntimeError):
    """Error raised when a GitHub request yields a response the client cannot use.

    The client raises it for API responses with an error status and for
    public pages that lack the data it expects to find.
    """
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class GitHubActionsClient:
|
| 21 |
+
def __init__(
    self,
    owner: str,
    repo: str,
    token: str | None = None,
    request_timeout_seconds: float = 25.0,
    user_agent: str = "kernels-community-monitor/0.1",
) -> None:
    """Create the HTTP clients used to talk to GitHub for one repository.

    Args:
        owner: Repository owner (user or organisation).
        repo: Repository name.
        token: Optional API token. When empty, the token reported by the
            ``gh`` CLI is used if one is available.
        request_timeout_seconds: Timeout applied to every client.
        user_agent: ``User-Agent`` header sent by every client.

    Four clients are created: an API client that carries the token (when
    one was found), an API client without credentials, a client for
    ``github.com`` pages and a client for ``raw.githubusercontent.com``.
    The two API clients do not follow redirects; the other two do.
    """
    resolved_token = token or self._token_from_gh_cli()

    api_headers = {
        "Accept": "application/vnd.github+json",
        "User-Agent": user_agent,
        "X-GitHub-Api-Version": "2022-11-28",
    }
    authenticated_headers = dict(api_headers)
    if resolved_token:
        authenticated_headers["Authorization"] = f"Bearer {resolved_token}"

    self.owner = owner
    self.repo = repo
    self._client = httpx.Client(
        base_url="https://api.github.com",
        headers=authenticated_headers,
        timeout=request_timeout_seconds,
        follow_redirects=False,
    )
    self._anonymous_client = httpx.Client(
        base_url="https://api.github.com",
        headers=api_headers,
        timeout=request_timeout_seconds,
        follow_redirects=False,
    )
    self._web_client = httpx.Client(
        base_url="https://github.com",
        headers={"User-Agent": user_agent},
        timeout=request_timeout_seconds,
        follow_redirects=True,
    )
    self._raw_client = httpx.Client(
        base_url="https://raw.githubusercontent.com",
        headers={"User-Agent": user_agent},
        timeout=request_timeout_seconds,
        follow_redirects=True,
    )
|
| 69 |
+
|
| 70 |
+
@staticmethod
|
| 71 |
+
def _token_from_gh_cli() -> str | None:
|
| 72 |
+
try:
|
| 73 |
+
completed = subprocess.run(
|
| 74 |
+
["gh", "auth", "token"],
|
| 75 |
+
capture_output=True,
|
| 76 |
+
text=True,
|
| 77 |
+
check=True,
|
| 78 |
+
)
|
| 79 |
+
except (OSError, subprocess.CalledProcessError):
|
| 80 |
+
return None
|
| 81 |
+
token = completed.stdout.strip()
|
| 82 |
+
return token or None
|
| 83 |
+
|
| 84 |
+
def close(self) -> None:
|
| 85 |
+
self._client.close()
|
| 86 |
+
self._anonymous_client.close()
|
| 87 |
+
self._web_client.close()
|
| 88 |
+
self._raw_client.close()
|
| 89 |
+
|
| 90 |
+
@staticmethod
|
| 91 |
+
def _is_classic_pat_forbidden(response: httpx.Response) -> bool:
|
| 92 |
+
return response.status_code == 403 and "forbids access via a personal access token (classic)" in response.text
|
| 93 |
+
|
| 94 |
+
def _request_with_fallback(self, method: str, path: str, **kwargs: Any) -> httpx.Response:
|
| 95 |
+
response = self._client.request(method, path, **kwargs)
|
| 96 |
+
if self._is_classic_pat_forbidden(response):
|
| 97 |
+
response = self._anonymous_client.request(method, path, **kwargs)
|
| 98 |
+
return response
|
| 99 |
+
|
| 100 |
+
def _request(self, method: str, path: str, **kwargs: Any) -> httpx.Response:
|
| 101 |
+
response = self._request_with_fallback(method, path, **kwargs)
|
| 102 |
+
if response.status_code >= 400:
|
| 103 |
+
raise GitHubActionsError(
|
| 104 |
+
f"GitHub API request failed for {path}: {response.status_code} {response.text}"
|
| 105 |
+
)
|
| 106 |
+
return response
|
| 107 |
+
|
| 108 |
+
@staticmethod
|
| 109 |
+
def _should_use_public_fallback(response: httpx.Response) -> bool:
|
| 110 |
+
text = response.text.lower()
|
| 111 |
+
return response.status_code in {403, 404, 429} or "rate limit exceeded" in text
|
| 112 |
+
|
| 113 |
+
@staticmethod
|
| 114 |
+
def _workflow_path(workflow_file: str) -> str:
|
| 115 |
+
if workflow_file.startswith(".github/workflows/"):
|
| 116 |
+
return workflow_file
|
| 117 |
+
return f".github/workflows/{workflow_file}"
|
| 118 |
+
|
| 119 |
+
@staticmethod
|
| 120 |
+
def _parse_run_state(aria_label: str) -> tuple[str, str | None]:
|
| 121 |
+
normalized = aria_label.lower()
|
| 122 |
+
if "completed successfully" in normalized:
|
| 123 |
+
return "completed", "success"
|
| 124 |
+
if "cancel" in normalized:
|
| 125 |
+
return "completed", "cancelled"
|
| 126 |
+
if "fail" in normalized:
|
| 127 |
+
return "completed", "failure"
|
| 128 |
+
if "queued" in normalized:
|
| 129 |
+
return "queued", None
|
| 130 |
+
if "in progress" in normalized or "running" in normalized:
|
| 131 |
+
return "in_progress", None
|
| 132 |
+
return "completed", None
|
| 133 |
+
|
| 134 |
+
def _list_workflow_runs_public(self, workflow_file: str, page: int = 1) -> list[GitHubRun]:
    """Scrape one page of a workflow's runs from the public GitHub web page.

    Each ``div.Box-row`` containing a link to a run becomes one
    ``GitHubRun``. The page exposes less than the API, so several fields
    are approximated: ``head_sha`` is empty, the three timestamps all use
    the row's ``relative-time`` value (or the current time when absent),
    and ``event`` is "pull_request" when the row links to a pull request
    and "workflow_dispatch" otherwise. Rows without a run link, or whose
    run id is not numeric, are skipped.

    Raises:
        httpx.HTTPStatusError: When the page request returns an error status.
    """
    response = self._web_client.get(
        f"/{self.owner}/{self.repo}/actions/workflows/{workflow_file}",
        params={"page": page},
    )
    response.raise_for_status()
    document = BeautifulSoup(response.text, "html.parser")

    run_prefix = f"/{self.owner}/{self.repo}/actions/runs/"
    branch_prefix = f"/{self.owner}/{self.repo}/tree/refs/heads/"
    pull_prefix = f"/{self.owner}/{self.repo}/pull/"
    workflow_path = self._workflow_path(workflow_file)

    def first_anchor(anchors, accept):
        # First anchor whose href (empty string when missing) is accepted.
        for anchor in anchors:
            if accept(anchor.get("href") or ""):
                return anchor
        return None

    def is_single_segment_link(href: str) -> bool:
        # A root-relative link with exactly one slash that is not a run,
        # branch or pull-request link; used to pick the actor link.
        return (
            bool(href)
            and href.startswith("/")
            and not href.startswith(run_prefix)
            and not href.startswith(branch_prefix)
            and not href.startswith(pull_prefix)
            and href.count("/") == 1
        )

    collected: list[GitHubRun] = []
    for row in document.find_all("div", class_="Box-row"):
        anchors = row.find_all("a")

        run_anchor = first_anchor(anchors, lambda href: href.startswith(run_prefix))
        if not run_anchor:
            continue

        run_href = run_anchor.get("href") or ""
        try:
            run_id = int(run_href.rstrip("/").split("/")[-1])
        except ValueError:
            continue

        status, conclusion = self._parse_run_state(run_anchor.get("aria-label") or "")

        time_tag = row.find("relative-time")
        timestamp = parse_github_datetime(time_tag.get("datetime")) if time_tag else None
        run_time = timestamp or utcnow()

        branch_anchor = first_anchor(anchors, lambda href: href.startswith(branch_prefix))
        actor_anchor = first_anchor(anchors, is_single_segment_link)
        pull_anchor = first_anchor(anchors, lambda href: href.startswith(pull_prefix))
        name_tag = row.find("span", class_="text-bold")

        collected.append(
            GitHubRun(
                id=run_id,
                name=name_tag.get_text(" ", strip=True) if name_tag else workflow_file,
                display_title=run_anchor.get_text(" ", strip=True),
                path=workflow_path,
                status=status,
                conclusion=conclusion,
                head_branch=branch_anchor.get_text(" ", strip=True) if branch_anchor else "",
                head_sha="",
                event="pull_request" if pull_anchor else "workflow_dispatch",
                html_url=f"https://github.com{run_href}",
                jobs_url=f"https://api.github.com/repos/{self.owner}/{self.repo}/actions/runs/{run_id}/jobs",
                created_at=run_time,
                updated_at=run_time,
                run_started_at=run_time,
                actor_login=actor_anchor.get_text(" ", strip=True) if actor_anchor else None,
                raw={"source": "public_html"},
            )
        )
    return collected
|
| 214 |
+
|
| 215 |
+
@staticmethod
|
| 216 |
+
def _runner_group_from_job_name(job_name: str) -> str | None:
|
| 217 |
+
match = re.search(r"\(([^)]+)\)", job_name)
|
| 218 |
+
if not match:
|
| 219 |
+
return None
|
| 220 |
+
parts = [part.strip() for part in match.group(1).split(",") if part.strip()]
|
| 221 |
+
if len(parts) < 2:
|
| 222 |
+
return None
|
| 223 |
+
return parts[1]
|
| 224 |
+
|
| 225 |
+
def _list_jobs_public(self, run_id: int) -> list[GitHubJob]:
    """List a run's jobs using data embedded in the public run page.

    The run page is fetched and searched for the embedded-data script of
    the ``actions-run-jobs-list`` partial; its ``jobGroupsFetchUrl`` is
    then requested for the first batch of up to 200 entries. Only the
    ``nonNested`` jobs of each group are converted. Fields the page does
    not provide are filled with placeholders: empty strings, attempt 1, no
    start/completion time, no steps, and the current time as ``created_at``.

    Raises:
        GitHubActionsError: When the page lacks the jobs script or the fetch URL.
        httpx.HTTPStatusError: When either HTTP request returns an error status.
    """
    page = self._web_client.get(f"/{self.owner}/{self.repo}/actions/runs/{run_id}")
    page.raise_for_status()
    document = BeautifulSoup(page.text, "html.parser")

    jobs_script = None
    for script in document.find_all("script"):
        if script.get("data-target") != "react-partial.embeddedData":
            continue
        partial = script.find_parent("react-partial")
        if partial and partial.get("partial-name") == "actions-run-jobs-list":
            jobs_script = script
            break
    if jobs_script is None or not jobs_script.string:
        raise GitHubActionsError(f"Could not locate jobs list for run {run_id} in the public page.")

    props = json.loads(jobs_script.string).get("props") or {}
    fetch_url = props.get("jobGroupsFetchUrl")
    if not fetch_url:
        raise GitHubActionsError(f"Public run page for {run_id} did not expose job groups fetch URL.")

    batch = self._web_client.get(
        fetch_url,
        params={"batch": 0, "size": 200},
        headers={"X-Requested-With": "XMLHttpRequest"},
    )
    batch.raise_for_status()
    groups = batch.json().get("jobGroups") or []
    run_url = f"https://github.com/{self.owner}/{self.repo}/actions/runs/{run_id}"

    collected: list[GitHubJob] = []
    for group in groups:
        entries = (group.get("nonNested") or {}).get("jobs") or []
        for entry in entries:
            # Fall back to the group name when the job has no display name.
            job_name = entry.get("displayName") or group.get("name") or ""
            job_href = entry.get("href") or ""
            collected.append(
                GitHubJob(
                    id=entry["id"],
                    run_id=run_id,
                    workflow_name="",
                    head_branch="",
                    run_url=run_url,
                    run_attempt=1,
                    head_sha="",
                    url="",
                    html_url=f"https://github.com{job_href}",
                    status=entry.get("status") or "unknown",
                    conclusion=entry.get("conclusion"),
                    created_at=utcnow(),
                    started_at=None,
                    completed_at=None,
                    name=job_name,
                    steps=[],
                    runner_group_name=self._runner_group_from_job_name(job_name),
                )
            )
    return collected
|
| 289 |
+
|
| 290 |
+
def _list_repo_tree_paths_public(self, ref: str = "main") -> list[str]:
    """Find top-level ``<dir>/build.toml`` files using public pages only.

    The repository's tree page for ``ref`` is scanned for links to
    top-level entries. Each entry whose name does not start with a dot is
    probed on the raw-content host for a ``build.toml``; only entries that
    answer 200 are reported. Unlike the API listing, the result therefore
    contains nothing but ``<entry>/build.toml`` paths, in sorted order.

    Raises:
        httpx.HTTPStatusError: When the tree page request returns an error status.
    """
    response = self._web_client.get(f"/{self.owner}/{self.repo}/tree/{ref}")
    response.raise_for_status()
    document = BeautifulSoup(response.text, "html.parser")
    prefix = f"/{self.owner}/{self.repo}/tree/{ref}/"

    top_level: set[str] = set()
    for anchor in document.find_all("a"):
        href = anchor.get("href") or ""
        if not href.startswith(prefix):
            continue
        remainder = href.removeprefix(prefix)
        # Links that go deeper than one level below the ref are ignored.
        if "/" not in remainder:
            top_level.add(remainder)

    found: list[str] = []
    for candidate in sorted(top_level):
        if candidate.startswith("."):
            continue
        probe = self._raw_client.get(f"/{self.owner}/{self.repo}/{ref}/{candidate}/build.toml")
        if probe.status_code == 200:
            found.append(f"{candidate}/build.toml")
    return found
|
| 311 |
+
|
| 312 |
+
def _get_file_text_public(self, path: str, ref: str | None = None) -> str | None:
|
| 313 |
+
target_ref = ref or "main"
|
| 314 |
+
response = self._raw_client.get(f"/{self.owner}/{self.repo}/{target_ref}/{path}")
|
| 315 |
+
if response.status_code == 404:
|
| 316 |
+
return None
|
| 317 |
+
response.raise_for_status()
|
| 318 |
+
return response.text
|
| 319 |
+
|
| 320 |
+
def list_runs(self, per_page: int = 30, page: int = 1) -> list[GitHubRun]:
|
| 321 |
+
response = self._request(
|
| 322 |
+
"GET",
|
| 323 |
+
f"/repos/{self.owner}/{self.repo}/actions/runs",
|
| 324 |
+
params={"per_page": per_page, "page": page},
|
| 325 |
+
)
|
| 326 |
+
payload = response.json()
|
| 327 |
+
return [GitHubRun.from_api(item) for item in payload.get("workflow_runs") or []]
|
| 328 |
+
|
| 329 |
+
def list_workflow_runs(
|
| 330 |
+
self,
|
| 331 |
+
workflow_file: str,
|
| 332 |
+
per_page: int = 30,
|
| 333 |
+
page: int = 1,
|
| 334 |
+
) -> list[GitHubRun]:
|
| 335 |
+
response = self._request_with_fallback(
|
| 336 |
+
"GET",
|
| 337 |
+
f"/repos/{self.owner}/{self.repo}/actions/workflows/{workflow_file}/runs",
|
| 338 |
+
params={"per_page": per_page, "page": page},
|
| 339 |
+
)
|
| 340 |
+
if self._should_use_public_fallback(response):
|
| 341 |
+
return self._list_workflow_runs_public(workflow_file, page=page)
|
| 342 |
+
if response.status_code >= 400:
|
| 343 |
+
raise GitHubActionsError(
|
| 344 |
+
f"GitHub API request failed for /repos/{self.owner}/{self.repo}/actions/workflows/{workflow_file}/runs: "
|
| 345 |
+
f"{response.status_code} {response.text}"
|
| 346 |
+
)
|
| 347 |
+
payload = response.json()
|
| 348 |
+
return [GitHubRun.from_api(item) for item in payload.get("workflow_runs") or []]
|
| 349 |
+
|
| 350 |
+
def list_jobs(self, run_id: int) -> list[GitHubJob]:
|
| 351 |
+
response = self._request_with_fallback(
|
| 352 |
+
"GET",
|
| 353 |
+
f"/repos/{self.owner}/{self.repo}/actions/runs/{run_id}/jobs",
|
| 354 |
+
params={"per_page": 100},
|
| 355 |
+
)
|
| 356 |
+
if self._should_use_public_fallback(response):
|
| 357 |
+
return self._list_jobs_public(run_id)
|
| 358 |
+
if response.status_code >= 400:
|
| 359 |
+
raise GitHubActionsError(
|
| 360 |
+
f"GitHub API request failed for /repos/{self.owner}/{self.repo}/actions/runs/{run_id}/jobs: "
|
| 361 |
+
f"{response.status_code} {response.text}"
|
| 362 |
+
)
|
| 363 |
+
payload = response.json()
|
| 364 |
+
return [GitHubJob.from_api(item) for item in payload.get("jobs") or []]
|
| 365 |
+
|
| 366 |
+
def list_repo_tree_paths(self, ref: str = "main") -> list[str]:
|
| 367 |
+
response = self._request_with_fallback(
|
| 368 |
+
"GET",
|
| 369 |
+
f"/repos/{self.owner}/{self.repo}/git/trees/{ref}",
|
| 370 |
+
params={"recursive": 1},
|
| 371 |
+
)
|
| 372 |
+
if self._should_use_public_fallback(response):
|
| 373 |
+
return self._list_repo_tree_paths_public(ref=ref)
|
| 374 |
+
if response.status_code >= 400:
|
| 375 |
+
raise GitHubActionsError(
|
| 376 |
+
f"GitHub API request failed for /repos/{self.owner}/{self.repo}/git/trees/{ref}: "
|
| 377 |
+
f"{response.status_code} {response.text}"
|
| 378 |
+
)
|
| 379 |
+
payload = response.json()
|
| 380 |
+
return [item["path"] for item in payload.get("tree") or [] if item.get("path")]
|
| 381 |
+
|
| 382 |
+
def get_job_logs(
|
| 383 |
+
self,
|
| 384 |
+
job_id: int,
|
| 385 |
+
line_limit: int = 400,
|
| 386 |
+
char_limit: int = 35000,
|
| 387 |
+
job_html_url: str | None = None,
|
| 388 |
+
) -> str | None:
|
| 389 |
+
response = self._request_with_fallback(
|
| 390 |
+
"GET",
|
| 391 |
+
f"/repos/{self.owner}/{self.repo}/actions/jobs/{job_id}/logs",
|
| 392 |
+
)
|
| 393 |
+
|
| 394 |
+
if response.status_code in {301, 302, 307, 308}:
|
| 395 |
+
location = response.headers.get("Location")
|
| 396 |
+
if not location:
|
| 397 |
+
return None
|
| 398 |
+
redirected = self._anonymous_client.get(location, follow_redirects=True)
|
| 399 |
+
if redirected.status_code in {404, 410}:
|
| 400 |
+
return None
|
| 401 |
+
redirected.raise_for_status()
|
| 402 |
+
text = redirected.text
|
| 403 |
+
elif response.status_code in {404, 410}:
|
| 404 |
+
return None
|
| 405 |
+
elif response.status_code == 403 and job_html_url:
|
| 406 |
+
text = self._fetch_public_job_page(job_html_url)
|
| 407 |
+
elif response.status_code >= 400:
|
| 408 |
+
raise GitHubActionsError(
|
| 409 |
+
f"GitHub API request failed for /repos/{self.owner}/{self.repo}/actions/jobs/{job_id}/logs: "
|
| 410 |
+
f"{response.status_code} {response.text}"
|
| 411 |
+
)
|
| 412 |
+
else:
|
| 413 |
+
text = response.text
|
| 414 |
+
|
| 415 |
+
if not text:
|
| 416 |
+
return None
|
| 417 |
+
|
| 418 |
+
lines = text.splitlines()
|
| 419 |
+
if line_limit and len(lines) > line_limit:
|
| 420 |
+
lines = lines[-line_limit:]
|
| 421 |
+
trimmed = "\n".join(lines)
|
| 422 |
+
if char_limit and len(trimmed) > char_limit:
|
| 423 |
+
trimmed = trimmed[-char_limit:]
|
| 424 |
+
return trimmed
|
| 425 |
+
|
| 426 |
+
def _fetch_public_job_page(self, job_html_url: str) -> str | None:
|
| 427 |
+
response = self._anonymous_client.get(job_html_url, follow_redirects=True)
|
| 428 |
+
response.raise_for_status()
|
| 429 |
+
text = response.text
|
| 430 |
+
text = re.sub(r"(?is)<script.*?</script>", " ", text)
|
| 431 |
+
text = re.sub(r"(?is)<style.*?</style>", " ", text)
|
| 432 |
+
text = re.sub(r"(?s)<[^>]+>", "\n", text)
|
| 433 |
+
text = html.unescape(text)
|
| 434 |
+
normalized_lines = [line.strip() for line in text.splitlines() if line.strip()]
|
| 435 |
+
return "\n".join(normalized_lines)
|
| 436 |
+
|
| 437 |
+
def get_file_text(self, path: str, ref: str | None = None) -> str | None:
|
| 438 |
+
params = {"ref": ref} if ref else None
|
| 439 |
+
response = self._request_with_fallback(
|
| 440 |
+
"GET",
|
| 441 |
+
f"/repos/{self.owner}/{self.repo}/contents/{path}",
|
| 442 |
+
params=params,
|
| 443 |
+
)
|
| 444 |
+
if self._should_use_public_fallback(response):
|
| 445 |
+
return self._get_file_text_public(path, ref=ref)
|
| 446 |
+
if response.status_code >= 400:
|
| 447 |
+
raise GitHubActionsError(
|
| 448 |
+
f"GitHub API request failed for /repos/{self.owner}/{self.repo}/contents/{path}: "
|
| 449 |
+
f"{response.status_code} {response.text}"
|
| 450 |
+
)
|
| 451 |
+
payload = response.json()
|
| 452 |
+
encoded = payload.get("content")
|
| 453 |
+
if not encoded:
|
| 454 |
+
return None
|
| 455 |
+
content = base64.b64decode(encoded)
|
| 456 |
+
return content.decode("utf-8", errors="replace")
|
src/kc_monitor/grafana.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from dataclasses import dataclass
|
| 4 |
+
from urllib.parse import urlencode
|
| 5 |
+
|
| 6 |
+
from kc_monitor.config import GrafanaSettings
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
@dataclass(frozen=True, slots=True)
|
| 10 |
+
class GrafanaDashboard:
|
| 11 |
+
key: str
|
| 12 |
+
title: str
|
| 13 |
+
description: str
|
| 14 |
+
uid: str
|
| 15 |
+
height: int
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def dashboard_catalog(settings: GrafanaSettings) -> list[GrafanaDashboard]:
    """Return the three dashboards the application knows about, in display order.

    Titles, descriptions and heights are fixed; only the dashboard UIDs are
    taken from ``settings``.
    """
    # (key, title, description, uid, height)
    entries = (
        (
            "overview",
            "Matrix overview",
            "Latest outcome per build matrix combo, with fast filters across kernel, backend, CUDA, PyTorch, and Python.",
            settings.overview_dashboard_uid,
            420,
        ),
        (
            "durations",
            "Duration trends",
            "Compilation and upload duration trends, so regressions show up as rising wall time instead of surprise failures.",
            settings.duration_dashboard_uid,
            460,
        ),
        (
            "failures",
            "Failure overview",
            "Current failing combinations and stale metrics signals, tuned for alert-driven triage instead of log scraping.",
            settings.failure_dashboard_uid,
            420,
        ),
    )
    return [
        GrafanaDashboard(key=key, title=title, description=description, uid=uid, height=height)
        for key, title, description, uid, height in entries
    ]
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def build_dashboard_url(
    settings: GrafanaSettings,
    uid: str,
    *,
    embed: bool,
) -> str:
    """Build the URL of a Grafana dashboard from the configured defaults.

    Args:
        settings: Grafana settings supplying the base URL, organisation id,
            time range, theme and refresh interval.
        uid: UID of the dashboard to link to.
        embed: When true, ``kiosk=tv`` is appended to the query string.

    Returns:
        The dashboard URL, or an empty string when no base URL is configured.
    """
    root = (settings.base_url or "").rstrip("/")
    if root == "":
        return ""

    params: list[tuple[str, object]] = [
        ("orgId", settings.org_id),
        ("from", settings.default_from),
        ("to", settings.default_to),
        ("theme", settings.theme),
        ("refresh", settings.default_refresh),
    ]
    if embed:
        params.append(("kiosk", "tv"))

    return f"{root}/d/{uid}/_?{urlencode(params)}"
|
| 65 |
+
|
src/kc_monitor/kernel_index.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import re
|
| 4 |
+
import tomllib
|
| 5 |
+
|
| 6 |
+
from cachetools import TTLCache
|
| 7 |
+
|
| 8 |
+
from kc_monitor.github_client import GitHubActionsClient, GitHubActionsError
|
| 9 |
+
from kc_monitor.models import GitHubRun, KernelInfo
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
# Captures a leading "<name>:" prefix, where <name> consists of letters,
# digits, underscores or hyphens and may be surrounded by whitespace.
PR_TITLE_RE = re.compile(r"^\s*([A-Za-z0-9_-]+)\s*:")
# Captures <name> in "Manual Kernel Build / <name> /" anywhere in the text,
# ignoring case.
MANUAL_BUILD_RE = re.compile(r"Manual Kernel Build\s*/\s*([A-Za-z0-9_-]+)\s*/", re.IGNORECASE)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class KernelIndex:
    """Look up kernel metadata in the repository, with time-limited caching."""

    def __init__(
        self,
        client: GitHubActionsClient,
        branch: str = "main",
        cache_ttl_seconds: int = 900,
    ) -> None:
        """Store the client and branch and create the two TTL caches.

        Args:
            client: GitHub client used to read files and list the tree.
            branch: Git ref the metadata is read from.
            cache_ttl_seconds: Lifetime of cached entries, for both the
                per-kernel cache and the catalog cache.
        """
        self.client = client
        self.branch = branch
        # Per-kernel metadata, keyed by kernel name.
        self._cache: TTLCache[str, KernelInfo] = TTLCache(maxsize=256, ttl=cache_ttl_seconds)
        # Single-entry cache holding the whole catalog under the key "catalog".
        self._catalog_cache: TTLCache[str, list[KernelInfo]] = TTLCache(maxsize=1, ttl=cache_ttl_seconds)

    @staticmethod
    def infer_kernel_name(run: GitHubRun) -> str | None:
        """Guess the kernel a run belongs to from its title or name.

        The display title is examined before the run name. In each, a
        leading "<kernel>:" prefix takes precedence over a
        "Manual Kernel Build / <kernel> /" fragment. Returns None when
        neither text matches.
        """
        for text in (run.display_title, run.name):
            if not text:
                continue
            found = PR_TITLE_RE.match(text) or MANUAL_BUILD_RE.search(text)
            if found:
                return found.group(1)
        return None

    @staticmethod
    def _fallback_kernel_info(kernel_name: str) -> KernelInfo:
        """Build minimal metadata that assumes the ``kernels-community/<name>`` hub repository."""
        repo_id = f"kernels-community/{kernel_name}"
        return KernelInfo(
            kernel_name=kernel_name,
            repo_id=repo_id,
            hub_url=f"https://huggingface.co/{repo_id}",
        )

    def get_kernel_info(self, kernel_name: str) -> KernelInfo:
        """Return metadata for ``kernel_name``, reading ``<name>/build.toml`` on a cache miss.

        Whatever is resolved, including the fallback used when the file
        cannot be fetched, is empty, or is not valid TOML, is cached for
        the configured TTL.
        """
        if kernel_name in self._cache:
            return self._cache[kernel_name]
        resolved = self._load_kernel_info(kernel_name)
        self._cache[kernel_name] = resolved
        return resolved

    def _load_kernel_info(self, kernel_name: str) -> KernelInfo:
        """Read and parse the kernel's ``build.toml``; fall back to defaults on failure."""
        fallback = self._fallback_kernel_info(kernel_name)
        try:
            content = self.client.get_file_text(f"{kernel_name}/build.toml", ref=self.branch)
        except GitHubActionsError:
            return fallback
        if not content:
            return fallback

        try:
            data = tomllib.loads(content)
        except tomllib.TOMLDecodeError:
            return fallback

        general = data.get("general") or {}
        hub = general.get("hub") or {}
        # Prefer the repo id declared in the file over the assumed default.
        repo_id = hub.get("repo-id") or fallback.repo_id
        return KernelInfo(
            kernel_name=general.get("name") or kernel_name,
            repo_id=repo_id,
            hub_url=f"https://huggingface.co/{repo_id}",
            version=general.get("version"),
            backends=list(general.get("backends") or []),
        )

    def list_kernel_catalog(self) -> list[KernelInfo]:
        """Return one entry per top-level directory that contains a ``build.toml``.

        Entries are sorted by directory name. Metadata already present in
        the per-kernel cache is reused; other kernels get fallback metadata
        without fetching their ``build.toml``. The result is cached; when
        the tree listing fails, an empty list is returned and nothing is
        cached.
        """
        if "catalog" in self._catalog_cache:
            return self._catalog_cache["catalog"]

        try:
            paths = self.client.list_repo_tree_paths(ref=self.branch)
        except GitHubActionsError:
            return []

        directories: set[str] = set()
        for path in paths:
            # Exactly one slash: "<dir>/build.toml" directly below the root.
            if path.count("/") == 1 and path.endswith("/build.toml"):
                directories.add(path.split("/", 1)[0])

        catalog: list[KernelInfo] = []
        for name in sorted(directories):
            catalog.append(self._cache.get(name) or self._fallback_kernel_info(name))
        self._catalog_cache["catalog"] = catalog
        return catalog
|
src/kc_monitor/log_parser.py
ADDED
|
@@ -0,0 +1,216 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import re
|
| 4 |
+
|
| 5 |
+
from kc_monitor.models import (
|
| 6 |
+
FAILING_CONCLUSIONS,
|
| 7 |
+
GitHubJob,
|
| 8 |
+
GitHubJobStep,
|
| 9 |
+
GitHubRun,
|
| 10 |
+
ParsedJobState,
|
| 11 |
+
ParsedLogEvent,
|
| 12 |
+
parse_github_datetime,
|
| 13 |
+
)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
# Human-readable label for each job phase identifier.
PHASE_LABELS = {
    "queued": "Queued",
    "setup": "Setup",
    "validating": "Validating",
    "building": "Building",
    "uploading": "Uploading",
    "upload_complete": "Upload complete",
    "testing": "Testing",
    "completed": "Completed",
    "failed": "Failed",
    "cancelled": "Cancelled",
    "stalled": "Stalled",
}

# Human-readable label for each upload status identifier.
UPLOAD_LABELS = {
    "not_started": "Not started",
    "running": "Running",
    "completed": "Completed",
    "failed": "Failed",
    "skipped": "Skipped",
}

# Ordered (pattern, phase) rules for step names; consumers take the first
# match, so "Build and upload kernel" resolves to "building" rather than
# "uploading".
STEP_PHASE_RULES: list[tuple[re.Pattern[str], str]] = [
    (re.compile(r"Set up job|checkout|nix-installer|Nix info|cachix", re.IGNORECASE), "setup"),
    (re.compile(r"Validate kernel directory", re.IGNORECASE), "validating"),
    (re.compile(r"Build and upload kernel|Build kernel|Build and copy kernel", re.IGNORECASE), "building"),
    (re.compile(r"Upload kernel|Upload v1 kernels to main|Upload ci-test closure", re.IGNORECASE), "uploading"),
    (re.compile(r"Run GPU tests", re.IGNORECASE), "testing"),
]

# UTC timestamp with a trailing "Z" and optional fractional seconds.
TIMESTAMP_RE = re.compile(r"(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?Z)")
# "<owner>/<name>" argument following a --repo-id flag, optionally quoted.
REPO_ID_RE = re.compile(r"--repo-id(?:=|\s+)?\"?([A-Za-z0-9._-]+/[A-Za-z0-9._-]+)\"?")
# Log markers that show an upload command has started.
UPLOAD_START_RE = re.compile(
    r"(kernels\s+--\s+upload|upload\s+--repo-id|Uploading\s+[A-Za-z0-9._-]+/[A-Za-z0-9._-]+)",
    re.IGNORECASE,
)
# Log markers that show an upload finished successfully.
UPLOAD_SUCCESS_RE = re.compile(
    r"(Upload finished|Upload complete|Committed|commit created|pushed to hub)",
    re.IGNORECASE,
)
# Log markers treated as errors.
ERROR_RE = re.compile(r"(error:|Process completed with exit code|Traceback|FAILED|fatal:)", re.IGNORECASE)
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def classify_step_name(step_name: str | None) -> str | None:
|
| 54 |
+
if not step_name:
|
| 55 |
+
return None
|
| 56 |
+
for pattern, phase in STEP_PHASE_RULES:
|
| 57 |
+
if pattern.search(step_name):
|
| 58 |
+
return phase
|
| 59 |
+
return None
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def _interesting_category(line: str) -> str | None:
    """Classify a log line as "error", "upload", "build" or "validation".

    Checks run in that order and the first hit wins; ``None`` means the line
    is not interesting. Because the upload check looks for the substring
    "upload", a line containing "build-and-upload" is reported as "upload".
    """
    lowered = line.lower()
    if ERROR_RE.search(line):
        return "error"
    if "upload" in lowered or UPLOAD_START_RE.search(line):
        return "upload"
    # The two hyphenated markers are matched case-sensitively.
    has_build_marker = "build-and-upload" in line or "build-and-copy" in line
    if has_build_marker or "nix build" in lowered:
        return "build"
    if "validate kernel directory" in lowered:
        return "validation"
    return None
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
class JobLogParser:
    """Combine GitHub job/step metadata with raw log text into a ``ParsedJobState``."""

    def parse(
        self,
        run: GitHubRun,
        job: GitHubJob,
        log_text: str | None,
        event_limit: int = 20,
    ) -> ParsedJobState:
        """Parse one job.

        Args:
            run: The workflow run the job belongs to (not read by the parser itself).
            job: The job whose status, conclusion and steps drive the phase.
            log_text: Raw job log, or ``None``/empty when no log is available.
            event_limit: Maximum number of trailing log events to keep.

        Returns:
            The derived phase, upload status, repo id, timestamps, events and,
            for failing conclusions only, a failure excerpt.
        """
        lines = log_text.splitlines() if log_text else []
        latest_log_at = self._latest_log_timestamp(lines)
        repo_id = self._extract_repo_id(lines)
        events = self._extract_events(lines, limit=event_limit)
        failure_excerpt = self._failure_excerpt(lines) if (job.conclusion or "") in FAILING_CONCLUSIONS else None

        # Fall back to the last listed step when no step is still running.
        active_step = job.active_step or job.last_step
        step_phase = classify_step_name(active_step.name if active_step else None)
        upload_status = self._upload_status(job, lines)
        phase, reason = self._phase_for_job(job, step_phase, upload_status, lines, active_step)

        return ParsedJobState(
            phase=phase,
            phase_label=PHASE_LABELS.get(phase, phase.title()),
            phase_reason=reason,
            upload_status=upload_status,
            upload_status_label=UPLOAD_LABELS[upload_status],
            repo_id=repo_id,
            latest_log_at=latest_log_at,
            active_step_name=active_step.name if active_step else None,
            active_step_started_at=active_step.started_at if active_step else None,
            events=events,
            failure_excerpt=failure_excerpt,
        )

    def _phase_for_job(
        self,
        job: GitHubJob,
        step_phase: str | None,
        upload_status: str,
        lines: list[str],
        active_step: GitHubJobStep | None,
    ) -> tuple[str, str]:
        """Return ``(phase, reason)`` for the job.

        For unfinished jobs, upload evidence takes priority over the step-name
        phase. For finished jobs the conclusion decides the phase.
        """
        upload_started = any(UPLOAD_START_RE.search(line) for line in lines)
        combined_step = any("Build and upload kernel" in step.name for step in job.steps)

        if job.status != "completed":
            if upload_started or upload_status == "running":
                return "uploading", "Upload command detected in the active job log."
            if step_phase:
                # step_phase is only set when active_step exists.
                return step_phase, f"Current GitHub Actions step: {active_step.name}."
            return "queued", "Job is queued or still waiting for the first step to start."

        conclusion = job.conclusion or "completed"
        if conclusion == "success":
            if upload_status == "completed" or (combined_step and upload_started):
                return "upload_complete", "Build finished and upload markers were detected."
            return "completed", "Job completed successfully."

        if conclusion == "cancelled":
            return "cancelled", "GitHub marked the job as cancelled."

        if upload_status == "failed":
            return "failed", "Job failed after upload started or inside an upload step."

        return "failed", "GitHub marked the job as failed."

    def _upload_status(self, job: GitHubJob, lines: list[str]) -> str:
        """Return one of the ``UPLOAD_LABELS`` keys describing the upload.

        Dedicated upload steps are authoritative when present; otherwise the
        status is inferred from log markers and the job conclusion.
        """
        upload_steps = [step for step in job.steps if classify_step_name(step.name) == "uploading"]
        if any(step.is_running for step in upload_steps):
            return "running"
        if any((step.conclusion or "") == "success" for step in upload_steps):
            return "completed"
        if any((step.conclusion or "") in FAILING_CONCLUSIONS for step in upload_steps):
            return "failed"
        if upload_steps and all((step.conclusion or "") == "skipped" for step in upload_steps):
            return "skipped"

        upload_started = any(UPLOAD_START_RE.search(line) for line in lines)
        if job.status != "completed":
            return "running" if upload_started else "not_started"

        conclusion = job.conclusion or ""
        upload_succeeded = any(UPLOAD_SUCCESS_RE.search(line) for line in lines)
        combined_step_success = any(
            "Build and upload kernel" in step.name and (step.conclusion or "") == "success"
            for step in job.steps
        )

        if upload_succeeded or combined_step_success:
            return "completed"
        if upload_started:
            # A started upload in a failing job is a failed upload. This must be
            # decided before treating "started" as success, otherwise the
            # failure case can never be reached.
            return "failed" if conclusion in FAILING_CONCLUSIONS else "completed"
        if conclusion == "cancelled":
            return "skipped"
        return "not_started"

    @staticmethod
    def _line_timestamp(line: str) -> None | object:
        """Return the parsed timestamp embedded in *line*, or ``None``.

        A string can match ``TIMESTAMP_RE`` and still be an invalid date
        (e.g. month 13); such lines are treated as carrying no timestamp.
        """
        match = TIMESTAMP_RE.search(line)
        if not match:
            return None
        try:
            return parse_github_datetime(match.group(1))
        except ValueError:
            return None

    def _latest_log_timestamp(self, lines: list[str]) -> None | object:
        """Return the greatest timestamp found in any log line, or ``None``."""
        timestamps = [stamp for stamp in map(self._line_timestamp, lines) if stamp]
        return max(timestamps) if timestamps else None

    def _extract_repo_id(self, lines: list[str]) -> str | None:
        """Return the first ``--repo-id`` value found in the log, or ``None``."""
        for line in lines:
            match = REPO_ID_RE.search(line)
            if match:
                return match.group(1)
        return None

    def _extract_events(self, lines: list[str], limit: int) -> list[ParsedLogEvent]:
        """Return the last *limit* interesting log lines as events.

        Line numbers are 1-based. A non-positive *limit* yields no events.
        """
        if limit <= 0:
            # events[-0:] would return the whole list rather than nothing.
            return []

        events: list[ParsedLogEvent] = []
        for index, line in enumerate(lines, start=1):
            category = _interesting_category(line)
            if not category:
                continue
            events.append(
                ParsedLogEvent(
                    category=category,
                    message=line.strip(),
                    line_number=index,
                    timestamp=self._line_timestamp(line),
                )
            )
        return events[-limit:]

    def _failure_excerpt(self, lines: list[str]) -> str | None:
        """Return a short excerpt explaining a failure.

        Prefers the last 8 lines matching ``ERROR_RE``; otherwise falls back
        to the last 10 non-empty lines. Returns ``None`` for an empty log.
        """
        if not lines:
            return None

        non_empty = [line.strip() for line in lines if line.strip()]
        failure_lines = [line for line in non_empty if ERROR_RE.search(line)]
        if failure_lines:
            return "\n".join(failure_lines[-8:])
        if not non_empty:
            return None
        return "\n".join(non_empty[-10:])
|
src/kc_monitor/metrics_push.py
ADDED
|
@@ -0,0 +1,190 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from dataclasses import dataclass
|
| 4 |
+
from datetime import datetime, timezone
|
| 5 |
+
import time
|
| 6 |
+
from typing import Mapping
|
| 7 |
+
from urllib.parse import quote
|
| 8 |
+
|
| 9 |
+
import httpx
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
# Grouping-key labels, in the order they are appended to the Pushgateway URL path.
GROUPING_LABEL_ORDER = (
    "kernel",
    "backend",
    "compute_backend",
    "cuda_version",
    "pytorch_version",
    "python_version",
)

# Labels attached to every emitted metric sample, in output order.
METRIC_LABEL_ORDER = (
    "repository",
    "workflow",
    "branch",
    "job",
    "runner_os",
    "runner_arch",
)

# Numeric result code per job status: 0 = success, 1 = not run / neutral,
# 2 = failure. Statuses missing from this table are mapped elsewhere.
RESULT_CODE_BY_STATUS = {
    "success": 0,
    "cancelled": 1,
    "skipped": 1,
    "neutral": 1,
    "failure": 2,
    "timed_out": 2,
    "startup_failure": 2,
    "action_required": 2,
}
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def _coalesce(value: str | None, default: str = "unknown") -> str:
|
| 43 |
+
cleaned = (value or "").strip()
|
| 44 |
+
return cleaned or default
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def _escape_label_value(value: str) -> str:
|
| 48 |
+
return value.replace("\\", "\\\\").replace("\n", "\\n").replace('"', '\\"')
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def _parse_unix_or_iso(value: str) -> float:
|
| 52 |
+
raw = value.strip()
|
| 53 |
+
try:
|
| 54 |
+
return float(raw)
|
| 55 |
+
except ValueError:
|
| 56 |
+
normalized = raw.replace("Z", "+00:00")
|
| 57 |
+
return datetime.fromisoformat(normalized).astimezone(timezone.utc).timestamp()
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def resolve_duration_seconds(env: Mapping[str, str], completed_at_seconds: float) -> float:
    """Work out the build duration in seconds from environment values.

    A non-empty ``KCM_BUILD_DURATION_SECONDS`` wins. Otherwise the duration is
    *completed_at_seconds* minus ``KCM_JOB_STARTED_AT`` (Unix seconds or ISO).
    The result is never negative, and is ``0.0`` when neither variable is set.
    """
    if explicit := env.get("KCM_BUILD_DURATION_SECONDS"):
        return max(float(explicit), 0.0)

    if started := env.get("KCM_JOB_STARTED_AT"):
        elapsed = completed_at_seconds - _parse_unix_or_iso(started)
        return max(elapsed, 0.0)

    return 0.0
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def result_code_for_status(status: str) -> int:
    """Return the numeric result code for *status*, or 3 when the status is unrecognised.

    The lookup ignores surrounding whitespace and letter case.
    """
    normalized = status.strip().lower()
    try:
        return RESULT_CODE_BY_STATUS[normalized]
    except KeyError:
        return 3
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
@dataclass(frozen=True, slots=True)
class BuildMetricSample:
    """Immutable snapshot of one finished build, ready to be pushed as metrics."""

    grouping_key: dict[str, str]  # labels identifying the Pushgateway group
    metric_labels: dict[str, str]  # labels attached to each metric sample
    duration_seconds: float
    completed_at_seconds: int
    result_code: int
    failed: int  # 1 when result_code == 2, otherwise 0
    result: str  # lower-cased job status text

    @classmethod
    def from_env(
        cls,
        env: Mapping[str, str],
        *,
        completed_at_seconds: int | None = None,
    ) -> "BuildMetricSample":
        """Build a sample from CI environment variables.

        Args:
            env: Mapping of environment variables (``KCM_*``, ``GITHUB_*``,
                ``RUNNER_*``, ``JOB_STATUS``). Missing or blank values become
                ``"unknown"``.
            completed_at_seconds: Completion time as Unix seconds. Only
                ``None`` falls back to the current time, so an explicit ``0``
                is kept.
        """
        # Compare against None instead of using truthiness, so 0 is not
        # silently replaced by "now".
        completed_at = int(time.time()) if completed_at_seconds is None else completed_at_seconds
        result = _coalesce(env.get("KCM_JOB_STATUS") or env.get("JOB_STATUS")).lower()
        result_code = result_code_for_status(result)

        grouping_key = {
            "kernel": _coalesce(env.get("KCM_KERNEL")),
            "backend": _coalesce(env.get("KCM_BACKEND")),
            "compute_backend": _coalesce(env.get("KCM_COMPUTE_BACKEND")),
            "cuda_version": _coalesce(env.get("KCM_CUDA_VERSION")),
            "pytorch_version": _coalesce(env.get("KCM_PYTORCH_VERSION")),
            "python_version": _coalesce(env.get("KCM_PYTHON_VERSION")),
        }
        metric_labels = {
            "repository": _coalesce(env.get("GITHUB_REPOSITORY")),
            "workflow": _coalesce(env.get("GITHUB_WORKFLOW")),
            # First non-empty ref variable wins.
            "branch": _coalesce(
                env.get("GITHUB_REF_NAME")
                or env.get("GITHUB_HEAD_REF")
                or env.get("GITHUB_REF")
            ),
            "job": _coalesce(env.get("GITHUB_JOB")),
            "runner_os": _coalesce(env.get("RUNNER_OS")),
            "runner_arch": _coalesce(env.get("RUNNER_ARCH")),
        }
        return cls(
            grouping_key=grouping_key,
            metric_labels=metric_labels,
            duration_seconds=resolve_duration_seconds(env, completed_at),
            completed_at_seconds=completed_at,
            result_code=result_code,
            failed=1 if result_code == 2 else 0,
            result=result,
        )
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
def build_pushgateway_url(base_url: str, job_name: str, grouping_key: Mapping[str, str]) -> str:
    """Build the Pushgateway URL ``<base>/metrics/job/<job>{/<label>/<value>}``.

    Labels are appended in ``GROUPING_LABEL_ORDER``. Ordinary values are
    percent-encoded. A value that is empty or contains "/" cannot be carried
    as a plain path segment, so it is sent in Pushgateway's
    ``<name>@base64/<urlsafe-base64>`` form instead. The same applies to a
    job name containing "/".

    Raises:
        KeyError: If *grouping_key* lacks one of the expected labels.
    """
    import base64  # local import: only this function needs it

    def _segments(name: str, value: str) -> tuple[str, str]:
        """Return the (name, value) path segments for one job or label entry."""
        if value == "" or "/" in value:
            encoded = base64.urlsafe_b64encode(value.encode("utf-8")).decode("ascii").rstrip("=")
            # Padding is optional, but an empty value must be sent as a single "=".
            return f"{quote(name, safe='')}@base64", encoded or "="
        return quote(name, safe=""), quote(value, safe="")

    if job_name:
        job_segments = _segments("job", job_name)
    else:
        # Keep the historical output for an empty job name.
        job_segments = ("job", "")

    path = [base_url.rstrip("/"), "metrics", *job_segments]
    for label in GROUPING_LABEL_ORDER:
        path.extend(_segments(label, grouping_key[label]))
    return "/".join(path)
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
def format_prometheus_metrics(sample: BuildMetricSample) -> str:
    """Render *sample* as text-format gauges, one ``# TYPE`` line per metric.

    All five gauges share the labels in ``METRIC_LABEL_ORDER``; the ``_info``
    gauge additionally carries the ``result`` label and always has value 1.
    The returned text ends with a newline.
    """
    rendered_labels = [
        f'{key}="{_escape_label_value(sample.metric_labels[key])}"'
        for key in METRIC_LABEL_ORDER
    ]
    label_blob = ",".join(rendered_labels)
    info_labels = f'{label_blob},result="{_escape_label_value(sample.result)}"'

    output = [
        "# TYPE kc_build_last_run_result_code gauge",
        f"kc_build_last_run_result_code{{{label_blob}}} {sample.result_code}",
        "# TYPE kc_build_last_run_failed gauge",
        f"kc_build_last_run_failed{{{label_blob}}} {sample.failed}",
        "# TYPE kc_build_last_run_duration_seconds gauge",
        f"kc_build_last_run_duration_seconds{{{label_blob}}} {sample.duration_seconds:.3f}",
        "# TYPE kc_build_last_run_timestamp_seconds gauge",
        f"kc_build_last_run_timestamp_seconds{{{label_blob}}} {sample.completed_at_seconds}",
        "# TYPE kc_build_last_run_info gauge",
        f"kc_build_last_run_info{{{info_labels}}} 1",
    ]
    output.append("")  # yields the trailing newline after join
    return "\n".join(output)
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
def push_build_metrics(
    sample: BuildMetricSample,
    *,
    pushgateway_url: str,
    job_name: str,
    timeout_seconds: float = 10.0,
    max_attempts: int = 3,
) -> str:
    """PUT the sample's metrics to the Pushgateway, retrying on HTTP errors.

    Args:
        sample: Metrics to push.
        pushgateway_url: Base URL of the Pushgateway.
        job_name: Pushgateway job name used in the URL.
        timeout_seconds: Per-request timeout.
        max_attempts: Total number of tries. Values below 1 are treated as 1
            so the function never reports success without sending anything.

    Returns:
        The URL the metrics were pushed to.

    Raises:
        httpx.HTTPError: The error from the final attempt when all attempts fail.
    """
    url = build_pushgateway_url(pushgateway_url, job_name, sample.grouping_key)
    payload = format_prometheus_metrics(sample)
    headers = {"Content-Type": "text/plain; version=0.0.4; charset=utf-8"}

    attempts = max(1, max_attempts)
    last_error: httpx.HTTPError | None = None
    with httpx.Client(timeout=timeout_seconds) as client:
        for attempt in range(1, attempts + 1):
            try:
                response = client.put(url, content=payload.encode("utf-8"), headers=headers)
                response.raise_for_status()
                return url
            except httpx.HTTPError as exc:
                last_error = exc
                if attempt == attempts:
                    break
                # Linear back-off: 0.5s, 1.0s, ... between attempts.
                time.sleep(0.5 * attempt)

    if last_error is not None:
        raise last_error
    return url
|
| 190 |
+
|
src/kc_monitor/models.py
ADDED
|
@@ -0,0 +1,342 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from dataclasses import dataclass, field
|
| 4 |
+
from datetime import datetime, timezone
|
| 5 |
+
from typing import Any
|
| 6 |
+
|
| 7 |
+
from dateutil import parser as date_parser
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
# Job/step conclusions treated as failures throughout the monitor.
# Note that "cancelled" is counted as failing here.
FAILING_CONCLUSIONS = {"failure", "timed_out", "cancelled", "startup_failure"}
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def utcnow() -> datetime:
    """Return the current time as a timezone-aware UTC datetime."""
    return datetime.now(tz=timezone.utc)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def parse_github_datetime(value: str | None) -> datetime | None:
|
| 18 |
+
if not value:
|
| 19 |
+
return None
|
| 20 |
+
return date_parser.isoparse(value).astimezone(timezone.utc)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
@dataclass(slots=True)
|
| 24 |
+
class WorkflowTarget:
|
| 25 |
+
path: str
|
| 26 |
+
label: str
|
| 27 |
+
enabled: bool = True
|
| 28 |
+
|
| 29 |
+
@property
|
| 30 |
+
def basename(self) -> str:
|
| 31 |
+
return self.path.rsplit("/", 1)[-1]
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
@dataclass(slots=True)
|
| 35 |
+
class GitHubRun:
|
| 36 |
+
id: int
|
| 37 |
+
name: str
|
| 38 |
+
display_title: str
|
| 39 |
+
path: str
|
| 40 |
+
status: str
|
| 41 |
+
conclusion: str | None
|
| 42 |
+
head_branch: str
|
| 43 |
+
head_sha: str
|
| 44 |
+
event: str
|
| 45 |
+
html_url: str
|
| 46 |
+
jobs_url: str
|
| 47 |
+
created_at: datetime
|
| 48 |
+
updated_at: datetime
|
| 49 |
+
run_started_at: datetime | None
|
| 50 |
+
actor_login: str | None = None
|
| 51 |
+
raw: dict[str, Any] = field(default_factory=dict)
|
| 52 |
+
|
| 53 |
+
@classmethod
|
| 54 |
+
def from_api(cls, payload: dict[str, Any]) -> "GitHubRun":
|
| 55 |
+
actor = payload.get("actor") or {}
|
| 56 |
+
return cls(
|
| 57 |
+
id=payload["id"],
|
| 58 |
+
name=payload.get("name") or "",
|
| 59 |
+
display_title=payload.get("display_title") or payload.get("name") or "",
|
| 60 |
+
path=payload.get("path") or "",
|
| 61 |
+
status=payload.get("status") or "unknown",
|
| 62 |
+
conclusion=payload.get("conclusion"),
|
| 63 |
+
head_branch=payload.get("head_branch") or "",
|
| 64 |
+
head_sha=payload.get("head_sha") or "",
|
| 65 |
+
event=payload.get("event") or "",
|
| 66 |
+
html_url=payload.get("html_url") or "",
|
| 67 |
+
jobs_url=payload.get("jobs_url") or "",
|
| 68 |
+
created_at=parse_github_datetime(payload.get("created_at")) or utcnow(),
|
| 69 |
+
updated_at=parse_github_datetime(payload.get("updated_at")) or utcnow(),
|
| 70 |
+
run_started_at=parse_github_datetime(payload.get("run_started_at")),
|
| 71 |
+
actor_login=actor.get("login"),
|
| 72 |
+
raw=payload,
|
| 73 |
+
)
|
| 74 |
+
|
| 75 |
+
@property
|
| 76 |
+
def is_active(self) -> bool:
|
| 77 |
+
return self.status != "completed"
|
| 78 |
+
|
| 79 |
+
@property
|
| 80 |
+
def sort_time(self) -> datetime:
|
| 81 |
+
return self.run_started_at or self.created_at
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
@dataclass(slots=True)
|
| 85 |
+
class GitHubJobStep:
|
| 86 |
+
name: str
|
| 87 |
+
status: str
|
| 88 |
+
conclusion: str | None
|
| 89 |
+
number: int
|
| 90 |
+
started_at: datetime | None
|
| 91 |
+
completed_at: datetime | None
|
| 92 |
+
|
| 93 |
+
@classmethod
|
| 94 |
+
def from_api(cls, payload: dict[str, Any]) -> "GitHubJobStep":
|
| 95 |
+
return cls(
|
| 96 |
+
name=payload.get("name") or "",
|
| 97 |
+
status=payload.get("status") or "unknown",
|
| 98 |
+
conclusion=payload.get("conclusion"),
|
| 99 |
+
number=payload.get("number") or 0,
|
| 100 |
+
started_at=parse_github_datetime(payload.get("started_at")),
|
| 101 |
+
completed_at=parse_github_datetime(payload.get("completed_at")),
|
| 102 |
+
)
|
| 103 |
+
|
| 104 |
+
@property
|
| 105 |
+
def is_running(self) -> bool:
|
| 106 |
+
return self.status != "completed"
|
| 107 |
+
|
| 108 |
+
@property
|
| 109 |
+
def is_failed(self) -> bool:
|
| 110 |
+
return (self.conclusion or "") in FAILING_CONCLUSIONS
|
| 111 |
+
|
| 112 |
+
@property
|
| 113 |
+
def duration_seconds(self) -> float | None:
|
| 114 |
+
if not self.started_at:
|
| 115 |
+
return None
|
| 116 |
+
end = self.completed_at or utcnow()
|
| 117 |
+
return max((end - self.started_at).total_seconds(), 0.0)
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
@dataclass(slots=True)
|
| 121 |
+
class GitHubJob:
|
| 122 |
+
id: int
|
| 123 |
+
run_id: int
|
| 124 |
+
workflow_name: str
|
| 125 |
+
head_branch: str
|
| 126 |
+
run_url: str
|
| 127 |
+
run_attempt: int
|
| 128 |
+
head_sha: str
|
| 129 |
+
url: str
|
| 130 |
+
html_url: str
|
| 131 |
+
status: str
|
| 132 |
+
conclusion: str | None
|
| 133 |
+
created_at: datetime
|
| 134 |
+
started_at: datetime | None
|
| 135 |
+
completed_at: datetime | None
|
| 136 |
+
name: str
|
| 137 |
+
steps: list[GitHubJobStep]
|
| 138 |
+
runner_name: str | None = None
|
| 139 |
+
runner_group_name: str | None = None
|
| 140 |
+
|
| 141 |
+
@classmethod
|
| 142 |
+
def from_api(cls, payload: dict[str, Any]) -> "GitHubJob":
|
| 143 |
+
steps = [GitHubJobStep.from_api(item) for item in payload.get("steps") or []]
|
| 144 |
+
return cls(
|
| 145 |
+
id=payload["id"],
|
| 146 |
+
run_id=payload.get("run_id") or 0,
|
| 147 |
+
workflow_name=payload.get("workflow_name") or "",
|
| 148 |
+
head_branch=payload.get("head_branch") or "",
|
| 149 |
+
run_url=payload.get("run_url") or "",
|
| 150 |
+
run_attempt=payload.get("run_attempt") or 1,
|
| 151 |
+
head_sha=payload.get("head_sha") or "",
|
| 152 |
+
url=payload.get("url") or "",
|
| 153 |
+
html_url=payload.get("html_url") or "",
|
| 154 |
+
status=payload.get("status") or "unknown",
|
| 155 |
+
conclusion=payload.get("conclusion"),
|
| 156 |
+
created_at=parse_github_datetime(payload.get("created_at")) or utcnow(),
|
| 157 |
+
started_at=parse_github_datetime(payload.get("started_at")),
|
| 158 |
+
completed_at=parse_github_datetime(payload.get("completed_at")),
|
| 159 |
+
name=payload.get("name") or "",
|
| 160 |
+
steps=steps,
|
| 161 |
+
runner_name=payload.get("runner_name"),
|
| 162 |
+
runner_group_name=payload.get("runner_group_name"),
|
| 163 |
+
)
|
| 164 |
+
|
| 165 |
+
@property
|
| 166 |
+
def is_active(self) -> bool:
|
| 167 |
+
return self.status != "completed"
|
| 168 |
+
|
| 169 |
+
@property
|
| 170 |
+
def active_step(self) -> GitHubJobStep | None:
|
| 171 |
+
for step in self.steps:
|
| 172 |
+
if step.is_running:
|
| 173 |
+
return step
|
| 174 |
+
return None
|
| 175 |
+
|
| 176 |
+
@property
|
| 177 |
+
def last_step(self) -> GitHubJobStep | None:
|
| 178 |
+
return self.steps[-1] if self.steps else None
|
| 179 |
+
|
| 180 |
+
@property
|
| 181 |
+
def duration_seconds(self) -> float | None:
|
| 182 |
+
if not self.started_at:
|
| 183 |
+
return None
|
| 184 |
+
end = self.completed_at or utcnow()
|
| 185 |
+
return max((end - self.started_at).total_seconds(), 0.0)
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
@dataclass(slots=True)
|
| 189 |
+
class KernelInfo:
|
| 190 |
+
kernel_name: str
|
| 191 |
+
repo_id: str
|
| 192 |
+
hub_url: str
|
| 193 |
+
version: int | None = None
|
| 194 |
+
backends: list[str] = field(default_factory=list)
|
| 195 |
+
|
| 196 |
+
|
| 197 |
+
@dataclass(slots=True)
|
| 198 |
+
class ParsedLogEvent:
|
| 199 |
+
category: str
|
| 200 |
+
message: str
|
| 201 |
+
line_number: int
|
| 202 |
+
timestamp: datetime | None = None
|
| 203 |
+
|
| 204 |
+
|
| 205 |
+
@dataclass(slots=True)
|
| 206 |
+
class ParsedJobState:
|
| 207 |
+
phase: str
|
| 208 |
+
phase_label: str
|
| 209 |
+
phase_reason: str
|
| 210 |
+
upload_status: str
|
| 211 |
+
upload_status_label: str
|
| 212 |
+
repo_id: str | None
|
| 213 |
+
latest_log_at: datetime | None
|
| 214 |
+
active_step_name: str | None
|
| 215 |
+
active_step_started_at: datetime | None
|
| 216 |
+
events: list[ParsedLogEvent] = field(default_factory=list)
|
| 217 |
+
failure_excerpt: str | None = None
|
| 218 |
+
|
| 219 |
+
|
| 220 |
+
@dataclass(slots=True)
|
| 221 |
+
class MonitorRecord:
|
| 222 |
+
key: str
|
| 223 |
+
kernel_name: str
|
| 224 |
+
critical: bool
|
| 225 |
+
kernel_info: KernelInfo
|
| 226 |
+
workflow_name: str
|
| 227 |
+
workflow_path: str
|
| 228 |
+
run: GitHubRun
|
| 229 |
+
job: GitHubJob
|
| 230 |
+
phase: str
|
| 231 |
+
phase_label: str
|
| 232 |
+
phase_reason: str
|
| 233 |
+
upload_status: str
|
| 234 |
+
upload_status_label: str
|
| 235 |
+
arch: str
|
| 236 |
+
runner_group: str | None
|
| 237 |
+
suspected_stalled: bool
|
| 238 |
+
stall_reason: str | None
|
| 239 |
+
latest_signal_at: datetime | None
|
| 240 |
+
events: list[ParsedLogEvent] = field(default_factory=list)
|
| 241 |
+
failure_excerpt: str | None = None
|
| 242 |
+
active_step_name: str | None = None
|
| 243 |
+
active_step_started_at: datetime | None = None
|
| 244 |
+
|
| 245 |
+
@property
|
| 246 |
+
def is_active(self) -> bool:
|
| 247 |
+
return self.job.is_active
|
| 248 |
+
|
| 249 |
+
@property
|
| 250 |
+
def started_at(self) -> datetime | None:
|
| 251 |
+
return self.job.started_at or self.run.run_started_at
|
| 252 |
+
|
| 253 |
+
@property
|
| 254 |
+
def completed_at(self) -> datetime | None:
|
| 255 |
+
return self.job.completed_at
|
| 256 |
+
|
| 257 |
+
@property
|
| 258 |
+
def elapsed_seconds(self) -> float | None:
|
| 259 |
+
start = self.started_at
|
| 260 |
+
if not start:
|
| 261 |
+
return None
|
| 262 |
+
end = self.completed_at or utcnow()
|
| 263 |
+
return max((end - start).total_seconds(), 0.0)
|
| 264 |
+
|
| 265 |
+
|
| 266 |
+
@dataclass(slots=True)
|
| 267 |
+
class KernelRunGroup:
|
| 268 |
+
kernel_name: str
|
| 269 |
+
run: GitHubRun
|
| 270 |
+
workflow_name: str
|
| 271 |
+
records: list[MonitorRecord]
|
| 272 |
+
|
| 273 |
+
@property
|
| 274 |
+
def is_active(self) -> bool:
|
| 275 |
+
return any(record.is_active for record in self.records)
|
| 276 |
+
|
| 277 |
+
@property
|
| 278 |
+
def has_failure(self) -> bool:
|
| 279 |
+
return any((record.job.conclusion or "") in FAILING_CONCLUSIONS for record in self.records)
|
| 280 |
+
|
| 281 |
+
@property
|
| 282 |
+
def has_stall(self) -> bool:
|
| 283 |
+
return any(record.suspected_stalled for record in self.records)
|
| 284 |
+
|
| 285 |
+
@property
|
| 286 |
+
def has_uploading(self) -> bool:
|
| 287 |
+
return any(record.upload_status == "running" for record in self.records)
|
| 288 |
+
|
| 289 |
+
@property
|
| 290 |
+
def triggered_at(self) -> datetime:
|
| 291 |
+
return self.run.run_started_at or self.run.created_at
|
| 292 |
+
|
| 293 |
+
@property
|
| 294 |
+
def latest_update_at(self) -> datetime:
|
| 295 |
+
candidates = [record.latest_signal_at for record in self.records if record.latest_signal_at]
|
| 296 |
+
if candidates:
|
| 297 |
+
return max(candidates)
|
| 298 |
+
return self.run.updated_at
|
| 299 |
+
|
| 300 |
+
|
| 301 |
+
@dataclass(slots=True)
|
| 302 |
+
class KernelRow:
|
| 303 |
+
kernel_name: str
|
| 304 |
+
kernel_info: KernelInfo
|
| 305 |
+
critical: bool
|
| 306 |
+
current_group: KernelRunGroup | None
|
| 307 |
+
recent_groups: list[KernelRunGroup]
|
| 308 |
+
row_status_kind: str
|
| 309 |
+
row_status_label: str
|
| 310 |
+
row_reason: str
|
| 311 |
+
upload_label: str
|
| 312 |
+
last_triggered_at: datetime | None
|
| 313 |
+
|
| 314 |
+
@property
|
| 315 |
+
def primary_group(self) -> KernelRunGroup | None:
|
| 316 |
+
if self.current_group:
|
| 317 |
+
return self.current_group
|
| 318 |
+
return self.recent_groups[0] if self.recent_groups else None
|
| 319 |
+
|
| 320 |
+
@property
|
| 321 |
+
def recent_run_count(self) -> int:
|
| 322 |
+
return len(self.recent_groups)
|
| 323 |
+
|
| 324 |
+
|
| 325 |
+
@dataclass(slots=True)
|
| 326 |
+
class DashboardSummary:
|
| 327 |
+
tracked_kernels: int = 0
|
| 328 |
+
active_builds: int = 0
|
| 329 |
+
uploading_builds: int = 0
|
| 330 |
+
stalled_builds: int = 0
|
| 331 |
+
failed_builds: int = 0
|
| 332 |
+
completed_uploads: int = 0
|
| 333 |
+
|
| 334 |
+
|
| 335 |
+
@dataclass(slots=True)
|
| 336 |
+
class DashboardSnapshot:
|
| 337 |
+
generated_at: datetime
|
| 338 |
+
summary: DashboardSummary
|
| 339 |
+
kernel_rows: list[KernelRow]
|
| 340 |
+
active_records: list[MonitorRecord]
|
| 341 |
+
recent_records: list[MonitorRecord]
|
| 342 |
+
errors: list[str] = field(default_factory=list)
|
src/kc_monitor/service.py
ADDED
|
@@ -0,0 +1,572 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from collections import defaultdict
|
| 4 |
+
import re
|
| 5 |
+
from datetime import timedelta
|
| 6 |
+
|
| 7 |
+
from cachetools import TTLCache
|
| 8 |
+
|
| 9 |
+
from kc_monitor.config import AppConfig
|
| 10 |
+
from kc_monitor.github_client import GitHubActionsClient
|
| 11 |
+
from kc_monitor.kernel_index import KernelIndex
|
| 12 |
+
from kc_monitor.log_parser import JobLogParser, classify_step_name
|
| 13 |
+
from kc_monitor.models import (
|
| 14 |
+
DashboardSnapshot,
|
| 15 |
+
DashboardSummary,
|
| 16 |
+
FAILING_CONCLUSIONS,
|
| 17 |
+
GitHubJob,
|
| 18 |
+
GitHubJobStep,
|
| 19 |
+
GitHubRun,
|
| 20 |
+
KernelInfo,
|
| 21 |
+
KernelRow,
|
| 22 |
+
KernelRunGroup,
|
| 23 |
+
MonitorRecord,
|
| 24 |
+
utcnow,
|
| 25 |
+
)
|
| 26 |
+
from kc_monitor.stall_detector import detect_stall
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
# Captures the text between "(" and the first following "," in a job name;
# MonitorService._extract_arch uses the captured group as the arch label.
ARCH_RE = re.compile(r"\(([^,]+),")
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
class MonitorService:
    """Build dashboard snapshots from workflow runs, their jobs and job logs.

    The service lists runs for the configured workflow targets, expands each
    selected run into per-job ``MonitorRecord`` objects, groups those records
    per kernel, and caches the resulting ``DashboardSnapshot`` for a short
    time.
    """

    # (workflow file suffix, synthetic build step name, synthetic upload step name)
    _SYNTHETIC_STEP_NAMES = (
        ("build-release.yaml", "Build and upload kernel", "Upload v1 kernels to main"),
        ("manual-build-upload.yaml", "Build and copy kernel", "Upload kernel"),
    )

    # Short display names for known arch labels; unknown labels pass through.
    _ARCH_SHORT_NAMES = {
        "x86_64-linux": "x86",
        "aarch64-linux": "arm64",
        "x86_64-darwin": "mac",
        "aarch64-darwin": "mac-arm",
    }

    # Row ordering by status kind; lower ranks sort first.
    _STATUS_RANK = {
        "stalled": 0,
        "uploading": 1,
        "running": 2,
        "failed": 3,
        "completed": 4,
        "cancelled": 5,
        "recent": 6,
        "idle": 7,
    }

    def __init__(
        self,
        config: AppConfig,
        client: GitHubActionsClient | None = None,
        parser: JobLogParser | None = None,
        kernel_index: KernelIndex | None = None,
    ) -> None:
        """Store ``config`` and create any collaborator that was not supplied.

        Args:
            config: Application configuration (GitHub, monitor and workflow
                target settings are read from it).
            client: Actions client; built from ``config.github`` when omitted.
            parser: Job log parser; a default ``JobLogParser`` when omitted.
            kernel_index: Kernel catalog; built on ``client`` when omitted.
        """
        self.config = config
        self.client = client or GitHubActionsClient(
            owner=config.github.owner,
            repo=config.github.repo,
            token=config.github.token,
            request_timeout_seconds=config.github.request_timeout_seconds,
            user_agent=config.github.user_agent,
        )
        self.parser = parser or JobLogParser()
        self.kernel_index = kernel_index or KernelIndex(self.client, branch=config.github.branch)
        # Single-entry cache; the TTL is clamped to at least five seconds.
        self._snapshot_cache: TTLCache[str, DashboardSnapshot] = TTLCache(
            maxsize=1,
            ttl=max(5, config.monitor.snapshot_ttl_seconds),
        )
        self._workflow_labels = {
            workflow.path: workflow.label for workflow in config.workflow_targets
        }
        self._workflow_paths = set(self._workflow_labels)

    def close(self) -> None:
        """Close the underlying client."""
        self.client.close()

    def get_snapshot(self, force_refresh: bool = False) -> DashboardSnapshot:
        """Return the cached snapshot, rebuilding it when missing or forced.

        Args:
            force_refresh: When true, skip the cache and rebuild.
        """
        if not force_refresh:
            # EAFP: a TTL entry can expire between an ``in`` check and the
            # lookup, so treat KeyError as a cache miss.
            try:
                return self._snapshot_cache["snapshot"]
            except KeyError:
                pass

        snapshot = self._build_snapshot()
        self._snapshot_cache["snapshot"] = snapshot
        return snapshot

    def _build_snapshot(self) -> DashboardSnapshot:
        """Collect runs, expand them into records and assemble a snapshot.

        Failures while listing jobs or building a record are appended to the
        snapshot's ``errors`` instead of aborting the whole build.
        """
        errors: list[str] = []
        records: list[MonitorRecord] = []

        kernel_catalog = self.kernel_index.list_kernel_catalog()
        catalog_names = {info.kernel_name for info in kernel_catalog}
        selected_runs = self._collect_runs(catalog_names, errors)

        if not selected_runs and not errors:
            errors.append("No kernel runs returned from any tracked workflow.")

        for run in selected_runs:
            try:
                jobs = self.client.list_jobs(run.id)
            except Exception as exc:  # noqa: BLE001
                # Keep the run visible with a summary-only record.
                errors.append(f"Run {run.id}: {exc}")
                records.append(self._build_lightweight_record(run))
                continue
            for job in jobs:
                try:
                    records.append(self._build_record(run, job))
                except Exception as exc:  # noqa: BLE001
                    errors.append(f"Job {job.id}: {exc}")

        records.sort(key=self._record_sort_key)
        active_records = [record for record in records if record.is_active]
        recent_records = records[: self.config.monitor.recent_limit]
        kernel_rows = self._build_kernel_rows(records)

        summary = DashboardSummary(
            tracked_kernels=len(kernel_rows),
            active_builds=sum(1 for row in kernel_rows if row.current_group is not None),
            uploading_builds=sum(
                1 for row in kernel_rows if row.current_group is not None and row.current_group.has_uploading
            ),
            stalled_builds=sum(1 for row in kernel_rows if row.row_status_kind == "stalled"),
            failed_builds=sum(
                1 for row in kernel_rows if any(group.has_failure for group in row.recent_groups)
            ),
            completed_uploads=sum(
                1
                for row in kernel_rows
                if any(
                    any(record.upload_status == "completed" for record in group.records)
                    for group in row.recent_groups
                )
            ),
        )

        return DashboardSnapshot(
            generated_at=utcnow(),
            summary=summary,
            kernel_rows=kernel_rows,
            active_records=active_records,
            recent_records=recent_records,
            errors=errors,
        )

    def _collect_runs(
        self,
        catalog_names: set[str],
        errors: list[str],
    ) -> list[GitHubRun]:
        """Select every active run plus one finished run per workflow/kernel.

        Pages through each workflow target's runs. For finished runs only the
        first one encountered per ``(workflow path, kernel)`` pair is kept.

        Args:
            catalog_names: Known kernel names; when non-empty, runs for other
                kernels are ignored.
            errors: Receives a message when listing a workflow's runs fails.

        Returns:
            De-duplicated runs, active ones first, then by descending
            ``sort_time``.
        """
        latest_by_workflow_kernel: dict[tuple[str, str], GitHubRun] = {}
        active_runs: dict[int, GitHubRun] = {}
        per_page = max(1, self.config.monitor.workflow_run_page_size)
        max_pages = max(1, self.config.monitor.workflow_run_pages)

        for workflow in self.config.workflow_targets:
            seen_for_workflow: set[str] = set()
            for page in range(1, max_pages + 1):
                try:
                    workflow_runs = self.client.list_workflow_runs(
                        workflow.basename,
                        per_page=per_page,
                        page=page,
                    )
                except Exception as exc:  # noqa: BLE001
                    errors.append(f"Workflow {workflow.label}: {exc}")
                    break

                if not workflow_runs:
                    break

                for run in workflow_runs:
                    if run.path not in self._workflow_paths:
                        continue

                    kernel = KernelIndex.infer_kernel_name(run)
                    if not kernel:
                        continue
                    if catalog_names and kernel not in catalog_names:
                        continue

                    seen_for_workflow.add(kernel)
                    if run.is_active:
                        active_runs[run.id] = run
                        continue

                    # First finished run seen for this pair wins.
                    latest_by_workflow_kernel.setdefault((workflow.path, kernel), run)

                # A short page means there are no further pages.
                if len(workflow_runs) < per_page:
                    break
                # Every catalog kernel has been seen for this workflow.
                if catalog_names and seen_for_workflow >= catalog_names:
                    break

        selected = list(active_runs.values())
        selected.extend(latest_by_workflow_kernel.values())
        deduped = {run.id: run for run in selected}
        return sorted(
            deduped.values(),
            key=lambda run: (0 if run.is_active else 1, -run.sort_time.timestamp()),
        )

    def _filter_runs(self, runs: list[GitHubRun]) -> list[GitHubRun]:
        """Keep active runs plus a bounded number of recently finished runs.

        Runs outside the tracked workflow paths are dropped. Finished runs
        must have been updated within ``recent_completed_hours`` and are
        limited to ``completed_runs_per_workflow`` per workflow path.
        """
        now = utcnow()
        cutoff = now - timedelta(hours=self.config.monitor.recent_completed_hours)
        filtered: list[GitHubRun] = []
        completed_counts: dict[str, int] = {}
        for run in runs:
            if run.path not in self._workflow_paths:
                continue
            if run.is_active:
                filtered.append(run)
                continue

            if run.updated_at < cutoff:
                continue

            seen = completed_counts.get(run.path, 0)
            if seen >= self.config.monitor.completed_runs_per_workflow:
                continue

            completed_counts[run.path] = seen + 1
            filtered.append(run)
        return filtered

    def _build_lightweight_record(self, run: GitHubRun) -> MonitorRecord:
        """Build a summary-only record for ``run`` using a stub job (id 0).

        The phase is derived from the run's conclusion, or from its active
        state when there is no recognised conclusion.
        """
        kernel_name = KernelIndex.infer_kernel_name(run) or "unknown"
        kernel_info = self._kernel_info_for(kernel_name, None)
        critical = kernel_name in self.config.monitor.critical_kernel_set
        conclusion = run.conclusion or ""

        if conclusion == "success":
            phase, phase_label = "completed", "Completed"
        elif conclusion == "failure":
            phase, phase_label = "failed", "Failed"
        elif conclusion == "cancelled":
            phase, phase_label = "cancelled", "Cancelled"
        elif run.is_active:
            phase, phase_label = "running", "Running"
        else:
            phase, phase_label = "completed", conclusion.title() or "Done"

        stub_job = GitHubJob(
            id=0,
            run_id=run.id,
            workflow_name=run.name,
            head_branch=run.head_branch,
            run_url=run.html_url,
            run_attempt=1,
            head_sha=run.head_sha,
            url="",
            html_url=run.html_url,
            status=run.status,
            conclusion=run.conclusion,
            created_at=run.created_at,
            started_at=run.run_started_at,
            # An active run has not completed yet; same rule as _normalize_job,
            # so elapsed time keeps advancing instead of freezing.
            completed_at=None if run.is_active else run.updated_at,
            name=run.name,
            steps=[],
        )

        return MonitorRecord(
            key=f"{run.id}:0",
            kernel_name=kernel_name,
            critical=critical,
            kernel_info=kernel_info,
            workflow_name=self._workflow_labels.get(run.path, run.name),
            workflow_path=run.path,
            run=run,
            job=stub_job,
            phase=phase,
            phase_label=phase_label,
            phase_reason=f"Run {conclusion or run.status} (summary only).",
            upload_status="not_started",
            upload_status_label="Unknown",
            arch="all",
            runner_group=None,
            suspected_stalled=False,
            stall_reason=None,
            latest_signal_at=run.updated_at,
        )

    def _build_record(self, run: GitHubRun, job: GitHubJob) -> MonitorRecord:
        """Build the full record for one job, including parsing and stall check.

        Logs are fetched only when ``_should_fetch_logs`` says so; otherwise
        the parser receives ``None`` as the log text.
        """
        job = self._normalize_job(run, job)
        log_text = None
        if self._should_fetch_logs(job):
            log_text = self.client.get_job_logs(
                job.id,
                line_limit=self.config.monitor.log_line_limit,
                char_limit=self.config.monitor.log_char_limit,
                job_html_url=job.html_url,
            )

        parsed = self.parser.parse(
            run,
            job,
            log_text,
            event_limit=self.config.monitor.detail_event_limit,
        )

        kernel_name = KernelIndex.infer_kernel_name(run) or "unknown"
        kernel_info = self._kernel_info_for(kernel_name, parsed.repo_id)
        latest_signal_at = parsed.latest_log_at or run.updated_at or job.started_at
        critical = kernel_name in self.config.monitor.critical_kernel_set

        record = MonitorRecord(
            key=f"{run.id}:{job.id}",
            kernel_name=kernel_name,
            critical=critical,
            kernel_info=kernel_info,
            workflow_name=self._workflow_labels.get(run.path, run.name),
            workflow_path=run.path,
            run=run,
            job=job,
            phase=parsed.phase,
            phase_label=parsed.phase_label,
            phase_reason=parsed.phase_reason,
            upload_status=parsed.upload_status,
            upload_status_label=parsed.upload_status_label,
            arch=self._extract_arch(job.name),
            runner_group=job.runner_group_name,
            suspected_stalled=False,
            stall_reason=None,
            latest_signal_at=latest_signal_at,
            events=parsed.events,
            failure_excerpt=parsed.failure_excerpt,
            active_step_name=parsed.active_step_name,
            active_step_started_at=parsed.active_step_started_at,
        )
        # Stall detection needs the finished record, so it runs afterwards.
        stalled, stall_reason = detect_stall(record, self.config.monitor)
        record.suspected_stalled = stalled
        record.stall_reason = stall_reason
        return record

    @staticmethod
    def _normalize_job(run: GitHubRun, job: GitHubJob) -> GitHubJob:
        """Give a step-less job synthetic steps for the known workflow files.

        Jobs that already have steps, or whose run path matches none of the
        known workflow suffixes, are returned unchanged. Otherwise
        ``job.steps`` is replaced in place with a build step and, for
        successful jobs, a completed upload step.
        """
        if job.steps:
            return job

        started_at = run.run_started_at or run.created_at
        completed_at = None if job.is_active else run.updated_at

        for suffix, build_name, upload_name in MonitorService._SYNTHETIC_STEP_NAMES:
            if not run.path.endswith(suffix):
                continue

            synthetic_steps: list[GitHubJobStep] = [
                GitHubJobStep(
                    name=build_name,
                    status=job.status,
                    conclusion=job.conclusion,
                    number=1,
                    started_at=started_at,
                    completed_at=completed_at,
                )
            ]
            if (job.conclusion or "") == "success":
                synthetic_steps.append(
                    GitHubJobStep(
                        name=upload_name,
                        status="completed",
                        conclusion="success",
                        number=2,
                        started_at=completed_at or started_at,
                        completed_at=completed_at,
                    )
                )
            job.steps = synthetic_steps
            break  # Only the first matching workflow suffix applies.

        return job

    def _kernel_info_for(self, kernel_name: str, parsed_repo_id: str | None) -> KernelInfo:
        """Resolve kernel info, letting a parsed repo id override the catalog.

        For the ``"unknown"`` kernel the repo id falls back to
        ``"<owner>/<repo>"`` from the GitHub configuration.
        """
        if kernel_name == "unknown":
            repo_id = parsed_repo_id or f"{self.config.github.owner}/{self.config.github.repo}"
            return KernelInfo(
                kernel_name=kernel_name,
                repo_id=repo_id,
                hub_url=f"https://huggingface.co/{repo_id}",
            )

        info = self.kernel_index.get_kernel_info(kernel_name)
        if not parsed_repo_id or parsed_repo_id == info.repo_id:
            return info

        return KernelInfo(
            kernel_name=info.kernel_name,
            repo_id=parsed_repo_id,
            hub_url=f"https://huggingface.co/{parsed_repo_id}",
            version=info.version,
            # Copy so the derived info does not share the catalog entry's list.
            backends=list(info.backends),
        )

    def _should_fetch_logs(self, job: GitHubJob) -> bool:
        """Return True for active jobs and for jobs with a failing conclusion."""
        if job.is_active:
            return True
        return (job.conclusion or "") in FAILING_CONCLUSIONS

    @staticmethod
    def _extract_arch(job_name: str) -> str:
        """Return the arch label matched by ``ARCH_RE``, or ``"n/a"``."""
        match = ARCH_RE.search(job_name)
        if match:
            return match.group(1).strip()
        return "n/a"

    @staticmethod
    def _record_sort_key(record: MonitorRecord) -> tuple[int, int, float]:
        """Sort key: active first, then critical, then most recently started."""
        started_at = record.started_at or utcnow()
        return (
            0 if record.is_active else 1,
            0 if record.critical else 1,
            -started_at.timestamp(),
        )

    def _build_kernel_rows(self, records: list[MonitorRecord]) -> list[KernelRow]:
        """Build one sorted row per catalog kernel or kernel seen in records.

        Kernel info attached to a record replaces the catalog entry of the
        same name; kernels without records still get a row.
        """
        grouped_records: dict[str, list[MonitorRecord]] = defaultdict(list)
        for record in records:
            grouped_records[record.kernel_name].append(record)

        info_map = {info.kernel_name: info for info in self.kernel_index.list_kernel_catalog()}
        for record in records:
            info_map[record.kernel_name] = record.kernel_info

        rows: list[KernelRow] = []
        for kernel_name, kernel_info in info_map.items():
            kernel_records = sorted(grouped_records.get(kernel_name, []), key=self._record_sort_key)
            recent_groups = self._group_kernel_runs(kernel_name, kernel_records)
            current_group = next((group for group in recent_groups if group.is_active), None)
            row_status_kind, row_status_label, row_reason, upload_label = self._summarize_kernel(
                current_group,
                recent_groups,
            )
            rows.append(
                KernelRow(
                    kernel_name=kernel_name,
                    kernel_info=kernel_info,
                    critical=kernel_name in self.config.monitor.critical_kernel_set,
                    current_group=current_group,
                    recent_groups=recent_groups,
                    row_status_kind=row_status_kind,
                    row_status_label=row_status_label,
                    row_reason=row_reason,
                    upload_label=upload_label,
                    last_triggered_at=recent_groups[0].triggered_at if recent_groups else None,
                )
            )

        rows.sort(key=self._kernel_row_sort_key)
        return rows

    def _group_kernel_runs(
        self,
        kernel_name: str,
        records: list[MonitorRecord],
    ) -> list[KernelRunGroup]:
        """Group one kernel's records by run id, newest-triggered group first.

        Inside a group, active records come first, then ``"x86_64-linux"``,
        then the remaining records ordered by arch label.
        """
        grouped: dict[int, list[MonitorRecord]] = defaultdict(list)
        run_lookup: dict[int, GitHubRun] = {}
        workflow_lookup: dict[int, str] = {}
        for record in records:
            grouped[record.run.id].append(record)
            run_lookup[record.run.id] = record.run
            workflow_lookup[record.run.id] = record.workflow_name

        groups: list[KernelRunGroup] = []
        for run_id, run_records in grouped.items():
            sorted_records = sorted(
                run_records,
                key=lambda record: (
                    0 if record.is_active else 1,
                    0 if record.arch == "x86_64-linux" else 1,
                    record.arch,
                ),
            )
            groups.append(
                KernelRunGroup(
                    kernel_name=kernel_name,
                    run=run_lookup[run_id],
                    workflow_name=workflow_lookup[run_id],
                    records=sorted_records,
                )
            )

        groups.sort(key=lambda group: -group.triggered_at.timestamp())
        return groups

    @staticmethod
    def _summarize_kernel(
        current_group: KernelRunGroup | None,
        recent_groups: list[KernelRunGroup],
    ) -> tuple[str, str, str, str]:
        """Summarize a kernel row.

        Returns:
            ``(status_kind, status_label, reason, upload_label)``. An active
            group takes precedence; otherwise the first recent group is used,
            and with no groups at all the row is reported as idle.
        """
        if current_group is not None:
            if current_group.has_stall:
                status_kind = "stalled"
                status_label = "Stalled"
            elif current_group.has_uploading:
                status_kind = "uploading"
                status_label = "Uploading"
            else:
                status_kind = "running"
                status_label = "Running"
            return (
                status_kind,
                status_label,
                MonitorService._arch_summary(current_group.records),
                MonitorService._upload_summary(current_group.records),
            )

        if not recent_groups:
            return ("idle", "Idle", "No recent tracked CI run.", "No recent upload")

        latest_group = recent_groups[0]
        if latest_group.has_failure:
            status_kind = "failed"
            status_label = "Failed"
        elif any(record.upload_status == "completed" for record in latest_group.records):
            status_kind = "completed"
            status_label = "Completed"
        elif all((record.job.conclusion or "") == "cancelled" for record in latest_group.records):
            status_kind = "cancelled"
            status_label = "Cancelled"
        else:
            status_kind = "recent"
            status_label = "Recent"

        return (
            status_kind,
            status_label,
            MonitorService._arch_summary(latest_group.records),
            MonitorService._upload_summary(latest_group.records),
        )

    @staticmethod
    def _arch_summary(records: list[MonitorRecord]) -> str:
        """Join ``"<short arch>: <phase label>"`` for each record with ``" | "``."""
        if not records:
            return "No job details."
        return " | ".join(
            f"{MonitorService._short_arch(record.arch)}: {record.phase_label}"
            for record in records
        )

    @staticmethod
    def _upload_summary(records: list[MonitorRecord]) -> str:
        """Join ``"<short arch>: <upload label>"`` for each record with ``" | "``."""
        if not records:
            return "No upload"
        return " | ".join(
            f"{MonitorService._short_arch(record.arch)}: {record.upload_status_label}"
            for record in records
        )

    @staticmethod
    def _short_arch(arch: str) -> str:
        """Return the short display name for ``arch``, or ``arch`` itself."""
        return MonitorService._ARCH_SHORT_NAMES.get(arch, arch)

    @staticmethod
    def _kernel_row_sort_key(row: KernelRow) -> tuple[int, int, int, str]:
        """Sort key: status rank, critical first, triggered rows first, name."""
        return (
            # Unknown status kinds sort after every known one.
            MonitorService._STATUS_RANK.get(row.row_status_kind, 99),
            0 if row.critical else 1,
            0 if row.last_triggered_at else 1,
            row.kernel_name,
        )
|
src/kc_monitor/stall_detector.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from datetime import datetime, timedelta
|
| 4 |
+
|
| 5 |
+
from kc_monitor.config import MonitorSettings
|
| 6 |
+
from kc_monitor.models import MonitorRecord, utcnow
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
# Phases in which detect_stall evaluates a record; records in any other phase
# are never reported as stalled.
ACTIVE_PHASES = {"building", "uploading", "testing"}
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def _format_duration(delta: timedelta) -> str:
|
| 13 |
+
total_seconds = int(delta.total_seconds())
|
| 14 |
+
if total_seconds < 60:
|
| 15 |
+
return f"{total_seconds}s"
|
| 16 |
+
if total_seconds < 3600:
|
| 17 |
+
return f"{total_seconds // 60}m"
|
| 18 |
+
hours, remainder = divmod(total_seconds, 3600)
|
| 19 |
+
minutes = remainder // 60
|
| 20 |
+
if minutes:
|
| 21 |
+
return f"{hours}h {minutes}m"
|
| 22 |
+
return f"{hours}h"
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def detect_stall(
    record: MonitorRecord,
    settings: MonitorSettings,
    now: datetime | None = None,
) -> tuple[bool, str | None]:
    """Decide whether an active record looks stalled.

    Only active records whose phase is in ``ACTIVE_PHASES`` are evaluated.
    A record is stalled when its latest signal is at least
    ``settings.stall_without_log_minutes`` old, or when its active step has
    been running for at least ``settings.stall_active_phase_minutes``.

    Args:
        record: The record to inspect.
        settings: Provides the two stall thresholds, in minutes.
        now: Reference time; the current time is used when omitted.

    Returns:
        ``(stalled, reason)``; ``reason`` is ``None`` when not stalled.
    """
    if not record.is_active or record.phase not in ACTIVE_PHASES:
        return False, None

    current = now or utcnow()

    # Silence check: the newest signal available for the record.
    last_signal = record.latest_signal_at or record.run.updated_at or record.started_at
    if last_signal:
        silent_for = current - last_signal
        quiet_limit = timedelta(minutes=settings.stall_without_log_minutes)
        if silent_for >= quiet_limit:
            return True, f"No fresh signal for { _format_duration(silent_for) }."

    # Duration check: how long the active step has been running.
    step_started = record.active_step_started_at
    if step_started:
        phase_duration = current - step_started
        phase_limit = timedelta(minutes=settings.stall_active_phase_minutes)
        if phase_duration >= phase_limit:
            return True, f"{record.phase_label} has been running for { _format_duration(phase_duration) }."

    return False, None
|
src/kc_monitor/ui.py
ADDED
|
@@ -0,0 +1,1110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import html
|
| 4 |
+
import re
|
| 5 |
+
from datetime import datetime, timezone
|
| 6 |
+
|
| 7 |
+
import gradio as gr
|
| 8 |
+
|
| 9 |
+
from kc_monitor.config import AppConfig
|
| 10 |
+
from kc_monitor.grafana import build_dashboard_url, dashboard_catalog
|
| 11 |
+
from kc_monitor.models import DashboardSnapshot, KernelRow, KernelRunGroup, MonitorRecord
|
| 12 |
+
from kc_monitor.service import MonitorService
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
VARIANT_RE = re.compile(r"\(([^)]+)\)")
|
| 16 |
+
|
| 17 |
+
THEME = gr.themes.Base()
|
| 18 |
+
|
| 19 |
+
# Client-side behaviour for the dashboard: table filtering, the detail modal,
# and keyboard handling. All listeners are delegated from `document`, so they
# keep working after the table HTML is replaced and must be registered only
# once. The MutationObserver therefore only re-applies the current filters to
# freshly rendered rows; it must not re-run the boot routine, which would
# stack another copy of every listener on each DOM change.
PAGE_JS = """
function kcmBoot() {
  if (window._kcmBooted) return;
  window._kcmBooted = true;

  function applyFilters() {
    var search = document.querySelector('.kcm-search');
    var status = document.querySelector('.kcm-status-filter');
    var searchValue = search ? search.value.toLowerCase().trim() : '';
    var statusValue = status ? status.value : 'all';

    document.querySelectorAll('#kernelTable tbody tr[data-idx]').forEach(function(row) {
      var kernel = (row.getAttribute('data-kernel') || '').toLowerCase();
      var rowStatus = row.getAttribute('data-status') || 'all';
      var workflow = (row.getAttribute('data-workflow') || '').toLowerCase();
      var searchOk = !searchValue || kernel.indexOf(searchValue) >= 0 || workflow.indexOf(searchValue) >= 0;
      var statusOk = statusValue === 'all' || rowStatus === statusValue;
      row.style.display = searchOk && statusOk ? '' : 'none';
    });
  }
  window._kcmApplyFilters = applyFilters;

  function closeModal() {
    var overlay = document.getElementById('kcmOverlay');
    if (overlay) overlay.classList.remove('open');
    document.body.style.overflow = '';
  }

  document.addEventListener('click', function(e) {
    var row = e.target.closest('tr[data-idx]');
    if (row && !e.target.closest('a')) {
      var idx = row.getAttribute('data-idx');
      var el = document.getElementById('modal-content-' + idx);
      var modal = document.getElementById('kcmModal');
      var overlay = document.getElementById('kcmOverlay');
      if (!el || !modal || !overlay) return;
      modal.innerHTML = el.innerHTML;
      overlay.classList.add('open');
      document.body.style.overflow = 'hidden';
      return;
    }
    if (e.target.closest('.kcm-modal-close') || e.target.id === 'kcmOverlay') {
      closeModal();
    }
  });

  document.addEventListener('input', function(e) {
    if (e.target.classList.contains('kcm-search')) applyFilters();
  });

  document.addEventListener('change', function(e) {
    if (e.target.classList.contains('kcm-status-filter')) applyFilters();
  });

  document.addEventListener('keydown', function(e) {
    if (e.key === 'Escape') closeModal();
  });

  applyFilters();
}

kcmBoot();
new MutationObserver(function() {
  if (window._kcmApplyFilters) window._kcmApplyFilters();
}).observe(document.body, { childList: true, subtree: true });
"""
|
| 81 |
+
|
| 82 |
+
CSS = """
|
| 83 |
+
:root {
|
| 84 |
+
--bg: #050711;
|
| 85 |
+
--surface: rgba(11, 16, 30, 0.92);
|
| 86 |
+
--surface-2: rgba(14, 22, 40, 0.94);
|
| 87 |
+
--surface-3: rgba(19, 29, 53, 0.98);
|
| 88 |
+
--surface-hover: rgba(121, 171, 255, 0.06);
|
| 89 |
+
--text: #f4f7ff;
|
| 90 |
+
--text-secondary: #98a7c4;
|
| 91 |
+
--text-tertiary: #6d7b98;
|
| 92 |
+
--accent: #86b0ff;
|
| 93 |
+
--accent-2: #6ff0c0;
|
| 94 |
+
--ok: #74efab;
|
| 95 |
+
--warn: #ffca6d;
|
| 96 |
+
--bad: #ff808e;
|
| 97 |
+
--border: rgba(255, 255, 255, 0.08);
|
| 98 |
+
--border-strong: rgba(255, 255, 255, 0.14);
|
| 99 |
+
--radius: 24px;
|
| 100 |
+
--radius-sm: 16px;
|
| 101 |
+
--shadow: 0 28px 90px rgba(0, 0, 0, 0.32);
|
| 102 |
+
}
|
| 103 |
+
|
| 104 |
+
*,
|
| 105 |
+
*::before,
|
| 106 |
+
*::after {
|
| 107 |
+
box-sizing: border-box;
|
| 108 |
+
}
|
| 109 |
+
|
| 110 |
+
body,
|
| 111 |
+
.gradio-container {
|
| 112 |
+
background:
|
| 113 |
+
radial-gradient(circle at 0% 0%, rgba(134, 176, 255, 0.18), transparent 28%),
|
| 114 |
+
radial-gradient(circle at 100% 0%, rgba(111, 240, 192, 0.10), transparent 30%),
|
| 115 |
+
radial-gradient(circle at 50% 100%, rgba(110, 130, 255, 0.08), transparent 40%),
|
| 116 |
+
#050711 !important;
|
| 117 |
+
color: var(--text);
|
| 118 |
+
font-family: "Inter", -apple-system, BlinkMacSystemFont, sans-serif;
|
| 119 |
+
}
|
| 120 |
+
|
| 121 |
+
a {
|
| 122 |
+
color: var(--accent);
|
| 123 |
+
text-decoration: none;
|
| 124 |
+
}
|
| 125 |
+
|
| 126 |
+
a:hover {
|
| 127 |
+
text-decoration: underline;
|
| 128 |
+
}
|
| 129 |
+
|
| 130 |
+
.kcm-shell {
|
| 131 |
+
max-width: 1540px;
|
| 132 |
+
margin: 0 auto;
|
| 133 |
+
padding: 18px 20px 28px;
|
| 134 |
+
}
|
| 135 |
+
|
| 136 |
+
.kcm-hero {
|
| 137 |
+
position: relative;
|
| 138 |
+
overflow: hidden;
|
| 139 |
+
background:
|
| 140 |
+
linear-gradient(135deg, rgba(134, 176, 255, 0.14), rgba(111, 240, 192, 0.06)),
|
| 141 |
+
var(--surface);
|
| 142 |
+
border: 1px solid var(--border);
|
| 143 |
+
border-radius: 30px;
|
| 144 |
+
padding: 30px 34px;
|
| 145 |
+
box-shadow: var(--shadow);
|
| 146 |
+
}
|
| 147 |
+
|
| 148 |
+
.kcm-hero::after {
|
| 149 |
+
content: "";
|
| 150 |
+
position: absolute;
|
| 151 |
+
inset: auto -80px -120px auto;
|
| 152 |
+
width: 320px;
|
| 153 |
+
height: 320px;
|
| 154 |
+
border-radius: 50%;
|
| 155 |
+
background: radial-gradient(circle, rgba(111, 240, 192, 0.16), transparent 62%);
|
| 156 |
+
pointer-events: none;
|
| 157 |
+
}
|
| 158 |
+
|
| 159 |
+
.kcm-eyebrow {
|
| 160 |
+
color: var(--accent-2);
|
| 161 |
+
font-size: 11px;
|
| 162 |
+
text-transform: uppercase;
|
| 163 |
+
letter-spacing: 0.16em;
|
| 164 |
+
}
|
| 165 |
+
|
| 166 |
+
.kcm-hero h1 {
|
| 167 |
+
margin: 10px 0 0;
|
| 168 |
+
font-size: 38px;
|
| 169 |
+
line-height: 1.05;
|
| 170 |
+
letter-spacing: -0.05em;
|
| 171 |
+
}
|
| 172 |
+
|
| 173 |
+
.kcm-hero p {
|
| 174 |
+
margin: 12px 0 0;
|
| 175 |
+
max-width: 1040px;
|
| 176 |
+
color: var(--text-secondary);
|
| 177 |
+
font-size: 15px;
|
| 178 |
+
line-height: 1.65;
|
| 179 |
+
}
|
| 180 |
+
|
| 181 |
+
.kcm-meta,
|
| 182 |
+
.kcm-stats,
|
| 183 |
+
.kcm-graphs {
|
| 184 |
+
display: grid;
|
| 185 |
+
gap: 12px;
|
| 186 |
+
}
|
| 187 |
+
|
| 188 |
+
.kcm-meta {
|
| 189 |
+
grid-template-columns: repeat(3, minmax(0, 1fr));
|
| 190 |
+
margin-top: 18px;
|
| 191 |
+
}
|
| 192 |
+
|
| 193 |
+
.kcm-stats {
|
| 194 |
+
grid-template-columns: repeat(5, minmax(0, 1fr));
|
| 195 |
+
margin-top: 18px;
|
| 196 |
+
}
|
| 197 |
+
|
| 198 |
+
.kcm-meta-card,
|
| 199 |
+
.kcm-stat,
|
| 200 |
+
.kcm-panel-link {
|
| 201 |
+
background: rgba(255, 255, 255, 0.04);
|
| 202 |
+
border: 1px solid var(--border);
|
| 203 |
+
border-radius: 20px;
|
| 204 |
+
padding: 16px 18px;
|
| 205 |
+
}
|
| 206 |
+
|
| 207 |
+
.kcm-meta-card-label,
|
| 208 |
+
.kcm-stat-label {
|
| 209 |
+
font-size: 11px;
|
| 210 |
+
text-transform: uppercase;
|
| 211 |
+
letter-spacing: 0.10em;
|
| 212 |
+
color: var(--text-tertiary);
|
| 213 |
+
}
|
| 214 |
+
|
| 215 |
+
.kcm-meta-card-value {
|
| 216 |
+
margin-top: 8px;
|
| 217 |
+
font-size: 14px;
|
| 218 |
+
color: var(--text-secondary);
|
| 219 |
+
word-break: break-word;
|
| 220 |
+
}
|
| 221 |
+
|
| 222 |
+
.kcm-stat-value {
|
| 223 |
+
margin-top: 8px;
|
| 224 |
+
font-size: 30px;
|
| 225 |
+
font-weight: 700;
|
| 226 |
+
letter-spacing: -0.03em;
|
| 227 |
+
}
|
| 228 |
+
|
| 229 |
+
.kcm-toolbar {
|
| 230 |
+
margin-top: 18px;
|
| 231 |
+
display: flex;
|
| 232 |
+
justify-content: space-between;
|
| 233 |
+
align-items: center;
|
| 234 |
+
gap: 14px;
|
| 235 |
+
}
|
| 236 |
+
|
| 237 |
+
.kcm-toolbar-left {
|
| 238 |
+
color: var(--text-tertiary);
|
| 239 |
+
font-size: 13px;
|
| 240 |
+
}
|
| 241 |
+
|
| 242 |
+
.kcm-toolbar-left code {
|
| 243 |
+
padding: 3px 8px;
|
| 244 |
+
background: rgba(255, 255, 255, 0.05);
|
| 245 |
+
border-radius: 999px;
|
| 246 |
+
color: var(--text-secondary);
|
| 247 |
+
}
|
| 248 |
+
|
| 249 |
+
.kcm-toolbar-right {
|
| 250 |
+
display: flex;
|
| 251 |
+
align-items: center;
|
| 252 |
+
gap: 10px;
|
| 253 |
+
}
|
| 254 |
+
|
| 255 |
+
.kcm-search,
|
| 256 |
+
.kcm-status-filter {
|
| 257 |
+
background: var(--surface-2);
|
| 258 |
+
border: 1px solid var(--border);
|
| 259 |
+
border-radius: 14px;
|
| 260 |
+
padding: 10px 14px;
|
| 261 |
+
color: var(--text);
|
| 262 |
+
font-size: 14px;
|
| 263 |
+
outline: none;
|
| 264 |
+
}
|
| 265 |
+
|
| 266 |
+
.kcm-search {
|
| 267 |
+
min-width: 260px;
|
| 268 |
+
}
|
| 269 |
+
|
| 270 |
+
.kcm-table-shell {
|
| 271 |
+
margin-top: 16px;
|
| 272 |
+
background: var(--surface);
|
| 273 |
+
border: 1px solid var(--border);
|
| 274 |
+
border-radius: 26px;
|
| 275 |
+
overflow: hidden;
|
| 276 |
+
box-shadow: var(--shadow);
|
| 277 |
+
}
|
| 278 |
+
|
| 279 |
+
.kcm-table-wrap {
|
| 280 |
+
overflow-x: auto;
|
| 281 |
+
}
|
| 282 |
+
|
| 283 |
+
.kcm-table {
|
| 284 |
+
width: 100%;
|
| 285 |
+
border-collapse: separate;
|
| 286 |
+
border-spacing: 0;
|
| 287 |
+
}
|
| 288 |
+
|
| 289 |
+
.kcm-table th {
|
| 290 |
+
position: sticky;
|
| 291 |
+
top: 0;
|
| 292 |
+
z-index: 2;
|
| 293 |
+
text-align: left;
|
| 294 |
+
padding: 14px 16px;
|
| 295 |
+
font-size: 11px;
|
| 296 |
+
text-transform: uppercase;
|
| 297 |
+
letter-spacing: 0.12em;
|
| 298 |
+
color: var(--text-tertiary);
|
| 299 |
+
background: rgba(7, 11, 23, 0.96);
|
| 300 |
+
border-bottom: 1px solid var(--border-strong);
|
| 301 |
+
}
|
| 302 |
+
|
| 303 |
+
.kcm-table td {
|
| 304 |
+
padding: 16px;
|
| 305 |
+
vertical-align: top;
|
| 306 |
+
border-bottom: 1px solid var(--border);
|
| 307 |
+
font-size: 14px;
|
| 308 |
+
}
|
| 309 |
+
|
| 310 |
+
.kcm-table tbody tr {
|
| 311 |
+
cursor: pointer;
|
| 312 |
+
transition: background 0.16s ease;
|
| 313 |
+
}
|
| 314 |
+
|
| 315 |
+
.kcm-table tbody tr:hover td {
|
| 316 |
+
background: var(--surface-hover);
|
| 317 |
+
}
|
| 318 |
+
|
| 319 |
+
.kcm-table tbody tr:last-child td {
|
| 320 |
+
border-bottom: none;
|
| 321 |
+
}
|
| 322 |
+
|
| 323 |
+
.kcm-kernel-name {
|
| 324 |
+
font-size: 16px;
|
| 325 |
+
font-weight: 700;
|
| 326 |
+
letter-spacing: -0.02em;
|
| 327 |
+
}
|
| 328 |
+
|
| 329 |
+
.kcm-kernel-meta,
|
| 330 |
+
.kcm-subtle,
|
| 331 |
+
.kcm-activity-sub {
|
| 332 |
+
margin-top: 4px;
|
| 333 |
+
color: var(--text-tertiary);
|
| 334 |
+
font-size: 12px;
|
| 335 |
+
line-height: 1.45;
|
| 336 |
+
}
|
| 337 |
+
|
| 338 |
+
.kcm-badges,
|
| 339 |
+
.kcm-variant-stack,
|
| 340 |
+
.kcm-actions {
|
| 341 |
+
display: flex;
|
| 342 |
+
flex-wrap: wrap;
|
| 343 |
+
gap: 8px;
|
| 344 |
+
}
|
| 345 |
+
|
| 346 |
+
.kcm-badge {
|
| 347 |
+
display: inline-flex;
|
| 348 |
+
align-items: center;
|
| 349 |
+
gap: 6px;
|
| 350 |
+
padding: 5px 10px;
|
| 351 |
+
border-radius: 999px;
|
| 352 |
+
font-size: 11px;
|
| 353 |
+
font-weight: 700;
|
| 354 |
+
white-space: nowrap;
|
| 355 |
+
border: 1px solid transparent;
|
| 356 |
+
}
|
| 357 |
+
|
| 358 |
+
.kcm-badge.ok {
|
| 359 |
+
color: var(--ok);
|
| 360 |
+
background: rgba(116, 239, 171, 0.10);
|
| 361 |
+
border-color: rgba(116, 239, 171, 0.14);
|
| 362 |
+
}
|
| 363 |
+
|
| 364 |
+
.kcm-badge.warn {
|
| 365 |
+
color: var(--warn);
|
| 366 |
+
background: rgba(255, 202, 109, 0.10);
|
| 367 |
+
border-color: rgba(255, 202, 109, 0.15);
|
| 368 |
+
}
|
| 369 |
+
|
| 370 |
+
.kcm-badge.bad {
|
| 371 |
+
color: var(--bad);
|
| 372 |
+
background: rgba(255, 128, 142, 0.10);
|
| 373 |
+
border-color: rgba(255, 128, 142, 0.14);
|
| 374 |
+
}
|
| 375 |
+
|
| 376 |
+
.kcm-badge.info {
|
| 377 |
+
color: var(--accent);
|
| 378 |
+
background: rgba(134, 176, 255, 0.12);
|
| 379 |
+
border-color: rgba(134, 176, 255, 0.16);
|
| 380 |
+
}
|
| 381 |
+
|
| 382 |
+
.kcm-badge.muted {
|
| 383 |
+
color: var(--text-tertiary);
|
| 384 |
+
background: rgba(255, 255, 255, 0.05);
|
| 385 |
+
border-color: rgba(255, 255, 255, 0.06);
|
| 386 |
+
}
|
| 387 |
+
|
| 388 |
+
.kcm-badge.critical {
|
| 389 |
+
color: var(--bad);
|
| 390 |
+
background: rgba(255, 128, 142, 0.10);
|
| 391 |
+
border-color: rgba(255, 128, 142, 0.14);
|
| 392 |
+
text-transform: uppercase;
|
| 393 |
+
letter-spacing: 0.12em;
|
| 394 |
+
}
|
| 395 |
+
|
| 396 |
+
.kcm-variant {
|
| 397 |
+
min-width: 180px;
|
| 398 |
+
padding: 10px 12px;
|
| 399 |
+
border-radius: 16px;
|
| 400 |
+
background: rgba(255, 255, 255, 0.04);
|
| 401 |
+
border: 1px solid var(--border);
|
| 402 |
+
}
|
| 403 |
+
|
| 404 |
+
.kcm-variant-head {
|
| 405 |
+
display: flex;
|
| 406 |
+
justify-content: space-between;
|
| 407 |
+
gap: 8px;
|
| 408 |
+
align-items: center;
|
| 409 |
+
}
|
| 410 |
+
|
| 411 |
+
.kcm-variant-name {
|
| 412 |
+
font-size: 12px;
|
| 413 |
+
font-weight: 700;
|
| 414 |
+
}
|
| 415 |
+
|
| 416 |
+
.kcm-variant-sub {
|
| 417 |
+
margin-top: 6px;
|
| 418 |
+
font-size: 11px;
|
| 419 |
+
color: var(--text-tertiary);
|
| 420 |
+
line-height: 1.45;
|
| 421 |
+
}
|
| 422 |
+
|
| 423 |
+
.kcm-action {
|
| 424 |
+
display: inline-flex;
|
| 425 |
+
align-items: center;
|
| 426 |
+
padding: 8px 12px;
|
| 427 |
+
border-radius: 12px;
|
| 428 |
+
background: rgba(255, 255, 255, 0.05);
|
| 429 |
+
border: 1px solid var(--border);
|
| 430 |
+
color: var(--text-secondary);
|
| 431 |
+
font-size: 12px;
|
| 432 |
+
font-weight: 600;
|
| 433 |
+
}
|
| 434 |
+
|
| 435 |
+
.kcm-action:hover {
|
| 436 |
+
text-decoration: none;
|
| 437 |
+
border-color: var(--border-strong);
|
| 438 |
+
color: var(--text);
|
| 439 |
+
}
|
| 440 |
+
|
| 441 |
+
.kcm-section {
|
| 442 |
+
margin-top: 22px;
|
| 443 |
+
}
|
| 444 |
+
|
| 445 |
+
.kcm-section-title {
|
| 446 |
+
margin: 0 0 12px;
|
| 447 |
+
font-size: 18px;
|
| 448 |
+
letter-spacing: -0.02em;
|
| 449 |
+
}
|
| 450 |
+
|
| 451 |
+
.kcm-graphs {
|
| 452 |
+
grid-template-columns: repeat(3, minmax(0, 1fr));
|
| 453 |
+
}
|
| 454 |
+
|
| 455 |
+
.kcm-panel-link {
|
| 456 |
+
transition: transform 0.15s ease, border-color 0.15s ease;
|
| 457 |
+
}
|
| 458 |
+
|
| 459 |
+
.kcm-panel-link:hover {
|
| 460 |
+
transform: translateY(-2px);
|
| 461 |
+
border-color: var(--border-strong);
|
| 462 |
+
text-decoration: none;
|
| 463 |
+
}
|
| 464 |
+
|
| 465 |
+
.kcm-panel-label {
|
| 466 |
+
color: var(--accent-2);
|
| 467 |
+
font-size: 11px;
|
| 468 |
+
text-transform: uppercase;
|
| 469 |
+
letter-spacing: 0.12em;
|
| 470 |
+
}
|
| 471 |
+
|
| 472 |
+
.kcm-panel-title {
|
| 473 |
+
margin-top: 8px;
|
| 474 |
+
font-size: 18px;
|
| 475 |
+
font-weight: 700;
|
| 476 |
+
}
|
| 477 |
+
|
| 478 |
+
.kcm-panel-copy {
|
| 479 |
+
margin-top: 8px;
|
| 480 |
+
color: var(--text-secondary);
|
| 481 |
+
font-size: 13px;
|
| 482 |
+
line-height: 1.55;
|
| 483 |
+
}
|
| 484 |
+
|
| 485 |
+
.kcm-frame {
|
| 486 |
+
margin-top: 16px;
|
| 487 |
+
background: var(--surface-3);
|
| 488 |
+
border: 1px solid var(--border);
|
| 489 |
+
border-radius: 24px;
|
| 490 |
+
overflow: hidden;
|
| 491 |
+
box-shadow: var(--shadow);
|
| 492 |
+
}
|
| 493 |
+
|
| 494 |
+
.kcm-frame-head {
|
| 495 |
+
padding: 14px 18px;
|
| 496 |
+
display: flex;
|
| 497 |
+
justify-content: space-between;
|
| 498 |
+
align-items: center;
|
| 499 |
+
gap: 12px;
|
| 500 |
+
border-bottom: 1px solid var(--border);
|
| 501 |
+
}
|
| 502 |
+
|
| 503 |
+
.kcm-frame-title {
|
| 504 |
+
font-size: 15px;
|
| 505 |
+
font-weight: 700;
|
| 506 |
+
}
|
| 507 |
+
|
| 508 |
+
.kcm-frame-copy {
|
| 509 |
+
font-size: 13px;
|
| 510 |
+
color: var(--text-secondary);
|
| 511 |
+
line-height: 1.45;
|
| 512 |
+
}
|
| 513 |
+
|
| 514 |
+
.kcm-open {
|
| 515 |
+
font-size: 12px;
|
| 516 |
+
font-weight: 700;
|
| 517 |
+
}
|
| 518 |
+
|
| 519 |
+
.kcm-frame iframe {
|
| 520 |
+
display: block;
|
| 521 |
+
width: 100%;
|
| 522 |
+
border: none;
|
| 523 |
+
background: #0b1020;
|
| 524 |
+
}
|
| 525 |
+
|
| 526 |
+
.kcm-overlay {
|
| 527 |
+
position: fixed;
|
| 528 |
+
inset: 0;
|
| 529 |
+
z-index: 9999;
|
| 530 |
+
display: none;
|
| 531 |
+
padding: 26px 16px;
|
| 532 |
+
overflow-y: auto;
|
| 533 |
+
background: rgba(4, 7, 16, 0.82);
|
| 534 |
+
backdrop-filter: blur(16px);
|
| 535 |
+
}
|
| 536 |
+
|
| 537 |
+
.kcm-overlay.open {
|
| 538 |
+
display: block;
|
| 539 |
+
}
|
| 540 |
+
|
| 541 |
+
.kcm-modal {
|
| 542 |
+
max-width: 1180px;
|
| 543 |
+
margin: 0 auto;
|
| 544 |
+
background: var(--surface-3);
|
| 545 |
+
border: 1px solid var(--border-strong);
|
| 546 |
+
border-radius: 28px;
|
| 547 |
+
overflow: hidden;
|
| 548 |
+
box-shadow: 0 40px 140px rgba(0, 0, 0, 0.42);
|
| 549 |
+
}
|
| 550 |
+
|
| 551 |
+
.kcm-modal-header {
|
| 552 |
+
padding: 24px 28px;
|
| 553 |
+
border-bottom: 1px solid var(--border);
|
| 554 |
+
display: flex;
|
| 555 |
+
justify-content: space-between;
|
| 556 |
+
align-items: flex-start;
|
| 557 |
+
gap: 20px;
|
| 558 |
+
}
|
| 559 |
+
|
| 560 |
+
.kcm-modal-header h2 {
|
| 561 |
+
margin: 0;
|
| 562 |
+
font-size: 28px;
|
| 563 |
+
letter-spacing: -0.04em;
|
| 564 |
+
}
|
| 565 |
+
|
| 566 |
+
.kcm-modal-header p {
|
| 567 |
+
margin: 8px 0 0;
|
| 568 |
+
color: var(--text-secondary);
|
| 569 |
+
font-size: 14px;
|
| 570 |
+
line-height: 1.55;
|
| 571 |
+
}
|
| 572 |
+
|
| 573 |
+
.kcm-modal-close {
|
| 574 |
+
padding: 9px 14px;
|
| 575 |
+
border-radius: 12px;
|
| 576 |
+
border: 1px solid var(--border);
|
| 577 |
+
background: rgba(255, 255, 255, 0.05);
|
| 578 |
+
color: var(--text-secondary);
|
| 579 |
+
cursor: pointer;
|
| 580 |
+
font-size: 12px;
|
| 581 |
+
font-weight: 700;
|
| 582 |
+
}
|
| 583 |
+
|
| 584 |
+
.kcm-modal-body {
|
| 585 |
+
padding: 24px 28px 30px;
|
| 586 |
+
}
|
| 587 |
+
|
| 588 |
+
.kcm-run-card {
|
| 589 |
+
margin-top: 14px;
|
| 590 |
+
background: rgba(255, 255, 255, 0.03);
|
| 591 |
+
border: 1px solid var(--border);
|
| 592 |
+
border-radius: 22px;
|
| 593 |
+
padding: 18px;
|
| 594 |
+
}
|
| 595 |
+
|
| 596 |
+
.kcm-run-card-head {
|
| 597 |
+
display: flex;
|
| 598 |
+
justify-content: space-between;
|
| 599 |
+
align-items: flex-start;
|
| 600 |
+
gap: 14px;
|
| 601 |
+
margin-bottom: 14px;
|
| 602 |
+
}
|
| 603 |
+
|
| 604 |
+
.kcm-run-card-title {
|
| 605 |
+
font-size: 16px;
|
| 606 |
+
font-weight: 700;
|
| 607 |
+
}
|
| 608 |
+
|
| 609 |
+
.kcm-run-card-meta {
|
| 610 |
+
margin-top: 6px;
|
| 611 |
+
color: var(--text-tertiary);
|
| 612 |
+
font-size: 12px;
|
| 613 |
+
line-height: 1.55;
|
| 614 |
+
}
|
| 615 |
+
|
| 616 |
+
.kcm-arch-grid {
|
| 617 |
+
display: grid;
|
| 618 |
+
grid-template-columns: repeat(auto-fit, minmax(260px, 1fr));
|
| 619 |
+
gap: 12px;
|
| 620 |
+
}
|
| 621 |
+
|
| 622 |
+
.kcm-arch-card {
|
| 623 |
+
background: rgba(255, 255, 255, 0.03);
|
| 624 |
+
border: 1px solid var(--border);
|
| 625 |
+
border-radius: 18px;
|
| 626 |
+
padding: 14px;
|
| 627 |
+
}
|
| 628 |
+
|
| 629 |
+
.kcm-arch-head {
|
| 630 |
+
display: flex;
|
| 631 |
+
justify-content: space-between;
|
| 632 |
+
align-items: center;
|
| 633 |
+
gap: 10px;
|
| 634 |
+
}
|
| 635 |
+
|
| 636 |
+
.kcm-arch-name {
|
| 637 |
+
font-size: 14px;
|
| 638 |
+
font-weight: 700;
|
| 639 |
+
}
|
| 640 |
+
|
| 641 |
+
.kcm-arch-detail {
|
| 642 |
+
margin-top: 8px;
|
| 643 |
+
font-size: 12px;
|
| 644 |
+
color: var(--text-secondary);
|
| 645 |
+
line-height: 1.55;
|
| 646 |
+
}
|
| 647 |
+
|
| 648 |
+
.kcm-failure-box {
|
| 649 |
+
margin-top: 10px;
|
| 650 |
+
padding: 10px 12px;
|
| 651 |
+
border-radius: 14px;
|
| 652 |
+
background: rgba(255, 128, 142, 0.08);
|
| 653 |
+
border: 1px solid rgba(255, 128, 142, 0.12);
|
| 654 |
+
color: var(--bad);
|
| 655 |
+
font-family: "JetBrains Mono", Consolas, monospace;
|
| 656 |
+
font-size: 12px;
|
| 657 |
+
white-space: pre-wrap;
|
| 658 |
+
max-height: 200px;
|
| 659 |
+
overflow-y: auto;
|
| 660 |
+
}
|
| 661 |
+
|
| 662 |
+
.kcm-empty {
|
| 663 |
+
padding: 16px 0;
|
| 664 |
+
color: var(--text-tertiary);
|
| 665 |
+
font-size: 14px;
|
| 666 |
+
}
|
| 667 |
+
|
| 668 |
+
@media (max-width: 1260px) {
|
| 669 |
+
.kcm-stats,
|
| 670 |
+
.kcm-meta,
|
| 671 |
+
.kcm-graphs {
|
| 672 |
+
grid-template-columns: repeat(2, minmax(0, 1fr));
|
| 673 |
+
}
|
| 674 |
+
}
|
| 675 |
+
|
| 676 |
+
@media (max-width: 900px) {
|
| 677 |
+
.kcm-stats,
|
| 678 |
+
.kcm-meta,
|
| 679 |
+
.kcm-graphs,
|
| 680 |
+
.kcm-arch-grid {
|
| 681 |
+
grid-template-columns: 1fr;
|
| 682 |
+
}
|
| 683 |
+
|
| 684 |
+
.kcm-toolbar,
|
| 685 |
+
.kcm-run-card-head,
|
| 686 |
+
.kcm-modal-header {
|
| 687 |
+
flex-direction: column;
|
| 688 |
+
align-items: stretch;
|
| 689 |
+
}
|
| 690 |
+
|
| 691 |
+
.kcm-search {
|
| 692 |
+
min-width: 0;
|
| 693 |
+
width: 100%;
|
| 694 |
+
}
|
| 695 |
+
}
|
| 696 |
+
"""
|
| 697 |
+
|
| 698 |
+
|
| 699 |
+
def _dt(value: datetime | None) -> str:
|
| 700 |
+
if not value:
|
| 701 |
+
return "n/a"
|
| 702 |
+
return value.astimezone(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
|
| 703 |
+
|
| 704 |
+
|
| 705 |
+
def _short_dt(value: datetime | None) -> str:
|
| 706 |
+
if not value:
|
| 707 |
+
return "Never"
|
| 708 |
+
return value.astimezone(timezone.utc).strftime("%b %d, %H:%M")
|
| 709 |
+
|
| 710 |
+
|
| 711 |
+
def _badge(label: str, kind: str) -> str:
|
| 712 |
+
css = {
|
| 713 |
+
"completed": "ok",
|
| 714 |
+
"uploading": "warn",
|
| 715 |
+
"running": "info",
|
| 716 |
+
"recent": "info",
|
| 717 |
+
"failed": "bad",
|
| 718 |
+
"cancelled": "bad",
|
| 719 |
+
"stalled": "warn",
|
| 720 |
+
"idle": "muted",
|
| 721 |
+
"success": "ok",
|
| 722 |
+
"not_started": "muted",
|
| 723 |
+
"skipped": "muted",
|
| 724 |
+
}.get(kind, "info")
|
| 725 |
+
return f'<span class="kcm-badge {css}">{html.escape(label)}</span>'
|
| 726 |
+
|
| 727 |
+
|
| 728 |
+
def _short_arch(arch: str) -> str:
|
| 729 |
+
return {
|
| 730 |
+
"x86_64-linux": "x86_64-linux",
|
| 731 |
+
"aarch64-linux": "aarch64-linux",
|
| 732 |
+
"x86_64-darwin": "x86_64-darwin",
|
| 733 |
+
"aarch64-darwin": "aarch64-darwin",
|
| 734 |
+
}.get(arch, arch)
|
| 735 |
+
|
| 736 |
+
|
| 737 |
+
def _variant_label(record: MonitorRecord) -> str:
    """Pick a short label describing which build variant a record represents.

    Tried in order:

    1. The first parenthesised list in the job name: its comma-separated
       items are trimmed, the first item goes through ``_short_arch``, and
       the items are joined with `` | ``.
    2. ``"manual upload"`` when the workflow name starts with "manual"
       (case-insensitive).
    3. The record's architecture, unless it is empty, ``"all"`` or ``"n/a"``.
    4. The raw job name, or ``"job"`` when that is empty.
    """
    found = VARIANT_RE.search(record.job.name)
    if found:
        pieces = []
        for raw in found.group(1).split(","):
            piece = raw.strip()
            if piece:
                pieces.append(piece)
        if pieces:
            return " | ".join([_short_arch(pieces[0]), *pieces[1:]])

    if record.workflow_name.lower().startswith("manual"):
        return "manual upload"

    arch = record.arch
    if arch and arch not in {"all", "n/a"}:
        return _short_arch(arch)

    return record.job.name or "job"
|
| 749 |
+
|
| 750 |
+
|
| 751 |
+
def _variant_chip(record: MonitorRecord) -> str:
    """Render one build variant as a small HTML card.

    The card shows the variant label with a phase badge, the upload status
    badge, and the runner group (``n/a`` when none is set). A record flagged
    as suspected-stalled gets the "stalled" badge colour in place of its
    phase colour; the badge text is still the phase label.
    """
    if record.suspected_stalled:
        phase_kind = "stalled"
    else:
        phase_kind = record.phase
    upload = _badge(record.upload_status_label, record.upload_status)
    name = html.escape(_variant_label(record))
    phase_badge = _badge(record.phase_label, phase_kind)
    runner = html.escape(record.runner_group or 'n/a')
    return f"""
    <div class="kcm-variant">
        <div class="kcm-variant-head">
            <div class="kcm-variant-name">{name}</div>
            {phase_badge}
        </div>
        <div class="kcm-variant-sub">Upload {upload}</div>
        <div class="kcm-variant-sub">Runner {runner}</div>
    </div>
    """
|
| 764 |
+
|
| 765 |
+
|
| 766 |
+
def _group_badges(group: KernelRunGroup) -> str:
    """Render the summary badges for a run group, separated by spaces.

    Exactly one headline badge comes first: "Running" if the group is
    active, else "Failed" if it has a failure, else "Completed". "Uploading"
    and "Stalled" badges follow when the corresponding flags are set.
    """
    if group.is_active:
        headline = ("Running", "running")
    elif group.has_failure:
        headline = ("Failed", "failed")
    else:
        headline = ("Completed", "completed")

    pills = [_badge(*headline)]
    if group.has_uploading:
        pills.append(_badge("Uploading", "uploading"))
    if group.has_stall:
        pills.append(_badge("Stalled", "stalled"))
    return " ".join(pills)
|
| 779 |
+
|
| 780 |
+
|
| 781 |
+
def _latest_group_for_workflow(row: KernelRow, workflow_path: str) -> KernelRunGroup | None:
|
| 782 |
+
return next((group for group in row.recent_groups if group.run.path == workflow_path), None)
|
| 783 |
+
|
| 784 |
+
|
| 785 |
+
def _workflow_cell(group: KernelRunGroup | None, empty_label: str) -> str:
|
| 786 |
+
if not group:
|
| 787 |
+
return f'<div class="kcm-subtle">{html.escape(empty_label)}</div>'
|
| 788 |
+
variant_stack = "".join(_variant_chip(record) for record in group.records)
|
| 789 |
+
return f"""
|
| 790 |
+
<div class="kcm-badges">{_group_badges(group)}</div>
|
| 791 |
+
<div class="kcm-subtle">{html.escape(group.run.display_title or group.run.name)}</div>
|
| 792 |
+
<div class="kcm-variant-stack" style="margin-top:10px">{variant_stack}</div>
|
| 793 |
+
"""
|
| 794 |
+
|
| 795 |
+
|
| 796 |
+
def _actions_cell(row: KernelRow, config: AppConfig) -> str:
    """Render the quick links shown in a kernel row's "Actions" column.

    Links to the matching release run and manual run are added when such a
    group exists for *row*, plus a Grafana overview link when Grafana is
    enabled in *config*. Falls back to a "No links" note when nothing applies.
    """
    workflow_links = (
        (".github/workflows/build-release.yaml", "Release run"),
        (".github/workflows/manual-build-upload.yaml", "Manual run"),
    )
    links: list[str] = []
    for workflow_path, caption in workflow_links:
        matched = _latest_group_for_workflow(row, workflow_path)
        if matched:
            href = html.escape(matched.run.html_url)
            links.append(f'<a class="kcm-action" href="{href}" target="_blank">{caption}</a>')

    grafana = config.grafana
    if grafana.enabled:
        href = html.escape(build_dashboard_url(grafana, grafana.overview_dashboard_uid, embed=False))
        links.append(f'<a class="kcm-action" href="{href}" target="_blank">Grafana</a>')

    if not links:
        return '<span class="kcm-subtle">No links</span>'
    return "".join(links)
|
| 814 |
+
|
| 815 |
+
|
| 816 |
+
def _render_kernel_row(row: KernelRow, idx: int, config: AppConfig) -> str:
    """Render one ``<tr>`` of the kernel table.

    The row carries ``data-*`` attributes (index, lower-cased kernel name,
    status kind, lower-cased workflow names) next to five cells: kernel
    identity, release workflow, manual workflow, latest activity and links.
    """
    release = _latest_group_for_workflow(row, ".github/workflows/build-release.yaml")
    manual = _latest_group_for_workflow(row, ".github/workflows/manual-build-upload.yaml")

    critical_badge = ""
    if row.critical:
        critical_badge = '<span class="kcm-badge critical">critical</span>'

    # Only workflows that actually have a run contribute to the data attribute.
    workflow_names = [found.workflow_name for found in (release, manual) if found is not None]
    workflow_filter = html.escape(" / ".join(workflow_names).lower())

    latest = row.primary_group
    if latest:
        headline = html.escape(latest.run.display_title or latest.run.name)
    else:
        headline = "No tracked run yet"

    if row.last_triggered_at:
        when = html.escape(_short_dt(row.last_triggered_at))
    else:
        when = "No activity"

    kernel_name = html.escape(row.kernel_name)
    backends = html.escape(", ".join(row.kernel_info.backends) or "backend metadata unavailable")
    release_cell = _workflow_cell(release, "No release workflow run found in the scanned history.")
    manual_cell = _workflow_cell(manual, "No manual upload run found in the scanned history.")
    return f"""
    <tr
      data-idx="{idx}"
      data-kernel="{html.escape(row.kernel_name.lower())}"
      data-status="{html.escape(row.row_status_kind)}"
      data-workflow="{workflow_filter}"
    >
      <td style="min-width:220px">
        <div class="kcm-kernel-name">{kernel_name} {critical_badge}</div>
        <div class="kcm-kernel-meta">{html.escape(row.kernel_info.repo_id)}</div>
        <div class="kcm-kernel-meta">{backends}</div>
      </td>
      <td style="min-width:360px">{release_cell}</td>
      <td style="min-width:280px">{manual_cell}</td>
      <td style="min-width:240px">
        <div class="kcm-badges">{_badge(row.row_status_label, row.row_status_kind)}</div>
        <div class="kcm-activity-sub">{headline}</div>
        <div class="kcm-activity-sub">{when}</div>
      </td>
      <td style="min-width:220px"><div class="kcm-actions">{_actions_cell(row, config)}</div></td>
    </tr>
    """
|
| 848 |
+
|
| 849 |
+
|
| 850 |
+
def _render_arch_card(record: MonitorRecord) -> str:
    """Render the detail card for a single record (one build variant / job).

    The card shows the variant label, a phase badge (forced to "stalled" when
    the record is suspected stalled), upload status, runner group, start and
    latest-signal timestamps and a link to the job. A stall reason and a
    failure excerpt are appended only when present on *record*.
    """
    phase_kind = "stalled" if record.suspected_stalled else record.phase
    stall_line = (
        f'<div class="kcm-arch-detail" style="color:var(--warn)">{html.escape(record.stall_reason or "")}</div>'
        if record.suspected_stalled
        else ""
    )
    failure = (
        f'<div class="kcm-failure-box">{html.escape(record.failure_excerpt)}</div>'
        if record.failure_excerpt
        else ""
    )
    # Timestamps are escaped like every other dynamic value in this card, so
    # the markup stays well-formed whatever text the formatter produces.
    started = html.escape(_dt(record.started_at))
    latest_signal = html.escape(_dt(record.latest_signal_at))
    return f"""
    <div class="kcm-arch-card">
      <div class="kcm-arch-head">
        <span class="kcm-arch-name">{html.escape(_variant_label(record))}</span>
        {_badge(record.phase_label, phase_kind)}
      </div>
      <div class="kcm-arch-detail">Upload {_badge(record.upload_status_label, record.upload_status)}</div>
      <div class="kcm-arch-detail">Runner {html.escape(record.runner_group or 'n/a')}</div>
      <div class="kcm-arch-detail">Started {started} | Latest signal {latest_signal}</div>
      <div class="kcm-arch-detail"><a href="{html.escape(record.job.html_url)}" target="_blank">Open job</a></div>
      {stall_line}
      {failure}
    </div>
    """
|
| 876 |
+
|
| 877 |
+
|
| 878 |
+
def _render_group(group: KernelRunGroup) -> str:
    """Render one run group as a card: run header plus one arch card per record.

    The header shows the run title (falling back to the run name), workflow
    name, branch, actor and trigger time, alongside the group's badges and a
    link to the Actions run.
    """
    arch_cards = "".join(_render_arch_card(record) for record in group.records)
    # Escaped for consistency with the other interpolated values in the header.
    triggered = html.escape(_dt(group.triggered_at))
    return f"""
    <div class="kcm-run-card">
      <div class="kcm-run-card-head">
        <div>
          <div class="kcm-run-card-title">{html.escape(group.run.display_title or group.run.name)}</div>
          <div class="kcm-run-card-meta">
            {html.escape(group.workflow_name)} | branch {html.escape(group.run.head_branch or 'n/a')} | actor {html.escape(group.run.actor_login or 'n/a')}<br>
            Triggered {triggered}
          </div>
        </div>
        <div>
          <div class="kcm-badges">{_group_badges(group)}</div>
          <div class="kcm-run-card-meta" style="margin-top:8px">
            <a href="{html.escape(group.run.html_url)}" target="_blank">Open Actions run</a>
          </div>
        </div>
      </div>
      <div class="kcm-arch-grid">{arch_cards}</div>
    </div>
    """
|
| 900 |
+
|
| 901 |
+
|
| 902 |
+
def _render_hidden_modal(row: KernelRow, idx: int, config: AppConfig) -> str:
    """Render the hidden (``display:none``) modal content for one kernel row.

    The container id is ``modal-content-{idx}``. Its body lists the latest
    release build, the latest manual upload and up to eight recent groups,
    or an empty-state note when the row has no tracked runs at all.
    """
    critical_badge = '<span class="kcm-badge critical">critical</span>' if row.critical else ""

    grafana_anchor = ""
    if config.grafana.enabled:
        overview = build_dashboard_url(config.grafana, config.grafana.overview_dashboard_uid, embed=False)
        grafana_anchor = f'<a href="{html.escape(overview)}" target="_blank" class="kcm-modal-close">Open Grafana</a>'

    headed_workflows = (
        ("Latest release build", ".github/workflows/build-release.yaml"),
        ("Latest manual upload", ".github/workflows/manual-build-upload.yaml"),
    )
    parts: list[str] = []
    for heading, workflow_path in headed_workflows:
        found = _latest_group_for_workflow(row, workflow_path)
        if found:
            parts.append(f'<h3 class="kcm-section-title">{heading}</h3>{_render_group(found)}')

    if row.recent_groups:
        history = "".join(_render_group(entry) for entry in row.recent_groups[:8])
        parts.append('<h3 class="kcm-section-title">Recent tracked runs</h3>' + history)

    if not parts:
        parts.append('<div class="kcm-empty">No tracked GitHub Actions runs found for this kernel yet.</div>')

    body_html = "".join(parts)
    backends = html.escape(", ".join(row.kernel_info.backends) or "No backend metadata")
    return f"""
    <div id="modal-content-{idx}" style="display:none">
      <div class="kcm-modal-header">
        <div>
          <h2>{html.escape(row.kernel_name)} {critical_badge}</h2>
          <p>{html.escape(row.kernel_info.repo_id)}</p>
          <p>{_badge(row.row_status_label, row.row_status_kind)} {backends}</p>
        </div>
        <div style="display:flex;gap:10px;flex-wrap:wrap">
          <a href="{html.escape(row.kernel_info.hub_url)}" target="_blank" class="kcm-modal-close">Open Hub repo</a>
          {grafana_anchor}
          <button class="kcm-modal-close">Close</button>
        </div>
      </div>
      <div class="kcm-modal-body">
        {body_html}
      </div>
    </div>
    """
|
| 943 |
+
|
| 944 |
+
|
| 945 |
+
def _render_graph_section(config: AppConfig) -> str:
    """Render the "Metrics + trends" section of the page.

    When Grafana is disabled, a static panel explains how to configure it.
    Otherwise every dashboard from ``dashboard_catalog`` gets a link card and
    an embedded iframe frame.
    """
    if not config.grafana.enabled:
        return """
    <section class="kcm-section">
      <h2 class="kcm-section-title">Metrics + trends</h2>
      <div class="kcm-panel-link">
        <div class="kcm-panel-label">Grafana not configured</div>
        <div class="kcm-panel-title">The live Actions table is active; the Grafana deck is ready to attach.</div>
        <div class="kcm-panel-copy">
          Set <code>KCM_GRAFANA_BASE_URL</code> on the Space once you have a public Grafana endpoint.
          The provisioning and Actions metrics emitter already live in <code>monitoring/</code> and
          <code>scripts/push_build_metrics.py</code>.
        </div>
      </div>
    </section>
    """

    card_parts: list[str] = []
    frame_parts: list[str] = []
    # Single pass: each URL is built once per dashboard, and the catalog is
    # consumed only once (safe even if it is ever returned as an iterator).
    for dashboard in dashboard_catalog(config.grafana):
        title = html.escape(dashboard.title)
        description = html.escape(dashboard.description)
        full_url = html.escape(build_dashboard_url(config.grafana, dashboard.uid, embed=False))
        embed_url = html.escape(build_dashboard_url(config.grafana, dashboard.uid, embed=True))
        # The height is escaped as well so no attribute value is emitted raw.
        height = html.escape(str(dashboard.height))
        card_parts.append(
            f"""
        <a class="kcm-panel-link" href="{full_url}" target="_blank">
          <div class="kcm-panel-label">Grafana</div>
          <div class="kcm-panel-title">{title}</div>
          <div class="kcm-panel-copy">{description}</div>
        </a>
        """
        )
        frame_parts.append(
            f"""
        <div class="kcm-frame">
          <div class="kcm-frame-head">
            <div>
              <div class="kcm-frame-title">{title}</div>
              <div class="kcm-frame-copy">{description}</div>
            </div>
            <a class="kcm-open" href="{full_url}" target="_blank">Open in Grafana</a>
          </div>
          <iframe src="{embed_url}" height="{height}" loading="lazy"></iframe>
        </div>
        """
        )

    cards = "".join(card_parts)
    embeds = "".join(frame_parts)
    return f"""
    <section class="kcm-section">
      <h2 class="kcm-section-title">Metrics + trends</h2>
      <div class="kcm-graphs">{cards}</div>
      {embeds}
    </section>
    """
|
| 994 |
+
|
| 995 |
+
|
| 996 |
+
def render_page(snapshot: DashboardSnapshot, config: AppConfig) -> str:
    """Render the whole dashboard page for *snapshot* as an HTML string.

    The page consists of the hero (meta cards and summary stats), the toolbar
    (refresh time, kernel count, up to three error messages, filter controls),
    the kernel table, the Grafana section, the modal overlay and one hidden
    modal payload per kernel row.
    """
    summary = snapshot.summary
    repo_slug = html.escape(config.github.repo_slug)

    # Values are pre-escaped here; the card template itself adds no escaping.
    scan_pages = html.escape(str(config.monitor.workflow_run_pages))
    scan_size = html.escape(str(config.monitor.workflow_run_page_size))
    meta_entries = (
        ("Source repo", repo_slug),
        ("GitHub scans", f"{scan_pages} pages x {scan_size} runs"),
        ("Grafana", html.escape(config.grafana.base_url or "not configured")),
    )
    meta_cards = "".join(
        f'<div class="kcm-meta-card"><div class="kcm-meta-card-label">{caption}</div>'
        f'<div class="kcm-meta-card-value">{content}</div></div>'
        for caption, content in meta_entries
    )

    stat_entries = (
        ("Kernels", summary.tracked_kernels),
        ("Active", summary.active_builds),
        ("Uploading", summary.uploading_builds),
        ("Stalled", summary.stalled_builds),
        ("Failed", summary.failed_builds),
    )
    stat_parts = []
    for caption, count in stat_entries:
        stat_parts.append(
            f'<div class="kcm-stat"><div class="kcm-stat-label">{caption}</div>'
            f'<div class="kcm-stat-value">{count}</div></div>'
        )
    stats = "".join(stat_parts)

    # A table row and its hidden modal share the same index.
    row_parts = []
    modal_parts = []
    for position, kernel_row in enumerate(snapshot.kernel_rows):
        row_parts.append(_render_kernel_row(kernel_row, position, config))
        modal_parts.append(_render_hidden_modal(kernel_row, position, config))
    rows_html = "".join(row_parts)
    modals_html = "".join(modal_parts)

    errors_html = ""
    if snapshot.errors:
        shown_errors = html.escape("; ".join(snapshot.errors[:3]))
        errors_html = f' | <span style="color:var(--bad)">{shown_errors}</span>'

    refreshed_at = html.escape(_dt(snapshot.generated_at))
    kernel_count = len(snapshot.kernel_rows)
    graphs_html = _render_graph_section(config)

    return f"""
    <div class="kcm-shell">
      <section class="kcm-hero">
        <div class="kcm-eyebrow">Kernels community observatory</div>
        <h1>Kernel CI Command Center.</h1>
        <p>
          Every kernel source directory in <code>{repo_slug}</code> is enumerated from the repo tree,
          then matched to its latest release and manual-upload GitHub Actions runs. Variant-level job status stays visible, and
          Grafana handles the longer-term duration and failure telemetry.
        </p>
        <div class="kcm-meta">{meta_cards}</div>
        <div class="kcm-stats">{stats}</div>
      </section>

      <div class="kcm-toolbar">
        <div class="kcm-toolbar-left">
          Refreshed <code>{refreshed_at}</code> | <code>{kernel_count}</code> kernels{errors_html}
        </div>
        <div class="kcm-toolbar-right">
          <input class="kcm-search" type="text" placeholder="Filter kernel or workflow..." />
          <select class="kcm-status-filter">
            <option value="all">All states</option>
            <option value="running">Running</option>
            <option value="uploading">Uploading</option>
            <option value="stalled">Stalled</option>
            <option value="failed">Failed</option>
            <option value="completed">Completed</option>
            <option value="idle">Idle</option>
          </select>
        </div>
      </div>

      <section class="kcm-table-shell">
        <div class="kcm-table-wrap">
          <table class="kcm-table" id="kernelTable">
            <thead>
              <tr>
                <th>Kernel dir</th>
                <th>Latest release build</th>
                <th>Latest manual upload</th>
                <th>Latest activity</th>
                <th>Actions</th>
              </tr>
            </thead>
            <tbody>{rows_html}</tbody>
          </table>
        </div>
      </section>

      {graphs_html}
    </div>
    <div class="kcm-overlay" id="kcmOverlay">
      <div class="kcm-modal" id="kcmModal"></div>
    </div>
    {modals_html}
    """
|
| 1076 |
+
|
| 1077 |
+
|
| 1078 |
+
LOADING_HTML = """
|
| 1079 |
+
<div class="kcm-shell">
|
| 1080 |
+
<section class="kcm-hero">
|
| 1081 |
+
<div class="kcm-eyebrow">Kernels community observatory</div>
|
| 1082 |
+
<h1>Booting the kernel CI command center...</h1>
|
| 1083 |
+
<p>The first load walks the kernel catalog and scans the latest GitHub Actions runs, so it can take a few seconds.</p>
|
| 1084 |
+
</section>
|
| 1085 |
+
</div>
|
| 1086 |
+
"""
|
| 1087 |
+
|
| 1088 |
+
|
| 1089 |
+
def build_dashboard(service: MonitorService, config: AppConfig) -> gr.Blocks:
    """Assemble the Gradio Blocks app around the rendered dashboard page.

    The app holds a refresh button, an HTML component (initially the loading
    placeholder), a "loaded" state flag and a timer. Both the button and the
    timer tick re-render the page and re-arm the timer with the configured
    refresh interval; the button always forces a refresh, the tick forces one
    only while the page has not been loaded yet.
    """
    with gr.Blocks() as demo:
        # The timer starts at 8 seconds and is re-armed by every callback below.
        refresh_timer = gr.Timer(value=8, active=True)
        loaded_state = gr.State(False)

        with gr.Row():
            refresh_btn = gr.Button("Refresh now", variant="primary", scale=0, min_width=160)

        page_html = gr.HTML(value=LOADING_HTML)

        def _render_outputs(force: bool):
            """Fetch a snapshot and build the (html, loaded flag, timer) outputs."""
            snapshot = service.get_snapshot(force_refresh=force)
            markup = render_page(snapshot, config)
            rearmed = gr.Timer(value=config.monitor.refresh_interval_seconds, active=True)
            return markup, True, rearmed

        def refresh(_=None):
            return _render_outputs(True)

        def tick_refresh(loaded):
            return _render_outputs(not loaded)

        targets = [page_html, loaded_state, refresh_timer]
        refresh_btn.click(refresh, outputs=targets)
        refresh_timer.tick(tick_refresh, inputs=[loaded_state], outputs=targets)

    return demo
|
tests/conftest.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import sys
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
# Make the ``src`` directory importable for the tests: it is resolved from
# this file's grandparent directory and put first on ``sys.path`` (once).
ROOT_DIR = Path(__file__).resolve().parents[1]
SRC_DIR = ROOT_DIR.joinpath("src")
if not any(entry == str(SRC_DIR) for entry in sys.path):
    sys.path.insert(0, str(SRC_DIR))
|
tests/fixtures/active_build_job.json
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"id": 66947931664,
|
| 3 |
+
"run_id": 23049830725,
|
| 4 |
+
"workflow_name": "Build Release",
|
| 5 |
+
"head_branch": "tiny-build-fix",
|
| 6 |
+
"run_url": "https://api.github.com/repos/huggingface/kernels-community/actions/runs/23049830725",
|
| 7 |
+
"run_attempt": 1,
|
| 8 |
+
"head_sha": "ca745cc4e08039817fc47d780f7dd3126187a6d6",
|
| 9 |
+
"url": "https://api.github.com/repos/huggingface/kernels-community/actions/jobs/66947931664",
|
| 10 |
+
"html_url": "https://github.com/huggingface/kernels-community/actions/runs/23049830725/job/66947931664",
|
| 11 |
+
"status": "in_progress",
|
| 12 |
+
"conclusion": null,
|
| 13 |
+
"created_at": "2026-03-22T10:00:00Z",
|
| 14 |
+
"started_at": "2026-03-22T10:00:10Z",
|
| 15 |
+
"completed_at": null,
|
| 16 |
+
"name": "build-kernel (aarch64-linux, aws-r8g-8xl-plus-nix)",
|
| 17 |
+
"steps": [
|
| 18 |
+
{
|
| 19 |
+
"name": "Set up job",
|
| 20 |
+
"status": "completed",
|
| 21 |
+
"conclusion": "success",
|
| 22 |
+
"number": 1,
|
| 23 |
+
"started_at": "2026-03-22T10:00:10Z",
|
| 24 |
+
"completed_at": "2026-03-22T10:00:12Z"
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"name": "Validate kernel directory",
|
| 28 |
+
"status": "completed",
|
| 29 |
+
"conclusion": "success",
|
| 30 |
+
"number": 6,
|
| 31 |
+
"started_at": "2026-03-22T10:00:30Z",
|
| 32 |
+
"completed_at": "2026-03-22T10:00:31Z"
|
| 33 |
+
},
|
| 34 |
+
{
|
| 35 |
+
"name": "Build and upload kernel",
|
| 36 |
+
"status": "in_progress",
|
| 37 |
+
"conclusion": null,
|
| 38 |
+
"number": 7,
|
| 39 |
+
"started_at": "2026-03-22T10:01:00Z",
|
| 40 |
+
"completed_at": null
|
| 41 |
+
}
|
| 42 |
+
],
|
| 43 |
+
"runner_name": "aws-r8g-8xl-plus-nix-runner",
|
| 44 |
+
"runner_group_name": "aws-r8g-8xl-plus-nix"
|
| 45 |
+
}
|
tests/fixtures/build_release_run.json
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"id": 23049830725,
|
| 3 |
+
"name": "Build Release",
|
| 4 |
+
"display_title": "sgl-flash-attn3: upload path sanity",
|
| 5 |
+
"path": ".github/workflows/build-release.yaml",
|
| 6 |
+
"status": "in_progress",
|
| 7 |
+
"conclusion": null,
|
| 8 |
+
"head_branch": "tiny-build-fix",
|
| 9 |
+
"head_sha": "ca745cc4e08039817fc47d780f7dd3126187a6d6",
|
| 10 |
+
"event": "pull_request",
|
| 11 |
+
"html_url": "https://github.com/huggingface/kernels-community/actions/runs/23049830725",
|
| 12 |
+
"jobs_url": "https://api.github.com/repos/huggingface/kernels-community/actions/runs/23049830725/jobs",
|
| 13 |
+
"created_at": "2026-03-22T10:00:00Z",
|
| 14 |
+
"updated_at": "2026-03-22T14:20:00Z",
|
| 15 |
+
"run_started_at": "2026-03-22T10:00:00Z",
|
| 16 |
+
"actor": {
|
| 17 |
+
"login": "adarshxs"
|
| 18 |
+
}
|
| 19 |
+
}
|
tests/fixtures/failed_build_job.json
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"id": 66947931666,
|
| 3 |
+
"run_id": 23049830726,
|
| 4 |
+
"workflow_name": "Build Release",
|
| 5 |
+
"head_branch": "repo-id-bug",
|
| 6 |
+
"run_url": "https://api.github.com/repos/huggingface/kernels-community/actions/runs/23049830726",
|
| 7 |
+
"run_attempt": 1,
|
| 8 |
+
"head_sha": "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb",
|
| 9 |
+
"url": "https://api.github.com/repos/huggingface/kernels-community/actions/jobs/66947931666",
|
| 10 |
+
"html_url": "https://github.com/huggingface/kernels-community/actions/runs/23049830726/job/66947931666",
|
| 11 |
+
"status": "completed",
|
| 12 |
+
"conclusion": "failure",
|
| 13 |
+
"created_at": "2026-03-21T10:00:00Z",
|
| 14 |
+
"started_at": "2026-03-21T10:00:10Z",
|
| 15 |
+
"completed_at": "2026-03-21T10:26:08Z",
|
| 16 |
+
"name": "build-kernel (aarch64-linux, aws-r8g-8xl-plus-nix)",
|
| 17 |
+
"steps": [
|
| 18 |
+
{
|
| 19 |
+
"name": "Set up job",
|
| 20 |
+
"status": "completed",
|
| 21 |
+
"conclusion": "success",
|
| 22 |
+
"number": 1,
|
| 23 |
+
"started_at": "2026-03-21T10:00:10Z",
|
| 24 |
+
"completed_at": "2026-03-21T10:00:12Z"
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"name": "Validate kernel directory",
|
| 28 |
+
"status": "completed",
|
| 29 |
+
"conclusion": "success",
|
| 30 |
+
"number": 6,
|
| 31 |
+
"started_at": "2026-03-21T10:00:30Z",
|
| 32 |
+
"completed_at": "2026-03-21T10:00:31Z"
|
| 33 |
+
},
|
| 34 |
+
{
|
| 35 |
+
"name": "Build and upload kernel",
|
| 36 |
+
"status": "completed",
|
| 37 |
+
"conclusion": "failure",
|
| 38 |
+
"number": 7,
|
| 39 |
+
"started_at": "2026-03-21T10:01:00Z",
|
| 40 |
+
"completed_at": "2026-03-21T10:26:08Z"
|
| 41 |
+
}
|
| 42 |
+
],
|
| 43 |
+
"runner_name": "aws-r8g-8xl-plus-nix-runner",
|
| 44 |
+
"runner_group_name": "aws-r8g-8xl-plus-nix"
|
| 45 |
+
}
|
tests/fixtures/failed_build_run.json
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"id": 23049830726,
|
| 3 |
+
"name": "Build Release",
|
| 4 |
+
"display_title": "sgl-flash-attn3: repo id regression",
|
| 5 |
+
"path": ".github/workflows/build-release.yaml",
|
| 6 |
+
"status": "completed",
|
| 7 |
+
"conclusion": "failure",
|
| 8 |
+
"head_branch": "repo-id-bug",
|
| 9 |
+
"head_sha": "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb",
|
| 10 |
+
"event": "pull_request",
|
| 11 |
+
"html_url": "https://github.com/huggingface/kernels-community/actions/runs/23049830726",
|
| 12 |
+
"jobs_url": "https://api.github.com/repos/huggingface/kernels-community/actions/runs/23049830726/jobs",
|
| 13 |
+
"created_at": "2026-03-21T10:00:00Z",
|
| 14 |
+
"updated_at": "2026-03-21T10:26:08Z",
|
| 15 |
+
"run_started_at": "2026-03-21T10:00:00Z",
|
| 16 |
+
"actor": {
|
| 17 |
+
"login": "adarshxs"
|
| 18 |
+
}
|
| 19 |
+
}
|
tests/fixtures/manual_build_run.json
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"id": 23049830727,
|
| 3 |
+
"name": "Manual Kernel Build",
|
| 4 |
+
"display_title": "Manual Kernel Build / flash-attn3 / target=main / request=manual",
|
| 5 |
+
"path": ".github/workflows/manual-build-upload.yaml",
|
| 6 |
+
"status": "completed",
|
| 7 |
+
"conclusion": "success",
|
| 8 |
+
"head_branch": "manual-test",
|
| 9 |
+
"head_sha": "cccccccccccccccccccccccccccccccccccccccc",
|
| 10 |
+
"event": "workflow_dispatch",
|
| 11 |
+
"html_url": "https://github.com/huggingface/kernels-community/actions/runs/23049830727",
|
| 12 |
+
"jobs_url": "https://api.github.com/repos/huggingface/kernels-community/actions/runs/23049830727/jobs",
|
| 13 |
+
"created_at": "2026-03-21T14:00:00Z",
|
| 14 |
+
"updated_at": "2026-03-21T15:01:00Z",
|
| 15 |
+
"run_started_at": "2026-03-21T14:00:00Z",
|
| 16 |
+
"actor": {
|
| 17 |
+
"login": "adarshxs"
|
| 18 |
+
}
|
| 19 |
+
}
|
tests/fixtures/manual_upload_job.json
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"id": 66947931668,
|
| 3 |
+
"run_id": 23049830727,
|
| 4 |
+
"workflow_name": "Manual Kernel Build",
|
| 5 |
+
"head_branch": "manual-test",
|
| 6 |
+
"run_url": "https://api.github.com/repos/huggingface/kernels-community/actions/runs/23049830727",
|
| 7 |
+
"run_attempt": 1,
|
| 8 |
+
"head_sha": "cccccccccccccccccccccccccccccccccccccccc",
|
| 9 |
+
"url": "https://api.github.com/repos/huggingface/kernels-community/actions/jobs/66947931668",
|
| 10 |
+
"html_url": "https://github.com/huggingface/kernels-community/actions/runs/23049830727/job/66947931668",
|
| 11 |
+
"status": "completed",
|
| 12 |
+
"conclusion": "success",
|
| 13 |
+
"created_at": "2026-03-21T14:00:00Z",
|
| 14 |
+
"started_at": "2026-03-21T14:00:10Z",
|
| 15 |
+
"completed_at": "2026-03-21T15:01:00Z",
|
| 16 |
+
"name": "build-and-upload",
|
| 17 |
+
"steps": [
|
| 18 |
+
{
|
| 19 |
+
"name": "Set up job",
|
| 20 |
+
"status": "completed",
|
| 21 |
+
"conclusion": "success",
|
| 22 |
+
"number": 1,
|
| 23 |
+
"started_at": "2026-03-21T14:00:10Z",
|
| 24 |
+
"completed_at": "2026-03-21T14:00:12Z"
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"name": "Validate kernel directory",
|
| 28 |
+
"status": "completed",
|
| 29 |
+
"conclusion": "success",
|
| 30 |
+
"number": 6,
|
| 31 |
+
"started_at": "2026-03-21T14:00:30Z",
|
| 32 |
+
"completed_at": "2026-03-21T14:00:31Z"
|
| 33 |
+
},
|
| 34 |
+
{
|
| 35 |
+
"name": "Build and copy kernel",
|
| 36 |
+
"status": "completed",
|
| 37 |
+
"conclusion": "success",
|
| 38 |
+
"number": 7,
|
| 39 |
+
"started_at": "2026-03-21T14:01:00Z",
|
| 40 |
+
"completed_at": "2026-03-21T14:45:00Z"
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
"name": "Upload kernel",
|
| 44 |
+
"status": "completed",
|
| 45 |
+
"conclusion": "success",
|
| 46 |
+
"number": 8,
|
| 47 |
+
"started_at": "2026-03-21T14:45:10Z",
|
| 48 |
+
"completed_at": "2026-03-21T15:01:00Z"
|
| 49 |
+
}
|
| 50 |
+
],
|
| 51 |
+
"runner_name": "aws-highmemory-32-plus-nix-runner",
|
| 52 |
+
"runner_group_name": "aws-highmemory-32-plus-nix"
|
| 53 |
+
}
|
tests/test_grafana.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from kc_monitor.config import GrafanaSettings
|
| 4 |
+
from kc_monitor.grafana import build_dashboard_url, dashboard_catalog
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def test_dashboard_catalog_uses_configured_uids() -> None:
    """The catalog lists the overview, duration and failure UIDs from settings, in that order."""
    expected_uids = ["overview-uid", "durations-uid", "failures-uid"]
    overview_uid, duration_uid, failure_uid = expected_uids
    settings = GrafanaSettings(
        base_url="https://grafana.example.com",
        overview_dashboard_uid=overview_uid,
        duration_dashboard_uid=duration_uid,
        failure_dashboard_uid=failure_uid,
    )

    actual_uids = [entry.uid for entry in dashboard_catalog(settings)]

    assert actual_uids == expected_uids
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def test_build_dashboard_url_supports_embed_mode() -> None:
    """Embed URLs equal the full dashboard URL plus a trailing ``kiosk=tv`` parameter."""
    settings = GrafanaSettings(
        base_url="https://grafana.example.com/",
        org_id=7,
        theme="light",
        default_from="now-7d",
        default_to="now",
        default_refresh="30s",
    )
    # The trailing slash of base_url must not be doubled in the result.
    expected_full = (
        "https://grafana.example.com/d/overview-uid/_?"
        "orgId=7&from=now-7d&to=now&theme=light&refresh=30s"
    )

    embed_url = build_dashboard_url(settings, "overview-uid", embed=True)
    full_url = build_dashboard_url(settings, "overview-uid", embed=False)

    assert embed_url == expected_full + "&kiosk=tv"
    assert full_url == expected_full
|
tests/test_log_parser.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
from kc_monitor.log_parser import JobLogParser
|
| 7 |
+
from kc_monitor.models import GitHubJob, GitHubRun
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
# Directory of test fixtures, located next to this test module.
FIXTURES_DIR = Path(__file__).parent / "fixtures"
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def load_json_fixture(name: str) -> dict:
    """Read the UTF-8 fixture file *name* from ``FIXTURES_DIR`` and parse it as JSON."""
    fixture_path = FIXTURES_DIR / name
    with fixture_path.open(encoding="utf-8") as handle:
        return json.load(handle)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def load_text_fixture(name: str) -> str:
    """Return the UTF-8 text content of the fixture file *name* in ``FIXTURES_DIR``."""
    fixture_path = FIXTURES_DIR.joinpath(name)
    with fixture_path.open(encoding="utf-8") as handle:
        return handle.read()
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def test_parser_detects_upload_in_progress_from_combined_step() -> None:
    """A running build-release job whose log is in the upload stage parses as uploading."""
    run_payload = load_json_fixture("build_release_run.json")
    job_payload = load_json_fixture("active_build_job.json")
    log_text = load_text_fixture("running_build_upload.log")

    parser = JobLogParser()
    parsed = parser.parse(GitHubRun.from_api(run_payload), GitHubJob.from_api(job_payload), log_text)

    observed = (parsed.phase, parsed.upload_status, parsed.repo_id)
    assert observed == ("uploading", "running", "kernels-community/sgl-flash-attn3")
    assert parsed.latest_log_at is not None
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def test_parser_keeps_upload_not_started_when_build_fails_first() -> None:
    """A job that fails during the build reports a failed phase and an upload that never started."""
    run_payload = load_json_fixture("failed_build_run.json")
    job_payload = load_json_fixture("failed_build_job.json")
    log_text = load_text_fixture("failed_build.log")

    parser = JobLogParser()
    parsed = parser.parse(GitHubRun.from_api(run_payload), GitHubJob.from_api(job_payload), log_text)

    assert (parsed.phase, parsed.upload_status) == ("failed", "not_started")
    # The excerpt may be None; treat that as an empty string for the containment check.
    excerpt = parsed.failure_excerpt or ""
    assert "Mandatory repo-id is missing" in excerpt
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def test_parser_marks_manual_upload_as_completed() -> None:
    """A successful manual-upload job parses as upload-complete with the expected repo id."""
    run_payload = load_json_fixture("manual_build_run.json")
    job_payload = load_json_fixture("manual_upload_job.json")
    log_text = load_text_fixture("manual_upload_success.log")

    parser = JobLogParser()
    parsed = parser.parse(GitHubRun.from_api(run_payload), GitHubJob.from_api(job_payload), log_text)

    observed = (parsed.phase, parsed.upload_status, parsed.repo_id)
    assert observed == ("upload_complete", "completed", "kernels-community/flash-attn3")
|
tests/test_metrics_push.py
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from kc_monitor.metrics_push import (
|
| 4 |
+
BuildMetricSample,
|
| 5 |
+
build_pushgateway_url,
|
| 6 |
+
format_prometheus_metrics,
|
| 7 |
+
)
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def test_build_metric_sample_uses_matrix_labels_and_duration() -> None:
    """Check the fields ``BuildMetricSample.from_env`` derives from an environment mapping.

    The mapping supplies a start time of ``100`` and the call supplies a
    completion time of ``145``; the test expects a duration of ``45.0``.
    """
    env = {
        "KCM_JOB_STATUS": "failure",
        "KCM_JOB_STARTED_AT": "100",
        "KCM_KERNEL": "flash-attn3",
        "KCM_BACKEND": "cuda",
        "KCM_COMPUTE_BACKEND": "triton",
        "KCM_CUDA_VERSION": "12.4",
        "KCM_PYTORCH_VERSION": "2.5.1",
        "KCM_PYTHON_VERSION": "3.11",
        "GITHUB_REPOSITORY": "huggingface/kernels-community",
        "GITHUB_WORKFLOW": "Build Release",
        "GITHUB_REF_NAME": "main",
        "GITHUB_JOB": "build_kernel",
        "RUNNER_OS": "Linux",
        "RUNNER_ARCH": "X64",
    }
    # Only the build-matrix dimensions are expected in the grouping key.
    expected_grouping_key = {
        "kernel": "flash-attn3",
        "backend": "cuda",
        "compute_backend": "triton",
        "cuda_version": "12.4",
        "pytorch_version": "2.5.1",
        "python_version": "3.11",
    }

    sample = BuildMetricSample.from_env(env, completed_at_seconds=145)

    assert sample.grouping_key == expected_grouping_key
    assert sample.metric_labels["repository"] == "huggingface/kernels-community"
    assert sample.result == "failure"
    assert sample.result_code == 2
    assert sample.failed == 1
    assert sample.duration_seconds == 45.0
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def test_build_pushgateway_url_is_stable_per_matrix_combo() -> None:
    """Check the Pushgateway URL built from a base URL, a job name and a grouping key.

    The expected URL places the job name after ``/metrics/job/`` and then
    each grouping-key entry as a ``name/value`` path pair, in mapping order.
    """
    gateway_base = "http://pushgateway:9091"
    job_name = "kernels-community-build-matrix"
    grouping_key = {
        "kernel": "flash-attn3",
        "backend": "cuda",
        "compute_backend": "triton",
        "cuda_version": "12.4",
        "pytorch_version": "2.5.1",
        "python_version": "3.11",
    }
    expected_url = (
        "http://pushgateway:9091/metrics/job/kernels-community-build-matrix/"
        "kernel/flash-attn3/backend/cuda/compute_backend/triton/cuda_version/12.4/"
        "pytorch_version/2.5.1/python_version/3.11"
    )

    url = build_pushgateway_url(gateway_base, job_name, grouping_key)

    assert url == expected_url
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def test_prometheus_payload_contains_expected_metrics() -> None:
    """Check the text payload produced by ``format_prometheus_metrics`` for a successful build.

    The payload must mention every ``kc_build_last_run_*`` metric name, the
    ``result="success"`` label, the duration rendered as ``12.500`` and the
    completion timestamp.
    """
    env = {
        "KCM_JOB_STATUS": "success",
        "KCM_BUILD_DURATION_SECONDS": "12.5",
        "KCM_KERNEL": "flash-attn3",
        "KCM_BACKEND": "cuda",
        "KCM_COMPUTE_BACKEND": "triton",
        "KCM_CUDA_VERSION": "12.4",
        "KCM_PYTORCH_VERSION": "2.5.1",
        "KCM_PYTHON_VERSION": "3.11",
        "GITHUB_REPOSITORY": "huggingface/kernels-community",
        "GITHUB_WORKFLOW": "Build Release",
        "GITHUB_REF_NAME": "main",
        "GITHUB_JOB": "build_kernel",
        "RUNNER_OS": "Linux",
        "RUNNER_ARCH": "X64",
    }
    sample = BuildMetricSample.from_env(env, completed_at_seconds=1700000000)

    payload = format_prometheus_metrics(sample)

    # Checked in the same order as the individual assertions they replace.
    expected_fragments = (
        "kc_build_last_run_result_code",
        "kc_build_last_run_failed",
        "kc_build_last_run_duration_seconds",
        "kc_build_last_run_timestamp_seconds",
        'result="success"',
        "12.500",
        "1700000000",
    )
    for fragment in expected_fragments:
        assert fragment in payload
|
tests/test_service.py
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
from kc_monitor.config import AppConfig
|
| 7 |
+
from kc_monitor.models import GitHubJob, GitHubRun
|
| 8 |
+
from kc_monitor.service import MonitorService
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
FIXTURES_DIR = Path(__file__).parent / "fixtures"
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def load_json_fixture(name: str) -> dict:
    """Read the fixture file ``name`` as UTF-8 text and return it decoded from JSON."""
    fixture_path = FIXTURES_DIR / name
    raw_text = fixture_path.read_text(encoding="utf-8")
    return json.loads(raw_text)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def load_text_fixture(name: str) -> str:
    """Return the contents of the fixture file ``name``, read as UTF-8 text."""
    fixture_path = FIXTURES_DIR / name
    return fixture_path.read_text(encoding="utf-8")
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class FakeGitHubClient:
    """In-memory stand-in for the GitHub client used by the service tests.

    Runs, jobs and logs are loaded from the fixture files; the ``build.toml``
    contents and the repository tree are hard-coded in ``__init__``. No
    network access takes place.
    """

    def __init__(self) -> None:
        self.runs = [
            GitHubRun.from_api(load_json_fixture("build_release_run.json")),
            GitHubRun.from_api(load_json_fixture("failed_build_run.json")),
            GitHubRun.from_api(load_json_fixture("manual_build_run.json")),
        ]
        # Jobs keyed by run id (looked up by ``list_jobs``).
        self.jobs = {
            23049830725: [GitHubJob.from_api(load_json_fixture("active_build_job.json"))],
            23049830726: [GitHubJob.from_api(load_json_fixture("failed_build_job.json"))],
            23049830727: [GitHubJob.from_api(load_json_fixture("manual_upload_job.json"))],
        }
        # Log text keyed by job id (looked up by ``get_job_logs``).
        self.logs = {
            66947931664: load_text_fixture("running_build_upload.log"),
            66947931666: load_text_fixture("failed_build.log"),
            66947931668: load_text_fixture("manual_upload_success.log"),
        }
        # File contents keyed by repository path (looked up by ``get_file_text``).
        self.build_toml = {
            "sgl-flash-attn3/build.toml": """
[general]
name = "sgl-flash-attn3"
version = 1
backends = ["cuda"]

[general.hub]
repo-id = "kernels-community/sgl-flash-attn3"
""".strip(),
            "flash-attn3/build.toml": """
[general]
name = "flash-attn3"
version = 1
backends = ["cuda"]

[general.hub]
repo-id = "kernels-community/flash-attn3"
""".strip(),
        }
        # "deep-gemm/build.toml" is listed here but has no entry in
        # ``build_toml``, so ``get_file_text`` returns None for it.
        self.tree_paths = [
            "sgl-flash-attn3/build.toml",
            "flash-attn3/build.toml",
            "deep-gemm/build.toml",
        ]

    @staticmethod
    def _page_slice(items: list, per_page: int, page: int) -> list:
        """Return the 1-indexed ``page`` of ``items`` with ``per_page`` entries per page.

        Pages past the end yield an empty list; a ``page`` below 1 is treated
        as page 1.
        """
        start = max(page - 1, 0) * per_page
        return items[start:start + per_page]

    def close(self) -> None:
        """Do nothing; the fake holds no resources."""
        return None

    def list_runs(self, per_page: int = 30, page: int = 1) -> list[GitHubRun]:
        """Return the requested page of all fixture runs.

        Honors ``page`` so that a caller paging past the last run receives an
        empty list instead of the first page again.
        """
        return self._page_slice(self.runs, per_page, page)

    def list_workflow_runs(
        self,
        workflow_file: str,
        per_page: int = 30,
        page: int = 1,
    ) -> list[GitHubRun]:
        """Return the requested page of runs whose ``path`` ends with ``workflow_file``."""
        matching = [r for r in self.runs if r.path.endswith(workflow_file)]
        return self._page_slice(matching, per_page, page)

    def list_jobs(self, run_id: int) -> list[GitHubJob]:
        """Return the fixture jobs for ``run_id``; raises ``KeyError`` for an unknown run."""
        return self.jobs[run_id]

    def get_job_logs(
        self,
        job_id: int,
        line_limit: int = 400,
        char_limit: int = 35000,
        job_html_url: str | None = None,
    ) -> str:
        """Return the full fixture log for ``job_id``; raises ``KeyError`` for an unknown job.

        ``line_limit``, ``char_limit`` and ``job_html_url`` are accepted but
        ignored, so the log is never truncated here.
        """
        return self.logs[job_id]

    def get_file_text(self, path: str, ref: str | None = None) -> str | None:
        """Return the hard-coded contents for ``path``, or None when absent; ``ref`` is ignored."""
        return self.build_toml.get(path)

    def list_repo_tree_paths(self, ref: str = "main") -> list[str]:
        """Return the hard-coded repository paths; ``ref`` is ignored."""
        return self.tree_paths
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def test_service_builds_summary_and_records() -> None:
    """Build a snapshot from the fake client and check its summary, records and kernel rows."""
    workflows = [
        {
            "path": ".github/workflows/build-release.yaml",
            "label": "Build Release",
            "enabled": True,
        },
        {
            "path": ".github/workflows/manual-build-upload.yaml",
            "label": "Manual Kernel Build",
            "enabled": True,
        },
    ]
    github_settings = {
        "owner": "huggingface",
        "repo": "kernels-community",
        "branch": "main",
        "per_page": 10,
        "workflows": workflows,
    }
    monitor_settings = {
        "recent_completed_hours": 400,
        "critical_kernels": ["flash-attn3", "sgl-flash-attn3"],
        "snapshot_ttl_seconds": 1,
    }
    config = AppConfig.model_validate(
        {"github": github_settings, "monitor": monitor_settings}
    )

    fake_client = FakeGitHubClient()
    service = MonitorService(config, client=fake_client)
    snapshot = service.get_snapshot(force_refresh=True)

    # Summary counters.
    assert snapshot.summary.active_builds == 1
    assert snapshot.summary.completed_uploads == 1
    assert snapshot.summary.failed_builds == 1
    assert snapshot.summary.uploading_builds == 1
    assert snapshot.summary.tracked_kernels == 3

    # Record and row collections.
    assert len(snapshot.active_records) == 1
    assert len(snapshot.kernel_rows) == 3
    first_active = snapshot.active_records[0]
    assert first_active.kernel_name == "sgl-flash-attn3"
    assert first_active.critical is True
    assert snapshot.kernel_rows[0].kernel_name == "sgl-flash-attn3"
    assert snapshot.kernel_rows[-1].kernel_name == "deep-gemm"

    # Recent records must include one completed upload and one failure.
    assert any(rec.upload_status == "completed" for rec in snapshot.recent_records)
    assert any(rec.phase == "failed" for rec in snapshot.recent_records)
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
def test_service_normalizes_public_jobs_without_steps() -> None:
    """Check that ``_normalize_job`` yields one named step for a job whose step list is empty."""
    release_run = GitHubRun.from_api(load_json_fixture("build_release_run.json"))
    active_job = GitHubJob.from_api(load_json_fixture("active_build_job.json"))
    # Clear the fixture's steps so the job reaches the normalizer with none.
    active_job.steps = []

    normalized = MonitorService._normalize_job(release_run, active_job)

    step_names = [step.name for step in normalized.steps]
    assert step_names == ["Build and upload kernel"]
|