延续 PP-04 MVP,补全可观测性闭环:
- grafana/provisioning/dashboards/json/hms-overview.json: HMS 概览 dashboard
(服务状态/DB 连接池/EventBus 积压/内存 CPU/API 5xx 错误率,基于 app metrics)
- postgres-exporter + redis-exporter 服务: 之前 prometheus.yml 配了 target 但
服务未部署(pg_stat_activity/redis_memory 等告警永不触发),现补齐
- alertmanager 启用 --config.expand-env: 支持渠道 token 用 ${VAR} 从 .env 注入
(避免重蹈 PP-03 Redis 密码明文入 git 覆辙)
- alertmanager/README.md: 钉钉/企微/邮件渠道配置文档(上线前填)
nginx-exporter 跳过(alerts.yml 无 nginx 规则 + 需改 nginx.conf 配 stub_status)
116 lines
3.8 KiB
JSON
116 lines
3.8 KiB
JSON
{
  "uid": "hms-overview",
  "title": "HMS 概览",
  "tags": ["HMS", "overview"],
  "timezone": "browser",
  "schemaVersion": 39,
  "version": 1,
  "refresh": "30s",
  "time": { "from": "now-1h", "to": "now" },
  "panels": [
    {
      "id": 1,
      "type": "stat",
      "title": "HMS 服务状态",
      "gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 },
      "datasource": { "type": "prometheus", "uid": "prometheus" },
      "targets": [{ "refId": "A", "expr": "up{job=\"hms\"}", "legendFormat": "" }],
      "options": {
        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
        "colorMode": "background"
      },
      "fieldConfig": {
        "defaults": {
          "mappings": [
            {
              "type": "value",
              "options": {
                "0": { "text": "DOWN", "color": "red", "index": 0 },
                "1": { "text": "UP", "color": "green", "index": 1 }
              }
            }
          ]
        },
        "overrides": []
      }
    },
    {
      "id": 2,
      "type": "stat",
      "title": "EventBus 积压 (pending)",
      "gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 },
      "datasource": { "type": "prometheus", "uid": "prometheus" },
      "targets": [{ "refId": "A", "expr": "eventbus_pending_total", "legendFormat": "" }],
      "options": {
        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
        "colorMode": "value"
      },
      "fieldConfig": {
        "defaults": {
          "thresholds": {
            "mode": "absolute",
            "steps": [
              { "color": "green", "value": null },
              { "color": "yellow", "value": 100 },
              { "color": "red", "value": 500 }
            ]
          }
        },
        "overrides": []
      }
    },
    {
      "id": 3,
      "type": "stat",
      "title": "API 5xx 错误率 (5m)",
      "gridPos": { "h": 4, "w": 12, "x": 12, "y": 0 },
      "datasource": { "type": "prometheus", "uid": "prometheus" },
      "targets": [
        {
          "refId": "A",
          "expr": "sum(rate(http_requests_total{status=~\"5..\"}[5m])) / clamp_min(sum(rate(http_requests_total[5m])), 1)",
          "legendFormat": "5xx ratio"
        }
      ],
      "options": {
        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
        "colorMode": "background"
      },
      "fieldConfig": {
        "defaults": {
          "unit": "percentunit",
          "thresholds": {
            "mode": "absolute",
            "steps": [
              { "color": "green", "value": null },
              { "color": "yellow", "value": 0.01 },
              { "color": "red", "value": 0.05 }
            ]
          },
          "mappings": []
        },
        "overrides": []
      }
    },
    {
      "id": 4,
      "type": "timeseries",
      "title": "DB 连接池(活跃 / 空闲)",
      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 },
      "datasource": { "type": "prometheus", "uid": "prometheus" },
      "targets": [
        { "refId": "A", "expr": "db_pool_connections_active", "legendFormat": "活跃" },
        { "refId": "B", "expr": "db_pool_connections_idle", "legendFormat": "空闲" }
      ],
      "fieldConfig": { "defaults": { "unit": "short" }, "overrides": [] }
    },
    {
      "id": 5,
      "type": "timeseries",
      "title": "进程内存 / CPU",
      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 },
      "datasource": { "type": "prometheus", "uid": "prometheus" },
      "targets": [
        { "refId": "A", "expr": "process_resident_memory_bytes", "legendFormat": "内存 (bytes)" },
        { "refId": "B", "expr": "rate(process_cpu_seconds_total[5m])", "legendFormat": "CPU (cores/s)" }
      ],
      "fieldConfig": {
        "defaults": {},
        "overrides": [
          {
            "matcher": { "id": "byFrameRefID", "options": "A" },
            "properties": [
              { "id": "unit", "value": "bytes" },
              { "id": "custom.axisPlacement", "value": "left" }
            ]
          },
          {
            "matcher": { "id": "byFrameRefID", "options": "B" },
            "properties": [
              { "id": "unit", "value": "short" },
              { "id": "custom.axisPlacement", "value": "right" }
            ]
          }
        ]
      }
    },
    {
      "id": 6,
      "type": "timeseries",
      "title": "EventBus 积压趋势",
      "gridPos": { "h": 8, "w": 24, "x": 0, "y": 12 },
      "datasource": { "type": "prometheus", "uid": "prometheus" },
      "targets": [
        { "refId": "A", "expr": "eventbus_pending_total", "legendFormat": "pending events" }
      ],
      "fieldConfig": { "defaults": { "unit": "short" }, "overrides": [] }
    }
  ]
}