feat(docker): PP-04 可观测性 MVP — Alertmanager 告警出口 + Grafana provisioning
PP-04 核实属实:11 条告警规则已在 prometheus 加载,但没有 alertmanager(告警无通知出口),grafana provisioning 目录为空,exporter 服务也未部署("配置齐全运行为零")。

MVP 打通告警链路 + 让 grafana 可用(不依赖 exporter,基于 app metrics):
- docker-compose.production.yml 加 alertmanager 服务 + alertmanager_data 卷
- prometheus.yml 加 alerting 指向 alertmanager:9093
- alertmanager/config.yml 路由(SEV-1 critical 即时通知 + 分组)
- grafana/provisioning/datasources 自动连 prometheus
- grafana/provisioning/dashboards provider 就绪

待办(上线前):
① alertmanager 占位 webhook 替换为真实渠道(钉钉/企微/邮件)
② 补 grafana dashboard JSON
③ 部署 postgres/redis/nginx exporter 让 prometheus 抓得到
This commit is contained in:
34
docker/alertmanager/config.yml
Normal file
34
docker/alertmanager/config.yml
Normal file
@@ -0,0 +1,34 @@
|
||||
# Alertmanager notification configuration.
#
# TODO (required before go-live): replace receivers.default.webhook_configs
# with a real notification channel, for example:
#   - DingTalk robot:    https://oapi.dingtalk.com/robot/send?access_token=XXX
#   - WeCom group robot: https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=XXX
#   - Email via SMTP:    set global.smtp_* and use email_configs
#
# The webhook below is a placeholder pointing at an invalid endpoint:
# Alertmanager starts, but every alert POST fails and is only logged.
# PP-04 MVP goal: get the prometheus -> alertmanager link working first;
# the real channel is filled in before go-live.

global:
  resolve_timeout: 5m

# Routing: group by alertname + service; SEV-1 (critical) alerts are
# notified immediately.
route:
  receiver: 'default'
  group_by:
    - 'alertname'
    - 'service'
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h
  routes:
    # SEV-1 critical alerts (DB down / 5xx spike / Redis unreachable):
    # no initial wait, repeat every 5 minutes.
    - receiver: 'default'
      matchers:
        - 'severity = "critical"'
      group_wait: 0s
      repeat_interval: 5m

receivers:
  - name: 'default'
    # Placeholder: replace with the real webhook before go-live.
    webhook_configs:
      - url: 'http://placeholder.invalid/alert'
        send_resolved: true
|
||||
@@ -134,6 +134,23 @@ services:
|
||||
networks:
|
||||
- hms-internal
|
||||
|
||||
# -- Alertmanager: notification outlet for alerts --
# PP-04: previously the 11 alert rules were loaded by prometheus but there
# was no alertmanager, so fired alerts went unnoticed.
alertmanager:
  container_name: hms-alertmanager
  image: prom/alertmanager:v0.27.0
  restart: unless-stopped
  networks:
    - hms-internal
  # Container port only; nothing is published to the host here.
  expose:
    - '9093'
  command:
    - '--config.file=/etc/alertmanager/config.yml'
    - '--storage.path=/alertmanager'
  volumes:
    # Read-only config file, plus the named volume used as storage path.
    - ./alertmanager/config.yml:/etc/alertmanager/config.yml:ro
    - alertmanager_data:/alertmanager
|
||||
|
||||
# ── Grafana 可视化 ──
|
||||
grafana:
|
||||
image: grafana/grafana:11.4.0
|
||||
@@ -167,6 +184,8 @@ volumes:
|
||||
driver: local
|
||||
grafana_data:
|
||||
driver: local
|
||||
alertmanager_data:
|
||||
driver: local
|
||||
|
||||
networks:
|
||||
hms-internal:
|
||||
|
||||
15
docker/grafana/provisioning/dashboards/dashboards.yml
Normal file
15
docker/grafana/provisioning/dashboards/dashboards.yml
Normal file
@@ -0,0 +1,15 @@
|
||||
# Grafana dashboard provider.
# Automatically loads dashboard JSON files found under ./json/
# (the HMS overview dashboard is a PP-04 follow-up).
apiVersion: 1

providers:
  - name: 'HMS Dashboards'
    type: file
    orgId: 1
    folder: ''
    editable: true
    disableDeletion: false
    # Re-scan the dashboard directory every 30 seconds.
    updateIntervalSeconds: 30
    options:
      path: /etc/grafana/provisioning/dashboards/json
      foldersFromFilesStructure: false
|
||||
13
docker/grafana/provisioning/datasources/prometheus.yml
Normal file
13
docker/grafana/provisioning/datasources/prometheus.yml
Normal file
@@ -0,0 +1,13 @@
|
||||
# Grafana data source provisioning.
# Registers the Prometheus data source at startup, so no manual setup in
# the UI is needed.
apiVersion: 1

datasources:
  - name: Prometheus
    type: prometheus
    url: http://prometheus:9090
    access: proxy
    isDefault: true
    editable: false
    jsonData:
      timeInterval: '15s'
|
||||
@@ -5,6 +5,11 @@ global:
|
||||
rule_files:
|
||||
- "alerts.yml"
|
||||
|
||||
# Send fired alerts to the Alertmanager instance at alertmanager:9093.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 'alertmanager:9093'
|
||||
|
||||
scrape_configs:
|
||||
- job_name: "hms"
|
||||
metrics_path: /metrics
|
||||
|
||||
Reference in New Issue
Block a user