From 6457c53d9cc5cb69ee872f05df21462a95b6f592 Mon Sep 17 00:00:00 2001
From: iven
Date: Fri, 26 Jun 2026 09:25:43 +0800
Subject: [PATCH] =?UTF-8?q?feat(docker):=20PP-04=20=E5=8F=AF=E8=A7=82?=
 =?UTF-8?q?=E6=B5=8B=E6=80=A7=20MVP=20=E2=80=94=20Alertmanager=20=E5=91=8A?=
 =?UTF-8?q?=E8=AD=A6=E5=87=BA=E5=8F=A3=20+=20Grafana=20provisioning?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PP-04 核实属实:11 条告警规则在 prometheus 加载但无 alertmanager(告警
无通知出口),grafana provisioning 目录空,exporter 服务也未部署
("配置齐全运行为零")。

MVP 打通告警链路 + 让 grafana 可用(不依赖 exporter,基于 app metrics):
- docker-compose.production.yml 加 alertmanager 服务 + alertmanager_data 卷
- prometheus.yml 加 alerting 指向 alertmanager:9093
- alertmanager/config.yml 路由(SEV-1 critical 即时通知 + 分组)
- grafana/provisioning/datasources 自动连 prometheus
- grafana/provisioning/dashboards provider 就绪

待办(上线前):① alertmanager 占位 webhook 替换为真实渠道(钉钉/企微/邮件)
② 补 grafana dashboard JSON ③ 部署 postgres/redis/nginx exporter 让 prometheus 抓得到
---
 docker/alertmanager/config.yml                | 34 +++++++++++++++++++
 docker/docker-compose.production.yml          | 19 +++++++++++
 .../provisioning/dashboards/dashboards.yml    | 15 ++++++++
 .../provisioning/datasources/prometheus.yml   | 13 +++++++
 docker/prometheus/prometheus.yml              |  5 +++
 5 files changed, 86 insertions(+)
 create mode 100644 docker/alertmanager/config.yml
 create mode 100644 docker/grafana/provisioning/dashboards/dashboards.yml
 create mode 100644 docker/grafana/provisioning/datasources/prometheus.yml

diff --git a/docker/alertmanager/config.yml b/docker/alertmanager/config.yml
new file mode 100644
index 0000000..4054b79
--- /dev/null
+++ b/docker/alertmanager/config.yml
@@ -0,0 +1,34 @@
+# Alertmanager alert-notification configuration
+#
+# ⚠️ TODO (required before go-live): replace receivers.default.webhook_configs with a real notification channel:
+#   - DingTalk bot: https://oapi.dingtalk.com/robot/send?access_token=XXX
+#   - WeCom (WeChat Work) group bot: https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=XXX
+#   - Email via SMTP: configure global.smtp_* plus email_configs
+#
+# Currently a placeholder webhook (pointing at an invalid endpoint): alertmanager starts, but alert POSTs fail and are logged.
+# PP-04 MVP goal: get the prometheus → alertmanager path working first; fill in the channel before go-live.
+
+global:
+  resolve_timeout: 5m
+
+# Routing: group by alertname + service; SEV-1 (critical) alerts get immediate notification
+route:
+  receiver: "default"
+  group_by: ["alertname", "service"]
+  group_wait: 30s
+  group_interval: 5m
+  repeat_interval: 4h
+  routes:
+    # SEV-1 critical alerts (DB down / 5xx spike / Redis unreachable): notify immediately, repeat every 5 minutes
+    - matchers:
+        - severity = "critical"
+      receiver: "default"
+      group_wait: 0s
+      repeat_interval: 5m
+
+receivers:
+  - name: "default"
+    # Placeholder: replace with a real webhook before go-live
+    webhook_configs:
+      - url: "http://placeholder.invalid/alert"
+        send_resolved: true
diff --git a/docker/docker-compose.production.yml b/docker/docker-compose.production.yml
index 71a62ce..22a2209 100644
--- a/docker/docker-compose.production.yml
+++ b/docker/docker-compose.production.yml
@@ -134,6 +134,23 @@ services:
     networks:
       - hms-internal
 
+  # ── Alertmanager: alert notification outlet ──
+  # PP-04: previously 11 alert rules were loaded in prometheus with no alertmanager, so fired alerts went unnoticed
+  alertmanager:
+    image: prom/alertmanager:v0.27.0
+    container_name: hms-alertmanager
+    restart: unless-stopped
+    volumes:
+      - ./alertmanager/config.yml:/etc/alertmanager/config.yml:ro
+      - alertmanager_data:/alertmanager
+    command:
+      - "--config.file=/etc/alertmanager/config.yml"
+      - "--storage.path=/alertmanager"
+    expose:
+      - "9093"
+    networks:
+      - hms-internal
+
   # ── Grafana 可视化 ──
   grafana:
     image: grafana/grafana:11.4.0
@@ -167,6 +184,8 @@ volumes:
     driver: local
   grafana_data:
     driver: local
+  alertmanager_data:
+    driver: local
 
 networks:
   hms-internal:
diff --git a/docker/grafana/provisioning/dashboards/dashboards.yml b/docker/grafana/provisioning/dashboards/dashboards.yml
new file mode 100644
index 0000000..31ba229
--- /dev/null
+++ b/docker/grafana/provisioning/dashboards/dashboards.yml
@@ -0,0 +1,15 @@
+# Grafana dashboard provider
+# Auto-loads dashboard JSON files under ./json/ (an HMS overview dashboard is to be added later under PP-04)
+apiVersion: 1
+
+providers:
+  - name: "HMS Dashboards"
+    orgId: 1
+    folder: ""
+    type: file
+    disableDeletion: false
+    editable: true
+    updateIntervalSeconds: 30
+    options:
+      path: /etc/grafana/provisioning/dashboards/json
+      foldersFromFilesStructure: false
diff --git a/docker/grafana/provisioning/datasources/prometheus.yml b/docker/grafana/provisioning/datasources/prometheus.yml
new file mode 100644
index 0000000..13b1ea8
--- /dev/null
+++ b/docker/grafana/provisioning/datasources/prometheus.yml
@@ -0,0 +1,13 @@
+# Grafana data source auto-provisioning
+# Registers the Prometheus data source automatically at startup; no manual UI setup needed
+apiVersion: 1
+
+datasources:
+  - name: Prometheus
+    type: prometheus
+    access: proxy
+    url: http://prometheus:9090
+    isDefault: true
+    editable: false
+    jsonData:
+      timeInterval: "15s"
diff --git a/docker/prometheus/prometheus.yml b/docker/prometheus/prometheus.yml
index 2a4762b..987e97f 100644
--- a/docker/prometheus/prometheus.yml
+++ b/docker/prometheus/prometheus.yml
@@ -5,6 +5,11 @@ global:
 rule_files:
   - "alerts.yml"
 
+alerting:
+  alertmanagers:
+    - static_configs:
+        - targets: ["alertmanager:9093"]
+
 scrape_configs:
   - job_name: "hms"
     metrics_path: /metrics