From 984fca627b0ba833ff5f11993e4537700fd3d359 Mon Sep 17 00:00:00 2001 From: iven Date: Fri, 26 Jun 2026 15:18:43 +0800 Subject: [PATCH] =?UTF-8?q?fix(docker):=20B1=20alertmanager=20=E5=91=8A?= =?UTF-8?q?=E8=AD=A6=E6=B8=A0=E9=81=93=E6=8E=A5=E7=BA=BF=20ALERT=5FWEBHOOK?= =?UTF-8?q?=5FURL=20=E7=8E=AF=E5=A2=83=E5=8F=98=E9=87=8F=E6=B3=A8=E5=85=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - config.yml url 写死 placeholder.invalid -> ${ALERT_WEBHOOK_URL} - compose alertmanager 补 environment 注入(评估漏检:虽有 --config.expand-env=true 但容器内无变量可展开) - 未配置时 fallback 占位 url 保 MVP 链路可启动(告警 POST 失败并记日志;属 fail-soft 而非 fail-fast,上线前仍须填真实 url) - .env.production.example 补 ALERT_WEBHOOK_URL 模板 - 上线评估 B1 代码层修复,真实 webhook 由 staging 填入 --- docker/.env.production.example | 7 +++++++ docker/alertmanager/config.yml | 15 ++++++++++----- docker/docker-compose.production.yml | 4 ++++ 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/docker/.env.production.example b/docker/.env.production.example index 33f5637..4b4d459 100644 --- a/docker/.env.production.example +++ b/docker/.env.production.example @@ -68,3 +68,10 @@ UPLOADS_BACKUP_CRON=0 3 * * * # Grafana 管理员密码 GRAFANA_ADMIN_PASSWORD=CHANGE_ME_GRAFANA_ADMIN GRAFANA_ROOT_URL=http://localhost:3001 + +# ===== 监控告警 ===== + +# Alertmanager 告警通知出口(上线前必填,否则 DB 宕机/5xx 飙升等告警发不到任何人) +# 钉钉机器人:https://oapi.dingtalk.com/robot/send?access_token=XXX +# 企业微信群机器人:https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=XXX +ALERT_WEBHOOK_URL=https://oapi.dingtalk.com/robot/send?access_token=CHANGE_ME diff --git a/docker/alertmanager/config.yml b/docker/alertmanager/config.yml index 4054b79..95f6f10 100644 --- a/docker/alertmanager/config.yml +++ b/docker/alertmanager/config.yml @@ -1,12 +1,15 @@ # Alertmanager 告警通知配置 # -# ⚠️ TODO(上线前必填):将 receivers.default.webhook_configs 替换为真实通知渠道: +# 通知渠道由 ALERT_WEBHOOK_URL 环境变量注入(见 receivers.default.webhook_configs), +# 容器启用 --config.expand-env=true 展开(待确认:Alertmanager v0.27.0 是否支持该参数,上线前需验证 url 确实被展开)。来源:docker/.env.production。 +# +# ⚠️ 
上线前必填(docker/.env.production.example 已给模板),否则告警发不到任何人: # - 钉钉机器人:https://oapi.dingtalk.com/robot/send?access_token=XXX # - 企业微信群机器人:https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=XXX # - 邮件 SMTP:配置 global.smtp_* + email_configs # -# 当前为占位 webhook(指向无效端点),alertmanager 会启动但告警 POST 失败记日志。 -# PP-04 MVP 目的:先打通 prometheus → alertmanager 链路,渠道上线前填。 +# 未配置 ALERT_WEBHOOK_URL 时,compose 层 fallback 占位 url,alertmanager 可启动 +# 但 POST 失败并记日志 —— 比 PP-04 之前"告警触发无人知晓"的盲飞状态更易发现,但并非 fail-fast。 global: resolve_timeout: 5m @@ -28,7 +31,9 @@ route: receivers: - name: "default" - # 占位:上线前替换为真实 webhook + # 真实通知渠道由 ALERT_WEBHOOK_URL 环境变量注入(alertmanager 启用 --config.expand-env=true)。 + # 上线前必填:见 docker/.env.production.example。未配置时 compose 层 fallback 占位 url, + # alertmanager 可启动但 POST 失败并记日志 —— 注意这不是 fail-fast,告警仍然发不出去。 webhook_configs: - - url: "http://placeholder.invalid/alert" + - url: "${ALERT_WEBHOOK_URL}" send_resolved: true diff --git a/docker/docker-compose.production.yml b/docker/docker-compose.production.yml index d4c2b9f..0b63e17 100644 --- a/docker/docker-compose.production.yml +++ b/docker/docker-compose.production.yml @@ -140,6 +140,10 @@ services: image: prom/alertmanager:v0.27.0 container_name: hms-alertmanager restart: unless-stopped + # ALERT_WEBHOOK_URL 从宿主机 .env.production 注入容器,供 config.yml 的 ${ALERT_WEBHOOK_URL} 展开。 + # 未配置时 fallback 占位 url,保持 MVP 链路可启动;上线前在 .env.production 填真实钉钉/企微 webhook。 + environment: + ALERT_WEBHOOK_URL: "${ALERT_WEBHOOK_URL:-http://placeholder.invalid/alert}" volumes: - ./alertmanager/config.yml:/etc/alertmanager/config.yml:ro - alertmanager_data:/alertmanager