- config.yml url 写死 placeholder.invalid -> ${ALERT_WEBHOOK_URL}
- compose alertmanager 补 environment 注入(评估漏检:虽有 --config.expand-env=true 但容器内无变量可展开)
- 未配置时 fallback 占位 url 保 MVP 链路可启动(fail-fast 优于静默盲飞)
- .env.production.example 补 ALERT_WEBHOOK_URL 模板
- 上线评估 B1 代码层修复,真实 webhook 由 staging 填入
40 lines
1.6 KiB
YAML
40 lines
1.6 KiB
YAML
# Alertmanager 告警通知配置
#
# 通知渠道由 ALERT_WEBHOOK_URL 环境变量注入(见 receivers.default.webhook_configs),
# 容器启用 --config.expand-env=true 展开。来源:docker/.env.production。
#
# ⚠️ 上线前必填(docker/.env.production.example 已给模板),否则告警发不到任何人:
#   - 钉钉机器人:https://oapi.dingtalk.com/robot/send?access_token=XXX
#   - 企业微信群机器人:https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=XXX
#   - 邮件 SMTP:配置 global.smtp_* + email_configs
#
# 未配置 ALERT_WEBHOOK_URL 时,compose 层 fallback 占位 url,alertmanager 可启动
# 但 POST 失败 —— fail-fast 优于 PP-04 之前"告警触发无人知晓"的盲飞状态。

# Global defaults applied to every route and receiver in this file.
global:
  # Per the Alertmanager docs: the default time after which an alert that
  # carries no end time is treated as resolved if it has not been updated.
  resolve_timeout: 5m

# Routing tree: alerts are grouped by alertname + service. SEV-1 (critical)
# alerts are matched first by the child route below and notified immediately.
route:
  receiver: "default"
  group_by: ["alertname", "service"]
  # Wait before sending the first notification for a newly created group.
  group_wait: 30s
  # Wait before notifying about new alerts added to an already-notified group.
  group_interval: 5m
  # Wait before re-sending a notification that has already been delivered.
  repeat_interval: 4h
  routes:
    # SEV-1 critical alerts (database down / 5xx spike / Redis unreachable):
    # notify at once and repeat every 5 minutes. Settings not overridden here
    # (group_by, group_interval) are inherited from the parent route.
    - matchers:
        # Quoted so the embedded double quotes are unambiguous to YAML parsers;
        # the matcher string itself is unchanged.
        - 'severity = "critical"'
      receiver: "default"
      group_wait: 0s
      repeat_interval: 5m

# Notification targets referenced by name from the routing tree.
receivers:
  - name: "default"
    # The real notification endpoint is expected to come from the
    # ALERT_WEBHOOK_URL environment variable. It must be filled in before
    # go-live; see docker/.env.production.example. According to the deployment
    # notes, when the variable is unset the compose layer substitutes a
    # placeholder URL, so Alertmanager starts but every POST fails, which is
    # preferred over silently delivering to an invalid endpoint.
    #
    # NOTE(review): upstream Alertmanager documents neither a
    # --config.expand-env flag nor environment-variable expansion inside this
    # file. Confirm that the image/entrypoint in use really substitutes the
    # placeholder below; otherwise it is read literally and fails URL
    # validation at config load. The natively supported alternative is
    # webhook_configs[].url_file pointing at a mounted secret file.
    #
    # NOTE(review): DingTalk / WeCom group robots presumably reject the generic
    # Alertmanager webhook payload; an adapter service in front of them is
    # likely required. Verify against a staging robot.
    webhook_configs:
      # Double quotes kept as in the original: substitution (if any) happens
      # on this text, so the quoting style is left untouched.
      - url: "${ALERT_WEBHOOK_URL}"
        # Also notify when a previously firing alert resolves.
        send_resolved: true