diff --git a/docker/alertmanager/config.yml b/docker/alertmanager/config.yml
new file mode 100644
index 0000000..4054b79
--- /dev/null
+++ b/docker/alertmanager/config.yml
@@ -0,0 +1,34 @@
+# Alertmanager alert-notification configuration
+#
+# ⚠️ TODO (must be filled in before go-live): replace receivers.default.webhook_configs with a real notification channel:
+#   - DingTalk robot: https://oapi.dingtalk.com/robot/send?access_token=XXX
+#   - WeCom (WeChat Work) group robot: https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=XXX
+#   - Email via SMTP: configure global.smtp_* + email_configs
+#
+# Currently a placeholder webhook (pointing at an invalid endpoint): alertmanager starts, but alert POSTs fail and are logged.
+# PP-04 MVP goal: get the prometheus → alertmanager path working first; fill in the channel before go-live.
+
+global:
+  resolve_timeout: 5m
+
+# Routing: group by alertname + service; SEV-1 (critical) is matched first and notified immediately
+route:
+  receiver: "default"
+  group_by: ["alertname", "service"]
+  group_wait: 30s
+  group_interval: 5m
+  repeat_interval: 4h
+  routes:
+    # SEV-1 critical alerts (DB down / 5xx spike / Redis unreachable): notify immediately, repeat every 5 minutes
+    - matchers:
+        - severity = "critical"
+      receiver: "default"
+      group_wait: 0s
+      repeat_interval: 5m
+
+receivers:
+  - name: "default"
+    # Placeholder: replace with a real webhook before go-live
+    webhook_configs:
+      - url: "http://placeholder.invalid/alert"
+        send_resolved: true
diff --git a/docker/docker-compose.production.yml b/docker/docker-compose.production.yml
index 71a62ce..22a2209 100644
--- a/docker/docker-compose.production.yml
+++ b/docker/docker-compose.production.yml
@@ -134,6 +134,23 @@ services:
     networks:
       - hms-internal
 
+  # ── Alertmanager: alert notification egress ──
+  # PP-04: previously 11 alert rules were loaded by prometheus but there was no alertmanager, so firing alerts went unnoticed
+  alertmanager:
+    image: prom/alertmanager:v0.27.0
+    container_name: hms-alertmanager
+    restart: unless-stopped
+    volumes:
+      - ./alertmanager/config.yml:/etc/alertmanager/config.yml:ro
+      - alertmanager_data:/alertmanager
+    command:
+      - "--config.file=/etc/alertmanager/config.yml"
+      - "--storage.path=/alertmanager"  # same path as the alertmanager_data mount above
+    expose:
+      - "9093"  # listed under expose only; no host ports mapping is declared for this service
+    networks:
+      - hms-internal
+
   # ── Grafana 可视化 ──
   grafana:
     image: grafana/grafana:11.4.0
@@ -167,6 +184,8 @@ volumes:
     driver: local
   grafana_data:
     driver: local
+  alertmanager_data:
+    driver: local
 
 networks:
   hms-internal:
diff --git a/docker/grafana/provisioning/dashboards/dashboards.yml b/docker/grafana/provisioning/dashboards/dashboards.yml
new file mode 100644
index 0000000..31ba229
--- /dev/null
+++ b/docker/grafana/provisioning/dashboards/dashboards.yml
@@ -0,0 +1,15 @@
+# Grafana dashboard provider
+# Automatically loads the dashboard JSON files under ./json/ (the HMS overview dashboard will be added later under PP-04)
+apiVersion: 1
+
+providers:
+  - name: "HMS Dashboards"
+    orgId: 1
+    folder: ""
+    type: file
+    disableDeletion: false
+    editable: true
+    updateIntervalSeconds: 30
+    options:
+      path: /etc/grafana/provisioning/dashboards/json
+      foldersFromFilesStructure: false
diff --git a/docker/grafana/provisioning/datasources/prometheus.yml b/docker/grafana/provisioning/datasources/prometheus.yml
new file mode 100644
index 0000000..13b1ea8
--- /dev/null
+++ b/docker/grafana/provisioning/datasources/prometheus.yml
@@ -0,0 +1,13 @@
+# Grafana data source auto-provisioning
+# Registers the Prometheus data source automatically at startup; no manual configuration in the UI is needed
+apiVersion: 1
+
+datasources:
+  - name: Prometheus
+    type: prometheus
+    access: proxy
+    url: http://prometheus:9090
+    isDefault: true
+    editable: false
+    jsonData:
+      timeInterval: "15s"
diff --git a/docker/prometheus/prometheus.yml b/docker/prometheus/prometheus.yml
index 2a4762b..987e97f 100644
--- a/docker/prometheus/prometheus.yml
+++ b/docker/prometheus/prometheus.yml
@@ -5,6 +5,11 @@ global:
 rule_files:
   - "alerts.yml"
 
+alerting:
+  alertmanagers:
+    - static_configs:
+        - targets: ["alertmanager:9093"]  # compose service name + the port it exposes
+
 scrape_configs:
   - job_name: "hms"
     metrics_path: /metrics