diff --git a/docker/alertmanager/README.md b/docker/alertmanager/README.md
new file mode 100644
index 0000000..8822541
--- /dev/null
+++ b/docker/alertmanager/README.md
@@ -0,0 +1,71 @@
+# Alertmanager 告警通知配置
+
+> PP-04 可观测性。当前 `config.yml` 使用占位 webhook(`http://placeholder.invalid/alert`),告警会 POST 失败但记日志。
+> **上线前必须**替换为真实通知渠道,否则 11 条告警规则触发了也没人收到。
+
+Alertmanager 不会展开配置文件中的 `${VAR}`,也没有 `--config.expand-env` 启动参数(加上该参数会因未知参数而启动失败)。
+密钥改用 `*_file` 字段从文件读取:compose 已把 `docker/alertmanager/secrets/` 只读挂载到容器内 `/etc/alertmanager/secrets/`。
+
+> token / 密码必须放 `secrets/` 下的文件(加入 `.gitignore`,不入 git),不能写进 `config.yml`(git 追踪)——避免重蹈 PP-03 Redis 密码明文泄露覆辙。
+> `url_file`、`smtp_auth_password_file` 需要较新的 Alertmanager 版本,请以所用版本的官方配置文档为准。
+
+## 方案 A:钉钉 / 企业微信 webhook(推荐)
+
+钉钉 / 企业微信群机器人接口不接受 Alertmanager 原生 webhook 的 JSON 格式,receiver 直接填机器人地址会发送失败;
+需要经过格式转换适配器(钉钉可用 `prometheus-webhook-dingtalk`),receiver 指向适配器地址。
+
+1. `config.yml` 的 receiver 改为从文件读取 webhook 地址:
+
+   ```yaml
+   receivers:
+     - name: "default"
+       webhook_configs:
+         - url_file: "/etc/alertmanager/secrets/webhook_url"
+           send_resolved: true
+   ```
+
+2. 把适配器地址写入 `docker/alertmanager/secrets/webhook_url`(单行,不加引号);机器人地址配在适配器一侧:
+
+   ```
+   # DingTalk robot
+   https://oapi.dingtalk.com/robot/send?access_token=XXX
+   # or WeCom group robot
+   https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=XXX
+   ```
+
+## 方案 B:邮件 SMTP
+
+```yaml
+global:
+  smtp_smarthost: "smtp.exmail.qq.com:465"
+  smtp_from: "alert@hms.example.com"
+  smtp_auth_username: "alert@hms.example.com"
+  smtp_auth_password_file: "/etc/alertmanager/secrets/smtp_password"
+  # Port 465 is implicit TLS; the server does not advertise STARTTLS.
+  smtp_require_tls: false
+receivers:
+  - name: "default"
+    email_configs:
+      - to: "ops@hms.example.com"
+        send_resolved: true
+```
+
+把 SMTP 密码写入 `docker/alertmanager/secrets/smtp_password`(单行)。
+
+## 验证
+
+部署后用 Alertmanager API 触发测试告警(compose 中 9093 仅 `expose`,需在容器网络内执行,并把 `ALERTMANAGER_HOST` 换成实际主机名):
+
+```bash
+curl -XPOST http://ALERTMANAGER_HOST:9093/api/v2/alerts \
+  -H "Content-Type: application/json" \
+  -d '[{"labels":{"alertname":"test","severity":"critical"}}]'
+```
+
+应收到渠道通知(钉钉/企微/邮件)。Alertmanager UI:`http://ALERTMANAGER_HOST:9093`。
+
+## 当前路由策略
+
+- 按 `alertname + service` 分组
+- `severity=critical`(DB 宕机/5xx 飙升/Redis 不可达)即时通知,5 分钟重复
+- 其他告警 30s 聚合,4 小时重复
diff --git a/docker/docker-compose.production.yml b/docker/docker-compose.production.yml
index 22a2209..d4c2b9f 100644
--- a/docker/docker-compose.production.yml
+++ b/docker/docker-compose.production.yml
@@ -145,6 +145,7 @@ services:
       - alertmanager_data:/alertmanager
+      - ./alertmanager/secrets:/etc/alertmanager/secrets:ro
     command:
       - "--config.file=/etc/alertmanager/config.yml"
       - "--storage.path=/alertmanager"
     expose:
       - "9093"
@@ -171,6 +172,39 @@ services:
     networks:
       - hms-internal
 
+  # Prometheus exporters (PP-04): prometheus.yml already listed these targets,
+  # but the services were never deployed, so the related alerts never fired.
+  # NOTE(review): the *_PORT variables are used here as in-network container
+  # ports; confirm they are not meant to be host-published ports.
+  postgres-exporter:
+    image: prometheuscommunity/postgres-exporter:v0.15.0
+    container_name: hms-postgres-exporter
+    restart: unless-stopped
+    environment:
+      # User and password are passed separately from the URI so special
+      # characters in the password cannot break the connection string.
+      DATA_SOURCE_URI: "postgres:${POSTGRES_PORT:-5432}/${POSTGRES_DB:-erp}?sslmode=disable"
+      DATA_SOURCE_USER: "${POSTGRES_USER:-erp}"
+      DATA_SOURCE_PASS: "${POSTGRES_PASSWORD:?POSTGRES_PASSWORD must be set}"
+    expose:
+      - "9187"
+    networks:
+      - hms-internal
+
+  redis-exporter:
+    image: oliver006/redis_exporter:v1.66.0
+    container_name: hms-redis-exporter
+    restart: unless-stopped
+    environment:
+      REDIS_ADDR: "redis://redis:${REDIS_PORT:-6379}"
+      # No fallback password: a production stack must fail fast instead
+      # of silently running with a committed development credential.
+      REDIS_PASSWORD: "${REDIS_PASSWORD:?REDIS_PASSWORD must be set}"
+    expose:
+      - "9121"
+    networks:
+      - hms-internal
+
 volumes:
   app-uploads:
     driver: local
diff --git a/docker/grafana/provisioning/dashboards/json/hms-overview.json b/docker/grafana/provisioning/dashboards/json/hms-overview.json
new file mode 100644
index 0000000..1a238eb
--- /dev/null
+++ b/docker/grafana/provisioning/dashboards/json/hms-overview.json
@@ -0,0 +1,139 @@
+{
+  "uid": "hms-overview",
+  "title": "HMS 概览",
+  "tags": ["HMS", "overview"],
+  "timezone": "browser",
+  "schemaVersion": 39,
+  "version": 1,
+  "refresh": "30s",
+  "time": { "from": "now-1h", "to": "now" },
+  "panels": [
+    {
+      "id": 1,
+      "type": "stat",
+      "title": "HMS 服务状态",
+      "gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 },
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "targets": [{ "refId": "A", "expr": "up{job=\"hms\"}", "legendFormat": "" }],
+      "options": {
+        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
+        "colorMode": "background"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "mappings": [
+            {
+              "type": "value",
+              "options": {
+                "0": { "text": "DOWN", "color": "red", "index": 0 },
+                "1": { "text": "UP", "color": "green", "index": 1 }
+              }
+            }
+          ]
+        }
+      }
+    },
+    {
+      "id": 2,
+      "type": "stat",
+      "title": "EventBus 积压 (pending)",
+      "gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 },
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "targets": [{ "refId": "A", "expr": "eventbus_pending_total", "legendFormat": "" }],
+      "options": {
+        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
+        "colorMode": "value"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              { "color": "green", "value": null },
+              { "color": "yellow", "value": 100 },
+              { "color": "red", "value": 500 }
+            ]
+          }
+        }
+      }
+    },
+    {
+      "id": 3,
+      "type": "stat",
+      "title": "API 5xx 错误率 (5m)",
+      "gridPos": { "h": 4, "w": 12, "x": 12, "y": 0 },
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "targets": [
+        {
+          "refId": "A",
+          "expr": "sum(rate(http_requests_total{status=~\"5..\"}[5m])) / (sum(rate(http_requests_total[5m])) > 0) or vector(0)",
+          "legendFormat": "5xx ratio"
+        }
+      ],
+      "options": {
+        "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
+        "colorMode": "background"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percentunit",
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              { "color": "green", "value": null },
+              { "color": "yellow", "value": 0.01 },
+              { "color": "red", "value": 0.05 }
+            ]
+          },
+          "mappings": []
+        }
+      }
+    },
+    {
+      "id": 4,
+      "type": "timeseries",
+      "title": "DB 连接池(活跃 / 空闲)",
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 },
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "targets": [
+        { "refId": "A", "expr": "db_pool_connections_active", "legendFormat": "活跃" },
+        { "refId": "B", "expr": "db_pool_connections_idle", "legendFormat": "空闲" }
+      ],
+      "fieldConfig": { "defaults": { "unit": "short" } }
+    },
+    {
+      "id": 5,
+      "type": "timeseries",
+      "title": "进程内存 / CPU",
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 },
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "targets": [
+        { "refId": "A", "expr": "process_resident_memory_bytes", "legendFormat": "内存 (bytes)" },
+        { "refId": "B", "expr": "rate(process_cpu_seconds_total[5m])", "legendFormat": "CPU (cores/s)" }
+      ],
+      "fieldConfig": {
+        "defaults": { "unit": "bytes" },
+        "overrides": [
+          {
+            "matcher": { "id": "byFrameRefID", "options": "B" },
+            "properties": [
+              { "id": "unit", "value": "short" },
+              { "id": "custom.axisPlacement", "value": "right" }
+            ]
+          }
+        ]
+      }
+    },
+    {
+      "id": 6,
+      "type": "timeseries",
+      "title": "EventBus 积压趋势",
+      "gridPos": { "h": 8, "w": 24, "x": 0, "y": 12 },
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "targets": [
+        { "refId": "A", "expr": "eventbus_pending_total", "legendFormat": "pending events" }
+      ],
+      "fieldConfig": { "defaults": { "unit": "short" } }
+    }
+  ]
+}
diff --git a/docker/grafana/provisioning/datasources/prometheus.yml b/docker/grafana/provisioning/datasources/prometheus.yml
index 13b1ea8..46316f7 100644
--- a/docker/grafana/provisioning/datasources/prometheus.yml
+++ b/docker/grafana/provisioning/datasources/prometheus.yml
@@ -4,6 +4,7 @@ apiVersion: 1
 
 datasources:
   - name: Prometheus
+    uid: prometheus
     type: prometheus
     access: proxy
     url: http://prometheus:9090