# Prometheus alerting rules for the HMS service and its backing stores.
#
# Groups:
#   system      - process memory and CPU
#   application - HTTP error rate, latency, target availability
#   database    - PostgreSQL connections, replication lag, backup freshness
#   redis       - Redis memory pressure and availability
#
# NOTE(review): apart from HMSInstanceDown, no selector is scoped by `job`,
# so each rule matches every scraped target exposing the metric — confirm
# that is intended.
groups:
  # ── System-level alerts ──
  - name: system
    rules:
      # Resident memory above 800,000,000 bytes for 5 minutes.
      - alert: HMSHighMemoryUsage
        expr: process_resident_memory_bytes > 800000000
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "HMS 内存使用超过 800MB"
          description: "当前值: {{ $value | humanize }}B"

      # Resident memory above 1,000,000,000 bytes for 2 minutes. The warning
      # rule above is also true in this state, so both alerts fire together.
      - alert: HMSHighMemoryCritical
        expr: process_resident_memory_bytes > 1000000000
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "HMS 内存使用超过 1GB(危险)"
          description: "当前值: {{ $value | humanize }}B"

      # CPU seconds consumed per wall-clock second, averaged over 5 minutes;
      # 0.8 corresponds to 80% of a single core.
      - alert: HMSHighCPU
        expr: rate(process_cpu_seconds_total[5m]) > 0.8
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "HMS CPU 使用率超过 80%"

  # ── Application-level alerts ──
  - name: application
    rules:
      # Share of requests answered with a 5xx status over the last 5 minutes.
      # Both sides are aggregated before dividing: without aggregation the
      # division pairs each 5xx series with the identical series on the
      # right-hand side, so the ratio would always be 1.
      - alert: HMSHighErrorRate
        expr: |
          sum by (job, instance) (rate(http_requests_total{status=~"5.."}[5m]))
            /
          sum by (job, instance) (rate(http_requests_total[5m]))
            > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "API 5xx 错误率超过 5%"
          description: "当前错误率: {{ $value | humanizePercentage }}"

      # 95th-percentile request duration above 2 seconds. The quantile is
      # computed separately for every label combination of the histogram.
      - alert: HMSSlowResponses
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 2
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "95% 请求响应时间超过 2 秒"

      # Scrape of the `hms` job has been failing for 2 minutes.
      - alert: HMSInstanceDown
        expr: up{job="hms"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "HMS 服务不可达"

  # ── Database alerts ──
  - name: database
    rules:
      - alert: HMSPostgresConnectionsHigh
        expr: pg_stat_activity_count > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "PostgreSQL 活跃连接数超过 80"

      - alert: HMSPostgresReplicationLag
        expr: pg_replication_lag > 30
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "PostgreSQL 复制延迟超过 30 秒"

      # 86400 * 2 = 48 hours in seconds.
      # NOTE(review): if hms_last_backup_timestamp is never exported, this
      # expression returns nothing and the alert stays silent — consider a
      # companion absent() rule.
      - alert: HMSBackupMissing
        expr: time() - hms_last_backup_timestamp > 86400 * 2
        for: 1h
        labels:
          severity: critical
        annotations:
          summary: "数据库备份超过 48 小时未执行"

  # ── Redis alerts ──
  - name: redis
    rules:
      # The `> 0` filter drops instances whose max-memory gauge is 0;
      # dividing by 0 yields +Inf, which would otherwise satisfy `> 0.9`
      # permanently.
      - alert: HMSRedisMemoryHigh
        expr: redis_memory_used_bytes / (redis_memory_max_bytes > 0) > 0.9
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Redis 内存使用超过 90%"

      - alert: HMSRedisDown
        expr: redis_up == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Redis 服务不可达"