Files
hms/docker/prometheus/alerts.yml
iven bc571c7749 feat(docker): 生产环境 DevOps 基础设施 — TLS + 备份加密 + Prometheus + Redis 持久化
新增:
- nginx/nginx.conf: TLS 1.2/1.3 终端 + HSTS/CSP 安全头 + SSE 长连接 + 50M 上传限制
- prometheus/prometheus.yml: HMS/PostgreSQL/Redis/Nginx 四指标源
- prometheus/alerts.yml: 4 组告警规则(系统/应用/数据库/Redis),含 5xx 错误率 + 内存 + 连接数
- restore.sh: 备份恢复脚本(支持加密备份解密恢复)

改进:
- backup.sh: 新增 BACKUP_PASSPHRASE 加密(AES-256-CBC)+ 完整性校验 + 恢复指引
- docker-compose.production.yml: 添加 Nginx/Prometheus/Grafana/uploads-backup 容器
- docker-compose.yml: Redis 添加 --appendonly yes 持久化
- .env.production.example: 添加 DevOps 相关环境变量模板
2026-05-21 18:21:51 +08:00

104 lines
2.9 KiB
YAML
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
groups:
# -- System-level alerts: memory and CPU of the HMS process --
- name: system
  rules:
  # Every selector in this group is pinned to the HMS scrape job. The
  # process_* metrics are standard client-library metrics, so without a job
  # matcher any other scraped process (exporters, Prometheus itself) could
  # raise an alert that is named and worded as an HMS problem.
  # NOTE(review): assumes the HMS scrape job is named "hms" - confirm against
  # prometheus.yml.

  # Warning: resident memory above 800 MB (decimal bytes) for 5 minutes.
  - alert: HMSHighMemoryUsage
    expr: process_resident_memory_bytes{job="hms"} > 800000000
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "HMS 内存使用超过 800MB"
      description: "当前值: {{ $value | humanize }}B"

  # Critical: resident memory above 1 GB (decimal bytes) for 2 minutes.
  - alert: HMSHighMemoryCritical
    expr: process_resident_memory_bytes{job="hms"} > 1000000000
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "HMS 内存使用超过 1GB危险"
      description: "当前值: {{ $value | humanize }}B"

  # Warning: the process consumed more than 0.8 CPU-seconds per second,
  # averaged over 5 minutes, and stayed there for 10 minutes.
  - alert: HMSHighCPU
    expr: rate(process_cpu_seconds_total{job="hms"}[5m]) > 0.8
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "HMS CPU 使用率超过 80%"
# -- Application-level alerts: error rate, latency, availability --
- name: application
  rules:
  # Critical: more than 5% of requests answered with a 5xx status.
  # Both sides are summed per job before dividing. A bare vector division
  # matches series on identical label sets, so every 5xx series would be
  # divided by itself and the ratio would always be 1, firing on any 5xx
  # traffic regardless of the threshold.
  - alert: HMSHighErrorRate
    expr: |-
      sum by (job) (rate(http_requests_total{status=~"5.."}[5m]))
        /
      sum by (job) (rate(http_requests_total[5m]))
        > 0.05
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "API 5xx 错误率超过 5%"
      description: "当前错误率: {{ $value | humanizePercentage }}"

  # Warning: 95th-percentile request duration above 2 seconds for 10 minutes.
  # The quantile is evaluated separately for each label combination of the
  # bucket series (no aggregation is applied here).
  - alert: HMSSlowResponses
    expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 2
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "95% 请求响应时间超过 2 秒"

  # Critical: the scrape of the "hms" job has been failing for 2 minutes.
  - alert: HMSInstanceDown
    expr: up{job="hms"} == 0
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "HMS 服务不可达"
# -- Database alerts: connections, replication, backups --
- name: database
  rules:
  # Warning: more than 80 connections on one PostgreSQL instance.
  # The count is summed per instance because the threshold is meant for the
  # connection total, not for a single series.
  # NOTE(review): assumes postgres_exporter, which emits one
  # pg_stat_activity_count series per database/state - confirm. If only
  # connections in state "active" should count, add {state="active"}.
  - alert: HMSPostgresConnectionsHigh
    expr: sum by (instance) (pg_stat_activity_count) > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "PostgreSQL 活跃连接数超过 80"

  # Critical: replication lag above 30 (seconds, per the summary).
  # NOTE(review): the metric name varies between exporter versions - confirm
  # that pg_replication_lag is actually exported in this deployment.
  - alert: HMSPostgresReplicationLag
    expr: pg_replication_lag > 30
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "PostgreSQL 复制延迟超过 30 秒"

  # Critical: the last recorded backup is older than 48 hours (86400 s * 2).
  # NOTE(review): if hms_last_backup_timestamp is not exported at all, this
  # expression returns no series and the alert stays silent - confirm that
  # the backup job publishes the metric.
  - alert: HMSBackupMissing
    expr: time() - hms_last_backup_timestamp > 86400 * 2
    for: 1h
    labels:
      severity: critical
    annotations:
      summary: "数据库备份超过 48 小时未执行"
# -- Redis alerts: memory pressure and availability --
- name: redis
  rules:
  # Warning: used memory above 90% of the configured limit for 5 minutes.
  # Series whose limit is 0 are filtered out of the denominator: dividing by
  # 0 yields +Inf, which is always > 0.9 and would keep the alert firing.
  # NOTE(review): assumes redis_exporter reports redis_memory_max_bytes as 0
  # when maxmemory is unlimited - confirm.
  - alert: HMSRedisMemoryHigh
    expr: redis_memory_used_bytes / (redis_memory_max_bytes > 0) > 0.9
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Redis 内存使用超过 90%"

  # Critical: redis_up has been 0 for 2 minutes.
  - alert: HMSRedisDown
    expr: redis_up == 0
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "Redis 服务不可达"