新增: - nginx/nginx.conf: TLS 1.2/1.3 终端 + HSTS/CSP 安全头 + SSE 长连接 + 50M 上传限制 - prometheus/prometheus.yml: HMS/PostgreSQL/Redis/Nginx 四指标源 - prometheus/alerts.yml: 4 组告警规则(系统/应用/数据库/Redis),含 5xx 错误率 + 内存 + 连接数 - restore.sh: 备份恢复脚本(支持加密备份解密恢复) 改进: - backup.sh: 新增 BACKUP_PASSPHRASE 加密(AES-256-CBC)+ 完整性校验 + 恢复指引 - docker-compose.production.yml: 添加 Nginx/Prometheus/Grafana/uploads-backup 容器 - docker-compose.yml: Redis 添加 --appendonly yes 持久化 - .env.production.example: 添加 DevOps 相关环境变量模板
104 lines
2.9 KiB
YAML
104 lines
2.9 KiB
YAML
groups:
  # ── System-level alerts ──
  # Process resource usage of the HMS service.
  # The process_* series are generic and are commonly exposed by exporters as
  # well, so each expression is pinned to job="hms" (the job label that the
  # HMSInstanceDown rule already matches on). Without the selector these
  # "HMS" alerts could fire for any scraped target.
  # NOTE(review): confirm "hms" is the job_name used in prometheus.yml.
  - name: system
    rules:
      # Warning tier: resident memory above 800,000,000 bytes for 5 minutes.
      - alert: HMSHighMemoryUsage
        expr: process_resident_memory_bytes{job="hms"} > 800000000
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "HMS 内存使用超过 800MB"
          description: "当前值: {{ $value | humanize }}B"

      # Critical tier: resident memory above 1,000,000,000 bytes for 2 minutes.
      # This overlaps the warning rule above, so both fire past 1 GB.
      - alert: HMSHighMemoryCritical
        expr: process_resident_memory_bytes{job="hms"} > 1000000000
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "HMS 内存使用超过 1GB(危险)"
          description: "当前值: {{ $value | humanize }}B"

      # rate() of a CPU-seconds counter is "CPU seconds used per second",
      # so 0.8 corresponds to 80% of a single core.
      - alert: HMSHighCPU
        expr: rate(process_cpu_seconds_total{job="hms"}[5m]) > 0.8
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "HMS CPU 使用率超过 80%"

# ── Application-level alerts ──
  # HTTP error rate, latency and reachability of the HMS service.
  - name: application
    rules:
      # Fraction of requests answered with a 5xx status over the last 5 minutes.
      # Both sides are aggregated with sum(): dividing the raw vectors would
      # match each 5xx series only against itself (identical label set), giving
      # a ratio of 1 whenever any 5xx traffic exists, regardless of how small
      # the real error share is.
      - alert: HMSHighErrorRate
        expr: >-
          sum(rate(http_requests_total{status=~"5.."}[5m]))
          /
          sum(rate(http_requests_total[5m]))
          > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "API 5xx 错误率超过 5%"
          description: "当前错误率: {{ $value | humanizePercentage }}"

      # 95th-percentile request duration above 2 seconds for 10 minutes.
      # The quantile is computed per label set of the bucket series (there is
      # no aggregation), so each distinct label combination alerts separately.
      - alert: HMSSlowResponses
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 2
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "95% 请求响应时间超过 2 秒"

      # The scrape of a target in job "hms" has been failing for 2 minutes.
      - alert: HMSInstanceDown
        expr: up{job="hms"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "HMS 服务不可达"

# ── Database alerts ──
  # PostgreSQL connection pressure, replication lag and backup freshness.
  - name: database
    rules:
      # Total connections per scraped PostgreSQL instance above 80.
      # The metric is summed per instance because the exporter is expected to
      # split it into several series (presumably per database and connection
      # state; verify against the exporter in use). A per-series comparison
      # would miss the case where the total exceeds 80 but no single series
      # does.
      - alert: HMSPostgresConnectionsHigh
        expr: sum by (instance) (pg_stat_activity_count) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "PostgreSQL 活跃连接数超过 80"

      # Replication lag above 30 (seconds, per the summary text) for 5 minutes.
      # NOTE(review): the metric name depends on the exporter version; confirm
      # that pg_replication_lag is actually exported.
      - alert: HMSPostgresReplicationLag
        expr: pg_replication_lag > 30
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "PostgreSQL 复制延迟超过 30 秒"

      # Last successful backup is older than 48 hours (86400 s * 2).
      # The absent() branch makes the alert fire when the timestamp metric
      # does not exist at all (backup never ran, or its metric is no longer
      # published); the age comparison alone returns nothing in that case and
      # the alert would stay silent.
      - alert: HMSBackupMissing
        expr: >-
          (time() - hms_last_backup_timestamp > 86400 * 2)
          or absent(hms_last_backup_timestamp)
        for: 1h
        labels:
          severity: critical
        annotations:
          summary: "数据库备份超过 48 小时未执行"

# ── Redis alerts ──
  # Redis memory saturation and availability.
  - name: redis
    rules:
      # Used memory above 90% of the configured memory limit for 5 minutes.
      # The "> 0" filter drops instances whose limit is 0 (no maxmemory
      # configured): dividing by 0 yields +Inf, which would keep this alert
      # firing permanently on an unlimited instance.
      - alert: HMSRedisMemoryHigh
        expr: redis_memory_used_bytes / (redis_memory_max_bytes > 0) > 0.9
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Redis 内存使用超过 90%"

      # The exporter reports redis_up == 0 for 2 minutes.
      - alert: HMSRedisDown
        expr: redis_up == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Redis 服务不可达"