删除内容: - 前端: health/(67文件), ai/(2文件), Copilot, MediaPicker, 相关API/Store/Hook - 后端: wechat_handler, wechat_service, wechat_user entity, analytics handler, ai_workflow_seed - 配置: WechatConfig, AppConfig.wechat, AuthState wechat 字段 - 启动: 微信凭据检查块, ensure_ai_workflows() 调用 - 迁移: 新增 m20260613_000170_drop_wechat_users.rs - 脚本: api_test_health_alert.py, api_test_mp.py, mpsync.sh/ps1 - E2E: health-data page, flows/ 目录 保留: erp-core/auth/workflow/message/config/plugin + 基座前端 + 通用组件
104 lines
2.9 KiB
YAML
104 lines
2.9 KiB
YAML
groups:
# ── System-level alerts ──
# Process-level memory and CPU thresholds for the HMS service.
- name: system
  rules:
  # Warning: resident memory has stayed above 800000000 bytes for 5 minutes.
  - alert: HMSHighMemoryUsage
    expr: process_resident_memory_bytes > 800000000
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'HMS 内存使用超过 800MB'
      description: '当前值: {{ $value | humanize }}B'

  # Critical: resident memory has stayed above 1000000000 bytes for 2 minutes.
  - alert: HMSHighMemoryCritical
    expr: process_resident_memory_bytes > 1000000000
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: 'HMS 内存使用超过 1GB(危险)'
      description: '当前值: {{ $value | humanize }}B'

  # Warning: the 5m rate of consumed CPU seconds has stayed above 0.8 for
  # 10 minutes. NOTE(review): 0.8 is CPU-seconds per second (0.8 of one core),
  # not 80% of the whole host on multi-core machines - confirm the intent.
  - alert: HMSHighCPU
    expr: rate(process_cpu_seconds_total[5m]) > 0.8
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: 'HMS CPU 使用率超过 80%'

# ── Application-level alerts ──
# HTTP error ratio, latency and scrape-availability alerts for the HMS API.
- name: application
  rules:
  # Critical: more than 5% of requests returned a 5xx status over the last
  # 5 minutes, sustained for 5 minutes.
  # Both sides are aggregated before dividing. Without aggregation PromQL
  # matches series one-to-one on identical label sets, so every 5xx series is
  # divided by itself and the "ratio" is always 1 whenever any 5xx traffic
  # exists. Grouping by (job, instance) keeps a per-instance error ratio.
  - alert: HMSHighErrorRate
    expr: >-
      sum by (job, instance) (rate(http_requests_total{status=~"5.."}[5m]))
      / sum by (job, instance) (rate(http_requests_total[5m]))
      > 0.05
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: 'API 5xx 错误率超过 5%'
      description: '当前错误率: {{ $value | humanizePercentage }}'

  # Warning: the 0.95 quantile of request duration has stayed above 2 seconds
  # for 10 minutes. The quantile is computed per label combination of the
  # bucket series (no aggregation is applied before histogram_quantile).
  - alert: HMSSlowResponses
    expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 2
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: '95% 请求响应时间超过 2 秒'

  # Critical: the scrape target with job="hms" has reported up == 0 for
  # 2 minutes.
  - alert: HMSInstanceDown
    expr: up{job="hms"} == 0
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: 'HMS 服务不可达'

# ── Database alerts ──
# PostgreSQL connection, replication and backup-freshness alerts.
- name: database
  rules:
  # Warning: pg_stat_activity_count has stayed above 80 for 5 minutes.
  # NOTE(review): the comparison is applied to each series individually; if
  # the exporter splits this metric by labels (e.g. database/state), no total
  # is computed here - confirm whether a sum() is intended.
  - alert: HMSPostgresConnectionsHigh
    expr: pg_stat_activity_count > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'PostgreSQL 活跃连接数超过 80'

  # Critical: pg_replication_lag has stayed above 30 for 5 minutes.
  # NOTE(review): presumably the metric is in seconds (per the summary text);
  # verify the metric name and unit against the deployed exporter.
  - alert: HMSPostgresReplicationLag
    expr: pg_replication_lag > 30
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: 'PostgreSQL 复制延迟超过 30 秒'

  # Critical: the last backup timestamp is more than 86400 * 2 seconds
  # (48 hours) behind the current time, sustained for 1 hour.
  # NOTE(review): if hms_last_backup_timestamp is never exported, the
  # expression yields no series and this alert stays silent.
  - alert: HMSBackupMissing
    expr: time() - hms_last_backup_timestamp > 86400 * 2
    for: 1h
    labels:
      severity: critical
    annotations:
      summary: '数据库备份超过 48 小时未执行'

# ── Redis alerts ──
# Redis memory-pressure and availability alerts.
- name: redis
  rules:
  # Warning: used memory has stayed above 90% of the configured maximum for
  # 5 minutes.
  # The denominator is filtered with "> 0" because redis_memory_max_bytes is
  # 0 when no maxmemory limit is configured; dividing by 0 yields +Inf, which
  # would otherwise keep this alert firing permanently on unlimited instances.
  - alert: HMSRedisMemoryHigh
    expr: redis_memory_used_bytes / (redis_memory_max_bytes > 0) > 0.9
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Redis 内存使用超过 90%'

  # Critical: redis_up has been 0 for 2 minutes.
  - alert: HMSRedisDown
    expr: redis_up == 0
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: 'Redis 服务不可达'