- Stripped 11 business crates (health, ai, dialysis, plugins)
- Removed business coupling from AppState, AppConfig, and main.rs
- Reduced migrations from 169 to 53 (base-only)
- Removed health_provider trait from erp-core
- Removed business integration tests
- Removed gateway rate limiting middleware
- Base capabilities: auth, RBAC, JWT, config, workflow, message, plugin, audit, crypto, RLS, multi-tenant

Cargo check: OK
Cargo test: OK
104 lines
2.9 KiB
YAML
104 lines
2.9 KiB
YAML
groups:
# ── System-level alerts ──
# None of the expressions in this group carries a label selector, so each
# rule is evaluated against every series of the metric that is scraped.
- name: system
  rules:
  # Warning tier: resident memory above 800,000,000 bytes, sustained 5 minutes.
  - alert: HMSHighMemoryUsage
    expr: process_resident_memory_bytes > 800000000
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "HMS 内存使用超过 800MB"
      description: "当前值: {{ $value | humanize }}B"

  # Critical tier: resident memory above 1,000,000,000 bytes, sustained
  # 2 minutes (higher threshold and shorter hold time than the warning tier).
  - alert: HMSHighMemoryCritical
    expr: process_resident_memory_bytes > 1000000000
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "HMS 内存使用超过 1GB(危险)"
      description: "当前值: {{ $value | humanize }}B"

  # CPU time consumed per second of wall-clock time, averaged over a
  # 5-minute window; fires once it has stayed above 0.8 for 10 minutes.
  - alert: HMSHighCPU
    expr: rate(process_cpu_seconds_total[5m]) > 0.8
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "HMS CPU 使用率超过 80%"
# ── Application-level alerts ──
- name: application
  rules:
  # Fraction of requests answered with a 5xx status over a 5-minute window.
  # Both sides are summed per job BEFORE dividing. Dividing the raw rate
  # vectors would match series one-to-one on identical label sets, so every
  # 5xx series would be divided by itself and the ratio would always be 1
  # whenever any 5xx traffic exists, regardless of the real error share.
  - alert: HMSHighErrorRate
    expr: >-
      sum by (job) (rate(http_requests_total{status=~"5.."}[5m]))
      /
      sum by (job) (rate(http_requests_total[5m]))
      > 0.05
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "API 5xx 错误率超过 5%"
      description: "当前错误率: {{ $value | humanizePercentage }}"

  # 95th-percentile request duration above 2 seconds for 10 minutes.
  # No aggregation is applied before histogram_quantile, so the quantile is
  # computed separately for each label combination of the bucket series.
  - alert: HMSSlowResponses
    expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 2
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "95% 请求响应时间超过 2 秒"

  # The scrape target(s) of job "hms" have reported up == 0 for 2 minutes.
  - alert: HMSInstanceDown
    expr: up{job="hms"} == 0
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "HMS 服务不可达"
# ── Database alerts ──
- name: database
  rules:
  # Total connection count per scraped PostgreSQL target above 80 for
  # 5 minutes. The metric is summed per (job, instance) so the threshold
  # applies to the total the summary describes rather than to each
  # individual series.
  # NOTE(review): assumes pg_stat_activity_count is split across several
  # label sets (e.g. per database / state) — confirm against the exporter.
  - alert: HMSPostgresConnectionsHigh
    expr: sum by (job, instance) (pg_stat_activity_count) > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "PostgreSQL 活跃连接数超过 80"

  # Replication lag above 30 for 5 minutes; the summary text treats the
  # value as seconds.
  - alert: HMSPostgresReplicationLag
    expr: pg_replication_lag > 30
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "PostgreSQL 复制延迟超过 30 秒"

  # Age of the last backup timestamp above 86400 * 2 seconds (48 hours),
  # sustained for 1 hour.
  # NOTE(review): if hms_last_backup_timestamp is not exported at all, the
  # expression returns no series and this alert stays silent — consider a
  # companion absent() rule if that case matters.
  - alert: HMSBackupMissing
    expr: time() - hms_last_backup_timestamp > 86400 * 2
    for: 1h
    labels:
      severity: critical
    annotations:
      summary: "数据库备份超过 48 小时未执行"
# ── Redis alerts ──
- name: redis
  rules:
  # Used memory above 90% of the configured maximum for 5 minutes.
  # The trailing guard drops series whose maximum is 0: dividing by zero
  # yields +Inf, which would otherwise satisfy "> 0.9" permanently.
  # NOTE(review): presumably a maximum of 0 means "no memory limit
  # configured" — confirm against the exporter in use.
  - alert: HMSRedisMemoryHigh
    expr: >-
      (redis_memory_used_bytes / redis_memory_max_bytes > 0.9)
      and
      (redis_memory_max_bytes > 0)
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Redis 内存使用超过 90%"

  # redis_up has reported 0 for 2 minutes.
  - alert: HMSRedisDown
    expr: redis_up == 0
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "Redis 服务不可达"