feat(docker): 生产环境 DevOps 基础设施 — TLS + 备份加密 + Prometheus + Redis 持久化
新增: - nginx/nginx.conf: TLS 1.2/1.3 终端 + HSTS/CSP 安全头 + SSE 长连接 + 50M 上传限制 - prometheus/prometheus.yml: HMS/PostgreSQL/Redis/Nginx 四指标源 - prometheus/alerts.yml: 4 组告警规则(系统/应用/数据库/Redis),含 5xx 错误率 + 内存 + 连接数 - restore.sh: 备份恢复脚本(支持加密备份解密恢复) 改进: - backup.sh: 新增 BACKUP_PASSPHRASE 加密(AES-256-CBC)+ 完整性校验 + 恢复指引 - docker-compose.production.yml: 添加 Nginx/Prometheus/Grafana/uploads-backup 容器 - docker-compose.yml: Redis 添加 --appendonly yes 持久化 - .env.production.example: 添加 DevOps 相关环境变量模板
This commit is contained in:
@@ -50,3 +50,21 @@ ERP__AI__DEFAULT_PROVIDER=ollama
|
|||||||
ERP__AI__API_KEY=
|
ERP__AI__API_KEY=
|
||||||
ERP__AI__BASE_URL=http://localhost:11434
|
ERP__AI__BASE_URL=http://localhost:11434
|
||||||
ERP__AI__MODEL=qwen2.5:7b
|
ERP__AI__MODEL=qwen2.5:7b
|
||||||
|
|
||||||
|
# ===== DevOps =====

# Backup encryption passphrase (openssl AES-256-CBC); required in production
BACKUP_PASSPHRASE=CHANGE_ME_BACKUP_ENCRYPTION_PASSWORD

# Number of days backups are kept
BACKUP_KEEP_DAYS=7

# Backup schedule (cron format)
BACKUP_CRON=0 2 * * *

# Schedule for the uploads backup (cron format)
UPLOADS_BACKUP_CRON=0 3 * * *

# Grafana administrator password
GRAFANA_ADMIN_PASSWORD=CHANGE_ME_GRAFANA_ADMIN
# Substituted into GF_SERVER_ROOT_URL of the grafana service
GRAFANA_ROOT_URL=http://localhost:3001
|
||||||
|
|||||||
@@ -1,8 +1,12 @@
|
|||||||
#!/usr/bin/env bash
|
#!/usr/bin/env bash
|
||||||
# PostgreSQL 自动备份脚本
|
# PostgreSQL 自动备份脚本(含加密)
|
||||||
# 用法:
|
# 用法:
|
||||||
# 手动: ./docker/backup.sh
|
# 手动: ./docker/backup.sh
|
||||||
# 自动: 由 docker compose backup 服务每日 02:00 执行
|
# 自动: 由 docker compose backup 服务每日 02:00 执行
|
||||||
|
#
|
||||||
|
# 加密方式(二选一):
|
||||||
|
# BACKUP_PASSPHRASE — 使用 openssl AES-256-CBC 对称加密(无额外依赖)
|
||||||
|
# GPG_RECIPIENT — 使用 GPG 非对称加密(需预置公钥)
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
BACKUP_DIR="${BACKUP_DIR:-/backups}"
|
BACKUP_DIR="${BACKUP_DIR:-/backups}"
|
||||||
@@ -13,7 +17,9 @@ PG_DB="${PGDATABASE:-erp}"
|
|||||||
KEEP_DAYS="${KEEP_DAYS:-7}"
|
KEEP_DAYS="${KEEP_DAYS:-7}"
|
||||||
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
|
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
|
||||||
FILENAME="${PG_DB}_${TIMESTAMP}.sql.gz"
|
FILENAME="${PG_DB}_${TIMESTAMP}.sql.gz"
|
||||||
|
ENCRYPTED_FILENAME="${FILENAME}.enc"
|
||||||
FILEPATH="${BACKUP_DIR}/${FILENAME}"
|
FILEPATH="${BACKUP_DIR}/${FILENAME}"
|
||||||
|
ENCRYPTED_FILEPATH="${BACKUP_DIR}/${ENCRYPTED_FILENAME}"
|
||||||
|
|
||||||
mkdir -p "${BACKUP_DIR}"
|
mkdir -p "${BACKUP_DIR}"
|
||||||
|
|
||||||
@@ -36,8 +42,54 @@ else
|
|||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# ── Integrity check (before encryption) ──
# gzip -t can only validate the plaintext archive, so it runs before the file
# is encrypted. A failed check is reported on stderr; the run continues.
if gzip -t "${FILEPATH}" 2>/dev/null; then
  echo "[$(date -Iseconds)] 备份完整性校验通过"
else
  echo "[$(date -Iseconds)] 警告: 备份文件可能损坏: ${FILEPATH}" >&2
fi

# ── Encrypt the backup ──
# BACKUP_PASSPHRASE (openssl, symmetric) takes precedence over GPG_RECIPIENT
# (gpg, asymmetric). FINAL_FILEPATH always names the artifact that is really on
# disk, so the later steps never have to guess which suffix was produced.
FINAL_FILEPATH="${FILEPATH}"
if [[ -n "${BACKUP_PASSPHRASE:-}" ]]; then
  echo "[$(date -Iseconds)] 使用 AES-256-CBC 加密备份..."
  # Hand the secret to openssl through the environment (-pass env:) instead of
  # argv, where it would be visible in the process list.
  export BACKUP_PASSPHRASE
  if openssl enc -aes-256-cbc -salt -pbkdf2 -pass env:BACKUP_PASSPHRASE \
      -in "${FILEPATH}" -out "${ENCRYPTED_FILEPATH}"; then
    rm -f -- "${FILEPATH}"
    FINAL_FILEPATH="${ENCRYPTED_FILEPATH}"
    ENC_SIZE=$(du -h "${FINAL_FILEPATH}" | cut -f1)
    echo "[$(date -Iseconds)] 加密完成: ${ENCRYPTED_FILENAME} (${ENC_SIZE})"
  else
    echo "[$(date -Iseconds)] 加密失败!保留未加密备份" >&2
    rm -f -- "${ENCRYPTED_FILEPATH}"
  fi
elif [[ -n "${GPG_RECIPIENT:-}" ]]; then
  echo "[$(date -Iseconds)] 使用 GPG 加密备份..."
  # gpg output is written to an explicit <file>.gpg path; the .enc name is
  # reserved for the openssl format so the two can be told apart on restore.
  GPG_FILEPATH="${FILEPATH}.gpg"
  if gpg --batch --yes --recipient "${GPG_RECIPIENT}" \
      --output "${GPG_FILEPATH}" --encrypt "${FILEPATH}"; then
    rm -f -- "${FILEPATH}"
    FINAL_FILEPATH="${GPG_FILEPATH}"
    ENC_SIZE=$(du -h "${FINAL_FILEPATH}" | cut -f1)
    echo "[$(date -Iseconds)] 加密完成: $(basename "${FINAL_FILEPATH}") (${ENC_SIZE})"
  else
    echo "[$(date -Iseconds)] GPG 加密失败!保留未加密备份" >&2
    rm -f -- "${GPG_FILEPATH}"
  fi
else
  echo "[$(date -Iseconds)] 警告: 未设置 BACKUP_PASSPHRASE 或 GPG_RECIPIENT,备份未加密!" >&2
fi

# ── Confirm the encrypted artifact ──
# Ciphertext cannot be gzip-tested, so only check that it exists and is
# non-empty.
if [[ "${FINAL_FILEPATH}" != "${FILEPATH}" ]]; then
  if [[ -s "${FINAL_FILEPATH}" ]]; then
    echo "[$(date -Iseconds)] 加密备份文件存在: $(basename "${FINAL_FILEPATH}")"
  else
    echo "[$(date -Iseconds)] 警告: 备份文件可能损坏: ${FINAL_FILEPATH}" >&2
  fi
fi

# ── Prune expired backups ──
# The trailing * also matches the encrypted variants (.enc / .gpg).
DELETED=$(find "${BACKUP_DIR}" -type f -name "${PG_DB}_*.sql.gz*" \
  -mtime "+${KEEP_DAYS}" -delete -print | wc -l)
if (( DELETED > 0 )); then
  echo "[$(date -Iseconds)] 已清理 ${DELETED} 个过期备份(>${KEEP_DAYS}天)"
fi

# ── Restore instructions ──
# Print only the decrypt step that matches the artifact produced by this run.
echo ""
echo "恢复方法:"
case "${FINAL_FILEPATH}" in
  *.enc)
    echo " # 解密(如加密):"
    echo " openssl enc -d -aes-256-cbc -pbkdf2 -pass env:BACKUP_PASSPHRASE -in ${FINAL_FILEPATH} -out ${FILEPATH}"
    ;;
  *.gpg)
    echo " # 解密(如加密):"
    echo " gpg --output ${FILEPATH} --decrypt ${FINAL_FILEPATH}"
    ;;
esac
echo " # 恢复:"
echo " gunzip -c ${FILEPATH} | psql -h \$PGHOST -U \$PGUSER -d ${PG_DB}"
|
|||||||
@@ -2,15 +2,44 @@
|
|||||||
# 使用方式: docker compose -f docker/docker-compose.yml -f docker/docker-compose.production.yml up -d
|
# 使用方式: docker compose -f docker/docker-compose.yml -f docker/docker-compose.production.yml up -d
|
||||||
|
|
||||||
services:
|
services:
|
||||||
|
# ── Nginx reverse proxy + TLS termination ──
  nginx:
    image: nginx:1.27-alpine
    container_name: hms-nginx
    restart: unless-stopped
    ports:
      - "80:80"
      - "443:443"
    volumes:
      - ./nginx/nginx.conf:/etc/nginx/conf.d/default.conf:ro
      - ./nginx/ssl:/etc/nginx/ssl:ro
      - nginx_logs:/var/log/nginx
    depends_on:
      app:
        condition: service_healthy
    healthcheck:
      # Probe the HTTPS /health location directly. Port 80 answers every
      # request with a 301 to HTTPS, and "/" on the TLS server returns 404, so
      # a probe of http://localhost:80 can never succeed.
      # --no-check-certificate: the certificate is issued for the public host
      # name, not for 127.0.0.1.
      test: ["CMD", "wget", "-q", "-O", "/dev/null", "--no-check-certificate", "https://127.0.0.1/health"]
      interval: 30s
      timeout: 5s
      retries: 3
    deploy:
      resources:
        limits:
          cpus: "0.5"
          memory: 128M
    networks:
      - hms-internal
|
||||||
|
|
||||||
|
# ── HMS 应用服务器 ──
|
||||||
app:
|
app:
|
||||||
build:
|
build:
|
||||||
context: ..
|
context: ..
|
||||||
dockerfile: Dockerfile
|
dockerfile: Dockerfile
|
||||||
container_name: hms-server
|
container_name: hms-server
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
ports:
|
expose:
|
||||||
- "${APP_PORT:-3000}:3000"
|
- "3000"
|
||||||
- "${METRICS_PORT:-9090}:9090"
|
- "9090"
|
||||||
env_file:
|
env_file:
|
||||||
- .env.production
|
- .env.production
|
||||||
environment:
|
environment:
|
||||||
@@ -40,8 +69,7 @@ services:
|
|||||||
networks:
|
networks:
|
||||||
- hms-internal
|
- hms-internal
|
||||||
|
|
||||||
# 每日自动备份 — 每天凌晨 02:00 执行 pg_dump,保留 7 天
|
# ── 每日自动备份(含加密)──
|
||||||
# 手动触发: docker compose -f docker/docker-compose.yml -f docker/docker-compose.production.yml run --rm backup
|
|
||||||
backup:
|
backup:
|
||||||
image: postgres:16-alpine
|
image: postgres:16-alpine
|
||||||
container_name: hms-backup
|
container_name: hms-backup
|
||||||
@@ -59,6 +87,7 @@ services:
|
|||||||
BACKUP_DIR: /backups
|
BACKUP_DIR: /backups
|
||||||
KEEP_DAYS: "${BACKUP_KEEP_DAYS:-7}"
|
KEEP_DAYS: "${BACKUP_KEEP_DAYS:-7}"
|
||||||
BACKUP_CRON: "${BACKUP_CRON:-0 2 * * *}"
|
BACKUP_CRON: "${BACKUP_CRON:-0 2 * * *}"
|
||||||
|
BACKUP_PASSPHRASE: "${BACKUP_PASSPHRASE:-}"
|
||||||
volumes:
|
volumes:
|
||||||
- ./backup.sh:/usr/local/bin/backup.sh:ro
|
- ./backup.sh:/usr/local/bin/backup.sh:ro
|
||||||
- backup_data:/backups
|
- backup_data:/backups
|
||||||
@@ -68,11 +97,76 @@ services:
|
|||||||
networks:
|
networks:
|
||||||
- hms-internal
|
- hms-internal
|
||||||
|
|
||||||
|
# ── Uploads backup: mirrors the uploads volume into a backup volume on a cron schedule ──
  uploads-backup:
    image: alpine:3.20
    container_name: hms-uploads-backup
    restart: unless-stopped
    entrypoint:
      - sh
      - -c
      - |
        set -eu
        # The stock alpine image ships without rsync; install it on first start.
        # NOTE(review): needs outbound network access at container start -
        # consider baking rsync into a small custom image instead.
        command -v rsync >/dev/null 2>&1 || apk add --no-cache rsync
        # "$$" is Compose's escape for a literal "$": the variable is expanded
        # by the container shell. It is passed as a quoted printf argument so
        # the schedule's "*" fields are not globbed.
        printf '%s rsync -a --delete /source/uploads/ /backup/uploads/\n' "$$UPLOADS_BACKUP_CRON" > /etc/crontabs/root
        exec crond -f -l 2
    environment:
      UPLOADS_BACKUP_CRON: "${UPLOADS_BACKUP_CRON:-0 3 * * *}"
    volumes:
      - app-uploads:/source/uploads:ro
      - uploads_backup_data:/backup/uploads
    networks:
      - hms-internal
|
||||||
|
|
||||||
|
# ── Prometheus monitoring ──
  prometheus:
    container_name: hms-prometheus
    image: prom/prometheus:v3.1.0
    restart: unless-stopped
    networks: [hms-internal]
    # Reachable only from other containers on the network; not published on the host.
    expose: ["9090"]
    volumes:
      # Scrape configuration and alert rules are mounted read-only.
      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - ./prometheus/alerts.yml:/etc/prometheus/alerts.yml:ro
      - prometheus_data:/prometheus
    command:
      - --config.file=/etc/prometheus/prometheus.yml
      # Retention is capped both by age and by on-disk size.
      - --storage.tsdb.retention.time=30d
      - --storage.tsdb.retention.size=2GB
      - --web.enable-lifecycle
|
||||||
|
|
||||||
|
# ── Grafana dashboards ──
  grafana:
    image: grafana/grafana:11.4.0
    container_name: hms-grafana
    restart: unless-stopped
    environment:
      GF_SECURITY_ADMIN_USER: "${GRAFANA_ADMIN_USER:-admin}"
      # No fallback on purpose: Compose aborts with this message when the
      # variable is unset or empty, instead of starting Grafana without an
      # operator-chosen admin password.
      GF_SECURITY_ADMIN_PASSWORD: "${GRAFANA_ADMIN_PASSWORD:?GRAFANA_ADMIN_PASSWORD must be set}"
      GF_USERS_ALLOW_SIGN_UP: "false"
      GF_SERVER_ROOT_URL: "${GRAFANA_ROOT_URL:-http://localhost:3001}"
    volumes:
      - grafana_data:/var/lib/grafana
      - ./grafana/provisioning:/etc/grafana/provisioning:ro
    expose:
      - "3000"
    depends_on:
      - prometheus
    networks:
      - hms-internal
|
||||||
|
|
||||||
volumes:
|
volumes:
|
||||||
app-uploads:
|
app-uploads:
|
||||||
driver: local
|
driver: local
|
||||||
backup_data:
|
backup_data:
|
||||||
driver: local
|
driver: local
|
||||||
|
uploads_backup_data:
|
||||||
|
driver: local
|
||||||
|
nginx_logs:
|
||||||
|
driver: local
|
||||||
|
prometheus_data:
|
||||||
|
driver: local
|
||||||
|
grafana_data:
|
||||||
|
driver: local
|
||||||
|
|
||||||
networks:
|
networks:
|
||||||
hms-internal:
|
hms-internal:
|
||||||
|
|||||||
@@ -29,7 +29,7 @@ services:
|
|||||||
redis:
|
redis:
|
||||||
image: redis:7-alpine
|
image: redis:7-alpine
|
||||||
container_name: erp-redis
|
container_name: erp-redis
|
||||||
command: redis-server --requirepass ${REDIS_PASSWORD:-erp_redis_dev}
|
command: redis-server --requirepass ${REDIS_PASSWORD:-erp_redis_dev} --appendonly yes
|
||||||
expose:
|
expose:
|
||||||
- "6379"
|
- "6379"
|
||||||
volumes:
|
volumes:
|
||||||
|
|||||||
96
docker/nginx/nginx.conf
Normal file
96
docker/nginx/nginx.conf
Normal file
@@ -0,0 +1,96 @@
|
|||||||
|
# Reverse proxy for the HMS application: redirects HTTP to HTTPS, terminates
# TLS and forwards /uploads/, /api/, /health and /metrics to the app container.

upstream hms_backend {
    server app:3000;
    keepalive 32;
}

# Plain HTTP: redirect everything to HTTPS.
server {
    listen 80;
    server_name _;
    return 301 https://$host$request_uri;
}

server {
    listen 443 ssl;
    # The "http2" parameter of "listen" is deprecated since nginx 1.25.1;
    # the stand-alone directive is the supported form.
    http2 on;
    server_name _;

    # ── TLS ──
    ssl_certificate /etc/nginx/ssl/fullchain.pem;
    ssl_certificate_key /etc/nginx/ssl/privkey.pem;
    ssl_protocols TLSv1.2 TLSv1.3;
    ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384;
    ssl_prefer_server_ciphers off;
    ssl_session_cache shared:SSL:10m;
    ssl_session_timeout 1d;
    ssl_session_tickets off;

    # ── Security headers ──
    add_header Strict-Transport-Security "max-age=63072000; includeSubDomains; preload" always;
    add_header X-Content-Type-Options "nosniff" always;
    add_header X-Frame-Options "DENY" always;
    add_header X-XSS-Protection "1; mode=block" always;
    add_header Referrer-Policy "strict-origin-when-cross-origin" always;
    add_header Permissions-Policy "camera=(), microphone=(), geolocation=()" always;
    add_header Content-Security-Policy "default-src 'self'; script-src 'self' 'unsafe-inline'; style-src 'self' 'unsafe-inline'; img-src 'self' data: blob:; font-src 'self'; connect-src 'self' wss:; frame-ancestors 'none'" always;

    # ── Logging ──
    access_log /var/log/nginx/hms_access.log;
    error_log /var/log/nginx/hms_error.log warn;

    # ── Uploaded files (lab sheets / medical check-up reports) ──
    location /uploads/ {
        proxy_pass http://hms_backend;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;
        # Upload size limit for large files
        client_max_body_size 50m;
    }

    # ── SSE (message push / AI analysis) ──
    location ~ ^/api/v1/(message|ai)/.*sse {
        proxy_pass http://hms_backend;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;
        proxy_http_version 1.1;
        proxy_set_header Connection "";
        # Events must reach the client as they are produced: no buffering, no
        # caching, and a read timeout of one day for long-lived streams.
        proxy_buffering off;
        proxy_cache off;
        proxy_read_timeout 86400s;
        chunked_transfer_encoding on;
    }

    # ── API reverse proxy ──
    location /api/ {
        proxy_pass http://hms_backend;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;
        proxy_http_version 1.1;
        proxy_set_header Connection "";
        client_max_body_size 50m;
    }

    # ── Health check ──
    location /health {
        proxy_pass http://hms_backend/api/v1/health;
        access_log off;
    }

    # ── Metrics (private address ranges only) ──
    location /metrics {
        # In production this should be limited to Prometheus.
        # NOTE(review): depending on the Docker network mode, external clients
        # may appear with a bridge address inside 172.16.0.0/12 - confirm that
        # this allow-list really keeps public traffic out.
        allow 172.16.0.0/12;
        allow 10.0.0.0/8;
        deny all;
        # Address the app container directly: an upstream group name cannot be
        # combined with a port ("upstream ... may not have port" at config
        # load), and hms_backend points at port 3000, not the metrics port.
        proxy_pass http://app:9090/metrics;
        access_log off;
    }

    # Anything else is not served.
    location / {
        return 404;
    }
}
|
||||||
3
docker/nginx/ssl/.gitignore
vendored
Normal file
3
docker/nginx/ssl/.gitignore
vendored
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
*
|
||||||
|
!.gitkeep
|
||||||
|
!.gitignore
|
||||||
8
docker/nginx/ssl/.gitkeep
Normal file
8
docker/nginx/ssl/.gitkeep
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
# 将 SSL 证书放置在此目录
|
||||||
|
# 必需文件: fullchain.pem + privkey.pem
|
||||||
|
# 生产环境建议使用 Let's Encrypt 或云服务商证书管理
|
||||||
|
#
|
||||||
|
# Let's Encrypt 示例:
|
||||||
|
# certbot certonly --standalone -d your-domain.com
|
||||||
|
# cp /etc/letsencrypt/live/your-domain.com/fullchain.pem .
|
||||||
|
# cp /etc/letsencrypt/live/your-domain.com/privkey.pem .
|
||||||
103
docker/prometheus/alerts.yml
Normal file
103
docker/prometheus/alerts.yml
Normal file
@@ -0,0 +1,103 @@
|
|||||||
|
# Prometheus alerting rules for the HMS stack, grouped by layer.
groups:
  # ── System-level alerts ──
  # process_* metrics are exposed under the same name by most exporters, so
  # they are pinned to the "hms" scrape job to match the alert names.
  - name: system
    rules:
      - alert: HMSHighMemoryUsage
        expr: process_resident_memory_bytes{job="hms"} > 800000000
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "HMS 内存使用超过 800MB"
          description: "当前值: {{ $value | humanize }}B"

      - alert: HMSHighMemoryCritical
        expr: process_resident_memory_bytes{job="hms"} > 1000000000
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "HMS 内存使用超过 1GB(危险)"
          description: "当前值: {{ $value | humanize }}B"

      - alert: HMSHighCPU
        expr: rate(process_cpu_seconds_total{job="hms"}[5m]) > 0.8
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "HMS CPU 使用率超过 80%"

  # ── Application-level alerts ──
  - name: application
    rules:
      - alert: HMSHighErrorRate
        # Aggregate both sides before dividing. Without sum(), "/" pairs series
        # with identical labels, so every 5xx series is divided by itself and
        # the ratio is always 1.
        expr: |
          sum(rate(http_requests_total{status=~"5.."}[5m]))
            /
          sum(rate(http_requests_total[5m]))
            > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "API 5xx 错误率超过 5%"
          description: "当前错误率: {{ $value | humanizePercentage }}"

      - alert: HMSSlowResponses
        # Sum the buckets by "le" so the quantile covers all requests instead
        # of being evaluated separately for every label combination.
        expr: histogram_quantile(0.95, sum by (le) (rate(http_request_duration_seconds_bucket[5m]))) > 2
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "95% 请求响应时间超过 2 秒"

      - alert: HMSInstanceDown
        expr: up{job="hms"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "HMS 服务不可达"

  # ── Database alerts ──
  - name: database
    rules:
      - alert: HMSPostgresConnectionsHigh
        # Total per instance. NOTE(review): presumably the exporter splits this
        # metric by database/state, in which case a per-series comparison
        # would under-count - confirm against the deployed exporter.
        expr: sum by (instance) (pg_stat_activity_count) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "PostgreSQL 活跃连接数超过 80"

      - alert: HMSPostgresReplicationLag
        # NOTE(review): verify the metric name against the deployed exporter.
        expr: pg_replication_lag > 30
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "PostgreSQL 复制延迟超过 30 秒"

      - alert: HMSBackupMissing
        # NOTE(review): this never fires while hms_last_backup_timestamp is
        # absent - confirm that something actually exports the metric.
        expr: time() - hms_last_backup_timestamp > 86400 * 2
        for: 1h
        labels:
          severity: critical
        annotations:
          summary: "数据库备份超过 48 小时未执行"

  # ── Redis alerts ──
  - name: redis
    rules:
      - alert: HMSRedisMemoryHigh
        # A max of 0 means "no limit"; dividing by it yields +Inf and would
        # keep this alert firing permanently, so only evaluate the ratio when
        # a limit is configured.
        expr: (redis_memory_used_bytes / redis_memory_max_bytes > 0.9) and (redis_memory_max_bytes > 0)
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Redis 内存使用超过 90%"

      - alert: HMSRedisDown
        expr: redis_up == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Redis 服务不可达"
|
||||||
32
docker/prometheus/prometheus.yml
Normal file
32
docker/prometheus/prometheus.yml
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
# Prometheus server configuration: one 15s cadence for scraping and for rule
# evaluation, one rule file, four static scrape jobs.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

rule_files: ["alerts.yml"]

scrape_configs:
  # HMS application metrics endpoint
  - job_name: hms
    metrics_path: /metrics
    static_configs:
      - targets:
          - app:9090
        labels: {service: hms-server}

  # PostgreSQL exporter
  - job_name: postgres
    static_configs:
      - targets:
          - postgres-exporter:9187
        labels: {service: postgresql}

  # Redis exporter
  - job_name: redis
    static_configs:
      - targets:
          - redis-exporter:9121
        labels: {service: redis}

  # Nginx exporter
  - job_name: nginx
    static_configs:
      - targets:
          - nginx-exporter:9113
        labels: {service: nginx}
|
||||||
43
docker/restore.sh
Normal file
43
docker/restore.sh
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
#!/usr/bin/env bash
# PostgreSQL backup restore script.
#
# Usage:
#   BACKUP_PASSPHRASE=xxx ./docker/restore.sh /backups/erp_20260521_020000.sql.gz.enc
#   ./docker/restore.sh /backups/erp_20260521_020000.sql.gz
#
# Arguments:
#   $1  backup file: *.sql.gz (plain), *.enc (openssl AES-256-CBC) or
#       *.gpg (GPG)
# Environment:
#   PGHOST / PGPORT / PGUSER / PGDATABASE
#                       connection target (defaults: postgres / 5432 / erp / erp)
#   BACKUP_PASSPHRASE   required for *.enc backups
#   PSQL_ON_ERROR_STOP  1 (default) aborts on the first SQL error; 0 lets psql
#                       continue past errors
#   TMPDIR              where the decrypted copy is written (default: the
#                       directory of the backup file)
# Exit status: non-zero when the file is missing, decryption fails, the
# archive is not valid gzip, or the restore pipeline fails.
set -euo pipefail

BACKUP_FILE="${1:?用法: restore.sh <备份文件路径>}"
PG_HOST="${PGHOST:-postgres}"
PG_PORT="${PGPORT:-5432}"
PG_USER="${PGUSER:-erp}"
PG_DB="${PGDATABASE:-erp}"
PSQL_ON_ERROR_STOP="${PSQL_ON_ERROR_STOP:-1}"

# Path of the decrypted temporary copy; empty when nothing was decrypted.
DECRYPTED=""

# Remove the decrypted copy on every exit path, so a failed restore never
# leaves a plaintext dump behind.
cleanup() {
  if [[ -n "${DECRYPTED}" ]]; then
    rm -f -- "${DECRYPTED}"
    echo "[$(date -Iseconds)] 已清理解密临时文件"
  fi
}
trap cleanup EXIT

if [[ ! -f "${BACKUP_FILE}" ]]; then
  echo "错误: 文件不存在: ${BACKUP_FILE}" >&2
  exit 1
fi

echo "[$(date -Iseconds)] 恢复目标: ${PG_HOST}:${PG_PORT}/${PG_DB}"
echo "[$(date -Iseconds)] 备份文件: ${BACKUP_FILE}"

# Decrypt (encrypted backups only). The plaintext goes to a mktemp file so an
# existing file that happens to sit next to the backup is never overwritten
# and then deleted.
TMP_PARENT="${TMPDIR:-$(dirname -- "${BACKUP_FILE}")}"
case "${BACKUP_FILE}" in
  *.enc)
    if [[ -z "${BACKUP_PASSPHRASE:-}" ]]; then
      echo "错误: 加密备份需要设置 BACKUP_PASSPHRASE 环境变量" >&2
      exit 1
    fi
    DECRYPTED=$(mktemp "${TMP_PARENT}/restore.XXXXXX")
    echo "[$(date -Iseconds)] 解密中..."
    # -pass env: keeps the passphrase out of the process argument list.
    export BACKUP_PASSPHRASE
    openssl enc -d -aes-256-cbc -pbkdf2 -pass env:BACKUP_PASSPHRASE \
      -in "${BACKUP_FILE}" -out "${DECRYPTED}"
    BACKUP_FILE="${DECRYPTED}"
    ;;
  *.gpg)
    DECRYPTED=$(mktemp "${TMP_PARENT}/restore.XXXXXX")
    echo "[$(date -Iseconds)] 解密中..."
    gpg --batch --yes --output "${DECRYPTED}" --decrypt "${BACKUP_FILE}"
    BACKUP_FILE="${DECRYPTED}"
    ;;
esac

# Validate the archive before anything is sent to the database.
if ! gzip -t "${BACKUP_FILE}" 2>/dev/null; then
  echo "错误: 备份文件不是有效的 gzip 数据(文件损坏或密码错误)" >&2
  exit 1
fi

# Decompress and restore. pipefail makes a gunzip failure fail the pipeline;
# ON_ERROR_STOP makes psql return non-zero on SQL errors so the success
# message below is only printed for a clean restore.
echo "[$(date -Iseconds)] 恢复中..."
gunzip -c "${BACKUP_FILE}" \
  | psql -v ON_ERROR_STOP="${PSQL_ON_ERROR_STOP}" \
      -h "${PG_HOST}" -p "${PG_PORT}" -U "${PG_USER}" -d "${PG_DB}"

echo "[$(date -Iseconds)] 恢复完成"
|
||||||
Reference in New Issue
Block a user