PP-04 核实属实:11 条告警规则已在 prometheus 加载,但没有 alertmanager(告警无通知出口),grafana provisioning 目录为空,exporter 服务也未部署("配置齐全,运行为零")。

MVP:打通告警链路,并让 grafana 可用(不依赖 exporter,基于 app metrics):
- docker-compose.production.yml 加 alertmanager 服务 + alertmanager_data 卷
- prometheus.yml 加 alerting 指向 alertmanager:9093
- alertmanager/config.yml 路由(SEV-1 critical 即时通知 + 分组)
- grafana/provisioning/datasources 自动连 prometheus
- grafana/provisioning/dashboards provider 就绪

待办(上线前):
① alertmanager 占位 webhook 替换为真实渠道(钉钉/企微/邮件)
② 补 grafana dashboard JSON
③ 部署 postgres/redis/nginx exporter,让 prometheus 抓得到
193 lines
5.1 KiB
YAML
193 lines
5.1 KiB
YAML
# 生产环境 Docker Compose 配置
|
||
# 使用方式: docker compose -f docker/docker-compose.yml -f docker/docker-compose.production.yml up -d
|
||
|
||
services:
|
||
# ── Nginx reverse proxy + TLS termination ──
  nginx:
    image: nginx:1.27-alpine
    container_name: hms-nginx
    restart: unless-stopped
    # Only nginx publishes ports on the host; every other service in this
    # file uses `expose` and is reachable on the internal network only.
    ports:
      - "80:80"
      - "443:443"
    volumes:
      # Site config and certificates are mounted read-only from the repo.
      - ./nginx/nginx.conf:/etc/nginx/conf.d/default.conf:ro
      - ./nginx/ssl:/etc/nginx/ssl:ro
      - nginx_logs:/var/log/nginx
    # Do not start proxying until the app's own healthcheck passes.
    depends_on:
      app:
        condition: service_healthy
    healthcheck:
      # Probe the IPv4 loopback explicitly instead of `localhost`: busybox
      # wget in the Alpine image may resolve `localhost` to ::1 first, and
      # whether the mounted nginx.conf listens on IPv6 is not visible here.
      # Using 127.0.0.1 avoids a false "unhealthy" in that case.
      test: ["CMD", "wget", "--spider", "-q", "http://127.0.0.1:80"]
      interval: 30s
      timeout: 5s
      retries: 3
    deploy:
      resources:
        limits:
          cpus: "0.5"
          memory: 128M
    networks:
      - hms-internal
|
||
|
||
# ── HMS application server ──
  app:
    container_name: hms-server
    restart: unless-stopped
    # Built from the Dockerfile in the parent directory of the build context
    # root used by this compose project.
    build:
      context: ..
      dockerfile: Dockerfile
    env_file:
      - .env.production
    environment:
      # Connection URLs are assembled from compose-level variables; user,
      # port and database name fall back to defaults, passwords do not.
      ERP__DATABASE__URL: "postgres://${POSTGRES_USER:-erp}:${POSTGRES_PASSWORD}@postgres:${POSTGRES_PORT:-5432}/${POSTGRES_DB:-erp}"
      ERP__REDIS__URL: "redis://:${REDIS_PASSWORD}@redis:${REDIS_PORT:-6379}"
    # Internal-only ports. 3000 serves the HTTP API (see healthcheck below);
    # 9090 is presumably the metrics endpoint scraped by Prometheus — confirm.
    expose:
      - "3000"
      - "9090"
    volumes:
      - app-uploads:/app/uploads
    # Wait for both backing stores to report healthy before starting.
    depends_on:
      postgres:
        condition: service_healthy
      redis:
        condition: service_healthy
    healthcheck:
      test:
        - "CMD"
        - "curl"
        - "-f"
        - "http://localhost:3000/api/v1/health"
      interval: 30s
      timeout: 5s
      # Failures during the first 60s after start do not count as retries.
      start_period: 60s
      retries: 3
    deploy:
      resources:
        reservations:
          cpus: "0.5"
          memory: 256M
        limits:
          cpus: "2"
          memory: 1024M
    networks:
      - hms-internal
|
||
|
||
# ── Daily automatic database backup (with encryption) ──
  backup:
    image: postgres:16-alpine
    container_name: hms-backup
    restart: unless-stopped
    # List-form entrypoint so the script reaches `sh -c` verbatim.
    # `$$` is Compose's escape for a literal `$`, so BACKUP_CRON is expanded
    # by the shell inside the container, not by Compose.
    # The expansion MUST be inside double quotes: single quotes would write
    # the literal text `$BACKUP_CRON` into the crontab (invalid schedule),
    # and no quotes would let the shell glob-expand the `*` fields.
    entrypoint:
      - sh
      - "-c"
      - |
        set -eu
        echo "$$BACKUP_CRON /usr/local/bin/backup.sh" > /etc/crontabs/root
        exec crond -f -l 2
    environment:
      PGHOST: postgres
      PGPORT: "${POSTGRES_PORT:-5432}"
      PGUSER: "${POSTGRES_USER:-erp}"
      PGDATABASE: "${POSTGRES_DB:-erp}"
      BACKUP_DIR: /backups
      KEEP_DAYS: "${BACKUP_KEEP_DAYS:-7}"
      # Default schedule: every day at 02:00 (container time).
      BACKUP_CRON: "${BACKUP_CRON:-0 2 * * *}"
      # NOTE(review): defaults to empty; how backup.sh behaves with an empty
      # passphrase (skip encryption vs. fail) is not visible here — confirm.
      BACKUP_PASSPHRASE: "${BACKUP_PASSPHRASE:-}"
    volumes:
      - ./backup.sh:/usr/local/bin/backup.sh:ro
      - backup_data:/backups
    depends_on:
      postgres:
        condition: service_healthy
    networks:
      - hms-internal
|
||
|
||
# ── Uploads file backup (synced to the host via the uploads_backup_data volume) ──
  uploads-backup:
    image: alpine:3.20
    container_name: hms-uploads-backup
    restart: unless-stopped
    # List-form entrypoint so the script reaches `sh -c` verbatim.
    # - The stock alpine image does not ship rsync, so install it on start
    #   if it is missing (needs package-repository access at container start).
    # - `$$` is Compose's escape for a literal `$`; the variable is expanded
    #   by the container shell. It must be inside double quotes: single
    #   quotes would write the literal `$UPLOADS_BACKUP_CRON` into the
    #   crontab, and no quotes would glob-expand the `*` schedule fields.
    # - `rsync --delete` mirrors the source: files removed from uploads are
    #   also removed from the backup copy on the next run.
    entrypoint:
      - sh
      - "-c"
      - |
        set -eu
        command -v rsync >/dev/null 2>&1 || apk add --no-cache rsync
        echo "$$UPLOADS_BACKUP_CRON rsync -a --delete /source/uploads/ /backup/uploads/" > /etc/crontabs/root
        exec crond -f -l 2
    environment:
      # Default schedule: every day at 03:00 (container time).
      UPLOADS_BACKUP_CRON: "${UPLOADS_BACKUP_CRON:-0 3 * * *}"
    volumes:
      # Source is mounted read-only so the backup job cannot modify uploads.
      - app-uploads:/source/uploads:ro
      - uploads_backup_data:/backup/uploads
    networks:
      - hms-internal
|
||
|
||
# ── Prometheus monitoring ──
  prometheus:
    container_name: hms-prometheus
    image: prom/prometheus:v3.1.0
    restart: unless-stopped
    # Reachable on the internal network only; no host port is published.
    expose:
      - "9090"
    networks:
      - hms-internal
    volumes:
      # Scrape config and alert rules are mounted read-only from the repo;
      # the named volume is mounted at /prometheus.
      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - ./prometheus/alerts.yml:/etc/prometheus/alerts.yml:ro
      - prometheus_data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      # Retention is bounded by both age and size.
      - '--storage.tsdb.retention.time=30d'
      - '--storage.tsdb.retention.size=2GB'
      # Enables the HTTP lifecycle endpoints (e.g. config reload).
      - '--web.enable-lifecycle'
|
||
|
||
# ── Alertmanager: notification outlet for alerts ──
# PP-04: previously 11 alert rules were loaded in Prometheus but there was no
# Alertmanager, so firing alerts went unnoticed.
  alertmanager:
    container_name: hms-alertmanager
    image: prom/alertmanager:v0.27.0
    restart: unless-stopped
    # Reachable on the internal network only; no host port is published.
    expose:
      - "9093"
    networks:
      - hms-internal
    command:
      - '--config.file=/etc/alertmanager/config.yml'
      # State is written under the path backed by the named volume below.
      - '--storage.path=/alertmanager'
    volumes:
      - ./alertmanager/config.yml:/etc/alertmanager/config.yml:ro
      - alertmanager_data:/alertmanager
|
||
|
||
# ── Grafana dashboards ──
  grafana:
    image: grafana/grafana:11.4.0
    container_name: hms-grafana
    restart: unless-stopped
    environment:
      GF_SECURITY_ADMIN_USER: "${GRAFANA_ADMIN_USER:-admin}"
      # Required: `${VAR:?msg}` makes `docker compose` abort with this message
      # when the variable is unset or empty. The previous empty default would
      # leave a production Grafana on its built-in default admin credentials.
      GF_SECURITY_ADMIN_PASSWORD: "${GRAFANA_ADMIN_PASSWORD:?GRAFANA_ADMIN_PASSWORD must be set}"
      # Quoted so the value stays a string rather than a YAML boolean.
      GF_USERS_ALLOW_SIGN_UP: "false"
      GF_SERVER_ROOT_URL: "${GRAFANA_ROOT_URL:-http://localhost:3001}"
    volumes:
      - grafana_data:/var/lib/grafana
      # Provisioning files are mounted read-only from the repo.
      - ./grafana/provisioning:/etc/grafana/provisioning:ro
    # Reachable on the internal network only; no host port is published here.
    expose:
      - "3000"
    # Short form: orders startup after prometheus, without a health condition.
    depends_on:
      - prometheus
    networks:
      - hms-internal
|
||
|
||
# Named volumes declared by this file; all use the `local` driver.
volumes:
  # Application data
  app-uploads:
    driver: local
  nginx_logs:
    driver: local

  # Backup targets
  backup_data:
    driver: local
  uploads_backup_data:
    driver: local

  # Monitoring stack state
  prometheus_data:
    driver: local
  alertmanager_data:
    driver: local
  grafana_data:
    driver: local
|
||
|
||
# Single bridge network that every service in this file attaches to.
networks:
  hms-internal:
    driver: bridge
|