Files
hms/docker/docker-compose.production.yml
iven 6457c53d9c feat(docker): PP-04 可观测性 MVP — Alertmanager 告警出口 + Grafana provisioning
PP-04 核实属实:11 条告警规则在 prometheus 加载但无 alertmanager(告警
无通知出口),grafana provisioning 目录空,exporter 服务也未部署
("配置齐全运行为零")。

MVP 打通告警链路 + 让 grafana 可用(不依赖 exporter,基于 app metrics):
- docker-compose.production.yml 加 alertmanager 服务 + alertmanager_data 卷
- prometheus.yml 加 alerting 指向 alertmanager:9093
- alertmanager/config.yml 路由(SEV-1 critical 即时通知 + 分组)
- grafana/provisioning/datasources 自动连 prometheus
- grafana/provisioning/dashboards provider 就绪

待办(上线前):① alertmanager 占位 webhook 替换为真实渠道(钉钉/企微/邮件)
② 补 grafana dashboard JSON ③ 部署 postgres/redis/nginx exporter 让 prometheus 抓得到
2026-06-26 09:25:43 +08:00

193 lines
5.1 KiB
YAML
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# 生产环境 Docker Compose 配置
# 使用方式: docker compose -f docker/docker-compose.yml -f docker/docker-compose.production.yml up -d
services:
  # ── Nginx reverse proxy + TLS termination ──
  # No other service in this file has a `ports` mapping, so this is the only
  # container published on the host (80 and 443).
  nginx:
    container_name: hms-nginx
    image: nginx:1.27-alpine
    restart: unless-stopped
    networks:
      - hms-internal
    ports:
      - "80:80"
      - "443:443"
    volumes:
      # Site config and TLS material are bind-mounted read-only; logs go to a
      # named volume so they survive container re-creation.
      - ./nginx/nginx.conf:/etc/nginx/conf.d/default.conf:ro
      - ./nginx/ssl:/etc/nginx/ssl:ro
      - nginx_logs:/var/log/nginx
    # Wait until the app container reports healthy before starting the proxy.
    depends_on:
      app:
        condition: service_healthy
    healthcheck:
      # NOTE(review): the probe targets `localhost`; if it resolves to ::1 and
      # nginx.conf only has an IPv4 `listen`, the check fails. Confirm against
      # nginx.conf (not visible here).
      test:
        - "CMD"
        - "wget"
        - "--spider"
        - "-q"
        - "http://localhost:80"
      interval: 30s
      timeout: 5s
      retries: 3
    deploy:
      resources:
        limits:
          cpus: "0.5"
          memory: 128M
# ── HMS application server ──
  app:
    build:
      context: ..
      dockerfile: Dockerfile
    container_name: hms-server
    restart: unless-stopped
    # Reachable by other containers on hms-internal only; not published on the
    # host.
    expose:
      - "3000"
      - "9090"
    # `env_file` injects variables into the container. The ${...} references
    # below are resolved by Compose itself (shell environment, `.env`, or
    # `--env-file`), not from this file.
    env_file:
      - .env.production
    environment:
      # The passwords use `:?` so that `docker compose up` aborts with a clear
      # message when they are unset or empty, instead of silently building a
      # connection URL with a blank password.
      ERP__DATABASE__URL: "postgres://${POSTGRES_USER:-erp}:${POSTGRES_PASSWORD:?POSTGRES_PASSWORD must be set}@postgres:${POSTGRES_PORT:-5432}/${POSTGRES_DB:-erp}"
      ERP__REDIS__URL: "redis://:${REDIS_PASSWORD:?REDIS_PASSWORD must be set}@redis:${REDIS_PORT:-6379}"
    volumes:
      - app-uploads:/app/uploads
    # `postgres` and `redis` are not defined in this file; presumably they
    # come from the base docker-compose.yml this override is layered on.
    depends_on:
      postgres:
        condition: service_healthy
      redis:
        condition: service_healthy
    healthcheck:
      # NOTE(review): requires `curl` inside the image built from
      # ../Dockerfile — confirm it is installed.
      test: ["CMD", "curl", "-f", "http://localhost:3000/api/v1/health"]
      interval: 30s
      timeout: 5s
      # Failures during the first 60s do not count toward `retries`.
      start_period: 60s
      retries: 3
    deploy:
      resources:
        limits:
          cpus: "2"
          memory: 1024M
        reservations:
          cpus: "0.5"
          memory: 256M
    networks:
      - hms-internal
# ── Daily automated database backup (with encryption) ──
  backup:
    image: postgres:16-alpine
    container_name: hms-backup
    restart: unless-stopped
    # List-form entrypoint so the script reaches `sh -c` without an extra
    # layer of quoting. The previous string form wrapped `$$BACKUP_CRON` in
    # single quotes inside the script, so the shell never expanded it and the
    # crontab contained the literal text "$BACKUP_CRON ..." — the job could
    # never be scheduled.
    # `$$` is the Compose escape for a literal `$`, deferring expansion to the
    # container shell at start-up.
    entrypoint:
      - "sh"
      - "-c"
      - |
        set -eu
        # Double quotes expand the schedule but keep its `*` fields from
        # being globbed.
        printf '%s %s\n' "$$BACKUP_CRON" /usr/local/bin/backup.sh > /etc/crontabs/root
        # exec so crond replaces the shell and receives stop signals directly.
        exec crond -f -l 2
    environment:
      PGHOST: postgres
      PGPORT: "${POSTGRES_PORT:-5432}"
      PGUSER: "${POSTGRES_USER:-erp}"
      PGDATABASE: "${POSTGRES_DB:-erp}"
      # NOTE(review): no PGPASSWORD is set in this file — presumably supplied
      # by the base compose file or by backup.sh; confirm.
      BACKUP_DIR: /backups
      KEEP_DAYS: "${BACKUP_KEEP_DAYS:-7}"
      # Cron schedule; defaults to 02:00 every day.
      BACKUP_CRON: "${BACKUP_CRON:-0 2 * * *}"
      # NOTE(review): defaults to empty; how backup.sh treats an empty
      # passphrase (skip encryption vs. fail) is not visible here — confirm.
      BACKUP_PASSPHRASE: "${BACKUP_PASSPHRASE:-}"
    volumes:
      - ./backup.sh:/usr/local/bin/backup.sh:ro
      - backup_data:/backups
    depends_on:
      postgres:
        condition: service_healthy
    networks:
      - hms-internal
# ── Uploads file backup (mirrored into a separate volume) ──
  uploads-backup:
    image: alpine:3.20
    container_name: hms-uploads-backup
    restart: unless-stopped
    # List-form entrypoint so the script reaches `sh -c` without an extra
    # layer of quoting. The previous string form put `$$UPLOADS_BACKUP_CRON`
    # inside single quotes, so the crontab received the literal variable name
    # instead of the schedule. `$$` is the Compose escape for a literal `$`.
    entrypoint:
      - "sh"
      - "-c"
      - |
        set -eu
        # The stock alpine image does not ship rsync; install it once at
        # start-up. This needs package-mirror access — for air-gapped hosts,
        # bake rsync into a custom image instead.
        command -v rsync >/dev/null 2>&1 || apk add --no-cache rsync
        printf '%s %s\n' "$$UPLOADS_BACKUP_CRON" 'rsync -a --delete /source/uploads/ /backup/uploads/' > /etc/crontabs/root
        # exec so crond replaces the shell and receives stop signals directly.
        exec crond -f -l 2
    environment:
      # Cron schedule; defaults to 03:00 every day.
      UPLOADS_BACKUP_CRON: "${UPLOADS_BACKUP_CRON:-0 3 * * *}"
    volumes:
      # The source is mounted read-only. `--delete` makes the target an exact
      # mirror, so files removed from uploads are also removed from the copy
      # on the next run.
      - app-uploads:/source/uploads:ro
      - uploads_backup_data:/backup/uploads
    networks:
      - hms-internal
# ── Prometheus monitoring ──
  prometheus:
    container_name: hms-prometheus
    image: prom/prometheus:v3.1.0
    restart: unless-stopped
    networks:
      - hms-internal
    # Reachable by other containers on hms-internal only; not published on the
    # host.
    expose:
      - "9090"
    volumes:
      # Scrape config and alert rules are bind-mounted read-only; the TSDB
      # lives in a named volume.
      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - ./prometheus/alerts.yml:/etc/prometheus/alerts.yml:ro
      - prometheus_data:/prometheus
    command:
      - "--config.file=/etc/prometheus/prometheus.yml"
      # Retention is bounded by both age and size.
      - "--storage.tsdb.retention.time=30d"
      - "--storage.tsdb.retention.size=2GB"
      # Enables the HTTP reload/quit endpoints (/-/reload, /-/quit).
      - "--web.enable-lifecycle"
# ── Alertmanager: notification outlet for alerts ──
  # PP-04: previously the 11 alert rules were loaded by Prometheus but there
  # was no Alertmanager, so firing alerts went unnoticed.
  alertmanager:
    container_name: hms-alertmanager
    image: prom/alertmanager:v0.27.0
    restart: unless-stopped
    networks:
      - hms-internal
    # Reachable by other containers on hms-internal only; not published on the
    # host.
    expose:
      - "9093"
    command:
      - "--config.file=/etc/alertmanager/config.yml"
      - "--storage.path=/alertmanager"
    volumes:
      # Routing config is bind-mounted read-only; the storage path is backed
      # by a named volume.
      - ./alertmanager/config.yml:/etc/alertmanager/config.yml:ro
      - alertmanager_data:/alertmanager
# ── Grafana dashboards ──
  grafana:
    image: grafana/grafana:11.4.0
    container_name: hms-grafana
    restart: unless-stopped
    environment:
      GF_SECURITY_ADMIN_USER: "${GRAFANA_ADMIN_USER:-admin}"
      # Required: Grafana ignores an empty override and falls back to its
      # built-in default admin password, so the previous `:-` empty default
      # left production on default credentials. `:?` makes
      # `docker compose up` abort when the variable is unset or empty.
      GF_SECURITY_ADMIN_PASSWORD: "${GRAFANA_ADMIN_PASSWORD:?GRAFANA_ADMIN_PASSWORD must be set}"
      GF_USERS_ALLOW_SIGN_UP: "false"
      # NOTE(review): the default points at host port 3001, which nothing in
      # this file publishes — confirm how Grafana is reached (nginx route or
      # base compose file) and set GRAFANA_ROOT_URL accordingly.
      GF_SERVER_ROOT_URL: "${GRAFANA_ROOT_URL:-http://localhost:3001}"
    volumes:
      - grafana_data:/var/lib/grafana
      # Datasource and dashboard provisioning, mounted read-only.
      - ./grafana/provisioning:/etc/grafana/provisioning:ro
    # Reachable by other containers on hms-internal only; not published on the
    # host.
    expose:
      - "3000"
    # Short form: orders start-up only, does not wait for Prometheus to be
    # healthy.
    depends_on:
      - prometheus
    networks:
      - hms-internal
volumes:
  # Volume names are kept exactly as-is (including the kebab-case
  # `app-uploads`): renaming a volume would make Compose create a new, empty
  # one.

  # Application data
  app-uploads:
    driver: local
  nginx_logs:
    driver: local

  # Backups
  backup_data:
    driver: local
  uploads_backup_data:
    driver: local

  # Observability stack
  prometheus_data:
    driver: local
  alertmanager_data:
    driver: local
  grafana_data:
    driver: local
networks:
  # Single bridge network that every service in this file attaches to.
  hms-internal:
    driver: bridge