feat(server): 可观测性 Phase 1 — 健康检查路由 + Prometheus 指标 + 连接池/事件积压监控
Some checks failed
CI / rust-check (push) Has been cancelled
CI / rust-test (push) Has been cancelled
CI / frontend-build (push) Has been cancelled
CI / security-audit (push) Has been cancelled

- 添加 /health/live 存活探针别名(原 /health + /health/ready 保留)
- 新增 metrics middleware:http_requests_total 计数器 + http_request_duration_seconds 直方图
- Prometheus exporter 独立端口 9090(可通过 ERP__SERVER__METRICS_PORT 覆盖)
- 后台任务每 30s 采样 DB 连接池活跃/空闲连接数(pg_stat_activity)
- 后台任务每 30s 采样 EventBus pending 事件积压数
- UUID 路径归一化避免高基数(/api/v1/users/:id/posts)
This commit is contained in:
iven
2026-04-28 20:39:11 +08:00
parent f99892ee16
commit 5ab8bf8479
9 changed files with 402 additions and 17 deletions

209
Cargo.lock generated
View File

@@ -288,6 +288,28 @@ version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
[[package]]
name = "aws-lc-rs"
version = "1.16.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0ec6fb3fe69024a75fa7e1bfb48aa6cf59706a101658ea01bfd33b2b248a038f"
dependencies = [
"aws-lc-sys",
"zeroize",
]
[[package]]
name = "aws-lc-sys"
version = "0.40.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f50037ee5e1e41e7b8f9d161680a725bd1626cb6f8c7e901f91f942850852fe7"
dependencies = [
"cc",
"cmake",
"dunce",
"fs_extra",
]
[[package]]
name = "axum"
version = "0.8.8"
@@ -555,7 +577,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d8144c22e24bbcf26ade86cb6501a0916c46b7e4787abdb0045a467eb1645a1d"
dependencies = [
"ambient-authority",
"rand",
"rand 0.8.5",
]
[[package]]
@@ -681,6 +703,15 @@ version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9"
[[package]]
name = "cmake"
version = "0.1.58"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c0f78a02292a74a88ac736019ab962ece0bc380e3f977bf72e376c5d78ff0678"
dependencies = [
"cc",
]
[[package]]
name = "cobs"
version = "0.3.0"
@@ -1056,7 +1087,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a"
dependencies = [
"generic-array",
"rand_core",
"rand_core 0.6.4",
"typenum",
]
@@ -1330,6 +1361,12 @@ dependencies = [
"dtoa",
]
[[package]]
name = "dunce"
version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813"
[[package]]
name = "either"
version = "1.15.0"
@@ -1453,7 +1490,7 @@ dependencies = [
"dashmap",
"hex",
"hmac",
"rand",
"rand 0.8.5",
"sea-orm",
"serde",
"serde_json",
@@ -1663,6 +1700,8 @@ dependencies = [
"erp-points",
"erp-server-migration",
"erp-workflow",
"metrics",
"metrics-exporter-prometheus",
"moka",
"redis",
"sea-orm",
@@ -1842,6 +1881,12 @@ dependencies = [
"windows-sys 0.52.0",
]
[[package]]
name = "fs_extra"
version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c"
[[package]]
name = "funty"
version = "2.0.0"
@@ -2286,6 +2331,7 @@ dependencies = [
"hyper",
"hyper-util",
"rustls",
"rustls-native-certs",
"tokio",
"tokio-rustls",
"tower-service",
@@ -2478,8 +2524,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "af1955a75fa080c677d3972822ec4bad316169ab1cfc6c257a942c2265dbe5fe"
dependencies = [
"bitmaps",
"rand_core",
"rand_xoshiro",
"rand_core 0.6.4",
"rand_xoshiro 0.6.0",
"sized-chunks",
"typenum",
"version_check",
@@ -2856,6 +2902,53 @@ dependencies = [
"autocfg",
]
[[package]]
name = "metrics"
version = "0.24.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5d5312e9ba3771cfa961b585728215e3d972c950a3eed9252aa093d6301277e8"
dependencies = [
"ahash 0.8.12",
"portable-atomic",
]
[[package]]
name = "metrics-exporter-prometheus"
version = "0.16.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dd7399781913e5393588a8d8c6a2867bf85fb38eaf2502fdce465aad2dc6f034"
dependencies = [
"base64 0.22.1",
"http-body-util",
"hyper",
"hyper-rustls",
"hyper-util",
"indexmap",
"ipnet",
"metrics",
"metrics-util",
"quanta",
"thiserror 1.0.69",
"tokio",
"tracing",
]
[[package]]
name = "metrics-util"
version = "0.19.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b8496cc523d1f94c1385dd8f0f0c2c480b2b8aeccb5b7e4485ad6365523ae376"
dependencies = [
"crossbeam-epoch",
"crossbeam-utils",
"hashbrown 0.15.5",
"metrics",
"quanta",
"rand 0.9.4",
"rand_xoshiro 0.7.0",
"sketches-ddsketch",
]
[[package]]
name = "mime"
version = "0.3.17"
@@ -3009,7 +3102,7 @@ dependencies = [
"num-integer",
"num-iter",
"num-traits",
"rand",
"rand 0.8.5",
"smallvec",
"zeroize",
]
@@ -3218,7 +3311,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "346f04948ba92c43e8469c1ee6736c7563d71012b17d40745260fe106aac2166"
dependencies = [
"base64ct",
"rand_core",
"rand_core 0.6.4",
"subtle",
]
@@ -3342,7 +3435,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d"
dependencies = [
"phf_shared",
"rand",
"rand 0.8.5",
]
[[package]]
@@ -3572,6 +3665,21 @@ dependencies = [
"syn 2.0.117",
]
[[package]]
name = "quanta"
version = "0.12.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f3ab5a9d756f0d97bdc89019bd2e4ea098cf9cde50ee7564dde6b81ccc8f06c7"
dependencies = [
"crossbeam-utils",
"libc",
"once_cell",
"raw-cpuid",
"wasi",
"web-sys",
"winapi",
]
[[package]]
name = "quote"
version = "1.0.45"
@@ -3606,8 +3714,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
dependencies = [
"libc",
"rand_chacha",
"rand_core",
"rand_chacha 0.3.1",
"rand_core 0.6.4",
]
[[package]]
name = "rand"
version = "0.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "44c5af06bb1b7d3216d91932aed5265164bf384dc89cd6ba05cf59a35f5f76ea"
dependencies = [
"rand_chacha 0.9.0",
"rand_core 0.9.5",
]
[[package]]
@@ -3617,7 +3735,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
dependencies = [
"ppv-lite86",
"rand_core",
"rand_core 0.6.4",
]
[[package]]
name = "rand_chacha"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb"
dependencies = [
"ppv-lite86",
"rand_core 0.9.5",
]
[[package]]
@@ -3629,13 +3757,40 @@ dependencies = [
"getrandom 0.2.17",
]
[[package]]
name = "rand_core"
version = "0.9.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c"
dependencies = [
"getrandom 0.3.4",
]
[[package]]
name = "rand_xoshiro"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6f97cdb2a36ed4183de61b2f824cc45c9f1037f28afe0a322e9fff4c108b5aaa"
dependencies = [
"rand_core",
"rand_core 0.6.4",
]
[[package]]
name = "rand_xoshiro"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f703f4665700daf5512dcca5f43afa6af89f09db47fb56be587f80636bda2d41"
dependencies = [
"rand_core 0.9.5",
]
[[package]]
name = "raw-cpuid"
version = "11.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "498cd0dc59d73224351ee52a95fee0f1a617a2eae0e7d9d720cc622c73a54186"
dependencies = [
"bitflags",
]
[[package]]
@@ -3876,7 +4031,7 @@ dependencies = [
"num-traits",
"pkcs1",
"pkcs8",
"rand_core",
"rand_core 0.6.4",
"signature",
"spki",
"subtle",
@@ -3903,7 +4058,7 @@ dependencies = [
"borsh",
"bytes",
"num-traits",
"rand",
"rand 0.8.5",
"rkyv",
"serde",
"serde_json",
@@ -3982,6 +4137,7 @@ version = "0.23.37"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "758025cb5fccfd3bc2fd74708fd4682be41d99e5dff73c377c0646c6012c73a4"
dependencies = [
"aws-lc-rs",
"once_cell",
"ring",
"rustls-pki-types",
@@ -3990,6 +4146,18 @@ dependencies = [
"zeroize",
]
[[package]]
name = "rustls-native-certs"
version = "0.8.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "612460d5f7bea540c490b2b6395d8e34a953e52b491accd6c86c8164c5932a63"
dependencies = [
"openssl-probe",
"rustls-pki-types",
"schannel",
"security-framework",
]
[[package]]
name = "rustls-pki-types"
version = "1.14.0"
@@ -4005,6 +4173,7 @@ version = "0.103.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "20a6af516fea4b20eccceaf166e8aa666ac996208e8a644ce3ef5aa783bc7cd4"
dependencies = [
"aws-lc-rs",
"ring",
"rustls-pki-types",
"untrusted",
@@ -4398,7 +4567,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77549399552de45a898a580c1b41d445bf730df867cc44e6c0233bbc4b8329de"
dependencies = [
"digest",
"rand_core",
"rand_core 0.6.4",
]
[[package]]
@@ -4441,6 +4610,12 @@ dependencies = [
"typenum",
]
[[package]]
name = "sketches-ddsketch"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0c6f73aeb92d671e0cc4dca167e59b2deb6387c375391bc99ee743f326994a2b"
[[package]]
name = "slab"
version = "0.4.12"
@@ -4618,7 +4793,7 @@ dependencies = [
"memchr",
"once_cell",
"percent-encoding",
"rand",
"rand 0.8.5",
"rsa",
"rust_decimal",
"serde",
@@ -4662,7 +4837,7 @@ dependencies = [
"memchr",
"num-bigint",
"once_cell",
"rand",
"rand 0.8.5",
"rust_decimal",
"serde",
"serde_json",

View File

@@ -119,3 +119,7 @@ handlebars = "6"
# HTML sanitization
ammonia = "4"
# Metrics
metrics = "0.24"
metrics-exporter-prometheus = "0.16"

View File

@@ -36,6 +36,8 @@ anyhow.workspace = true
uuid.workspace = true
chrono.workspace = true
moka = { version = "0.12", features = ["sync"] }
metrics.workspace = true
metrics-exporter-prometheus.workspace = true
[dev-dependencies]
erp-auth = { workspace = true }

View File

@@ -20,6 +20,12 @@ pub struct AppConfig {
/// Server-related configuration values.
pub struct ServerConfig {
/// Host part of the server's listen address.
/// NOTE(review): presumably the bind address of the main HTTP listener — confirm against the caller.
pub host: String,
/// Port of the server.
/// NOTE(review): presumably the main HTTP listener port — confirm against the caller.
pub port: u16,
/// Port on which the separate Prometheus metrics endpoint listens.
/// When the configuration source omits this field, serde fills it in by
/// calling `default_metrics_port`.
#[serde(default = "default_metrics_port")]
pub metrics_port: u16,
}
/// Serde fallback for `ServerConfig::metrics_port`: the port used when the
/// configuration does not specify one.
fn default_metrics_port() -> u16 {
    const FALLBACK_METRICS_PORT: u16 = 9090;
    FALLBACK_METRICS_PORT
}
#[derive(Debug, Clone, Deserialize)]

View File

@@ -130,5 +130,6 @@ async fn check_redis(client: &redis::Client) -> ComponentStatus {
/// Builds the router exposing the health endpoints.
///
/// - `/health` and `/health/live` are both served by `health_check`.
/// - `/health/ready` is served by `readiness_check`.
pub fn health_check_router() -> Router<AppState> {
    let liveness_routes = Router::new()
        .route("/health", get(health_check))
        .route("/health/live", get(health_check));

    liveness_routes.route("/health/ready", get(readiness_check))
}

View File

@@ -432,6 +432,9 @@ async fn main() -> anyhow::Result<()> {
// Start event cleanup (archive old published events + purge processed_events)
tasks::start_event_cleanup(db.clone());
// Start DB connection pool metrics sampling (every 30s)
tasks::start_pool_metrics(db.clone());
// Start timeout checker (scan overdue tasks every 60s)
erp_workflow::WorkflowModule::start_timeout_checker(db.clone(), event_bus.clone());
tracing::info!("Timeout checker started");
@@ -611,8 +614,13 @@ async fn main() -> anyhow::Result<()> {
let app = Router::new()
.nest("/api/v1", unthrottled_routes.merge(public_routes).merge(protected_routes))
.nest("/uploads", uploads_router)
.layer(axum::middleware::from_fn(middleware::metrics::metrics_middleware))
.layer(cors);
// Start Prometheus metrics exporter on a separate port
let metrics_port = state.config.server.metrics_port;
middleware::metrics::start_metrics_server(metrics_port);
let addr = format!("{}:{}", host, port);
let listener = tokio::net::TcpListener::bind(&addr).await?;
tracing::info!(addr = %addr, "Server listening");

View File

@@ -0,0 +1,122 @@
use axum::extract::Request;
use axum::http::Method;
use axum::middleware::Next;
use axum::response::{IntoResponse, Response};
use metrics::{counter, histogram};
use std::time::Instant;
/// Axum middleware that records Prometheus metrics for each HTTP request.
///
/// Two metrics are emitted, both labelled with `method`, `path` and `status`:
/// - `http_requests_total` — counter, incremented once per request
/// - `http_request_duration_seconds` — histogram of the time spent awaiting
///   the rest of the stack (`next.run`), in seconds
///
/// NOTE(review): the `path` label comes from `path_label`, which only collapses
/// UUID-shaped segments; any other dynamic segment is kept verbatim — confirm
/// the resulting label cardinality is acceptable.
pub async fn metrics_middleware(req: Request, next: Next) -> Response {
    // Request-derived labels must be captured before `req` is moved into `next.run`.
    let method_name = method_label(req.method());
    let route = path_label(req.uri().path());

    let timer = Instant::now();
    let response = next.run(req).await;
    let seconds = timer.elapsed().as_secs_f64();

    let status_code = response.status().as_u16().to_string();
    let labels = [
        ("method", method_name),
        ("path", route),
        ("status", status_code),
    ];

    counter!("http_requests_total", &labels).increment(1);
    histogram!("http_request_duration_seconds", &labels).record(seconds);

    response
}
/// Renders the HTTP method as an owned string for use as a metric label value.
fn method_label(method: &Method) -> String {
    String::from(method.as_str())
}
/// Normalizes a request path for use as a metric label.
///
/// Empty segments (leading, trailing or doubled slashes) are dropped and every
/// segment accepted by `looks_like_uuid` is replaced with `:id`, so that
/// per-entity URLs do not each produce their own label value. A path with no
/// non-empty segment yields `/`.
fn path_label(path: &str) -> String {
    let mut label = String::with_capacity(path.len() + 1);

    for segment in path.split('/') {
        if segment.is_empty() {
            continue;
        }
        label.push('/');
        label.push_str(if looks_like_uuid(segment) { ":id" } else { segment });
    }

    if label.is_empty() {
        label.push('/');
    }
    label
}
/// Returns `true` when `s` has the canonical hyphenated UUID layout
/// `xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx`.
///
/// The string must be exactly 36 bytes long, carry a hyphen at byte offsets
/// 8, 13, 18 and 23, and contain ASCII hexadecimal digits (either case)
/// everywhere else. Checking the hyphen positions — not just their count —
/// keeps arbitrary 36-character hex/hyphen strings from being treated as IDs.
fn looks_like_uuid(s: &str) -> bool {
    s.len() == 36
        && s.bytes().enumerate().all(|(idx, byte)| match idx {
            8 | 13 | 18 | 23 => byte == b'-',
            _ => byte.is_ascii_hexdigit(),
        })
}
/// 在独立端口启动 Prometheus exporter。
pub fn start_metrics_server(port: u16) {
let builder = metrics_exporter_prometheus::PrometheusBuilder::new();
let recorder = builder.build_recorder();
let handle = recorder.handle();
if let Err(e) = metrics::set_global_recorder(recorder) {
tracing::error!(error = %e, "Failed to install Prometheus recorder");
return;
}
tokio::spawn(async move {
let app = axum::Router::new()
.route(
"/metrics",
axum::routing::get(move || {
let handle = handle.clone();
async move {
let body = handle.render();
axum::response::IntoResponse::into_response(
([(axum::http::header::CONTENT_TYPE, "text/plain; version=0.0.4")], body),
)
}
}),
)
.fallback(|| async { axum::http::StatusCode::NOT_FOUND.into_response() as Response });
let addr = format!("0.0.0.0:{port}");
match tokio::net::TcpListener::bind(&addr).await {
Ok(listener) => {
tracing::info!(addr = %addr, "Prometheus metrics server listening");
if let Err(e) = axum::serve(listener, app).await {
tracing::error!(error = %e, "Metrics server error");
}
}
Err(e) => {
tracing::error!(error = %e, addr = %addr, "Failed to bind metrics server");
}
}
});
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Static paths pass through unchanged, UUID segments collapse to `:id`,
    /// and root / empty input both normalize to `/`.
    #[test]
    fn path_label_normalizes_uuids() {
        let cases = [
            ("/api/v1/users", "/api/v1/users"),
            (
                "/api/v1/users/01234567-89ab-cdef-0123-456789abcdef/posts",
                "/api/v1/users/:id/posts",
            ),
            ("/", "/"),
            ("", "/"),
        ];
        for (input, expected) in cases {
            assert_eq!(path_label(input), expected);
        }
    }

    /// A canonical UUID is accepted; strings of the wrong length are rejected.
    #[test]
    fn is_uuid_checks_format() {
        assert!(looks_like_uuid("01234567-89ab-cdef-0123-456789abcdef"));
        for rejected in ["not-a-uuid", "short"] {
            assert!(!looks_like_uuid(rejected));
        }
    }
}

View File

@@ -1,2 +1,3 @@
pub mod metrics;
pub mod rate_limit;
pub mod tenant_rls;

View File

@@ -51,3 +51,69 @@ async fn run_cleanup(db: &sea_orm::DatabaseConnection) -> Result<(), sea_orm::Db
Ok(())
}
/// Spawns the background task that samples database-connection and EventBus
/// backlog metrics.
///
/// Every 30 seconds the task runs `sample_pool_metrics` followed by
/// `sample_eventbus_backlog`, which export these Prometheus gauges:
/// - `db_pool_connections_active` — current number of active connections
/// - `db_pool_connections_idle` — current number of idle connections
/// - `eventbus_pending_total` — number of domain events in `pending` status
///
/// The task runs until the runtime shuts down; its join handle is not kept.
pub fn start_pool_metrics(db: sea_orm::DatabaseConnection) {
    tokio::spawn(async move {
        let mut interval = tokio::time::interval(Duration::from_secs(30));
        // Each sample awaits database queries. If one stalls for longer than a
        // period, skip the missed ticks instead of firing them back-to-back
        // (the default burst behaviour) once the database recovers.
        interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);
        loop {
            interval.tick().await;
            sample_pool_metrics(&db).await;
            sample_eventbus_backlog(&db).await;
        }
    });
    tracing::info!("DB 连接池 + EventBus 积压指标采样已启动(每 30 秒采样一次)");
}
async fn sample_pool_metrics(db: &sea_orm::DatabaseConnection) {
use sea_orm::FromQueryResult;
#[derive(FromQueryResult)]
struct CountRow {
cnt: i64,
}
// 通过 pg_stat_activity 查询当前连接数
let stmt = sea_orm::Statement::from_string(
sea_orm::DatabaseBackend::Postgres,
"SELECT COUNT(*)::bigint AS cnt FROM pg_stat_activity WHERE state = 'active'".to_string(),
);
if let Ok(Some(row)) = CountRow::find_by_statement(stmt).one(db).await {
metrics::gauge!("db_pool_connections_active").set(row.cnt as f64);
}
let stmt = sea_orm::Statement::from_string(
sea_orm::DatabaseBackend::Postgres,
"SELECT COUNT(*)::bigint AS cnt FROM pg_stat_activity WHERE state = 'idle'".to_string(),
);
if let Ok(Some(row)) = CountRow::find_by_statement(stmt).one(db).await {
metrics::gauge!("db_pool_connections_idle").set(row.cnt as f64);
}
}
async fn sample_eventbus_backlog(db: &sea_orm::DatabaseConnection) {
use sea_orm::FromQueryResult;
#[derive(FromQueryResult)]
struct CountRow {
cnt: i64,
}
let stmt = sea_orm::Statement::from_string(
sea_orm::DatabaseBackend::Postgres,
"SELECT COUNT(*)::bigint AS cnt FROM domain_events WHERE status = 'pending'".to_string(),
);
match CountRow::find_by_statement(stmt).one(db).await {
Ok(Some(row)) => {
metrics::gauge!("eventbus_pending_total").set(row.cnt as f64);
}
_ => {
tracing::debug!("EventBus 积压采样:无法获取 pending 事件数");
}
}
}