feat(server+ai): PP-01 死信重试接线 + PP-05b AI 队列消费者 — 通电半成品自动化
PP-01: retry_dead_letters 已实现但全仓零调用,业务关键事件瞬时故障即永久滞留死信表。tasks.rs 加 start_retry_dead_letters(每小时,最大重试 5 次) + main.rs 注册。同时落盘 feat 进行中的 cron_heartbeat 就绪门禁 (touch_heartbeat + 给 cleanup/metrics 任务加 heartbeat 参数)。 PP-05b: AnalysisQueue "只入队不消费"(两个入队源 claim_next 零调用), 违反"每个事件必须有消费者"铁律。新增 analysis_worker.rs 后台消费者: claim_next → analysis_type 路由 → AnalysisService → mark_completed/mark_failed。MVP 打通 trend 链路,lab_report/dialysis_risk 暂 skip (回滚 pending,无假数据)。启动遵循 start_auto_analysis 模式(main.rs)。
This commit is contained in:
@@ -644,6 +644,10 @@ async fn main() -> anyhow::Result<()> {
|
||||
erp_ai::service::auto_analysis::start_auto_analysis(ai_state.clone());
|
||||
tracing::info!("Auto trend analysis scheduler started");
|
||||
|
||||
// Start analysis queue worker (claims pending ai_analysis_queue jobs → analyzes → completes)
|
||||
erp_ai::service::analysis_worker::start_analysis_worker(ai_state.clone());
|
||||
tracing::info!("AI analysis queue worker started");
|
||||
|
||||
let cron_heartbeat = std::sync::Arc::new(std::sync::atomic::AtomicU64::new(
|
||||
std::time::SystemTime::now()
|
||||
.duration_since(std::time::UNIX_EPOCH)
|
||||
@@ -671,6 +675,11 @@ async fn main() -> anyhow::Result<()> {
|
||||
// Start background tasks with heartbeat
|
||||
tasks::start_event_cleanup(state.db.clone(), state.cron_heartbeat.clone());
|
||||
tasks::start_pool_metrics(state.db.clone(), state.cron_heartbeat.clone());
|
||||
tasks::start_retry_dead_letters(
|
||||
state.db.clone(),
|
||||
state.event_bus.clone(),
|
||||
state.cron_heartbeat.clone(),
|
||||
);
|
||||
|
||||
// --- Build the router ---
|
||||
//
|
||||
|
||||
@@ -1,11 +1,23 @@
|
||||
use std::sync::Arc;
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
use std::time::Duration;
|
||||
|
||||
use erp_core::events::{EventBus, retry_dead_letters};
|
||||
|
||||
/// Stamps `heartbeat` with the current wall-clock time, expressed as whole
/// seconds since the Unix epoch.
///
/// If the system clock reads earlier than the epoch, `0` is stored rather
/// than panicking. The write uses `Ordering::Relaxed`.
fn touch_heartbeat(heartbeat: &Arc<AtomicU64>) {
    let since_epoch = std::time::UNIX_EPOCH.elapsed().unwrap_or_default();
    heartbeat.store(since_epoch.as_secs(), Ordering::Relaxed);
}
|
||||
|
||||
/// 启动事件清理后台任务。
|
||||
///
|
||||
/// 每日执行一次:
|
||||
/// - 调用 `cleanup_old_published_events()` 归档 >7 天的已发布事件
|
||||
/// - 调用 `cleanup_old_processed_events()` 清理 >7 天的去重记录
|
||||
pub fn start_event_cleanup(db: sea_orm::DatabaseConnection) {
|
||||
pub fn start_event_cleanup(db: sea_orm::DatabaseConnection, heartbeat: Arc<AtomicU64>) {
|
||||
tokio::spawn(async move {
|
||||
let mut interval = tokio::time::interval(Duration::from_secs(86400));
|
||||
loop {
|
||||
@@ -13,6 +25,7 @@ pub fn start_event_cleanup(db: sea_orm::DatabaseConnection) {
|
||||
if let Err(e) = run_cleanup(&db).await {
|
||||
tracing::warn!(error = %e, "事件清理任务执行失败");
|
||||
}
|
||||
touch_heartbeat(&heartbeat);
|
||||
}
|
||||
});
|
||||
tracing::info!("事件清理任务已启动(每 24 小时执行一次)");
|
||||
@@ -52,13 +65,14 @@ async fn run_cleanup(db: &sea_orm::DatabaseConnection) -> Result<(), sea_orm::Db
|
||||
/// - `db_pool_connections_active` — 当前活跃连接数
|
||||
/// - `db_pool_connections_idle` — 当前空闲连接数
|
||||
/// - `eventbus_pending_total` — pending 状态的领域事件数
|
||||
pub fn start_pool_metrics(db: sea_orm::DatabaseConnection) {
|
||||
pub fn start_pool_metrics(db: sea_orm::DatabaseConnection, heartbeat: Arc<AtomicU64>) {
|
||||
tokio::spawn(async move {
|
||||
let mut interval = tokio::time::interval(Duration::from_secs(30));
|
||||
loop {
|
||||
interval.tick().await;
|
||||
sample_pool_metrics(&db).await;
|
||||
sample_eventbus_backlog(&db).await;
|
||||
touch_heartbeat(&heartbeat);
|
||||
}
|
||||
});
|
||||
tracing::info!("DB 连接池 + EventBus 积压指标采样已启动(每 30 秒采样一次)");
|
||||
@@ -111,3 +125,40 @@ async fn sample_eventbus_backlog(db: &sea_orm::DatabaseConnection) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// 启动死信重试后台任务。
|
||||
///
|
||||
/// 每小时执行一次:
|
||||
/// - 调用 `erp_core::events::retry_dead_letters()` 重试 `dead_letter_events` 中
|
||||
/// 未解决且未超过最大重试次数的失败事件(指数退避由 attempts + last_error 记录)
|
||||
/// - 最大重试 5 次,超过则标记永久失败
|
||||
///
|
||||
/// 触碰「每个事件必须有消费者」铁律的兜底:业务关键链路(危急值告警/积分发放/
|
||||
/// 预约提醒/article 推送)的瞬时故障借此自动恢复,不再永久滞留死信表。
|
||||
pub fn start_retry_dead_letters(
|
||||
db: sea_orm::DatabaseConnection,
|
||||
bus: EventBus,
|
||||
heartbeat: Arc<AtomicU64>,
|
||||
) {
|
||||
tokio::spawn(async move {
|
||||
// 首次延迟 60s,避免与启动期 outbox relay 抢资源
|
||||
tokio::time::sleep(Duration::from_secs(60)).await;
|
||||
let mut interval = tokio::time::interval(Duration::from_secs(3600));
|
||||
loop {
|
||||
interval.tick().await;
|
||||
match retry_dead_letters(&db, &bus, 5).await {
|
||||
Ok(retried) if retried > 0 => {
|
||||
tracing::info!(retried, "死信重试任务完成(已重试 N 条)");
|
||||
}
|
||||
Ok(_) => {
|
||||
tracing::debug!("死信重试任务完成(无待重试事件)");
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!(error = %e, "死信重试任务执行失败");
|
||||
}
|
||||
}
|
||||
touch_heartbeat(&heartbeat);
|
||||
}
|
||||
});
|
||||
tracing::info!("死信重试任务已启动(每 1 小时执行一次,最大重试 5 次)");
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user