fix: 发布前审计 Batch 1 — Pipeline 内存泄漏/超时 + Director 死锁 + Rate Limit Worker
Pipeline executor:
- 添加 cleanup() 方法,MAX_COMPLETED_RUNS=100 上限淘汰旧记录
- 每步执行添加 tokio::time::timeout(使用 PipelineSpec.timeout_secs,默认 300s)
- Delay ms 上限 60000,超出 warn 并截断

Director send_to_agent:
- 重构为 oneshot::channel 响应模式,避免 inbox + pending_requests 锁竞争
- 添加 ensure_inbox_reader() 独立任务分发响应到对应 oneshot sender

cleanup_rate_limit Worker:
- 实现 Worker body: DELETE FROM rate_limit_events WHERE created_at < NOW() - INTERVAL '1 hour'

651 tests passed, 0 failed
This commit is contained in:
@@ -40,6 +40,15 @@ pub enum ExecuteError {
|
||||
Io(#[from] std::io::Error),
|
||||
}
|
||||
|
||||
/// Maximum completed/failed/cancelled runs to keep in memory
|
||||
const MAX_COMPLETED_RUNS: usize = 100;
|
||||
|
||||
/// Maximum allowed delay in milliseconds (60 seconds)
|
||||
const MAX_DELAY_MS: u64 = 60_000;
|
||||
|
||||
/// Default per-step timeout (5 minutes)
|
||||
const DEFAULT_STEP_TIMEOUT_SECS: u64 = 300;
|
||||
|
||||
/// Pipeline executor
|
||||
pub struct PipelineExecutor {
|
||||
/// Action registry
|
||||
@@ -107,35 +116,50 @@ impl PipelineExecutor {
|
||||
// Create execution context
|
||||
let mut context = ExecutionContext::new(inputs);
|
||||
|
||||
// Determine per-step timeout from pipeline spec (0 means use default)
|
||||
let step_timeout = if pipeline.spec.timeout_secs > 0 {
|
||||
pipeline.spec.timeout_secs
|
||||
} else {
|
||||
DEFAULT_STEP_TIMEOUT_SECS
|
||||
};
|
||||
|
||||
// Execute steps
|
||||
let result = self.execute_steps(pipeline, &mut context, &run_id).await;
|
||||
let result = self.execute_steps(pipeline, &mut context, &run_id, step_timeout).await;
|
||||
|
||||
// Update run state
|
||||
let mut runs = self.runs.write().await;
|
||||
if let Some(run) = runs.get_mut(&run_id) {
|
||||
match result {
|
||||
Ok(outputs) => {
|
||||
run.status = RunStatus::Completed;
|
||||
run.outputs = Some(serde_json::to_value(&outputs).unwrap_or(Value::Null));
|
||||
}
|
||||
Err(e) => {
|
||||
run.status = RunStatus::Failed;
|
||||
run.error = Some(e.to_string());
|
||||
let return_value = {
|
||||
let mut runs = self.runs.write().await;
|
||||
if let Some(run) = runs.get_mut(&run_id) {
|
||||
match result {
|
||||
Ok(outputs) => {
|
||||
run.status = RunStatus::Completed;
|
||||
run.outputs = Some(serde_json::to_value(&outputs).unwrap_or(Value::Null));
|
||||
}
|
||||
Err(e) => {
|
||||
run.status = RunStatus::Failed;
|
||||
run.error = Some(e.to_string());
|
||||
}
|
||||
}
|
||||
run.ended_at = Some(Utc::now());
|
||||
Ok(run.clone())
|
||||
} else {
|
||||
Err(ExecuteError::Action("执行后未找到运行记录".to_string()))
|
||||
}
|
||||
run.ended_at = Some(Utc::now());
|
||||
return Ok(run.clone());
|
||||
}
|
||||
};
|
||||
|
||||
Err(ExecuteError::Action("执行后未找到运行记录".to_string()))
|
||||
// Auto-cleanup old completed runs (after releasing the write lock)
|
||||
self.cleanup().await;
|
||||
|
||||
return_value
|
||||
}
|
||||
|
||||
/// Execute pipeline steps
|
||||
/// Execute pipeline steps with per-step timeout
|
||||
async fn execute_steps(
|
||||
&self,
|
||||
pipeline: &Pipeline,
|
||||
context: &mut ExecutionContext,
|
||||
run_id: &str,
|
||||
step_timeout_secs: u64,
|
||||
) -> Result<HashMap<String, Value>, ExecuteError> {
|
||||
let total_steps = pipeline.spec.steps.len();
|
||||
|
||||
@@ -161,8 +185,15 @@ impl PipelineExecutor {
|
||||
|
||||
tracing::info!("Executing step {} ({}/{})", step.id, idx + 1, total_steps);
|
||||
|
||||
// Execute action
|
||||
let result = self.execute_action(&step.action, context).await?;
|
||||
// Execute action with per-step timeout
|
||||
let timeout_duration = std::time::Duration::from_secs(step_timeout_secs);
|
||||
let result = tokio::time::timeout(
|
||||
timeout_duration,
|
||||
self.execute_action(&step.action, context),
|
||||
).await.map_err(|_| {
|
||||
tracing::error!("Step {} timed out after {}s", step.id, step_timeout_secs);
|
||||
ExecuteError::Timeout
|
||||
})??;
|
||||
|
||||
// Store result
|
||||
context.set_output(&step.id, result.clone());
|
||||
@@ -336,7 +367,16 @@ impl PipelineExecutor {
|
||||
}
|
||||
|
||||
Action::Delay { ms } => {
|
||||
tokio::time::sleep(tokio::time::Duration::from_millis(*ms)).await;
|
||||
let capped_ms = if *ms > MAX_DELAY_MS {
|
||||
tracing::warn!(
|
||||
"Delay ms {} exceeds max {}, capping to {}",
|
||||
ms, MAX_DELAY_MS, MAX_DELAY_MS
|
||||
);
|
||||
MAX_DELAY_MS
|
||||
} else {
|
||||
*ms
|
||||
};
|
||||
tokio::time::sleep(tokio::time::Duration::from_millis(capped_ms)).await;
|
||||
Ok(Value::Null)
|
||||
}
|
||||
|
||||
@@ -508,6 +548,33 @@ impl PipelineExecutor {
|
||||
/// Return a snapshot of every pipeline run currently tracked in memory.
///
/// Takes a read lock on the run map and clones each entry, so the
/// returned vector is independent of later mutations.
pub async fn list_runs(&self) -> Vec<PipelineRun> {
    let guard = self.runs.read().await;
    guard.values().cloned().collect()
}
|
||||
|
||||
/// Clean up old completed/failed/cancelled runs to prevent memory leaks.
|
||||
/// Keeps at most MAX_COMPLETED_RUNS finished runs, evicting the oldest first.
|
||||
pub async fn cleanup(&self) {
|
||||
let mut runs = self.runs.write().await;
|
||||
|
||||
// Collect IDs of finished runs (completed, failed, cancelled)
|
||||
let mut finished: Vec<(String, chrono::DateTime<Utc>)> = runs
|
||||
.iter()
|
||||
.filter(|(_, r)| matches!(r.status, RunStatus::Completed | RunStatus::Failed | RunStatus::Cancelled))
|
||||
.map(|(id, r)| (id.clone(), r.ended_at.unwrap_or(r.started_at)))
|
||||
.collect();
|
||||
|
||||
let to_remove = finished.len().saturating_sub(MAX_COMPLETED_RUNS);
|
||||
if to_remove > 0 {
|
||||
// Sort by end time ascending (oldest first)
|
||||
finished.sort_by_key(|(_, t)| *t);
|
||||
for (id, _) in finished.into_iter().take(to_remove) {
|
||||
runs.remove(&id);
|
||||
// Also clean up cancellation flag
|
||||
drop(runs);
|
||||
self.cancellations.write().await.remove(&id);
|
||||
runs = self.runs.write().await;
|
||||
}
|
||||
tracing::debug!("Cleaned up {} old pipeline runs", to_remove);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
Reference in New Issue
Block a user