fix: 发布前审计 Batch 1 — Pipeline 内存泄漏/超时 + Director 死锁 + Rate Limit Worker
Pipeline executor:
- 添加 cleanup() 方法,MAX_COMPLETED_RUNS=100 上限淘汰旧记录
- 每步执行添加 tokio::time::timeout(使用 PipelineSpec.timeout_secs,默认 300s)
- Delay ms 上限 60000,超出 warn 并截断

Director send_to_agent:
- 重构为 oneshot::channel 响应模式,避免 inbox + pending_requests 锁竞争
- 添加 ensure_inbox_reader() 独立任务分发响应到对应 oneshot sender

cleanup_rate_limit Worker:
- 实现 Worker body: DELETE FROM rate_limit_events WHERE created_at < NOW() - INTERVAL '1 hour'

651 tests passed, 0 failed
This commit is contained in:
@@ -40,6 +40,15 @@ pub enum ExecuteError {
|
||||
Io(#[from] std::io::Error),
|
||||
}
|
||||
|
||||
/// Maximum completed/failed/cancelled runs to keep in memory
|
||||
const MAX_COMPLETED_RUNS: usize = 100;
|
||||
|
||||
/// Maximum allowed delay in milliseconds (60 seconds)
|
||||
const MAX_DELAY_MS: u64 = 60_000;
|
||||
|
||||
/// Default per-step timeout (5 minutes)
|
||||
const DEFAULT_STEP_TIMEOUT_SECS: u64 = 300;
|
||||
|
||||
/// Pipeline executor
|
||||
pub struct PipelineExecutor {
|
||||
/// Action registry
|
||||
@@ -107,35 +116,50 @@ impl PipelineExecutor {
|
||||
// Create execution context
|
||||
let mut context = ExecutionContext::new(inputs);
|
||||
|
||||
// Determine per-step timeout from pipeline spec (0 means use default)
|
||||
let step_timeout = if pipeline.spec.timeout_secs > 0 {
|
||||
pipeline.spec.timeout_secs
|
||||
} else {
|
||||
DEFAULT_STEP_TIMEOUT_SECS
|
||||
};
|
||||
|
||||
// Execute steps
|
||||
let result = self.execute_steps(pipeline, &mut context, &run_id).await;
|
||||
let result = self.execute_steps(pipeline, &mut context, &run_id, step_timeout).await;
|
||||
|
||||
// Update run state
|
||||
let mut runs = self.runs.write().await;
|
||||
if let Some(run) = runs.get_mut(&run_id) {
|
||||
match result {
|
||||
Ok(outputs) => {
|
||||
run.status = RunStatus::Completed;
|
||||
run.outputs = Some(serde_json::to_value(&outputs).unwrap_or(Value::Null));
|
||||
}
|
||||
Err(e) => {
|
||||
run.status = RunStatus::Failed;
|
||||
run.error = Some(e.to_string());
|
||||
let return_value = {
|
||||
let mut runs = self.runs.write().await;
|
||||
if let Some(run) = runs.get_mut(&run_id) {
|
||||
match result {
|
||||
Ok(outputs) => {
|
||||
run.status = RunStatus::Completed;
|
||||
run.outputs = Some(serde_json::to_value(&outputs).unwrap_or(Value::Null));
|
||||
}
|
||||
Err(e) => {
|
||||
run.status = RunStatus::Failed;
|
||||
run.error = Some(e.to_string());
|
||||
}
|
||||
}
|
||||
run.ended_at = Some(Utc::now());
|
||||
Ok(run.clone())
|
||||
} else {
|
||||
Err(ExecuteError::Action("执行后未找到运行记录".to_string()))
|
||||
}
|
||||
run.ended_at = Some(Utc::now());
|
||||
return Ok(run.clone());
|
||||
}
|
||||
};
|
||||
|
||||
Err(ExecuteError::Action("执行后未找到运行记录".to_string()))
|
||||
// Auto-cleanup old completed runs (after releasing the write lock)
|
||||
self.cleanup().await;
|
||||
|
||||
return_value
|
||||
}
|
||||
|
||||
/// Execute pipeline steps
|
||||
/// Execute pipeline steps with per-step timeout
|
||||
async fn execute_steps(
|
||||
&self,
|
||||
pipeline: &Pipeline,
|
||||
context: &mut ExecutionContext,
|
||||
run_id: &str,
|
||||
step_timeout_secs: u64,
|
||||
) -> Result<HashMap<String, Value>, ExecuteError> {
|
||||
let total_steps = pipeline.spec.steps.len();
|
||||
|
||||
@@ -161,8 +185,15 @@ impl PipelineExecutor {
|
||||
|
||||
tracing::info!("Executing step {} ({}/{})", step.id, idx + 1, total_steps);
|
||||
|
||||
// Execute action
|
||||
let result = self.execute_action(&step.action, context).await?;
|
||||
// Execute action with per-step timeout
|
||||
let timeout_duration = std::time::Duration::from_secs(step_timeout_secs);
|
||||
let result = tokio::time::timeout(
|
||||
timeout_duration,
|
||||
self.execute_action(&step.action, context),
|
||||
).await.map_err(|_| {
|
||||
tracing::error!("Step {} timed out after {}s", step.id, step_timeout_secs);
|
||||
ExecuteError::Timeout
|
||||
})??;
|
||||
|
||||
// Store result
|
||||
context.set_output(&step.id, result.clone());
|
||||
@@ -336,7 +367,16 @@ impl PipelineExecutor {
|
||||
}
|
||||
|
||||
Action::Delay { ms } => {
|
||||
tokio::time::sleep(tokio::time::Duration::from_millis(*ms)).await;
|
||||
let capped_ms = if *ms > MAX_DELAY_MS {
|
||||
tracing::warn!(
|
||||
"Delay ms {} exceeds max {}, capping to {}",
|
||||
ms, MAX_DELAY_MS, MAX_DELAY_MS
|
||||
);
|
||||
MAX_DELAY_MS
|
||||
} else {
|
||||
*ms
|
||||
};
|
||||
tokio::time::sleep(tokio::time::Duration::from_millis(capped_ms)).await;
|
||||
Ok(Value::Null)
|
||||
}
|
||||
|
||||
@@ -508,6 +548,33 @@ impl PipelineExecutor {
|
||||
/// Return a snapshot of every pipeline run currently tracked in memory.
///
/// Takes a read lock on the run map and clones each entry, so the
/// returned vector is independent of later mutations.
pub async fn list_runs(&self) -> Vec<PipelineRun> {
    let guard = self.runs.read().await;
    guard.values().cloned().collect()
}
|
||||
|
||||
/// Clean up old completed/failed/cancelled runs to prevent memory leaks.
|
||||
/// Keeps at most MAX_COMPLETED_RUNS finished runs, evicting the oldest first.
|
||||
pub async fn cleanup(&self) {
|
||||
let mut runs = self.runs.write().await;
|
||||
|
||||
// Collect IDs of finished runs (completed, failed, cancelled)
|
||||
let mut finished: Vec<(String, chrono::DateTime<Utc>)> = runs
|
||||
.iter()
|
||||
.filter(|(_, r)| matches!(r.status, RunStatus::Completed | RunStatus::Failed | RunStatus::Cancelled))
|
||||
.map(|(id, r)| (id.clone(), r.ended_at.unwrap_or(r.started_at)))
|
||||
.collect();
|
||||
|
||||
let to_remove = finished.len().saturating_sub(MAX_COMPLETED_RUNS);
|
||||
if to_remove > 0 {
|
||||
// Sort by end time ascending (oldest first)
|
||||
finished.sort_by_key(|(_, t)| *t);
|
||||
for (id, _) in finished.into_iter().take(to_remove) {
|
||||
runs.remove(&id);
|
||||
// Also clean up cancellation flag
|
||||
drop(runs);
|
||||
self.cancellations.write().await.remove(&id);
|
||||
runs = self.runs.write().await;
|
||||
}
|
||||
tracing::debug!("Cleaned up {} old pipeline runs", to_remove);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
Reference in New Issue
Block a user