fix: 发布前审计 Batch 1 — Pipeline 内存泄漏/超时 + Director 死锁 + Rate Limit Worker

Pipeline executor:
- 添加 cleanup() 方法，MAX_COMPLETED_RUNS=100 上限淘汰旧记录
- 每步执行添加 tokio::time::timeout（使用 PipelineSpec.timeout_secs，默认 300s）
- Delay ms 上限 60000，超出 warn 并截断

Director send_to_agent:
- 重构为 oneshot::channel 响应模式，避免 inbox + pending_requests 锁竞争
- 添加 ensure_inbox_reader() 独立任务分发响应到对应 oneshot sender

cleanup_rate_limit Worker:
- 实现 Worker body: DELETE FROM rate_limit_events WHERE created_at < NOW() - INTERVAL '1 hour'

651 tests passed, 0 failed
2026-04-18 14:09:16 +08:00
parent 35a11504d7
commit f3fb5340b5
3 changed files with 171 additions and 57 deletions
--- a/crates/zclaw-kernel/src/director.rs
+++ b/crates/zclaw-kernel/src/director.rs
@@ -12,7 +12,7 @@

 use std::sync::Arc;
 use serde::{Deserialize, Serialize};
-use tokio::sync::{RwLock, Mutex, mpsc};
+use tokio::sync::{RwLock, Mutex, mpsc, oneshot};
 use zclaw_types::{AgentId, Result, ZclawError};
 use zclaw_protocols::{A2aEnvelope, A2aMessageType, A2aRecipient, A2aRouter, A2aAgentProfile, A2aCapability};
 use zclaw_runtime::{LlmDriver, CompletionRequest};
@@ -199,9 +199,9 @@ pub struct Director {
    director_id: AgentId,
    /// Optional LLM driver for intelligent scheduling
    llm_driver: Option<Arc<dyn LlmDriver>>,
-    /// Inbox for receiving responses (stores pending request IDs and their response channels)
-    pending_requests: Arc<Mutex<std::collections::HashMap<String, mpsc::Sender<A2aEnvelope>>>>,
-    /// Receiver for incoming messages
+    /// Pending request response channels (request_id → oneshot sender)
+    pending_requests: Arc<Mutex<std::collections::HashMap<String, oneshot::Sender<A2aEnvelope>>>>,
+    /// Receiver for incoming messages (consumed by inbox reader task)
    inbox: Arc<Mutex<Option<mpsc::Receiver<A2aEnvelope>>>>,
 }

@@ -481,13 +481,16 @@ Respond with ONLY the number (1-{}) of the agent who should speak next. No expla
    }

    /// Send message to selected agent and wait for response
+    ///
+    /// Uses oneshot channels to avoid deadlock: each call creates its own
+    /// response channel, and a shared inbox reader dispatches responses.
    pub async fn send_to_agent(
        &self,
        agent: &DirectorAgent,
        message: String,
    ) -> Result<String> {
-        // Create a response channel for this request
-        let (_response_tx, mut _response_rx) = mpsc::channel::<A2aEnvelope>(1);
+        // Create a oneshot channel for this specific request's response
+        let (response_tx, response_rx) = oneshot::channel::<A2aEnvelope>();

        let envelope = A2aEnvelope::new(
            self.director_id.clone(),
@@ -500,50 +503,32 @@ Respond with ONLY the number (1-{}) of the agent who should speak next. No expla
            }),
        );

-        // Store the request ID with its response channel
+        // Store the oneshot sender so the inbox reader can dispatch to it
        let request_id = envelope.id.clone();
        {
            let mut pending = self.pending_requests.lock().await;
-            pending.insert(request_id.clone(), _response_tx);
+            pending.insert(request_id.clone(), response_tx);
        }

        // Send the request
        self.router.route(envelope).await?;

-        // Wait for response with timeout
+        // Ensure the inbox reader is running
+        self.ensure_inbox_reader().await;
+
+        // Wait for response on our dedicated oneshot channel with timeout
        let timeout_duration = std::time::Duration::from_secs(self.config.response_timeout);
-        let request_id_clone = request_id.clone();

-        let response = tokio::time::timeout(timeout_duration, async {
-            // Poll the inbox for responses
-            let mut inbox_guard = self.inbox.lock().await;
-            if let Some(ref mut rx) = *inbox_guard {
-                while let Some(msg) = rx.recv().await {
-                    // Check if this is a response to our request
-                    if msg.message_type == A2aMessageType::Response {
-                        if let Some(ref reply_to) = msg.reply_to {
-                            if reply_to == &request_id_clone {
-                                // Found our response
-                                return Some(msg);
-                            }
-                        }
-                    }
-                    // Not our response, continue waiting
-                    // (In a real implementation, we'd re-queue non-matching messages)
-                }
-            }
-            None
-        }).await;
+        let response = tokio::time::timeout(timeout_duration, response_rx).await;

-        // Clean up pending request
+        // Clean up pending request (sender already consumed on success)
        {
            let mut pending = self.pending_requests.lock().await;
            pending.remove(&request_id);
        }

        match response {
-            Ok(Some(envelope)) => {
-                // Extract response text from payload
+            Ok(Ok(envelope)) => {
                let response_text = envelope.payload
                    .get("response")
                    .and_then(|v: &serde_json::Value| v.as_str())
@@ -551,7 +536,7 @@ Respond with ONLY the number (1-{}) of the agent who should speak next. No expla
                    .to_string();
                Ok(response_text)
            }
-            Ok(None) => {
+            Ok(Err(_)) => {
                Err(ZclawError::Timeout("No response received".into()))
            }
            Err(_) => {
@@ -563,6 +548,44 @@ Respond with ONLY the number (1-{}) of the agent who should speak next. No expla
        }
    }

+    /// Start the inbox reader task unless an earlier call already did so.
+    /// The first call takes the receiver out of `self.inbox` and spawns a task that
+    /// forwards each `Response` envelope to the oneshot sender stored under its `reply_to`.
+    async fn ensure_inbox_reader(&self) {
+        // Fast path: a `None` inbox means an earlier call already took the receiver
+        {
+            let inbox = self.inbox.lock().await;
+            if inbox.is_none() {
+                return; // Receiver was already taken, so there is nothing to spawn
+            }
+        }
+
+        // Re-lock and `take()`: it leaves `None` behind, so only one caller can get `Some`
+        let rx = {
+            let mut inbox = self.inbox.lock().await;
+            inbox.take()
+        };
+
+        if let Some(mut rx) = rx {
+            let pending = self.pending_requests.clone();
+            tokio::spawn(async move {
+                while let Some(msg) = rx.recv().await {
+                    // Only `Response` envelopes carrying a `reply_to` are matched to a sender
+                    if msg.message_type == A2aMessageType::Response {
+                        if let Some(ref reply_to) = msg.reply_to {
+                            let mut pending_guard = pending.lock().await;
+                            if let Some(sender) = pending_guard.remove(reply_to) {
+                                // Send result is ignored: it fails only if the receiver was dropped
+                                let _ = sender.send(msg);
+                            }
+                        }
+                    }
+                    // Everything else (non-responses, unmatched or missing `reply_to`) is discarded
+                }
+            });
+        }
+    }
+
    /// Broadcast message to all agents
    pub async fn broadcast(&self, message: String) -> Result<()> {
        let envelope = A2aEnvelope::new(