fix: 发布前审计 Batch 1 — Pipeline 内存泄漏/超时 + Director 死锁 + Rate Limit Worker
Pipeline executor:
- 添加 cleanup() 方法,MAX_COMPLETED_RUNS=100 上限淘汰旧记录
- 每步执行添加 tokio::time::timeout(使用 PipelineSpec.timeout_secs,默认 300s)
- Delay ms 上限 60000,超出 warn 并截断

Director send_to_agent:
- 重构为 oneshot::channel 响应模式,避免 inbox + pending_requests 锁竞争
- 添加 ensure_inbox_reader() 独立任务分发响应到对应 oneshot sender

cleanup_rate_limit Worker:
- 实现 Worker body: DELETE FROM rate_limit_events WHERE created_at < NOW() - INTERVAL '1 hour'

651 tests passed, 0 failed
This commit is contained in:
@@ -12,7 +12,7 @@
|
||||
|
||||
use std::sync::Arc;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::sync::{RwLock, Mutex, mpsc};
|
||||
use tokio::sync::{RwLock, Mutex, mpsc, oneshot};
|
||||
use zclaw_types::{AgentId, Result, ZclawError};
|
||||
use zclaw_protocols::{A2aEnvelope, A2aMessageType, A2aRecipient, A2aRouter, A2aAgentProfile, A2aCapability};
|
||||
use zclaw_runtime::{LlmDriver, CompletionRequest};
|
||||
@@ -199,9 +199,9 @@ pub struct Director {
|
||||
director_id: AgentId,
|
||||
/// Optional LLM driver for intelligent scheduling
|
||||
llm_driver: Option<Arc<dyn LlmDriver>>,
|
||||
/// Inbox for receiving responses (stores pending request IDs and their response channels)
|
||||
pending_requests: Arc<Mutex<std::collections::HashMap<String, mpsc::Sender<A2aEnvelope>>>>,
|
||||
/// Receiver for incoming messages
|
||||
/// Pending request response channels (request_id → oneshot sender)
|
||||
pending_requests: Arc<Mutex<std::collections::HashMap<String, oneshot::Sender<A2aEnvelope>>>>,
|
||||
/// Receiver for incoming messages (consumed by inbox reader task)
|
||||
inbox: Arc<Mutex<Option<mpsc::Receiver<A2aEnvelope>>>>,
|
||||
}
|
||||
|
||||
@@ -481,13 +481,16 @@ Respond with ONLY the number (1-{}) of the agent who should speak next. No expla
|
||||
}
|
||||
|
||||
/// Send message to selected agent and wait for response
|
||||
///
|
||||
/// Uses oneshot channels to avoid deadlock: each call creates its own
|
||||
/// response channel, and a shared inbox reader dispatches responses.
|
||||
pub async fn send_to_agent(
|
||||
&self,
|
||||
agent: &DirectorAgent,
|
||||
message: String,
|
||||
) -> Result<String> {
|
||||
// Create a response channel for this request
|
||||
let (_response_tx, mut _response_rx) = mpsc::channel::<A2aEnvelope>(1);
|
||||
// Create a oneshot channel for this specific request's response
|
||||
let (response_tx, response_rx) = oneshot::channel::<A2aEnvelope>();
|
||||
|
||||
let envelope = A2aEnvelope::new(
|
||||
self.director_id.clone(),
|
||||
@@ -500,50 +503,32 @@ Respond with ONLY the number (1-{}) of the agent who should speak next. No expla
|
||||
}),
|
||||
);
|
||||
|
||||
// Store the request ID with its response channel
|
||||
// Store the oneshot sender so the inbox reader can dispatch to it
|
||||
let request_id = envelope.id.clone();
|
||||
{
|
||||
let mut pending = self.pending_requests.lock().await;
|
||||
pending.insert(request_id.clone(), _response_tx);
|
||||
pending.insert(request_id.clone(), response_tx);
|
||||
}
|
||||
|
||||
// Send the request
|
||||
self.router.route(envelope).await?;
|
||||
|
||||
// Wait for response with timeout
|
||||
// Ensure the inbox reader is running
|
||||
self.ensure_inbox_reader().await;
|
||||
|
||||
// Wait for response on our dedicated oneshot channel with timeout
|
||||
let timeout_duration = std::time::Duration::from_secs(self.config.response_timeout);
|
||||
let request_id_clone = request_id.clone();
|
||||
|
||||
let response = tokio::time::timeout(timeout_duration, async {
|
||||
// Poll the inbox for responses
|
||||
let mut inbox_guard = self.inbox.lock().await;
|
||||
if let Some(ref mut rx) = *inbox_guard {
|
||||
while let Some(msg) = rx.recv().await {
|
||||
// Check if this is a response to our request
|
||||
if msg.message_type == A2aMessageType::Response {
|
||||
if let Some(ref reply_to) = msg.reply_to {
|
||||
if reply_to == &request_id_clone {
|
||||
// Found our response
|
||||
return Some(msg);
|
||||
}
|
||||
}
|
||||
}
|
||||
// Not our response, continue waiting
|
||||
// (In a real implementation, we'd re-queue non-matching messages)
|
||||
}
|
||||
}
|
||||
None
|
||||
}).await;
|
||||
let response = tokio::time::timeout(timeout_duration, response_rx).await;
|
||||
|
||||
// Clean up pending request
|
||||
// Clean up pending request (sender already consumed on success)
|
||||
{
|
||||
let mut pending = self.pending_requests.lock().await;
|
||||
pending.remove(&request_id);
|
||||
}
|
||||
|
||||
match response {
|
||||
Ok(Some(envelope)) => {
|
||||
// Extract response text from payload
|
||||
Ok(Ok(envelope)) => {
|
||||
let response_text = envelope.payload
|
||||
.get("response")
|
||||
.and_then(|v: &serde_json::Value| v.as_str())
|
||||
@@ -551,7 +536,7 @@ Respond with ONLY the number (1-{}) of the agent who should speak next. No expla
|
||||
.to_string();
|
||||
Ok(response_text)
|
||||
}
|
||||
Ok(None) => {
|
||||
Ok(Err(_)) => {
|
||||
Err(ZclawError::Timeout("No response received".into()))
|
||||
}
|
||||
Err(_) => {
|
||||
@@ -563,6 +548,44 @@ Respond with ONLY the number (1-{}) of the agent who should speak next. No expla
|
||||
}
|
||||
}
|
||||
|
||||
/// Ensure the inbox reader task is running.
|
||||
/// The inbox reader continuously reads from the shared inbox channel
|
||||
/// and dispatches each response to the correct oneshot sender.
|
||||
async fn ensure_inbox_reader(&self) {
|
||||
// Quick check: if inbox has already been taken, reader is running
|
||||
{
|
||||
let inbox = self.inbox.lock().await;
|
||||
if inbox.is_none() {
|
||||
return; // Reader already spawned and consumed the receiver
|
||||
}
|
||||
}
|
||||
|
||||
// Take the receiver out (only once)
|
||||
let rx = {
|
||||
let mut inbox = self.inbox.lock().await;
|
||||
inbox.take()
|
||||
};
|
||||
|
||||
if let Some(mut rx) = rx {
|
||||
let pending = self.pending_requests.clone();
|
||||
tokio::spawn(async move {
|
||||
while let Some(msg) = rx.recv().await {
|
||||
// Find and dispatch to the correct oneshot sender
|
||||
if msg.message_type == A2aMessageType::Response {
|
||||
if let Some(ref reply_to) = msg.reply_to {
|
||||
let mut pending_guard = pending.lock().await;
|
||||
if let Some(sender) = pending_guard.remove(reply_to) {
|
||||
// Send the response; if receiver already dropped, that's fine
|
||||
let _ = sender.send(msg);
|
||||
}
|
||||
}
|
||||
}
|
||||
// Non-response messages are dropped (notifications, etc.)
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/// Broadcast message to all agents
|
||||
pub async fn broadcast(&self, message: String) -> Result<()> {
|
||||
let envelope = A2aEnvelope::new(
|
||||
|
||||
Reference in New Issue
Block a user