//! Context compaction for the agent loop. //! //! Provides rule-based token estimation and message compaction to prevent //! conversations from exceeding LLM context windows. When the estimated //! token count exceeds the configured threshold, older messages are //! summarized into a single system message and only recent messages are //! retained. //! //! Supports two compaction modes: //! - **Rule-based**: Heuristic topic extraction (default, no LLM needed) //! - **LLM-based**: Uses an LLM driver to generate higher-quality summaries //! //! Optionally flushes old messages to the growth/memory system before discarding. use std::sync::Arc; use std::sync::atomic::{AtomicU64, Ordering}; use zclaw_types::{AgentId, Message, SessionId}; use crate::driver::{CompletionRequest, ContentBlock, LlmDriver}; use crate::growth::GrowthIntegration; /// Number of recent messages to preserve after compaction. const DEFAULT_KEEP_RECENT: usize = 6; /// Heuristic token count estimation. /// /// CJK characters ≈ 1.5 tokens each, English words ≈ 1.3 tokens each. /// Intentionally conservative (overestimates) to avoid hitting real limits. 
pub fn estimate_tokens(text: &str) -> usize { if text.is_empty() { return 0; } let mut tokens: f64 = 0.0; for char in text.chars() { let code = char as u32; if (0x4E00..=0x9FFF).contains(&code) || (0x3400..=0x4DBF).contains(&code) || (0x20000..=0x2A6DF).contains(&code) || (0xF900..=0xFAFF).contains(&code) { // CJK ideographs — ~1.5 tokens tokens += 1.5; } else if (0xAC00..=0xD7AF).contains(&code) || (0x1100..=0x11FF).contains(&code) { // Korean Hangul syllables + Jamo — ~1.5 tokens tokens += 1.5; } else if (0x3040..=0x309F).contains(&code) || (0x30A0..=0x30FF).contains(&code) { // Japanese Hiragana + Katakana — ~1.5 tokens tokens += 1.5; } else if (0x3000..=0x303F).contains(&code) || (0xFF00..=0xFFEF).contains(&code) { // CJK / fullwidth punctuation — ~1.0 token tokens += 1.0; } else if (0x1F000..=0x1FAFF).contains(&code) || (0x2600..=0x27BF).contains(&code) { // Emoji & Symbols — ~2.0 tokens tokens += 2.0; } else if char == ' ' || char == '\n' || char == '\t' { // whitespace tokens += 0.25; } else { // ASCII / Latin characters — roughly 4 chars per token tokens += 0.3; } } tokens.ceil() as usize } /// Estimate total tokens for a list of messages (including framing overhead). pub fn estimate_messages_tokens(messages: &[Message]) -> usize { let mut total = 0; for msg in messages { match msg { Message::User { content } => { total += estimate_tokens(content); total += 4; } Message::Assistant { content, thinking } => { total += estimate_tokens(content); if let Some(th) = thinking { total += estimate_tokens(th); } total += 4; } Message::System { content } => { total += estimate_tokens(content); total += 4; } Message::ToolUse { input, .. } => { total += estimate_tokens(&input.to_string()); total += 4; } Message::ToolResult { output, .. 
} => { total += estimate_tokens(&output.to_string()); total += 4; } } } total } // ============================================================ // Calibration: adjust heuristic estimates using API feedback // ============================================================ const F64_1_0_BITS: u64 = 4607182418800017408u64; // 1.0f64.to_bits() /// Global calibration factor for token estimation (stored as f64 bits). /// /// Updated via exponential moving average when API returns actual token counts. /// Initial value is 1.0 (no adjustment). static CALIBRATION_FACTOR_BITS: AtomicU64 = AtomicU64::new(F64_1_0_BITS); /// Get the current calibration factor. pub fn get_calibration_factor() -> f64 { f64::from_bits(CALIBRATION_FACTOR_BITS.load(Ordering::Relaxed)) } /// Update calibration factor using exponential moving average. /// /// Compares estimated tokens with actual tokens from API response: /// - `ratio = actual / estimated` so underestimates push factor UP /// - EMA: `new = current * 0.7 + ratio * 0.3` /// - Clamped to [0.5, 2.0] to prevent runaway values pub fn update_calibration(estimated: usize, actual: u32) { if actual == 0 || estimated == 0 { return; } let ratio = actual as f64 / estimated as f64; let current = get_calibration_factor(); let new_factor = (current * 0.7 + ratio * 0.3).clamp(0.5, 2.0); CALIBRATION_FACTOR_BITS.store(new_factor.to_bits(), Ordering::Relaxed); tracing::debug!( "[Compaction] Calibration: estimated={}, actual={}, ratio={:.2}, factor {:.2} → {:.2}", estimated, actual, ratio, current, new_factor ); } /// Estimate total tokens for messages with calibration applied. fn estimate_messages_tokens_calibrated(messages: &[Message]) -> usize { let raw = estimate_messages_tokens(messages); let factor = get_calibration_factor(); if (factor - 1.0).abs() < f64::EPSILON { raw } else { ((raw as f64 * factor).ceil()) as usize } } /// Compact a message list by summarizing old messages and keeping recent ones. 
///
/// When `messages.len() > keep_recent`, the oldest messages are summarized
/// into a single system message. System messages at the beginning of the
/// conversation are always preserved.
///
/// Returns the compacted message list and the number of original messages removed.
pub fn compact_messages(messages: Vec<Message>, keep_recent: usize) -> (Vec<Message>, usize) {
    if messages.len() <= keep_recent {
        return (messages, 0);
    }

    // Preserve leading system messages (they contain compaction summaries from prior runs)
    let leading_system_count = messages
        .iter()
        .take_while(|m| matches!(m, Message::System { .. }))
        .count();

    // Calculate split point: keep leading system + recent messages
    let keep_from_end = keep_recent.min(messages.len().saturating_sub(leading_system_count));
    let split_index = messages.len().saturating_sub(keep_from_end);
    // Ensure we keep at least the leading system messages
    let split_index = split_index.max(leading_system_count);

    if split_index == 0 {
        return (messages, 0);
    }

    let old_messages = &messages[..split_index];
    let recent_messages = &messages[split_index..];

    let summary = generate_summary(old_messages);
    let removed_count = old_messages.len();

    let mut compacted = Vec::with_capacity(1 + recent_messages.len());
    compacted.push(Message::system(summary));
    compacted.extend(recent_messages.iter().cloned());

    (compacted, removed_count)
}

/// Check if compaction should be triggered and perform it if needed.
///
/// Returns the (possibly compacted) message list.
pub fn maybe_compact(messages: Vec<Message>, threshold: usize) -> Vec<Message> {
    let tokens = estimate_messages_tokens_calibrated(&messages);
    if tokens < threshold {
        return messages;
    }
    tracing::info!(
        "[Compaction] Triggered: {} tokens > {} threshold, {} messages",
        tokens,
        threshold,
        messages.len(),
    );
    let (compacted, removed) = compact_messages(messages, DEFAULT_KEEP_RECENT);
    tracing::info!(
        "[Compaction] Removed {} messages, {} remain",
        removed,
        compacted.len(),
    );
    compacted
}

/// Configuration for compaction behavior.
#[derive(Debug, Clone)] pub struct CompactionConfig { /// Use LLM for generating summaries instead of rule-based extraction. pub use_llm: bool, /// Fall back to rule-based summary if LLM fails. pub llm_fallback_to_rules: bool, /// Flush memories from old messages before discarding them. pub memory_flush_enabled: bool, /// Maximum tokens for LLM-generated summary. pub summary_max_tokens: u32, } impl Default for CompactionConfig { fn default() -> Self { Self { use_llm: false, llm_fallback_to_rules: true, memory_flush_enabled: false, summary_max_tokens: 500, } } } /// Outcome of an async compaction operation. #[derive(Debug, Clone)] pub struct CompactionOutcome { /// The (possibly compacted) message list. pub messages: Vec, /// Number of messages removed during compaction. pub removed_count: usize, /// Number of memories flushed to the growth system. pub flushed_memories: usize, /// Whether LLM was used for summary generation. pub used_llm: bool, } /// Async compaction with optional LLM summary and memory flushing. /// /// When `messages` exceed `threshold` tokens: /// 1. If `memory_flush_enabled`, extract memories from old messages via growth system /// 2. Generate summary (LLM or rule-based depending on config) /// 3. 
Replace old messages with summary + keep recent messages pub async fn maybe_compact_with_config( messages: Vec, threshold: usize, config: &CompactionConfig, agent_id: &AgentId, session_id: &SessionId, driver: Option<&Arc>, growth: Option<&GrowthIntegration>, ) -> CompactionOutcome { let tokens = estimate_messages_tokens_calibrated(&messages); if tokens < threshold { return CompactionOutcome { messages, removed_count: 0, flushed_memories: 0, used_llm: false, }; } tracing::info!( "[Compaction] Triggered: {} tokens > {} threshold, {} messages", tokens, threshold, messages.len(), ); // Step 1: Flush memories from messages that are about to be compacted let flushed_memories = if config.memory_flush_enabled { if let Some(growth) = growth { match growth .process_conversation(agent_id, &messages, session_id.clone()) .await { Ok(count) => { tracing::info!( "[Compaction] Flushed {} memories before compaction", count ); count } Err(e) => { tracing::warn!("[Compaction] Memory flush failed: {}", e); 0 } } } else { tracing::debug!("[Compaction] Memory flush requested but no growth integration available"); 0 } } else { 0 }; // Step 2: Determine split point (same logic as compact_messages) let leading_system_count = messages .iter() .take_while(|m| matches!(m, Message::System { .. 
})) .count(); let keep_from_end = DEFAULT_KEEP_RECENT .min(messages.len().saturating_sub(leading_system_count)); let split_index = messages.len().saturating_sub(keep_from_end); let split_index = split_index.max(leading_system_count); if split_index == 0 { return CompactionOutcome { messages, removed_count: 0, flushed_memories, used_llm: false, }; } let old_messages = &messages[..split_index]; let recent_messages = &messages[split_index..]; let removed_count = old_messages.len(); // Step 3: Generate summary (LLM or rule-based) let summary = if config.use_llm { if let Some(driver) = driver { match generate_llm_summary(driver, old_messages, config.summary_max_tokens).await { Ok(llm_summary) => { tracing::info!( "[Compaction] Generated LLM summary ({} chars)", llm_summary.len() ); llm_summary } Err(e) => { if config.llm_fallback_to_rules { tracing::warn!( "[Compaction] LLM summary failed: {}, falling back to rules", e ); generate_summary(old_messages) } else { tracing::warn!( "[Compaction] LLM summary failed: {}, returning original messages", e ); return CompactionOutcome { messages, removed_count: 0, flushed_memories, used_llm: false, }; } } } } else { tracing::warn!( "[Compaction] LLM compaction requested but no driver available, using rules" ); generate_summary(old_messages) } } else { generate_summary(old_messages) }; let used_llm = config.use_llm && driver.is_some(); // Step 4: Build compacted message list let mut compacted = Vec::with_capacity(1 + recent_messages.len()); compacted.push(Message::system(summary)); compacted.extend(recent_messages.iter().cloned()); tracing::info!( "[Compaction] Removed {} messages, {} remain (llm={})", removed_count, compacted.len(), used_llm, ); CompactionOutcome { messages: compacted, removed_count, flushed_memories, used_llm, } } /// Generate a summary using an LLM driver. 
async fn generate_llm_summary( driver: &Arc, messages: &[Message], max_tokens: u32, ) -> Result { let mut conversation_text = String::new(); for msg in messages { match msg { Message::User { content } => { conversation_text.push_str(&format!("用户: {}\n", content)) } Message::Assistant { content, .. } => { conversation_text.push_str(&format!("助手: {}\n", content)) } Message::System { content } => { if !content.starts_with("[以下是之前对话的摘要]") { conversation_text.push_str(&format!("[系统]: {}\n", content)) } } Message::ToolUse { tool, input, .. } => { conversation_text.push_str(&format!( "[工具调用 {}]: {}\n", tool.as_str(), input )) } Message::ToolResult { output, .. } => { conversation_text.push_str(&format!("[工具结果]: {}\n", output)) } } } // Truncate conversation text if too long for the prompt itself let max_conversation_chars = 8000; if conversation_text.len() > max_conversation_chars { conversation_text.truncate(max_conversation_chars); conversation_text.push_str("\n...(对话已截断)"); } let prompt = format!( "请用简洁的中文总结以下对话的关键信息。保留重要的讨论主题、决策、结论和待办事项。\ 输出格式为段落式摘要,不超过200字。\n\n{}", conversation_text ); let request = CompletionRequest { model: String::new(), system: Some( "你是一个对话摘要助手。只输出摘要内容,不要添加额外解释。".to_string(), ), messages: vec![Message::user(&prompt)], tools: Vec::new(), max_tokens: Some(max_tokens), temperature: Some(0.3), stop: Vec::new(), stream: false, thinking_enabled: false, reasoning_effort: None, plan_mode: false, }; let response = driver .complete(request) .await .map_err(|e| format!("{}", e))?; // Extract text from content blocks let text_parts: Vec = response .content .iter() .filter_map(|block| match block { ContentBlock::Text { text } => Some(text.clone()), _ => None, }) .collect(); let summary = text_parts.join(""); if summary.is_empty() { return Err("LLM returned empty response".to_string()); } Ok(summary) } /// Generate a rule-based summary of old messages. 
fn generate_summary(messages: &[Message]) -> String { if messages.is_empty() { return "[对话开始]".to_string(); } let mut sections: Vec = vec!["[以下是之前对话的摘要]".to_string()]; let mut user_count = 0; let mut assistant_count = 0; let mut topics: Vec = Vec::new(); for msg in messages { match msg { Message::User { content } => { user_count += 1; let topic = extract_topic(content); if let Some(t) = topic { topics.push(t); } } Message::Assistant { .. } => { assistant_count += 1; } Message::System { content } => { // Skip system messages that are previous compaction summaries if !content.starts_with("[以下是之前对话的摘要]") { sections.push(format!("系统提示: {}", truncate(content, 60))); } } Message::ToolUse { tool, .. } => { sections.push(format!("工具调用: {}", tool.as_str())); } Message::ToolResult { .. } => { // Skip tool results in summary } } } if !topics.is_empty() { let topic_list: Vec = topics.iter().take(8).cloned().collect(); sections.push(format!("讨论主题: {}", topic_list.join("; "))); } sections.push(format!( "(已压缩 {} 条消息,其中用户 {} 条,助手 {} 条)", messages.len(), user_count, assistant_count, )); let summary = sections.join("\n"); // Enforce max length (char-safe for CJK) let max_chars = 800; if summary.chars().count() > max_chars { let truncated: String = summary.chars().take(max_chars).collect(); format!("{}...\n(摘要已截断)", truncated) } else { summary } } /// Extract the main topic from a user message (first sentence or first 50 chars). fn extract_topic(content: &str) -> Option { let trimmed = content.trim(); if trimmed.is_empty() { return None; } // Find sentence end markers for (i, char) in trimmed.char_indices() { if char == '。' || char == '!' || char == '?' || char == '\n' { let end = i + char.len_utf8(); if end <= 80 { return Some(trimmed[..end].trim().to_string()); } break; } } if trimmed.chars().count() <= 50 { return Some(trimmed.to_string()); } Some(format!("{}...", trimmed.chars().take(50).collect::())) } /// Truncate text to max_chars at char boundary. 
fn truncate(text: &str, max_chars: usize) -> String {
    if text.chars().count() <= max_chars {
        return text.to_string();
    }
    let truncated: String = text.chars().take(max_chars).collect();
    format!("{}...", truncated)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_estimate_tokens_empty() {
        assert_eq!(estimate_tokens(""), 0);
    }

    #[test]
    fn test_estimate_tokens_english() {
        let tokens = estimate_tokens("Hello world");
        assert!(tokens > 0);
    }

    #[test]
    fn test_estimate_tokens_cjk() {
        let tokens = estimate_tokens("你好世界");
        assert!(tokens > 3); // CJK chars are ~1.5 tokens each
    }

    #[test]
    fn test_estimate_messages_tokens() {
        let messages = vec![
            Message::user("Hello"),
            Message::assistant("Hi there"),
        ];
        let tokens = estimate_messages_tokens(&messages);
        assert!(tokens > 0);
    }

    #[test]
    fn test_compact_messages_under_threshold() {
        let messages = vec![
            Message::user("Hello"),
            Message::assistant("Hi"),
        ];
        let (result, removed) = compact_messages(messages, 6);
        assert_eq!(removed, 0);
        assert_eq!(result.len(), 2);
    }

    #[test]
    fn test_compact_messages_over_threshold() {
        let messages: Vec<Message> = (0..10)
            .flat_map(|i| {
                vec![
                    Message::user(format!("Question {}", i)),
                    Message::assistant(format!("Answer {}", i)),
                ]
            })
            .collect();
        let (result, removed) = compact_messages(messages, 4);
        assert!(removed > 0);
        // Should have: 1 summary + 4 recent messages
        assert_eq!(result.len(), 5);
        // First message should be a system summary
        assert!(matches!(&result[0], Message::System { .. }));
    }

    #[test]
    fn test_compact_preserves_leading_system() {
        let messages = vec![
            Message::system("You are helpful"),
            Message::user("Q1"),
            Message::assistant("A1"),
            Message::user("Q2"),
            Message::assistant("A2"),
            Message::user("Q3"),
            Message::assistant("A3"),
        ];
        let (result, removed) = compact_messages(messages, 4);
        assert!(removed > 0);
        // Should start with compaction summary, then recent messages
        assert!(matches!(&result[0], Message::System { .. }));
    }

    #[test]
    fn test_maybe_compact_under_threshold() {
        let messages = vec![
            Message::user("Short message"),
            Message::assistant("Short reply"),
        ];
        let result = maybe_compact(messages, 100_000);
        assert_eq!(result.len(), 2);
    }

    #[test]
    fn test_extract_topic_sentence() {
        let topic = extract_topic("什么是Rust的所有权系统?").unwrap();
        assert!(topic.contains("所有权"));
    }

    #[test]
    fn test_extract_topic_short() {
        let topic = extract_topic("Hello").unwrap();
        assert_eq!(topic, "Hello");
    }

    #[test]
    fn test_extract_topic_long() {
        let long = "This is a very long message that exceeds fifty characters in total length";
        let topic = extract_topic(long).unwrap();
        assert!(topic.ends_with("..."));
    }

    #[test]
    fn test_generate_summary() {
        let messages = vec![
            Message::user("What is Rust?"),
            Message::assistant("Rust is a systems programming language"),
            Message::user("How does ownership work?"),
            Message::assistant("Ownership is Rust's memory management system"),
        ];
        let summary = generate_summary(&messages);
        assert!(summary.contains("摘要"));
        assert!(summary.contains("2"));
    }
}