Files
zclaw_openfang/crates/zclaw-runtime/src/compaction.rs
iven 52bdafa633 refactor(crates): kernel/generation module split + DeerFlow optimizations + middleware + dead code cleanup
- Split zclaw-kernel/kernel.rs (1486 lines) into 9 domain modules
- Split zclaw-kernel/generation.rs (1080 lines) into 3 modules
- Add DeerFlow-inspired middleware: DanglingTool, SubagentLimit, ToolError, ToolOutputGuard
- Add PromptBuilder for structured system prompt assembly
- Add FactStore (zclaw-memory) for persistent fact extraction
- Add task builtin tool for agent task management
- Driver improvements: Anthropic/OpenAI extended thinking, Gemini safety settings
- Replace let _ = with proper log::warn! across SaaS handlers
- Remove unused dependency (url) from zclaw-hands
2026-04-03 00:28:03 +08:00

704 lines
23 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

//! Context compaction for the agent loop.
//!
//! Provides rule-based token estimation and message compaction to prevent
//! conversations from exceeding LLM context windows. When the estimated
//! token count exceeds the configured threshold, older messages are
//! summarized into a single system message and only recent messages are
//! retained.
//!
//! Supports two compaction modes:
//! - **Rule-based**: Heuristic topic extraction (default, no LLM needed)
//! - **LLM-based**: Uses an LLM driver to generate higher-quality summaries
//!
//! Optionally flushes old messages to the growth/memory system before discarding.
use std::sync::Arc;
use std::sync::atomic::{AtomicU64, Ordering};
use zclaw_types::{AgentId, Message, SessionId};
use crate::driver::{CompletionRequest, ContentBlock, LlmDriver};
use crate::growth::GrowthIntegration;
/// Number of recent messages to preserve after compaction.
///
/// Used by `maybe_compact` and `maybe_compact_with_config`; direct callers
/// of `compact_messages` supply their own `keep_recent` value.
const DEFAULT_KEEP_RECENT: usize = 6;
/// Heuristic token count estimation.
///
/// CJK characters ≈ 1.5 tokens each, English words ≈ 1.3 tokens each.
/// Intentionally conservative (overestimates) to avoid hitting real limits.
pub fn estimate_tokens(text: &str) -> usize {
    if text.is_empty() {
        return 0;
    }
    // Per-character fractional weight; weights are summed and rounded up once
    // at the end so sub-token characters still contribute.
    fn char_weight(c: char) -> f64 {
        if matches!(c, ' ' | '\n' | '\t') {
            // Whitespace is cheap.
            return 0.25;
        }
        match c as u32 {
            // CJK ideographs (basic, ext-A, ext-B, compatibility) — ~1.5 tokens
            0x4E00..=0x9FFF | 0x3400..=0x4DBF | 0x20000..=0x2A6DF | 0xF900..=0xFAFF => 1.5,
            // Korean Hangul syllables + Jamo — ~1.5 tokens
            0xAC00..=0xD7AF | 0x1100..=0x11FF => 1.5,
            // Japanese Hiragana + Katakana — ~1.5 tokens
            0x3040..=0x309F | 0x30A0..=0x30FF => 1.5,
            // CJK / fullwidth punctuation — ~1.0 token
            0x3000..=0x303F | 0xFF00..=0xFFEF => 1.0,
            // Emoji & symbols — ~2.0 tokens
            0x1F000..=0x1FAFF | 0x2600..=0x27BF => 2.0,
            // ASCII / Latin characters — roughly 4 chars per token
            _ => 0.3,
        }
    }
    text.chars().map(char_weight).sum::<f64>().ceil() as usize
}
/// Estimate total tokens for a list of messages (including framing overhead).
pub fn estimate_messages_tokens(messages: &[Message]) -> usize {
let mut total = 0;
for msg in messages {
match msg {
Message::User { content } => {
total += estimate_tokens(content);
total += 4;
}
Message::Assistant { content, thinking } => {
total += estimate_tokens(content);
if let Some(th) = thinking {
total += estimate_tokens(th);
}
total += 4;
}
Message::System { content } => {
total += estimate_tokens(content);
total += 4;
}
Message::ToolUse { input, .. } => {
total += estimate_tokens(&input.to_string());
total += 4;
}
Message::ToolResult { output, .. } => {
total += estimate_tokens(&output.to_string());
total += 4;
}
}
}
total
}
// ============================================================
// Calibration: adjust heuristic estimates using API feedback
// ============================================================
const F64_1_0_BITS: u64 = 4607182418800017408u64; // 1.0f64.to_bits()
/// Global calibration factor for token estimation (stored as f64 bits).
///
/// Stored as raw bits because the standard library has no `AtomicF64`;
/// readers and writers round-trip through `f64::from_bits`/`to_bits`.
/// Updated via exponential moving average when API returns actual token counts.
/// Initial value is 1.0 (no adjustment).
static CALIBRATION_FACTOR_BITS: AtomicU64 = AtomicU64::new(F64_1_0_BITS);
/// Get the current calibration factor.
///
/// Loads the global atomic and reinterprets the stored bits as an `f64`.
pub fn get_calibration_factor() -> f64 {
    let bits = CALIBRATION_FACTOR_BITS.load(Ordering::Relaxed);
    f64::from_bits(bits)
}
/// Update calibration factor using exponential moving average.
///
/// Compares estimated tokens with actual tokens from API response:
/// - `ratio = actual / estimated` so underestimates push factor UP
/// - EMA: `new = current * 0.7 + ratio * 0.3`
/// - Clamped to [0.5, 2.0] to prevent runaway values
pub fn update_calibration(estimated: usize, actual: u32) {
    // Skip degenerate samples: a zero on either side would poison the ratio.
    if estimated == 0 || actual == 0 {
        return;
    }
    let ratio = actual as f64 / estimated as f64;
    let current = get_calibration_factor();
    let blended = current * 0.7 + ratio * 0.3;
    let new_factor = blended.clamp(0.5, 2.0);
    CALIBRATION_FACTOR_BITS.store(new_factor.to_bits(), Ordering::Relaxed);
    tracing::debug!(
        "[Compaction] Calibration: estimated={}, actual={}, ratio={:.2}, factor {:.2} → {:.2}",
        estimated, actual, ratio, current, new_factor
    );
}
/// Estimate total tokens for messages with calibration applied.
fn estimate_messages_tokens_calibrated(messages: &[Message]) -> usize {
    let raw = estimate_messages_tokens(messages);
    let factor = get_calibration_factor();
    // Skip the float round-trip entirely while the factor is still exactly 1.0.
    if (factor - 1.0).abs() < f64::EPSILON {
        return raw;
    }
    (raw as f64 * factor).ceil() as usize
}
/// Compact a message list by summarizing old messages and keeping recent ones.
///
/// When `messages.len() > keep_recent`, everything before the retained tail
/// (including any leading system messages, which may hold summaries from
/// prior compaction runs) is collapsed into a single rule-based summary that
/// becomes the new first system message.
///
/// Returns the compacted message list and the number of original messages removed.
pub fn compact_messages(messages: Vec<Message>, keep_recent: usize) -> (Vec<Message>, usize) {
    if messages.len() <= keep_recent {
        return (messages, 0);
    }
    // Count system messages at the head so they never end up inside the
    // retained tail — they belong on the summarized side of the split.
    let leading_system_count = messages
        .iter()
        .take_while(|m| matches!(m, Message::System { .. }))
        .count();
    // Size of the tail we keep verbatim: at most `keep_recent`, and never
    // eating into the leading system prefix.
    let tail_len = keep_recent.min(messages.len().saturating_sub(leading_system_count));
    // Everything before `split_index` is summarized; clamp so the leading
    // system messages always fall on the summarized side.
    let split_index = messages
        .len()
        .saturating_sub(tail_len)
        .max(leading_system_count);
    if split_index == 0 {
        return (messages, 0);
    }
    let (old_messages, recent_messages) = messages.split_at(split_index);
    let summary = generate_summary(old_messages);
    let removed_count = old_messages.len();
    let mut compacted = Vec::with_capacity(recent_messages.len() + 1);
    compacted.push(Message::system(summary));
    compacted.extend_from_slice(recent_messages);
    (compacted, removed_count)
}
/// Check if compaction should be triggered and perform it if needed.
///
/// Returns the (possibly compacted) message list.
pub fn maybe_compact(messages: Vec<Message>, threshold: usize) -> Vec<Message> {
    let estimated = estimate_messages_tokens_calibrated(&messages);
    if estimated < threshold {
        // Under budget — hand the list back untouched.
        return messages;
    }
    tracing::info!(
        "[Compaction] Triggered: {} tokens > {} threshold, {} messages",
        estimated,
        threshold,
        messages.len(),
    );
    let (compacted, removed) = compact_messages(messages, DEFAULT_KEEP_RECENT);
    tracing::info!(
        "[Compaction] Removed {} messages, {} remain",
        removed,
        compacted.len(),
    );
    compacted
}
/// Configuration for compaction behavior.
#[derive(Debug, Clone)]
pub struct CompactionConfig {
    /// Use LLM for generating summaries instead of rule-based extraction.
    /// Requires a driver passed to `maybe_compact_with_config`; without one
    /// the rule-based path is used and a warning is logged.
    pub use_llm: bool,
    /// Fall back to rule-based summary if LLM fails.
    /// When `false`, a failed LLM call leaves the messages uncompacted.
    pub llm_fallback_to_rules: bool,
    /// Flush memories from old messages before discarding them.
    /// Only effective when a growth integration is supplied.
    pub memory_flush_enabled: bool,
    /// Maximum tokens for LLM-generated summary.
    pub summary_max_tokens: u32,
}
impl Default for CompactionConfig {
    /// Conservative defaults: rule-based summaries only, no memory flush,
    /// rules fallback enabled in case LLM mode is switched on later.
    fn default() -> Self {
        Self {
            use_llm: false,
            llm_fallback_to_rules: true,
            memory_flush_enabled: false,
            summary_max_tokens: 500,
        }
    }
}
/// Outcome of an async compaction operation.
#[derive(Debug, Clone)]
pub struct CompactionOutcome {
    /// The (possibly compacted) message list.
    pub messages: Vec<Message>,
    /// Number of messages removed during compaction (0 when no compaction ran).
    pub removed_count: usize,
    /// Number of memories flushed to the growth system.
    pub flushed_memories: usize,
    /// Whether LLM was used for summary generation.
    pub used_llm: bool,
}
/// Async compaction with optional LLM summary and memory flushing.
///
/// When `messages` exceed `threshold` tokens:
/// 1. If `memory_flush_enabled`, extract memories from old messages via growth system
/// 2. Generate summary (LLM or rule-based depending on config)
/// 3. Replace old messages with summary + keep recent messages
///
/// The returned `used_llm` flag is `true` only when an LLM-generated summary
/// was actually used — not when the LLM call failed and the rule-based
/// fallback produced the summary.
pub async fn maybe_compact_with_config(
    messages: Vec<Message>,
    threshold: usize,
    config: &CompactionConfig,
    agent_id: &AgentId,
    session_id: &SessionId,
    driver: Option<&Arc<dyn LlmDriver>>,
    growth: Option<&GrowthIntegration>,
) -> CompactionOutcome {
    let tokens = estimate_messages_tokens_calibrated(&messages);
    if tokens < threshold {
        // Under budget — nothing to do.
        return CompactionOutcome {
            messages,
            removed_count: 0,
            flushed_memories: 0,
            used_llm: false,
        };
    }
    tracing::info!(
        "[Compaction] Triggered: {} tokens > {} threshold, {} messages",
        tokens,
        threshold,
        messages.len(),
    );
    // Step 1: Flush memories before compaction discards context.
    // NOTE(review): the *entire* message list is handed to the growth system
    // here, not just the prefix about to be discarded — confirm intentional.
    let flushed_memories = if config.memory_flush_enabled {
        if let Some(growth) = growth {
            match growth
                .process_conversation(agent_id, &messages, session_id.clone())
                .await
            {
                Ok(count) => {
                    tracing::info!(
                        "[Compaction] Flushed {} memories before compaction",
                        count
                    );
                    count
                }
                Err(e) => {
                    // Memory flush is best-effort: log and keep compacting.
                    tracing::warn!("[Compaction] Memory flush failed: {}", e);
                    0
                }
            }
        } else {
            tracing::debug!("[Compaction] Memory flush requested but no growth integration available");
            0
        }
    } else {
        0
    };
    // Step 2: Determine split point (same logic as compact_messages)
    let leading_system_count = messages
        .iter()
        .take_while(|m| matches!(m, Message::System { .. }))
        .count();
    let keep_from_end = DEFAULT_KEEP_RECENT
        .min(messages.len().saturating_sub(leading_system_count));
    let split_index = messages.len().saturating_sub(keep_from_end);
    let split_index = split_index.max(leading_system_count);
    if split_index == 0 {
        return CompactionOutcome {
            messages,
            removed_count: 0,
            flushed_memories,
            used_llm: false,
        };
    }
    let old_messages = &messages[..split_index];
    let recent_messages = &messages[split_index..];
    let removed_count = old_messages.len();
    // Step 3: Generate summary (LLM or rule-based).
    // FIX: `used_llm` was previously computed as `config.use_llm &&
    // driver.is_some()`, which reported `true` even when the LLM call failed
    // and the rules fallback produced the summary. Track actual success.
    let mut used_llm = false;
    let summary = if config.use_llm {
        if let Some(driver) = driver {
            match generate_llm_summary(driver, old_messages, config.summary_max_tokens).await {
                Ok(llm_summary) => {
                    tracing::info!(
                        "[Compaction] Generated LLM summary ({} chars)",
                        llm_summary.len()
                    );
                    used_llm = true;
                    llm_summary
                }
                Err(e) => {
                    if config.llm_fallback_to_rules {
                        tracing::warn!(
                            "[Compaction] LLM summary failed: {}, falling back to rules",
                            e
                        );
                        generate_summary(old_messages)
                    } else {
                        // No fallback allowed: leave the conversation as-is.
                        tracing::warn!(
                            "[Compaction] LLM summary failed: {}, returning original messages",
                            e
                        );
                        return CompactionOutcome {
                            messages,
                            removed_count: 0,
                            flushed_memories,
                            used_llm: false,
                        };
                    }
                }
            }
        } else {
            tracing::warn!(
                "[Compaction] LLM compaction requested but no driver available, using rules"
            );
            generate_summary(old_messages)
        }
    } else {
        generate_summary(old_messages)
    };
    // Step 4: Build compacted message list
    let mut compacted = Vec::with_capacity(1 + recent_messages.len());
    compacted.push(Message::system(summary));
    compacted.extend(recent_messages.iter().cloned());
    tracing::info!(
        "[Compaction] Removed {} messages, {} remain (llm={})",
        removed_count,
        compacted.len(),
        used_llm,
    );
    CompactionOutcome {
        messages: compacted,
        removed_count,
        flushed_memories,
        used_llm,
    }
}
/// Generate a summary using an LLM driver.
///
/// Flattens the conversation into a plain-text transcript, bounds the
/// transcript size, and asks the driver for a short Chinese summary.
///
/// # Errors
/// Returns an error string when the driver call fails or the driver returns
/// no text content.
async fn generate_llm_summary(
    driver: &Arc<dyn LlmDriver>,
    messages: &[Message],
    max_tokens: u32,
) -> Result<String, String> {
    let mut conversation_text = String::new();
    for msg in messages {
        match msg {
            Message::User { content } => {
                conversation_text.push_str(&format!("用户: {}\n", content))
            }
            Message::Assistant { content, .. } => {
                conversation_text.push_str(&format!("助手: {}\n", content))
            }
            Message::System { content } => {
                // Skip summaries injected by earlier compaction passes.
                if !content.starts_with("[以下是之前对话的摘要]") {
                    conversation_text.push_str(&format!("[系统]: {}\n", content))
                }
            }
            Message::ToolUse { tool, input, .. } => {
                conversation_text.push_str(&format!(
                    "[工具调用 {}]: {}\n",
                    tool.as_str(),
                    input
                ))
            }
            Message::ToolResult { output, .. } => {
                conversation_text.push_str(&format!("[工具结果]: {}\n", output))
            }
        }
    }
    // Truncate conversation text if too long for the prompt itself.
    // FIX: `String::truncate` panics when the cut index lands inside a
    // multi-byte character — very likely here, since the transcript is full
    // of CJK text — so walk back to the nearest char boundary first.
    let max_conversation_bytes = 8000;
    if conversation_text.len() > max_conversation_bytes {
        let mut cut = max_conversation_bytes;
        while !conversation_text.is_char_boundary(cut) {
            cut -= 1;
        }
        conversation_text.truncate(cut);
        conversation_text.push_str("\n...(对话已截断)");
    }
    let prompt = format!(
        "请用简洁的中文总结以下对话的关键信息。保留重要的讨论主题、决策、结论和待办事项。\
        输出格式为段落式摘要不超过200字。\n\n{}",
        conversation_text
    );
    let request = CompletionRequest {
        model: String::new(),
        system: Some(
            "你是一个对话摘要助手。只输出摘要内容,不要添加额外解释。".to_string(),
        ),
        messages: vec![Message::user(&prompt)],
        tools: Vec::new(),
        max_tokens: Some(max_tokens),
        temperature: Some(0.3), // low temperature for stable, factual summaries
        stop: Vec::new(),
        stream: false,
        thinking_enabled: false,
        reasoning_effort: None,
        plan_mode: false,
    };
    let response = driver
        .complete(request)
        .await
        .map_err(|e| format!("{}", e))?;
    // Extract text from content blocks, ignoring non-text blocks.
    let text_parts: Vec<String> = response
        .content
        .iter()
        .filter_map(|block| match block {
            ContentBlock::Text { text } => Some(text.clone()),
            _ => None,
        })
        .collect();
    let summary = text_parts.join("");
    if summary.is_empty() {
        return Err("LLM returned empty response".to_string());
    }
    Ok(summary)
}
/// Generate a rule-based summary of old messages.
fn generate_summary(messages: &[Message]) -> String {
    if messages.is_empty() {
        return "[对话开始]".to_string();
    }
    let mut sections: Vec<String> = vec!["[以下是之前对话的摘要]".to_string()];
    let mut user_count = 0;
    let mut assistant_count = 0;
    let mut topics: Vec<String> = Vec::new();
    for msg in messages {
        match msg {
            Message::User { content } => {
                user_count += 1;
                if let Some(topic) = extract_topic(content) {
                    topics.push(topic);
                }
            }
            Message::Assistant { .. } => assistant_count += 1,
            Message::System { content } => {
                // Summaries from earlier compaction passes are dropped,
                // not re-summarized.
                if !content.starts_with("[以下是之前对话的摘要]") {
                    sections.push(format!("系统提示: {}", truncate(content, 60)));
                }
            }
            Message::ToolUse { tool, .. } => {
                sections.push(format!("工具调用: {}", tool.as_str()));
            }
            // Tool results are noisy; omit them from the summary.
            Message::ToolResult { .. } => {}
        }
    }
    if !topics.is_empty() {
        // Cap at the first 8 topics to keep the summary compact.
        let shown: Vec<String> = topics.into_iter().take(8).collect();
        sections.push(format!("讨论主题: {}", shown.join("; ")));
    }
    sections.push(format!(
        "(已压缩 {} 条消息,其中用户 {} 条,助手 {} 条)",
        messages.len(),
        user_count,
        assistant_count,
    ));
    let summary = sections.join("\n");
    // Enforce max length, counting chars (not bytes) so CJK text is safe.
    let max_chars = 800;
    if summary.chars().count() <= max_chars {
        summary
    } else {
        let head: String = summary.chars().take(max_chars).collect();
        format!("{}...\n(摘要已截断)", head)
    }
}
/// Extract the main topic from a user message (first sentence or first 50 chars).
///
/// Returns `None` for whitespace-only input.
fn extract_topic(content: &str) -> Option<String> {
    let trimmed = content.trim();
    if trimmed.is_empty() {
        return None;
    }
    // FIX: the sentence-end char literals were corrupted (empty literals) in
    // the original; restored as the fullwidth CJK question/exclamation marks
    // (U+FF1F / U+FF01), the standard markers alongside '。' — verify against
    // version history.
    for (i, ch) in trimmed.char_indices() {
        if ch == '。' || ch == '?' || ch == '!' || ch == '\n' {
            let end = i + ch.len_utf8();
            // Only use the first sentence if it ends within the first 80 bytes.
            if end <= 80 {
                return Some(trimmed[..end].trim().to_string());
            }
            break;
        }
    }
    // No usable sentence break: short messages pass through whole…
    if trimmed.chars().count() <= 50 {
        return Some(trimmed.to_string());
    }
    // …long ones are clipped to 50 chars with an ellipsis.
    Some(format!("{}...", trimmed.chars().take(50).collect::<String>()))
}
/// Truncate text to max_chars at char boundary.
fn truncate(text: &str, max_chars: usize) -> String {
    let mut chars = text.chars();
    let head: String = chars.by_ref().take(max_chars).collect();
    if chars.next().is_none() {
        // Nothing was cut off — return the input unchanged.
        text.to_string()
    } else {
        format!("{}...", head)
    }
}
#[cfg(test)]
mod tests {
    //! Unit tests for token estimation, compaction, and summary helpers.
    use super::*;
    #[test]
    fn test_estimate_tokens_empty() {
        assert_eq!(estimate_tokens(""), 0);
    }
    #[test]
    fn test_estimate_tokens_english() {
        // Only a sanity lower bound — exact weights are an implementation detail.
        let tokens = estimate_tokens("Hello world");
        assert!(tokens > 0);
    }
    #[test]
    fn test_estimate_tokens_cjk() {
        let tokens = estimate_tokens("你好世界");
        assert!(tokens > 3); // CJK chars are ~1.5 tokens each
    }
    #[test]
    fn test_estimate_messages_tokens() {
        // Includes the per-message framing overhead, so always > 0.
        let messages = vec![
            Message::user("Hello"),
            Message::assistant("Hi there"),
        ];
        let tokens = estimate_messages_tokens(&messages);
        assert!(tokens > 0);
    }
    #[test]
    fn test_compact_messages_under_threshold() {
        // len <= keep_recent: the list is returned untouched.
        let messages = vec![
            Message::user("Hello"),
            Message::assistant("Hi"),
        ];
        let (result, removed) = compact_messages(messages, 6);
        assert_eq!(removed, 0);
        assert_eq!(result.len(), 2);
    }
    #[test]
    fn test_compact_messages_over_threshold() {
        // 20 messages, keep 4 -> 16 summarized into one system message.
        let messages: Vec<Message> = (0..10)
            .flat_map(|i| {
                vec![
                    Message::user(format!("Question {}", i)),
                    Message::assistant(format!("Answer {}", i)),
                ]
            })
            .collect();
        let (result, removed) = compact_messages(messages, 4);
        assert!(removed > 0);
        // Should have: 1 summary + 4 recent messages
        assert_eq!(result.len(), 5);
        // First message should be a system summary
        assert!(matches!(&result[0], Message::System { .. }));
    }
    #[test]
    fn test_compact_preserves_leading_system() {
        // The leading system message lands on the summarized side of the
        // split; the result still starts with a (new) system summary.
        let messages = vec![
            Message::system("You are helpful"),
            Message::user("Q1"),
            Message::assistant("A1"),
            Message::user("Q2"),
            Message::assistant("A2"),
            Message::user("Q3"),
            Message::assistant("A3"),
        ];
        let (result, removed) = compact_messages(messages, 4);
        assert!(removed > 0);
        // Should start with compaction summary, then recent messages
        assert!(matches!(&result[0], Message::System { .. }));
    }
    #[test]
    fn test_maybe_compact_under_threshold() {
        // Tiny conversation, huge threshold: no compaction happens.
        let messages = vec![
            Message::user("Short message"),
            Message::assistant("Short reply"),
        ];
        let result = maybe_compact(messages, 100_000);
        assert_eq!(result.len(), 2);
    }
    #[test]
    fn test_extract_topic_sentence() {
        let topic = extract_topic("什么是Rust的所有权系统").unwrap();
        assert!(topic.contains("所有权"));
    }
    #[test]
    fn test_extract_topic_short() {
        // Messages <= 50 chars pass through unmodified.
        let topic = extract_topic("Hello").unwrap();
        assert_eq!(topic, "Hello");
    }
    #[test]
    fn test_extract_topic_long() {
        // Messages > 50 chars without sentence breaks get clipped + ellipsis.
        let long = "This is a very long message that exceeds fifty characters in total length";
        let topic = extract_topic(long).unwrap();
        assert!(topic.ends_with("..."));
    }
    #[test]
    fn test_generate_summary() {
        let messages = vec![
            Message::user("What is Rust?"),
            Message::assistant("Rust is a systems programming language"),
            Message::user("How does ownership work?"),
            Message::assistant("Ownership is Rust's memory management system"),
        ];
        let summary = generate_summary(&messages);
        assert!(summary.contains("摘要"));
        // "2" appears in the user/assistant counts line.
        assert!(summary.contains("2"));
    }
}