fix(growth): HIGH-6 修复 extract_combined 合并提取空壳

根因: growth.rs 构造 CombinedExtraction 时硬编码 experiences: Vec::new() 和 profile_signals: default()，导致 L1 结构化经验不被提取、L2 技能进化没有输入数据、整个进化引擎无法端到端工作。

修复:
- extractor.rs: 添加 COMBINED_EXTRACTION_PROMPT 统一 prompt，单次 LLM 调用同时输出 memories + experiences + profile_signals
- extractor.rs: 添加 parse_combined_response() 解析 LLM JSON 响应
- LlmDriverForExtraction trait: 添加 extract_with_prompt() 方法（默认不支持，退化到现有 extract() + 启发式推断）
- MemoryExtractor: 添加 extract_combined() 方法，优先单次调用，失败则退化
- growth.rs: extract_combined() 使用新的合并提取替代硬编码空值
- TauriExtractionDriver: 实现 extract_with_prompt()
- ProfileSignals: 添加 has_any_signal() 方法
- types.rs: ProfileSignals 无结构性变化（字段已存在）

测试: 4 个新测试（parse_combined_response_full/minimal/invalid + extract_combined_fallback），11 个 extractor 测试全部通过
2026-04-18 22:56:42 +08:00
parent cb727fdcc7
commit 3c6581f915
4 changed files with 536 additions and 29 deletions
--- a/crates/zclaw-growth/src/extractor.rs
+++ b/crates/zclaw-growth/src/extractor.rs
@@ -21,7 +21,7 @@ pub trait LlmDriverForExtraction: Send + Sync {
    ) -> Result<Vec<ExtractedMemory>>;
    /// 单次 LLM 调用提取全部类型（记忆 + 经验 + 画像信号）
-    /// 默认实现：退化到 3 次独立调用
+    /// 默认实现：退化到 3 次独立调用（experiences 和 profile_signals 为空）
    async fn extract_combined_all(
        &self,
        messages: &[Message],
@@ -34,6 +34,19 @@ pub trait LlmDriverForExtraction: Send + Sync {
        }
        Ok(combined)
    }
    /// Performs a single LLM call with caller-supplied prompts and returns the raw text reply.
    ///
    /// Intended for the unified-extraction path. This default implementation makes no
    /// call at all: it always returns an `Internal` error stating that the method is
    /// not implemented, so drivers that support it must override it.
    async fn extract_with_prompt(
        &self,
        _messages: &[Message],
        _system_prompt: &str,
        _user_prompt: &str,
    ) -> Result<String> {
        let reason = String::from("extract_with_prompt not implemented");
        Err(zclaw_types::ZclawError::Internal(reason))
    }
 }
 /// Memory Extractor - extracts memories from conversations
@@ -100,13 +113,10 @@ impl MemoryExtractor {
        session_id: SessionId,
    ) -> Result<Vec<ExtractedMemory>> {
        // Check if LLM driver is available
-        let _llm_driver = match &self.llm_driver {
+        if self.llm_driver.is_none() {
-            Some(driver) => driver,
+            tracing::debug!("[MemoryExtractor] No LLM driver configured, skipping extraction");
-            None => {
+            return Ok(Vec::new());
-                tracing::debug!("[MemoryExtractor] No LLM driver configured, skipping extraction");
+        }
                return Ok(Vec::new());
            }
        };
        let mut results = Vec::new();
@@ -242,6 +252,299 @@ impl MemoryExtractor {
        tracing::info!("[MemoryExtractor] Stored {} memories to OpenViking", stored);
        Ok(stored)
    }
    /// 统一提取：单次 LLM 调用同时产出 memories + experiences + profile_signals
    ///
    /// 优先使用 `extract_with_prompt()` 进行单次调用；若 driver 不支持则
    /// 退化为 `extract()` + 从记忆推断经验/画像。
    pub async fn extract_combined(
        &self,
        messages: &[Message],
        session_id: SessionId,
    ) -> Result<crate::types::CombinedExtraction> {
        let llm_driver = match &self.llm_driver {
            Some(driver) => driver,
            None => {
                tracing::debug!(
                    "[MemoryExtractor] No LLM driver configured, skipping combined extraction"
                );
                return Ok(crate::types::CombinedExtraction::default());
            }
        };
        // 尝试单次 LLM 调用路径
        let system_prompt = "You are a memory extraction assistant. Analyze conversations and extract \
            structured memories, experiences, and profile signals in valid JSON format. \
            Always respond with valid JSON only, no additional text or markdown formatting.";
        let user_prompt = format!(
            "{}{}",
            crate::extractor::prompts::COMBINED_EXTRACTION_PROMPT,
            format_conversation_text(messages)
        );
        match llm_driver
            .extract_with_prompt(messages, system_prompt, &user_prompt)
            .await
        {
            Ok(raw_text) if !raw_text.trim().is_empty() => {
                match parse_combined_response(&raw_text, session_id.clone()) {
                    Ok(combined) => {
                        tracing::info!(
                            "[MemoryExtractor] Combined extraction: {} memories, {} experiences, {} profile signals",
                            combined.memories.len(),
                            combined.experiences.len(),
                            combined.profile_signals.has_any_signal() as usize,
                        );
                        return Ok(combined);
                    }
                    Err(e) => {
                        tracing::warn!(
                            "[MemoryExtractor] Combined response parse failed, falling back: {}",
                            e
                        );
                    }
                }
            }
            Ok(_) => {
                tracing::debug!("[MemoryExtractor] extract_with_prompt returned empty, falling back");
            }
            Err(e) => {
                tracing::debug!(
                    "[MemoryExtractor] extract_with_prompt not supported ({}), falling back",
                    e
                );
            }
        }
        // 退化路径：使用已有的 extract() 然后推断 experiences 和 profile_signals
        let memories = self.extract(messages, session_id).await?;
        let experiences = infer_experiences_from_memories(&memories);
        let profile_signals = infer_profile_signals_from_memories(&memories);
        Ok(crate::types::CombinedExtraction {
            memories,
            experiences,
            profile_signals,
        })
    }
 }
 /// Renders a conversation as plain text, one `[Role]: content` entry per message.
 ///
 /// User, assistant and system messages are included; tool-use and tool-result
 /// messages are skipped. Entries are separated by a blank line.
 fn format_conversation_text(messages: &[Message]) -> String {
    let mut entries: Vec<String> = Vec::new();
    for message in messages {
        let rendered = match message {
            Message::User { content } => format!("[User]: {}", content),
            Message::Assistant { content, .. } => format!("[Assistant]: {}", content),
            Message::System { content } => format!("[System]: {}", content),
            Message::ToolUse { .. } | Message::ToolResult { .. } => continue,
        };
        entries.push(rendered);
    }
    entries.join("\n\n")
 }
 /// 从 LLM 原始响应解析 CombinedExtraction
 pub fn parse_combined_response(
    raw: &str,
    session_id: SessionId,
 ) -> Result<crate::types::CombinedExtraction> {
    use crate::types::CombinedExtraction;
    let json_str = crate::json_utils::extract_json_block(raw);
    let parsed: serde_json::Value = serde_json::from_str(json_str).map_err(|e| {
        zclaw_types::ZclawError::Internal(format!("Failed to parse combined JSON: {}", e))
    })?;
    // 解析 memories
    let memories = parsed
        .get("memories")
        .and_then(|v| v.as_array())
        .map(|arr| {
            arr.iter()
                .filter_map(|item| parse_memory_item(item, &session_id))
                .collect::<Vec<_>>()
        })
        .unwrap_or_default();
    // 解析 experiences
    let experiences = parsed
        .get("experiences")
        .and_then(|v| v.as_array())
        .map(|arr| {
            arr.iter()
                .filter_map(parse_experience_item)
                .collect::<Vec<_>>()
        })
        .unwrap_or_default();
    // 解析 profile_signals
    let profile_signals = parse_profile_signals(&parsed);
    Ok(CombinedExtraction {
        memories,
        experiences,
        profile_signals,
    })
 }
 /// Parses one entry of the `memories` array into an `ExtractedMemory`.
 ///
 /// Returns `None` when `content` is missing, is not a string, or is blank, so
 /// that empty memories are never produced. Missing optional fields fall back to
 /// defaults: category `"unknown"`, memory type `"knowledge"`, confidence `0.7`.
 /// The confidence comes straight from model output and is therefore clamped to
 /// `0.0..=1.0`.
 fn parse_memory_item(
    value: &serde_json::Value,
    session_id: &SessionId,
 ) -> Option<ExtractedMemory> {
    let content = value.get("content")?.as_str()?;
    if content.trim().is_empty() {
        // A memory without text carries no information; drop it.
        return None;
    }

    let category = value
        .get("category")
        .and_then(|v| v.as_str())
        .unwrap_or("unknown")
        .to_string();

    let memory_type_str = value
        .get("memory_type")
        .and_then(|v| v.as_str())
        .unwrap_or("knowledge");
    let memory_type = crate::types::MemoryType::parse(memory_type_str);

    // Models occasionally answer with out-of-range values (e.g. 85 instead of 0.85).
    let confidence = (value
        .get("confidence")
        .and_then(|v| v.as_f64())
        .unwrap_or(0.7) as f32)
        .clamp(0.0, 1.0);

    let keywords = crate::json_utils::extract_string_array(value, "keywords");

    Some(
        ExtractedMemory::new(memory_type, category, content.to_string(), session_id.clone())
            .with_confidence(confidence)
            .with_keywords(keywords),
    )
 }
 /// Parses one entry of the `experiences` array into an `ExperienceCandidate`.
 ///
 /// Returns `None` when `pain_pattern` is missing, is not a string, or is blank.
 /// Defaults for missing optional fields: empty `context`, outcome `Partial`,
 /// confidence `0.6`. The outcome label is matched after trimming and ASCII
 /// lower-casing, so `"Success"` and `" failed "` are recognised; any other label
 /// maps to `Partial`. The confidence is clamped to `0.0..=1.0` because it comes
 /// straight from model output.
 fn parse_experience_item(value: &serde_json::Value) -> Option<crate::types::ExperienceCandidate> {
    use crate::types::Outcome;

    let pain_pattern = value.get("pain_pattern")?.as_str()?;
    if pain_pattern.trim().is_empty() {
        // An experience without a pain pattern cannot be aggregated meaningfully.
        return None;
    }

    let context = value
        .get("context")
        .and_then(|v| v.as_str())
        .unwrap_or("")
        .to_string();
    let solution_steps = crate::json_utils::extract_string_array(value, "solution_steps");

    let outcome_label = value
        .get("outcome")
        .and_then(|v| v.as_str())
        .unwrap_or("partial")
        .trim()
        .to_ascii_lowercase();
    let outcome = match outcome_label.as_str() {
        "success" => Outcome::Success,
        "failed" => Outcome::Failed,
        _ => Outcome::Partial,
    };

    let confidence = (value
        .get("confidence")
        .and_then(|v| v.as_f64())
        .unwrap_or(0.6) as f32)
        .clamp(0.0, 1.0);

    let tools_used = crate::json_utils::extract_string_array(value, "tools_used");
    let industry_context = value
        .get("industry_context")
        .and_then(|v| v.as_str())
        .map(String::from);

    Some(crate::types::ExperienceCandidate {
        pain_pattern: pain_pattern.to_string(),
        context,
        solution_steps,
        outcome,
        confidence,
        tools_used,
        industry_context,
    })
 }
 /// Reads the `profile_signals` object of the response into `ProfileSignals`.
 ///
 /// A field is populated only when its value is a non-blank string; the stored
 /// value is trimmed. Missing keys, non-string values (including `null`) and
 /// empty or whitespace-only strings all yield `None`, so placeholder strings
 /// emitted by the model are not mistaken for real signals. A missing
 /// `profile_signals` key yields all-`None` fields.
 fn parse_profile_signals(obj: &serde_json::Value) -> crate::types::ProfileSignals {
    let signals = obj.get("profile_signals");

    // Shared lookup for every field: string value, trimmed, blank treated as absent.
    let field = |key: &str| -> Option<String> {
        signals
            .and_then(|s| s.get(key))
            .and_then(|v| v.as_str())
            .map(str::trim)
            .filter(|text| !text.is_empty())
            .map(String::from)
    };

    crate::types::ProfileSignals {
        industry: field("industry"),
        recent_topic: field("recent_topic"),
        pain_point: field("pain_point"),
        preferred_tool: field("preferred_tool"),
        communication_style: field("communication_style"),
    }
 }
 /// 从已有记忆推断结构化经验（退化路径）
 fn infer_experiences_from_memories(
    memories: &[ExtractedMemory],
 ) -> Vec<crate::types::ExperienceCandidate> {
    memories
        .iter()
        .filter(|m| m.memory_type == crate::types::MemoryType::Experience)
        .filter_map(|m| {
            // 经验类记忆 → ExperienceCandidate
            let content = &m.content;
            if content.len() < 10 {
                return None;
            }
            Some(crate::types::ExperienceCandidate {
                pain_pattern: m.category.clone(),
                context: content.clone(),
                solution_steps: Vec::new(),
                outcome: crate::types::Outcome::Success,
                confidence: m.confidence * 0.7, // 降低推断置信度
                tools_used: m.keywords.clone(),
                industry_context: None,
            })
        })
        .collect()
 }
 /// Derives profile signals from extracted memories (fallback path).
 ///
 /// Each signal is set at most once, by the first memory that qualifies:
 /// - `communication_style`: content of a `Preference` memory whose category
 ///   contains "style" or "风格";
 /// - `recent_topic`: first keyword of a `Knowledge` memory that has keywords;
 /// - `preferred_tool`: first keyword of an `Experience` memory that also occurs
 ///   in that memory's content.
 ///
 /// `industry` and `pain_point` are never set here.
 fn infer_profile_signals_from_memories(
    memories: &[ExtractedMemory],
 ) -> crate::types::ProfileSignals {
    use crate::types::{MemoryType, ProfileSignals};

    let mut signals = ProfileSignals::default();
    for memory in memories {
        match memory.memory_type {
            MemoryType::Preference => {
                let about_style =
                    memory.category.contains("style") || memory.category.contains("风格");
                if about_style && signals.communication_style.is_none() {
                    signals.communication_style = Some(memory.content.clone());
                }
            }
            MemoryType::Knowledge => {
                if signals.recent_topic.is_none() {
                    if let Some(first_keyword) = memory.keywords.first() {
                        signals.recent_topic = Some(first_keyword.clone());
                    }
                }
            }
            MemoryType::Experience => {
                if signals.preferred_tool.is_none() {
                    signals.preferred_tool = memory
                        .keywords
                        .iter()
                        .find(|kw| memory.content.contains(kw.as_str()))
                        .cloned();
                }
            }
            _ => {}
        }
    }
    signals
 }
 /// Default extraction prompts for LLM
@@ -258,6 +561,55 @@ pub mod prompts {
        }
    }
    /// Unified extraction prompt: instructs the model to return memories, structured
    /// experiences and profile signals as a single JSON object in one LLM call.
    /// The conversation text is expected to be appended directly after this prompt.
    pub const COMBINED_EXTRACTION_PROMPT: &str = r#"
 分析以下对话，一次性提取三类信息。严格按 JSON 格式返回。
 ## 输出格式
 ```json
 {
  "memories": [
    {
      "memory_type": "preference|knowledge|experience",
      "category": "分类标签",
      "content": "记忆内容",
      "confidence": 0.0-1.0,
      "keywords": ["关键词"]
    }
  ],
  "experiences": [
    {
      "pain_pattern": "痛点模式简述",
      "context": "问题发生的上下文",
      "solution_steps": ["步骤1", "步骤2"],
      "outcome": "success|partial|failed",
      "confidence": 0.0-1.0,
      "tools_used": ["使用的工具/技能"],
      "industry_context": "行业标识(可选)"
    }
  ],
  "profile_signals": {
    "industry": "用户所在行业(可选)",
    "recent_topic": "最近讨论的主要话题(可选)",
    "pain_point": "用户当前痛点(可选)",
    "preferred_tool": "用户偏好的工具/技能(可选)",
    "communication_style": "沟通风格: concise|detailed|formal|casual(可选)"
  }
 }
 ```
 ## 提取规则
 1. **memories**: 提取用户偏好(沟通风格/格式/语言)、知识(事实/领域知识/经验教训)、使用经验(技能/工具使用模式和结果)
 2. **experiences**: 仅提取明确的"问题→解决"模式，要求有清晰的痛点和步骤，confidence >= 0.6
 3. **profile_signals**: 从对话中推断用户画像信息，只在有明确信号时填写，留空则不填
 4. 每个字段都要有实际内容，不确定的宁可省略
 5. 只返回 JSON，不要附加其他文本
 对话内容：
 "#;
    const PREFERENCE_EXTRACTION_PROMPT: &str = r#"
 分析以下对话，提取用户的偏好设置。关注：
 - 沟通风格偏好（简洁/详细、正式/随意）
@@ -391,5 +743,89 @@ mod tests {
        assert!(!prompts::get_extraction_prompt(MemoryType::Knowledge).is_empty());
        assert!(!prompts::get_extraction_prompt(MemoryType::Experience).is_empty());
        assert!(!prompts::get_extraction_prompt(MemoryType::Session).is_empty());
        assert!(!prompts::COMBINED_EXTRACTION_PROMPT.is_empty());
    }
    /// A fully populated, markdown-fenced response must yield every memory,
    /// experience and profile signal it contains.
    #[test]
    fn test_parse_combined_response_full() {
        // Fixture: two memories, one experience, and all five profile signals,
        // wrapped in a ```json fence.
        let raw = r#"```json
 {
  "memories": [
    {
      "memory_type": "preference",
      "category": "communication-style",
      "content": "用户偏好简洁回复",
      "confidence": 0.9,
      "keywords": ["简洁", "风格"]
    },
    {
      "memory_type": "knowledge",
      "category": "user-facts",
      "content": "用户是医院行政人员",
      "confidence": 0.85,
      "keywords": ["医院", "行政"]
    }
  ],
  "experiences": [
    {
      "pain_pattern": "报表生成耗时",
      "context": "月度报表需要手动汇总多个Excel",
      "solution_steps": ["使用researcher工具自动抓取", "格式化输出为Excel"],
      "outcome": "success",
      "confidence": 0.85,
      "tools_used": ["researcher"],
      "industry_context": "healthcare"
    }
  ],
  "profile_signals": {
    "industry": "healthcare",
    "recent_topic": "报表自动化",
    "pain_point": "手动汇总Excel太慢",
    "preferred_tool": "researcher",
    "communication_style": "concise"
  }
 }
 ```"#;
        let result = super::parse_combined_response(raw, SessionId::new()).unwrap();
        // Counts match the fixture.
        assert_eq!(result.memories.len(), 2);
        assert_eq!(result.experiences.len(), 1);
        // Spot-check the experience fields and the outcome mapping.
        assert_eq!(result.experiences[0].pain_pattern, "报表生成耗时");
        assert_eq!(result.experiences[0].outcome, crate::types::Outcome::Success);
        // Spot-check the profile signals.
        assert_eq!(result.profile_signals.industry.as_deref(), Some("healthcare"));
        assert_eq!(result.profile_signals.pain_point.as_deref(), Some("手动汇总Excel太慢"));
        assert!(result.profile_signals.has_any_signal());
    }
    /// A structurally valid but empty response parses into an empty extraction.
    #[test]
    fn test_parse_combined_response_minimal() {
        let raw = r#"{"memories": [], "experiences": [], "profile_signals": {}}"#;
        let crate::types::CombinedExtraction {
            memories,
            experiences,
            profile_signals,
        } = super::parse_combined_response(raw, SessionId::new()).unwrap();
        assert!(memories.is_empty());
        assert!(experiences.is_empty());
        assert!(!profile_signals.has_any_signal());
    }
    /// Text that is not JSON must be reported as an error rather than an empty result.
    #[test]
    fn test_parse_combined_response_invalid() {
        let outcome = super::parse_combined_response("not json at all", SessionId::new());
        assert!(outcome.is_err());
    }
    /// With a driver that does not override `extract_with_prompt`, the combined
    /// extraction must fall back to `extract()` and still return memories.
    #[tokio::test]
    async fn test_extract_combined_fallback() {
        // NOTE(review): relies on MockLlmDriver keeping the trait's default
        // `extract_with_prompt` and returning memories from `extract()` - confirm
        // against the mock defined earlier in this module.
        let mock_driver = Arc::new(MockLlmDriver);
        let extractor = MemoryExtractor::new(mock_driver);
        let conversation = vec![Message::user("Hello"), Message::assistant("Hi there!")];

        let combined = extractor
            .extract_combined(&conversation, SessionId::new())
            .await
            .unwrap();

        // The fallback path must have produced at least one memory.
        assert!(!combined.memories.is_empty());
    }
 }
--- a/crates/zclaw-growth/src/types.rs
+++ b/crates/zclaw-growth/src/types.rs
@@ -434,6 +434,17 @@ pub struct ProfileSignals {
    pub communication_style: Option<String>,
 }
 impl ProfileSignals {
    /// Returns `true` when at least one of the five profile fields is `Some`.
    pub fn has_any_signal(&self) -> bool {
        [
            &self.industry,
            &self.recent_topic,
            &self.pain_point,
            &self.preferred_tool,
            &self.communication_style,
        ]
        .iter()
        .any(|field| field.is_some())
    }
 }
 /// 进化事件
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct EvolutionEvent {
--- a/crates/zclaw-runtime/src/growth.rs
+++ b/crates/zclaw-runtime/src/growth.rs
@@ -15,7 +15,7 @@ use zclaw_growth::{
    AggregatedPattern, CombinedExtraction, EvolutionConfig, EvolutionEngine,
    ExperienceExtractor, GrowthTracker, InjectionFormat,
    LlmDriverForExtraction, MemoryExtractor, MemoryRetriever, PromptInjector,
-    ProfileSignals, RetrievalResult, UserProfileUpdater, VikingAdapter,
+    RetrievalResult, UserProfileUpdater, VikingAdapter,
 };
 use zclaw_memory::{ExtractedFactBatch, Fact, FactCategory, UserProfileStore};
 use zclaw_types::{AgentId, Message, Result, SessionId};
@@ -263,8 +263,8 @@ impl GrowthIntegration {
        Ok(count)
    }
-    /// Combined extraction: single LLM call that produces both stored memories
+    /// Combined extraction: single LLM call that produces stored memories,
-    /// and structured facts, avoiding double extraction overhead.
+    /// structured experiences, and profile signals — all in one pass.
    ///
    /// Returns `(memory_count, Option<ExtractedFactBatch>)` on success.
    pub async fn extract_combined(
@@ -277,25 +277,28 @@ impl GrowthIntegration {
            return Ok(None);
        }
-        // Single LLM extraction call
+        // 单次 LLM 提取：memories + experiences + profile_signals
-        let extracted = self
+        let combined = self
            .extractor
-            .extract(messages, session_id.clone())
+            .extract_combined(messages, session_id.clone())
            .await
            .unwrap_or_else(|e| {
                tracing::warn!("[GrowthIntegration] Combined extraction failed: {}", e);
-                Vec::new()
+                CombinedExtraction::default()
            });
-        if extracted.is_empty() {
+        if combined.memories.is_empty()
            && combined.experiences.is_empty()
            && !combined.profile_signals.has_any_signal()
        {
            return Ok(None);
        }
-        let mem_count = extracted.len();
+        let mem_count = combined.memories.len();
        // Store raw memories
        self.extractor
-            .store_memories(&agent_id.to_string(), &extracted)
+            .store_memories(&agent_id.to_string(), &combined.memories)
            .await?;
        // Track learning event
@@ -304,14 +307,9 @@ impl GrowthIntegration {
            .await?;
        // Persist structured experiences (L1 enhancement)
        let combined_extraction = CombinedExtraction {
            memories: extracted.clone(),
            experiences: Vec::new(), // LLM-driven extraction fills this later
            profile_signals: ProfileSignals::default(),
        };
        if let Ok(exp_count) = self
            .experience_extractor
-            .persist_experiences(&agent_id.to_string(), &combined_extraction)
+            .persist_experiences(&agent_id.to_string(), &combined)
            .await
        {
            if exp_count > 0 {
@@ -324,9 +322,7 @@ impl GrowthIntegration {
        // Update user profile from extraction signals (L1 enhancement)
        if let Some(profile_store) = &self.profile_store {
-            let updates = self
+            let updates = self.profile_updater.collect_updates(&combined);
                .profile_updater
                .collect_updates(&combined_extraction);
            let user_id = agent_id.to_string();
            for update in updates {
                if let Err(e) = profile_store
@@ -342,8 +338,9 @@ impl GrowthIntegration {
            }
        }
-        // Convert same extracted memories to structured facts (no extra LLM call)
+        // Convert extracted memories to structured facts
-        let facts: Vec<Fact> = extracted
+        let facts: Vec<Fact> = combined
            .memories
            .into_iter()
            .map(|m| {
                let category = match m.memory_type {
--- a/desktop/src-tauri/src/intelligence/extraction_adapter.rs
+++ b/desktop/src-tauri/src/intelligence/extraction_adapter.rs
@@ -225,6 +225,69 @@ impl LlmDriverForExtraction for TauriExtractionDriver {
        Ok(memories)
    }
    /// Sends the given system and user prompts to the LLM as one non-streaming
    /// completion request and returns the concatenated text blocks of the reply.
    ///
    /// `messages` is used only for the minimum-length check and for logging; the
    /// request itself carries `user_prompt` as its single user message.
    ///
    /// # Errors
    /// Fails when fewer than two messages are supplied, when the completion call
    /// fails, or when the reply contains no text.
    async fn extract_with_prompt(
        &self,
        messages: &[Message],
        system_prompt: &str,
        user_prompt: &str,
    ) -> Result<String> {
        let message_count = messages.len();
        if message_count < 2 {
            return Err(zclaw_types::Error::msg(
                "Too few messages for combined extraction",
            ));
        }

        tracing::debug!(
            "[TauriExtractionDriver] Combined extraction from {} messages",
            message_count
        );

        let request = CompletionRequest {
            model: self.model.clone(),
            system: Some(system_prompt.to_string()),
            messages: vec![Message::user(user_prompt.to_string())],
            tools: Vec::new(),
            max_tokens: Some(3000),
            temperature: Some(0.3),
            stop: Vec::new(),
            stream: false,
            thinking_enabled: false,
            reasoning_effort: None,
            plan_mode: false,
        };

        let completion = self.driver.complete(request).await;
        let response = completion.map_err(|e| {
            tracing::error!(
                "[TauriExtractionDriver] Combined extraction LLM call failed: {}",
                e
            );
            e
        })?;

        // Keep only text blocks and concatenate them without a separator.
        let mut response_text = String::new();
        for block in response.content {
            if let ContentBlock::Text { text } = block {
                response_text.push_str(&text);
            }
        }

        if response_text.is_empty() {
            return Err(zclaw_types::Error::msg(
                "Empty response from LLM for combined extraction",
            ));
        }

        tracing::info!(
            "[TauriExtractionDriver] Combined extraction response: {} chars",
            response_text.len()
        );

        Ok(response_text)
    }
 }
 /// Global extraction driver instance (legacy path, kept for compatibility).