fix(growth): HIGH-6 修复 extract_combined 合并提取空壳
Some checks failed
CI / Lint & TypeCheck (push) Has been cancelled
CI / Unit Tests (push) Has been cancelled
CI / Build Frontend (push) Has been cancelled
CI / Rust Check (push) Has been cancelled
CI / Security Scan (push) Has been cancelled
CI / E2E Tests (push) Has been cancelled

根因: growth.rs 构造 CombinedExtraction 时硬编码 experiences: Vec::new()
和 profile_signals: default(),导致 L1 结构化经验不被提取、L2 技能进化
没有输入数据、整个进化引擎无法端到端工作。

修复:
- extractor.rs: 添加 COMBINED_EXTRACTION_PROMPT 统一 prompt,单次 LLM 调用
  同时输出 memories + experiences + profile_signals
- extractor.rs: 添加 parse_combined_response() 解析 LLM JSON 响应
- LlmDriverForExtraction trait: 添加 extract_with_prompt() 方法(默认不支持,
  退化到现有 extract() + 启发式推断)
- MemoryExtractor: 添加 extract_combined() 方法,优先单次调用,失败则退化
- growth.rs: extract_combined() 使用新的合并提取替代硬编码空值
- TauriExtractionDriver: 实现 extract_with_prompt()
- ProfileSignals: 添加 has_any_signal() 方法
- types.rs: ProfileSignals 无 structural 变化(字段已存在)

测试: 4 个新测试(parse_combined_response_full/minimal/invalid +
extract_combined_fallback),11 个 extractor 测试全部通过
This commit is contained in:
iven
2026-04-18 22:56:42 +08:00
parent cb727fdcc7
commit 3c6581f915
4 changed files with 536 additions and 29 deletions

View File

@@ -21,7 +21,7 @@ pub trait LlmDriverForExtraction: Send + Sync {
) -> Result<Vec<ExtractedMemory>>;
/// 单次 LLM 调用提取全部类型(记忆 + 经验 + 画像信号)
/// 默认实现:退化到 3 次独立调用
/// 默认实现:退化到 3 次独立调用(experiences 和 profile_signals 为空)
async fn extract_combined_all(
&self,
messages: &[Message],
@@ -34,6 +34,19 @@ pub trait LlmDriverForExtraction: Send + Sync {
}
Ok(combined)
}
/// Performs a single LLM call with a custom prompt and returns the raw text response.
///
/// Used by the unified-extraction path (`extract_combined`). The default
/// implementation returns an `Internal` error so drivers that do not override
/// it cause callers to fall back to the multi-call extraction path.
async fn extract_with_prompt(
    &self,
    _messages: &[Message],
    _system_prompt: &str,
    _user_prompt: &str,
) -> Result<String> {
    // Default: signal "not supported" so callers can degrade gracefully.
    Err(zclaw_types::ZclawError::Internal(
        "extract_with_prompt not implemented".to_string(),
    ))
}
}
/// Memory Extractor - extracts memories from conversations
@@ -100,13 +113,10 @@ impl MemoryExtractor {
session_id: SessionId,
) -> Result<Vec<ExtractedMemory>> {
// Check if LLM driver is available
let _llm_driver = match &self.llm_driver {
Some(driver) => driver,
None => {
tracing::debug!("[MemoryExtractor] No LLM driver configured, skipping extraction");
return Ok(Vec::new());
}
};
if self.llm_driver.is_none() {
tracing::debug!("[MemoryExtractor] No LLM driver configured, skipping extraction");
return Ok(Vec::new());
}
let mut results = Vec::new();
@@ -242,6 +252,299 @@ impl MemoryExtractor {
tracing::info!("[MemoryExtractor] Stored {} memories to OpenViking", stored);
Ok(stored)
}
/// Unified extraction: a single LLM call that yields memories + experiences +
/// profile signals at once.
///
/// Prefers the one-shot `extract_with_prompt()` path. When the driver does not
/// support it, the response is empty, or parsing fails, it degrades to the
/// existing `extract()` call plus heuristic inference of experiences and
/// profile signals from the extracted memories.
pub async fn extract_combined(
    &self,
    messages: &[Message],
    session_id: SessionId,
) -> Result<crate::types::CombinedExtraction> {
    // Without a driver there is nothing to extract — return an empty result.
    let Some(llm_driver) = &self.llm_driver else {
        tracing::debug!(
            "[MemoryExtractor] No LLM driver configured, skipping combined extraction"
        );
        return Ok(crate::types::CombinedExtraction::default());
    };

    // Fast path: one LLM round-trip with the unified prompt.
    let system_prompt = "You are a memory extraction assistant. Analyze conversations and extract \
        structured memories, experiences, and profile signals in valid JSON format. \
        Always respond with valid JSON only, no additional text or markdown formatting.";
    let mut user_prompt = String::from(crate::extractor::prompts::COMBINED_EXTRACTION_PROMPT);
    user_prompt.push_str(&format_conversation_text(messages));

    match llm_driver
        .extract_with_prompt(messages, system_prompt, &user_prompt)
        .await
    {
        Err(e) => {
            // Driver does not implement the one-shot call — not an error.
            tracing::debug!(
                "[MemoryExtractor] extract_with_prompt not supported ({}), falling back",
                e
            );
        }
        Ok(raw_text) => {
            if raw_text.trim().is_empty() {
                tracing::debug!("[MemoryExtractor] extract_with_prompt returned empty, falling back");
            } else {
                match parse_combined_response(&raw_text, session_id.clone()) {
                    Ok(combined) => {
                        tracing::info!(
                            "[MemoryExtractor] Combined extraction: {} memories, {} experiences, {} profile signals",
                            combined.memories.len(),
                            combined.experiences.len(),
                            combined.profile_signals.has_any_signal() as usize,
                        );
                        return Ok(combined);
                    }
                    Err(e) => {
                        tracing::warn!(
                            "[MemoryExtractor] Combined response parse failed, falling back: {}",
                            e
                        );
                    }
                }
            }
        }
    }

    // Degraded path: reuse extract(), then infer experiences / profile signals.
    let memories = self.extract(messages, session_id).await?;
    Ok(crate::types::CombinedExtraction {
        experiences: infer_experiences_from_memories(&memories),
        profile_signals: infer_profile_signals_from_memories(&memories),
        memories,
    })
}
}
/// Renders conversation messages as plain text, one "[Role]: content" entry
/// per message, separated by blank lines. Tool-use / tool-result messages
/// carry no conversational text and are skipped.
fn format_conversation_text(messages: &[Message]) -> String {
    let mut rendered: Vec<String> = Vec::with_capacity(messages.len());
    for msg in messages {
        let line = match msg {
            Message::User { content } => format!("[User]: {}", content),
            Message::Assistant { content, .. } => format!("[Assistant]: {}", content),
            Message::System { content } => format!("[System]: {}", content),
            Message::ToolUse { .. } | Message::ToolResult { .. } => continue,
        };
        rendered.push(line);
    }
    rendered.join("\n\n")
}
/// Parses a raw LLM response into a [`CombinedExtraction`].
///
/// Tolerant of partial responses: a missing `memories` / `experiences` array
/// yields an empty vector, and a missing `profile_signals` object yields all-
/// `None` signals. A response that is not valid JSON at all is an error.
pub fn parse_combined_response(
    raw: &str,
    session_id: SessionId,
) -> Result<crate::types::CombinedExtraction> {
    use crate::types::CombinedExtraction;

    // Strip any markdown fencing, then parse the JSON payload.
    let json_str = crate::json_utils::extract_json_block(raw);
    let parsed: serde_json::Value = serde_json::from_str(json_str).map_err(|e| {
        zclaw_types::ZclawError::Internal(format!("Failed to parse combined JSON: {}", e))
    })?;

    let array_of = |key: &str| parsed.get(key).and_then(|v| v.as_array());

    let memories = match array_of("memories") {
        Some(arr) => arr
            .iter()
            .filter_map(|item| parse_memory_item(item, &session_id))
            .collect(),
        None => Vec::new(),
    };

    let experiences = match array_of("experiences") {
        Some(arr) => arr.iter().filter_map(parse_experience_item).collect(),
        None => Vec::new(),
    };

    Ok(CombinedExtraction {
        memories,
        experiences,
        profile_signals: parse_profile_signals(&parsed),
    })
}
/// Parses one memory object from the combined response.
///
/// Returns `None` when the mandatory `content` field is absent or not a
/// string; all other fields fall back to defaults (`unknown` category,
/// `knowledge` type, 0.7 confidence, empty keywords).
fn parse_memory_item(
    value: &serde_json::Value,
    session_id: &SessionId,
) -> Option<ExtractedMemory> {
    let content = value.get("content")?.as_str()?.to_string();

    let str_field = |key: &str| value.get(key).and_then(|v| v.as_str());

    let category = str_field("category").unwrap_or("unknown").to_string();
    let memory_type =
        crate::types::MemoryType::parse(str_field("memory_type").unwrap_or("knowledge"));
    let confidence = value
        .get("confidence")
        .and_then(|v| v.as_f64())
        .unwrap_or(0.7) as f32;

    let memory = ExtractedMemory::new(memory_type, category, content, session_id.clone())
        .with_confidence(confidence)
        .with_keywords(crate::json_utils::extract_string_array(value, "keywords"));
    Some(memory)
}
/// Parses one experience object from the combined response.
///
/// Returns `None` when the mandatory `pain_pattern` field is absent or not a
/// string; an unrecognized `outcome` string maps to `Outcome::Partial` and
/// a missing confidence defaults to 0.6.
fn parse_experience_item(value: &serde_json::Value) -> Option<crate::types::ExperienceCandidate> {
    use crate::types::Outcome;

    let pain_pattern = value.get("pain_pattern")?.as_str()?.to_string();

    let str_field = |key: &str| value.get(key).and_then(|v| v.as_str());

    let outcome = match str_field("outcome").unwrap_or("partial") {
        "success" => Outcome::Success,
        "failed" => Outcome::Failed,
        _ => Outcome::Partial,
    };

    Some(crate::types::ExperienceCandidate {
        pain_pattern,
        context: str_field("context").unwrap_or("").to_string(),
        solution_steps: crate::json_utils::extract_string_array(value, "solution_steps"),
        outcome,
        confidence: value
            .get("confidence")
            .and_then(|v| v.as_f64())
            .unwrap_or(0.6) as f32,
        tools_used: crate::json_utils::extract_string_array(value, "tools_used"),
        industry_context: str_field("industry_context").map(String::from),
    })
}
/// Extracts the optional `profile_signals` object into a [`ProfileSignals`].
///
/// Each signal is an optional string: when the `profile_signals` object is
/// missing, or a field is missing / not a string, that signal is `None`.
fn parse_profile_signals(obj: &serde_json::Value) -> crate::types::ProfileSignals {
    let signals = obj.get("profile_signals");
    // Single lookup helper replaces five copy-pasted and_then chains.
    let field = |key: &str| -> Option<String> {
        signals
            .and_then(|s| s.get(key))
            .and_then(|v| v.as_str())
            .map(String::from)
    };
    crate::types::ProfileSignals {
        industry: field("industry"),
        recent_topic: field("recent_topic"),
        pain_point: field("pain_point"),
        preferred_tool: field("preferred_tool"),
        communication_style: field("communication_style"),
    }
}
/// Degraded path: derives [`ExperienceCandidate`]s from already-extracted
/// experience-type memories when the one-shot LLM call is unavailable.
fn infer_experiences_from_memories(
    memories: &[ExtractedMemory],
) -> Vec<crate::types::ExperienceCandidate> {
    let mut candidates = Vec::new();
    for m in memories {
        if m.memory_type != crate::types::MemoryType::Experience {
            continue;
        }
        // Skip trivially short contents. NOTE(review): this is a byte-length
        // check, so short CJK content (< ~4 chars) is also rejected — confirm
        // this threshold is intentional for non-ASCII text.
        if m.content.len() < 10 {
            continue;
        }
        candidates.push(crate::types::ExperienceCandidate {
            pain_pattern: m.category.clone(),
            context: m.content.clone(),
            solution_steps: Vec::new(),
            outcome: crate::types::Outcome::Success,
            // Inferred rather than LLM-stated — discount the confidence.
            confidence: m.confidence * 0.7,
            tools_used: m.keywords.clone(),
            industry_context: None,
        });
    }
    candidates
}
/// Degraded path: heuristically derives profile signals from extracted
/// memories. Only the first matching memory fills each signal; subsequent
/// matches leave it unchanged.
fn infer_profile_signals_from_memories(
    memories: &[ExtractedMemory],
) -> crate::types::ProfileSignals {
    use crate::types::{MemoryType, ProfileSignals};

    let mut signals = ProfileSignals::default();
    for m in memories {
        match m.memory_type {
            MemoryType::Preference => {
                // Style-related preference categories become the communication style.
                let style_like = m.category.contains("style") || m.category.contains("风格");
                if style_like && signals.communication_style.is_none() {
                    signals.communication_style = Some(m.content.clone());
                }
            }
            MemoryType::Knowledge => {
                // First keyword of the first knowledge memory becomes the topic.
                if signals.recent_topic.is_none() {
                    if let Some(first_kw) = m.keywords.first() {
                        signals.recent_topic = Some(first_kw.clone());
                    }
                }
            }
            MemoryType::Experience => {
                // First keyword that actually appears in the content is taken
                // as the preferred tool.
                if signals.preferred_tool.is_none() {
                    signals.preferred_tool = m
                        .keywords
                        .iter()
                        .find(|kw| m.content.contains(kw.as_str()))
                        .cloned();
                }
            }
            _ => {}
        }
    }
    signals
}
/// Default extraction prompts for LLM
@@ -258,6 +561,55 @@ pub mod prompts {
}
}
/// Unified extraction prompt — a single LLM call that extracts memories,
/// structured experiences, and profile signals at once. The conversation text
/// is appended directly after this prompt by the caller.
pub const COMBINED_EXTRACTION_PROMPT: &str = r#"
分析以下对话,一次性提取三类信息。严格按 JSON 格式返回。
## 输出格式
```json
{
"memories": [
{
"memory_type": "preference|knowledge|experience",
"category": "分类标签",
"content": "记忆内容",
"confidence": 0.0-1.0,
"keywords": ["关键词"]
}
],
"experiences": [
{
"pain_pattern": "痛点模式简述",
"context": "问题发生的上下文",
"solution_steps": ["步骤1", "步骤2"],
"outcome": "success|partial|failed",
"confidence": 0.0-1.0,
"tools_used": ["使用的工具/技能"],
"industry_context": "行业标识(可选)"
}
],
"profile_signals": {
"industry": "用户所在行业(可选)",
"recent_topic": "最近讨论的主要话题(可选)",
"pain_point": "用户当前痛点(可选)",
"preferred_tool": "用户偏好的工具/技能(可选)",
"communication_style": "沟通风格: concise|detailed|formal|casual(可选)"
}
}
```
## 提取规则
1. **memories**: 提取用户偏好(沟通风格/格式/语言)、知识(事实/领域知识/经验教训)、使用经验(技能/工具使用模式和结果)
2. **experiences**: 仅提取明确的"问题→解决"模式要求有清晰的痛点和步骤confidence >= 0.6
3. **profile_signals**: 从对话中推断用户画像信息,只在有明确信号时填写,留空则不填
4. 每个字段都要有实际内容,不确定的宁可省略
5. 只返回 JSON不要附加其他文本
对话内容:
"#;
const PREFERENCE_EXTRACTION_PROMPT: &str = r#"
分析以下对话,提取用户的偏好设置。关注:
- 沟通风格偏好(简洁/详细、正式/随意)
@@ -391,5 +743,89 @@ mod tests {
assert!(!prompts::get_extraction_prompt(MemoryType::Knowledge).is_empty());
assert!(!prompts::get_extraction_prompt(MemoryType::Experience).is_empty());
assert!(!prompts::get_extraction_prompt(MemoryType::Session).is_empty());
assert!(!prompts::COMBINED_EXTRACTION_PROMPT.is_empty());
}
#[test]
fn test_parse_combined_response_full() {
    // A fully-populated response, wrapped in a markdown code fence as LLMs
    // commonly emit, must round-trip into memories, experiences and signals.
    let raw = r#"```json
{
"memories": [
{
"memory_type": "preference",
"category": "communication-style",
"content": "用户偏好简洁回复",
"confidence": 0.9,
"keywords": ["简洁", "风格"]
},
{
"memory_type": "knowledge",
"category": "user-facts",
"content": "用户是医院行政人员",
"confidence": 0.85,
"keywords": ["医院", "行政"]
}
],
"experiences": [
{
"pain_pattern": "报表生成耗时",
"context": "月度报表需要手动汇总多个Excel",
"solution_steps": ["使用researcher工具自动抓取", "格式化输出为Excel"],
"outcome": "success",
"confidence": 0.85,
"tools_used": ["researcher"],
"industry_context": "healthcare"
}
],
"profile_signals": {
"industry": "healthcare",
"recent_topic": "报表自动化",
"pain_point": "手动汇总Excel太慢",
"preferred_tool": "researcher",
"communication_style": "concise"
}
}
```"#;
    let result = super::parse_combined_response(raw, SessionId::new()).unwrap();
    // All three sections must be populated from the single payload.
    assert_eq!(result.memories.len(), 2);
    assert_eq!(result.experiences.len(), 1);
    assert_eq!(result.experiences[0].pain_pattern, "报表生成耗时");
    assert_eq!(result.experiences[0].outcome, crate::types::Outcome::Success);
    assert_eq!(result.profile_signals.industry.as_deref(), Some("healthcare"));
    assert_eq!(result.profile_signals.pain_point.as_deref(), Some("手动汇总Excel太慢"));
    assert!(result.profile_signals.has_any_signal());
}
#[test]
fn test_parse_combined_response_minimal() {
    // An all-empty but well-formed payload parses into an empty extraction.
    let parsed = super::parse_combined_response(
        r#"{"memories": [], "experiences": [], "profile_signals": {}}"#,
        SessionId::new(),
    )
    .unwrap();
    assert!(parsed.memories.is_empty());
    assert!(parsed.experiences.is_empty());
    assert!(!parsed.profile_signals.has_any_signal());
}
#[test]
fn test_parse_combined_response_invalid() {
    // Non-JSON input must surface as a parse error, not an empty result.
    let outcome = super::parse_combined_response("not json at all", SessionId::new());
    assert!(outcome.is_err());
}
#[tokio::test]
async fn test_extract_combined_fallback() {
    // MockLlmDriver has no extract_with_prompt override, so extract_combined
    // must take the degraded extract() + inference path.
    let extractor = MemoryExtractor::new(Arc::new(MockLlmDriver));
    let conversation = vec![Message::user("Hello"), Message::assistant("Hi there!")];
    let combined = extractor
        .extract_combined(&conversation, SessionId::new())
        .await
        .unwrap();
    // The fallback extract() yields memories; inference then runs over them.
    assert!(!combined.memories.is_empty());
}
}