fix(growth): HIGH-6 修复 extract_combined 合并提取空壳
Some checks failed
CI / Lint & TypeCheck (push) Has been cancelled
CI / Unit Tests (push) Has been cancelled
CI / Build Frontend (push) Has been cancelled
CI / Rust Check (push) Has been cancelled
CI / Security Scan (push) Has been cancelled
CI / E2E Tests (push) Has been cancelled

根因: growth.rs 构造 CombinedExtraction 时硬编码 experiences: Vec::new()
和 profile_signals: default(),导致 L1 结构化经验不被提取、L2 技能进化
没有输入数据、整个进化引擎无法端到端工作。

修复:
- extractor.rs: 添加 COMBINED_EXTRACTION_PROMPT 统一 prompt,单次 LLM 调用
  同时输出 memories + experiences + profile_signals
- extractor.rs: 添加 parse_combined_response() 解析 LLM JSON 响应
- LlmDriverForExtraction trait: 添加 extract_with_prompt() 方法(默认不支持,
  退化到现有 extract() + 启发式推断)
- MemoryExtractor: 添加 extract_combined() 方法,优先单次调用,失败则退化
- growth.rs: extract_combined() 使用新的合并提取替代硬编码空值
- TauriExtractionDriver: 实现 extract_with_prompt()
- ProfileSignals: 添加 has_any_signal() 方法
- types.rs: ProfileSignals 无 structural 变化(字段已存在)

测试: 4 个新测试(parse_combined_response_full/minimal/invalid +
extract_combined_fallback),11 个 extractor 测试全部通过
This commit is contained in:
iven
2026-04-18 22:56:42 +08:00
parent cb727fdcc7
commit 3c6581f915
4 changed files with 536 additions and 29 deletions

View File

@@ -21,7 +21,7 @@ pub trait LlmDriverForExtraction: Send + Sync {
) -> Result<Vec<ExtractedMemory>>;
/// 单次 LLM 调用提取全部类型(记忆 + 经验 + 画像信号)
/// 默认实现:退化到 3 次独立调用
/// 默认实现:退化到 3 次独立调用(experiences 和 profile_signals 为空)
async fn extract_combined_all(
&self,
messages: &[Message],
@@ -34,6 +34,19 @@ pub trait LlmDriverForExtraction: Send + Sync {
}
Ok(combined)
}
/// Performs a single LLM call with a custom prompt and returns the raw text response.
///
/// Used by the unified-extraction path (`extract_combined`). The default
/// implementation returns an `Internal` error so drivers that do not override
/// it cause callers to fall back to the multi-call extraction path.
async fn extract_with_prompt(
    &self,
    _messages: &[Message],
    _system_prompt: &str,
    _user_prompt: &str,
) -> Result<String> {
    // Default: signal "not supported" so callers can degrade gracefully.
    Err(zclaw_types::ZclawError::Internal(
        "extract_with_prompt not implemented".to_string(),
    ))
}
}
/// Memory Extractor - extracts memories from conversations
@@ -100,13 +113,10 @@ impl MemoryExtractor {
session_id: SessionId,
) -> Result<Vec<ExtractedMemory>> {
// Check if LLM driver is available
let _llm_driver = match &self.llm_driver {
Some(driver) => driver,
None => {
tracing::debug!("[MemoryExtractor] No LLM driver configured, skipping extraction");
return Ok(Vec::new());
}
};
if self.llm_driver.is_none() {
tracing::debug!("[MemoryExtractor] No LLM driver configured, skipping extraction");
return Ok(Vec::new());
}
let mut results = Vec::new();
@@ -242,6 +252,299 @@ impl MemoryExtractor {
tracing::info!("[MemoryExtractor] Stored {} memories to OpenViking", stored);
Ok(stored)
}
/// Unified extraction: a single LLM call that yields memories + experiences +
/// profile signals at once.
///
/// Prefers the one-shot `extract_with_prompt()` path. When the driver does not
/// support it, the response is empty, or parsing fails, it degrades to the
/// existing `extract()` call plus heuristic inference of experiences and
/// profile signals from the extracted memories.
pub async fn extract_combined(
    &self,
    messages: &[Message],
    session_id: SessionId,
) -> Result<crate::types::CombinedExtraction> {
    // Without a driver there is nothing to extract — return an empty result.
    let Some(llm_driver) = &self.llm_driver else {
        tracing::debug!(
            "[MemoryExtractor] No LLM driver configured, skipping combined extraction"
        );
        return Ok(crate::types::CombinedExtraction::default());
    };

    // Fast path: one LLM round-trip with the unified prompt.
    let system_prompt = "You are a memory extraction assistant. Analyze conversations and extract \
        structured memories, experiences, and profile signals in valid JSON format. \
        Always respond with valid JSON only, no additional text or markdown formatting.";
    let mut user_prompt = String::from(crate::extractor::prompts::COMBINED_EXTRACTION_PROMPT);
    user_prompt.push_str(&format_conversation_text(messages));

    match llm_driver
        .extract_with_prompt(messages, system_prompt, &user_prompt)
        .await
    {
        Err(e) => {
            // Driver does not implement the one-shot call — not an error.
            tracing::debug!(
                "[MemoryExtractor] extract_with_prompt not supported ({}), falling back",
                e
            );
        }
        Ok(raw_text) => {
            if raw_text.trim().is_empty() {
                tracing::debug!("[MemoryExtractor] extract_with_prompt returned empty, falling back");
            } else {
                match parse_combined_response(&raw_text, session_id.clone()) {
                    Ok(combined) => {
                        tracing::info!(
                            "[MemoryExtractor] Combined extraction: {} memories, {} experiences, {} profile signals",
                            combined.memories.len(),
                            combined.experiences.len(),
                            combined.profile_signals.has_any_signal() as usize,
                        );
                        return Ok(combined);
                    }
                    Err(e) => {
                        tracing::warn!(
                            "[MemoryExtractor] Combined response parse failed, falling back: {}",
                            e
                        );
                    }
                }
            }
        }
    }

    // Degraded path: reuse extract(), then infer experiences / profile signals.
    let memories = self.extract(messages, session_id).await?;
    Ok(crate::types::CombinedExtraction {
        experiences: infer_experiences_from_memories(&memories),
        profile_signals: infer_profile_signals_from_memories(&memories),
        memories,
    })
}
}
/// Renders conversation messages as plain text, one "[Role]: content" entry
/// per message, separated by blank lines. Tool-use / tool-result messages
/// carry no conversational text and are skipped.
fn format_conversation_text(messages: &[Message]) -> String {
    let mut rendered: Vec<String> = Vec::with_capacity(messages.len());
    for msg in messages {
        let line = match msg {
            Message::User { content } => format!("[User]: {}", content),
            Message::Assistant { content, .. } => format!("[Assistant]: {}", content),
            Message::System { content } => format!("[System]: {}", content),
            Message::ToolUse { .. } | Message::ToolResult { .. } => continue,
        };
        rendered.push(line);
    }
    rendered.join("\n\n")
}
/// Parses a raw LLM response into a [`CombinedExtraction`].
///
/// Tolerant of partial responses: a missing `memories` / `experiences` array
/// yields an empty vector, and a missing `profile_signals` object yields all-
/// `None` signals. A response that is not valid JSON at all is an error.
pub fn parse_combined_response(
    raw: &str,
    session_id: SessionId,
) -> Result<crate::types::CombinedExtraction> {
    use crate::types::CombinedExtraction;

    // Strip any markdown fencing, then parse the JSON payload.
    let json_str = crate::json_utils::extract_json_block(raw);
    let parsed: serde_json::Value = serde_json::from_str(json_str).map_err(|e| {
        zclaw_types::ZclawError::Internal(format!("Failed to parse combined JSON: {}", e))
    })?;

    let array_of = |key: &str| parsed.get(key).and_then(|v| v.as_array());

    let memories = match array_of("memories") {
        Some(arr) => arr
            .iter()
            .filter_map(|item| parse_memory_item(item, &session_id))
            .collect(),
        None => Vec::new(),
    };

    let experiences = match array_of("experiences") {
        Some(arr) => arr.iter().filter_map(parse_experience_item).collect(),
        None => Vec::new(),
    };

    Ok(CombinedExtraction {
        memories,
        experiences,
        profile_signals: parse_profile_signals(&parsed),
    })
}
/// Parses one memory object from the combined response.
///
/// Returns `None` when the mandatory `content` field is absent or not a
/// string; all other fields fall back to defaults (`unknown` category,
/// `knowledge` type, 0.7 confidence, empty keywords).
fn parse_memory_item(
    value: &serde_json::Value,
    session_id: &SessionId,
) -> Option<ExtractedMemory> {
    let content = value.get("content")?.as_str()?.to_string();

    let str_field = |key: &str| value.get(key).and_then(|v| v.as_str());

    let category = str_field("category").unwrap_or("unknown").to_string();
    let memory_type =
        crate::types::MemoryType::parse(str_field("memory_type").unwrap_or("knowledge"));
    let confidence = value
        .get("confidence")
        .and_then(|v| v.as_f64())
        .unwrap_or(0.7) as f32;

    let memory = ExtractedMemory::new(memory_type, category, content, session_id.clone())
        .with_confidence(confidence)
        .with_keywords(crate::json_utils::extract_string_array(value, "keywords"));
    Some(memory)
}
/// Parses one experience object from the combined response.
///
/// Returns `None` when the mandatory `pain_pattern` field is absent or not a
/// string; an unrecognized `outcome` string maps to `Outcome::Partial` and
/// a missing confidence defaults to 0.6.
fn parse_experience_item(value: &serde_json::Value) -> Option<crate::types::ExperienceCandidate> {
    use crate::types::Outcome;

    let pain_pattern = value.get("pain_pattern")?.as_str()?.to_string();

    let str_field = |key: &str| value.get(key).and_then(|v| v.as_str());

    let outcome = match str_field("outcome").unwrap_or("partial") {
        "success" => Outcome::Success,
        "failed" => Outcome::Failed,
        _ => Outcome::Partial,
    };

    Some(crate::types::ExperienceCandidate {
        pain_pattern,
        context: str_field("context").unwrap_or("").to_string(),
        solution_steps: crate::json_utils::extract_string_array(value, "solution_steps"),
        outcome,
        confidence: value
            .get("confidence")
            .and_then(|v| v.as_f64())
            .unwrap_or(0.6) as f32,
        tools_used: crate::json_utils::extract_string_array(value, "tools_used"),
        industry_context: str_field("industry_context").map(String::from),
    })
}
/// Extracts the optional `profile_signals` object into a [`ProfileSignals`].
///
/// Each signal is an optional string: when the `profile_signals` object is
/// missing, or a field is missing / not a string, that signal is `None`.
fn parse_profile_signals(obj: &serde_json::Value) -> crate::types::ProfileSignals {
    let signals = obj.get("profile_signals");
    // Single lookup helper replaces five copy-pasted and_then chains.
    let field = |key: &str| -> Option<String> {
        signals
            .and_then(|s| s.get(key))
            .and_then(|v| v.as_str())
            .map(String::from)
    };
    crate::types::ProfileSignals {
        industry: field("industry"),
        recent_topic: field("recent_topic"),
        pain_point: field("pain_point"),
        preferred_tool: field("preferred_tool"),
        communication_style: field("communication_style"),
    }
}
/// Degraded path: derives [`ExperienceCandidate`]s from already-extracted
/// experience-type memories when the one-shot LLM call is unavailable.
fn infer_experiences_from_memories(
    memories: &[ExtractedMemory],
) -> Vec<crate::types::ExperienceCandidate> {
    let mut candidates = Vec::new();
    for m in memories {
        if m.memory_type != crate::types::MemoryType::Experience {
            continue;
        }
        // Skip trivially short contents. NOTE(review): this is a byte-length
        // check, so short CJK content (< ~4 chars) is also rejected — confirm
        // this threshold is intentional for non-ASCII text.
        if m.content.len() < 10 {
            continue;
        }
        candidates.push(crate::types::ExperienceCandidate {
            pain_pattern: m.category.clone(),
            context: m.content.clone(),
            solution_steps: Vec::new(),
            outcome: crate::types::Outcome::Success,
            // Inferred rather than LLM-stated — discount the confidence.
            confidence: m.confidence * 0.7,
            tools_used: m.keywords.clone(),
            industry_context: None,
        });
    }
    candidates
}
/// Degraded path: heuristically derives profile signals from extracted
/// memories. Only the first matching memory fills each signal; subsequent
/// matches leave it unchanged.
fn infer_profile_signals_from_memories(
    memories: &[ExtractedMemory],
) -> crate::types::ProfileSignals {
    use crate::types::{MemoryType, ProfileSignals};

    let mut signals = ProfileSignals::default();
    for m in memories {
        match m.memory_type {
            MemoryType::Preference => {
                // Style-related preference categories become the communication style.
                let style_like = m.category.contains("style") || m.category.contains("风格");
                if style_like && signals.communication_style.is_none() {
                    signals.communication_style = Some(m.content.clone());
                }
            }
            MemoryType::Knowledge => {
                // First keyword of the first knowledge memory becomes the topic.
                if signals.recent_topic.is_none() {
                    if let Some(first_kw) = m.keywords.first() {
                        signals.recent_topic = Some(first_kw.clone());
                    }
                }
            }
            MemoryType::Experience => {
                // First keyword that actually appears in the content is taken
                // as the preferred tool.
                if signals.preferred_tool.is_none() {
                    signals.preferred_tool = m
                        .keywords
                        .iter()
                        .find(|kw| m.content.contains(kw.as_str()))
                        .cloned();
                }
            }
            _ => {}
        }
    }
    signals
}
/// Default extraction prompts for LLM
@@ -258,6 +561,55 @@ pub mod prompts {
}
}
/// Unified extraction prompt — a single LLM call that extracts memories,
/// structured experiences, and profile signals at once. The conversation text
/// is appended directly after this prompt by the caller.
pub const COMBINED_EXTRACTION_PROMPT: &str = r#"
分析以下对话,一次性提取三类信息。严格按 JSON 格式返回。
## 输出格式
```json
{
"memories": [
{
"memory_type": "preference|knowledge|experience",
"category": "分类标签",
"content": "记忆内容",
"confidence": 0.0-1.0,
"keywords": ["关键词"]
}
],
"experiences": [
{
"pain_pattern": "痛点模式简述",
"context": "问题发生的上下文",
"solution_steps": ["步骤1", "步骤2"],
"outcome": "success|partial|failed",
"confidence": 0.0-1.0,
"tools_used": ["使用的工具/技能"],
"industry_context": "行业标识(可选)"
}
],
"profile_signals": {
"industry": "用户所在行业(可选)",
"recent_topic": "最近讨论的主要话题(可选)",
"pain_point": "用户当前痛点(可选)",
"preferred_tool": "用户偏好的工具/技能(可选)",
"communication_style": "沟通风格: concise|detailed|formal|casual(可选)"
}
}
```
## 提取规则
1. **memories**: 提取用户偏好(沟通风格/格式/语言)、知识(事实/领域知识/经验教训)、使用经验(技能/工具使用模式和结果)
2. **experiences**: 仅提取明确的"问题→解决"模式要求有清晰的痛点和步骤confidence >= 0.6
3. **profile_signals**: 从对话中推断用户画像信息,只在有明确信号时填写,留空则不填
4. 每个字段都要有实际内容,不确定的宁可省略
5. 只返回 JSON不要附加其他文本
对话内容:
"#;
const PREFERENCE_EXTRACTION_PROMPT: &str = r#"
分析以下对话,提取用户的偏好设置。关注:
- 沟通风格偏好(简洁/详细、正式/随意)
@@ -391,5 +743,89 @@ mod tests {
assert!(!prompts::get_extraction_prompt(MemoryType::Knowledge).is_empty());
assert!(!prompts::get_extraction_prompt(MemoryType::Experience).is_empty());
assert!(!prompts::get_extraction_prompt(MemoryType::Session).is_empty());
assert!(!prompts::COMBINED_EXTRACTION_PROMPT.is_empty());
}
#[test]
fn test_parse_combined_response_full() {
    // A fully-populated response, wrapped in a markdown code fence as LLMs
    // commonly emit, must round-trip into memories, experiences and signals.
    let raw = r#"```json
{
"memories": [
{
"memory_type": "preference",
"category": "communication-style",
"content": "用户偏好简洁回复",
"confidence": 0.9,
"keywords": ["简洁", "风格"]
},
{
"memory_type": "knowledge",
"category": "user-facts",
"content": "用户是医院行政人员",
"confidence": 0.85,
"keywords": ["医院", "行政"]
}
],
"experiences": [
{
"pain_pattern": "报表生成耗时",
"context": "月度报表需要手动汇总多个Excel",
"solution_steps": ["使用researcher工具自动抓取", "格式化输出为Excel"],
"outcome": "success",
"confidence": 0.85,
"tools_used": ["researcher"],
"industry_context": "healthcare"
}
],
"profile_signals": {
"industry": "healthcare",
"recent_topic": "报表自动化",
"pain_point": "手动汇总Excel太慢",
"preferred_tool": "researcher",
"communication_style": "concise"
}
}
```"#;
    let result = super::parse_combined_response(raw, SessionId::new()).unwrap();
    // All three sections must be populated from the single payload.
    assert_eq!(result.memories.len(), 2);
    assert_eq!(result.experiences.len(), 1);
    assert_eq!(result.experiences[0].pain_pattern, "报表生成耗时");
    assert_eq!(result.experiences[0].outcome, crate::types::Outcome::Success);
    assert_eq!(result.profile_signals.industry.as_deref(), Some("healthcare"));
    assert_eq!(result.profile_signals.pain_point.as_deref(), Some("手动汇总Excel太慢"));
    assert!(result.profile_signals.has_any_signal());
}
#[test]
fn test_parse_combined_response_minimal() {
    // An all-empty but well-formed payload parses into an empty extraction.
    let parsed = super::parse_combined_response(
        r#"{"memories": [], "experiences": [], "profile_signals": {}}"#,
        SessionId::new(),
    )
    .unwrap();
    assert!(parsed.memories.is_empty());
    assert!(parsed.experiences.is_empty());
    assert!(!parsed.profile_signals.has_any_signal());
}
#[test]
fn test_parse_combined_response_invalid() {
    // Non-JSON input must surface as a parse error, not an empty result.
    let outcome = super::parse_combined_response("not json at all", SessionId::new());
    assert!(outcome.is_err());
}
#[tokio::test]
async fn test_extract_combined_fallback() {
    // MockLlmDriver has no extract_with_prompt override, so extract_combined
    // must take the degraded extract() + inference path.
    let extractor = MemoryExtractor::new(Arc::new(MockLlmDriver));
    let conversation = vec![Message::user("Hello"), Message::assistant("Hi there!")];
    let combined = extractor
        .extract_combined(&conversation, SessionId::new())
        .await
        .unwrap();
    // The fallback extract() yields memories; inference then runs over them.
    assert!(!combined.memories.is_empty());
}
}