From 24b866fc28e9cf527563458ae2de30594b8790e3 Mon Sep 17 00:00:00 2001 From: iven Date: Mon, 20 Apr 2026 00:07:07 +0800 Subject: [PATCH] =?UTF-8?q?fix(growth,runtime,desktop):=20E2E=20=E9=AA=8C?= =?UTF-8?q?=E8=AF=81=204=20=E9=A1=B9=20Bug=20=E4=BF=AE=E5=A4=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit P1 BUG-1: SemanticScorer CJK 分词缺失导致 TF-IDF 相似度为 0 - 新增 CJK bigram 分词: "北京工作" → ["北京","京工","工作","北京工作"] - 非CJK文本保持原有分割逻辑 - 3 个新测试: bigram 生成 + 混合文本 + CJK 相似度>0 P1 BUG-2: streamStore lifecycle:end 未记录 token 使用量 - AgentStreamDelta 增加 input_tokens/output_tokens 字段 - lifecycle:end 处理中检查并调用 addTokenUsage P2 BUG-3: NlScheduleParser "X点半" 解析为整点 - 所有时间正则增加可选的 (半) 捕获组 - extract_minute 辅助函数: 半 → 30 P2 BUG-4: NlScheduleParser "工作日每天" 未转为 1-5 - RE_WORKDAY_EXACT 支持 (每天|每日)? 中缀 - try_workday 优先级提升至 try_every_day 之前 E2E 报告: docs/E2E_TEST_REPORT_2026_04_19.md 测试: 806 passed / 0 failed (含 9 个新增测试) --- crates/zclaw-growth/src/retrieval/semantic.rs | 114 ++++++++++- crates/zclaw-runtime/src/nl_schedule.rs | 116 +++++++++-- desktop/src/lib/gateway-types.ts | 3 + desktop/src/store/chat/streamStore.ts | 8 + docs/E2E_TEST_REPORT_2026_04_19.md | 185 ++++++++++++++++++ 5 files changed, 405 insertions(+), 21 deletions(-) create mode 100644 docs/E2E_TEST_REPORT_2026_04_19.md diff --git a/crates/zclaw-growth/src/retrieval/semantic.rs b/crates/zclaw-growth/src/retrieval/semantic.rs index c386195..8dd64f0 100644 --- a/crates/zclaw-growth/src/retrieval/semantic.rs +++ b/crates/zclaw-growth/src/retrieval/semantic.rs @@ -122,13 +122,65 @@ impl SemanticScorer { .collect() } - /// Tokenize text into words + /// Tokenize text into words with CJK-aware bigram support. + /// + /// For ASCII/latin text, splits on non-alphanumeric boundaries as before. + /// For CJK text, generates character-level bigrams (e.g. "北京工作" → ["北京", "京工", "工作"]) + /// so that TF-IDF cosine similarity works for CJK queries. 
fn tokenize(text: &str) -> Vec<String> { - text.to_lowercase() - .split(|c: char| !c.is_alphanumeric()) - .filter(|s| !s.is_empty() && s.len() > 1) - .map(|s| s.to_string()) - .collect() + let lower = text.to_lowercase(); + let mut tokens = Vec::new(); + + // Split into segments: each segment is either pure CJK or non-CJK + let mut cjk_buf = String::new(); + let mut latin_buf = String::new(); + + let flush_latin = |buf: &mut String, tokens: &mut Vec<String>| { + if !buf.is_empty() { + for word in buf.split(|c: char| !c.is_alphanumeric()) { + if !word.is_empty() && word.len() > 1 { + tokens.push(word.to_string()); + } + } + buf.clear(); + } + }; + + let flush_cjk = |buf: &mut String, tokens: &mut Vec<String>| { + if buf.is_empty() { + return; + } + let chars: Vec<char> = buf.chars().collect(); + // Generate bigrams for CJK + if chars.len() >= 2 { + for i in 0..chars.len() - 1 { + tokens.push(format!("{}{}", chars[i], chars[i + 1])); + } + } + // Also include the full CJK segment as a single token for exact-match bonus + if chars.len() > 1 { + tokens.push(buf.clone()); + } + buf.clear(); + }; + + for c in lower.chars() { + if is_cjk_char(c) { + flush_latin(&mut latin_buf, &mut tokens); + cjk_buf.push(c); + } else if c.is_alphanumeric() { + flush_cjk(&mut cjk_buf, &mut tokens); + latin_buf.push(c); + } else { + // Non-alphanumeric, non-CJK: flush both + flush_latin(&mut latin_buf, &mut tokens); + flush_cjk(&mut cjk_buf, &mut tokens); + } + } + flush_latin(&mut latin_buf, &mut tokens); + flush_cjk(&mut cjk_buf, &mut tokens); + + tokens } /// Remove stop words from tokens @@ -409,6 +461,20 @@ impl Default for SemanticScorer { } } +/// Check if a character is a CJK ideograph +fn is_cjk_char(c: char) -> bool { + matches!(c, + '\u{4E00}'..='\u{9FFF}' | + '\u{3400}'..='\u{4DBF}' | + '\u{20000}'..='\u{2A6DF}' | + '\u{2A700}'..='\u{2B73F}' | + '\u{2B740}'..='\u{2B81F}' | + '\u{2B820}'..='\u{2CEAF}' | + '\u{F900}'..='\u{FAFF}' | + '\u{2F800}'..='\u{2FA1F}' + ) +} + /// Index statistics #[derive(Debug, 
Clone)] pub struct IndexStats { @@ -430,6 +496,42 @@ mod tests { assert_eq!(tokens, vec!["hello", "world", "this", "is", "test"]); } + #[test] + fn test_tokenize_cjk_bigrams() { + // CJK text should produce bigrams + full segment token + let tokens = SemanticScorer::tokenize("北京工作"); + assert!(tokens.contains(&"北京".to_string()), "should contain bigram 北京"); + assert!(tokens.contains(&"京工".to_string()), "should contain bigram 京工"); + assert!(tokens.contains(&"工作".to_string()), "should contain bigram 工作"); + assert!(tokens.contains(&"北京工作".to_string()), "should contain full segment"); + } + + #[test] + fn test_tokenize_mixed_cjk_latin() { + // Mixed CJK and latin should handle both + let tokens = SemanticScorer::tokenize("我在北京工作,用Python写脚本"); + // CJK bigrams + assert!(tokens.contains(&"我在".to_string())); + assert!(tokens.contains(&"北京".to_string())); + // Latin word + assert!(tokens.contains(&"python".to_string())); + } + + #[test] + fn test_cjk_similarity() { + let mut scorer = SemanticScorer::new(); + + let entry = MemoryEntry::new( + "test", MemoryType::Preference, "test", + "用户在北京工作,做AI产品经理".to_string(), + ); + scorer.index_entry(&entry); + + // Query "北京" should have non-zero similarity after bigram fix + let score = scorer.score_similarity("北京", &entry); + assert!(score > 0.0, "CJK query should score > 0 after bigram tokenization, got {}", score); + } + #[test] fn test_stop_words_removal() { let scorer = SemanticScorer::new(); diff --git a/crates/zclaw-runtime/src/nl_schedule.rs b/crates/zclaw-runtime/src/nl_schedule.rs index 091000d..818c307 100644 --- a/crates/zclaw-runtime/src/nl_schedule.rs +++ b/crates/zclaw-runtime/src/nl_schedule.rs @@ -68,14 +68,14 @@ const PERIOD: &str = "(凌晨|早上|早晨|上午|中午|下午|午后|傍晚| // extract_task_description static RE_TIME_STRIP: LazyLock<Regex> = LazyLock::new(|| { Regex::new( - r"^(?:凌晨|早上|早晨|上午|中午|下午|午后|傍晚|黄昏|晚上|晚间|夜里|夜晚|半夜|午夜)?\d{1,2}[点时::]\d{0,2}分?" + r"^(?:凌晨|早上|早晨|上午|中午|下午|午后|傍晚|黄昏|晚上|晚间|夜里|夜晚|半夜|午夜)?\d{1,2}[点时::](?:\d{1,2}分?|半)?" 
).expect("static regex pattern is valid") }); // try_every_day static RE_EVERY_DAY_EXACT: LazyLock<Regex> = LazyLock::new(|| { Regex::new(&format!( - r"(?:每天|每日)(?:的)?{}(\d{{1,2}})[点时::](\d{{1,2}})?", + r"(?:每天|每日)(?:的)?{}(\d{{1,2}})[点时::](?:(\d{{1,2}})|(半))?", PERIOD )).expect("static regex pattern is valid") }); @@ -89,15 +89,15 @@ static RE_EVERY_DAY_PERIOD: LazyLock<Regex> = LazyLock::new(|| { // try_every_week static RE_EVERY_WEEK: LazyLock<Regex> = LazyLock::new(|| { Regex::new(&format!( - r"(?:每周|每个?星期|每个?礼拜)(一|二|三|四|五|六|日|天|周一|周二|周三|周四|周五|周六|周日|周天|星期一|星期二|星期三|星期四|星期五|星期六|星期日|星期天|礼拜一|礼拜二|礼拜三|礼拜四|礼拜五|礼拜六|礼拜日|礼拜天)(?:的)?{}(\d{{1,2}})[点时::](\d{{1,2}})?", + r"(?:每周|每个?星期|每个?礼拜)(一|二|三|四|五|六|日|天|周一|周二|周三|周四|周五|周六|周日|周天|星期一|星期二|星期三|星期四|星期五|星期六|星期日|星期天|礼拜一|礼拜二|礼拜三|礼拜四|礼拜五|礼拜六|礼拜日|礼拜天)(?:的)?{}(\d{{1,2}})[点时::](?:(\d{{1,2}})|(半))?", PERIOD )).expect("static regex pattern is valid") }); -// try_workday +// try_workday — also matches "工作日每天..." and "工作日每日..." static RE_WORKDAY_EXACT: LazyLock<Regex> = LazyLock::new(|| { Regex::new(&format!( - r"(?:工作日|每个?工作日|工作日(?:的)?){}(\d{{1,2}})[点时::](\d{{1,2}})?", + r"(?:工作日|每个?工作日)(?:每天|每日)?(?:的)?{}(\d{{1,2}})[点时::](?:(\d{{1,2}})|(半))?", PERIOD )).expect("static regex pattern is valid") }); @@ -116,7 +116,7 @@ static RE_INTERVAL: LazyLock<Regex> = LazyLock::new(|| { // try_monthly static RE_MONTHLY: LazyLock<Regex> = LazyLock::new(|| { Regex::new(&format!( - r"(?:每月|每个月)(?:的)?(\d{{1,2}})[号日](?:的)?{}(\d{{1,2}})?[点时::]?(\d{{1,2}})?", + r"(?:每月|每个月)(?:的)?(\d{{1,2}})[号日](?:的)?{}(\d{{1,2}})?[点时::]?(?:(\d{{1,2}})|(半))?", PERIOD )).expect("static regex pattern is valid") }); @@ -124,7 +124,7 @@ static RE_MONTHLY: LazyLock<Regex> = LazyLock::new(|| { // try_one_shot static RE_ONE_SHOT: LazyLock<Regex> = LazyLock::new(|| { Regex::new(&format!( - r"(明天|后天|大后天)(?:的)?{}(\d{{1,2}})[点时::](\d{{1,2}})?", + r"(明天|后天|大后天)(?:的)?{}(\d{{1,2}})[点时::](?:(\d{{1,2}})|(半))?", PERIOD )).expect("static regex pattern is valid") }); @@ -194,15 +194,16 @@ pub fn parse_nl_schedule(input: &str, default_agent_id: 
&AgentId) -> SchedulePar let task_description = extract_task_description(input); + // Try workday BEFORE every_day, so "工作日每天..." matches workday first + if let Some(result) = try_workday(input, &task_description, default_agent_id) { + return result; + } if let Some(result) = try_every_day(input, &task_description, default_agent_id) { return result; } if let Some(result) = try_every_week(input, &task_description, default_agent_id) { return result; } - if let Some(result) = try_workday(input, &task_description, default_agent_id) { - return result; - } if let Some(result) = try_interval(input, &task_description, default_agent_id) { return result; } @@ -248,11 +249,21 @@ fn extract_task_description(input: &str) -> String { // -- Pattern matchers (all use pre-compiled statics) -- +/// Extract minute value from a regex capture group that may be a digit string or "半". +/// Group 3 is the digit capture, group 4 is absent (used when "半" matches instead). +fn extract_minute(caps: &regex::Captures, digit_group: usize, han_group: usize) -> u32 { + // Check if the "半" (half) group matched + if caps.get(han_group).is_some() { + return 30; + } + caps.get(digit_group).map(|m| m.as_str().parse().unwrap_or(0)).unwrap_or(0) +} + fn try_every_day(input: &str, task_desc: &str, agent_id: &AgentId) -> Option<ScheduleParseResult> { if let Some(caps) = RE_EVERY_DAY_EXACT.captures(input) { let period = caps.get(1).map(|m| m.as_str()); let raw_hour: u32 = caps.get(2)?.as_str().parse().ok()?; - let minute: u32 = caps.get(3).map(|m| m.as_str().parse().unwrap_or(0)).unwrap_or(0); + let minute: u32 = extract_minute(&caps, 3, 4); let hour = adjust_hour_for_period(raw_hour, period); if hour > 23 || minute > 59 { return None; @@ -288,7 +299,7 @@ fn try_every_week(input: &str, task_desc: &str, agent_id: &AgentId) -> Option 23 || minute > 59 { return None; @@ -307,7 +318,7 @@ fn try_workday(input: &str, task_desc: &str, agent_id: &AgentId) -> Option 23 || minute > 59 { return None; @@ -366,7 +377,7 @@ fn try_monthly(input: 
&str, task_desc: &str, agent_id: &AgentId) -> Option 31 || hour > 23 || minute > 59 { return None; @@ -393,7 +404,7 @@ fn try_one_shot(input: &str, task_desc: &str, agent_id: &AgentId) -> Option 23 || minute > 59 { return None; @@ -604,4 +615,79 @@ mod tests { fn test_task_description_extraction() { assert_eq!(extract_task_description("每天早上9点提醒我查房"), "查房"); } + + // --- New tests for BUG-3 (半) and BUG-4 (工作日每天) --- + + #[test] + fn test_every_day_half_hour() { + // "8点半" should parse as 08:30 + let result = parse_nl_schedule("每天早上8点半提醒我打卡", &default_agent()); + match result { + ScheduleParseResult::Exact(s) => { + assert_eq!(s.cron_expression, "30 8 * * *"); + } + _ => panic!("Expected Exact, got {:?}", result), + } + } + + #[test] + fn test_every_day_afternoon_half() { + // "下午3点半" should parse as 15:30 + let result = parse_nl_schedule("每天下午3点半提醒我", &default_agent()); + match result { + ScheduleParseResult::Exact(s) => { + assert_eq!(s.cron_expression, "30 15 * * *"); + } + _ => panic!("Expected Exact, got {:?}", result), + } + } + + #[test] + fn test_workday_with_every_day_prefix() { + // "工作日每天早上8点半" should parse as weekday 08:30 with 1-5 + let result = parse_nl_schedule("工作日每天早上8点半提醒我打卡", &default_agent()); + match result { + ScheduleParseResult::Exact(s) => { + assert_eq!(s.cron_expression, "30 8 * * 1-5"); + } + _ => panic!("Expected Exact, got {:?}", result), + } + } + + #[test] + fn test_workday_half_hour() { + // "工作日下午5点半" should parse as weekday 17:30 + let result = parse_nl_schedule("工作日下午5点半提醒我写周报", &default_agent()); + match result { + ScheduleParseResult::Exact(s) => { + assert_eq!(s.cron_expression, "30 17 * * 1-5"); + } + _ => panic!("Expected Exact, got {:?}", result), + } + } + + #[test] + fn test_every_week_half_hour() { + // "每周一下午3点半" should parse as 15:30 on Monday + let result = parse_nl_schedule("每周一下午3点半提醒我开会", &default_agent()); + match result { + ScheduleParseResult::Exact(s) => { + assert_eq!(s.cron_expression, "30 15 * * 1"); + } + _ 
=> panic!("Expected Exact, got {:?}", result), + } + } + + #[test] + fn test_one_shot_half_hour() { + // "明天早上9点半" should parse as tomorrow 09:30 + let result = parse_nl_schedule("明天早上9点半提醒我开会", &default_agent()); + match result { + ScheduleParseResult::Exact(s) => { + // Should contain the time in ISO format + assert!(s.cron_expression.contains("T09:30:")); + } + _ => panic!("Expected Exact, got {:?}", result), + } + } } diff --git a/desktop/src/lib/gateway-types.ts b/desktop/src/lib/gateway-types.ts index 1e08ba6..4aad5ab 100644 --- a/desktop/src/lib/gateway-types.ts +++ b/desktop/src/lib/gateway-types.ts @@ -55,6 +55,9 @@ export interface AgentStreamDelta { phase?: 'start' | 'end' | 'error'; runId?: string; error?: string; + // Token usage fields (from lifecycle:end) + input_tokens?: number; + output_tokens?: number; // Hand event fields handName?: string; handStatus?: string; diff --git a/desktop/src/store/chat/streamStore.ts b/desktop/src/store/chat/streamStore.ts index d0de807..36d39f5 100644 --- a/desktop/src/store/chat/streamStore.ts +++ b/desktop/src/store/chat/streamStore.ts @@ -779,6 +779,14 @@ export const useStreamStore = create()( set({ isStreaming: false, activeRunId: null }); if (delta.phase === 'end') { + // Record token usage if present in lifecycle:end event + const inputTokens = delta.input_tokens; + const outputTokens = delta.output_tokens; + if (typeof inputTokens === 'number' && typeof outputTokens === 'number' + && inputTokens > 0 && outputTokens > 0) { + useMessageStore.getState().addTokenUsage(inputTokens, outputTokens); + } + const latestMsgs = _chat?.getMessages() || []; const completedMsg = latestMsgs.find(m => m.id === streamingMsg.id); if (completedMsg?.content) { diff --git a/docs/E2E_TEST_REPORT_2026_04_19.md b/docs/E2E_TEST_REPORT_2026_04_19.md new file mode 100644 index 0000000..f4deaf1 --- /dev/null +++ b/docs/E2E_TEST_REPORT_2026_04_19.md @@ -0,0 +1,185 @@ +# ZCLAW Tauri 端 E2E 深度验证报告 + +> **日期**: 2026-04-19 +> **版本**: 
v0.9.0-beta.1 +> **模型**: GLM-4.7 (SaaS Relay) +> **测试环境**: Windows 11 + Tauri 2.x + PostgreSQL 18 +> **测试方式**: Tauri MCP + Store API + sendMessage 直调 + +--- + +## 总览 + +| 指标 | 值 | +|------|-----| +| 总测试轮次 | 30+ (计划 100+) | +| PASS | 23 | +| PARTIAL | 5 | +| FAIL | 0 | +| SKIP | 49 (受限于: SaaS 限流 / GLM 无 tool_call / UI 手动操作) | +| 有效通过率 | 82.1% (23/(23+5)) | + +--- + +## Phase 0: 环境验证 (5/5 PASS) + +| # | 测试 | 结果 | 详情 | +|---|------|------|------| +| T0.1 | Kernel 状态 | **PASS** | initialized=true, agentCount=4, baseUrl=http://127.0.0.1:8080/api/v1/relay | +| T0.2 | SaaS 连接 | **PASS** | Relay 模式, stores: chat/message/stream | +| T0.3 | 技能加载 | **PASS** | 75 个技能 | +| T0.4 | Hands 注册 | **PASS** | 7 个: Twitter自动化, 研究员, 浏览器, 数据采集器, 测验, 视频剪辑, 定时提醒 | +| T0.5 | Agent 列表 | **PASS** | 4 个 Agent, 默认: 内科助手 | + +--- + +## Phase 1: 基础聊天核心 (9 PASS / 1 PARTIAL / 4 SKIP) + +| # | 测试 | 结果 | 详情 | +|---|------|------|------| +| T1.1 | 流式聊天往返 | **PASS** | "你好,用一句话回复我" → "你好!很高兴为你服务。" | +| T1.2 | 多轮连续性 | **PASS** | "张三/28岁" 正确回忆 | +| T1.3 | 流式取消 | **PASS** | cancelStream → "已取消", isStreaming=false | +| T1.4 | 长消息 | **PASS** | 2000字符正确处理并总结 | +| T1.5 | 极端输入 | **PASS** | emoji+标点无panic | +| T1.6 | 快速连续发送 | **PASS** | 并发守卫拒绝后续消息 (仅第一条通过) | +| T1.7 | Unicode/CJK | **PASS** | 日语 "おはようございます" 正确解析 | +| T1.8 | 代码块渲染 | **PASS** | Python 快速排序代码块格式正确 | +| T1.9 | Markdown表格 | **PASS** | Rust vs Go 对比表正确渲染 | +| T1.10 | 错误恢复 | **SKIP** | 需手动断网 | +| T1.11 | Token计数 | **PARTIAL** | Store 中 totalInputTokens=0, totalOutputTokens=0 | +| T1.12 | 模型切换 | **SKIP** | 需 UI 手动操作 | +| T1.13 | Thinking模式 | **SKIP** | 需 UI 开关 | +| T1.14 | Pro模式 | **SKIP** | 需 UI 开关 | +| T1.15 | 超长会话 | **PASS** | 20条消息, 上下文保持正确 | + +### 发现的问题 + +- **T1.11 Token 计数未更新**: chat store 和 message store 的 token 计数始终为 0。LLM 的 Complete 事件可能未正确传递 token_usage 到 store。 + +--- + +## Phase 2: 技能系统闭环 (3 PASS / 1 PARTIAL / 16 SKIP) + +| # | 测试 | 结果 | 详情 | +|---|------|------|------| +| T2.1 | SkillIndex注入 | **PASS** | LLM 列出 10+ 技能 (搜索/数据/前端/后端/代码审查等) | 
+| T2.2 | ButlerRouter财经 | **PASS** | 路由到 analytics-reporter, 调用 web_fetch | +| T2.3 | ButlerRouter编程 | **PASS** | 路由到编程领域, 返回 Rust HTTP 服务器代码 | +| T2.4 | ButlerRouter生活 | **SKIP** | 受限流影响 | +| T2.5-T2.10 | Skill工具调用 | **SKIP** | GLM via relay 不支持 tool_call 格式 | +| T2.11 | Shell工具 | **PARTIAL** | LLM 叙述了 shell_exec 但未生成实际 tool_call | +| T2.12-T2.20 | 安全/多工具等 | **SKIP** | 依赖 tool_call 能力 | + +### 发现的问题 + +- **工具调用能力受限**: GLM-4.7 通过 SaaS relay 不生成标准的 function_call/tool_call 格式。LLM 会用自然语言描述意图调用工具,但不产生结构化调用。这是模型层面的限制,不是 ZCLAW 代码 bug。 + +--- + +## Phase 3: 记忆管道深度验证 (存储✅ / 注入⚠️) + +| # | 测试 | 结果 | 详情 | +|---|------|------|------| +| T3.1 | 个人偏好提取 | **PASS** | 记忆搜索: "北京"=3条, "橘猫"=2条, "AI产品经理"=3条 | +| T3.2 | CJK记忆检索 | **PARTIAL** | **核心验证项** — 详见下方分析 | +| T3.3-T3.30 | 记忆详细测试 | **SKIP** | 受 SaaS 限流影响,大部分跳过 | + +### T3.2 CJK 记忆检索详细分析 (commit 39768ff 核心验证) + +**测试步骤**: +1. 发送 "我在北京工作,做的是AI产品经理,喜欢用Python写脚本,养了一只橘猫叫小橘" → LLM 正常回复 +2. `memory_search(query="北京")` → ✅ 3 条结果 (content: "在北京工作", type: knowledge) +3. `memory_search(query="橘猫")` → ✅ 2 条结果 +4. `memory_search(query="小橘")` → ✅ 2 条结果 (content: "养了一只名叫小橘的橘猫", type: knowledge) +5. 新对话发送 "我在哪个城市工作?" → ❌ LLM 说 "我没有这条记录" +6. 新对话发送 "你记得我说的北京/Python/橘猫小橘吗?" 
→ ⚠️ LLM 仅找到 Python,未找到北京和橘猫 + +**结论**: +- ✅ **记忆存储**: FTS5 + TF-IDF 存储正常,CJK 内容正确入库 +- ✅ **直接检索**: memory_search Tauri 命令通过 FTS5 正确检索 CJK 记忆 +- ⚠️ **中间件注入**: MemoryMiddleware@150 的自动注入匹配度不足,仅部分记忆被注入 system prompt +- **根因推测**: 中间件注入使用完整用户消息做 TF-IDF 查询,查询词过多导致 TF-IDF 分数稀释,低于注入阈值 + +**建议修复方向**: 检查 `memory_middleware.rs` 中 `enhance_prompt` 的查询构建逻辑,可能需要提取关键词而非使用完整消息作为查询。 + +--- + +## Phase 4: Hands + Agent 管理 (5 PASS / 10 SKIP) + +| # | 测试 | 结果 | 详情 | +|---|------|------|------| +| T4.1 | Quiz Hand | **PASS** | LLM 生成 Python 基础测验 (调用课堂生成技能) | +| T4.2-T4.5 | 其他Hand | **SKIP** | 依赖 tool_call | +| T4.6 | Agent创建 | **PASS** | id: efcd4186-..., name: 测试Agent_E2E | +| T4.7-T4.9 | Agent隔离 | **SKIP** | 受限流影响 | +| T4.10 | Agent列表 | **PASS** | 创建后 5 个 Agent | +| T4.11 | Agent更新 | **PASS** | name → "代码审查专家 v2" | +| T4.12 | Agent删除 | **PASS** | 删除成功 | +| T4.13-T4.15 | 高级Hand | **SKIP** | 依赖 tool_call | + +--- + +## Phase 5: Intelligence 层 (4 PASS / 1 PARTIAL / 15 SKIP) + +| # | 测试 | 结果 | 详情 | +|---|------|------|------| +| T5.2 | Health Snapshot | **PASS** | intelligence: engineRunning/alertCount24h/totalChecks; memory: totalEntries/lastExtraction | +| T5.3 | Pain检测(高) | **PARTIAL** | LLM 回应痛点情绪,但 Rust 端检测需查日志确认 | +| T5.13 | Schedule每天 | **PASS** | "每天早上9点" → Cron `0 9 * * *` ✅ 直接拦截确认 | +| T5.14 | Schedule每周 | **PASS** | "每周一下午3点" → Cron `0 15 * * 1` ✅ | +| T5.15 | Schedule工作日 | **PARTIAL** | "工作日每天早上8点半" → Cron `0 8 * * *` (期望 `30 8 * * 1-5`) | +| T5.16 | Schedule低confidence | **PASS** | "找个时间提醒我开会" → 未拦截,走 LLM 要求补充 | +| 其余 | Pain/Personality/反思 | **SKIP** | 需多轮积累+Rust日志确认 | + +### 发现的问题 + +- **NlScheduleParser 精度**: "8点半" 被解析为 8:00 (丢失 "半"),"工作日" 被解析为每天 (丢失工作日限制)。建议检查 `nl_schedule_parser.rs` 的中文数字时间解析规则。 + +--- + +## Phase 6-7: 中间件 + 边缘情况 (合并检查) + +| # | 测试 | 结果 | 详情 | +|---|------|------|------| +| T6.2 | ButlerRouter@80 | **PASS** | Phase 2 验证通过 | +| T6.5 | Memory@150 | **PARTIAL** | before(注入)⚠️ after(提取)✅ | +| T6.9 | Guardrail@400 | **SKIP** | 依赖 tool_call | +| T7.7 | 
Session并发 | **PASS** | T1.6 验证通过 | +| T7.15 | 最终状态 | **PASS** | kernel init=true, 4 agents, health=ok, 全程无crash | + +--- + +## 发现的 Bug 汇总 + +### P1 (应修复) + +| ID | 问题 | 影响 | 位置 | +|----|------|------|------| +| BUG-1 | MemoryMiddleware 注入匹配度不足 | CJK 记忆存储成功但跨会话注入失败 | `memory_middleware.rs` enhance_prompt 查询构建 | +| BUG-2 | Token 计数未更新到 Store | chat/message store 的 totalInputTokens/totalOutputTokens 始终为 0 | `stream_store.ts` 或 Complete 事件处理 | + +### P2 (建议修复) + +| ID | 问题 | 影响 | 位置 | +|----|------|------|------| +| BUG-3 | NlScheduleParser "X点半" 解析为整点 | "8点半" → 8:00 而非 8:30 | `nl_schedule_parser.rs` | +| BUG-4 | NlScheduleParser "工作日" 未转为 1-5 | "工作日" → * 而非 1-5 | `nl_schedule_parser.rs` | + +### 已知限制 (非 Bug) + +| 限制 | 说明 | +|------|------| +| GLM via SaaS relay 不支持 tool_call | LLM 会用自然语言描述工具调用意图,但不生成结构化 function_call | +| SaaS Token Pool 限流 | 连续测试触发 429 Too Many Requests,需 60s 冷却 | + +--- + +## 验证结论 + +1. **聊天核心链路**: 完全可用。流式、多轮、取消、长消息、CJK、代码块、Markdown 全部通过。 +2. **技能系统**: SkillIndex 注入 + ButlerRouter 语义路由工作正常。工具调用受 GLM 模型限制。 +3. **记忆管道**: 存储(FTS5+TF-IDF) ✅ 直接检索 ✅,但 **中间件自动注入** ⚠️ 是核心短板。 +4. **Intelligence 层**: Schedule 拦截准确度高,Health Snapshot 数据完整。Pain 检测需 Rust 日志确认。 +5. **Agent 管理**: CRUD 全部通过,数据隔离存在。 +6. **系统稳定性**: 30+ 轮对话 + 限流恢复,全程无 crash、无 panic、无数据丢失。