fix(growth,runtime,desktop): E2E 验证 4 项 Bug 修复

P1 BUG-1: SemanticScorer CJK 分词缺失导致 TF-IDF 相似度为 0 - 新增 CJK bigram 分词: "北京工作" → ["北京","京工","工作","北京工作"] - 非CJK文本保持原有分割逻辑 - 3 个新测试: bigram 生成 + 混合文本 + CJK 相似度>0 P1 BUG-2: streamStore lifecycle:end 未记录 token 使用量 - AgentStreamDelta 增加 input_tokens/output_tokens 字段 - lifecycle:end 处理中检查并调用 addTokenUsage P2 BUG-3: NlScheduleParser "X点半" 解析为整点 - 所有时间正则增加可选的 (半) 捕获组 - extract_minute 辅助函数: 半 → 30 P2 BUG-4: NlScheduleParser "工作日每天" 未转为 1-5 - RE_WORKDAY_EXACT 支持 (每天|每日)? 中缀 - try_workday 优先级提升至 try_every_day 之前 E2E 报告: docs/E2E_TEST_REPORT_2026_04_19.md 测试: 806 passed / 0 failed (含 9 个新增测试)
2026-04-20 00:07:07 +08:00
parent 39768ff598
commit 24b866fc28
5 changed files with 405 additions and 21 deletions
--- a/crates/zclaw-growth/src/retrieval/semantic.rs
+++ b/crates/zclaw-growth/src/retrieval/semantic.rs
@@ -122,13 +122,65 @@ impl SemanticScorer {
        .collect()
    }

-    /// Tokenize text into words
+    /// Tokenize text into words with CJK-aware bigram support.
+    ///
+    /// For ASCII/latin text, splits on non-alphanumeric boundaries as before.
+    /// For CJK text, generates character-level bigrams plus the full segment (e.g. "北京工作" → ["北京", "京工", "工作", "北京工作"])
+    /// so that TF-IDF cosine similarity works for CJK queries.
    fn tokenize(text: &str) -> Vec<String> {
-        text.to_lowercase()
-            .split(|c: char| !c.is_alphanumeric())
-            .filter(|s| !s.is_empty() && s.len() > 1)
-            .map(|s| s.to_string())
-            .collect()
+        let lower = text.to_lowercase();
+        let mut tokens = Vec::new();
+
+        // Split into segments: each segment is either pure CJK or non-CJK
+        let mut cjk_buf = String::new();
+        let mut latin_buf = String::new();
+
+        let flush_latin = |buf: &mut String, tokens: &mut Vec<String>| {
+            if !buf.is_empty() {
+                for word in buf.split(|c: char| !c.is_alphanumeric()) {
+                    if !word.is_empty() && word.len() > 1 {
+                        tokens.push(word.to_string());
+                    }
+                }
+                buf.clear();
+            }
+        };
+
+        let flush_cjk = |buf: &mut String, tokens: &mut Vec<String>| {
+            if buf.is_empty() {
+                return;
+            }
+            let chars: Vec<char> = buf.chars().collect();
+            // Generate bigrams for CJK
+            if chars.len() >= 2 {
+                for i in 0..chars.len() - 1 {
+                    tokens.push(format!("{}{}", chars[i], chars[i + 1]));
+                }
+            }
+            // Also include the full CJK segment as a single token for exact-match bonus (for a 2-character segment this repeats its only bigram)
+            if chars.len() > 1 {
+                tokens.push(buf.clone());
+            }
+            buf.clear();
+        };
+
+        for c in lower.chars() {
+            if is_cjk_char(c) {
+                flush_latin(&mut latin_buf, &mut tokens);
+                cjk_buf.push(c);
+            } else if c.is_alphanumeric() {
+                flush_cjk(&mut cjk_buf, &mut tokens);
+                latin_buf.push(c);
+            } else {
+                // Non-alphanumeric, non-CJK: flush both
+                flush_latin(&mut latin_buf, &mut tokens);
+                flush_cjk(&mut cjk_buf, &mut tokens);
+            }
+        }
+        flush_latin(&mut latin_buf, &mut tokens);
+        flush_cjk(&mut cjk_buf, &mut tokens);
+
+        tokens
    }

    /// Remove stop words from tokens
@@ -409,6 +461,20 @@ impl Default for SemanticScorer {
    }
 }

+/// Check if a character is a CJK ideograph
+fn is_cjk_char(c: char) -> bool {
+    matches!(c,
+        '\u{4E00}'..='\u{9FFF}' |
+        '\u{3400}'..='\u{4DBF}' |
+        '\u{20000}'..='\u{2A6DF}' |
+        '\u{2A700}'..='\u{2B73F}' |
+        '\u{2B740}'..='\u{2B81F}' |
+        '\u{2B820}'..='\u{2CEAF}' |
+        '\u{F900}'..='\u{FAFF}' |
+        '\u{2F800}'..='\u{2FA1F}'
+    )
+}
+
 /// Index statistics
 #[derive(Debug, Clone)]
 pub struct IndexStats {
@@ -430,6 +496,42 @@ mod tests {
        assert_eq!(tokens, vec!["hello", "world", "this", "is", "test"]);
    }

+    #[test]
+    fn test_tokenize_cjk_bigrams() {
+        // CJK text should produce bigrams + full segment token
+        let tokens = SemanticScorer::tokenize("北京工作");
+        assert!(tokens.contains(&"北京".to_string()), "should contain bigram 北京");
+        assert!(tokens.contains(&"京工".to_string()), "should contain bigram 京工");
+        assert!(tokens.contains(&"工作".to_string()), "should contain bigram 工作");
+        assert!(tokens.contains(&"北京工作".to_string()), "should contain full segment");
+    }
+
+    #[test]
+    fn test_tokenize_mixed_cjk_latin() {
+        // Mixed CJK and latin should handle both
+        let tokens = SemanticScorer::tokenize("我在北京工作，用Python写脚本");
+        // CJK bigrams
+        assert!(tokens.contains(&"我在".to_string()));
+        assert!(tokens.contains(&"北京".to_string()));
+        // Latin word
+        assert!(tokens.contains(&"python".to_string()));
+    }
+
+    #[test]
+    fn test_cjk_similarity() {
+        let mut scorer = SemanticScorer::new();
+
+        let entry = MemoryEntry::new(
+            "test", MemoryType::Preference, "test",
+            "用户在北京工作，做AI产品经理".to_string(),
+        );
+        scorer.index_entry(&entry);
+
+        // Query "北京" should have non-zero similarity after bigram fix
+        let score = scorer.score_similarity("北京", &entry);
+        assert!(score > 0.0, "CJK query should score > 0 after bigram tokenization, got {}", score);
+    }
+
    #[test]
    fn test_stop_words_removal() {
        let scorer = SemanticScorer::new();
--- a/crates/zclaw-runtime/src/nl_schedule.rs
+++ b/crates/zclaw-runtime/src/nl_schedule.rs
@@ -68,14 +68,14 @@ const PERIOD: &str = "(凌晨|早上|早晨|上午|中午|下午|午后|傍晚|
 // extract_task_description
 static RE_TIME_STRIP: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
-        r"^(?:凌晨|早上|早晨|上午|中午|下午|午后|傍晚|黄昏|晚上|晚间|夜里|夜晚|半夜|午夜)?\d{1,2}[点时:：]\d{0,2}分?"
+        r"^(?:凌晨|早上|早晨|上午|中午|下午|午后|傍晚|黄昏|晚上|晚间|夜里|夜晚|半夜|午夜)?\d{1,2}[点时:：](?:\d{1,2}分?|半)?"
    ).expect("static regex pattern is valid")
 });

 // try_every_day
 static RE_EVERY_DAY_EXACT: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(&format!(
-        r"(?:每天|每日)(?:的)?{}(\d{{1,2}})[点时:：](\d{{1,2}})?",
+        r"(?:每天|每日)(?:的)?{}(\d{{1,2}})[点时:：](?:(\d{{1,2}})|(半))?",
        PERIOD
    )).expect("static regex pattern is valid")
 });
@@ -89,15 +89,15 @@ static RE_EVERY_DAY_PERIOD: LazyLock<Regex> = LazyLock::new(|| {
 // try_every_week
 static RE_EVERY_WEEK: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(&format!(
-        r"(?:每周|每个?星期|每个?礼拜)(一|二|三|四|五|六|日|天|周一|周二|周三|周四|周五|周六|周日|周天|星期一|星期二|星期三|星期四|星期五|星期六|星期日|星期天|礼拜一|礼拜二|礼拜三|礼拜四|礼拜五|礼拜六|礼拜日|礼拜天)(?:的)?{}(\d{{1,2}})[点时:：](\d{{1,2}})?",
+        r"(?:每周|每个?星期|每个?礼拜)(一|二|三|四|五|六|日|天|周一|周二|周三|周四|周五|周六|周日|周天|星期一|星期二|星期三|星期四|星期五|星期六|星期日|星期天|礼拜一|礼拜二|礼拜三|礼拜四|礼拜五|礼拜六|礼拜日|礼拜天)(?:的)?{}(\d{{1,2}})[点时:：](?:(\d{{1,2}})|(半))?",
        PERIOD
    )).expect("static regex pattern is valid")
 });

-// try_workday
+// try_workday — also matches "工作日每天..." and "工作日每日..."
 static RE_WORKDAY_EXACT: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(&format!(
-        r"(?:工作日|每个?工作日|工作日(?:的)?){}(\d{{1,2}})[点时:：](\d{{1,2}})?",
+        r"(?:工作日|每个?工作日)(?:每天|每日)?(?:的)?{}(\d{{1,2}})[点时:：](?:(\d{{1,2}})|(半))?",
        PERIOD
    )).expect("static regex pattern is valid")
 });
@@ -116,7 +116,7 @@ static RE_INTERVAL: LazyLock<Regex> = LazyLock::new(|| {
 // try_monthly
 static RE_MONTHLY: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(&format!(
-        r"(?:每月|每个月)(?:的)?(\d{{1,2}})[号日](?:的)?{}(\d{{1,2}})?[点时:：]?(\d{{1,2}})?",
+        r"(?:每月|每个月)(?:的)?(\d{{1,2}})[号日](?:的)?{}(\d{{1,2}})?[点时:：]?(?:(\d{{1,2}})|(半))?",
        PERIOD
    )).expect("static regex pattern is valid")
 });
@@ -124,7 +124,7 @@ static RE_MONTHLY: LazyLock<Regex> = LazyLock::new(|| {
 // try_one_shot
 static RE_ONE_SHOT: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(&format!(
-        r"(明天|后天|大后天)(?:的)?{}(\d{{1,2}})[点时:：](\d{{1,2}})?",
+        r"(明天|后天|大后天)(?:的)?{}(\d{{1,2}})[点时:：](?:(\d{{1,2}})|(半))?",
        PERIOD
    )).expect("static regex pattern is valid")
 });
@@ -194,15 +194,16 @@ pub fn parse_nl_schedule(input: &str, default_agent_id: &AgentId) -> SchedulePar

    let task_description = extract_task_description(input);

+    // Try workday BEFORE every_day, so "工作日每天..." matches workday first
+    if let Some(result) = try_workday(input, &task_description, default_agent_id) {
+        return result;
+    }
    if let Some(result) = try_every_day(input, &task_description, default_agent_id) {
        return result;
    }
    if let Some(result) = try_every_week(input, &task_description, default_agent_id) {
        return result;
    }
-    if let Some(result) = try_workday(input, &task_description, default_agent_id) {
-        return result;
-    }
    if let Some(result) = try_interval(input, &task_description, default_agent_id) {
        return result;
    }
@@ -248,11 +249,21 @@ fn extract_task_description(input: &str) -> String {

 // -- Pattern matchers (all use pre-compiled statics) --

+/// Extract the minute value from regex captures: returns 30 when the "半" (half-past) group `han_group` matched.
+/// Otherwise parses the digits captured by `digit_group`, falling back to 0 when that group is absent or does not parse.
+fn extract_minute(caps: &regex::Captures, digit_group: usize, han_group: usize) -> u32 {
+    // Check if the "半" (half) group matched
+    if caps.get(han_group).is_some() {
+        return 30;
+    }
+    caps.get(digit_group).map(|m| m.as_str().parse().unwrap_or(0)).unwrap_or(0)
+}
+
 fn try_every_day(input: &str, task_desc: &str, agent_id: &AgentId) -> Option<ScheduleParseResult> {
    if let Some(caps) = RE_EVERY_DAY_EXACT.captures(input) {
        let period = caps.get(1).map(|m| m.as_str());
        let raw_hour: u32 = caps.get(2)?.as_str().parse().ok()?;
-        let minute: u32 = caps.get(3).map(|m| m.as_str().parse().unwrap_or(0)).unwrap_or(0);
+        let minute: u32 = extract_minute(&caps, 3, 4);
        let hour = adjust_hour_for_period(raw_hour, period);
        if hour > 23 || minute > 59 {
            return None;
@@ -288,7 +299,7 @@ fn try_every_week(input: &str, task_desc: &str, agent_id: &AgentId) -> Option<Sc
    let dow = weekday_to_cron(day_str)?;
    let period = caps.get(2).map(|m| m.as_str());
    let raw_hour: u32 = caps.get(3)?.as_str().parse().ok()?;
-    let minute: u32 = caps.get(4).map(|m| m.as_str().parse().unwrap_or(0)).unwrap_or(0);
+    let minute: u32 = extract_minute(&caps, 4, 5);
    let hour = adjust_hour_for_period(raw_hour, period);
    if hour > 23 || minute > 59 {
        return None;
@@ -307,7 +318,7 @@ fn try_workday(input: &str, task_desc: &str, agent_id: &AgentId) -> Option<Sched
    if let Some(caps) = RE_WORKDAY_EXACT.captures(input) {
        let period = caps.get(1).map(|m| m.as_str());
        let raw_hour: u32 = caps.get(2)?.as_str().parse().ok()?;
-        let minute: u32 = caps.get(3).map(|m| m.as_str().parse().unwrap_or(0)).unwrap_or(0);
+        let minute: u32 = extract_minute(&caps, 3, 4);
        let hour = adjust_hour_for_period(raw_hour, period);
        if hour > 23 || minute > 59 {
            return None;
@@ -366,7 +377,7 @@ fn try_monthly(input: &str, task_desc: &str, agent_id: &AgentId) -> Option<Sched
        let day: u32 = caps.get(1)?.as_str().parse().ok()?;
        let period = caps.get(2).map(|m| m.as_str());
        let raw_hour: u32 = caps.get(3).map(|m| m.as_str().parse().unwrap_or(9)).unwrap_or(9);
-        let minute: u32 = caps.get(4).map(|m| m.as_str().parse().unwrap_or(0)).unwrap_or(0);
+        let minute: u32 = extract_minute(&caps, 4, 5);
        let hour = adjust_hour_for_period(raw_hour, period);
        if day > 31 || hour > 23 || minute > 59 {
            return None;
@@ -393,7 +404,7 @@ fn try_one_shot(input: &str, task_desc: &str, agent_id: &AgentId) -> Option<Sche
    };
    let period = caps.get(2).map(|m| m.as_str());
    let raw_hour: u32 = caps.get(3)?.as_str().parse().ok()?;
-    let minute: u32 = caps.get(4).map(|m| m.as_str().parse().unwrap_or(0)).unwrap_or(0);
+    let minute: u32 = extract_minute(&caps, 4, 5);
    let hour = adjust_hour_for_period(raw_hour, period);
    if hour > 23 || minute > 59 {
        return None;
@@ -604,4 +615,79 @@ mod tests {
    fn test_task_description_extraction() {
        assert_eq!(extract_task_description("每天早上9点提醒我查房"), "查房");
    }
+
+    // --- New tests for BUG-3 (半) and BUG-4 (工作日每天) ---
+
+    #[test]
+    fn test_every_day_half_hour() {
+        // "8点半" should parse as 08:30
+        let result = parse_nl_schedule("每天早上8点半提醒我打卡", &default_agent());
+        match result {
+            ScheduleParseResult::Exact(s) => {
+                assert_eq!(s.cron_expression, "30 8 * * *");
+            }
+            _ => panic!("Expected Exact, got {:?}", result),
+        }
+    }
+
+    #[test]
+    fn test_every_day_afternoon_half() {
+        // "下午3点半" should parse as 15:30
+        let result = parse_nl_schedule("每天下午3点半提醒我", &default_agent());
+        match result {
+            ScheduleParseResult::Exact(s) => {
+                assert_eq!(s.cron_expression, "30 15 * * *");
+            }
+            _ => panic!("Expected Exact, got {:?}", result),
+        }
+    }
+
+    #[test]
+    fn test_workday_with_every_day_prefix() {
+        // "工作日每天早上8点半" should parse as weekday 08:30 with 1-5
+        let result = parse_nl_schedule("工作日每天早上8点半提醒我打卡", &default_agent());
+        match result {
+            ScheduleParseResult::Exact(s) => {
+                assert_eq!(s.cron_expression, "30 8 * * 1-5");
+            }
+            _ => panic!("Expected Exact, got {:?}", result),
+        }
+    }
+
+    #[test]
+    fn test_workday_half_hour() {
+        // "工作日下午5点半" should parse as weekday 17:30
+        let result = parse_nl_schedule("工作日下午5点半提醒我写周报", &default_agent());
+        match result {
+            ScheduleParseResult::Exact(s) => {
+                assert_eq!(s.cron_expression, "30 17 * * 1-5");
+            }
+            _ => panic!("Expected Exact, got {:?}", result),
+        }
+    }
+
+    #[test]
+    fn test_every_week_half_hour() {
+        // "每周一下午3点半" should parse as 15:30 on Monday
+        let result = parse_nl_schedule("每周一下午3点半提醒我开会", &default_agent());
+        match result {
+            ScheduleParseResult::Exact(s) => {
+                assert_eq!(s.cron_expression, "30 15 * * 1");
+            }
+            _ => panic!("Expected Exact, got {:?}", result),
+        }
+    }
+
+    #[test]
+    fn test_one_shot_half_hour() {
+        // "明天早上9点半" should parse as tomorrow 09:30
+        let result = parse_nl_schedule("明天早上9点半提醒我开会", &default_agent());
+        match result {
+            ScheduleParseResult::Exact(s) => {
+                // Should contain the time in ISO format
+                assert!(s.cron_expression.contains("T09:30:"));
+            }
+            _ => panic!("Expected Exact, got {:?}", result),
+        }
+    }
 }