fix(growth,runtime,desktop): E2E 验证 4 项 Bug 修复
Some checks are pending
CI / Lint & TypeCheck (push) Waiting to run
CI / Unit Tests (push) Waiting to run
CI / Build Frontend (push) Waiting to run
CI / Rust Check (push) Waiting to run
CI / Security Scan (push) Waiting to run
CI / E2E Tests (push) Blocked by required conditions
Some checks are pending
CI / Lint & TypeCheck (push) Waiting to run
CI / Unit Tests (push) Waiting to run
CI / Build Frontend (push) Waiting to run
CI / Rust Check (push) Waiting to run
CI / Security Scan (push) Waiting to run
CI / E2E Tests (push) Blocked by required conditions
P1 BUG-1: SemanticScorer CJK 分词缺失导致 TF-IDF 相似度为 0 - 新增 CJK bigram 分词: "北京工作" → ["北京","京工","工作","北京工作"] - 非CJK文本保持原有分割逻辑 - 3 个新测试: bigram 生成 + 混合文本 + CJK 相似度>0 P1 BUG-2: streamStore lifecycle:end 未记录 token 使用量 - AgentStreamDelta 增加 input_tokens/output_tokens 字段 - lifecycle:end 处理中检查并调用 addTokenUsage P2 BUG-3: NlScheduleParser "X点半" 解析为整点 - 所有时间正则增加可选的 (半) 捕获组 - extract_minute 辅助函数: 半 → 30 P2 BUG-4: NlScheduleParser "工作日每天" 未转为 1-5 - RE_WORKDAY_EXACT 支持 (每天|每日)? 中缀 - try_workday 优先级提升至 try_every_day 之前 E2E 报告: docs/E2E_TEST_REPORT_2026_04_19.md 测试: 806 passed / 0 failed (含 9 个新增测试)
This commit is contained in:
@@ -122,13 +122,65 @@ impl SemanticScorer {
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Tokenize text into words
|
||||
/// Tokenize text into words with CJK-aware bigram support.
|
||||
///
|
||||
/// For ASCII/latin text, splits on non-alphanumeric boundaries as before.
|
||||
/// For CJK text, generates character-level bigrams (e.g. "北京工作" → ["北京", "京工", "工作"])
|
||||
/// so that TF-IDF cosine similarity works for CJK queries.
|
||||
fn tokenize(text: &str) -> Vec<String> {
|
||||
text.to_lowercase()
|
||||
.split(|c: char| !c.is_alphanumeric())
|
||||
.filter(|s| !s.is_empty() && s.len() > 1)
|
||||
.map(|s| s.to_string())
|
||||
.collect()
|
||||
let lower = text.to_lowercase();
|
||||
let mut tokens = Vec::new();
|
||||
|
||||
// Split into segments: each segment is either pure CJK or non-CJK
|
||||
let mut cjk_buf = String::new();
|
||||
let mut latin_buf = String::new();
|
||||
|
||||
let flush_latin = |buf: &mut String, tokens: &mut Vec<String>| {
|
||||
if !buf.is_empty() {
|
||||
for word in buf.split(|c: char| !c.is_alphanumeric()) {
|
||||
if !word.is_empty() && word.len() > 1 {
|
||||
tokens.push(word.to_string());
|
||||
}
|
||||
}
|
||||
buf.clear();
|
||||
}
|
||||
};
|
||||
|
||||
let flush_cjk = |buf: &mut String, tokens: &mut Vec<String>| {
|
||||
if buf.is_empty() {
|
||||
return;
|
||||
}
|
||||
let chars: Vec<char> = buf.chars().collect();
|
||||
// Generate bigrams for CJK
|
||||
if chars.len() >= 2 {
|
||||
for i in 0..chars.len() - 1 {
|
||||
tokens.push(format!("{}{}", chars[i], chars[i + 1]));
|
||||
}
|
||||
}
|
||||
// Also include the full CJK segment as a single token for exact-match bonus
|
||||
if chars.len() > 1 {
|
||||
tokens.push(buf.clone());
|
||||
}
|
||||
buf.clear();
|
||||
};
|
||||
|
||||
for c in lower.chars() {
|
||||
if is_cjk_char(c) {
|
||||
flush_latin(&mut latin_buf, &mut tokens);
|
||||
cjk_buf.push(c);
|
||||
} else if c.is_alphanumeric() {
|
||||
flush_cjk(&mut cjk_buf, &mut tokens);
|
||||
latin_buf.push(c);
|
||||
} else {
|
||||
// Non-alphanumeric, non-CJK: flush both
|
||||
flush_latin(&mut latin_buf, &mut tokens);
|
||||
flush_cjk(&mut cjk_buf, &mut tokens);
|
||||
}
|
||||
}
|
||||
flush_latin(&mut latin_buf, &mut tokens);
|
||||
flush_cjk(&mut cjk_buf, &mut tokens);
|
||||
|
||||
tokens
|
||||
}
|
||||
|
||||
/// Remove stop words from tokens
|
||||
@@ -409,6 +461,20 @@ impl Default for SemanticScorer {
|
||||
}
|
||||
}
|
||||
|
||||
/// Report whether `c` is a CJK ideograph.
///
/// Only the ideograph blocks listed in the table are recognised; kana, Hangul
/// and CJK punctuation are not.
fn is_cjk_char(c: char) -> bool {
    // Inclusive (first, last) code point of each recognised block.
    const IDEOGRAPH_RANGES: [(char, char); 8] = [
        ('\u{4E00}', '\u{9FFF}'),   // CJK Unified Ideographs
        ('\u{3400}', '\u{4DBF}'),   // Extension A
        ('\u{20000}', '\u{2A6DF}'), // Extension B
        ('\u{2A700}', '\u{2B73F}'), // Extension C
        ('\u{2B740}', '\u{2B81F}'), // Extension D
        ('\u{2B820}', '\u{2CEAF}'), // Extension E
        ('\u{F900}', '\u{FAFF}'),   // CJK Compatibility Ideographs
        ('\u{2F800}', '\u{2FA1F}'), // CJK Compatibility Ideographs Supplement
    ];

    IDEOGRAPH_RANGES
        .iter()
        .any(|&(first, last)| first <= c && c <= last)
}
|
||||
|
||||
/// Index statistics
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct IndexStats {
|
||||
@@ -430,6 +496,42 @@ mod tests {
|
||||
assert_eq!(tokens, vec!["hello", "world", "this", "is", "test"]);
|
||||
}
|
||||
|
||||
#[test]
fn test_tokenize_cjk_bigrams() {
    // A pure-CJK input must yield every adjacent character pair plus the whole run.
    let tokens = SemanticScorer::tokenize("北京工作");
    let has = |wanted: &str| tokens.iter().any(|tok| tok == wanted);

    assert!(has("北京"), "should contain bigram 北京");
    assert!(has("京工"), "should contain bigram 京工");
    assert!(has("工作"), "should contain bigram 工作");
    assert!(has("北京工作"), "should contain full segment");
}
|
||||
|
||||
#[test]
fn test_tokenize_mixed_cjk_latin() {
    // One sentence containing both scripts: each script is tokenized by its own rule.
    let tokens = SemanticScorer::tokenize("我在北京工作,用Python写脚本");
    let has = |wanted: &str| tokens.iter().any(|tok| tok == wanted);

    // Bigrams from the CJK run
    assert!(has("我在"));
    assert!(has("北京"));
    // The Latin word, lowercased
    assert!(has("python"));
}
|
||||
|
||||
#[test]
fn test_cjk_similarity() {
    // Index one Chinese-language entry, then query it with a two-character word
    // that occurs inside a longer CJK run of the entry's content.
    let content = "用户在北京工作,做AI产品经理".to_string();
    let entry = MemoryEntry::new("test", MemoryType::Preference, "test", content);

    let mut scorer = SemanticScorer::new();
    scorer.index_entry(&entry);

    // With bigram tokenization the query shares the token "北京" with the entry.
    let score = scorer.score_similarity("北京", &entry);
    assert!(score > 0.0, "CJK query should score > 0 after bigram tokenization, got {}", score);
}
|
||||
|
||||
#[test]
|
||||
fn test_stop_words_removal() {
|
||||
let scorer = SemanticScorer::new();
|
||||
|
||||
@@ -68,14 +68,14 @@ const PERIOD: &str = "(凌晨|早上|早晨|上午|中午|下午|午后|傍晚|
|
||||
// extract_task_description
|
||||
static RE_TIME_STRIP: LazyLock<Regex> = LazyLock::new(|| {
|
||||
Regex::new(
|
||||
r"^(?:凌晨|早上|早晨|上午|中午|下午|午后|傍晚|黄昏|晚上|晚间|夜里|夜晚|半夜|午夜)?\d{1,2}[点时::]\d{0,2}分?"
|
||||
r"^(?:凌晨|早上|早晨|上午|中午|下午|午后|傍晚|黄昏|晚上|晚间|夜里|夜晚|半夜|午夜)?\d{1,2}[点时::](?:\d{1,2}分?|半)?"
|
||||
).expect("static regex pattern is valid")
|
||||
});
|
||||
|
||||
// try_every_day
|
||||
static RE_EVERY_DAY_EXACT: LazyLock<Regex> = LazyLock::new(|| {
|
||||
Regex::new(&format!(
|
||||
r"(?:每天|每日)(?:的)?{}(\d{{1,2}})[点时::](\d{{1,2}})?",
|
||||
r"(?:每天|每日)(?:的)?{}(\d{{1,2}})[点时::](?:(\d{{1,2}})|(半))?",
|
||||
PERIOD
|
||||
)).expect("static regex pattern is valid")
|
||||
});
|
||||
@@ -89,15 +89,15 @@ static RE_EVERY_DAY_PERIOD: LazyLock<Regex> = LazyLock::new(|| {
|
||||
// try_every_week
|
||||
static RE_EVERY_WEEK: LazyLock<Regex> = LazyLock::new(|| {
|
||||
Regex::new(&format!(
|
||||
r"(?:每周|每个?星期|每个?礼拜)(一|二|三|四|五|六|日|天|周一|周二|周三|周四|周五|周六|周日|周天|星期一|星期二|星期三|星期四|星期五|星期六|星期日|星期天|礼拜一|礼拜二|礼拜三|礼拜四|礼拜五|礼拜六|礼拜日|礼拜天)(?:的)?{}(\d{{1,2}})[点时::](\d{{1,2}})?",
|
||||
r"(?:每周|每个?星期|每个?礼拜)(一|二|三|四|五|六|日|天|周一|周二|周三|周四|周五|周六|周日|周天|星期一|星期二|星期三|星期四|星期五|星期六|星期日|星期天|礼拜一|礼拜二|礼拜三|礼拜四|礼拜五|礼拜六|礼拜日|礼拜天)(?:的)?{}(\d{{1,2}})[点时::](?:(\d{{1,2}})|(半))?",
|
||||
PERIOD
|
||||
)).expect("static regex pattern is valid")
|
||||
});
|
||||
|
||||
// try_workday
|
||||
// try_workday — also matches "工作日每天..." and "工作日每日..."
|
||||
static RE_WORKDAY_EXACT: LazyLock<Regex> = LazyLock::new(|| {
|
||||
Regex::new(&format!(
|
||||
r"(?:工作日|每个?工作日|工作日(?:的)?){}(\d{{1,2}})[点时::](\d{{1,2}})?",
|
||||
r"(?:工作日|每个?工作日)(?:每天|每日)?(?:的)?{}(\d{{1,2}})[点时::](?:(\d{{1,2}})|(半))?",
|
||||
PERIOD
|
||||
)).expect("static regex pattern is valid")
|
||||
});
|
||||
@@ -116,7 +116,7 @@ static RE_INTERVAL: LazyLock<Regex> = LazyLock::new(|| {
|
||||
// try_monthly
|
||||
static RE_MONTHLY: LazyLock<Regex> = LazyLock::new(|| {
|
||||
Regex::new(&format!(
|
||||
r"(?:每月|每个月)(?:的)?(\d{{1,2}})[号日](?:的)?{}(\d{{1,2}})?[点时::]?(\d{{1,2}})?",
|
||||
r"(?:每月|每个月)(?:的)?(\d{{1,2}})[号日](?:的)?{}(\d{{1,2}})?[点时::]?(?:(\d{{1,2}})|(半))?",
|
||||
PERIOD
|
||||
)).expect("static regex pattern is valid")
|
||||
});
|
||||
@@ -124,7 +124,7 @@ static RE_MONTHLY: LazyLock<Regex> = LazyLock::new(|| {
|
||||
// try_one_shot
|
||||
static RE_ONE_SHOT: LazyLock<Regex> = LazyLock::new(|| {
|
||||
Regex::new(&format!(
|
||||
r"(明天|后天|大后天)(?:的)?{}(\d{{1,2}})[点时::](\d{{1,2}})?",
|
||||
r"(明天|后天|大后天)(?:的)?{}(\d{{1,2}})[点时::](?:(\d{{1,2}})|(半))?",
|
||||
PERIOD
|
||||
)).expect("static regex pattern is valid")
|
||||
});
|
||||
@@ -194,15 +194,16 @@ pub fn parse_nl_schedule(input: &str, default_agent_id: &AgentId) -> SchedulePar
|
||||
|
||||
let task_description = extract_task_description(input);
|
||||
|
||||
// Try workday BEFORE every_day, so "工作日每天..." matches workday first
|
||||
if let Some(result) = try_workday(input, &task_description, default_agent_id) {
|
||||
return result;
|
||||
}
|
||||
if let Some(result) = try_every_day(input, &task_description, default_agent_id) {
|
||||
return result;
|
||||
}
|
||||
if let Some(result) = try_every_week(input, &task_description, default_agent_id) {
|
||||
return result;
|
||||
}
|
||||
if let Some(result) = try_workday(input, &task_description, default_agent_id) {
|
||||
return result;
|
||||
}
|
||||
if let Some(result) = try_interval(input, &task_description, default_agent_id) {
|
||||
return result;
|
||||
}
|
||||
@@ -248,11 +249,21 @@ fn extract_task_description(input: &str) -> String {
|
||||
|
||||
// -- Pattern matchers (all use pre-compiled statics) --
|
||||
|
||||
/// Extract minute value from a regex capture group that may be a digit string or "半".
|
||||
/// Group 3 is the digit capture, group 4 is absent (used when "半" matches instead).
|
||||
fn extract_minute(caps: ®ex::Captures, digit_group: usize, han_group: usize) -> u32 {
|
||||
// Check if the "半" (half) group matched
|
||||
if caps.get(han_group).is_some() {
|
||||
return 30;
|
||||
}
|
||||
caps.get(digit_group).map(|m| m.as_str().parse().unwrap_or(0)).unwrap_or(0)
|
||||
}
|
||||
|
||||
fn try_every_day(input: &str, task_desc: &str, agent_id: &AgentId) -> Option<ScheduleParseResult> {
|
||||
if let Some(caps) = RE_EVERY_DAY_EXACT.captures(input) {
|
||||
let period = caps.get(1).map(|m| m.as_str());
|
||||
let raw_hour: u32 = caps.get(2)?.as_str().parse().ok()?;
|
||||
let minute: u32 = caps.get(3).map(|m| m.as_str().parse().unwrap_or(0)).unwrap_or(0);
|
||||
let minute: u32 = extract_minute(&caps, 3, 4);
|
||||
let hour = adjust_hour_for_period(raw_hour, period);
|
||||
if hour > 23 || minute > 59 {
|
||||
return None;
|
||||
@@ -288,7 +299,7 @@ fn try_every_week(input: &str, task_desc: &str, agent_id: &AgentId) -> Option<Sc
|
||||
let dow = weekday_to_cron(day_str)?;
|
||||
let period = caps.get(2).map(|m| m.as_str());
|
||||
let raw_hour: u32 = caps.get(3)?.as_str().parse().ok()?;
|
||||
let minute: u32 = caps.get(4).map(|m| m.as_str().parse().unwrap_or(0)).unwrap_or(0);
|
||||
let minute: u32 = extract_minute(&caps, 4, 5);
|
||||
let hour = adjust_hour_for_period(raw_hour, period);
|
||||
if hour > 23 || minute > 59 {
|
||||
return None;
|
||||
@@ -307,7 +318,7 @@ fn try_workday(input: &str, task_desc: &str, agent_id: &AgentId) -> Option<Sched
|
||||
if let Some(caps) = RE_WORKDAY_EXACT.captures(input) {
|
||||
let period = caps.get(1).map(|m| m.as_str());
|
||||
let raw_hour: u32 = caps.get(2)?.as_str().parse().ok()?;
|
||||
let minute: u32 = caps.get(3).map(|m| m.as_str().parse().unwrap_or(0)).unwrap_or(0);
|
||||
let minute: u32 = extract_minute(&caps, 3, 4);
|
||||
let hour = adjust_hour_for_period(raw_hour, period);
|
||||
if hour > 23 || minute > 59 {
|
||||
return None;
|
||||
@@ -366,7 +377,7 @@ fn try_monthly(input: &str, task_desc: &str, agent_id: &AgentId) -> Option<Sched
|
||||
let day: u32 = caps.get(1)?.as_str().parse().ok()?;
|
||||
let period = caps.get(2).map(|m| m.as_str());
|
||||
let raw_hour: u32 = caps.get(3).map(|m| m.as_str().parse().unwrap_or(9)).unwrap_or(9);
|
||||
let minute: u32 = caps.get(4).map(|m| m.as_str().parse().unwrap_or(0)).unwrap_or(0);
|
||||
let minute: u32 = extract_minute(&caps, 4, 5);
|
||||
let hour = adjust_hour_for_period(raw_hour, period);
|
||||
if day > 31 || hour > 23 || minute > 59 {
|
||||
return None;
|
||||
@@ -393,7 +404,7 @@ fn try_one_shot(input: &str, task_desc: &str, agent_id: &AgentId) -> Option<Sche
|
||||
};
|
||||
let period = caps.get(2).map(|m| m.as_str());
|
||||
let raw_hour: u32 = caps.get(3)?.as_str().parse().ok()?;
|
||||
let minute: u32 = caps.get(4).map(|m| m.as_str().parse().unwrap_or(0)).unwrap_or(0);
|
||||
let minute: u32 = extract_minute(&caps, 4, 5);
|
||||
let hour = adjust_hour_for_period(raw_hour, period);
|
||||
if hour > 23 || minute > 59 {
|
||||
return None;
|
||||
@@ -604,4 +615,79 @@ mod tests {
|
||||
fn test_task_description_extraction() {
|
||||
assert_eq!(extract_task_description("每天早上9点提醒我查房"), "查房");
|
||||
}
|
||||
|
||||
// --- New tests for BUG-3 (半) and BUG-4 (工作日每天) ---
|
||||
|
||||
#[test]
fn test_every_day_half_hour() {
    // "半" after the hour means half past: "每天早上8点半" is a daily 08:30 schedule.
    let parsed = parse_nl_schedule("每天早上8点半提醒我打卡", &default_agent());
    let cron: &str = match &parsed {
        ScheduleParseResult::Exact(schedule) => &schedule.cron_expression,
        other => panic!("Expected Exact, got {:?}", other),
    };
    assert_eq!(cron, "30 8 * * *");
}
|
||||
|
||||
#[test]
fn test_every_day_afternoon_half() {
    // The afternoon period shifts the hour: "下午3点半" is 15:30, every day.
    let parsed = parse_nl_schedule("每天下午3点半提醒我", &default_agent());
    let cron: &str = match &parsed {
        ScheduleParseResult::Exact(schedule) => &schedule.cron_expression,
        other => panic!("Expected Exact, got {:?}", other),
    };
    assert_eq!(cron, "30 15 * * *");
}
|
||||
|
||||
#[test]
fn test_workday_with_every_day_prefix() {
    // "工作日每天…" must be treated as a workday schedule (day-of-week 1-5),
    // not as a plain every-day schedule: expected 08:30 on days 1-5.
    let parsed = parse_nl_schedule("工作日每天早上8点半提醒我打卡", &default_agent());
    let cron: &str = match &parsed {
        ScheduleParseResult::Exact(schedule) => &schedule.cron_expression,
        other => panic!("Expected Exact, got {:?}", other),
    };
    assert_eq!(cron, "30 8 * * 1-5");
}
|
||||
|
||||
#[test]
fn test_workday_half_hour() {
    // Workday schedule with "半": "工作日下午5点半" is 17:30 on days 1-5.
    let parsed = parse_nl_schedule("工作日下午5点半提醒我写周报", &default_agent());
    let cron: &str = match &parsed {
        ScheduleParseResult::Exact(schedule) => &schedule.cron_expression,
        other => panic!("Expected Exact, got {:?}", other),
    };
    assert_eq!(cron, "30 17 * * 1-5");
}
|
||||
|
||||
#[test]
fn test_every_week_half_hour() {
    // Weekly schedule with "半": "每周一下午3点半" is 15:30 on day-of-week 1.
    let parsed = parse_nl_schedule("每周一下午3点半提醒我开会", &default_agent());
    let cron: &str = match &parsed {
        ScheduleParseResult::Exact(schedule) => &schedule.cron_expression,
        other => panic!("Expected Exact, got {:?}", other),
    };
    assert_eq!(cron, "30 15 * * 1");
}
|
||||
|
||||
#[test]
fn test_one_shot_half_hour() {
    // One-shot schedule with "半": "明天早上9点半" must resolve to a 09:30 time.
    let parsed = parse_nl_schedule("明天早上9点半提醒我开会", &default_agent());
    let when: &str = match &parsed {
        ScheduleParseResult::Exact(schedule) => &schedule.cron_expression,
        other => panic!("Expected Exact, got {:?}", other),
    };
    // The date part depends on the current day, so only the time-of-day
    // portion of the ISO-style timestamp is checked.
    assert!(when.contains("T09:30:"));
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user