fix(growth,runtime,desktop): E2E 验证 4 项 Bug 修复
Some checks are pending
CI / Lint & TypeCheck (push) Waiting to run
CI / Unit Tests (push) Waiting to run
CI / Build Frontend (push) Waiting to run
CI / Rust Check (push) Waiting to run
CI / Security Scan (push) Waiting to run
CI / E2E Tests (push) Blocked by required conditions

P1 BUG-1: SemanticScorer CJK 分词缺失导致 TF-IDF 相似度为 0
- 新增 CJK bigram 分词: "北京工作" → ["北京","京工","工作","北京工作"]
- 非CJK文本保持原有分割逻辑
- 3 个新测试: bigram 生成 + 混合文本 + CJK 相似度>0

P1 BUG-2: streamStore lifecycle:end 未记录 token 使用量
- AgentStreamDelta 增加 input_tokens/output_tokens 字段
- lifecycle:end 处理中检查并调用 addTokenUsage

P2 BUG-3: NlScheduleParser "X点半" 解析为整点
- 所有时间正则增加可选的 (半) 捕获组
- extract_minute 辅助函数: 半 → 30

P2 BUG-4: NlScheduleParser "工作日每天" 未转为 1-5
- RE_WORKDAY_EXACT 支持 (每天|每日)? 中缀
- try_workday 优先级提升至 try_every_day 之前

E2E 报告: docs/E2E_TEST_REPORT_2026_04_19.md
测试: 806 passed / 0 failed (含 9 个新增测试)
This commit is contained in:
iven
2026-04-20 00:07:07 +08:00
parent 39768ff598
commit 24b866fc28
5 changed files with 405 additions and 21 deletions

View File

@@ -122,13 +122,65 @@ impl SemanticScorer {
.collect()
}
/// Tokenize text into words
/// Tokenize text into words with CJK-aware bigram support.
///
/// For ASCII/latin text, splits on non-alphanumeric boundaries as before.
/// For CJK text, generates character-level bigrams (e.g. "北京工作" → ["北京", "京工", "工作"])
/// so that TF-IDF cosine similarity works for CJK queries.
fn tokenize(text: &str) -> Vec<String> {
text.to_lowercase()
.split(|c: char| !c.is_alphanumeric())
.filter(|s| !s.is_empty() && s.len() > 1)
.map(|s| s.to_string())
.collect()
let lower = text.to_lowercase();
let mut tokens = Vec::new();
// Split into segments: each segment is either pure CJK or non-CJK
let mut cjk_buf = String::new();
let mut latin_buf = String::new();
let flush_latin = |buf: &mut String, tokens: &mut Vec<String>| {
if !buf.is_empty() {
for word in buf.split(|c: char| !c.is_alphanumeric()) {
if !word.is_empty() && word.len() > 1 {
tokens.push(word.to_string());
}
}
buf.clear();
}
};
let flush_cjk = |buf: &mut String, tokens: &mut Vec<String>| {
if buf.is_empty() {
return;
}
let chars: Vec<char> = buf.chars().collect();
// Generate bigrams for CJK
if chars.len() >= 2 {
for i in 0..chars.len() - 1 {
tokens.push(format!("{}{}", chars[i], chars[i + 1]));
}
}
// Also include the full CJK segment as a single token for exact-match bonus
if chars.len() > 1 {
tokens.push(buf.clone());
}
buf.clear();
};
for c in lower.chars() {
if is_cjk_char(c) {
flush_latin(&mut latin_buf, &mut tokens);
cjk_buf.push(c);
} else if c.is_alphanumeric() {
flush_cjk(&mut cjk_buf, &mut tokens);
latin_buf.push(c);
} else {
// Non-alphanumeric, non-CJK: flush both
flush_latin(&mut latin_buf, &mut tokens);
flush_cjk(&mut cjk_buf, &mut tokens);
}
}
flush_latin(&mut latin_buf, &mut tokens);
flush_cjk(&mut cjk_buf, &mut tokens);
tokens
}
/// Remove stop words from tokens
@@ -409,6 +461,20 @@ impl Default for SemanticScorer {
}
}
/// Check if a character is a CJK ideograph
fn is_cjk_char(c: char) -> bool {
matches!(c,
'\u{4E00}'..='\u{9FFF}' |
'\u{3400}'..='\u{4DBF}' |
'\u{20000}'..='\u{2A6DF}' |
'\u{2A700}'..='\u{2B73F}' |
'\u{2B740}'..='\u{2B81F}' |
'\u{2B820}'..='\u{2CEAF}' |
'\u{F900}'..='\u{FAFF}' |
'\u{2F800}'..='\u{2FA1F}'
)
}
/// Index statistics
#[derive(Debug, Clone)]
pub struct IndexStats {
@@ -430,6 +496,42 @@ mod tests {
assert_eq!(tokens, vec!["hello", "world", "this", "is", "test"]);
}
#[test]
fn test_tokenize_cjk_bigrams() {
// CJK text should produce bigrams + full segment token
let tokens = SemanticScorer::tokenize("北京工作");
assert!(tokens.contains(&"北京".to_string()), "should contain bigram 北京");
assert!(tokens.contains(&"京工".to_string()), "should contain bigram 京工");
assert!(tokens.contains(&"工作".to_string()), "should contain bigram 工作");
assert!(tokens.contains(&"北京工作".to_string()), "should contain full segment");
}
#[test]
fn test_tokenize_mixed_cjk_latin() {
// Mixed CJK and latin should handle both
let tokens = SemanticScorer::tokenize("我在北京工作用Python写脚本");
// CJK bigrams
assert!(tokens.contains(&"我在".to_string()));
assert!(tokens.contains(&"北京".to_string()));
// Latin word
assert!(tokens.contains(&"python".to_string()));
}
#[test]
fn test_cjk_similarity() {
let mut scorer = SemanticScorer::new();
let entry = MemoryEntry::new(
"test", MemoryType::Preference, "test",
"用户在北京工作做AI产品经理".to_string(),
);
scorer.index_entry(&entry);
// Query "北京" should have non-zero similarity after bigram fix
let score = scorer.score_similarity("北京", &entry);
assert!(score > 0.0, "CJK query should score > 0 after bigram tokenization, got {}", score);
}
#[test]
fn test_stop_words_removal() {
let scorer = SemanticScorer::new();