fix(growth,runtime,desktop): E2E 验证 4 项 Bug 修复
Some checks are pending
CI / Lint & TypeCheck (push) Waiting to run
CI / Unit Tests (push) Waiting to run
CI / Build Frontend (push) Waiting to run
CI / Rust Check (push) Waiting to run
CI / Security Scan (push) Waiting to run
CI / E2E Tests (push) Blocked by required conditions
P1 BUG-1: SemanticScorer CJK 分词缺失导致 TF-IDF 相似度为 0
- 新增 CJK bigram 分词: "北京工作" → ["北京","京工","工作","北京工作"]
- 非CJK文本保持原有分割逻辑
- 3 个新测试: bigram 生成 + 混合文本 + CJK 相似度>0

P1 BUG-2: streamStore lifecycle:end 未记录 token 使用量
- AgentStreamDelta 增加 input_tokens/output_tokens 字段
- lifecycle:end 处理中检查并调用 addTokenUsage

P2 BUG-3: NlScheduleParser "X点半" 解析为整点
- 所有时间正则增加可选的 (半) 捕获组
- extract_minute 辅助函数: 半 → 30

P2 BUG-4: NlScheduleParser "工作日每天" 未转为 1-5
- RE_WORKDAY_EXACT 支持 (每天|每日)? 中缀
- try_workday 优先级提升至 try_every_day 之前

E2E 报告: docs/E2E_TEST_REPORT_2026_04_19.md
测试: 806 passed / 0 failed (含 9 个新增测试)
This commit is contained in:
@@ -122,13 +122,65 @@ impl SemanticScorer {
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Tokenize text into words
|
||||
/// Tokenize text into words with CJK-aware bigram support.
|
||||
///
|
||||
/// For ASCII/latin text, splits on non-alphanumeric boundaries as before.
|
||||
/// For CJK text, generates character-level bigrams (e.g. "北京工作" → ["北京", "京工", "工作"])
|
||||
/// so that TF-IDF cosine similarity works for CJK queries.
|
||||
fn tokenize(text: &str) -> Vec<String> {
|
||||
text.to_lowercase()
|
||||
.split(|c: char| !c.is_alphanumeric())
|
||||
.filter(|s| !s.is_empty() && s.len() > 1)
|
||||
.map(|s| s.to_string())
|
||||
.collect()
|
||||
let lower = text.to_lowercase();
|
||||
let mut tokens = Vec::new();
|
||||
|
||||
// Split into segments: each segment is either pure CJK or non-CJK
|
||||
let mut cjk_buf = String::new();
|
||||
let mut latin_buf = String::new();
|
||||
|
||||
let flush_latin = |buf: &mut String, tokens: &mut Vec<String>| {
|
||||
if !buf.is_empty() {
|
||||
for word in buf.split(|c: char| !c.is_alphanumeric()) {
|
||||
if !word.is_empty() && word.len() > 1 {
|
||||
tokens.push(word.to_string());
|
||||
}
|
||||
}
|
||||
buf.clear();
|
||||
}
|
||||
};
|
||||
|
||||
let flush_cjk = |buf: &mut String, tokens: &mut Vec<String>| {
|
||||
if buf.is_empty() {
|
||||
return;
|
||||
}
|
||||
let chars: Vec<char> = buf.chars().collect();
|
||||
// Generate bigrams for CJK
|
||||
if chars.len() >= 2 {
|
||||
for i in 0..chars.len() - 1 {
|
||||
tokens.push(format!("{}{}", chars[i], chars[i + 1]));
|
||||
}
|
||||
}
|
||||
// Also include the full CJK segment as a single token for exact-match bonus
|
||||
if chars.len() > 1 {
|
||||
tokens.push(buf.clone());
|
||||
}
|
||||
buf.clear();
|
||||
};
|
||||
|
||||
for c in lower.chars() {
|
||||
if is_cjk_char(c) {
|
||||
flush_latin(&mut latin_buf, &mut tokens);
|
||||
cjk_buf.push(c);
|
||||
} else if c.is_alphanumeric() {
|
||||
flush_cjk(&mut cjk_buf, &mut tokens);
|
||||
latin_buf.push(c);
|
||||
} else {
|
||||
// Non-alphanumeric, non-CJK: flush both
|
||||
flush_latin(&mut latin_buf, &mut tokens);
|
||||
flush_cjk(&mut cjk_buf, &mut tokens);
|
||||
}
|
||||
}
|
||||
flush_latin(&mut latin_buf, &mut tokens);
|
||||
flush_cjk(&mut cjk_buf, &mut tokens);
|
||||
|
||||
tokens
|
||||
}
|
||||
|
||||
/// Remove stop words from tokens
|
||||
@@ -409,6 +461,20 @@ impl Default for SemanticScorer {
|
||||
}
|
||||
}
|
||||
|
||||
/// Check if a character is a CJK ideograph
|
||||
fn is_cjk_char(c: char) -> bool {
|
||||
matches!(c,
|
||||
'\u{4E00}'..='\u{9FFF}' |
|
||||
'\u{3400}'..='\u{4DBF}' |
|
||||
'\u{20000}'..='\u{2A6DF}' |
|
||||
'\u{2A700}'..='\u{2B73F}' |
|
||||
'\u{2B740}'..='\u{2B81F}' |
|
||||
'\u{2B820}'..='\u{2CEAF}' |
|
||||
'\u{F900}'..='\u{FAFF}' |
|
||||
'\u{2F800}'..='\u{2FA1F}'
|
||||
)
|
||||
}
|
||||
|
||||
/// Index statistics
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct IndexStats {
|
||||
@@ -430,6 +496,42 @@ mod tests {
|
||||
assert_eq!(tokens, vec!["hello", "world", "this", "is", "test"]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_tokenize_cjk_bigrams() {
|
||||
// CJK text should produce bigrams + full segment token
|
||||
let tokens = SemanticScorer::tokenize("北京工作");
|
||||
assert!(tokens.contains(&"北京".to_string()), "should contain bigram 北京");
|
||||
assert!(tokens.contains(&"京工".to_string()), "should contain bigram 京工");
|
||||
assert!(tokens.contains(&"工作".to_string()), "should contain bigram 工作");
|
||||
assert!(tokens.contains(&"北京工作".to_string()), "should contain full segment");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_tokenize_mixed_cjk_latin() {
|
||||
// Mixed CJK and latin should handle both
|
||||
let tokens = SemanticScorer::tokenize("我在北京工作,用Python写脚本");
|
||||
// CJK bigrams
|
||||
assert!(tokens.contains(&"我在".to_string()));
|
||||
assert!(tokens.contains(&"北京".to_string()));
|
||||
// Latin word
|
||||
assert!(tokens.contains(&"python".to_string()));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_cjk_similarity() {
|
||||
let mut scorer = SemanticScorer::new();
|
||||
|
||||
let entry = MemoryEntry::new(
|
||||
"test", MemoryType::Preference, "test",
|
||||
"用户在北京工作,做AI产品经理".to_string(),
|
||||
);
|
||||
scorer.index_entry(&entry);
|
||||
|
||||
// Query "北京" should have non-zero similarity after bigram fix
|
||||
let score = scorer.score_similarity("北京", &entry);
|
||||
assert!(score > 0.0, "CJK query should score > 0 after bigram tokenization, got {}", score);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_stop_words_removal() {
|
||||
let scorer = SemanticScorer::new();
|
||||
|
||||
Reference in New Issue
Block a user