From 39768ff5989023510f336695bb1392f4476dff07 Mon Sep 17 00:00:00 2001 From: iven Date: Sun, 19 Apr 2026 22:23:32 +0800 Subject: [PATCH] =?UTF-8?q?fix(growth):=20CJK=20=E8=AE=B0=E5=BF=86?= =?UTF-8?q?=E6=A3=80=E7=B4=A2=20TF-IDF=20=E9=98=88=E5=80=BC=E8=BF=87?= =?UTF-8?q?=E9=AB=98=E5=AF=BC=E8=87=B4=E6=B3=A8=E5=85=A5=E5=A4=B1=E8=B4=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 根因: SqliteStorage.find() 对 CJK 查询使用 LIKE fallback 获取候选, 但 TF-IDF 评分因 unicode61 tokenizer 不支持 CJK 而系统性地偏低, 被默认 min_similarity=0.7 阈值全部过滤掉。 修复: 检测到 CJK 查询时将阈值降至 50%(0.35),避免所有记忆被误过滤。 --- crates/zclaw-growth/src/storage/sqlite.rs | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/crates/zclaw-growth/src/storage/sqlite.rs b/crates/zclaw-growth/src/storage/sqlite.rs index 2dc24d3..60692ec 100644 --- a/crates/zclaw-growth/src/storage/sqlite.rs +++ b/crates/zclaw-growth/src/storage/sqlite.rs @@ -732,6 +732,11 @@ impl VikingStorage for SqliteStorage { async fn find(&self, query: &str, options: FindOptions) -> Result> { let limit = options.limit.unwrap_or(50).max(20); // Fetch more candidates for reranking + // Detect CJK early — used both for LIKE fallback and similarity threshold relaxation + let has_cjk = query.chars().any(|c| { + matches!(c, '\u{4E00}'..='\u{9FFF}' | '\u{3400}'..='\u{4DBF}' | '\u{F900}'..='\u{FAFF}') + }); + // Strategy: use FTS5 for initial filtering when query is non-empty, // then score candidates with TF-IDF / embedding for precise ranking. // When FTS5 returns nothing, we return empty — do NOT fall back to @@ -792,9 +797,6 @@ impl VikingStorage for SqliteStorage { // FTS5 returned no results or failed — check if query contains CJK // characters. unicode61 tokenizer doesn't index CJK, so fall back // to LIKE-based search for CJK queries. - let has_cjk = query.chars().any(|c| { - matches!(c, '\u{4E00}'..='\u{9FFF}' | '\u{3400}'..='\u{4DBF}' | '\u{F900}'..='\u{FAFF}') - }); if !has_cjk { tracing::debug!( @@ -897,9 +899,17 @@ impl VikingStorage for SqliteStorage { scorer.score_similarity(query, &entry) }; - // Apply similarity threshold + // Apply similarity threshold (relaxed for CJK queries since unicode61 + // tokenizer doesn't produce meaningful TF-IDF scores for CJK text) if let Some(min_similarity) = options.min_similarity { - if semantic_score < min_similarity { + let threshold = if has_cjk { + // CJK TF-IDF scores are systematically low due to tokenizer limitations; + // use 50% of the normal threshold to avoid filtering out all results + min_similarity * 0.5 + } else { + min_similarity + }; + if semantic_score < threshold { continue; } }