fix(growth): CJK 记忆检索 TF-IDF 阈值过高导致注入失败
Some checks failed
CI / Lint & TypeCheck (push) Has been cancelled
CI / Unit Tests (push) Has been cancelled
CI / Build Frontend (push) Has been cancelled
CI / Rust Check (push) Has been cancelled
CI / Security Scan (push) Has been cancelled
CI / E2E Tests (push) Has been cancelled

根因: SqliteStorage.find() 对 CJK 查询使用 LIKE fallback 获取候选,
但由于 unicode61 tokenizer 不支持 CJK, 这些候选的 TF-IDF 评分系统性地偏低,
结果候选被默认的 min_similarity=0.7 阈值全部过滤掉。

修复: 检测到 CJK 查询时将相似度阈值降至原值的 50%(即 min_similarity * 0.5,默认 0.7 时为 0.35),避免所有记忆被误过滤。
This commit is contained in:
iven
2026-04-19 22:23:32 +08:00
parent 3ee68fa763
commit 39768ff598

View File

@@ -732,6 +732,11 @@ impl VikingStorage for SqliteStorage {
async fn find(&self, query: &str, options: FindOptions) -> Result<Vec<MemoryEntry>> { async fn find(&self, query: &str, options: FindOptions) -> Result<Vec<MemoryEntry>> {
let limit = options.limit.unwrap_or(50).max(20); // Fetch more candidates for reranking let limit = options.limit.unwrap_or(50).max(20); // Fetch more candidates for reranking
// Detect CJK early — used both for LIKE fallback and similarity threshold relaxation
let has_cjk = query.chars().any(|c| {
matches!(c, '\u{4E00}'..='\u{9FFF}' | '\u{3400}'..='\u{4DBF}' | '\u{F900}'..='\u{FAFF}')
});
// Strategy: use FTS5 for initial filtering when query is non-empty, // Strategy: use FTS5 for initial filtering when query is non-empty,
// then score candidates with TF-IDF / embedding for precise ranking. // then score candidates with TF-IDF / embedding for precise ranking.
// When FTS5 returns nothing, we return empty — do NOT fall back to // When FTS5 returns nothing, we return empty — do NOT fall back to
@@ -792,9 +797,6 @@ impl VikingStorage for SqliteStorage {
// FTS5 returned no results or failed — check if query contains CJK // FTS5 returned no results or failed — check if query contains CJK
// characters. unicode61 tokenizer doesn't index CJK, so fall back // characters. unicode61 tokenizer doesn't index CJK, so fall back
// to LIKE-based search for CJK queries. // to LIKE-based search for CJK queries.
let has_cjk = query.chars().any(|c| {
matches!(c, '\u{4E00}'..='\u{9FFF}' | '\u{3400}'..='\u{4DBF}' | '\u{F900}'..='\u{FAFF}')
});
if !has_cjk { if !has_cjk {
tracing::debug!( tracing::debug!(
@@ -897,9 +899,17 @@ impl VikingStorage for SqliteStorage {
scorer.score_similarity(query, &entry) scorer.score_similarity(query, &entry)
}; };
// Apply similarity threshold // Apply similarity threshold (relaxed for CJK queries since unicode61
// tokenizer doesn't produce meaningful TF-IDF scores for CJK text)
if let Some(min_similarity) = options.min_similarity { if let Some(min_similarity) = options.min_similarity {
if semantic_score < min_similarity { let threshold = if has_cjk {
// CJK TF-IDF scores are systematically low due to tokenizer limitations;
// use 50% of the normal threshold to avoid filtering out all results
min_similarity * 0.5
} else {
min_similarity
};
if semantic_score < threshold {
continue; continue;
} }
} }