fix(growth): CJK 记忆检索 TF-IDF 阈值过高导致注入失败
Some checks failed
CI / Lint & TypeCheck (push) Has been cancelled
CI / Unit Tests (push) Has been cancelled
CI / Build Frontend (push) Has been cancelled
CI / Rust Check (push) Has been cancelled
CI / Security Scan (push) Has been cancelled
CI / E2E Tests (push) Has been cancelled
Some checks failed
CI / Lint & TypeCheck (push) Has been cancelled
CI / Unit Tests (push) Has been cancelled
CI / Build Frontend (push) Has been cancelled
CI / Rust Check (push) Has been cancelled
CI / Security Scan (push) Has been cancelled
CI / E2E Tests (push) Has been cancelled
根因: SqliteStorage.find() 对 CJK 查询使用 LIKE fallback 获取候选, 但由于 unicode61 tokenizer 不支持 CJK 分词, 这些候选的 TF-IDF 评分系统性偏低, 结果被默认阈值 min_similarity=0.7 全部过滤掉。 修复: 检测到 CJK 查询时将阈值降至原值的 50%(默认 0.7 → 0.35), 避免所有记忆被误过滤。
This commit is contained in:
@@ -732,6 +732,11 @@ impl VikingStorage for SqliteStorage {
|
||||
async fn find(&self, query: &str, options: FindOptions) -> Result<Vec<MemoryEntry>> {
|
||||
let limit = options.limit.unwrap_or(50).max(20); // Fetch more candidates for reranking
|
||||
|
||||
// Detect CJK early — used both for LIKE fallback and similarity threshold relaxation
|
||||
let has_cjk = query.chars().any(|c| {
|
||||
matches!(c, '\u{4E00}'..='\u{9FFF}' | '\u{3400}'..='\u{4DBF}' | '\u{F900}'..='\u{FAFF}')
|
||||
});
|
||||
|
||||
// Strategy: use FTS5 for initial filtering when query is non-empty,
|
||||
// then score candidates with TF-IDF / embedding for precise ranking.
|
||||
// When FTS5 returns nothing, we return empty — do NOT fall back to
|
||||
@@ -792,9 +797,6 @@ impl VikingStorage for SqliteStorage {
|
||||
// FTS5 returned no results or failed — check if query contains CJK
|
||||
// characters. unicode61 tokenizer doesn't index CJK, so fall back
|
||||
// to LIKE-based search for CJK queries.
|
||||
let has_cjk = query.chars().any(|c| {
|
||||
matches!(c, '\u{4E00}'..='\u{9FFF}' | '\u{3400}'..='\u{4DBF}' | '\u{F900}'..='\u{FAFF}')
|
||||
});
|
||||
|
||||
if !has_cjk {
|
||||
tracing::debug!(
|
||||
@@ -897,9 +899,17 @@ impl VikingStorage for SqliteStorage {
|
||||
scorer.score_similarity(query, &entry)
|
||||
};
|
||||
|
||||
// Apply similarity threshold
|
||||
// Apply similarity threshold (relaxed for CJK queries since unicode61
|
||||
// tokenizer doesn't produce meaningful TF-IDF scores for CJK text)
|
||||
if let Some(min_similarity) = options.min_similarity {
|
||||
if semantic_score < min_similarity {
|
||||
let threshold = if has_cjk {
|
||||
// CJK TF-IDF scores are systematically low due to tokenizer limitations;
|
||||
// use 50% of the normal threshold to avoid filtering out all results
|
||||
min_similarity * 0.5
|
||||
} else {
|
||||
min_similarity
|
||||
};
|
||||
if semantic_score < threshold {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user