fix(growth): CJK 记忆检索 TF-IDF 阈值过高导致注入失败
Some checks failed
CI / Lint & TypeCheck (push) Has been cancelled
CI / Unit Tests (push) Has been cancelled
CI / Build Frontend (push) Has been cancelled
CI / Rust Check (push) Has been cancelled
CI / Security Scan (push) Has been cancelled
CI / E2E Tests (push) Has been cancelled
Some checks failed
CI / Lint & TypeCheck (push) Has been cancelled
CI / Unit Tests (push) Has been cancelled
CI / Build Frontend (push) Has been cancelled
CI / Rust Check (push) Has been cancelled
CI / Security Scan (push) Has been cancelled
CI / E2E Tests (push) Has been cancelled
根因:SqliteStorage.find() 对 CJK 查询使用 LIKE fallback 获取候选,但 TF-IDF 评分因 unicode61 tokenizer 不支持 CJK 而系统性偏低,被默认阈值 min_similarity=0.7 全部过滤掉。修复:检测到 CJK 查询时将阈值降至原值的 50%(默认 0.7 → 0.35),避免所有记忆被误过滤。
This commit is contained in:
@@ -732,6 +732,11 @@ impl VikingStorage for SqliteStorage {
|
|||||||
async fn find(&self, query: &str, options: FindOptions) -> Result<Vec<MemoryEntry>> {
|
async fn find(&self, query: &str, options: FindOptions) -> Result<Vec<MemoryEntry>> {
|
||||||
let limit = options.limit.unwrap_or(50).max(20); // Fetch more candidates for reranking
|
let limit = options.limit.unwrap_or(50).max(20); // Fetch more candidates for reranking
|
||||||
|
|
||||||
|
// Detect CJK (Han ideograph ranges only; kana and Hangul are not matched) early — used both for the LIKE fallback and for similarity threshold relaxation
|
||||||
|
let has_cjk = query.chars().any(|c| {
|
||||||
|
matches!(c, '\u{4E00}'..='\u{9FFF}' | '\u{3400}'..='\u{4DBF}' | '\u{F900}'..='\u{FAFF}')
|
||||||
|
});
|
||||||
|
|
||||||
// Strategy: use FTS5 for initial filtering when query is non-empty,
|
// Strategy: use FTS5 for initial filtering when query is non-empty,
|
||||||
// then score candidates with TF-IDF / embedding for precise ranking.
|
// then score candidates with TF-IDF / embedding for precise ranking.
|
||||||
// When FTS5 returns nothing, we return empty — do NOT fall back to
|
// When FTS5 returns nothing, we return empty — do NOT fall back to
|
||||||
@@ -792,9 +797,6 @@ impl VikingStorage for SqliteStorage {
|
|||||||
// FTS5 returned no results or failed — check if query contains CJK
|
// FTS5 returned no results or failed — check if query contains CJK
|
||||||
// characters. unicode61 tokenizer doesn't index CJK, so fall back
|
// characters. unicode61 tokenizer doesn't index CJK, so fall back
|
||||||
// to LIKE-based search for CJK queries.
|
// to LIKE-based search for CJK queries.
|
||||||
let has_cjk = query.chars().any(|c| {
|
|
||||||
matches!(c, '\u{4E00}'..='\u{9FFF}' | '\u{3400}'..='\u{4DBF}' | '\u{F900}'..='\u{FAFF}')
|
|
||||||
});
|
|
||||||
|
|
||||||
if !has_cjk {
|
if !has_cjk {
|
||||||
tracing::debug!(
|
tracing::debug!(
|
||||||
@@ -897,9 +899,17 @@ impl VikingStorage for SqliteStorage {
|
|||||||
scorer.score_similarity(query, &entry)
|
scorer.score_similarity(query, &entry)
|
||||||
};
|
};
|
||||||
|
|
||||||
// Apply similarity threshold
|
// Apply similarity threshold (relaxed for CJK queries since unicode61
|
||||||
|
// tokenizer doesn't produce meaningful TF-IDF scores for CJK text)
|
||||||
if let Some(min_similarity) = options.min_similarity {
|
if let Some(min_similarity) = options.min_similarity {
|
||||||
if semantic_score < min_similarity {
|
let threshold = if has_cjk {
|
||||||
|
// CJK TF-IDF scores are systematically low due to tokenizer limitations;
|
||||||
|
// use 50% of the normal threshold to avoid filtering out all results
|
||||||
|
min_similarity * 0.5
|
||||||
|
} else {
|
||||||
|
min_similarity
|
||||||
|
};
|
||||||
|
if semantic_score < threshold {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user