diff --git a/crates/zclaw-growth/src/storage/sqlite.rs b/crates/zclaw-growth/src/storage/sqlite.rs index 2dc24d3..60692ec 100644 --- a/crates/zclaw-growth/src/storage/sqlite.rs +++ b/crates/zclaw-growth/src/storage/sqlite.rs @@ -732,6 +732,11 @@ impl VikingStorage for SqliteStorage { async fn find(&self, query: &str, options: FindOptions) -> Result> { let limit = options.limit.unwrap_or(50).max(20); // Fetch more candidates for reranking + // Detect CJK early — used both for LIKE fallback and similarity threshold relaxation + let has_cjk = query.chars().any(|c| { + matches!(c, '\u{4E00}'..='\u{9FFF}' | '\u{3400}'..='\u{4DBF}' | '\u{F900}'..='\u{FAFF}') + }); + // Strategy: use FTS5 for initial filtering when query is non-empty, // then score candidates with TF-IDF / embedding for precise ranking. // When FTS5 returns nothing, we return empty — do NOT fall back to @@ -792,9 +797,6 @@ impl VikingStorage for SqliteStorage { // FTS5 returned no results or failed — check if query contains CJK // characters. unicode61 tokenizer doesn't index CJK, so fall back // to LIKE-based search for CJK queries. - let has_cjk = query.chars().any(|c| { - matches!(c, '\u{4E00}'..='\u{9FFF}' | '\u{3400}'..='\u{4DBF}' | '\u{F900}'..='\u{FAFF}') - }); if !has_cjk { tracing::debug!( @@ -897,9 +899,17 @@ impl VikingStorage for SqliteStorage { scorer.score_similarity(query, &entry) }; - // Apply similarity threshold + // Apply similarity threshold (relaxed for CJK queries since unicode61 + // tokenizer doesn't produce meaningful TF-IDF scores for CJK text) if let Some(min_similarity) = options.min_similarity { - if semantic_score < min_similarity { + let threshold = if has_cjk { + // CJK TF-IDF scores are systematically low due to tokenizer limitations; + // use 50% of the normal threshold to avoid filtering out all results + min_similarity * 0.5 + } else { + min_similarity + }; + if semantic_score < threshold { continue; } }