Files
zclaw_openfang/crates/zclaw-growth/src/retrieval/query.rs
Commit 1bf0d3a73d — fix(memory): CJK-aware short query threshold + Chinese synonym expansion
1. MemoryMiddleware: replace byte-length check (query.len() < 4) with
   char-count check (query.chars().count() < 2). Single CJK characters
   are 3 UTF-8 bytes but 1 meaningful character — the old threshold
   incorrectly skipped 1-2 char Chinese queries like "你好".

2. QueryAnalyzer: add Chinese synonym mappings for 13 common technical
   terms (错误→bug, 优化→improve, 配置→config, etc.) so CJK queries
   can find relevant English-keyword memories and vice versa.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-02 01:21:29 +08:00

368 lines
12 KiB
Rust

//! Query Analyzer
//!
//! Provides query analysis and expansion capabilities for improved retrieval.
//! Extracts keywords, identifies intent, and generates search variations.
use crate::types::MemoryType;
use std::collections::HashSet;
/// Query analysis result.
///
/// Produced by `QueryAnalyzer::analyze`; bundles the raw query string with
/// everything derived from it so downstream retrieval can use whichever
/// pieces it needs.
#[derive(Debug, Clone)]
pub struct AnalyzedQuery {
    /// Original query string, unmodified.
    pub original: String,
    /// Extracted keywords (lowercased, stop words removed).
    pub keywords: Vec<String>,
    /// Query intent classified from the keywords.
    pub intent: QueryIntent,
    /// Memory types to search (inferred from the intent).
    pub target_types: Vec<MemoryType>,
    /// Expanded search terms (plural/singular variants and synonyms).
    pub expansions: Vec<String>,
}
/// Query intent classification.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum QueryIntent {
    /// Looking for preferences/settings.
    Preference,
    /// Looking for factual knowledge.
    Knowledge,
    /// Looking for how-to/experience.
    Experience,
    /// General conversation — the fallback when no indicator keyword matches.
    General,
    /// Code-related query.
    Code,
    /// Configuration query.
    ///
    /// NOTE(review): `classify_intent` in this file never produces this
    /// variant (it only scores Preference/Knowledge/Experience/Code);
    /// presumably other code constructs it — confirm before relying on it.
    Configuration,
}
/// Query analyzer.
///
/// Holds the (English + Chinese) indicator-keyword sets used for intent
/// classification and the stop-word list used during keyword extraction.
/// All sets are populated once in `new()`.
pub struct QueryAnalyzer {
    /// Keywords that indicate preference queries.
    preference_indicators: HashSet<String>,
    /// Keywords that indicate knowledge queries.
    knowledge_indicators: HashSet<String>,
    /// Keywords that indicate experience queries.
    experience_indicators: HashSet<String>,
    /// Keywords that indicate code queries.
    code_indicators: HashSet<String>,
    /// Stop words to filter out during keyword extraction (English only).
    stop_words: HashSet<String>,
}
impl QueryAnalyzer {
/// Create a new query analyzer
pub fn new() -> Self {
Self {
preference_indicators: [
"prefer", "like", "want", "favorite", "favourite", "style",
"format", "language", "setting", "preference", "usually",
"typically", "always", "never", "习惯", "偏好", "喜欢", "想要",
]
.iter()
.map(|s| s.to_string())
.collect(),
knowledge_indicators: [
"what", "how", "why", "explain", "tell", "know", "learn",
"understand", "meaning", "definition", "concept", "theory",
"是什么", "怎么", "为什么", "解释", "了解", "知道",
]
.iter()
.map(|s| s.to_string())
.collect(),
experience_indicators: [
"experience", "tried", "used", "before", "last time",
"previous", "history", "remember", "recall", "when",
"经验", "尝试", "用过", "上次", "记得", "回忆",
]
.iter()
.map(|s| s.to_string())
.collect(),
code_indicators: [
"code", "function", "class", "method", "variable", "type",
"error", "bug", "fix", "implement", "refactor", "api",
"代码", "函数", "", "方法", "变量", "错误", "修复", "实现",
]
.iter()
.map(|s| s.to_string())
.collect(),
stop_words: [
"the", "a", "an", "is", "are", "was", "were", "be", "been",
"have", "has", "had", "do", "does", "did", "will", "would",
"could", "should", "may", "might", "must", "can", "to", "of",
"in", "for", "on", "with", "at", "by", "from", "as", "and",
"or", "but", "if", "then", "else", "when", "where", "which",
"who", "whom", "whose", "this", "that", "these", "those",
]
.iter()
.map(|s| s.to_string())
.collect(),
}
}
/// Analyze a query string
pub fn analyze(&self, query: &str) -> AnalyzedQuery {
let keywords = self.extract_keywords(query);
let intent = self.classify_intent(&keywords);
let target_types = self.infer_memory_types(intent, &keywords);
let expansions = self.expand_query(&keywords);
AnalyzedQuery {
original: query.to_string(),
keywords,
intent,
target_types,
expansions,
}
}
/// Extract keywords from query
fn extract_keywords(&self, query: &str) -> Vec<String> {
query
.to_lowercase()
.split(|c: char| !c.is_alphanumeric() && !is_cjk(c))
.filter(|s| !s.is_empty() && s.len() > 1)
.filter(|s| !self.stop_words.contains(*s))
.map(|s| s.to_string())
.collect()
}
/// Classify query intent
fn classify_intent(&self, keywords: &[String]) -> QueryIntent {
let mut scores = [
(QueryIntent::Preference, 0),
(QueryIntent::Knowledge, 0),
(QueryIntent::Experience, 0),
(QueryIntent::Code, 0),
];
for keyword in keywords {
if self.preference_indicators.contains(keyword) {
scores[0].1 += 2;
}
if self.knowledge_indicators.contains(keyword) {
scores[1].1 += 2;
}
if self.experience_indicators.contains(keyword) {
scores[2].1 += 2;
}
if self.code_indicators.contains(keyword) {
scores[3].1 += 2;
}
}
// Find highest scoring intent
scores.sort_by(|a, b| b.1.cmp(&a.1));
if scores[0].1 > 0 {
scores[0].0
} else {
QueryIntent::General
}
}
/// Infer which memory types to search
fn infer_memory_types(&self, intent: QueryIntent, _keywords: &[String]) -> Vec<MemoryType> {
let mut types = Vec::new();
match intent {
QueryIntent::Preference => {
types.push(MemoryType::Preference);
}
QueryIntent::Knowledge | QueryIntent::Code => {
types.push(MemoryType::Knowledge);
types.push(MemoryType::Experience);
}
QueryIntent::Experience => {
types.push(MemoryType::Experience);
types.push(MemoryType::Knowledge);
}
QueryIntent::General => {
// Search all types
types.push(MemoryType::Preference);
types.push(MemoryType::Knowledge);
types.push(MemoryType::Experience);
}
QueryIntent::Configuration => {
types.push(MemoryType::Preference);
types.push(MemoryType::Knowledge);
}
}
types
}
/// Expand query with related terms
fn expand_query(&self, keywords: &[String]) -> Vec<String> {
let mut expansions = Vec::new();
// Add stemmed variations (simplified)
for keyword in keywords {
// Add singular/plural variations
if keyword.ends_with('s') && keyword.len() > 3 {
expansions.push(keyword[..keyword.len()-1].to_string());
} else {
expansions.push(format!("{}s", keyword));
}
// Add common synonyms (simplified)
if let Some(synonyms) = self.get_synonyms(keyword) {
expansions.extend(synonyms);
}
}
expansions
}
/// Get synonyms for a keyword (simplified, English + Chinese)
fn get_synonyms(&self, keyword: &str) -> Option<Vec<String>> {
let synonyms: &[&str] = match keyword {
// English synonyms
"code" => &["program", "script", "source"],
"error" => &["bug", "issue", "problem", "exception"],
"fix" => &["solve", "resolve", "repair", "patch"],
"fast" => &["quick", "speed", "performance", "efficient"],
"slow" => &["performance", "optimize", "speed"],
"help" => &["assist", "support", "guide", "aid"],
"learn" => &["study", "understand", "know", "grasp"],
// Chinese synonyms — critical for Chinese-language queries
"错误" => &["问题", "bug", "异常", "故障"],
"修复" => &["解决", "修正", "处理", "fix"],
"优化" => &["改进", "提升", "加速", "improve"],
"配置" => &["设置", "参数", "选项", "config"],
"性能" => &["速度", "效率", "performance"],
"问题" => &["错误", "故障", "issue", "problem"],
"帮助" => &["协助", "支持", "help"],
"学习" => &["了解", "掌握", "learn"],
"代码" => &["程序", "脚本", "code"],
"数据库" => &["DB", "database", "存储"],
"部署" => &["发布", "上线", "deploy"],
"测试" => &["验证", "检验", "test"],
"安全" => &["防护", "加密", "security"],
_ => return None,
};
Some(synonyms.iter().map(|s| s.to_string()).collect())
}
/// Generate search queries from analyzed query
pub fn generate_search_queries(&self, analyzed: &AnalyzedQuery) -> Vec<String> {
let mut queries = vec![analyzed.original.clone()];
// Add keyword-based query
if !analyzed.keywords.is_empty() {
queries.push(analyzed.keywords.join(" "));
}
// Add expanded terms
for expansion in &analyzed.expansions {
if !expansion.is_empty() {
queries.push(expansion.clone());
}
}
// Deduplicate
queries.sort();
queries.dedup();
queries
}
}
impl Default for QueryAnalyzer {
    /// Equivalent to `QueryAnalyzer::new()`.
    fn default() -> Self {
        Self::new()
    }
}
/// Check whether a character belongs to one of the CJK ideograph blocks.
///
/// Covers the Unified Ideographs block, Extensions A-E, and both
/// Compatibility Ideographs blocks. Kana, Hangul, and CJK punctuation are
/// deliberately not included.
fn is_cjk(c: char) -> bool {
    // (start, end) inclusive pairs, one per Unicode block.
    const CJK_RANGES: &[(char, char)] = &[
        ('\u{3400}', '\u{4DBF}'),   // CJK Unified Ideographs Extension A
        ('\u{4E00}', '\u{9FFF}'),   // CJK Unified Ideographs
        ('\u{F900}', '\u{FAFF}'),   // CJK Compatibility Ideographs
        ('\u{20000}', '\u{2A6DF}'), // CJK Unified Ideographs Extension B
        ('\u{2A700}', '\u{2B73F}'), // CJK Unified Ideographs Extension C
        ('\u{2B740}', '\u{2B81F}'), // CJK Unified Ideographs Extension D
        ('\u{2B820}', '\u{2CEAF}'), // CJK Unified Ideographs Extension E
        ('\u{2F800}', '\u{2FA1F}'), // CJK Compatibility Ideographs Supplement
    ];
    CJK_RANGES.iter().any(|&(lo, hi)| (lo..=hi).contains(&c))
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_extract_keywords() {
        let analyzer = QueryAnalyzer::new();
        let keywords = analyzer.extract_keywords("What is the Rust programming language?");
        assert!(keywords.contains(&"rust".to_string()));
        assert!(keywords.contains(&"programming".to_string()));
        assert!(keywords.contains(&"language".to_string()));
        assert!(!keywords.contains(&"the".to_string())); // stop word
        assert!(!keywords.contains(&"is".to_string())); // stop word
    }

    #[test]
    fn test_classify_intent_preference() {
        let analyzer = QueryAnalyzer::new();
        let analyzed = analyzer.analyze("I prefer concise responses");
        assert_eq!(analyzed.intent, QueryIntent::Preference);
        assert!(analyzed.target_types.contains(&MemoryType::Preference));
    }

    #[test]
    fn test_classify_intent_knowledge() {
        let analyzer = QueryAnalyzer::new();
        let analyzed = analyzer.analyze("Explain how async/await works in Rust");
        assert_eq!(analyzed.intent, QueryIntent::Knowledge);
    }

    #[test]
    fn test_classify_intent_code() {
        let analyzer = QueryAnalyzer::new();
        let analyzed = analyzer.analyze("Fix this error in my function");
        assert_eq!(analyzed.intent, QueryIntent::Code);
    }

    #[test]
    fn test_classify_intent_chinese_code() {
        // Space-separated Chinese code terms should classify as Code
        // (without a separator there is no CJK segmentation, so the whole
        // phrase would be a single non-matching token).
        let analyzer = QueryAnalyzer::new();
        let analyzed = analyzer.analyze("修复 错误");
        assert_eq!(analyzed.intent, QueryIntent::Code);
    }

    #[test]
    fn test_query_expansion() {
        let analyzer = QueryAnalyzer::new();
        let analyzed = analyzer.analyze("fix the error");
        assert!(!analyzed.expansions.is_empty());
    }

    #[test]
    fn test_generate_search_queries() {
        let analyzer = QueryAnalyzer::new();
        let analyzed = analyzer.analyze("Rust programming");
        let queries = analyzer.generate_search_queries(&analyzed);
        assert!(!queries.is_empty());
        // The original query is always carried through.
        assert!(queries.contains(&"Rust programming".to_string()));
    }

    #[test]
    fn test_cjk_detection() {
        assert!(is_cjk('中'));
        assert!(is_cjk('文'));
        assert!(!is_cjk('a'));
        assert!(!is_cjk('1'));
    }

    #[test]
    fn test_chinese_keywords() {
        let analyzer = QueryAnalyzer::new();
        // No CJK word segmentation: a spaceless phrase is one token.
        let keywords = analyzer.extract_keywords("我喜欢简洁的回复");
        assert_eq!(keywords, vec!["我喜欢简洁的回复".to_string()]);
        // A single CJK character is one meaningful character and must be kept.
        let single = analyzer.extract_keywords("错");
        assert_eq!(single, vec!["错".to_string()]);
    }
}