//! Query Analyzer //! //! Provides query analysis and expansion capabilities for improved retrieval. //! Extracts keywords, identifies intent, and generates search variations. use crate::types::MemoryType; use std::collections::HashSet; /// Query analysis result #[derive(Debug, Clone)] pub struct AnalyzedQuery { /// Original query string pub original: String, /// Extracted keywords pub keywords: Vec, /// Query intent pub intent: QueryIntent, /// Memory types to search (inferred from query) pub target_types: Vec, /// Expanded search terms pub expansions: Vec, } /// Query intent classification #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum QueryIntent { /// Looking for preferences/settings Preference, /// Looking for factual knowledge Knowledge, /// Looking for how-to/experience Experience, /// General conversation General, /// Code-related query Code, /// Configuration query Configuration, /// Identity/personal recall — user asks about themselves or past conversations /// Triggers broad retrieval of all preference + knowledge memories IdentityRecall, } /// Query analyzer pub struct QueryAnalyzer { /// Keywords that indicate preference queries preference_indicators: HashSet, /// Keywords that indicate knowledge queries knowledge_indicators: HashSet, /// Keywords that indicate experience queries experience_indicators: HashSet, /// Keywords that indicate code queries code_indicators: HashSet, /// Stop words to filter out stop_words: HashSet, /// Patterns indicating identity/personal recall queries identity_patterns: Vec, } impl QueryAnalyzer { /// Create a new query analyzer pub fn new() -> Self { Self { preference_indicators: [ "prefer", "like", "want", "favorite", "favourite", "style", "format", "language", "setting", "preference", "usually", "typically", "always", "never", "习惯", "偏好", "喜欢", "想要", ] .iter() .map(|s| s.to_string()) .collect(), knowledge_indicators: [ "what", "how", "why", "explain", "tell", "know", "learn", "understand", "meaning", "definition", "concept", "theory", "是什么", "怎么", "为什么", "解释", "了解", "知道", ] .iter() .map(|s| s.to_string()) .collect(), experience_indicators: [ "experience", "tried", "used", "before", "last time", "previous", "history", "remember", "recall", "when", "经验", "尝试", "用过", "上次", "记得", "回忆", ] .iter() .map(|s| s.to_string()) .collect(), code_indicators: [ "code", "function", "class", "method", "variable", "type", "error", "bug", "fix", "implement", "refactor", "api", "代码", "函数", "类", "方法", "变量", "错误", "修复", "实现", ] .iter() .map(|s| s.to_string()) .collect(), stop_words: [ "the", "a", "an", "is", "are", "was", "were", "be", "been", "have", "has", "had", "do", "does", "did", "will", "would", "could", "should", "may", "might", "must", "can", "to", "of", "in", "for", "on", "with", "at", "by", "from", "as", "and", "or", "but", "if", "then", "else", "when", "where", "which", "who", "whom", "whose", "this", "that", "these", "those", ] .iter() .map(|s| s.to_string()) .collect(), identity_patterns: [ // Chinese identity recall patterns "我是谁", "我叫什么", "我之前", "我告诉过你", "我之前告诉", "还记得我", "你还记得", "我的名字", "我的身份", "我的信息", "我的工作", "我在哪", "我的偏好", "我喜欢什么", "关于我", "了解我", "记得我", "我之前说过", // English identity recall patterns "who am i", "what is my name", "what do you know about me", "what did i tell", "do you remember me", "what do you remember", "my preferences", "about me", "what have i shared", ] .iter() .map(|s| s.to_string()) .collect(), } } /// Analyze a query string pub fn analyze(&self, query: &str) -> AnalyzedQuery { let keywords = self.extract_keywords(query); // Check for identity recall patterns first (highest priority) let query_lower = query.to_lowercase(); let is_identity = self.identity_patterns.iter() .any(|pattern| query_lower.contains(&pattern.to_lowercase())); let intent = if is_identity { QueryIntent::IdentityRecall } else { self.classify_intent(&keywords) }; let target_types = self.infer_memory_types(intent, &keywords); let expansions = self.expand_query(&keywords); AnalyzedQuery { original: query.to_string(), keywords, intent, target_types, expansions, } } /// Extract keywords from query fn extract_keywords(&self, query: &str) -> Vec { query .to_lowercase() .split(|c: char| !c.is_alphanumeric() && !is_cjk(c)) .filter(|s| !s.is_empty() && s.len() > 1) .filter(|s| !self.stop_words.contains(*s)) .map(|s| s.to_string()) .collect() } /// Classify query intent fn classify_intent(&self, keywords: &[String]) -> QueryIntent { let mut scores = [ (QueryIntent::Preference, 0), (QueryIntent::Knowledge, 0), (QueryIntent::Experience, 0), (QueryIntent::Code, 0), ]; for keyword in keywords { if self.preference_indicators.contains(keyword) { scores[0].1 += 2; } if self.knowledge_indicators.contains(keyword) { scores[1].1 += 2; } if self.experience_indicators.contains(keyword) { scores[2].1 += 2; } if self.code_indicators.contains(keyword) { scores[3].1 += 2; } } // Find highest scoring intent scores.sort_by(|a, b| b.1.cmp(&a.1)); if scores[0].1 > 0 { scores[0].0 } else { QueryIntent::General } } /// Infer which memory types to search fn infer_memory_types(&self, intent: QueryIntent, _keywords: &[String]) -> Vec { let mut types = Vec::new(); match intent { QueryIntent::Preference => { types.push(MemoryType::Preference); } QueryIntent::Knowledge | QueryIntent::Code => { types.push(MemoryType::Knowledge); types.push(MemoryType::Experience); } QueryIntent::Experience => { types.push(MemoryType::Experience); types.push(MemoryType::Knowledge); } QueryIntent::General => { // Search all types types.push(MemoryType::Preference); types.push(MemoryType::Knowledge); types.push(MemoryType::Experience); } QueryIntent::Configuration => { types.push(MemoryType::Preference); types.push(MemoryType::Knowledge); } QueryIntent::IdentityRecall => { // Identity recall needs all memory types types.push(MemoryType::Preference); types.push(MemoryType::Knowledge); types.push(MemoryType::Experience); } } types } /// Expand query with related terms fn expand_query(&self, keywords: &[String]) -> Vec { let mut expansions = Vec::new(); // Add stemmed variations (simplified) for keyword in keywords { // Add singular/plural variations if keyword.ends_with('s') && keyword.len() > 3 { expansions.push(keyword[..keyword.len()-1].to_string()); } else { expansions.push(format!("{}s", keyword)); } // Add common synonyms (simplified) if let Some(synonyms) = self.get_synonyms(keyword) { expansions.extend(synonyms); } } expansions } /// Get synonyms for a keyword (simplified, English + Chinese) fn get_synonyms(&self, keyword: &str) -> Option> { let synonyms: &[&str] = match keyword { // English synonyms "code" => &["program", "script", "source"], "error" => &["bug", "issue", "problem", "exception"], "fix" => &["solve", "resolve", "repair", "patch"], "fast" => &["quick", "speed", "performance", "efficient"], "slow" => &["performance", "optimize", "speed"], "help" => &["assist", "support", "guide", "aid"], "learn" => &["study", "understand", "know", "grasp"], // Chinese synonyms — critical for Chinese-language queries "错误" => &["问题", "bug", "异常", "故障"], "修复" => &["解决", "修正", "处理", "fix"], "优化" => &["改进", "提升", "加速", "improve"], "配置" => &["设置", "参数", "选项", "config"], "性能" => &["速度", "效率", "performance"], "问题" => &["错误", "故障", "issue", "problem"], "帮助" => &["协助", "支持", "help"], "学习" => &["了解", "掌握", "learn"], "代码" => &["程序", "脚本", "code"], "数据库" => &["DB", "database", "存储"], "部署" => &["发布", "上线", "deploy"], "测试" => &["验证", "检验", "test"], "安全" => &["防护", "加密", "security"], _ => return None, }; Some(synonyms.iter().map(|s| s.to_string()).collect()) } /// Generate search queries from analyzed query pub fn generate_search_queries(&self, analyzed: &AnalyzedQuery) -> Vec { let mut queries = vec![analyzed.original.clone()]; // Add keyword-based query if !analyzed.keywords.is_empty() { queries.push(analyzed.keywords.join(" ")); } // Add expanded terms for expansion in &analyzed.expansions { if !expansion.is_empty() { queries.push(expansion.clone()); } } // Deduplicate queries.sort(); queries.dedup(); queries } } impl Default for QueryAnalyzer { fn default() -> Self { Self::new() } } /// Check if character is CJK fn is_cjk(c: char) -> bool { matches!(c, '\u{4E00}'..='\u{9FFF}' | // CJK Unified Ideographs '\u{3400}'..='\u{4DBF}' | // CJK Unified Ideographs Extension A '\u{20000}'..='\u{2A6DF}' | // CJK Unified Ideographs Extension B '\u{2A700}'..='\u{2B73F}' | // CJK Unified Ideographs Extension C '\u{2B740}'..='\u{2B81F}' | // CJK Unified Ideographs Extension D '\u{2B820}'..='\u{2CEAF}' | // CJK Unified Ideographs Extension E '\u{F900}'..='\u{FAFF}' | // CJK Compatibility Ideographs '\u{2F800}'..='\u{2FA1F}' // CJK Compatibility Ideographs Supplement ) } #[cfg(test)] mod tests { use super::*; #[test] fn test_extract_keywords() { let analyzer = QueryAnalyzer::new(); let keywords = analyzer.extract_keywords("What is the Rust programming language?"); assert!(keywords.contains(&"rust".to_string())); assert!(keywords.contains(&"programming".to_string())); assert!(keywords.contains(&"language".to_string())); assert!(!keywords.contains(&"the".to_string())); // stop word } #[test] fn test_classify_intent_preference() { let analyzer = QueryAnalyzer::new(); let analyzed = analyzer.analyze("I prefer concise responses"); assert_eq!(analyzed.intent, QueryIntent::Preference); assert!(analyzed.target_types.contains(&MemoryType::Preference)); } #[test] fn test_classify_intent_knowledge() { let analyzer = QueryAnalyzer::new(); let analyzed = analyzer.analyze("Explain how async/await works in Rust"); assert_eq!(analyzed.intent, QueryIntent::Knowledge); } #[test] fn test_classify_intent_code() { let analyzer = QueryAnalyzer::new(); let analyzed = analyzer.analyze("Fix this error in my function"); assert_eq!(analyzed.intent, QueryIntent::Code); } #[test] fn test_query_expansion() { let analyzer = QueryAnalyzer::new(); let analyzed = analyzer.analyze("fix the error"); assert!(!analyzed.expansions.is_empty()); } #[test] fn test_generate_search_queries() { let analyzer = QueryAnalyzer::new(); let analyzed = analyzer.analyze("Rust programming"); let queries = analyzer.generate_search_queries(&analyzed); assert!(queries.len() >= 1); } #[test] fn test_cjk_detection() { assert!(is_cjk('中')); assert!(is_cjk('文')); assert!(!is_cjk('a')); assert!(!is_cjk('1')); } #[test] fn test_chinese_keywords() { let analyzer = QueryAnalyzer::new(); let keywords = analyzer.extract_keywords("我喜欢简洁的回复"); // Chinese characters should be extracted assert!(!keywords.is_empty()); } }