Files
zclaw_openfang/crates/zclaw-growth/src/retrieval/query.rs
iven 9a2611d122
Some checks failed
CI / Lint & TypeCheck (push) Has been cancelled
CI / Unit Tests (push) Has been cancelled
CI / Build Frontend (push) Has been cancelled
CI / Rust Check (push) Has been cancelled
CI / Security Scan (push) Has been cancelled
CI / E2E Tests (push) Has been cancelled
fix(growth,hands,kernel,desktop): Phase 1 用户可感知修复 — 6 项断链修复
Phase 1 修复内容:
1. Hand 执行前端字段映射 — instance_id → runId,修复 Hand 状态追踪
2. Heartbeat 痛点感知 — PAIN_POINTS_CACHE + VikingStorage 持久化 + 未解决痛点检查
3. Browser Hand 委托消息 — pending_execution → delegated_to_frontend + 中文摘要
4. 跨会话记忆检索增强 — 扩展 IdentityRecall 模式 26→43 + 弱身份信号检测 + 低结果 fallback
5. Twitter Hand 凭据持久化 — SetCredentials action + 文件持久化 + 启动恢复
6. Browser 测试修复 — 适配新的 delegated_to_frontend 响应格式

验证: cargo check  | cargo test 912 PASS  | tsc --noEmit 
2026-04-21 10:18:25 +08:00

475 lines
18 KiB
Rust

//! Query Analyzer
//!
//! Provides query analysis and expansion capabilities for improved retrieval.
//! Extracts keywords, identifies intent, and generates search variations.
use crate::types::MemoryType;
use std::collections::HashSet;
/// Query analysis result
#[derive(Debug, Clone)]
pub struct AnalyzedQuery {
/// Original query string
pub original: String,
/// Extracted keywords
pub keywords: Vec<String>,
/// Query intent
pub intent: QueryIntent,
/// Memory types to search (inferred from query)
pub target_types: Vec<MemoryType>,
/// Expanded search terms
pub expansions: Vec<String>,
/// Whether weak identity signals were detected (personal pronouns, possessives)
pub weak_identity: bool,
}
/// Query intent classification
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum QueryIntent {
/// Looking for preferences/settings
Preference,
/// Looking for factual knowledge
Knowledge,
/// Looking for how-to/experience
Experience,
/// General conversation
General,
/// Code-related query
Code,
/// Configuration query
Configuration,
/// Identity/personal recall — user asks about themselves or past conversations
/// Triggers broad retrieval of all preference + knowledge memories
IdentityRecall,
}
/// Query analyzer
pub struct QueryAnalyzer {
/// Keywords that indicate preference queries
preference_indicators: HashSet<String>,
/// Keywords that indicate knowledge queries
knowledge_indicators: HashSet<String>,
/// Keywords that indicate experience queries
experience_indicators: HashSet<String>,
/// Keywords that indicate code queries
code_indicators: HashSet<String>,
/// Stop words to filter out
stop_words: HashSet<String>,
/// Patterns indicating identity/personal recall queries
identity_patterns: Vec<String>,
/// Weak identity signals (pronouns, possessives) that boost broad retrieval
weak_identity_indicators: Vec<String>,
}
impl QueryAnalyzer {
/// Create a new query analyzer
pub fn new() -> Self {
Self {
preference_indicators: [
"prefer", "like", "want", "favorite", "favourite", "style",
"format", "language", "setting", "preference", "usually",
"typically", "always", "never", "习惯", "偏好", "喜欢", "想要",
]
.iter()
.map(|s| s.to_string())
.collect(),
knowledge_indicators: [
"what", "how", "why", "explain", "tell", "know", "learn",
"understand", "meaning", "definition", "concept", "theory",
"是什么", "怎么", "为什么", "解释", "了解", "知道",
]
.iter()
.map(|s| s.to_string())
.collect(),
experience_indicators: [
"experience", "tried", "used", "before", "last time",
"previous", "history", "remember", "recall", "when",
"经验", "尝试", "用过", "上次", "记得", "回忆",
]
.iter()
.map(|s| s.to_string())
.collect(),
code_indicators: [
"code", "function", "class", "method", "variable", "type",
"error", "bug", "fix", "implement", "refactor", "api",
"代码", "函数", "", "方法", "变量", "错误", "修复", "实现",
]
.iter()
.map(|s| s.to_string())
.collect(),
stop_words: [
"the", "a", "an", "is", "are", "was", "were", "be", "been",
"have", "has", "had", "do", "does", "did", "will", "would",
"could", "should", "may", "might", "must", "can", "to", "of",
"in", "for", "on", "with", "at", "by", "from", "as", "and",
"or", "but", "if", "then", "else", "when", "where", "which",
"who", "whom", "whose", "this", "that", "these", "those",
]
.iter()
.map(|s| s.to_string())
.collect(),
identity_patterns: [
// Chinese identity recall patterns — direct identity queries
"我是谁", "我叫什么", "我的名字", "我的身份", "我的信息",
"关于我", "了解我", "记得我",
// Chinese — cross-session recall ("what did we discuss before")
"我之前", "我告诉过你", "我之前告诉", "我之前说过",
"还记得我", "你还记得", "你记得吗", "记得之前",
"我们之前聊过", "我们讨论过", "我们聊过", "上次聊",
"之前说过", "之前告诉", "以前说过", "以前聊过",
// Chinese — preferences/settings queries
"我的偏好", "我喜欢什么", "我的工作", "我在哪",
"我的设置", "我的习惯", "我的爱好", "我的职业",
"我记得", "我想起来", "我忘了",
// English identity recall patterns
"who am i", "what is my name", "what do you know about me",
"what did i tell", "do you remember me", "what do you remember",
"my preferences", "about me", "what have i shared",
"remind me", "what we discussed", "my settings", "my profile",
"tell me about myself", "what did we talk about", "what was my",
"i mentioned before", "we talked about", "i told you before",
]
.iter()
.map(|s| s.to_string())
.collect(),
// Weak identity signals — pronouns that hint at personal context
weak_identity_indicators: [
"我的", "我之前", "我们之前", "我们上次",
"my ", "i told", "i said", "we discussed", "we talked",
]
.iter()
.map(|s| s.to_string())
.collect(),
}
}
/// Analyze a query string
pub fn analyze(&self, query: &str) -> AnalyzedQuery {
let keywords = self.extract_keywords(query);
// Check for identity recall patterns first (highest priority)
let query_lower = query.to_lowercase();
let is_identity = self.identity_patterns.iter()
.any(|pattern| query_lower.contains(&pattern.to_lowercase()));
// Check for weak identity signals (personal pronouns, possessives)
let weak_identity = !is_identity && self.weak_identity_indicators.iter()
.any(|indicator| query_lower.contains(&indicator.to_lowercase()));
let intent = if is_identity {
QueryIntent::IdentityRecall
} else {
self.classify_intent(&keywords)
};
let target_types = self.infer_memory_types(intent, &keywords);
let expansions = self.expand_query(&keywords);
AnalyzedQuery {
original: query.to_string(),
keywords,
intent,
target_types,
expansions,
weak_identity,
}
}
/// Extract keywords from query
fn extract_keywords(&self, query: &str) -> Vec<String> {
query
.to_lowercase()
.split(|c: char| !c.is_alphanumeric() && !is_cjk(c))
.filter(|s| !s.is_empty() && s.len() > 1)
.filter(|s| !self.stop_words.contains(*s))
.map(|s| s.to_string())
.collect()
}
/// Classify query intent
fn classify_intent(&self, keywords: &[String]) -> QueryIntent {
let mut scores = [
(QueryIntent::Preference, 0),
(QueryIntent::Knowledge, 0),
(QueryIntent::Experience, 0),
(QueryIntent::Code, 0),
];
for keyword in keywords {
if self.preference_indicators.contains(keyword) {
scores[0].1 += 2;
}
if self.knowledge_indicators.contains(keyword) {
scores[1].1 += 2;
}
if self.experience_indicators.contains(keyword) {
scores[2].1 += 2;
}
if self.code_indicators.contains(keyword) {
scores[3].1 += 2;
}
}
// Find highest scoring intent
scores.sort_by(|a, b| b.1.cmp(&a.1));
if scores[0].1 > 0 {
scores[0].0
} else {
QueryIntent::General
}
}
/// Infer which memory types to search
fn infer_memory_types(&self, intent: QueryIntent, _keywords: &[String]) -> Vec<MemoryType> {
let mut types = Vec::new();
match intent {
QueryIntent::Preference => {
types.push(MemoryType::Preference);
}
QueryIntent::Knowledge | QueryIntent::Code => {
types.push(MemoryType::Knowledge);
types.push(MemoryType::Experience);
}
QueryIntent::Experience => {
types.push(MemoryType::Experience);
types.push(MemoryType::Knowledge);
}
QueryIntent::General => {
// Search all types
types.push(MemoryType::Preference);
types.push(MemoryType::Knowledge);
types.push(MemoryType::Experience);
}
QueryIntent::Configuration => {
types.push(MemoryType::Preference);
types.push(MemoryType::Knowledge);
}
QueryIntent::IdentityRecall => {
// Identity recall needs all memory types
types.push(MemoryType::Preference);
types.push(MemoryType::Knowledge);
types.push(MemoryType::Experience);
}
}
types
}
/// Expand query with related terms
fn expand_query(&self, keywords: &[String]) -> Vec<String> {
let mut expansions = Vec::new();
// Add stemmed variations (simplified)
for keyword in keywords {
// Add singular/plural variations
if keyword.ends_with('s') && keyword.len() > 3 {
expansions.push(keyword[..keyword.len()-1].to_string());
} else {
expansions.push(format!("{}s", keyword));
}
// Add common synonyms (simplified)
if let Some(synonyms) = self.get_synonyms(keyword) {
expansions.extend(synonyms);
}
}
expansions
}
/// Get synonyms for a keyword (simplified, English + Chinese)
fn get_synonyms(&self, keyword: &str) -> Option<Vec<String>> {
let synonyms: &[&str] = match keyword {
// English synonyms
"code" => &["program", "script", "source"],
"error" => &["bug", "issue", "problem", "exception"],
"fix" => &["solve", "resolve", "repair", "patch"],
"fast" => &["quick", "speed", "performance", "efficient"],
"slow" => &["performance", "optimize", "speed"],
"help" => &["assist", "support", "guide", "aid"],
"learn" => &["study", "understand", "know", "grasp"],
// Chinese synonyms — critical for Chinese-language queries
"错误" => &["问题", "bug", "异常", "故障"],
"修复" => &["解决", "修正", "处理", "fix"],
"优化" => &["改进", "提升", "加速", "improve"],
"配置" => &["设置", "参数", "选项", "config"],
"性能" => &["速度", "效率", "performance"],
"问题" => &["错误", "故障", "issue", "problem"],
"帮助" => &["协助", "支持", "help"],
"学习" => &["了解", "掌握", "learn"],
"代码" => &["程序", "脚本", "code"],
"数据库" => &["DB", "database", "存储"],
"部署" => &["发布", "上线", "deploy"],
"测试" => &["验证", "检验", "test"],
"安全" => &["防护", "加密", "security"],
_ => return None,
};
Some(synonyms.iter().map(|s| s.to_string()).collect())
}
/// Generate search queries from analyzed query
pub fn generate_search_queries(&self, analyzed: &AnalyzedQuery) -> Vec<String> {
let mut queries = vec![analyzed.original.clone()];
// Add keyword-based query
if !analyzed.keywords.is_empty() {
queries.push(analyzed.keywords.join(" "));
}
// Add expanded terms
for expansion in &analyzed.expansions {
if !expansion.is_empty() {
queries.push(expansion.clone());
}
}
// Deduplicate
queries.sort();
queries.dedup();
queries
}
}
impl Default for QueryAnalyzer {
fn default() -> Self {
Self::new()
}
}
/// Check if character is CJK
fn is_cjk(c: char) -> bool {
matches!(c,
'\u{4E00}'..='\u{9FFF}' | // CJK Unified Ideographs
'\u{3400}'..='\u{4DBF}' | // CJK Unified Ideographs Extension A
'\u{20000}'..='\u{2A6DF}' | // CJK Unified Ideographs Extension B
'\u{2A700}'..='\u{2B73F}' | // CJK Unified Ideographs Extension C
'\u{2B740}'..='\u{2B81F}' | // CJK Unified Ideographs Extension D
'\u{2B820}'..='\u{2CEAF}' | // CJK Unified Ideographs Extension E
'\u{F900}'..='\u{FAFF}' | // CJK Compatibility Ideographs
'\u{2F800}'..='\u{2FA1F}' // CJK Compatibility Ideographs Supplement
)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_extract_keywords() {
let analyzer = QueryAnalyzer::new();
let keywords = analyzer.extract_keywords("What is the Rust programming language?");
assert!(keywords.contains(&"rust".to_string()));
assert!(keywords.contains(&"programming".to_string()));
assert!(keywords.contains(&"language".to_string()));
assert!(!keywords.contains(&"the".to_string())); // stop word
}
#[test]
fn test_classify_intent_preference() {
let analyzer = QueryAnalyzer::new();
let analyzed = analyzer.analyze("I prefer concise responses");
assert_eq!(analyzed.intent, QueryIntent::Preference);
assert!(analyzed.target_types.contains(&MemoryType::Preference));
}
#[test]
fn test_classify_intent_knowledge() {
let analyzer = QueryAnalyzer::new();
let analyzed = analyzer.analyze("Explain how async/await works in Rust");
assert_eq!(analyzed.intent, QueryIntent::Knowledge);
}
#[test]
fn test_classify_intent_code() {
let analyzer = QueryAnalyzer::new();
let analyzed = analyzer.analyze("Fix this error in my function");
assert_eq!(analyzed.intent, QueryIntent::Code);
}
#[test]
fn test_query_expansion() {
let analyzer = QueryAnalyzer::new();
let analyzed = analyzer.analyze("fix the error");
assert!(!analyzed.expansions.is_empty());
}
#[test]
fn test_generate_search_queries() {
let analyzer = QueryAnalyzer::new();
let analyzed = analyzer.analyze("Rust programming");
let queries = analyzer.generate_search_queries(&analyzed);
assert!(queries.len() >= 1);
}
#[test]
fn test_cjk_detection() {
assert!(is_cjk('中'));
assert!(is_cjk('文'));
assert!(!is_cjk('a'));
assert!(!is_cjk('1'));
}
#[test]
fn test_chinese_keywords() {
let analyzer = QueryAnalyzer::new();
let keywords = analyzer.extract_keywords("我喜欢简洁的回复");
// Chinese characters should be extracted
assert!(!keywords.is_empty());
}
#[test]
fn test_identity_recall_expanded_patterns() {
let analyzer = QueryAnalyzer::new();
// New Chinese patterns should trigger IdentityRecall
assert_eq!(analyzer.analyze("我们之前聊过什么").intent, QueryIntent::IdentityRecall);
assert_eq!(analyzer.analyze("你记得吗上次说的").intent, QueryIntent::IdentityRecall);
assert_eq!(analyzer.analyze("我的设置是什么").intent, QueryIntent::IdentityRecall);
assert_eq!(analyzer.analyze("我们讨论过这个话题").intent, QueryIntent::IdentityRecall);
// New English patterns
assert_eq!(analyzer.analyze("what did we talk about yesterday").intent, QueryIntent::IdentityRecall);
assert_eq!(analyzer.analyze("remind me what I said").intent, QueryIntent::IdentityRecall);
assert_eq!(analyzer.analyze("my settings").intent, QueryIntent::IdentityRecall);
}
#[test]
fn test_weak_identity_detection() {
let analyzer = QueryAnalyzer::new();
// Queries with "我的" but not matching full identity patterns
let analyzed = analyzer.analyze("我的项目进度怎么样了");
assert!(analyzed.weak_identity, "Should detect weak identity from '我的'");
assert_ne!(analyzed.intent, QueryIntent::IdentityRecall);
// Queries without personal signals should not trigger weak identity
let analyzed = analyzer.analyze("解释一下Rust的所有权");
assert!(!analyzed.weak_identity);
// Full identity pattern should NOT set weak_identity (it's already IdentityRecall)
let analyzed = analyzer.analyze("我是谁");
assert!(!analyzed.weak_identity);
assert_eq!(analyzed.intent, QueryIntent::IdentityRecall);
}
#[test]
fn test_no_false_identity_on_general_queries() {
let analyzer = QueryAnalyzer::new();
// General queries should not trigger identity recall or weak identity
assert_ne!(analyzer.analyze("什么是机器学习").intent, QueryIntent::IdentityRecall);
assert!(!analyzer.analyze("什么是机器学习").weak_identity);
}
}