//! Researcher Hand - Deep research and analysis capabilities //! //! This hand provides web search, content fetching, and research synthesis. use async_trait::async_trait; use serde::{Deserialize, Serialize}; use serde_json::{json, Value}; use std::collections::HashMap; use std::sync::Arc; use tokio::sync::RwLock; use zclaw_types::Result; use crate::{Hand, HandConfig, HandContext, HandResult}; /// Search engine options #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(rename_all = "lowercase")] pub enum SearchEngine { Google, Bing, DuckDuckGo, Auto, } impl Default for SearchEngine { fn default() -> Self { Self::Auto } } /// Research depth level #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(rename_all = "lowercase")] pub enum ResearchDepth { Quick, // Fast search, top 3 results Standard, // Normal search, top 10 results Deep, // Comprehensive search, multiple sources } impl Default for ResearchDepth { fn default() -> Self { Self::Standard } } /// Research query configuration #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(rename_all = "camelCase")] pub struct ResearchQuery { /// Search query pub query: String, /// Search engine to use #[serde(default)] pub engine: SearchEngine, /// Research depth #[serde(default)] pub depth: ResearchDepth, /// Maximum results to return #[serde(default = "default_max_results")] pub max_results: usize, /// Include related topics #[serde(default)] pub include_related: bool, /// Time limit in seconds #[serde(default = "default_time_limit")] pub time_limit_secs: u64, } fn default_max_results() -> usize { 10 } fn default_time_limit() -> u64 { 60 } /// Search result item #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(rename_all = "camelCase")] pub struct SearchResult { /// Title of the result pub title: String, /// URL pub url: String, /// Snippet/summary pub snippet: String, /// Source name pub source: String, /// Relevance score (0-100) #[serde(default)] pub relevance: u8, /// Fetched content (if available) #[serde(default)] pub content: Option, /// Timestamp #[serde(default)] pub fetched_at: Option, } /// Research report #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(rename_all = "camelCase")] pub struct ResearchReport { /// Original query pub query: String, /// Search results pub results: Vec, /// Synthesized summary #[serde(default)] pub summary: Option, /// Key findings #[serde(default)] pub key_findings: Vec, /// Related topics discovered #[serde(default)] pub related_topics: Vec, /// Research timestamp pub researched_at: String, /// Total time spent (ms) pub duration_ms: u64, } /// Researcher action types #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(tag = "action")] pub enum ResearcherAction { #[serde(rename = "search")] Search { query: ResearchQuery }, #[serde(rename = "fetch")] Fetch { url: String }, #[serde(rename = "summarize")] Summarize { urls: Vec }, #[serde(rename = "report")] Report { query: ResearchQuery }, } /// Researcher Hand implementation pub struct ResearcherHand { config: HandConfig, client: reqwest::Client, cache: Arc>>, } impl ResearcherHand { /// Create a new researcher hand pub fn new() -> Self { Self { config: HandConfig { id: "researcher".to_string(), name: "研究员".to_string(), description: "深度研究和分析能力,支持网络搜索和内容获取".to_string(), needs_approval: false, dependencies: vec!["network".to_string()], input_schema: Some(serde_json::json!({ "type": "object", "oneOf": [ { "properties": { "action": { "const": "search" }, "query": { "type": "object", "properties": { "query": { "type": "string" }, "engine": { "type": "string", "enum": ["google", "bing", "duckduckgo", "auto"] }, "depth": { "type": "string", "enum": ["quick", "standard", "deep"] }, "maxResults": { "type": "integer" } }, "required": ["query"] } }, "required": ["action", "query"] }, { "properties": { "action": { "const": "fetch" }, "url": { "type": "string" } }, "required": ["action", "url"] }, { "properties": { "action": { "const": "report" }, "query": { "$ref": "#/properties/query" } }, "required": ["action", "query"] } ] })), tags: vec!["research".to_string(), "web".to_string(), "search".to_string()], enabled: true, }, client: reqwest::Client::builder() .timeout(std::time::Duration::from_secs(30)) .user_agent("ZCLAW-Researcher/1.0") .build() .unwrap_or_else(|_| reqwest::Client::new()), cache: Arc::new(RwLock::new(HashMap::new())), } } /// Execute a web search async fn execute_search(&self, query: &ResearchQuery) -> Result> { let start = std::time::Instant::now(); // Use DuckDuckGo as default search (no API key required) let results = self.search_duckduckgo(&query.query, query.max_results).await?; let duration = start.elapsed().as_millis() as u64; tracing::info!( target: "researcher", query = %query.query, duration_ms = duration, results_count = results.len(), "Search completed" ); Ok(results) } /// Search using DuckDuckGo (no API key required) async fn search_duckduckgo(&self, query: &str, max_results: usize) -> Result> { let url = format!("https://api.duckduckgo.com/?q={}&format=json&no_html=1", url_encode(query)); let response = self.client .get(&url) .send() .await .map_err(|e| zclaw_types::ZclawError::HandError(format!("Search request failed: {}", e)))?; let json: Value = response.json().await .map_err(|e| zclaw_types::ZclawError::HandError(format!("Failed to parse search response: {}", e)))?; let mut results = Vec::new(); // Parse DuckDuckGo Instant Answer if let Some(abstract_text) = json.get("AbstractText").and_then(|v| v.as_str()) { if !abstract_text.is_empty() { results.push(SearchResult { title: query.to_string(), url: json.get("AbstractURL") .and_then(|v| v.as_str()) .unwrap_or("") .to_string(), snippet: abstract_text.to_string(), source: json.get("AbstractSource") .and_then(|v| v.as_str()) .unwrap_or("DuckDuckGo") .to_string(), relevance: 100, content: None, fetched_at: Some(chrono::Utc::now().to_rfc3339()), }); } } // Parse related topics if let Some(related) = json.get("RelatedTopics").and_then(|v| v.as_array()) { for item in related.iter().take(max_results) { if let Some(obj) = item.as_object() { results.push(SearchResult { title: obj.get("Text") .and_then(|v| v.as_str()) .unwrap_or("Related Topic") .to_string(), url: obj.get("FirstURL") .and_then(|v| v.as_str()) .unwrap_or("") .to_string(), snippet: obj.get("Text") .and_then(|v| v.as_str()) .unwrap_or("") .to_string(), source: "DuckDuckGo".to_string(), relevance: 80, content: None, fetched_at: Some(chrono::Utc::now().to_rfc3339()), }); } } } Ok(results) } /// Fetch content from a URL async fn execute_fetch(&self, url: &str) -> Result { let start = std::time::Instant::now(); // Check cache first { let cache = self.cache.read().await; if let Some(cached) = cache.get(url) { if cached.content.is_some() { return Ok(cached.clone()); } } } let response = self.client .get(url) .send() .await .map_err(|e| zclaw_types::ZclawError::HandError(format!("Fetch request failed: {}", e)))?; let content_type = response.headers() .get(reqwest::header::CONTENT_TYPE) .and_then(|v| v.to_str().ok()) .unwrap_or(""); let content = if content_type.contains("text/html") { // Extract text from HTML let html = response.text().await .map_err(|e| zclaw_types::ZclawError::HandError(format!("Failed to read HTML: {}", e)))?; self.extract_text_from_html(&html) } else if content_type.contains("text/") || content_type.contains("application/json") { response.text().await .map_err(|e| zclaw_types::ZclawError::HandError(format!("Failed to read text: {}", e)))? } else { "[Binary content]".to_string() }; let result = SearchResult { title: url.to_string(), url: url.to_string(), snippet: content.chars().take(500).collect(), source: url.to_string(), relevance: 100, content: Some(content), fetched_at: Some(chrono::Utc::now().to_rfc3339()), }; // Cache the result { let mut cache = self.cache.write().await; cache.insert(url.to_string(), result.clone()); } let duration = start.elapsed().as_millis() as u64; tracing::info!( target: "researcher", url = url, duration_ms = duration, "Fetch completed" ); Ok(result) } /// Extract readable text from HTML fn extract_text_from_html(&self, html: &str) -> String { let html_lower = html.to_lowercase(); let mut text = String::new(); let mut in_tag = false; let mut in_script = false; let mut in_style = false; let mut pos: usize = 0; for c in html.chars() { let char_len = c.len_utf8(); match c { '<' => { // Check for closing tags before entering tag mode let remaining = &html_lower[pos..]; if remaining.starts_with("' => { in_tag = false; } _ if in_tag => {} _ if in_script || in_style => {} ' ' | '\n' | '\t' | '\r' => { if !text.ends_with(' ') && !text.is_empty() { text.push(' '); } } _ => text.push(c), } pos += char_len; } if text.len() > 10000 { text.truncate(10000); text.push_str("..."); } text.trim().to_string() } /// Generate a comprehensive research report async fn execute_report(&self, query: &ResearchQuery) -> Result { let start = std::time::Instant::now(); // First, execute search let mut results = self.execute_search(query).await?; // Fetch content for top results let fetch_limit = match query.depth { ResearchDepth::Quick => 1, ResearchDepth::Standard => 3, ResearchDepth::Deep => 5, }; for result in results.iter_mut().take(fetch_limit) { if !result.url.is_empty() { match self.execute_fetch(&result.url).await { Ok(fetched) => { result.content = fetched.content; result.fetched_at = fetched.fetched_at; } Err(e) => { tracing::warn!(target: "researcher", error = %e, "Failed to fetch content"); } } } } // Extract key findings let key_findings: Vec = results.iter() .take(5) .filter_map(|r| { r.content.as_ref().map(|c| { c.split(". ") .take(3) .collect::>() .join(". ") }) }) .collect(); // Extract related topics from snippets let related_topics: Vec = results.iter() .filter_map(|r| { if r.snippet.len() > 50 { Some(r.title.clone()) } else { None } }) .take(5) .collect(); let duration = start.elapsed().as_millis() as u64; // Generate summary from top results let summary = if results.is_empty() { "未找到相关结果,建议调整搜索关键词后重试".to_string() } else { let top_snippets: Vec<&str> = results .iter() .take(3) .filter_map(|r| { let s = r.snippet.trim(); if s.is_empty() { None } else { Some(s) } }) .collect(); if top_snippets.is_empty() { format!("找到 {} 条相关结果,但无摘要信息", results.len()) } else { format!( "基于 {} 条搜索结果:{}", results.len(), top_snippets.join(";") ) } }; Ok(ResearchReport { query: query.query.clone(), results, summary: Some(summary), key_findings, related_topics, researched_at: chrono::Utc::now().to_rfc3339(), duration_ms: duration, }) } } impl Default for ResearcherHand { fn default() -> Self { Self::new() } } #[async_trait] impl Hand for ResearcherHand { fn config(&self) -> &HandConfig { &self.config } async fn execute(&self, _context: &HandContext, input: Value) -> Result { let action: ResearcherAction = serde_json::from_value(input.clone()) .map_err(|e| zclaw_types::ZclawError::HandError(format!("Invalid action: {}", e)))?; let start = std::time::Instant::now(); let result = match action { ResearcherAction::Search { query } => { let results = self.execute_search(&query).await?; json!({ "action": "search", "query": query.query, "results": results, "duration_ms": start.elapsed().as_millis() }) } ResearcherAction::Fetch { url } => { let result = self.execute_fetch(&url).await?; json!({ "action": "fetch", "url": url, "result": result, "duration_ms": start.elapsed().as_millis() }) } ResearcherAction::Summarize { urls } => { let mut results = Vec::new(); for url in urls.iter().take(5) { if let Ok(result) = self.execute_fetch(url).await { results.push(result); } } json!({ "action": "summarize", "urls": urls, "results": results, "duration_ms": start.elapsed().as_millis() }) } ResearcherAction::Report { query } => { let report = self.execute_report(&query).await?; json!({ "action": "report", "report": report }) } }; Ok(HandResult::success(result)) } fn needs_approval(&self) -> bool { false // Research operations are generally safe } fn check_dependencies(&self) -> Result> { // Network connectivity will be checked at runtime Ok(Vec::new()) } fn status(&self) -> crate::HandStatus { crate::HandStatus::Idle } } /// URL encoding helper (simple implementation) fn url_encode(s: &str) -> String { s.chars() .map(|c| match c { 'A'..='Z' | 'a'..='z' | '0'..='9' | '-' | '_' | '.' | '~' => c.to_string(), _ => format!("%{:02X}", c as u32), }) .collect() } #[cfg(test)] mod tests { use super::*; fn create_test_hand() -> ResearcherHand { ResearcherHand::new() } fn test_context() -> HandContext { HandContext::default() } // --- Config & Type Tests --- #[test] fn test_config_id() { let hand = create_test_hand(); assert_eq!(hand.config().id, "researcher"); assert_eq!(hand.config().name, "研究员"); assert!(hand.config().enabled); assert!(!hand.config().needs_approval); } #[test] fn test_search_engine_default_is_auto() { let engine = SearchEngine::default(); assert!(matches!(engine, SearchEngine::Auto)); } #[test] fn test_research_depth_default_is_standard() { let depth = ResearchDepth::default(); assert!(matches!(depth, ResearchDepth::Standard)); } #[test] fn test_research_depth_serialize() { let json = serde_json::to_string(&ResearchDepth::Deep).unwrap(); assert_eq!(json, "\"deep\""); } #[test] fn test_research_depth_deserialize() { let depth: ResearchDepth = serde_json::from_str("\"quick\"").unwrap(); assert!(matches!(depth, ResearchDepth::Quick)); } #[test] fn test_search_engine_serialize_roundtrip() { for engine in [SearchEngine::Google, SearchEngine::Bing, SearchEngine::DuckDuckGo, SearchEngine::Auto] { let json = serde_json::to_string(&engine).unwrap(); let back: SearchEngine = serde_json::from_str(&json).unwrap(); assert_eq!(json, serde_json::to_string(&back).unwrap()); } } // --- Action Deserialization Tests --- #[test] fn test_action_search_deserialize() { let json = json!({ "action": "search", "query": { "query": "Rust programming", "engine": "duckduckgo", "depth": "quick", "maxResults": 5 } }); let action: ResearcherAction = serde_json::from_value(json).unwrap(); match action { ResearcherAction::Search { query } => { assert_eq!(query.query, "Rust programming"); assert!(matches!(query.engine, SearchEngine::DuckDuckGo)); assert!(matches!(query.depth, ResearchDepth::Quick)); assert_eq!(query.max_results, 5); } _ => panic!("Expected Search action"), } } #[test] fn test_action_fetch_deserialize() { let json = json!({ "action": "fetch", "url": "https://example.com/page" }); let action: ResearcherAction = serde_json::from_value(json).unwrap(); match action { ResearcherAction::Fetch { url } => { assert_eq!(url, "https://example.com/page"); } _ => panic!("Expected Fetch action"), } } #[test] fn test_action_report_deserialize() { let json = json!({ "action": "report", "query": { "query": "AI trends 2026", "depth": "deep" } }); let action: ResearcherAction = serde_json::from_value(json).unwrap(); match action { ResearcherAction::Report { query } => { assert_eq!(query.query, "AI trends 2026"); assert!(matches!(query.depth, ResearchDepth::Deep)); } _ => panic!("Expected Report action"), } } #[test] fn test_action_invalid_rejected() { let json = json!({ "action": "unknown_action", "data": "whatever" }); let result: std::result::Result = serde_json::from_value(json); assert!(result.is_err()); } // --- URL Encoding Tests --- #[test] fn test_url_encode_ascii() { assert_eq!(url_encode("hello world"), "hello%20world"); } #[test] fn test_url_encode_chinese() { let encoded = url_encode("中文搜索"); assert!(encoded.contains("%")); // Chinese chars should be percent-encoded assert!(!encoded.contains("中文")); } #[test] fn test_url_encode_safe_chars() { assert_eq!(url_encode("abc123-_."), "abc123-_.".to_string()); } #[test] fn test_url_encode_empty() { assert_eq!(url_encode(""), ""); } // --- HTML Text Extraction Tests --- #[test] fn test_extract_text_basic() { let hand = create_test_hand(); let html = "

Title

Content here

"; let text = hand.extract_text_from_html(html); assert!(text.contains("Title")); assert!(text.contains("Content here")); } #[test] fn test_extract_text_strips_scripts() { let hand = create_test_hand(); let html = "

Safe text

"; let text = hand.extract_text_from_html(html); assert!(!text.contains("alert")); assert!(text.contains("Safe text")); } #[test] fn test_extract_text_strips_styles() { let hand = create_test_hand(); let html = "

Visible

"; let text = hand.extract_text_from_html(html); assert!(!text.contains("color")); assert!(text.contains("Visible")); } #[test] fn test_extract_text_truncates_long_content() { let hand = create_test_hand(); let long_body: String = "x".repeat(20000); let html = format!("

{}

", long_body); let text = hand.extract_text_from_html(&html); assert!(text.len() <= 10003); // 10000 + "..." } #[test] fn test_extract_text_empty_body() { let hand = create_test_hand(); let html = ""; let text = hand.extract_text_from_html(html); assert!(text.is_empty()); } // --- Hand Trait Tests --- #[tokio::test] async fn test_needs_approval_is_false() { let hand = create_test_hand(); assert!(!hand.needs_approval()); } #[tokio::test] async fn test_status_is_idle() { let hand = create_test_hand(); assert!(matches!(hand.status(), crate::HandStatus::Idle)); } #[tokio::test] async fn test_check_dependencies_ok() { let hand = create_test_hand(); let missing = hand.check_dependencies().unwrap(); // Default is_dependency_available returns true for all assert!(missing.is_empty()); } // --- Default Values Tests --- #[test] fn test_research_query_defaults() { let json = json!({ "query": "test" }); let query: ResearchQuery = serde_json::from_value(json).unwrap(); assert_eq!(query.query, "test"); assert!(matches!(query.engine, SearchEngine::Auto)); assert!(matches!(query.depth, ResearchDepth::Standard)); assert_eq!(query.max_results, 10); assert_eq!(query.time_limit_secs, 60); assert!(!query.include_related); } #[test] fn test_search_result_serialization() { let result = SearchResult { title: "Test".to_string(), url: "https://example.com".to_string(), snippet: "A snippet".to_string(), source: "TestSource".to_string(), relevance: 90, content: None, fetched_at: None, }; let json = serde_json::to_string(&result).unwrap(); assert!(json.contains("Test")); assert!(json.contains("https://example.com")); } #[test] fn test_research_report_summary_is_some_when_results() { // Verify the struct allows Some value let report = ResearchReport { query: "test".to_string(), results: vec![SearchResult { title: "R".to_string(), url: "https://r.co".to_string(), snippet: "snippet text".to_string(), source: "S".to_string(), relevance: 80, content: None, fetched_at: None, }], summary: Some("基于 1 条搜索结果:snippet text".to_string()), key_findings: vec![], related_topics: vec![], researched_at: "2026-01-01T00:00:00Z".to_string(), duration_ms: 100, }; assert!(report.summary.is_some()); assert!(report.summary.unwrap().contains("snippet text")); } }