//! Researcher Hand - Deep research and analysis capabilities //! //! This hand provides web search, content fetching, and research synthesis. use async_trait::async_trait; use serde::{Deserialize, Serialize}; use serde_json::{json, Value}; use std::collections::HashMap; use std::net::{IpAddr, Ipv4Addr, Ipv6Addr}; use std::sync::Arc; use tokio::sync::RwLock; use url::Url; use zclaw_types::Result; use crate::{Hand, HandConfig, HandContext, HandResult}; /// Search engine options #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(rename_all = "lowercase")] pub enum SearchEngine { SearXNG, Google, Bing, DuckDuckGo, Auto, } impl Default for SearchEngine { fn default() -> Self { Self::Auto } } /// Search configuration loaded from config/config.toml #[derive(Debug, Clone)] struct SearchConfig { default_engine: SearchEngine, searxng_url: String, timeout_secs: u64, jina_api_key: Option, } impl Default for SearchConfig { fn default() -> Self { Self { default_engine: SearchEngine::Auto, searxng_url: "http://localhost:8888".to_string(), timeout_secs: 15, jina_api_key: None, } } } impl SearchConfig { fn load() -> Self { let path = "config/config.toml"; let content = match std::fs::read_to_string(path) { Ok(c) => c, Err(_) => return Self::default(), }; #[derive(Deserialize)] struct ToolsWebSearch { default_engine: Option, #[allow(dead_code)] max_results: Option, searxng_url: Option, searxng_timeout: Option, } #[derive(Deserialize)] struct ToolsWeb { search: Option, } #[derive(Deserialize)] struct Tools { web: Option, } #[derive(Deserialize)] struct Config { tools: Option, } let config: Config = match toml::from_str(&content) { Ok(c) => c, Err(_) => return Self::default(), }; let search = config.tools .and_then(|t| t.web) .and_then(|w| w.search); match search { Some(s) => { let engine = s.default_engine .as_deref() .and_then(|e| serde_json::from_str(&format!("\"{}\"", e)).ok()) .unwrap_or_default(); Self { default_engine: engine, searxng_url: s.searxng_url .unwrap_or_else(|| 
"http://localhost:8888".to_string()), timeout_secs: s.searxng_timeout.unwrap_or(15), jina_api_key: std::env::var("ZCLAW_JINA_API_KEY").ok(), } } None => Self::default(), } } } /// Research depth level #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(rename_all = "lowercase")] pub enum ResearchDepth { Quick, // Fast search, top 3 results Standard, // Normal search, top 10 results Deep, // Comprehensive search, multiple sources } impl Default for ResearchDepth { fn default() -> Self { Self::Standard } } /// Research query configuration #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(rename_all = "camelCase")] pub struct ResearchQuery { /// Search query pub query: String, /// Search engine to use #[serde(default)] pub engine: SearchEngine, /// Research depth #[serde(default)] pub depth: ResearchDepth, /// Maximum results to return #[serde(default = "default_max_results")] pub max_results: usize, /// Include related topics #[serde(default)] pub include_related: bool, /// Time limit in seconds #[serde(default = "default_time_limit")] pub time_limit_secs: u64, } fn default_max_results() -> usize { 10 } fn default_time_limit() -> u64 { 60 } const MAX_QUERY_LENGTH: usize = 500; const MAX_RESULTS_CAP: usize = 50; const MAX_URL_LENGTH: usize = 2048; const CACHE_MAX_ENTRIES: usize = 200; impl ResearchQuery { fn validate(&self) -> std::result::Result<(), String> { if self.query.trim().is_empty() { return Err("搜索查询不能为空".to_string()); } if self.query.len() > MAX_QUERY_LENGTH { return Err(format!("查询过长(上限 {} 字符)", MAX_QUERY_LENGTH)); } if self.max_results > MAX_RESULTS_CAP { return Err(format!("max_results 上限为 {}", MAX_RESULTS_CAP)); } Ok(()) } } /// Search result item #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(rename_all = "camelCase")] pub struct SearchResult { /// Title of the result pub title: String, /// URL pub url: String, /// Snippet/summary pub snippet: String, /// Source name pub source: String, /// Relevance score (0-100) #[serde(default)] 
pub relevance: u8, /// Fetched content (if available) #[serde(default)] pub content: Option, /// Timestamp #[serde(default)] pub fetched_at: Option, } /// Research report #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(rename_all = "camelCase")] pub struct ResearchReport { /// Original query pub query: String, /// Search results pub results: Vec, /// Synthesized summary #[serde(default)] pub summary: Option, /// Key findings #[serde(default)] pub key_findings: Vec, /// Related topics discovered #[serde(default)] pub related_topics: Vec, /// Research timestamp pub researched_at: String, /// Total time spent (ms) pub duration_ms: u64, } /// Researcher action types #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(tag = "action")] pub enum ResearcherAction { #[serde(rename = "search")] Search { query: ResearchQuery }, #[serde(rename = "fetch")] Fetch { url: String }, #[serde(rename = "summarize")] Summarize { urls: Vec }, #[serde(rename = "report")] Report { query: ResearchQuery }, } /// Researcher Hand implementation pub struct ResearcherHand { config: HandConfig, search_config: SearchConfig, client: reqwest::Client, cache: Arc>>, } impl ResearcherHand { /// Create a new researcher hand pub fn new() -> Self { Self { config: HandConfig { id: "researcher".to_string(), name: "研究员".to_string(), description: "深度研究和分析能力,支持网络搜索和内容获取".to_string(), needs_approval: false, dependencies: vec!["network".to_string()], input_schema: Some(serde_json::json!({ "type": "object", "properties": { "action": { "type": "string", "enum": ["search", "fetch", "report", "summarize"], "description": "Action to perform: search (web search), fetch (get URL content), report (deep research), summarize (multiple URLs)" }, "query": { "type": "string", "description": "Search query string for search/report actions" }, "url": { "type": "string", "description": "URL to fetch content from" }, "urls": { "type": "array", "items": { "type": "string" }, "description": "List of URLs to summarize" 
}, "engine": { "type": "string", "enum": ["auto", "searxng", "google", "bing", "duckduckgo"], "description": "Search engine preference" } }, "description": "Provide 'query' for search/report, or 'url' for fetch, or 'urls' for summarize" })), tags: vec!["research".to_string(), "web".to_string(), "search".to_string()], enabled: true, max_concurrent: 0, timeout_secs: 0, }, search_config: SearchConfig::load(), client: reqwest::Client::builder() .timeout(std::time::Duration::from_secs(30)) .user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36") .redirect(reqwest::redirect::Policy::limited(3)) .build() .unwrap_or_else(|_| reqwest::Client::new()), cache: Arc::new(RwLock::new(HashMap::new())), } } /// Infer action from input fields when LLM omits the `action` field. /// Many LLMs (especially non-OpenAI models like glm) call tools without /// including the enum tag, e.g. sending `{"query": "search terms"}` instead /// of `{"action": "search", "query": "search terms"}`. 
fn infer_action(input: &Value) -> Result { // Debug: log all keys in the input let keys: Vec<&str> = input.as_object() .map(|obj| obj.keys().map(|k| k.as_str()).collect()) .unwrap_or_default(); tracing::debug!(target: "researcher", ?keys, %input, "infer_action examining input"); // Check for action field with wrong value if let Some(action) = input.get("action").and_then(|v| v.as_str()) { if action == "search" || action == "report" { if let Some(query_val) = input.get("query") { let query = Self::parse_query(query_val); if !query.query.trim().is_empty() { return Ok(if action == "report" { ResearcherAction::Report { query } } else { ResearcherAction::Search { query } }); } } } if action == "fetch" { if let Some(url) = input.get("url").and_then(|v| v.as_str()) { return Ok(ResearcherAction::Fetch { url: url.to_string() }); } } } // Has "url" (singular) → fetch if let Some(url) = input.get("url").and_then(|v| v.as_str()) { if !url.is_empty() && url.starts_with("http") { return Ok(ResearcherAction::Fetch { url: url.to_string() }); } } // Has "urls" (plural) → summarize if let Some(urls) = input.get("urls").and_then(|v| v.as_array()) { let url_list: Vec = urls.iter() .filter_map(|v| v.as_str().map(|s| s.to_string())) .collect(); if !url_list.is_empty() { return Ok(ResearcherAction::Summarize { urls: url_list }); } } // Has "query" → search if let Some(query_val) = input.get("query") { let query = Self::parse_query(query_val); if !query.query.trim().is_empty() { return Ok(ResearcherAction::Search { query }); } } // Has "search" or "search_query" → search for key in &["search", "search_query", "keyword", "keywords", "q", "text"] { if let Some(val) = input.get(key) { let query = Self::parse_query(val); if !query.query.trim().is_empty() { return Ok(ResearcherAction::Search { query }); } } } // Check for injected fallback query from loop_runner (when LLM sends empty args) if let Some(fallback) = input.get("_fallback_query").and_then(|v| v.as_str()) { if 
!fallback.trim().is_empty() { tracing::debug!(target: "researcher", query = %fallback, "Using fallback user message as search query"); return Ok(ResearcherAction::Search { query: ResearchQuery { query: fallback.to_string(), engine: SearchEngine::Auto, depth: ResearchDepth::Standard, max_results: 10, include_related: false, time_limit_secs: 60, }}); } } // Last resort: if any string field looks like a search query if let Some(obj) = input.as_object() { for (key, val) in obj { if let Some(s) = val.as_str() { if s.len() > 2 && !s.starts_with("http") && key != "action" && key != "engine" { tracing::debug!(target: "researcher", key = %key, value = %s, "Using fallback field as query"); return Ok(ResearcherAction::Search { query: ResearchQuery { query: s.to_string(), engine: SearchEngine::Auto, depth: ResearchDepth::Standard, max_results: 10, include_related: false, time_limit_secs: 60, }}); } } } } Err(zclaw_types::ZclawError::HandError( "无法识别搜索意图:请提供 query(搜索)或 url(获取网页)参数".to_string() )) } fn parse_query(query_val: &Value) -> ResearchQuery { if query_val.is_string() { ResearchQuery { query: query_val.as_str().unwrap_or("").to_string(), engine: SearchEngine::Auto, depth: ResearchDepth::Standard, max_results: 10, include_related: false, time_limit_secs: 60, } } else { serde_json::from_value(query_val.clone()).unwrap_or_else(|_| ResearchQuery { query: query_val.get("query") .or_else(|| query_val.get("search")) .or_else(|| query_val.get("q")) .or_else(|| query_val.get("keyword")) .and_then(|v| v.as_str()) .unwrap_or("") .to_string(), engine: SearchEngine::Auto, depth: ResearchDepth::Standard, max_results: 10, include_related: false, time_limit_secs: 60, }) } } /// Execute a web search — route to the configured backend async fn execute_search(&self, query: &ResearchQuery) -> Result> { query.validate().map_err(|e| zclaw_types::ZclawError::HandError(e))?; let max_results = query.max_results.min(MAX_RESULTS_CAP); let start = std::time::Instant::now(); let engine = match 
&query.engine { SearchEngine::Auto => &self.search_config.default_engine, other => other, }; let results = match engine { SearchEngine::SearXNG => { match self.search_searxng(&query.query, max_results).await { Ok(r) if !r.is_empty() => r, _ => self.search_native(&query.query, max_results).await?, } } SearchEngine::Auto => { self.search_native(&query.query, max_results).await? } SearchEngine::DuckDuckGo => { // DDG在国内不可用,降级到百度 tracing::warn!(target: "researcher", "DuckDuckGo在国内不可用,降级到百度"); self.search_baidu(&query.query, max_results).await? } SearchEngine::Google => { tracing::warn!(target: "researcher", "Google在国内不可用,降级到百度"); self.search_baidu(&query.query, max_results).await? } SearchEngine::Bing => { self.search_bing(&query.query, max_results).await? } }; let duration = start.elapsed().as_millis() as u64; tracing::info!( target: "researcher", query = %query.query, engine = ?engine, duration_ms = duration, results_count = results.len(), "Search completed" ); Ok(results) } /// Rust-native multi-engine search — optimized for China mainland users /// Priority: Baidu + Bing CN (both always work in China) /// DuckDuckGo as optional fallback (may be blocked by GFW) async fn search_native(&self, query: &str, max_results: usize) -> Result> { let mut all_results = Vec::new(); // Always use Baidu + Bing CN in parallel (both work in China) let baidu_fut = self.search_baidu(query, max_results); let bing_fut = self.search_bing(query, max_results); let (baidu_res, bing_res) = tokio::join!( async { baidu_fut.await }, async { bing_fut.await }, ); if let Ok(r) = baidu_res { all_results.extend(r); } if let Ok(r) = bing_res { all_results.extend(r); } // If both primary engines returned nothing, try DDG as last resort if all_results.is_empty() { tracing::info!(target: "researcher", "Primary engines empty, trying DuckDuckGo as fallback"); if let Ok(r) = self.search_duckduckgo_html(query, max_results).await { all_results.extend(r); } } // Deduplicate by URL let mut seen_urls = 
std::collections::HashSet::new(); all_results.retain(|r| seen_urls.insert(r.url.to_lowercase())); // Sort by relevance descending, take top N all_results.sort_by(|a, b| b.relevance.cmp(&a.relevance)); all_results.truncate(max_results); if all_results.is_empty() { tracing::warn!(target: "researcher", "All native engines returned empty for query: {}", query); } Ok(all_results) } /// Search using SearXNG meta-search engine (aggregates 70+ engines) async fn search_searxng(&self, query: &str, max_results: usize) -> Result> { let url = format!( "{}/search?q={}&format=json&categories=general&language=auto&pageno=1", self.search_config.searxng_url.trim_end_matches('/'), url_encode(query) ); let response = self.client .get(&url) .timeout(std::time::Duration::from_secs(self.search_config.timeout_secs)) .send() .await .map_err(|e| zclaw_types::ZclawError::HandError( format!("SearXNG request failed: {}", e) ))?; let status = response.status(); if !status.is_success() { return Err(zclaw_types::ZclawError::HandError( format!("SearXNG returned HTTP {}", status) )); } let json: Value = response.json().await .map_err(|e| zclaw_types::ZclawError::HandError( format!("Failed to parse SearXNG response: {}", e) ))?; let mut results = Vec::new(); if let Some(items) = json.get("results").and_then(|v| v.as_array()) { for item in items.iter().take(max_results) { let title = item.get("title") .and_then(|v| v.as_str()) .unwrap_or("") .to_string(); let url = item.get("url") .and_then(|v| v.as_str()) .unwrap_or("") .to_string(); let snippet = item.get("content") .and_then(|v| v.as_str()) .unwrap_or("") .to_string(); let engines = item.get("engines") .and_then(|v| v.as_array()) .map(|arr| { arr.iter() .filter_map(|e| e.as_str()) .collect::>() .join(",") }) .unwrap_or_default(); let score = item.get("score") .and_then(|v| v.as_f64()) .unwrap_or(0.0); // Normalize score to 0-100 range let relevance = if score > 0.0 { (score.min(10.0) * 10.0) as u8 } else { 50 }; if !title.is_empty() && 
!url.is_empty() { results.push(SearchResult { title, url, snippet, source: if engines.is_empty() { "SearXNG".to_string() } else { format!("SearXNG({})", engines) }, relevance, content: None, fetched_at: Some(chrono::Utc::now().to_rfc3339()), }); } } } Ok(results) } /// Search using DuckDuckGo HTML (POST method, matching ddgs library behavior) async fn search_duckduckgo_html(&self, query: &str, max_results: usize) -> Result> { let has_cjk = query.chars().any(|c| is_cjk_char(c)); let region = if has_cjk { "wt-wt" } else { "wt-wt" }; let body = format!("q={}&b=&l={}", url_encode(query), region); let response = self.client .post("https://html.duckduckgo.com/html/") .header("Content-Type", "application/x-www-form-urlencoded") .header("Accept", "text/html,application/xhtml+xml") .header("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8") .body(body) .send() .await .map_err(|e| zclaw_types::ZclawError::HandError( format!("DuckDuckGo HTML search failed: {}", e) ))?; let status = response.status(); if !status.is_success() { return Err(zclaw_types::ZclawError::HandError( format!("DuckDuckGo returned HTTP {}", status) )); } let html = response.text().await .map_err(|e| zclaw_types::ZclawError::HandError( format!("Failed to read DuckDuckGo response: {}", e) ))?; Ok(self.parse_ddg_html(&html, max_results)) } /// Parse DuckDuckGo HTML search results page fn parse_ddg_html(&self, html: &str, max_results: usize) -> Vec { let mut results = Vec::new(); for block in html.split("class=\"result__body\"") { if results.len() >= max_results { break; } // Find the result title link: Title let title_link = match extract_between(block, "result__a", "") { Some(s) => s, None => continue, }; // title_link is like: href="//duckduckgo.com/l/?uddg=...">Title Text let title = title_link.rsplit('>').next() .map(|s| strip_html_tags(s).trim().to_string()) .unwrap_or_default(); let url = extract_href_uddg(block).unwrap_or_default(); let snippet = extract_between(block, "result__snippet", "") .map(|s| { 
s.rsplit('>').next() .map(|t| strip_html_tags(t).trim().to_string()) .unwrap_or_default() }) .unwrap_or_default(); if title.is_empty() || url.is_empty() { continue; } if !is_quality_result(&title, &snippet, &url) { continue; } results.push(SearchResult { title, url, snippet, source: "DuckDuckGo".to_string(), relevance: 70, content: None, fetched_at: Some(chrono::Utc::now().to_rfc3339()), }); } results } /// Search using Bing (works well for both Chinese and English) async fn search_bing(&self, query: &str, max_results: usize) -> Result> { let has_cjk = query.chars().any(|c| is_cjk_char(c)); let url = if has_cjk { format!( "https://cn.bing.com/search?q={}&count={}&setlang=zh-Hans", url_encode(query), max_results ) } else { format!( "https://www.bing.com/search?q={}&count={}", url_encode(query), max_results ) }; let response = self.client .get(&url) .header("Accept", "text/html,application/xhtml+xml") .header("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8") .send() .await .map_err(|e| zclaw_types::ZclawError::HandError( format!("Bing search failed: {}", e) ))?; let status = response.status(); if !status.is_success() { return Err(zclaw_types::ZclawError::HandError( format!("Bing returned HTTP {}", status) )); } let html = response.text().await .map_err(|e| zclaw_types::ZclawError::HandError( format!("Failed to read Bing response: {}", e) ))?; Ok(self.parse_bing_html(&html, max_results)) } /// Parse Bing HTML search results page fn parse_bing_html(&self, html: &str, max_results: usize) -> Vec { let mut results = Vec::new(); // Bing results are in
  • for block in html.split("class=\"b_algo\"") { if results.len() >= max_results { break; } // Extract title from first inside the block let title = extract_between(block, ">", "") .map(|s| strip_html_tags(s).trim().to_string()) .unwrap_or_default(); // Extract URL from href attribute of first let url = extract_href(block).unwrap_or_default(); // Extract snippet from

    ...

    or

    let snippet = extract_between(block, "

    ", "

    ") .or_else(|| extract_between(block, "b_caption", "
    ")) .map(|s| strip_html_tags(s).trim().to_string()) .unwrap_or_default(); if title.is_empty() || url.is_empty() { continue; } // Skip Bing internal URLs if url.contains("bing.com/search") || url.contains("go.microsoft.com") { continue; } if !is_quality_result(&title, &snippet, &url) { continue; } results.push(SearchResult { title, url, snippet, source: "Bing".to_string(), relevance: 75, content: None, fetched_at: Some(chrono::Utc::now().to_rfc3339()), }); } results } /// Search using Baidu (essential for Chinese content) async fn search_baidu(&self, query: &str, max_results: usize) -> Result> { let url = format!( "https://www.baidu.com/s?wd={}&rn={}", url_encode(query), max_results ); let response = self.client .get(&url) .header("Accept", "text/html,application/xhtml+xml") .header("Accept-Language", "zh-CN,zh;q=0.9") .send() .await .map_err(|e| zclaw_types::ZclawError::HandError( format!("Baidu search failed: {}", e) ))?; let status = response.status(); if !status.is_success() { return Err(zclaw_types::ZclawError::HandError( format!("Baidu returned HTTP {}", status) )); } let html = response.text().await .map_err(|e| zclaw_types::ZclawError::HandError( format!("Failed to read Baidu response: {}", e) ))?; Ok(self.parse_baidu_html(&html, max_results)) } /// Parse Baidu HTML search results page fn parse_baidu_html(&self, html: &str, max_results: usize) -> Vec { let mut results = Vec::new(); // Baidu uses multiple class patterns: "result c-container", "c-container new-pmd", "result-op c-container" let blocks: Vec<&str> = html.split("c-container") .enumerate() .filter_map(|(i, block)| { if i == 0 { return None; } if block.contains("href=\"http") { Some(block) } else { None } }) .collect(); for block in &blocks { if results.len() >= max_results { break; } let title = extract_between(block, ">", "
    ") .map(|s| strip_html_tags(s).trim().to_string()) .unwrap_or_default(); let url = extract_href(block).unwrap_or_default(); let snippet = extract_between(block, "c-abstract", "") .or_else(|| extract_between(block, "content-right_", "")) .map(|s| strip_html_tags(s).trim().to_string()) .unwrap_or_default(); if title.is_empty() || url.is_empty() { continue; } if !is_quality_result(&title, &snippet, &url) { continue; } results.push(SearchResult { title, url, snippet, source: "Baidu".to_string(), relevance: 80, content: None, fetched_at: Some(chrono::Utc::now().to_rfc3339()), }); } results } /// Fetch content from a URL (with SSRF protection) /// Tries Jina Reader API first for clean Markdown, falls back to direct fetch async fn execute_fetch(&self, url: &str) -> Result { let start = std::time::Instant::now(); // SSRF validation validate_fetch_url(url)?; // Check cache first { let cache = self.cache.read().await; if let Some(cached) = cache.get(url) { if cached.content.is_some() { return Ok(cached.clone()); } } } // Try Jina Reader API first (returns clean Markdown) let content = match self.fetch_via_jina(url).await { Ok(text) => text, Err(e) => { tracing::warn!(target: "researcher", error = %e, "Jina Reader failed, falling back to direct fetch"); self.fetch_direct(url).await? 
} }; let result = SearchResult { title: url.to_string(), url: url.to_string(), snippet: content.chars().take(500).collect(), source: url.to_string(), relevance: 100, content: Some(content), fetched_at: Some(chrono::Utc::now().to_rfc3339()), }; // Cache the result (with capacity limit) { let mut cache = self.cache.write().await; if cache.len() >= CACHE_MAX_ENTRIES { // Simple eviction: remove first entry if let Some(key) = cache.keys().next().cloned() { cache.remove(&key); } } cache.insert(url.to_string(), result.clone()); } let duration = start.elapsed().as_millis() as u64; tracing::info!( target: "researcher", url = url, duration_ms = duration, "Fetch completed" ); Ok(result) } /// Fetch content via Jina Reader API — returns clean Markdown (DeerFlow pattern) async fn fetch_via_jina(&self, url: &str) -> Result { let client = reqwest::Client::builder() .timeout(std::time::Duration::from_secs(20)) .build() .unwrap_or_else(|_| reqwest::Client::new()); let mut builder = client .post("https://r.jina.ai/") .header("Content-Type", "application/json") .header("X-Return-Format", "markdown") .header("X-Timeout", "15") .json(&serde_json::json!({ "url": url })); // Optional API key for higher rate limits if let Some(ref key) = self.search_config.jina_api_key { builder = builder.header("Authorization", format!("Bearer {}", key)); } let response = builder.send().await .map_err(|e| zclaw_types::ZclawError::HandError( format!("Jina Reader request failed: {}", e) ))?; let status = response.status(); if !status.is_success() { return Err(zclaw_types::ZclawError::HandError( format!("Jina Reader returned HTTP {}", status) )); } let text = response.text().await .map_err(|e| zclaw_types::ZclawError::HandError( format!("Failed to read Jina response: {}", e) ))?; if text.trim().is_empty() { return Err(zclaw_types::ZclawError::HandError( "Jina Reader returned empty response".to_string() )); } // Truncate to 4096 chars (DeerFlow pattern) let truncated: String = 
text.chars().take(4096).collect(); Ok(truncated) } /// Direct HTTP fetch with HTML text extraction (fallback when Jina unavailable) async fn fetch_direct(&self, url: &str) -> Result { let response = self.client .get(url) .send() .await .map_err(|e| zclaw_types::ZclawError::HandError(format!("Direct fetch failed: {}", e)))?; let content_type = response.headers() .get(reqwest::header::CONTENT_TYPE) .and_then(|v| v.to_str().ok()) .unwrap_or(""); let content = if content_type.contains("text/html") { let html = response.text().await .map_err(|e| zclaw_types::ZclawError::HandError(format!("Failed to read HTML: {}", e)))?; self.extract_text_from_html(&html) } else if content_type.contains("text/") || content_type.contains("application/json") { response.text().await .map_err(|e| zclaw_types::ZclawError::HandError(format!("Failed to read text: {}", e)))? } else { "[Binary content]".to_string() }; Ok(content) } /// Extract readable text from HTML fn extract_text_from_html(&self, html: &str) -> String { let html_lower = html.to_lowercase(); let mut text = String::new(); let mut in_tag = false; let mut in_script = false; let mut in_style = false; let mut pos: usize = 0; for c in html.chars() { let char_len = c.len_utf8(); match c { '<' => { // Check for closing tags before entering tag mode let remaining = &html_lower[pos..]; if remaining.starts_with("' => { in_tag = false; } _ if in_tag => {} _ if in_script || in_style => {} ' ' | '\n' | '\t' | '\r' => { if !text.ends_with(' ') && !text.is_empty() { text.push(' '); } } _ => text.push(c), } pos += char_len; } if text.len() > 10000 { text.truncate(10000); text.push_str("..."); } text.trim().to_string() } /// Generate a comprehensive research report async fn execute_report(&self, query: &ResearchQuery) -> Result { let start = std::time::Instant::now(); // First, execute search let mut results = self.execute_search(query).await?; // Fetch content for top results let fetch_limit = match query.depth { ResearchDepth::Quick => 1, 
ResearchDepth::Standard => 3, ResearchDepth::Deep => 5, }; for result in results.iter_mut().take(fetch_limit) { if !result.url.is_empty() { match self.execute_fetch(&result.url).await { Ok(fetched) => { result.content = fetched.content; result.fetched_at = fetched.fetched_at; } Err(e) => { tracing::warn!(target: "researcher", error = %e, "Failed to fetch content"); } } } } // Extract key findings let key_findings: Vec = results.iter() .take(5) .filter_map(|r| { r.content.as_ref().map(|c| { c.split(". ") .take(3) .collect::>() .join(". ") }) }) .collect(); // Extract related topics from snippets let related_topics: Vec = results.iter() .filter_map(|r| { if r.snippet.len() > 50 { Some(r.title.clone()) } else { None } }) .take(5) .collect(); let duration = start.elapsed().as_millis() as u64; // Generate summary from top results let summary = if results.is_empty() { "未找到相关结果,建议调整搜索关键词后重试".to_string() } else { let top_snippets: Vec<&str> = results .iter() .take(3) .filter_map(|r| { let s = r.snippet.trim(); if s.is_empty() { None } else { Some(s) } }) .collect(); if top_snippets.is_empty() { format!("找到 {} 条相关结果,但无摘要信息", results.len()) } else { format!( "基于 {} 条搜索结果:{}", results.len(), top_snippets.join(";") ) } }; Ok(ResearchReport { query: query.query.clone(), results, summary: Some(summary), key_findings, related_topics, researched_at: chrono::Utc::now().to_rfc3339(), duration_ms: duration, }) } } impl Default for ResearcherHand { fn default() -> Self { Self::new() } } #[async_trait] impl Hand for ResearcherHand { fn config(&self) -> &HandConfig { &self.config } async fn execute(&self, _context: &HandContext, input: Value) -> Result { tracing::debug!(target: "researcher", input = %input, "Researcher hand received input"); // Try strict deserialization first, then fall back to inference let action: ResearcherAction = match serde_json::from_value(input.clone()) { Ok(a) => a, Err(e) => { tracing::debug!(target: "researcher", error = %e, input = %input, "Strict 
/// Percent-encode a string for use in a URL query component.
///
/// ASCII letters, digits and `-`, `_`, `.`, `~` are copied through unchanged;
/// every other byte of the UTF-8 encoding is written as `%XX` with uppercase
/// hexadecimal digits.
fn url_encode(s: &str) -> String {
    let mut encoded = String::with_capacity(s.len());
    for byte in s.bytes() {
        let is_unreserved =
            byte.is_ascii_alphanumeric() || matches!(byte, b'-' | b'_' | b'.' | b'~');
        if is_unreserved {
            encoded.push(char::from(byte));
        } else {
            encoded.push_str(&format!("%{:02X}", byte));
        }
    }
    encoded
}
Err(ssrf_err("link-local/metadata")); } if o[0] == 0 { return Err(ssrf_err("0.x.x.x")); } if *ip == Ipv4Addr::new(255, 255, 255, 255) { return Err(ssrf_err("broadcast")); } if (224..=239).contains(&o[0]) { return Err(ssrf_err("multicast")); } Ok(()) } fn validate_ipv6(ip: &Ipv6Addr) -> Result<()> { if *ip == Ipv6Addr::LOCALHOST { return Err(ssrf_err("IPv6 loopback")); } if *ip == Ipv6Addr::UNSPECIFIED { return Err(ssrf_err("IPv6 unspecified")); } let segs = ip.segments(); // IPv4-mapped: ::ffff:x.x.x.x if segs[5] == 0xffff { let v4 = ((segs[6] as u32) << 16) | (segs[7] as u32); validate_ipv4(&Ipv4Addr::from(v4))?; } // Link-local fe80::/10 if (segs[0] & 0xffc0) == 0xfe80 { return Err(ssrf_err("IPv6 link-local")); } // Unique local fc00::/7 if (segs[0] & 0xfe00) == 0xfc00 { return Err(ssrf_err("IPv6 unique local")); } Ok(()) } fn validate_hostname(host: &str) -> Result<()> { let h = host.to_lowercase(); let blocked = [ "localhost", "localhost.localdomain", "ip6-localhost", "ip6-loopback", "metadata.google.internal", "metadata", "kubernetes.default", "kubernetes.default.svc", ]; for b in &blocked { if h == *b || h.ends_with(&format!(".{}", b)) { return Err(ssrf_err(&format!("blocked host '{}'", host))); } } // Decimal IP bypass: 2130706433 = 127.0.0.1 if h.chars().all(|c| c.is_ascii_digit()) { if let Ok(num) = h.parse::() { validate_ipv4(&Ipv4Addr::from(num))?; } } Ok(()) } fn ssrf_err(reason: &str) -> zclaw_types::ZclawError { zclaw_types::ZclawError::HandError(format!("Access denied: {}", reason)) } /// Extract text between two delimiters fn extract_between<'a>(text: &'a str, start: &str, end: &str) -> Option<&'a str> { let start_idx = text.find(start)?; let rest = &text[start_idx + start.len()..]; let end_idx = rest.find(end)?; Some(&rest[..end_idx]) } /// Strip HTML tags from a string fn strip_html_tags(s: &str) -> String { let mut result = String::with_capacity(s.len()); let mut in_tag = false; for c in s.chars() { match c { '<' => in_tag = true, '>' => in_tag = 
/// Check if a search result is likely genuine (not navigation/ad/script garbage)
///
/// A result is rejected when:
/// - the trimmed title is shorter than 2 or longer than 300 characters
///   (counted as Unicode scalar values, so CJK titles are measured fairly),
/// - the title contains JavaScript/CSS/analytics markers,
/// - the URL contains a `javascript:` or `data:` scheme (case-insensitive),
///   or is only a fragment or a root-relative path,
/// - the snippet looks like source code.
fn is_quality_result(title: &str, snippet: &str, url: &str) -> bool {
    // Title quality checks. Count characters, not bytes: a single CJK
    // character is three bytes and would otherwise pass the minimum.
    let title = title.trim();
    let title_chars = title.chars().count();
    if title_chars < 2 || title_chars > 300 {
        return false;
    }

    // Reject titles with JavaScript/CSS indicators
    const TITLE_MARKERS: [&str; 13] = [
        "function(",
        "var ",
        "const ",
        "window.",
        "document.",
        "{",
        "}",
        "cookie",
        "navigator.",
        ".css",
        "stylesheet",
        "google-analytics",
        "gtag",
    ];
    let lower = title.to_lowercase();
    if TITLE_MARKERS.iter().any(|m| lower.contains(m))
        || lower.starts_with("//")
        || lower.starts_with("/*")
    {
        return false;
    }

    // URL quality checks. Schemes are case-insensitive, so compare lowercased.
    let url_lower = url.to_ascii_lowercase();
    if url_lower.contains("javascript:") || url_lower.contains("data:") {
        return false;
    }

    // Reject URLs that are just fragments or relative paths
    // (protocol-relative `//host/...` URLs are allowed).
    if url.starts_with('#') || (url.starts_with('/') && !url.starts_with("//")) {
        return false;
    }

    // Snippet quality — if snippet looks like code, reject
    let snippet_lower = snippet.to_lowercase();
    let looks_like_function =
        snippet_lower.contains("function(") && snippet_lower.contains("return ");
    let looks_like_assignment = snippet_lower.contains("var ") && snippet_lower.contains("=");

    !(looks_like_function || looks_like_assignment)
}

/// Extract href URL from the first `href="..."` attribute in text
///
/// Returns the URL when it starts with `http`; a protocol-relative URL
/// (`//host/...`) is returned with `https:` prepended. Any other value, or a
/// missing/unterminated attribute, yields `None`.
fn extract_href(text: &str) -> Option<String> {
    const ATTR: &str = "href=\"";
    let value_start = text.find(ATTR)? + ATTR.len();
    let rest = &text[value_start..];
    let url = &rest[..rest.find('"')?];

    if url.starts_with("http") {
        Some(url.to_string())
    } else if url.starts_with("//") {
        Some(format!("https:{}", url))
    } else {
        None
    }
}

/// Extract the real URL from DDG's redirect link (uddg= parameter)
///
/// The `uddg=` value ends at the next `&`, quote, angle bracket or whitespace,
/// so it is extracted correctly even when it is the last query parameter
/// inside an `href="..."` attribute. If no `uddg=` parameter is present, or
/// its decoded value does not start with `http`, this falls back to
/// `extract_href`.
fn extract_href_uddg(text: &str) -> Option<String> {
    const PARAM: &str = "uddg=";

    if let Some(idx) = text.find(PARAM) {
        let rest = &text[idx + PARAM.len()..];
        // None of these delimiters can appear unescaped inside a
        // percent-encoded value, so the first one ends the parameter.
        let url_encoded = rest
            .split(|c: char| matches!(c, '&' | '"' | '\'' | '<' | '>') || c.is_whitespace())
            .next()
            .unwrap_or("");
        let decoded = percent_decode(url_encoded);
        if decoded.starts_with("http") {
            return Some(decoded);
        }
    }

    // Fallback: try regular href extraction
    extract_href(text)
}

/// Standard percent-decode a URL-encoded string
///
/// Every `%XX` sequence with two hexadecimal digits is replaced by the byte
/// it encodes; anything else (including a lone or malformed `%`) is copied
/// through unchanged. The decoded bytes are converted to a `String` lossily,
/// so invalid UTF-8 becomes U+FFFD instead of causing an error.
///
/// Works on bytes throughout, so it never panics on a `%` followed by a
/// multi-byte character.
fn percent_decode(input: &str) -> String {
    /// Value of one ASCII hex digit, or `None` for any other byte.
    fn hex_value(byte: u8) -> Option<u8> {
        match byte {
            b'0'..=b'9' => Some(byte - b'0'),
            b'a'..=b'f' => Some(byte - b'a' + 10),
            b'A'..=b'F' => Some(byte - b'A' + 10),
            _ => None,
        }
    }

    let bytes = input.as_bytes();
    let mut decoded = Vec::with_capacity(bytes.len());
    let mut i = 0;

    while i < bytes.len() {
        if bytes[i] == b'%' {
            let hi = bytes.get(i + 1).copied().and_then(hex_value);
            let lo = bytes.get(i + 2).copied().and_then(hex_value);
            if let (Some(hi), Some(lo)) = (hi, lo) {
                decoded.push((hi << 4) | lo);
                i += 3;
                continue;
            }
        }
        decoded.push(bytes[i]);
        i += 1;
    }

    String::from_utf8_lossy(&decoded).into_owned()
}
{ let depth: ResearchDepth = serde_json::from_str("\"quick\"").unwrap(); assert!(matches!(depth, ResearchDepth::Quick)); } #[test] fn test_search_engine_serialize_roundtrip() { for engine in [SearchEngine::SearXNG, SearchEngine::Google, SearchEngine::Bing, SearchEngine::DuckDuckGo, SearchEngine::Auto] { let json = serde_json::to_string(&engine).unwrap(); let back: SearchEngine = serde_json::from_str(&json).unwrap(); assert_eq!(json, serde_json::to_string(&back).unwrap()); } } // --- Action Deserialization Tests --- #[test] fn test_action_search_deserialize() { let json = json!({ "action": "search", "query": { "query": "Rust programming", "engine": "duckduckgo", "depth": "quick", "maxResults": 5 } }); let action: ResearcherAction = serde_json::from_value(json).unwrap(); match action { ResearcherAction::Search { query } => { assert_eq!(query.query, "Rust programming"); assert!(matches!(query.engine, SearchEngine::DuckDuckGo)); assert!(matches!(query.depth, ResearchDepth::Quick)); assert_eq!(query.max_results, 5); } _ => panic!("Expected Search action"), } } #[test] fn test_action_fetch_deserialize() { let json = json!({ "action": "fetch", "url": "https://example.com/page" }); let action: ResearcherAction = serde_json::from_value(json).unwrap(); match action { ResearcherAction::Fetch { url } => { assert_eq!(url, "https://example.com/page"); } _ => panic!("Expected Fetch action"), } } #[test] fn test_action_report_deserialize() { let json = json!({ "action": "report", "query": { "query": "AI trends 2026", "depth": "deep" } }); let action: ResearcherAction = serde_json::from_value(json).unwrap(); match action { ResearcherAction::Report { query } => { assert_eq!(query.query, "AI trends 2026"); assert!(matches!(query.depth, ResearchDepth::Deep)); } _ => panic!("Expected Report action"), } } #[test] fn test_action_invalid_rejected() { let json = json!({ "action": "unknown_action", "data": "whatever" }); let result: std::result::Result = serde_json::from_value(json); 
assert!(result.is_err()); } // --- URL Encoding Tests --- #[test] fn test_url_encode_ascii() { assert_eq!(url_encode("hello world"), "hello%20world"); } #[test] fn test_url_encode_chinese() { // "医" = UTF-8 bytes E5 8C BB → must produce %E5%8C%BB, not %533B let encoded = url_encode("医"); assert_eq!(encoded, "%E5%8C%BB"); // Full phrase: "中文" = E4 B8 AD E6 96 87 let encoded = url_encode("中文搜索"); assert_eq!(&encoded[0..9], "%E4%B8%AD"); assert!(!encoded.contains("中文")); } #[test] fn test_url_encode_safe_chars() { assert_eq!(url_encode("abc123-_."), "abc123-_.".to_string()); } #[test] fn test_url_encode_empty() { assert_eq!(url_encode(""), ""); } // --- HTML Text Extraction Tests --- #[test] fn test_extract_text_basic() { let hand = create_test_hand(); let html = "

    Title

    Content here

    "; let text = hand.extract_text_from_html(html); assert!(text.contains("Title")); assert!(text.contains("Content here")); } #[test] fn test_extract_text_strips_scripts() { let hand = create_test_hand(); let html = "

    Safe text

    "; let text = hand.extract_text_from_html(html); assert!(!text.contains("alert")); assert!(text.contains("Safe text")); } #[test] fn test_extract_text_strips_styles() { let hand = create_test_hand(); let html = "

    Visible

    "; let text = hand.extract_text_from_html(html); assert!(!text.contains("color")); assert!(text.contains("Visible")); } #[test] fn test_extract_text_truncates_long_content() { let hand = create_test_hand(); let long_body: String = "x".repeat(20000); let html = format!("

    {}

    ", long_body); let text = hand.extract_text_from_html(&html); assert!(text.len() <= 10003); // 10000 + "..." } #[test] fn test_extract_text_empty_body() { let hand = create_test_hand(); let html = ""; let text = hand.extract_text_from_html(html); assert!(text.is_empty()); } // --- Hand Trait Tests --- #[tokio::test] async fn test_needs_approval_is_false() { let hand = create_test_hand(); assert!(!hand.needs_approval()); } #[tokio::test] async fn test_status_is_idle() { let hand = create_test_hand(); assert!(matches!(hand.status(), crate::HandStatus::Idle)); } #[tokio::test] async fn test_check_dependencies_ok() { let hand = create_test_hand(); let missing = hand.check_dependencies().unwrap(); // Default is_dependency_available returns true for all assert!(missing.is_empty()); } // --- Default Values Tests --- #[test] fn test_research_query_defaults() { let json = json!({ "query": "test" }); let query: ResearchQuery = serde_json::from_value(json).unwrap(); assert_eq!(query.query, "test"); assert!(matches!(query.engine, SearchEngine::Auto)); assert!(matches!(query.depth, ResearchDepth::Standard)); assert_eq!(query.max_results, 10); assert_eq!(query.time_limit_secs, 60); assert!(!query.include_related); } #[test] fn test_search_result_serialization() { let result = SearchResult { title: "Test".to_string(), url: "https://example.com".to_string(), snippet: "A snippet".to_string(), source: "TestSource".to_string(), relevance: 90, content: None, fetched_at: None, }; let json = serde_json::to_string(&result).unwrap(); assert!(json.contains("Test")); assert!(json.contains("https://example.com")); } #[test] fn test_research_report_summary_is_some_when_results() { // Verify the struct allows Some value let report = ResearchReport { query: "test".to_string(), results: vec![SearchResult { title: "R".to_string(), url: "https://r.co".to_string(), snippet: "snippet text".to_string(), source: "S".to_string(), relevance: 80, content: None, fetched_at: None, }], summary: Some("基于 
1 条搜索结果:snippet text".to_string()), key_findings: vec![], related_topics: vec![], researched_at: "2026-01-01T00:00:00Z".to_string(), duration_ms: 100, }; assert!(report.summary.is_some()); assert!(report.summary.unwrap().contains("snippet text")); } // --- SearchConfig Tests --- #[test] fn test_search_config_default() { let config = SearchConfig::default(); assert!(matches!(config.default_engine, SearchEngine::Auto)); assert_eq!(config.searxng_url, "http://localhost:8888"); assert_eq!(config.timeout_secs, 15); } #[test] fn test_search_config_load_fallback_on_missing_file() { // Config loads from config/config.toml which may not exist in test CWD let config = SearchConfig::load(); // Should return a valid config either way assert!(!config.searxng_url.is_empty()); } // --- SearXNG Response Parsing Tests --- #[test] fn test_searxng_response_parse() { let mock_response = json!({ "query": "Rust programming", "number_of_results": 42, "results": [ { "url": "https://www.rust-lang.org/", "title": "Rust Programming Language", "content": "A language empowering everyone to build reliable software.", "engine": "google", "engines": ["google", "duckduckgo"], "score": 5.2, "category": "general" }, { "url": "https://doc.rust-lang.org/book/", "title": "The Rust Book", "content": "The official guide to Rust programming.", "engine": "bing", "engines": ["bing"], "score": 3.1, "category": "general" } ], "suggestions": ["rust tutorial", "rust vs go"] }); let results = mock_response.get("results").unwrap().as_array().unwrap(); assert_eq!(results.len(), 2); // Verify first result mapping let r0 = &results[0]; assert_eq!(r0["title"].as_str().unwrap(), "Rust Programming Language"); assert_eq!(r0["url"].as_str().unwrap(), "https://www.rust-lang.org/"); assert_eq!(r0["content"].as_str().unwrap(), "A language empowering everyone to build reliable software."); let engines: Vec<&str> = r0["engines"].as_array().unwrap() .iter().filter_map(|e| e.as_str()).collect(); assert_eq!(engines, 
vec!["google", "duckduckgo"]); } #[test] fn test_searxng_empty_results() { let mock_response = json!({ "query": "nonexistent xyzzy123", "number_of_results": 0, "results": [], "suggestions": [] }); let results = mock_response.get("results").unwrap().as_array().unwrap(); assert!(results.is_empty()); } #[test] fn test_searxng_score_normalization() { // Score 5.2 → (5.2 * 10) = 52 → relevance 52 let score = 5.2_f64; let relevance = if score > 0.0 { (score.min(10.0) * 10.0) as u8 } else { 50 }; assert_eq!(relevance, 52); // Score 15.0 → clamped to 10.0 → relevance 100 let score = 15.0_f64; let relevance = if score > 0.0 { (score.min(10.0) * 10.0) as u8 } else { 50 }; assert_eq!(relevance, 100); // Score 0.0 → default relevance 50 let score = 0.0_f64; let relevance = if score > 0.0 { (score.min(10.0) * 10.0) as u8 } else { 50 }; assert_eq!(relevance, 50); } #[test] fn test_searxng_url_construction() { let config = SearchConfig::default(); let query = "2024年中国医疗政策"; let url = format!( "{}/search?q={}&format=json&categories=general&language=auto&pageno=1", config.searxng_url.trim_end_matches('/'), url_encode(query) ); assert!(url.starts_with("http://localhost:8888/search?")); assert!(url.contains("format=json")); assert!(url.contains("categories=general")); assert!(url.contains("%E4%B8%AD")); // 中 = E4 B8 AD assert!(!url.contains("%4E2D")); // NOT Unicode codepoint } // --- Native Search Helper Tests --- #[test] fn test_is_cjk_char_chinese() { assert!(is_cjk_char('中')); assert!(is_cjk_char('医')); assert!(is_cjk_char('。')); assert!(!is_cjk_char('a')); assert!(!is_cjk_char('1')); assert!(!is_cjk_char(' ')); } #[test] fn test_is_cjk_char_detects_chinese_query() { let query = "2024年中国医疗政策"; assert!(query.chars().any(|c| is_cjk_char(c))); let query_en = "Rust programming language"; assert!(!query_en.chars().any(|c| is_cjk_char(c))); } #[test] fn test_strip_html_tags() { assert_eq!(strip_html_tags("Hello"), "Hello"); assert_eq!(strip_html_tags("
    Link"), "Link"); assert_eq!(strip_html_tags("plain text"), "plain text"); assert_eq!(strip_html_tags("&<>"), "&<>"); // strip_html_tags only removes tags, not script content assert_eq!(strip_html_tags("Safe"), "alert()Safe"); } #[test] fn test_extract_between_basic() { let text = "prefix
    content
    suffix"; assert_eq!(extract_between(text, "
    ", "
    "), Some("content")); } #[test] fn test_extract_between_not_found() { let text = "no delimiters here"; assert_eq!(extract_between(text, "
    ", "
    "), None); } #[test] fn test_extract_href() { let text = r#"Title"#; assert_eq!(extract_href(text), Some("https://example.com/page".to_string())); } #[test] fn test_extract_href_protocol_relative() { let text = r#"Title"#; assert_eq!(extract_href(text), Some("https://example.com/page".to_string())); } #[test] fn test_extract_href_uddg() { let text = r#"href="//duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.com%2Fpage&rut=abc""#; assert_eq!(extract_href_uddg(text), Some("https://example.com/page".to_string())); } #[test] fn test_extract_href_uddg_fallback() { let text = r#"Title"#; assert_eq!(extract_href_uddg(text), Some("https://example.com/direct".to_string())); } // --- HTML Parser Tests --- #[test] fn test_parse_ddg_html() { let hand = create_test_hand(); let html = r#" "#; let results = hand.parse_ddg_html(html, 10); assert_eq!(results.len(), 2); assert_eq!(results[0].title, "Rust Programming Language"); assert_eq!(results[0].url, "https://rust-lang.org"); assert_eq!(results[0].source, "DuckDuckGo"); assert_eq!(results[1].title, "The Rust Book"); } #[test] fn test_parse_ddg_html_max_results() { let hand = create_test_hand(); let mut html = String::new(); for i in 0..20 { html.push_str(&format!( r#""#, i, i, i )); } let results = hand.parse_ddg_html(&html, 5); assert_eq!(results.len(), 5); } #[test] fn test_parse_ddg_html_empty() { let hand = create_test_hand(); let html = "No results here"; let results = hand.parse_ddg_html(html, 10); assert!(results.is_empty()); } #[test] fn test_parse_bing_html() { let hand = create_test_hand(); let html = r#"
  • Example Result 1

    This is the first result snippet.

  • Example Result 2

    This is the second result snippet.

  • "#; let results = hand.parse_bing_html(html, 10); assert_eq!(results.len(), 2); assert_eq!(results[0].title, "Example Result 1"); assert_eq!(results[0].url, "https://example.com/result1"); assert_eq!(results[0].source, "Bing"); } #[test] fn test_parse_bing_html_skips_internal_urls() { let hand = create_test_hand(); let html = r#"
  • More Results

  • Real Result

  • "#; let results = hand.parse_bing_html(html, 10); assert_eq!(results.len(), 1); assert_eq!(results[0].url, "https://example.com/real"); } #[test] fn test_parse_bing_html_empty() { let hand = create_test_hand(); let html = "Nothing here"; let results = hand.parse_bing_html(html, 10); assert!(results.is_empty()); } #[test] fn test_parse_baidu_html() { let hand = create_test_hand(); let html = r#"

    中国医疗政策 2024

    这是关于医疗政策的摘要信息。
    "#; let results = hand.parse_baidu_html(html, 10); assert!(results.len() >= 1, "Should find at least 1 result, got {}", results.len()); assert_eq!(results[0].source, "Baidu"); } // --- SSRF Validation Tests --- #[test] fn test_ssrf_blocks_localhost() { assert!(validate_fetch_url("http://localhost:8080/admin").is_err()); assert!(validate_fetch_url("http://127.0.0.1:5432/db").is_err()); } #[test] fn test_ssrf_blocks_private_ip() { assert!(validate_fetch_url("http://10.0.0.1/secret").is_err()); assert!(validate_fetch_url("http://192.168.1.1/router").is_err()); assert!(validate_fetch_url("http://172.16.0.1/internal").is_err()); } #[test] fn test_ssrf_blocks_cloud_metadata() { assert!(validate_fetch_url("http://169.254.169.254/metadata").is_err()); } #[test] fn test_ssrf_blocks_non_http_scheme() { assert!(validate_fetch_url("file:///etc/passwd").is_err()); assert!(validate_fetch_url("ftp://example.com/file").is_err()); } #[test] fn test_ssrf_allows_public_url() { assert!(validate_fetch_url("https://www.rust-lang.org/learn").is_ok()); assert!(validate_fetch_url("https://example.com/page?q=test").is_ok()); } // --- Percent Decode Tests --- #[test] fn test_percent_decode_basic() { assert_eq!(percent_decode("hello%20world"), "hello world"); assert_eq!(percent_decode("%E4%B8%AD%E6%96%87"), "中文"); } #[test] fn test_percent_decode_full_url() { assert_eq!( percent_decode("https%3A%2F%2Fexample.com%2Fpage%3Fq%3Dtest"), "https://example.com/page?q=test" ); } #[test] fn test_percent_decode_no_encoding() { assert_eq!(percent_decode("plain-text_123"), "plain-text_123"); } // --- Input Validation Tests --- #[test] fn test_research_query_validate_empty() { let query = ResearchQuery { query: " ".to_string(), engine: SearchEngine::Auto, depth: ResearchDepth::Standard, max_results: 10, include_related: false, time_limit_secs: 60, }; assert!(query.validate().is_err()); } #[test] fn test_research_query_validate_too_long() { let query = ResearchQuery { query: "x".repeat(501), engine: 
SearchEngine::Auto, depth: ResearchDepth::Standard, max_results: 10, include_related: false, time_limit_secs: 60, }; assert!(query.validate().is_err()); } #[test] fn test_research_query_validate_max_results_overflow() { let query = ResearchQuery { query: "test".to_string(), engine: SearchEngine::Auto, depth: ResearchDepth::Standard, max_results: 999, include_related: false, time_limit_secs: 60, }; assert!(query.validate().is_err()); } #[test] fn test_research_query_validate_ok() { let query = ResearchQuery { query: "Rust programming".to_string(), engine: SearchEngine::Auto, depth: ResearchDepth::Standard, max_results: 10, include_related: false, time_limit_secs: 60, }; assert!(query.validate().is_ok()); } // --- Quality Filter Tests --- #[test] fn test_quality_rejects_javascript_title() { assert!(!is_quality_result("function(x) { return x; }", "ok", "https://example.com")); } #[test] fn test_quality_rejects_short_title() { assert!(!is_quality_result("A", "snippet", "https://example.com")); } #[test] fn test_quality_rejects_css_title() { assert!(!is_quality_result(".stylesheet{color:red}", "ok", "https://example.com")); } #[test] fn test_quality_rejects_javascript_url() { assert!(!is_quality_result("Title", "snippet", "javascript:alert(1)")); } #[test] fn test_quality_accepts_normal_result() { assert!(is_quality_result("2024年中国医疗政策解读", "相关政策文件摘要", "https://www.gov.cn/policy")); } #[test] fn test_quality_accepts_english_result() { assert!(is_quality_result("Rust Programming Language", "A systems programming language", "https://www.rust-lang.org")); } #[test] fn test_quality_rejects_long_title() { let long_title: String = "x".repeat(301); assert!(!is_quality_result(&long_title, "ok", "https://example.com")); } #[test] fn test_strip_html_tags_collapses_whitespace() { assert_eq!(strip_html_tags("Hello World"), "Hello World"); assert_eq!(strip_html_tags("a\n\t b"), "a b"); } }