zclaw_openfang/crates/zclaw-hands/src/hands/researcher.rs

//! Researcher Hand - Deep research and analysis capabilities
//!
//! This hand provides web search, content fetching, and research synthesis.

use async_trait::async_trait;
use serde::{Deserialize, Serialize};
use serde_json::{json, Value};
use std::collections::HashMap;
use std::net::{IpAddr, Ipv4Addr, Ipv6Addr};
use std::sync::Arc;
use tokio::sync::RwLock;
use url::Url;
use zclaw_types::Result;

use crate::{Hand, HandConfig, HandContext, HandResult};

/// Search engine options.
///
/// Variant names are (de)serialized in lowercase: `searxng`, `google`,
/// `bing`, `duckduckgo` and `auto`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum SearchEngine {
    /// A SearXNG meta-search instance.
    SearXNG,
    /// Google.
    Google,
    /// Bing.
    Bing,
    /// DuckDuckGo.
    DuckDuckGo,
    /// No explicit preference; this is the default variant.
    Auto,
}

impl Default for SearchEngine {
    fn default() -> Self {
        Self::Auto
    }
}

/// Search configuration loaded from config/config.toml
#[derive(Debug, Clone)]
struct SearchConfig {
    /// Engine used when a query asks for `SearchEngine::Auto`.
    default_engine: SearchEngine,
    /// Base URL of the SearXNG instance (a trailing `/` is tolerated by the caller).
    searxng_url: String,
    /// Per-request timeout, in seconds, applied to SearXNG requests.
    timeout_secs: u64,
    /// Optional Jina Reader API key; when present it is sent as a bearer token.
    jina_api_key: Option<String>,
}

impl Default for SearchConfig {
    fn default() -> Self {
        Self {
            default_engine: SearchEngine::Auto,
            searxng_url: "http://localhost:8888".to_string(),
            timeout_secs: 15,
            jina_api_key: None,
        }
    }
}

impl SearchConfig {
    fn load() -> Self {
        let path = "config/config.toml";
        let content = match std::fs::read_to_string(path) {
            Ok(c) => c,
            Err(_) => return Self::default(),
        };

        #[derive(Deserialize)]
        struct ToolsWebSearch {
            default_engine: Option<String>,
            #[allow(dead_code)]
            max_results: Option<usize>,
            searxng_url: Option<String>,
            searxng_timeout: Option<u64>,
        }

        #[derive(Deserialize)]
        struct ToolsWeb {
            search: Option<ToolsWebSearch>,
        }

        #[derive(Deserialize)]
        struct Tools {
            web: Option<ToolsWeb>,
        }

        #[derive(Deserialize)]
        struct Config {
            tools: Option<Tools>,
        }

        let config: Config = match toml::from_str(&content) {
            Ok(c) => c,
            Err(_) => return Self::default(),
        };

        let search = config.tools
            .and_then(|t| t.web)
            .and_then(|w| w.search);

        match search {
            Some(s) => {
                let engine = s.default_engine
                    .as_deref()
                    .and_then(|e| serde_json::from_str(&format!("\"{}\"", e)).ok())
                    .unwrap_or_default();
                Self {
                    default_engine: engine,
                    searxng_url: s.searxng_url
                        .unwrap_or_else(|| "http://localhost:8888".to_string()),
                    timeout_secs: s.searxng_timeout.unwrap_or(15),
                    jina_api_key: std::env::var("ZCLAW_JINA_API_KEY").ok(),
                }
            }
            None => Self::default(),
        }
    }
}

/// Research depth level.
///
/// Variant names are (de)serialized in lowercase (`quick`, `standard`, `deep`).
///
/// NOTE(review): the per-variant result counts below are the author's stated
/// intent; confirm where they are actually enforced.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum ResearchDepth {
    /// Fast search, top 3 results.
    Quick,
    /// Normal search, top 10 results.
    Standard,
    /// Comprehensive search, multiple sources.
    Deep,
}

impl Default for ResearchDepth {
    fn default() -> Self {
        Self::Standard
    }
}

/// Research query configuration.
///
/// Field names are (de)serialized in camelCase (e.g. `maxResults`,
/// `includeRelated`, `timeLimitSecs`). Every field except `query` is optional
/// on input and falls back to its default.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct ResearchQuery {
    /// Search query text (required).
    pub query: String,
    /// Search engine to use; defaults to `SearchEngine::Auto`.
    #[serde(default)]
    pub engine: SearchEngine,
    /// Research depth; defaults to `ResearchDepth::Standard`.
    #[serde(default)]
    pub depth: ResearchDepth,
    /// Maximum results to return; defaults to 10.
    #[serde(default = "default_max_results")]
    pub max_results: usize,
    /// Include related topics; defaults to `false`.
    #[serde(default)]
    pub include_related: bool,
    /// Time limit in seconds; defaults to 60.
    #[serde(default = "default_time_limit")]
    pub time_limit_secs: u64,
}

/// Serde default for `ResearchQuery::max_results`.
fn default_max_results() -> usize {
    10
}
/// Serde default for `ResearchQuery::time_limit_secs`, in seconds.
fn default_time_limit() -> u64 {
    60
}

/// Upper bound on the length of a search query accepted by `ResearchQuery::validate`.
const MAX_QUERY_LENGTH: usize = 500;
/// Largest `max_results` value a query may request.
const MAX_RESULTS_CAP: usize = 50;
/// Upper bound on URL length.
/// NOTE(review): presumably enforced by the fetch URL validation — confirm.
const MAX_URL_LENGTH: usize = 2048;
/// Number of entries at which the fetch cache starts evicting before inserting.
const CACHE_MAX_ENTRIES: usize = 200;

impl ResearchQuery {
    /// Check that the query is usable before any network request is made.
    ///
    /// # Errors
    ///
    /// Returns a user-facing message when:
    /// - the query is empty or whitespace only,
    /// - the query is longer than `MAX_QUERY_LENGTH` characters,
    /// - `max_results` exceeds `MAX_RESULTS_CAP`.
    fn validate(&self) -> std::result::Result<(), String> {
        if self.query.trim().is_empty() {
            return Err("搜索查询不能为空".to_string());
        }
        // Count characters, not UTF-8 bytes: the limit is documented to the
        // user in characters, and a byte count would reject CJK queries at
        // roughly a third of the advertised length.
        if self.query.chars().count() > MAX_QUERY_LENGTH {
            return Err(format!("查询过长（上限 {} 字符）", MAX_QUERY_LENGTH));
        }
        if self.max_results > MAX_RESULTS_CAP {
            return Err(format!("max_results 上限为 {}", MAX_RESULTS_CAP));
        }
        Ok(())
    }
}

/// A single search hit, or a fetched page.
///
/// Field names are (de)serialized in camelCase (e.g. `fetchedAt`).
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct SearchResult {
    /// Title of the result
    pub title: String,
    /// URL of the result
    pub url: String,
    /// Snippet/summary
    pub snippet: String,
    /// Source name (the engine that produced the hit, or the URL for a fetched page)
    pub source: String,
    /// Relevance score (0-100); defaults to 0 when absent on input
    #[serde(default)]
    pub relevance: u8,
    /// Fetched content (if available)
    #[serde(default)]
    pub content: Option<String>,
    /// Timestamp at which the result was produced (RFC 3339, UTC)
    #[serde(default)]
    pub fetched_at: Option<String>,
}

/// Research report.
///
/// Field names are (de)serialized in camelCase (e.g. `keyFindings`,
/// `relatedTopics`, `researchedAt`, `durationMs`).
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct ResearchReport {
    /// Original query
    pub query: String,
    /// Search results
    pub results: Vec<SearchResult>,
    /// Synthesized summary (absent when none was produced)
    #[serde(default)]
    pub summary: Option<String>,
    /// Key findings (empty when absent on input)
    #[serde(default)]
    pub key_findings: Vec<String>,
    /// Related topics discovered (empty when absent on input)
    #[serde(default)]
    pub related_topics: Vec<String>,
    /// Research timestamp
    /// NOTE(review): presumably RFC 3339 like `SearchResult::fetched_at` — confirm.
    pub researched_at: String,
    /// Total time spent (ms)
    pub duration_ms: u64,
}

/// Researcher action types.
///
/// Internally tagged: the JSON object carries an `"action"` field whose value
/// (`search`, `fetch`, `summarize` or `report`) selects the variant, and the
/// variant's fields sit next to it in the same object.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "action")]
pub enum ResearcherAction {
    /// Run a web search for `query`.
    #[serde(rename = "search")]
    Search { query: ResearchQuery },
    /// Fetch the content of a single `url`.
    #[serde(rename = "fetch")]
    Fetch { url: String },
    /// Summarize the pages at `urls`.
    #[serde(rename = "summarize")]
    Summarize { urls: Vec<String> },
    /// Produce a research report for `query`.
    #[serde(rename = "report")]
    Report { query: ResearchQuery },
}

/// Researcher Hand implementation
pub struct ResearcherHand {
    /// Static hand metadata (id, name, input schema, tags, ...).
    config: HandConfig,
    /// Search backend settings, loaded once at construction time.
    search_config: SearchConfig,
    /// HTTP client shared by the search and fetch requests.
    client: reqwest::Client,
    /// Fetched pages keyed by URL.
    cache: Arc<RwLock<HashMap<String, SearchResult>>>,
}

impl ResearcherHand {
    /// Create a new researcher hand
    pub fn new() -> Self {
        Self {
            config: HandConfig {
                id: "researcher".to_string(),
                name: "研究员".to_string(),
                description: "深度研究和分析能力，支持网络搜索和内容获取".to_string(),
                needs_approval: false,
                dependencies: vec!["network".to_string()],
                input_schema: Some(serde_json::json!({
                    "type": "object",
                    "properties": {
                        "action": {
                            "type": "string",
                            "enum": ["search", "fetch", "report", "summarize"],
                            "description": "Action to perform: search (web search), fetch (get URL content), report (deep research), summarize (multiple URLs)"
                        },
                        "query": {
                            "type": "string",
                            "description": "Search query string for search/report actions"
                        },
                        "url": {
                            "type": "string",
                            "description": "URL to fetch content from"
                        },
                        "urls": {
                            "type": "array",
                            "items": { "type": "string" },
                            "description": "List of URLs to summarize"
                        },
                        "engine": {
                            "type": "string",
                            "enum": ["auto", "searxng", "google", "bing", "duckduckgo"],
                            "description": "Search engine preference"
                        }
                    },
                    "description": "Provide 'query' for search/report, or 'url' for fetch, or 'urls' for summarize"
                })),
                tags: vec!["research".to_string(), "web".to_string(), "search".to_string()],
                enabled: true,
                max_concurrent: 0,
                timeout_secs: 0,
            },
            search_config: SearchConfig::load(),
            client: reqwest::Client::builder()
                .timeout(std::time::Duration::from_secs(30))
                .user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36")
                .redirect(reqwest::redirect::Policy::limited(3))
                .build()
                .unwrap_or_else(|_| reqwest::Client::new()),
            cache: Arc::new(RwLock::new(HashMap::new())),
        }
    }

    /// Infer action from input fields when LLM omits the `action` field.
    /// Many LLMs (especially non-OpenAI models like glm) call tools without
    /// including the enum tag, e.g. sending `{"query": "search terms"}` instead
    /// of `{"action": "search", "query": "search terms"}`.
    fn infer_action(input: &Value) -> Result<ResearcherAction> {
        // Debug: log all keys in the input
        let keys: Vec<&str> = input.as_object()
            .map(|obj| obj.keys().map(|k| k.as_str()).collect())
            .unwrap_or_default();
        tracing::debug!(target: "researcher", ?keys, %input, "infer_action examining input");

        // Check for action field with wrong value
        if let Some(action) = input.get("action").and_then(|v| v.as_str()) {
            if action == "search" || action == "report" {
                if let Some(query_val) = input.get("query") {
                    let query = Self::parse_query(query_val);
                    if !query.query.trim().is_empty() {
                        return Ok(if action == "report" {
                            ResearcherAction::Report { query }
                        } else {
                            ResearcherAction::Search { query }
                        });
                    }
                }
            }
            if action == "fetch" {
                if let Some(url) = input.get("url").and_then(|v| v.as_str()) {
                    return Ok(ResearcherAction::Fetch { url: url.to_string() });
                }
            }
        }

        // Has "url" (singular) → fetch
        if let Some(url) = input.get("url").and_then(|v| v.as_str()) {
            if !url.is_empty() && url.starts_with("http") {
                return Ok(ResearcherAction::Fetch { url: url.to_string() });
            }
        }
        // Has "urls" (plural) → summarize
        if let Some(urls) = input.get("urls").and_then(|v| v.as_array()) {
            let url_list: Vec<String> = urls.iter()
                .filter_map(|v| v.as_str().map(|s| s.to_string()))
                .collect();
            if !url_list.is_empty() {
                return Ok(ResearcherAction::Summarize { urls: url_list });
            }
        }
        // Has "query" → search
        if let Some(query_val) = input.get("query") {
            let query = Self::parse_query(query_val);
            if !query.query.trim().is_empty() {
                return Ok(ResearcherAction::Search { query });
            }
        }
        // Has "search" or "search_query" → search
        for key in &["search", "search_query", "keyword", "keywords", "q", "text"] {
            if let Some(val) = input.get(key) {
                let query = Self::parse_query(val);
                if !query.query.trim().is_empty() {
                    return Ok(ResearcherAction::Search { query });
                }
            }
        }
        // Check for injected fallback query from loop_runner (when LLM sends empty args)
        if let Some(fallback) = input.get("_fallback_query").and_then(|v| v.as_str()) {
            if !fallback.trim().is_empty() {
                tracing::debug!(target: "researcher", query = %fallback, "Using fallback user message as search query");
                return Ok(ResearcherAction::Search { query: ResearchQuery {
                    query: fallback.to_string(),
                    engine: SearchEngine::Auto,
                    depth: ResearchDepth::Standard,
                    max_results: 10,
                    include_related: false,
                    time_limit_secs: 60,
                }});
            }
        }

        // Last resort: if any string field looks like a search query
        if let Some(obj) = input.as_object() {
            for (key, val) in obj {
                if let Some(s) = val.as_str() {
                    if s.len() > 2 && !s.starts_with("http") && key != "action" && key != "engine" {
                        tracing::debug!(target: "researcher", key = %key, value = %s, "Using fallback field as query");
                        return Ok(ResearcherAction::Search { query: ResearchQuery {
                            query: s.to_string(),
                            engine: SearchEngine::Auto,
                            depth: ResearchDepth::Standard,
                            max_results: 10,
                            include_related: false,
                            time_limit_secs: 60,
                        }});
                    }
                }
            }
        }
        Err(zclaw_types::ZclawError::HandError(
            "无法识别搜索意图：请提供 query（搜索）或 url（获取网页）参数".to_string()
        ))
    }

    fn parse_query(query_val: &Value) -> ResearchQuery {
        if query_val.is_string() {
            ResearchQuery {
                query: query_val.as_str().unwrap_or("").to_string(),
                engine: SearchEngine::Auto,
                depth: ResearchDepth::Standard,
                max_results: 10,
                include_related: false,
                time_limit_secs: 60,
            }
        } else {
            serde_json::from_value(query_val.clone()).unwrap_or_else(|_| ResearchQuery {
                query: query_val.get("query")
                    .or_else(|| query_val.get("search"))
                    .or_else(|| query_val.get("q"))
                    .or_else(|| query_val.get("keyword"))
                    .and_then(|v| v.as_str())
                    .unwrap_or("")
                    .to_string(),
                engine: SearchEngine::Auto,
                depth: ResearchDepth::Standard,
                max_results: 10,
                include_related: false,
                time_limit_secs: 60,
            })
        }
    }

    /// Validate the query, pick a backend and run the search.
    ///
    /// `SearchEngine::Auto` on the query defers to the configured default
    /// engine. Routing:
    /// - SearXNG: used first; on error or an empty result the native
    ///   multi-engine search takes over.
    /// - Auto: native multi-engine search.
    /// - DuckDuckGo / Google: downgraded to Baidu (with a warning).
    /// - Bing: Bing.
    ///
    /// # Errors
    ///
    /// Returns `ZclawError::HandError` when validation fails, and propagates
    /// the error of the backend that was finally used.
    async fn execute_search(&self, query: &ResearchQuery) -> Result<Vec<SearchResult>> {
        if let Err(reason) = query.validate() {
            return Err(zclaw_types::ZclawError::HandError(reason));
        }

        let limit = query.max_results.min(MAX_RESULTS_CAP);
        let started = std::time::Instant::now();
        let text = query.query.as_str();

        let engine = if matches!(query.engine, SearchEngine::Auto) {
            &self.search_config.default_engine
        } else {
            &query.engine
        };

        let results = match engine {
            SearchEngine::SearXNG => {
                let primary = self.search_searxng(text, limit).await;
                match primary {
                    Ok(found) if !found.is_empty() => found,
                    _ => self.search_native(text, limit).await?,
                }
            }
            SearchEngine::Auto => self.search_native(text, limit).await?,
            SearchEngine::DuckDuckGo => {
                // DDG is not reachable from mainland China; downgrade to Baidu.
                tracing::warn!(target: "researcher", "DuckDuckGo在国内不可用，降级到百度");
                self.search_baidu(text, limit).await?
            }
            SearchEngine::Google => {
                // Same downgrade for Google.
                tracing::warn!(target: "researcher", "Google在国内不可用，降级到百度");
                self.search_baidu(text, limit).await?
            }
            SearchEngine::Bing => self.search_bing(text, limit).await?,
        };

        let duration = started.elapsed().as_millis() as u64;
        tracing::info!(
            target: "researcher",
            query = %query.query,
            engine = ?engine,
            duration_ms = duration,
            results_count = results.len(),
            "Search completed"
        );

        Ok(results)
    }

    /// Rust-native multi-engine search — optimized for China mainland users
    /// Priority: Baidu + Bing CN (both always work in China)
    /// DuckDuckGo as optional fallback (may be blocked by GFW)
    async fn search_native(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
        let mut all_results = Vec::new();

        // Always use Baidu + Bing CN in parallel (both work in China)
        let baidu_fut = self.search_baidu(query, max_results);
        let bing_fut = self.search_bing(query, max_results);

        let (baidu_res, bing_res) = tokio::join!(
            async { baidu_fut.await },
            async { bing_fut.await },
        );

        if let Ok(r) = baidu_res {
            all_results.extend(r);
        }
        if let Ok(r) = bing_res {
            all_results.extend(r);
        }

        // If both primary engines returned nothing, try DDG as last resort
        if all_results.is_empty() {
            tracing::info!(target: "researcher", "Primary engines empty, trying DuckDuckGo as fallback");
            if let Ok(r) = self.search_duckduckgo_html(query, max_results).await {
                all_results.extend(r);
            }
        }

        // Deduplicate by URL
        let mut seen_urls = std::collections::HashSet::new();
        all_results.retain(|r| seen_urls.insert(r.url.to_lowercase()));

        // Sort by relevance descending, take top N
        all_results.sort_by(|a, b| b.relevance.cmp(&a.relevance));
        all_results.truncate(max_results);

        if all_results.is_empty() {
            tracing::warn!(target: "researcher", "All native engines returned empty for query: {}", query);
        }

        Ok(all_results)
    }

    /// Search using SearXNG meta-search engine (aggregates 70+ engines)
    async fn search_searxng(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
        let url = format!(
            "{}/search?q={}&format=json&categories=general&language=auto&pageno=1",
            self.search_config.searxng_url.trim_end_matches('/'),
            url_encode(query)
        );

        let response = self.client
            .get(&url)
            .timeout(std::time::Duration::from_secs(self.search_config.timeout_secs))
            .send()
            .await
            .map_err(|e| zclaw_types::ZclawError::HandError(
                format!("SearXNG request failed: {}", e)
            ))?;

        let status = response.status();
        if !status.is_success() {
            return Err(zclaw_types::ZclawError::HandError(
                format!("SearXNG returned HTTP {}", status)
            ));
        }

        let json: Value = response.json().await
            .map_err(|e| zclaw_types::ZclawError::HandError(
                format!("Failed to parse SearXNG response: {}", e)
            ))?;

        let mut results = Vec::new();

        if let Some(items) = json.get("results").and_then(|v| v.as_array()) {
            for item in items.iter().take(max_results) {
                let title = item.get("title")
                    .and_then(|v| v.as_str())
                    .unwrap_or("")
                    .to_string();
                let url = item.get("url")
                    .and_then(|v| v.as_str())
                    .unwrap_or("")
                    .to_string();
                let snippet = item.get("content")
                    .and_then(|v| v.as_str())
                    .unwrap_or("")
                    .to_string();
                let engines = item.get("engines")
                    .and_then(|v| v.as_array())
                    .map(|arr| {
                        arr.iter()
                            .filter_map(|e| e.as_str())
                            .collect::<Vec<_>>()
                            .join(",")
                    })
                    .unwrap_or_default();
                let score = item.get("score")
                    .and_then(|v| v.as_f64())
                    .unwrap_or(0.0);

                // Normalize score to 0-100 range
                let relevance = if score > 0.0 {
                    (score.min(10.0) * 10.0) as u8
                } else {
                    50
                };

                if !title.is_empty() && !url.is_empty() {
                    results.push(SearchResult {
                        title,
                        url,
                        snippet,
                        source: if engines.is_empty() {
                            "SearXNG".to_string()
                        } else {
                            format!("SearXNG({})", engines)
                        },
                        relevance,
                        content: None,
                        fetched_at: Some(chrono::Utc::now().to_rfc3339()),
                    });
                }
            }
        }

        Ok(results)
    }

    /// Search using DuckDuckGo HTML (POST method, matching ddgs library behavior).
    ///
    /// Posts the query as a form to `html.duckduckgo.com` and hands the
    /// returned page to `parse_ddg_html`.
    ///
    /// # Errors
    ///
    /// Returns `ZclawError::HandError` when the request fails, the server
    /// answers with a non-success status, or the body cannot be read.
    async fn search_duckduckgo_html(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
        // The same region is sent for every query. The previous code scanned
        // the query for CJK characters but selected this value on both
        // branches, so the scan was dead work and has been removed.
        let region = "wt-wt";
        let body = format!("q={}&b=&l={}", url_encode(query), region);

        let response = self.client
            .post("https://html.duckduckgo.com/html/")
            .header("Content-Type", "application/x-www-form-urlencoded")
            .header("Accept", "text/html,application/xhtml+xml")
            .header("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8")
            .body(body)
            .send()
            .await
            .map_err(|e| zclaw_types::ZclawError::HandError(
                format!("DuckDuckGo HTML search failed: {}", e)
            ))?;

        let status = response.status();
        if !status.is_success() {
            return Err(zclaw_types::ZclawError::HandError(
                format!("DuckDuckGo returned HTTP {}", status)
            ));
        }

        let html = response.text().await
            .map_err(|e| zclaw_types::ZclawError::HandError(
                format!("Failed to read DuckDuckGo response: {}", e)
            ))?;

        Ok(self.parse_ddg_html(&html, max_results))
    }

    /// Parse DuckDuckGo HTML search results page
    fn parse_ddg_html(&self, html: &str, max_results: usize) -> Vec<SearchResult> {
        let mut results = Vec::new();

        for block in html.split("class=\"result__body\"") {
            if results.len() >= max_results {
                break;
            }

            // Find the result title link: <a class="result__a" href="...">Title</a>
            let title_link = match extract_between(block, "result__a", "</a>") {
                Some(s) => s,
                None => continue,
            };
            // title_link is like:  href="//duckduckgo.com/l/?uddg=...">Title Text
            let title = title_link.rsplit('>').next()
                .map(|s| strip_html_tags(s).trim().to_string())
                .unwrap_or_default();

            let url = extract_href_uddg(block).unwrap_or_default();

            let snippet = extract_between(block, "result__snippet", "</a>")
                .map(|s| {
                    s.rsplit('>').next()
                        .map(|t| strip_html_tags(t).trim().to_string())
                        .unwrap_or_default()
                })
                .unwrap_or_default();

            if title.is_empty() || url.is_empty() {
                continue;
            }

            if !is_quality_result(&title, &snippet, &url) {
                continue;
            }

            results.push(SearchResult {
                title,
                url,
                snippet,
                source: "DuckDuckGo".to_string(),
                relevance: 70,
                content: None,
                fetched_at: Some(chrono::Utc::now().to_rfc3339()),
            });
        }

        results
    }

    /// Query Bing and parse the HTML result page.
    ///
    /// Queries containing CJK characters go to `cn.bing.com` with a Simplified
    /// Chinese UI language; everything else goes to `www.bing.com`.
    ///
    /// # Errors
    ///
    /// Returns `ZclawError::HandError` when the request fails, the server
    /// answers with a non-success status, or the body cannot be read.
    async fn search_bing(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
        let wants_chinese_site = query.chars().any(|ch| is_cjk_char(ch));
        let encoded = url_encode(query);

        let request_url = match wants_chinese_site {
            true => format!(
                "https://cn.bing.com/search?q={}&count={}&setlang=zh-Hans",
                encoded,
                max_results
            ),
            false => format!(
                "https://www.bing.com/search?q={}&count={}",
                encoded,
                max_results
            ),
        };

        let sent = self.client
            .get(&request_url)
            .header("Accept", "text/html,application/xhtml+xml")
            .header("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8")
            .send()
            .await;
        let response = match sent {
            Ok(resp) => resp,
            Err(e) => {
                return Err(zclaw_types::ZclawError::HandError(
                    format!("Bing search failed: {}", e)
                ));
            }
        };

        if !response.status().is_success() {
            return Err(zclaw_types::ZclawError::HandError(
                format!("Bing returned HTTP {}", response.status())
            ));
        }

        match response.text().await {
            Ok(page) => Ok(self.parse_bing_html(&page, max_results)),
            Err(e) => Err(zclaw_types::ZclawError::HandError(
                format!("Failed to read Bing response: {}", e)
            )),
        }
    }

    /// Extract search hits from a Bing result page.
    ///
    /// The page is cut at every `class="b_algo"` marker (Bing wraps each hit
    /// in `<li class="b_algo">`). For each piece the title is the text up to
    /// the first `</a>`, the URL is the first `href`, and the snippet is the
    /// first `<p>` (or, failing that, the `b_caption` block). Pieces without a
    /// title or URL, Bing-internal links, and pieces rejected by
    /// `is_quality_result` are dropped. At most `max_results` hits are
    /// returned, each with a fixed relevance of 75.
    fn parse_bing_html(&self, html: &str, max_results: usize) -> Vec<SearchResult> {
        html.split("class=\"b_algo\"")
            .filter_map(|block| {
                let title = extract_between(block, ">", "</a>")
                    .map(|s| strip_html_tags(s).trim().to_string())
                    .unwrap_or_default();
                let url = extract_href(block).unwrap_or_default();
                let snippet = extract_between(block, "<p>", "</p>")
                    .or_else(|| extract_between(block, "b_caption", "</div>"))
                    .map(|s| strip_html_tags(s).trim().to_string())
                    .unwrap_or_default();

                if title.is_empty() || url.is_empty() {
                    return None;
                }

                // Links back into Bing / Microsoft are navigation, not results.
                let internal = url.contains("bing.com/search") || url.contains("go.microsoft.com");
                if internal || !is_quality_result(&title, &snippet, &url) {
                    return None;
                }

                Some(SearchResult {
                    title,
                    url,
                    snippet,
                    source: "Bing".to_string(),
                    relevance: 75,
                    content: None,
                    fetched_at: Some(chrono::Utc::now().to_rfc3339()),
                })
            })
            .take(max_results)
            .collect()
    }

    /// Query Baidu (essential for Chinese content) and parse the HTML result page.
    ///
    /// # Errors
    ///
    /// Returns `ZclawError::HandError` when the request fails, the server
    /// answers with a non-success status, or the body cannot be read.
    async fn search_baidu(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
        let request_url = format!(
            "https://www.baidu.com/s?wd={}&rn={}",
            url_encode(query),
            max_results
        );

        let sent = self.client
            .get(&request_url)
            .header("Accept", "text/html,application/xhtml+xml")
            .header("Accept-Language", "zh-CN,zh;q=0.9")
            .send()
            .await;
        let response = match sent {
            Ok(resp) => resp,
            Err(e) => {
                return Err(zclaw_types::ZclawError::HandError(
                    format!("Baidu search failed: {}", e)
                ));
            }
        };

        if !response.status().is_success() {
            return Err(zclaw_types::ZclawError::HandError(
                format!("Baidu returned HTTP {}", response.status())
            ));
        }

        match response.text().await {
            Ok(page) => Ok(self.parse_baidu_html(&page, max_results)),
            Err(e) => Err(zclaw_types::ZclawError::HandError(
                format!("Failed to read Baidu response: {}", e)
            )),
        }
    }

    /// Extract search hits from a Baidu result page.
    ///
    /// Baidu marks results with several class combinations ("result
    /// c-container", "c-container new-pmd", "result-op c-container"), so the
    /// page is cut at every `c-container` occurrence. The text before the
    /// first occurrence and pieces without an absolute `href="http…"` link are
    /// ignored. For each remaining piece the title is the text up to the first
    /// `</a>`, the URL is the first `href`, and the snippet comes from the
    /// `c-abstract` (or `content-right_`) block. Pieces without a title or
    /// URL, or rejected by `is_quality_result`, are dropped. At most
    /// `max_results` hits are returned, each with a fixed relevance of 80.
    fn parse_baidu_html(&self, html: &str, max_results: usize) -> Vec<SearchResult> {
        html.split("c-container")
            .skip(1)
            .filter(|block| block.contains("href=\"http"))
            .filter_map(|block| {
                let title = extract_between(block, ">", "</a>")
                    .map(|s| strip_html_tags(s).trim().to_string())
                    .unwrap_or_default();
                let url = extract_href(block).unwrap_or_default();
                let snippet = extract_between(block, "c-abstract", "</div>")
                    .or_else(|| extract_between(block, "content-right_", "</div>"))
                    .map(|s| strip_html_tags(s).trim().to_string())
                    .unwrap_or_default();

                let incomplete = title.is_empty() || url.is_empty();
                if incomplete || !is_quality_result(&title, &snippet, &url) {
                    return None;
                }

                Some(SearchResult {
                    title,
                    url,
                    snippet,
                    source: "Baidu".to_string(),
                    relevance: 80,
                    content: None,
                    fetched_at: Some(chrono::Utc::now().to_rfc3339()),
                })
            })
            .take(max_results)
            .collect()
    }

    /// Fetch content from a URL (with SSRF protection).
    ///
    /// Steps:
    /// 1. The URL is checked by `validate_fetch_url`; a rejected URL aborts
    ///    the call.
    /// 2. A cached result that carries content is returned as-is.
    /// 3. The page is requested through the Jina Reader API (clean Markdown);
    ///    if that fails, a warning is logged and the page is fetched directly.
    /// 4. The result is cached under the URL. Before a *new* key is inserted
    ///    into a cache holding `CACHE_MAX_ENTRIES` entries, one arbitrary
    ///    entry is evicted.
    ///
    /// The returned result uses the URL as title and source, the first 500
    /// characters of the content as snippet, and a relevance of 100.
    ///
    /// # Errors
    ///
    /// Propagates the error of `validate_fetch_url`, or of the direct fetch
    /// when both fetch strategies fail.
    async fn execute_fetch(&self, url: &str) -> Result<SearchResult> {
        let start = std::time::Instant::now();

        // SSRF validation
        validate_fetch_url(url)?;

        // Check cache first
        {
            let cache = self.cache.read().await;
            if let Some(cached) = cache.get(url) {
                if cached.content.is_some() {
                    return Ok(cached.clone());
                }
            }
        }

        // Try Jina Reader API first (returns clean Markdown)
        let content = match self.fetch_via_jina(url).await {
            Ok(text) => text,
            Err(e) => {
                tracing::warn!(target: "researcher", error = %e, "Jina Reader failed, falling back to direct fetch");
                self.fetch_direct(url).await?
            }
        };

        let result = SearchResult {
            title: url.to_string(),
            url: url.to_string(),
            snippet: content.chars().take(500).collect(),
            source: url.to_string(),
            relevance: 100,
            content: Some(content),
            fetched_at: Some(chrono::Utc::now().to_rfc3339()),
        };

        // Cache the result (with capacity limit)
        {
            let mut cache = self.cache.write().await;
            // Evict only when the insert will add a key. If another task
            // cached this URL while we were fetching, the insert below merely
            // overwrites it and no unrelated entry needs to be thrown away.
            if !cache.contains_key(url) && cache.len() >= CACHE_MAX_ENTRIES {
                // Simple eviction: remove whichever entry the map yields first
                if let Some(key) = cache.keys().next().cloned() {
                    cache.remove(&key);
                }
            }
            cache.insert(url.to_string(), result.clone());
        }

        let duration = start.elapsed().as_millis() as u64;
        tracing::info!(
            target: "researcher",
            url = url,
            duration_ms = duration,
            "Fetch completed"
        );

        Ok(result)
    }

    /// Ask the Jina Reader API (`https://r.jina.ai/`) to render `url` as Markdown.
    ///
    /// A dedicated client with a 20 second timeout is built for the call (a
    /// default client is used if the builder fails). When
    /// `search_config.jina_api_key` is set it is sent as a bearer token.
    ///
    /// # Returns
    /// At most the first 4096 characters of the response body.
    ///
    /// # Errors
    /// `HandError` when the request cannot be sent, the status is not a
    /// success code, the body cannot be read, or the body is blank.
    async fn fetch_via_jina(&self, url: &str) -> Result<String> {
        // Every failure in this function is reported as a `HandError`.
        let fail = zclaw_types::ZclawError::HandError;

        let http = reqwest::Client::builder()
            .timeout(std::time::Duration::from_secs(20))
            .build()
            .unwrap_or_else(|_| reqwest::Client::new());

        let request = http
            .post("https://r.jina.ai/")
            .header("Content-Type", "application/json")
            .header("X-Return-Format", "markdown")
            .header("X-Timeout", "15")
            .json(&serde_json::json!({ "url": url }));

        // The API key is optional; it only raises the rate limit.
        let request = match &self.search_config.jina_api_key {
            Some(key) => request.header("Authorization", format!("Bearer {}", key)),
            None => request,
        };

        let response = match request.send().await {
            Ok(received) => received,
            Err(e) => return Err(fail(format!("Jina Reader request failed: {}", e))),
        };

        let status = response.status();
        if !status.is_success() {
            return Err(fail(format!("Jina Reader returned HTTP {}", status)));
        }

        let body = match response.text().await {
            Ok(raw) => raw,
            Err(e) => return Err(fail(format!("Failed to read Jina response: {}", e))),
        };

        if body.trim().is_empty() {
            return Err(fail("Jina Reader returned empty response".to_string()));
        }

        // Cap the payload at 4096 characters (counted as chars, not bytes).
        Ok(body.chars().take(4096).collect())
    }

    /// Fetch `url` with the hand's own HTTP client and reduce the body to text.
    ///
    /// The `Content-Type` header decides how the body is treated:
    /// * contains `text/html` — the body is run through `extract_text_from_html`;
    /// * contains `text/` or `application/json` — the body is returned verbatim;
    /// * anything else (including a missing or unreadable header) — the body is
    ///   not read and the placeholder `"[Binary content]"` is returned.
    ///
    /// The HTTP status code is not inspected, so the body of an error response
    /// is handled like any other body.
    ///
    /// # Errors
    /// `HandError` when the request fails or the body cannot be read.
    async fn fetch_direct(&self, url: &str) -> Result<String> {
        let response = match self.client.get(url).send().await {
            Ok(received) => received,
            Err(e) => {
                return Err(zclaw_types::ZclawError::HandError(format!("Direct fetch failed: {}", e)));
            }
        };

        // Classify the payload up front: reading the body consumes `response`,
        // after which the headers are no longer reachable.
        let (is_html, is_textual) = {
            let declared = response
                .headers()
                .get(reqwest::header::CONTENT_TYPE)
                .and_then(|value| value.to_str().ok())
                .unwrap_or("");
            (
                declared.contains("text/html"),
                declared.contains("text/") || declared.contains("application/json"),
            )
        };

        if !is_html && !is_textual {
            return Ok("[Binary content]".to_string());
        }

        let body = response.text().await;
        if is_html {
            let html = body
                .map_err(|e| zclaw_types::ZclawError::HandError(format!("Failed to read HTML: {}", e)))?;
            Ok(self.extract_text_from_html(&html))
        } else {
            body.map_err(|e| zclaw_types::ZclawError::HandError(format!("Failed to read text: {}", e)))
        }
    }

    /// Extract readable text from HTML.
    ///
    /// A single pass over the document drops everything between `<` and `>`,
    /// drops the contents of `<script>` and `<style>` elements, and collapses
    /// runs of whitespace into one space. Entities are not decoded.
    ///
    /// The result is limited to 10000 bytes of text; longer output is cut at
    /// the nearest character boundary at or below that limit and `...` is
    /// appended.
    fn extract_text_from_html(&self, html: &str) -> String {
        const MAX_TEXT_BYTES: usize = 10000;

        // ASCII-only lowercasing never changes the byte length of a character,
        // so byte offsets into `html` are valid offsets into `html_lower` too.
        // (Full Unicode lowercasing can lengthen or shorten characters, which
        // would make the slice below misaligned or panic.)
        let html_lower = html.to_ascii_lowercase();
        let mut text = String::new();
        let mut in_tag = false;
        let mut in_script = false;
        let mut in_style = false;

        for (pos, c) in html.char_indices() {
            match c {
                '<' => {
                    // Check for closing tags before entering tag mode
                    let remaining = &html_lower[pos..];
                    if remaining.starts_with("</script") {
                        in_script = false;
                    } else if remaining.starts_with("</style") {
                        in_style = false;
                    }
                    // Check for opening tags
                    if remaining.starts_with("<script") {
                        in_script = true;
                    } else if remaining.starts_with("<style") {
                        in_style = true;
                    }
                    in_tag = true;
                }
                '>' => {
                    in_tag = false;
                }
                // Tag names and attributes are never part of the text.
                _ if in_tag => {}
                // Neither is script or stylesheet source.
                _ if in_script || in_style => {}
                ' ' | '\n' | '\t' | '\r' => {
                    // Collapse whitespace runs and skip leading whitespace.
                    if !text.ends_with(' ') && !text.is_empty() {
                        text.push(' ');
                    }
                }
                _ => text.push(c),
            }
        }

        if text.len() > MAX_TEXT_BYTES {
            // `String::truncate` panics when the cut falls inside a multi-byte
            // character, so step back to the closest character boundary.
            let mut cut = MAX_TEXT_BYTES;
            while !text.is_char_boundary(cut) {
                cut -= 1;
            }
            text.truncate(cut);
            text.push_str("...");
        }

        text.trim().to_string()
    }

    /// Run a search for `query` and assemble a `ResearchReport` from the hits.
    ///
    /// * The leading results (1, 3 or 5 depending on `query.depth`) have their
    ///   pages fetched; a failed fetch is logged and the result is kept
    ///   without content.
    /// * `key_findings` holds, for each of the first five results that has
    ///   content, the first three `". "`-separated pieces of that content.
    /// * `related_topics` holds the titles of up to five results whose snippet
    ///   is longer than 50 bytes.
    /// * `summary` is built from the non-empty snippets of the first three
    ///   results.
    /// * `duration_ms` covers the search and the fetches; it is measured
    ///   before the summary is built.
    ///
    /// # Errors
    /// Propagates the error of the underlying search.
    async fn execute_report(&self, query: &ResearchQuery) -> Result<ResearchReport> {
        let timer = std::time::Instant::now();

        let mut results = self.execute_search(query).await?;

        // How many of the leading results get their page content fetched.
        let max_fetches: usize = match query.depth {
            ResearchDepth::Quick => 1,
            ResearchDepth::Standard => 3,
            ResearchDepth::Deep => 5,
        };

        for entry in results.iter_mut().take(max_fetches) {
            if entry.url.is_empty() {
                continue;
            }
            match self.execute_fetch(&entry.url).await {
                Ok(page) => {
                    entry.content = page.content;
                    entry.fetched_at = page.fetched_at;
                }
                Err(e) => {
                    tracing::warn!(target: "researcher", error = %e, "Failed to fetch content");
                }
            }
        }

        // Leading sentences of each fetched page among the first five results.
        let mut key_findings: Vec<String> = Vec::new();
        for entry in results.iter().take(5) {
            if let Some(body) = entry.content.as_ref() {
                let leading: Vec<&str> = body.split(". ").take(3).collect();
                key_findings.push(leading.join(". "));
            }
        }

        // Titles of results that come with a reasonably long snippet.
        let related_topics: Vec<String> = results
            .iter()
            .filter(|entry| entry.snippet.len() > 50)
            .map(|entry| entry.title.clone())
            .take(5)
            .collect();

        let duration = timer.elapsed().as_millis() as u64;

        let summary = if results.is_empty() {
            "未找到相关结果，建议调整搜索关键词后重试".to_string()
        } else {
            let lead_snippets: Vec<&str> = results
                .iter()
                .take(3)
                .map(|entry| entry.snippet.trim())
                .filter(|snippet| !snippet.is_empty())
                .collect();
            match lead_snippets.is_empty() {
                true => format!("找到 {} 条相关结果，但无摘要信息", results.len()),
                false => format!(
                    "基于 {} 条搜索结果：{}",
                    results.len(),
                    lead_snippets.join("；")
                ),
            }
        };

        Ok(ResearchReport {
            query: query.query.clone(),
            results,
            summary: Some(summary),
            key_findings,
            related_topics,
            researched_at: chrono::Utc::now().to_rfc3339(),
            duration_ms: duration,
        })
    }
}

impl Default for ResearcherHand {
    fn default() -> Self {
        Self::new()
    }
}

/// `Hand` implementation: decodes the incoming JSON into a `ResearcherAction`
/// and dispatches it to the matching `execute_*` method.
#[async_trait]
impl Hand for ResearcherHand {
    /// Static configuration of this hand.
    fn config(&self) -> &HandConfig {
        &self.config
    }

    /// Execute one researcher action described by `input`.
    ///
    /// `input` is first deserialized strictly as a `ResearcherAction`; if that
    /// fails, `infer_action` is asked to derive an action from it.
    ///
    /// The success payload always carries an `"action"` key naming the action.
    /// `search`, `fetch` and `summarize` also carry `"duration_ms"`; `report`
    /// embeds the whole report instead.
    ///
    /// # Errors
    /// Returns the inference error when no action can be derived, and
    /// propagates errors from `search`, `fetch` and `report`. `summarize` is
    /// best-effort: URLs that fail to fetch are logged and left out of the
    /// results.
    async fn execute(&self, _context: &HandContext, input: Value) -> Result<HandResult> {
        tracing::debug!(target: "researcher", input = %input, "Researcher hand received input");
        // Try strict deserialization first, then fall back to inference
        let action: ResearcherAction = match serde_json::from_value(input.clone()) {
            Ok(a) => a,
            Err(e) => {
                tracing::debug!(target: "researcher", error = %e, input = %input, "Strict deserialization failed, trying inference");
                Self::infer_action(&input)?
            }
        };

        let start = std::time::Instant::now();

        let result = match action {
            ResearcherAction::Search { query } => {
                let results = self.execute_search(&query).await?;
                json!({
                    "action": "search",
                    "query": query.query,
                    "results": results,
                    "duration_ms": start.elapsed().as_millis()
                })
            }
            ResearcherAction::Fetch { url } => {
                let result = self.execute_fetch(&url).await?;
                json!({
                    "action": "fetch",
                    "url": url,
                    "result": result,
                    "duration_ms": start.elapsed().as_millis()
                })
            }
            ResearcherAction::Summarize { urls } => {
                let mut results = Vec::new();
                // Only the first five URLs are fetched.
                for url in urls.iter().take(5) {
                    match self.execute_fetch(url).await {
                        Ok(result) => results.push(result),
                        Err(e) => {
                            // Keep going with the remaining URLs, but leave a
                            // trace of why this one is missing from the output.
                            tracing::warn!(
                                target: "researcher",
                                url = %url,
                                error = %e,
                                "Skipping URL that could not be fetched for summarize"
                            );
                        }
                    }
                }
                json!({
                    "action": "summarize",
                    "urls": urls,
                    "results": results,
                    "duration_ms": start.elapsed().as_millis()
                })
            }
            ResearcherAction::Report { query } => {
                let report = self.execute_report(&query).await?;
                json!({
                    "action": "report",
                    "report": report
                })
            }
        };

        Ok(HandResult::success(result))
    }

    /// Research actions never ask for approval.
    fn needs_approval(&self) -> bool {
        false // Research operations are generally safe
    }

    /// Reports no missing dependencies.
    fn check_dependencies(&self) -> Result<Vec<String>> {
        // Network connectivity will be checked at runtime
        Ok(Vec::new())
    }

    /// Always reports `Idle`.
    fn status(&self) -> crate::HandStatus {
        crate::HandStatus::Idle
    }
}

/// Percent-encode `s` for use inside a URL.
///
/// Works on the UTF-8 bytes of the input, not on Unicode code points: ASCII
/// letters, digits and `-`, `_`, `.`, `~` are copied through, every other
/// byte becomes `%XX` with upper-case hex digits.
fn url_encode(s: &str) -> String {
    const HEX_UPPER: &[u8; 16] = b"0123456789ABCDEF";

    let mut encoded = String::with_capacity(s.len());
    for byte in s.bytes() {
        let unreserved =
            byte.is_ascii_alphanumeric() || matches!(byte, b'-' | b'_' | b'.' | b'~');
        if unreserved {
            encoded.push(char::from(byte));
        } else {
            encoded.push('%');
            encoded.push(char::from(HEX_UPPER[usize::from(byte >> 4)]));
            encoded.push(char::from(HEX_UPPER[usize::from(byte & 0x0F)]));
        }
    }
    encoded
}

/// Report whether `c` lies in one of the CJK blocks this module recognises.
///
/// Covered: CJK Unified Ideographs and Extension A, CJK Symbols and
/// Punctuation, Halfwidth and Fullwidth Forms, CJK Radicals Supplement and CJK
/// Compatibility Ideographs. Hiragana/Katakana (U+3040–U+30FF) and Hangul
/// syllables (U+AC00–U+D7AF) are not covered.
fn is_cjk_char(c: char) -> bool {
    // Inclusive (first, last) code points of each recognised block.
    const CJK_BLOCKS: &[(u32, u32)] = &[
        (0x4E00, 0x9FFF), // CJK Unified Ideographs
        (0x3400, 0x4DBF), // CJK Unified Ideographs Extension A
        (0x3000, 0x303F), // CJK Symbols and Punctuation
        (0xFF00, 0xFFEF), // Halfwidth and Fullwidth Forms
        (0x2E80, 0x2EFF), // CJK Radicals Supplement
        (0xF900, 0xFAFF), // CJK Compatibility Ideographs
    ];

    let code_point = u32::from(c);
    CJK_BLOCKS
        .iter()
        .any(|&(first, last)| first <= code_point && code_point <= last)
}

/// Decide whether `url_str` may be fetched (SSRF guard).
///
/// A URL passes when all of the following hold:
/// * its length in bytes does not exceed `MAX_URL_LENGTH`;
/// * it parses as a URL;
/// * its scheme is `http` or `https`;
/// * it has a host;
/// * the host — an IP literal or a name — is accepted by `validate_ip` or
///   `validate_hostname` respectively.
///
/// Only the text of the URL is examined; no DNS lookup is made here.
///
/// # Errors
/// `HandError` describing the first rule that was violated.
fn validate_fetch_url(url_str: &str) -> Result<()> {
    use zclaw_types::ZclawError::HandError;

    if url_str.len() > MAX_URL_LENGTH {
        return Err(HandError(
            format!("URL exceeds maximum length of {} characters", MAX_URL_LENGTH)
        ));
    }

    let parsed = match Url::parse(url_str) {
        Ok(parsed) => parsed,
        Err(e) => return Err(HandError(format!("Invalid URL: {}", e))),
    };

    let scheme = parsed.scheme();
    if scheme != "http" && scheme != "https" {
        return Err(HandError(
            format!("URL scheme '{}' not allowed, only http/https", scheme)
        ));
    }

    let host = match parsed.host_str() {
        Some(host) => host,
        None => return Err(HandError("URL must have a host".into())),
    };

    // IPv6 literals are written in brackets; remove them before parsing.
    let bare_host = host
        .strip_prefix('[')
        .and_then(|inner| inner.strip_suffix(']'))
        .unwrap_or(host);

    match bare_host.parse::<IpAddr>() {
        Ok(ip) => validate_ip(&ip),
        Err(_) => validate_hostname(host),
    }
}

/// Apply the address-family specific SSRF check to `ip`.
///
/// # Errors
/// Whatever `validate_ipv4` / `validate_ipv6` report for the address.
fn validate_ip(ip: &IpAddr) -> Result<()> {
    match *ip {
        IpAddr::V4(ref addr4) => validate_ipv4(addr4),
        IpAddr::V6(ref addr6) => validate_ipv6(addr6),
    }
}

/// Reject IPv4 addresses that must never be fetched on behalf of a caller.
///
/// Blocked ranges:
/// * `127.0.0.0/8` loopback
/// * `10.0.0.0/8`, `172.16.0.0/12`, `192.168.0.0/16` private networks
/// * `169.254.0.0/16` link-local (cloud metadata endpoints live here)
/// * `100.64.0.0/10` shared address space (RFC 6598); not publicly routable
///   and used for provider-internal services, e.g. the Alibaba Cloud metadata
///   endpoint `100.100.100.200`
/// * `0.0.0.0/8`
/// * `255.255.255.255` broadcast
/// * `224.0.0.0/4` multicast
/// * `240.0.0.0/4` reserved
///
/// # Errors
/// An "Access denied" error (see `ssrf_err`) naming the matched range.
fn validate_ipv4(ip: &Ipv4Addr) -> Result<()> {
    let reason = match ip.octets() {
        [127, ..] => "loopback",
        [10, ..] => "private 10.x.x.x",
        [172, 16..=31, ..] => "private 172.16-31.x.x",
        [192, 168, ..] => "private 192.168.x.x",
        [169, 254, ..] => "link-local/metadata",
        [100, 64..=127, ..] => "shared address space 100.64.0.0/10",
        [0, ..] => "0.x.x.x",
        // Must precede the reserved range, which also contains this address.
        [255, 255, 255, 255] => "broadcast",
        [224..=239, ..] => "multicast",
        [240..=255, ..] => "reserved 240.0.0.0/4",
        _ => return Ok(()),
    };
    Err(ssrf_err(reason))
}

/// Reject IPv6 addresses that must never be fetched on behalf of a caller.
///
/// Blocked: the loopback address `::1`, the unspecified address `::`,
/// link-local `fe80::/10`, unique local `fc00::/7`, and any address whose
/// sixth segment is `0xffff` and whose last 32 bits fail `validate_ipv4`.
///
/// NOTE(review): the `0xffff` test does not require the first five segments to
/// be zero, so it is broader than the IPv4-mapped form `::ffff:a.b.c.d` —
/// it errs on the side of blocking.
///
/// # Errors
/// An "Access denied" error (see `ssrf_err`) naming the matched rule.
fn validate_ipv6(ip: &Ipv6Addr) -> Result<()> {
    if ip.is_loopback() {
        return Err(ssrf_err("IPv6 loopback"));
    }
    if ip.is_unspecified() {
        return Err(ssrf_err("IPv6 unspecified"));
    }

    let [prefix, _, _, _, _, marker, high, low] = ip.segments();

    // IPv4-mapped: ::ffff:x.x.x.x — the embedded address has to pass the
    // IPv4 rules as well.
    if marker == 0xffff {
        let embedded = Ipv4Addr::from((u32::from(high) << 16) | u32::from(low));
        validate_ipv4(&embedded)?;
    }

    let is_link_local = prefix & 0xffc0 == 0xfe80; // fe80::/10
    let is_unique_local = prefix & 0xfe00 == 0xfc00; // fc00::/7
    if is_link_local {
        Err(ssrf_err("IPv6 link-local"))
    } else if is_unique_local {
        Err(ssrf_err("IPv6 unique local"))
    } else {
        Ok(())
    }
}

/// Reject host names that point at the local machine or at cluster/cloud
/// internal services.
///
/// The name is compared case-insensitively and without trailing dots, so the
/// fully-qualified spelling `localhost.` is treated exactly like `localhost`.
/// A name is blocked when it equals a blocklist entry or is a subdomain of
/// one. A host made only of ASCII digits is additionally read as a decimal
/// IPv4 address (`2130706433` = `127.0.0.1`) and checked by `validate_ipv4`.
///
/// # Errors
/// An "Access denied" error (see `ssrf_err`) for a blocked name or address.
fn validate_hostname(host: &str) -> Result<()> {
    const BLOCKED_HOSTS: &[&str] = &[
        "localhost", "localhost.localdomain", "ip6-localhost",
        "ip6-loopback", "metadata.google.internal", "metadata",
        "kubernetes.default", "kubernetes.default.svc",
    ];

    let lowered = host.to_lowercase();
    // "localhost." names the same host as "localhost"; without this
    // normalisation the trailing dot would slip past every entry below.
    let h = lowered.trim_end_matches('.');

    for blocked in BLOCKED_HOSTS {
        let is_subdomain = h
            .strip_suffix(*blocked)
            .map_or(false, |prefix| prefix.ends_with('.'));
        if h == *blocked || is_subdomain {
            return Err(ssrf_err(&format!("blocked host '{}'", host)));
        }
    }

    // Decimal IP bypass: 2130706433 = 127.0.0.1
    if h.chars().all(|c| c.is_ascii_digit()) {
        if let Ok(num) = h.parse::<u32>() {
            validate_ipv4(&Ipv4Addr::from(num))?;
        }
    }
    Ok(())
}

/// Build the `HandError` used for every SSRF rejection: the text
/// `Access denied: ` followed by `reason`.
fn ssrf_err(reason: &str) -> zclaw_types::ZclawError {
    let mut message = String::from("Access denied: ");
    message.push_str(reason);
    zclaw_types::ZclawError::HandError(message)
}

/// Return the text that sits between the first occurrence of `start` and the
/// first occurrence of `end` that follows it.
///
/// Returns `None` when `start` is absent, or when `end` does not occur after it.
fn extract_between<'a>(text: &'a str, start: &str, end: &str) -> Option<&'a str> {
    let (_, after_start) = text.split_once(start)?;
    let (inner, _) = after_start.split_once(end)?;
    Some(inner)
}

/// Strip HTML tags from a string.
///
/// Everything between `<` and `>` is removed (the text inside elements such as
/// `<script>` is kept — only the tags themselves go), a small set of common
/// entities is decoded, and runs of whitespace are collapsed to single spaces
/// with leading/trailing whitespace removed.
fn strip_html_tags(s: &str) -> String {
    let mut without_tags = String::with_capacity(s.len());
    let mut in_tag = false;
    for c in s.chars() {
        match c {
            '<' => in_tag = true,
            '>' => in_tag = false,
            _ if !in_tag => without_tags.push(c),
            _ => {}
        }
    }

    // Decode common HTML entities. `&amp;` has to be handled last: decoding it
    // first would turn an escaped entity such as `&amp;lt;` into `&lt;`, which
    // the following replacement would then wrongly decode a second time.
    let decoded = without_tags
        .replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&quot;", "\"")
        .replace("&#39;", "'")
        .replace("&nbsp;", " ")
        .replace("&#x27;", "'")
        .replace("&#x2F;", "/")
        .replace("&amp;", "&");

    // Collapse whitespace
    decoded.split_whitespace().collect::<Vec<_>>().join(" ")
}

/// Heuristic filter that separates genuine search hits from scraped
/// navigation, advertising or script fragments.
///
/// A result is rejected when
/// * the trimmed title is shorter than 2 or longer than 300 bytes;
/// * the lower-cased title contains a script/stylesheet/tracking marker, or
///   starts like a comment (`//`, `/*`);
/// * the URL contains `javascript:` or `data:`, is a bare fragment (`#…`), or
///   is a relative path (`/…`, but not protocol-relative `//…`);
/// * the lower-cased snippet looks like code: `function(` together with
///   `return `, or `var ` together with `=`.
fn is_quality_result(title: &str, snippet: &str, url: &str) -> bool {
    // Substrings that give a title away as script, stylesheet or tracking code.
    const TITLE_MARKERS: &[&str] = &[
        "function(", "var ", "const ",
        "window.", "document.",
        "{", "}",
        "cookie", "navigator.",
        ".css", "stylesheet",
        "google-analytics", "gtag",
    ];

    // --- title ---
    let title = title.trim();
    if !(2..=300).contains(&title.len()) {
        return false;
    }
    let title_lc = title.to_lowercase();
    let starts_like_comment = title_lc.starts_with("//") || title_lc.starts_with("/*");
    if starts_like_comment || TITLE_MARKERS.iter().any(|marker| title_lc.contains(*marker)) {
        return false;
    }

    // --- url ---
    let scripted_scheme = url.contains("javascript:") || url.contains("data:");
    let fragment_only = url.starts_with('#');
    let relative_path = url.starts_with('/') && !url.starts_with("//");
    if scripted_scheme || fragment_only || relative_path {
        return false;
    }

    // --- snippet ---
    let snippet_lc = snippet.to_lowercase();
    let function_body = snippet_lc.contains("function(") && snippet_lc.contains("return ");
    let var_assignment = snippet_lc.contains("var ") && snippet_lc.contains("=");

    !(function_body || var_assignment)
}

/// Pull the target out of the first `href="…"` attribute found in `text`.
///
/// Targets starting with `http` are returned unchanged, protocol-relative
/// targets (`//host/…`) are given an `https:` prefix, and anything else —
/// or a missing/unterminated attribute — yields `None`.
fn extract_href(text: &str) -> Option<String> {
    let (_, after_attr) = text.split_once("href=\"")?;
    let (target, _) = after_attr.split_once('"')?;

    if target.starts_with("http") {
        return Some(target.to_string());
    }
    if target.starts_with("//") {
        return Some(format!("https:{}", target));
    }
    None
}

/// Recover the destination URL from a DuckDuckGo redirect link.
///
/// The value of the first `uddg=` parameter (up to the next `&`) is
/// percent-decoded and returned when it starts with `http`. In every other
/// case the first plain `href` in `text` is used instead (see `extract_href`).
fn extract_href_uddg(text: &str) -> Option<String> {
    let from_redirect = text.find("uddg=").and_then(|at| {
        let encoded = text[at + 5..].split('&').next().unwrap_or("");
        // Use standard percent decoding instead of manual replacement
        let target = percent_decode(encoded);
        if target.starts_with("http") {
            Some(target)
        } else {
            None
        }
    });

    // Fallback: try regular href extraction
    from_redirect.or_else(|| extract_href(text))
}

/// Percent-decode a URL-encoded string.
///
/// Every `%XX` sequence with two hexadecimal digits is replaced by the byte it
/// encodes; anything else — including a `%` that is not followed by two hex
/// digits — is copied through unchanged. `+` is not turned into a space. The
/// decoded bytes are interpreted as UTF-8, with invalid sequences replaced by
/// U+FFFD.
fn percent_decode(input: &str) -> String {
    /// Value of one ASCII hex digit, or `None` for any other byte.
    fn hex_value(byte: u8) -> Option<u8> {
        match byte {
            b'0'..=b'9' => Some(byte - b'0'),
            b'a'..=b'f' => Some(byte - b'a' + 10),
            b'A'..=b'F' => Some(byte - b'A' + 10),
            _ => None,
        }
    }

    let bytes = input.as_bytes();
    let mut decoded = Vec::with_capacity(bytes.len());
    let mut i = 0;
    while i < bytes.len() {
        if bytes[i] == b'%' {
            // Work on bytes, not on `&str` slices: slicing `input` here could
            // end inside a multi-byte character and panic, and the integer
            // parser would also accept a sign ("%+f").
            let high = bytes.get(i + 1).copied().and_then(hex_value);
            let low = bytes.get(i + 2).copied().and_then(hex_value);
            if let (Some(high), Some(low)) = (high, low) {
                decoded.push((high << 4) | low);
                i += 3;
                continue;
            }
        }
        decoded.push(bytes[i]);
        i += 1;
    }
    String::from_utf8_lossy(&decoded).into_owned()
}

#[cfg(test)]
mod tests {
    use super::*;

    fn create_test_hand() -> ResearcherHand {
        ResearcherHand::new()
    }

    fn test_context() -> HandContext {
        HandContext::default()
    }

    // --- Config & Type Tests ---

    #[test]
    fn test_config_id() {
        let hand = create_test_hand();
        assert_eq!(hand.config().id, "researcher");
        assert_eq!(hand.config().name, "研究员");
        assert!(hand.config().enabled);
        assert!(!hand.config().needs_approval);
    }

    #[test]
    fn test_search_engine_default_is_auto() {
        let engine = SearchEngine::default();
        assert!(matches!(engine, SearchEngine::Auto));
    }

    #[test]
    fn test_search_engine_searxng_deserialize() {
        let engine: SearchEngine = serde_json::from_str("\"searxng\"").unwrap();
        assert!(matches!(engine, SearchEngine::SearXNG));
    }

    #[test]
    fn test_research_depth_default_is_standard() {
        let depth = ResearchDepth::default();
        assert!(matches!(depth, ResearchDepth::Standard));
    }

    #[test]
    fn test_research_depth_serialize() {
        let json = serde_json::to_string(&ResearchDepth::Deep).unwrap();
        assert_eq!(json, "\"deep\"");
    }

    #[test]
    fn test_research_depth_deserialize() {
        let depth: ResearchDepth = serde_json::from_str("\"quick\"").unwrap();
        assert!(matches!(depth, ResearchDepth::Quick));
    }

    #[test]
    fn test_search_engine_serialize_roundtrip() {
        for engine in [SearchEngine::SearXNG, SearchEngine::Google, SearchEngine::Bing, SearchEngine::DuckDuckGo, SearchEngine::Auto] {
            let json = serde_json::to_string(&engine).unwrap();
            let back: SearchEngine = serde_json::from_str(&json).unwrap();
            assert_eq!(json, serde_json::to_string(&back).unwrap());
        }
    }

    // --- Action Deserialization Tests ---

    #[test]
    fn test_action_search_deserialize() {
        let json = json!({
            "action": "search",
            "query": {
                "query": "Rust programming",
                "engine": "duckduckgo",
                "depth": "quick",
                "maxResults": 5
            }
        });
        let action: ResearcherAction = serde_json::from_value(json).unwrap();
        match action {
            ResearcherAction::Search { query } => {
                assert_eq!(query.query, "Rust programming");
                assert!(matches!(query.engine, SearchEngine::DuckDuckGo));
                assert!(matches!(query.depth, ResearchDepth::Quick));
                assert_eq!(query.max_results, 5);
            }
            _ => panic!("Expected Search action"),
        }
    }

    #[test]
    fn test_action_fetch_deserialize() {
        let json = json!({
            "action": "fetch",
            "url": "https://example.com/page"
        });
        let action: ResearcherAction = serde_json::from_value(json).unwrap();
        match action {
            ResearcherAction::Fetch { url } => {
                assert_eq!(url, "https://example.com/page");
            }
            _ => panic!("Expected Fetch action"),
        }
    }

    #[test]
    fn test_action_report_deserialize() {
        let json = json!({
            "action": "report",
            "query": {
                "query": "AI trends 2026",
                "depth": "deep"
            }
        });
        let action: ResearcherAction = serde_json::from_value(json).unwrap();
        match action {
            ResearcherAction::Report { query } => {
                assert_eq!(query.query, "AI trends 2026");
                assert!(matches!(query.depth, ResearchDepth::Deep));
            }
            _ => panic!("Expected Report action"),
        }
    }

    #[test]
    fn test_action_invalid_rejected() {
        let json = json!({
            "action": "unknown_action",
            "data": "whatever"
        });
        let result: std::result::Result<ResearcherAction, _> = serde_json::from_value(json);
        assert!(result.is_err());
    }

    // --- URL Encoding Tests ---

    #[test]
    fn test_url_encode_ascii() {
        assert_eq!(url_encode("hello world"), "hello%20world");
    }

    #[test]
    fn test_url_encode_chinese() {
        // "医" = UTF-8 bytes E5 8C BB → must produce %E5%8C%BB, not %533B
        let encoded = url_encode("医");
        assert_eq!(encoded, "%E5%8C%BB");

        // Full phrase: "中文" = E4 B8 AD E6 96 87
        let encoded = url_encode("中文搜索");
        assert_eq!(&encoded[0..9], "%E4%B8%AD");
        assert!(!encoded.contains("中文"));
    }

    #[test]
    fn test_url_encode_safe_chars() {
        assert_eq!(url_encode("abc123-_."), "abc123-_.".to_string());
    }

    #[test]
    fn test_url_encode_empty() {
        assert_eq!(url_encode(""), "");
    }

    // --- HTML Text Extraction Tests ---

    #[test]
    fn test_extract_text_basic() {
        let hand = create_test_hand();
        let html = "<html><body><h1>Title</h1><p>Content here</p></body></html>";
        let text = hand.extract_text_from_html(html);
        assert!(text.contains("Title"));
        assert!(text.contains("Content here"));
    }

    #[test]
    fn test_extract_text_strips_scripts() {
        let hand = create_test_hand();
        let html = "<html><body><script>alert('xss')</script><p>Safe text</p></body></html>";
        let text = hand.extract_text_from_html(html);
        assert!(!text.contains("alert"));
        assert!(text.contains("Safe text"));
    }

    #[test]
    fn test_extract_text_strips_styles() {
        let hand = create_test_hand();
        let html = "<html><body><style>.class{color:red}</style><p>Visible</p></body></html>";
        let text = hand.extract_text_from_html(html);
        assert!(!text.contains("color"));
        assert!(text.contains("Visible"));
    }

    #[test]
    fn test_extract_text_truncates_long_content() {
        let hand = create_test_hand();
        let long_body: String = "x".repeat(20000);
        let html = format!("<html><body><p>{}</p></body></html>", long_body);
        let text = hand.extract_text_from_html(&html);
        assert!(text.len() <= 10003); // 10000 + "..."
    }

    #[test]
    fn test_extract_text_empty_body() {
        let hand = create_test_hand();
        let html = "<html><body></body></html>";
        let text = hand.extract_text_from_html(html);
        assert!(text.is_empty());
    }

    // --- Hand Trait Tests ---

    #[tokio::test]
    async fn test_needs_approval_is_false() {
        let hand = create_test_hand();
        assert!(!hand.needs_approval());
    }

    #[tokio::test]
    async fn test_status_is_idle() {
        let hand = create_test_hand();
        assert!(matches!(hand.status(), crate::HandStatus::Idle));
    }

    #[tokio::test]
    async fn test_check_dependencies_ok() {
        let hand = create_test_hand();
        let missing = hand.check_dependencies().unwrap();
        // Default is_dependency_available returns true for all
        assert!(missing.is_empty());
    }

    // --- Default Values Tests ---

    #[test]
    fn test_research_query_defaults() {
        let json = json!({ "query": "test" });
        let query: ResearchQuery = serde_json::from_value(json).unwrap();
        assert_eq!(query.query, "test");
        assert!(matches!(query.engine, SearchEngine::Auto));
        assert!(matches!(query.depth, ResearchDepth::Standard));
        assert_eq!(query.max_results, 10);
        assert_eq!(query.time_limit_secs, 60);
        assert!(!query.include_related);
    }

    #[test]
    fn test_search_result_serialization() {
        let result = SearchResult {
            title: "Test".to_string(),
            url: "https://example.com".to_string(),
            snippet: "A snippet".to_string(),
            source: "TestSource".to_string(),
            relevance: 90,
            content: None,
            fetched_at: None,
        };
        let json = serde_json::to_string(&result).unwrap();
        assert!(json.contains("Test"));
        assert!(json.contains("https://example.com"));
    }

    #[test]
    fn test_research_report_summary_is_some_when_results() {
        // Verify the struct allows Some value
        let report = ResearchReport {
            query: "test".to_string(),
            results: vec![SearchResult {
                title: "R".to_string(),
                url: "https://r.co".to_string(),
                snippet: "snippet text".to_string(),
                source: "S".to_string(),
                relevance: 80,
                content: None,
                fetched_at: None,
            }],
            summary: Some("基于 1 条搜索结果：snippet text".to_string()),
            key_findings: vec![],
            related_topics: vec![],
            researched_at: "2026-01-01T00:00:00Z".to_string(),
            duration_ms: 100,
        };
        assert!(report.summary.is_some());
        assert!(report.summary.unwrap().contains("snippet text"));
    }

    // --- SearchConfig Tests ---

    #[test]
    fn test_search_config_default() {
        let config = SearchConfig::default();
        assert!(matches!(config.default_engine, SearchEngine::Auto));
        assert_eq!(config.searxng_url, "http://localhost:8888");
        assert_eq!(config.timeout_secs, 15);
    }

    #[test]
    fn test_search_config_load_fallback_on_missing_file() {
        // Config loads from config/config.toml which may not exist in test CWD
        let config = SearchConfig::load();
        // Should return a valid config either way
        assert!(!config.searxng_url.is_empty());
    }

    // --- SearXNG Response Parsing Tests ---

    #[test]
    fn test_searxng_response_parse() {
        let mock_response = json!({
            "query": "Rust programming",
            "number_of_results": 42,
            "results": [
                {
                    "url": "https://www.rust-lang.org/",
                    "title": "Rust Programming Language",
                    "content": "A language empowering everyone to build reliable software.",
                    "engine": "google",
                    "engines": ["google", "duckduckgo"],
                    "score": 5.2,
                    "category": "general"
                },
                {
                    "url": "https://doc.rust-lang.org/book/",
                    "title": "The Rust Book",
                    "content": "The official guide to Rust programming.",
                    "engine": "bing",
                    "engines": ["bing"],
                    "score": 3.1,
                    "category": "general"
                }
            ],
            "suggestions": ["rust tutorial", "rust vs go"]
        });

        let results = mock_response.get("results").unwrap().as_array().unwrap();
        assert_eq!(results.len(), 2);

        // Verify first result mapping
        let r0 = &results[0];
        assert_eq!(r0["title"].as_str().unwrap(), "Rust Programming Language");
        assert_eq!(r0["url"].as_str().unwrap(), "https://www.rust-lang.org/");
        assert_eq!(r0["content"].as_str().unwrap(), "A language empowering everyone to build reliable software.");

        let engines: Vec<&str> = r0["engines"].as_array().unwrap()
            .iter().filter_map(|e| e.as_str()).collect();
        assert_eq!(engines, vec!["google", "duckduckgo"]);
    }

    #[test]
    fn test_searxng_empty_results() {
        let mock_response = json!({
            "query": "nonexistent xyzzy123",
            "number_of_results": 0,
            "results": [],
            "suggestions": []
        });

        let results = mock_response.get("results").unwrap().as_array().unwrap();
        assert!(results.is_empty());
    }

    #[test]
    fn test_searxng_score_normalization() {
        // Score 5.2 → (5.2 * 10) = 52 → relevance 52
        let score = 5.2_f64;
        let relevance = if score > 0.0 {
            (score.min(10.0) * 10.0) as u8
        } else {
            50
        };
        assert_eq!(relevance, 52);

        // Score 15.0 → clamped to 10.0 → relevance 100
        let score = 15.0_f64;
        let relevance = if score > 0.0 {
            (score.min(10.0) * 10.0) as u8
        } else {
            50
        };
        assert_eq!(relevance, 100);

        // Score 0.0 → default relevance 50
        let score = 0.0_f64;
        let relevance = if score > 0.0 {
            (score.min(10.0) * 10.0) as u8
        } else {
            50
        };
        assert_eq!(relevance, 50);
    }

    /// Builds a SearXNG search URL from the default configuration and a
    /// Chinese query, then checks the base, the fixed parameters and the
    /// percent-encoding of the query.
    #[test]
    fn test_searxng_url_construction() {
        let config = SearchConfig::default();
        let base = config.searxng_url.trim_end_matches('/');
        let encoded_query = url_encode("2024年中国医疗政策");
        let url = format!(
            "{}/search?q={}&format=json&categories=general&language=auto&pageno=1",
            base, encoded_query
        );

        assert!(url.starts_with("http://localhost:8888/search?"));
        // "%E4%B8%AD" is the UTF-8 byte sequence of 中 (E4 B8 AD).
        for fragment in ["format=json", "categories=general", "%E4%B8%AD"] {
            assert!(url.contains(fragment), "missing {:?} in {}", fragment, url);
        }
        // The Unicode code point must NOT leak into the URL.
        assert!(!url.contains("%4E2D"));
    }

    // --- Native Search Helper Tests ---

    /// `is_cjk_char` is expected to accept Chinese ideographs and the
    /// ideographic full stop, and to reject ASCII letters, digits and spaces.
    #[test]
    fn test_is_cjk_char_chinese() {
        for accepted in ['中', '医', '。'] {
            assert!(is_cjk_char(accepted), "{:?} should be CJK", accepted);
        }
        for rejected in ['a', '1', ' '] {
            assert!(!is_cjk_char(rejected), "{:?} should not be CJK", rejected);
        }
    }

    /// A query counts as Chinese when at least one of its characters passes
    /// `is_cjk_char`; a purely English query must have none.
    #[test]
    fn test_is_cjk_char_detects_chinese_query() {
        let contains_cjk = |text: &str| text.chars().any(is_cjk_char);

        assert!(contains_cjk("2024年中国医疗政策"));
        assert!(!contains_cjk("Rust programming language"));
    }

    /// Table-driven check of `strip_html_tags`: tags are dropped, plain text is
    /// untouched, and the `&amp;`, `&lt;`, `&gt;` entities are decoded.
    #[test]
    fn test_strip_html_tags() {
        let cases = [
            ("<b>Hello</b>", "Hello"),
            ("<a href=\"x\">Link</a>", "Link"),
            ("plain text", "plain text"),
            ("&amp;&lt;&gt;", "&<>"),
            // strip_html_tags only removes tags, not script content
            ("<script>alert()</script>Safe", "alert()Safe"),
        ];
        for (markup, expected) in cases {
            assert_eq!(strip_html_tags(markup), expected, "input: {:?}", markup);
        }
    }

    /// `extract_between` should return the text enclosed by the two delimiters.
    #[test]
    fn test_extract_between_basic() {
        let inner = extract_between("prefix<div>content</div>suffix", "<div>", "</div>");
        assert_eq!(inner, Some("content"));
    }

    /// When the delimiters do not occur in the input, `extract_between`
    /// should yield `None`.
    #[test]
    fn test_extract_between_not_found() {
        let haystack = "no delimiters here";
        assert!(extract_between(haystack, "<div>", "</div>").is_none());
    }

    /// An absolute `href` attribute value is expected to come back unchanged.
    #[test]
    fn test_extract_href() {
        let anchor = r#"<a href="https://example.com/page">Title</a>"#;
        let href = extract_href(anchor);
        assert_eq!(href.as_deref(), Some("https://example.com/page"));
    }

    /// A protocol-relative `href` (`//host/path`) is expected to be returned
    /// with an `https:` scheme in front.
    #[test]
    fn test_extract_href_protocol_relative() {
        let anchor = r#"<a href="//example.com/page">Title</a>"#;
        let href = extract_href(anchor);
        assert_eq!(href.as_deref(), Some("https://example.com/page"));
    }

    /// For a DuckDuckGo redirect link, the expected result is the
    /// percent-decoded value of the `uddg` query parameter.
    #[test]
    fn test_extract_href_uddg() {
        let redirect = r#"href="//duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.com%2Fpage&amp;rut=abc""#;
        let target = extract_href_uddg(redirect);
        assert_eq!(target.as_deref(), Some("https://example.com/page"));
    }

    /// Without a `uddg` parameter, `extract_href_uddg` is expected to return
    /// the plain `href` value.
    #[test]
    fn test_extract_href_uddg_fallback() {
        let anchor = r#"<a href="https://example.com/direct">Title</a>"#;
        let target = extract_href_uddg(anchor);
        assert_eq!(target.as_deref(), Some("https://example.com/direct"));
    }

    // --- HTML Parser Tests ---

    /// Feeds two DuckDuckGo-style result blocks to `parse_ddg_html` and checks
    /// the count, the first entry's title, decoded URL and source label, and
    /// the second entry's title.
    #[test]
    fn test_parse_ddg_html() {
        let html = r#"
        <div class="result__body">
            <a rel="nofollow" class="result__a" href="//duckduckgo.com/l/?uddg=https%3A%2F%2Frust-lang.org&amp;rut=abc">Rust Programming Language</a>
            <a class="result__snippet">A systems programming language focused on safety and speed.</a>
        </div>
        <div class="result__body">
            <a rel="nofollow" class="result__a" href="//duckduckgo.com/l/?uddg=https%3A%2F%2Fdoc.rust-lang.org&amp;rut=def">The Rust Book</a>
            <a class="result__snippet">The official guide to Rust programming.</a>
        </div>
        "#;

        let parsed = create_test_hand().parse_ddg_html(html, 10);
        assert_eq!(parsed.len(), 2);

        let first = &parsed[0];
        assert_eq!(first.title, "Rust Programming Language");
        assert_eq!(first.url, "https://rust-lang.org");
        assert_eq!(first.source, "DuckDuckGo");

        let second = &parsed[1];
        assert_eq!(second.title, "The Rust Book");
    }

    /// With 20 result blocks in the page and a limit of 5 passed as the second
    /// argument, `parse_ddg_html` must return exactly 5 entries.
    #[test]
    fn test_parse_ddg_html_max_results() {
        let hand = create_test_hand();
        // Concatenate 20 generated result blocks into a single page.
        let page: String = (0..20)
            .map(|idx| {
                format!(
                    r#"<div class="result__body"><a class="result__a" href="//duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.com%2F{}">Result {}</a><a class="result__snippet">Snippet {}</a></div>"#,
                    idx, idx, idx
                )
            })
            .collect();

        let limited = hand.parse_ddg_html(&page, 5);
        assert_eq!(limited.len(), 5);
    }

    /// A page without any result markup must produce no DuckDuckGo results.
    #[test]
    fn test_parse_ddg_html_empty() {
        let page = "<html><body>No results here</body></html>";
        let parsed = create_test_hand().parse_ddg_html(page, 10);
        assert!(parsed.is_empty());
    }

    /// Feeds two Bing-style `b_algo` items to `parse_bing_html` and checks the
    /// count plus the first entry's title, URL and source label.
    #[test]
    fn test_parse_bing_html() {
        let html = r#"
        <li class="b_algo">
            <h2><a href="https://example.com/result1">Example Result 1</a></h2>
            <div class="b_caption"><p>This is the first result snippet.</p></div>
        </li>
        <li class="b_algo">
            <h2><a href="https://example.com/result2">Example Result 2</a></h2>
            <div class="b_caption"><p>This is the second result snippet.</p></div>
        </li>
        "#;

        let parsed = create_test_hand().parse_bing_html(html, 10);
        assert_eq!(parsed.len(), 2);

        let first = &parsed[0];
        assert_eq!(first.title, "Example Result 1");
        assert_eq!(first.url, "https://example.com/result1");
        assert_eq!(first.source, "Bing");
    }

    /// Of two `b_algo` items, the one linking to bing.com is expected to be
    /// dropped, leaving only the external result.
    #[test]
    fn test_parse_bing_html_skips_internal_urls() {
        let html = r#"
        <li class="b_algo">
            <h2><a href="https://bing.com/search?q=more">More Results</a></h2>
        </li>
        <li class="b_algo">
            <h2><a href="https://example.com/real">Real Result</a></h2>
        </li>
        "#;

        let parsed = create_test_hand().parse_bing_html(html, 10);
        assert_eq!(parsed.len(), 1);

        let survivor = &parsed[0];
        assert_eq!(survivor.url, "https://example.com/real");
    }

    /// A page without any result markup must produce no Bing results.
    #[test]
    fn test_parse_bing_html_empty() {
        let page = "<html><body>Nothing here</body></html>";
        let parsed = create_test_hand().parse_bing_html(page, 10);
        assert!(parsed.is_empty());
    }

    /// Feeds two Baidu-style containers to `parse_baidu_html`. The check is
    /// deliberately loose: at least one result, and the first is labelled
    /// "Baidu".
    #[test]
    fn test_parse_baidu_html() {
        let html = r#"
        <div class="result c-container">
            <h3 class="t"><a href="https://www.example.cn/page1">中国医疗政策 2024</a></h3>
            <div class="c-abstract">这是关于医疗政策的摘要信息。</div>
        </div>
        <div class="c-container new-pmd">
            <h3><a href="https://www.example.cn/page2">第二条结果</a></h3>
        </div>
        "#;

        let parsed = create_test_hand().parse_baidu_html(html, 10);
        assert!(
            !parsed.is_empty(),
            "Should find at least 1 result, got {}",
            parsed.len()
        );
        assert_eq!(parsed[0].source, "Baidu");
    }

    // --- SSRF Validation Tests ---

    /// Loopback targets, given by hostname or by IPv4 literal, must be
    /// refused by `validate_fetch_url`.
    #[test]
    fn test_ssrf_blocks_localhost() {
        let loopback_targets = [
            "http://localhost:8080/admin",
            "http://127.0.0.1:5432/db",
        ];
        for target in loopback_targets {
            assert!(
                validate_fetch_url(target).is_err(),
                "{} should be rejected",
                target
            );
        }
    }

    /// One address from each of 10.0.0.0/8, 192.168.0.0/16 and 172.16.0.0/12
    /// must be refused by `validate_fetch_url`.
    #[test]
    fn test_ssrf_blocks_private_ip() {
        let private_targets = [
            "http://10.0.0.1/secret",
            "http://192.168.1.1/router",
            "http://172.16.0.1/internal",
        ];
        for target in private_targets {
            assert!(
                validate_fetch_url(target).is_err(),
                "{} should be rejected",
                target
            );
        }
    }

    /// The link-local address 169.254.169.254 (commonly a cloud metadata
    /// endpoint) must be refused by `validate_fetch_url`.
    #[test]
    fn test_ssrf_blocks_cloud_metadata() {
        let verdict = validate_fetch_url("http://169.254.169.254/metadata");
        assert!(verdict.is_err());
    }

    /// URLs using the `file` or `ftp` scheme must be refused by
    /// `validate_fetch_url`.
    #[test]
    fn test_ssrf_blocks_non_http_scheme() {
        let disallowed = ["file:///etc/passwd", "ftp://example.com/file"];
        for target in disallowed {
            assert!(
                validate_fetch_url(target).is_err(),
                "{} should be rejected",
                target
            );
        }
    }

    /// Ordinary public HTTPS URLs, with or without a query string, must be
    /// accepted by `validate_fetch_url`.
    #[test]
    fn test_ssrf_allows_public_url() {
        let public_targets = [
            "https://www.rust-lang.org/learn",
            "https://example.com/page?q=test",
        ];
        for target in public_targets {
            assert!(
                validate_fetch_url(target).is_ok(),
                "{} should be accepted",
                target
            );
        }
    }

    // --- Percent Decode Tests ---

    /// `percent_decode` should turn `%20` into a space and reassemble
    /// multi-byte UTF-8 sequences into their characters.
    #[test]
    fn test_percent_decode_basic() {
        let cases = [
            ("hello%20world", "hello world"),
            ("%E4%B8%AD%E6%96%87", "中文"),
        ];
        for (encoded, expected) in cases {
            assert_eq!(percent_decode(encoded), expected, "input: {:?}", encoded);
        }
    }

    /// A fully percent-encoded URL should decode back to its readable form,
    /// including `:`, `/`, `?` and `=`.
    #[test]
    fn test_percent_decode_full_url() {
        let decoded = percent_decode("https%3A%2F%2Fexample.com%2Fpage%3Fq%3Dtest");
        assert_eq!(decoded, "https://example.com/page?q=test");
    }

    /// Input without any percent escapes should come back unchanged.
    #[test]
    fn test_percent_decode_no_encoding() {
        let untouched = "plain-text_123";
        assert_eq!(percent_decode(untouched), untouched);
    }

    // --- Input Validation Tests ---

    /// A query consisting only of whitespace must fail validation.
    #[test]
    fn test_research_query_validate_empty() {
        let request = ResearchQuery {
            query: "  ".to_string(),
            engine: SearchEngine::Auto,
            depth: ResearchDepth::Standard,
            max_results: 10,
            include_related: false,
            time_limit_secs: 60,
        };
        let outcome = request.validate();
        assert!(outcome.is_err());
    }

    /// A 501-character query must fail validation.
    #[test]
    fn test_research_query_validate_too_long() {
        let request = ResearchQuery {
            query: "x".repeat(501),
            engine: SearchEngine::Auto,
            depth: ResearchDepth::Standard,
            max_results: 10,
            include_related: false,
            time_limit_secs: 60,
        };
        let outcome = request.validate();
        assert!(outcome.is_err());
    }

    /// Requesting 999 results must fail validation even though the query text
    /// itself is fine.
    #[test]
    fn test_research_query_validate_max_results_overflow() {
        let request = ResearchQuery {
            query: "test".to_string(),
            engine: SearchEngine::Auto,
            depth: ResearchDepth::Standard,
            max_results: 999,
            include_related: false,
            time_limit_secs: 60,
        };
        let outcome = request.validate();
        assert!(outcome.is_err());
    }

    /// A short query with a `max_results` of 10 must pass validation.
    #[test]
    fn test_research_query_validate_ok() {
        let request = ResearchQuery {
            query: "Rust programming".to_string(),
            engine: SearchEngine::Auto,
            depth: ResearchDepth::Standard,
            max_results: 10,
            include_related: false,
            time_limit_secs: 60,
        };
        let outcome = request.validate();
        assert!(outcome.is_ok());
    }

    // --- Quality Filter Tests ---

    /// A title that reads like JavaScript source must be rejected by
    /// `is_quality_result`.
    #[test]
    fn test_quality_rejects_javascript_title() {
        let title = "function(x) { return x; }";
        let accepted = is_quality_result(title, "ok", "https://example.com");
        assert!(!accepted);
    }

    /// A single-character title must be rejected by `is_quality_result`.
    #[test]
    fn test_quality_rejects_short_title() {
        let title = "A";
        let accepted = is_quality_result(title, "snippet", "https://example.com");
        assert!(!accepted);
    }

    /// A title that reads like a CSS rule must be rejected by
    /// `is_quality_result`.
    #[test]
    fn test_quality_rejects_css_title() {
        let title = ".stylesheet{color:red}";
        let accepted = is_quality_result(title, "ok", "https://example.com");
        assert!(!accepted);
    }

    /// A result whose URL uses the `javascript:` scheme must be rejected by
    /// `is_quality_result`.
    #[test]
    fn test_quality_rejects_javascript_url() {
        let url = "javascript:alert(1)";
        let accepted = is_quality_result("Title", "snippet", url);
        assert!(!accepted);
    }

    /// An ordinary Chinese-language result must be accepted by
    /// `is_quality_result`.
    #[test]
    fn test_quality_accepts_normal_result() {
        let title = "2024年中国医疗政策解读";
        let snippet = "相关政策文件摘要";
        let url = "https://www.gov.cn/policy";
        assert!(is_quality_result(title, snippet, url));
    }

    /// An ordinary English-language result must be accepted by
    /// `is_quality_result`.
    #[test]
    fn test_quality_accepts_english_result() {
        let title = "Rust Programming Language";
        let snippet = "A systems programming language";
        let url = "https://www.rust-lang.org";
        assert!(is_quality_result(title, snippet, url));
    }

    /// A 301-character title must be rejected by `is_quality_result`.
    #[test]
    fn test_quality_rejects_long_title() {
        let oversized = "x".repeat(301);
        let accepted = is_quality_result(oversized.as_str(), "ok", "https://example.com");
        assert!(!accepted);
    }

    /// Runs of whitespace (spaces, newlines, tabs) left in or between text
    /// are expected to collapse to a single space.
    #[test]
    fn test_strip_html_tags_collapses_whitespace() {
        let cases = [
            ("<b>Hello</b>  <i>World</i>", "Hello World"),
            ("a\n\t b", "a b"),
        ];
        for (markup, expected) in cases {
            assert_eq!(strip_html_tags(markup), expected, "input: {:?}", markup);
        }
    }
}