diff --git a/crates/zclaw-hands/src/hands/researcher.rs b/crates/zclaw-hands/src/hands/researcher.rs index 9e24094..829ae7a 100644 --- a/crates/zclaw-hands/src/hands/researcher.rs +++ b/crates/zclaw-hands/src/hands/researcher.rs @@ -37,6 +37,7 @@ struct SearchConfig { default_engine: SearchEngine, searxng_url: String, timeout_secs: u64, + jina_api_key: Option, } impl Default for SearchConfig { @@ -45,6 +46,7 @@ impl Default for SearchConfig { default_engine: SearchEngine::Auto, searxng_url: "http://localhost:8888".to_string(), timeout_secs: 15, + jina_api_key: None, } } } @@ -101,6 +103,7 @@ impl SearchConfig { searxng_url: s.searxng_url .unwrap_or_else(|| "http://localhost:8888".to_string()), timeout_secs: s.searxng_timeout.unwrap_or(15), + jina_api_key: std::env::var("ZCLAW_JINA_API_KEY").ok(), } } None => Self::default(), @@ -492,16 +495,18 @@ impl ResearcherHand { Ok(results) } - /// Search using DuckDuckGo HTML (real search results, not Instant Answer API) + /// Search using DuckDuckGo HTML (POST method, matching ddgs library behavior) async fn search_duckduckgo_html(&self, query: &str, max_results: usize) -> Result> { - let url = format!( - "https://html.duckduckgo.com/html/?q={}", - url_encode(query) - ); + let has_cjk = query.chars().any(|c| is_cjk_char(c)); + let region = if has_cjk { "wt-wt" } else { "wt-wt" }; + let body = format!("q={}&b=&l={}", url_encode(query), region); let response = self.client - .get(&url) - .header("Accept", "text/html") + .post("https://html.duckduckgo.com/html/") + .header("Content-Type", "application/x-www-form-urlencoded") + .header("Accept", "text/html,application/xhtml+xml") + .header("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8") + .body(body) .send() .await .map_err(|e| zclaw_types::ZclawError::HandError( @@ -752,6 +757,7 @@ impl ResearcherHand { } /// Fetch content from a URL (with SSRF protection) + /// Tries Jina Reader API first for clean Markdown, falls back to direct fetch async fn execute_fetch(&self, url: &str) -> Result { let start = std::time::Instant::now(); @@ -768,27 +774,13 @@ impl ResearcherHand { } } - let response = self.client - .get(url) - .send() - .await - .map_err(|e| zclaw_types::ZclawError::HandError(format!("Fetch request failed: {}", e)))?; - - let content_type = response.headers() - .get(reqwest::header::CONTENT_TYPE) - .and_then(|v| v.to_str().ok()) - .unwrap_or(""); - - let content = if content_type.contains("text/html") { - // Extract text from HTML - let html = response.text().await - .map_err(|e| zclaw_types::ZclawError::HandError(format!("Failed to read HTML: {}", e)))?; - self.extract_text_from_html(&html) - } else if content_type.contains("text/") || content_type.contains("application/json") { - response.text().await - .map_err(|e| zclaw_types::ZclawError::HandError(format!("Failed to read text: {}", e)))? - } else { - "[Binary content]".to_string() + // Try Jina Reader API first (returns clean Markdown) + let content = match self.fetch_via_jina(url).await { + Ok(text) => text, + Err(e) => { + tracing::warn!(target: "researcher", error = %e, "Jina Reader failed, falling back to direct fetch"); + self.fetch_direct(url).await? + } }; let result = SearchResult { @@ -824,6 +816,80 @@ impl ResearcherHand { Ok(result) } + /// Fetch content via Jina Reader API — returns clean Markdown (DeerFlow pattern) + async fn fetch_via_jina(&self, url: &str) -> Result { + let client = reqwest::Client::builder() + .timeout(std::time::Duration::from_secs(20)) + .build() + .unwrap_or_else(|_| reqwest::Client::new()); + + let mut builder = client + .post("https://r.jina.ai/") + .header("Content-Type", "application/json") + .header("X-Return-Format", "markdown") + .header("X-Timeout", "15") + .json(&serde_json::json!({ "url": url })); + + // Optional API key for higher rate limits + if let Some(ref key) = self.search_config.jina_api_key { + builder = builder.header("Authorization", format!("Bearer {}", key)); + } + + let response = builder.send().await + .map_err(|e| zclaw_types::ZclawError::HandError( + format!("Jina Reader request failed: {}", e) + ))?; + + let status = response.status(); + if !status.is_success() { + return Err(zclaw_types::ZclawError::HandError( + format!("Jina Reader returned HTTP {}", status) + )); + } + + let text = response.text().await + .map_err(|e| zclaw_types::ZclawError::HandError( + format!("Failed to read Jina response: {}", e) + ))?; + + if text.trim().is_empty() { + return Err(zclaw_types::ZclawError::HandError( + "Jina Reader returned empty response".to_string() + )); + } + + // Truncate to 4096 chars (DeerFlow pattern) + let truncated: String = text.chars().take(4096).collect(); + Ok(truncated) + } + + /// Direct HTTP fetch with HTML text extraction (fallback when Jina unavailable) + async fn fetch_direct(&self, url: &str) -> Result { + let response = self.client + .get(url) + .send() + .await + .map_err(|e| zclaw_types::ZclawError::HandError(format!("Direct fetch failed: {}", e)))?; + + let content_type = response.headers() + .get(reqwest::header::CONTENT_TYPE) + .and_then(|v| v.to_str().ok()) + .unwrap_or(""); + + let content = if content_type.contains("text/html") { + let html = response.text().await + .map_err(|e| zclaw_types::ZclawError::HandError(format!("Failed to read HTML: {}", e)))?; + self.extract_text_from_html(&html) + } else if content_type.contains("text/") || content_type.contains("application/json") { + response.text().await + .map_err(|e| zclaw_types::ZclawError::HandError(format!("Failed to read text: {}", e)))? + } else { + "[Binary content]".to_string() + }; + + Ok(content) + } + /// Extract readable text from HTML fn extract_text_from_html(&self, html: &str) -> String { let html_lower = html.to_lowercase();