diff --git a/crates/zclaw-hands/src/hands/researcher.rs b/crates/zclaw-hands/src/hands/researcher.rs index 981d643..41bf3a2 100644 --- a/crates/zclaw-hands/src/hands/researcher.rs +++ b/crates/zclaw-hands/src/hands/researcher.rs @@ -285,24 +285,23 @@ impl ResearcherHand { }; let results = match engine { - SearchEngine::SearXNG | SearchEngine::Auto => { + SearchEngine::SearXNG => { match self.search_searxng(&query.query, query.max_results).await { Ok(r) if !r.is_empty() => r, - _ => { - tracing::warn!(target: "researcher", "SearXNG failed or empty, falling back to DuckDuckGo"); - self.search_duckduckgo(&query.query, query.max_results).await? - } + _ => self.search_native(&query.query, query.max_results).await?, } } + SearchEngine::Auto => { + self.search_native(&query.query, query.max_results).await? + } SearchEngine::DuckDuckGo => { - self.search_duckduckgo(&query.query, query.max_results).await? + self.search_duckduckgo_html(&query.query, query.max_results).await? } - SearchEngine::Google | SearchEngine::Bing => { - // Google/Bing not yet implemented, fall back to SearXNG which aggregates them - match self.search_searxng(&query.query, query.max_results).await { - Ok(r) if !r.is_empty() => r, - _ => self.search_duckduckgo(&query.query, query.max_results).await?, - } + SearchEngine::Google => { + self.search_bing(&query.query, query.max_results).await? + } + SearchEngine::Bing => { + self.search_bing(&query.query, query.max_results).await? } }; @@ -319,6 +318,67 @@ impl ResearcherHand { Ok(results) } + /// Rust-native multi-engine search with Chinese auto-detection + async fn search_native(&self, query: &str, max_results: usize) -> Result> { + let has_cjk = query.chars().any(|c| is_cjk_char(c)); + + // Strategy: try multiple engines in parallel, merge results + let mut all_results = Vec::new(); + + if has_cjk { + // Chinese query: Bing CN + Baidu + DuckDuckGo in parallel + let bing_fut = self.search_bing(query, max_results); + let baidu_fut = self.search_baidu(query, max_results); + let ddg_fut = self.search_duckduckgo_html(query, max_results); + + let (bing_res, baidu_res, ddg_res) = tokio::join!( + async { bing_fut.await }, + async { baidu_fut.await }, + async { ddg_fut.await }, + ); + + if let Ok(r) = bing_res { + all_results.extend(r); + } + if let Ok(r) = baidu_res { + all_results.extend(r); + } + if let Ok(r) = ddg_res { + all_results.extend(r); + } + } else { + // English query: DuckDuckGo HTML first, then Bing + let ddg_fut = self.search_duckduckgo_html(query, max_results); + let bing_fut = self.search_bing(query, max_results); + + let (ddg_res, bing_res) = tokio::join!( + async { ddg_fut.await }, + async { bing_fut.await }, + ); + + if let Ok(r) = ddg_res { + all_results.extend(r); + } + if let Ok(r) = bing_res { + all_results.extend(r); + } + } + + // Deduplicate by URL + let mut seen_urls = std::collections::HashSet::new(); + all_results.retain(|r| seen_urls.insert(r.url.to_lowercase())); + + // Sort by relevance descending, take top N + all_results.sort_by(|a, b| b.relevance.cmp(&a.relevance)); + all_results.truncate(max_results); + + if all_results.is_empty() { + tracing::warn!(target: "researcher", "All native engines returned empty for query: {}", query); + } + + Ok(all_results) + } + /// Search using SearXNG meta-search engine (aggregates 70+ engines) async fn search_searxng(&self, query: &str, max_results: usize) -> Result> { let url = format!( @@ -405,70 +465,225 @@ impl ResearcherHand { Ok(results) } - /// Search using DuckDuckGo (no API key required) - async fn search_duckduckgo(&self, query: &str, max_results: usize) -> Result> { - let url = format!("https://api.duckduckgo.com/?q={}&format=json&no_html=1", - url_encode(query)); + /// Search using DuckDuckGo HTML (real search results, not Instant Answer API) + async fn search_duckduckgo_html(&self, query: &str, max_results: usize) -> Result> { + let url = format!( + "https://html.duckduckgo.com/html/?q={}", + url_encode(query) + ); let response = self.client .get(&url) + .header("Accept", "text/html") .send() .await - .map_err(|e| zclaw_types::ZclawError::HandError(format!("Search request failed: {}", e)))?; + .map_err(|e| zclaw_types::ZclawError::HandError( + format!("DuckDuckGo HTML search failed: {}", e) + ))?; - let json: Value = response.json().await - .map_err(|e| zclaw_types::ZclawError::HandError(format!("Failed to parse search response: {}", e)))?; + let html = response.text().await + .map_err(|e| zclaw_types::ZclawError::HandError( + format!("Failed to read DuckDuckGo response: {}", e) + ))?; + Ok(self.parse_ddg_html(&html, max_results)) + } + + /// Parse DuckDuckGo HTML search results page + fn parse_ddg_html(&self, html: &str, max_results: usize) -> Vec { let mut results = Vec::new(); - // Parse DuckDuckGo Instant Answer - if let Some(abstract_text) = json.get("AbstractText").and_then(|v| v.as_str()) { - if !abstract_text.is_empty() { - results.push(SearchResult { - title: query.to_string(), - url: json.get("AbstractURL") - .and_then(|v| v.as_str()) - .unwrap_or("") - .to_string(), - snippet: abstract_text.to_string(), - source: json.get("AbstractSource") - .and_then(|v| v.as_str()) - .unwrap_or("DuckDuckGo") - .to_string(), - relevance: 100, - content: None, - fetched_at: Some(chrono::Utc::now().to_rfc3339()), - }); + for block in html.split("result__body") { + if results.len() >= max_results { + break; } + + // Find the result title link: Title + let title_link = match extract_between(block, "result__a", "") { + Some(s) => s, + None => continue, + }; + // title_link is like: href="//duckduckgo.com/l/?uddg=...">Title Text + let title = title_link.rsplit('>').next() + .map(|s| strip_html_tags(s).trim().to_string()) + .unwrap_or_default(); + + let url = extract_href_uddg(block).unwrap_or_default(); + + let snippet = extract_between(block, "result__snippet", "") + .map(|s| { + s.rsplit('>').next() + .map(|t| strip_html_tags(t).trim().to_string()) + .unwrap_or_default() + }) + .unwrap_or_default(); + + if title.is_empty() || url.is_empty() { + continue; + } + + results.push(SearchResult { + title, + url, + snippet, + source: "DuckDuckGo".to_string(), + relevance: 70, + content: None, + fetched_at: Some(chrono::Utc::now().to_rfc3339()), + }); } - // Parse related topics - if let Some(related) = json.get("RelatedTopics").and_then(|v| v.as_array()) { - for item in related.iter().take(max_results) { - if let Some(obj) = item.as_object() { - results.push(SearchResult { - title: obj.get("Text") - .and_then(|v| v.as_str()) - .unwrap_or("Related Topic") - .to_string(), - url: obj.get("FirstURL") - .and_then(|v| v.as_str()) - .unwrap_or("") - .to_string(), - snippet: obj.get("Text") - .and_then(|v| v.as_str()) - .unwrap_or("") - .to_string(), - source: "DuckDuckGo".to_string(), - relevance: 80, - content: None, - fetched_at: Some(chrono::Utc::now().to_rfc3339()), - }); - } + results + } + + /// Search using Bing (works well for both Chinese and English) + async fn search_bing(&self, query: &str, max_results: usize) -> Result> { + let has_cjk = query.chars().any(|c| is_cjk_char(c)); + let url = if has_cjk { + format!( + "https://cn.bing.com/search?q={}&count={}&setlang=zh-Hans", + url_encode(query), + max_results + ) + } else { + format!( + "https://www.bing.com/search?q={}&count={}", + url_encode(query), + max_results + ) + }; + + let response = self.client + .get(&url) + .header("Accept", "text/html,application/xhtml+xml") + .header("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8") + .send() + .await + .map_err(|e| zclaw_types::ZclawError::HandError( + format!("Bing search failed: {}", e) + ))?; + + let html = response.text().await + .map_err(|e| zclaw_types::ZclawError::HandError( + format!("Failed to read Bing response: {}", e) + ))?; + + Ok(self.parse_bing_html(&html, max_results)) + } + + /// Parse Bing HTML search results page + fn parse_bing_html(&self, html: &str, max_results: usize) -> Vec { + let mut results = Vec::new(); + + // Bing results are in
  • + for block in html.split("class=\"b_algo\"") { + if results.len() >= max_results { + break; } + + // Extract title from first inside the block + let title = extract_between(block, ">", "") + .map(|s| strip_html_tags(s).trim().to_string()) + .unwrap_or_default(); + + // Extract URL from href attribute of first + let url = extract_href(block).unwrap_or_default(); + + // Extract snippet from

    ...

    or

    + let snippet = extract_between(block, "

    ", "

    ") + .or_else(|| extract_between(block, "b_caption", "
    ")) + .map(|s| strip_html_tags(s).trim().to_string()) + .unwrap_or_default(); + + if title.is_empty() || url.is_empty() { + continue; + } + + // Skip Bing internal URLs + if url.contains("bing.com/search") || url.contains("go.microsoft.com") { + continue; + } + + results.push(SearchResult { + title, + url, + snippet, + source: "Bing".to_string(), + relevance: 75, + content: None, + fetched_at: Some(chrono::Utc::now().to_rfc3339()), + }); } - Ok(results) + results + } + + /// Search using Baidu (essential for Chinese content) + async fn search_baidu(&self, query: &str, max_results: usize) -> Result> { + let url = format!( + "https://www.baidu.com/s?wd={}&rn={}", + url_encode(query), + max_results + ); + + let response = self.client + .get(&url) + .header("Accept", "text/html,application/xhtml+xml") + .header("Accept-Language", "zh-CN,zh;q=0.9") + .send() + .await + .map_err(|e| zclaw_types::ZclawError::HandError( + format!("Baidu search failed: {}", e) + ))?; + + let html = response.text().await + .map_err(|e| zclaw_types::ZclawError::HandError( + format!("Failed to read Baidu response: {}", e) + ))?; + + Ok(self.parse_baidu_html(&html, max_results)) + } + + /// Parse Baidu HTML search results page + fn parse_baidu_html(&self, html: &str, max_results: usize) -> Vec { + let mut results = Vec::new(); + + for block in html.split("class=\"result c-container\"") { + if results.len() >= max_results { + break; + } + + if !block.contains("href=\"http") { + continue; + } + + let title = extract_between(block, ">", "
    ") + .map(|s| strip_html_tags(s).trim().to_string()) + .unwrap_or_default(); + + let url = extract_href(block).unwrap_or_default(); + + let snippet = extract_between(block, "c-abstract", "") + .or_else(|| extract_between(block, "content-right_", "")) + .map(|s| strip_html_tags(s).trim().to_string()) + .unwrap_or_default(); + + if title.is_empty() || url.is_empty() { + continue; + } + + results.push(SearchResult { + title, + url, + snippet, + source: "Baidu".to_string(), + relevance: 80, + content: None, + fetched_at: Some(chrono::Utc::now().to_rfc3339()), + }); + } + + results } /// Fetch content from a URL @@ -765,6 +980,88 @@ fn url_encode(s: &str) -> String { .collect() } +/// Check if a character is CJK (Chinese/Japanese/Korean) +fn is_cjk_char(c: char) -> bool { + matches!(c, + '\u{4E00}'..='\u{9FFF}' | // CJK Unified Ideographs + '\u{3400}'..='\u{4DBF}' | // CJK Unified Ideographs Extension A + '\u{3000}'..='\u{303F}' | // CJK Symbols and Punctuation + '\u{FF00}'..='\u{FFEF}' | // Fullwidth Forms + '\u{2E80}'..='\u{2EFF}' | // CJK Radicals Supplement + '\u{F900}'..='\u{FAFF}' // CJK Compatibility Ideographs + ) +} + +/// Extract text between two delimiters +fn extract_between<'a>(text: &'a str, start: &str, end: &str) -> Option<&'a str> { + let start_idx = text.find(start)?; + let rest = &text[start_idx + start.len()..]; + let end_idx = rest.find(end)?; + Some(&rest[..end_idx]) +} + +/// Strip HTML tags from a string +fn strip_html_tags(s: &str) -> String { + let mut result = String::with_capacity(s.len()); + let mut in_tag = false; + for c in s.chars() { + match c { + '<' => in_tag = true, + '>' => in_tag = false, + _ if !in_tag => result.push(c), + _ => {} + } + } + + // Decode common HTML entities + result = result.replace("&", "&") + .replace("<", "<") + .replace(">", ">") + .replace(""", "\"") + .replace("'", "'") + .replace(" ", " "); + + result +} + +/// Extract href URL from the first tag in text +fn extract_href(text: &str) -> Option { + let href_start = text.find("href=\"")?; + let rest = &text[href_start + 6..]; + let end = rest.find('"')?; + let url = &rest[..end]; + + if url.starts_with("http") { + Some(url.to_string()) + } else if url.starts_with("//") { + Some(format!("https:{}", url)) + } else { + None + } +} + +/// Extract the real URL from DDG's redirect link (uddg= parameter) +fn extract_href_uddg(text: &str) -> Option { + // DDG HTML uses: href="//duckduckgo.com/l/?uddg=ENCODED_URL&..." + if let Some(idx) = text.find("uddg=") { + let rest = &text[idx + 5..]; + let url_encoded = rest.split('&').next().unwrap_or(""); + let decoded = url_encoded.replace("%3A", ":") + .replace("%2F", "/") + .replace("%3F", "?") + .replace("%3D", "=") + .replace("%26", "&") + .replace("%20", " ") + .replace("%25", "%"); + if decoded.starts_with("http") { + return Some(decoded); + } + } + + // Fallback: try regular href extraction + extract_href(text) +} + #[cfg(test)] mod tests { use super::*; @@ -1164,8 +1461,181 @@ mod tests { assert!(url.starts_with("http://localhost:8888/search?")); assert!(url.contains("format=json")); assert!(url.contains("categories=general")); - // Verify UTF-8 encoding, not Unicode codepoints assert!(url.contains("%E4%B8%AD")); // 中 = E4 B8 AD assert!(!url.contains("%4E2D")); // NOT Unicode codepoint } + + // --- Native Search Helper Tests --- + + #[test] + fn test_is_cjk_char_chinese() { + assert!(is_cjk_char('中')); + assert!(is_cjk_char('医')); + assert!(is_cjk_char('。')); + assert!(!is_cjk_char('a')); + assert!(!is_cjk_char('1')); + assert!(!is_cjk_char(' ')); + } + + #[test] + fn test_is_cjk_char_detects_chinese_query() { + let query = "2024年中国医疗政策"; + assert!(query.chars().any(|c| is_cjk_char(c))); + + let query_en = "Rust programming language"; + assert!(!query_en.chars().any(|c| is_cjk_char(c))); + } + + #[test] + fn test_strip_html_tags() { + assert_eq!(strip_html_tags("Hello"), "Hello"); + assert_eq!(strip_html_tags("Link"), "Link"); + assert_eq!(strip_html_tags("plain text"), "plain text"); + assert_eq!(strip_html_tags("&<>"), "&<>"); + // strip_html_tags only removes tags, not script content + assert_eq!(strip_html_tags("Safe"), "alert()Safe"); + } + + #[test] + fn test_extract_between_basic() { + let text = "prefix
    content
    suffix"; + assert_eq!(extract_between(text, "
    ", "
    "), Some("content")); + } + + #[test] + fn test_extract_between_not_found() { + let text = "no delimiters here"; + assert_eq!(extract_between(text, "
    ", "
    "), None); + } + + #[test] + fn test_extract_href() { + let text = r#"Title"#; + assert_eq!(extract_href(text), Some("https://example.com/page".to_string())); + } + + #[test] + fn test_extract_href_protocol_relative() { + let text = r#"Title"#; + assert_eq!(extract_href(text), Some("https://example.com/page".to_string())); + } + + #[test] + fn test_extract_href_uddg() { + let text = r#"href="//duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.com%2Fpage&rut=abc""#; + assert_eq!(extract_href_uddg(text), Some("https://example.com/page".to_string())); + } + + #[test] + fn test_extract_href_uddg_fallback() { + let text = r#"Title"#; + assert_eq!(extract_href_uddg(text), Some("https://example.com/direct".to_string())); + } + + // --- HTML Parser Tests --- + + #[test] + fn test_parse_ddg_html() { + let hand = create_test_hand(); + let html = r#" + + + "#; + + let results = hand.parse_ddg_html(html, 10); + assert_eq!(results.len(), 2); + assert_eq!(results[0].title, "Rust Programming Language"); + assert_eq!(results[0].url, "https://rust-lang.org"); + assert_eq!(results[0].source, "DuckDuckGo"); + assert_eq!(results[1].title, "The Rust Book"); + } + + #[test] + fn test_parse_ddg_html_max_results() { + let hand = create_test_hand(); + let mut html = String::new(); + for i in 0..20 { + html.push_str(&format!( + r#""#, + i, i, i + )); + } + let results = hand.parse_ddg_html(&html, 5); + assert_eq!(results.len(), 5); + } + + #[test] + fn test_parse_ddg_html_empty() { + let hand = create_test_hand(); + let html = "No results here"; + let results = hand.parse_ddg_html(html, 10); + assert!(results.is_empty()); + } + + #[test] + fn test_parse_bing_html() { + let hand = create_test_hand(); + let html = r#" +
  • +

    Example Result 1

    +

    This is the first result snippet.

    +
  • +
  • +

    Example Result 2

    +

    This is the second result snippet.

    +
  • + "#; + + let results = hand.parse_bing_html(html, 10); + assert_eq!(results.len(), 2); + assert_eq!(results[0].title, "Example Result 1"); + assert_eq!(results[0].url, "https://example.com/result1"); + assert_eq!(results[0].source, "Bing"); + } + + #[test] + fn test_parse_bing_html_skips_internal_urls() { + let hand = create_test_hand(); + let html = r#" +
  • +

    More Results

    +
  • +
  • +

    Real Result

    +
  • + "#; + + let results = hand.parse_bing_html(html, 10); + assert_eq!(results.len(), 1); + assert_eq!(results[0].url, "https://example.com/real"); + } + + #[test] + fn test_parse_bing_html_empty() { + let hand = create_test_hand(); + let html = "Nothing here"; + let results = hand.parse_bing_html(html, 10); + assert!(results.is_empty()); + } + + #[test] + fn test_parse_baidu_html() { + let hand = create_test_hand(); + let html = r#" +
    +

    中国医疗政策 2024

    +
    这是关于医疗政策的摘要信息。
    +
    + "#; + + let results = hand.parse_baidu_html(html, 10); + assert_eq!(results.len(), 1); + assert_eq!(results[0].source, "Baidu"); + } }