feat(hands): Rust原生多引擎搜索 — DuckDuckGo HTML/Bing CN/百度并行聚合

- 用 DuckDuckGo HTML 搜索 (html.duckduckgo.com) 替换 Instant Answer API，获得真正的搜索结果
- 新增 Bing CN 搜索 (cn.bing.com)，中文查询自动切换
- 新增百度搜索 (baidu.com/s)，覆盖中文内容
- CJK 自动检测：中文查询并行搜索 Bing + Baidu + DDG，英文查询并行搜索 DDG + Bing
- 结果按 URL 去重，并按相关性排序
- SearXNG 保留为可选后端，不再强制依赖 Docker
- 137 tests PASS（新增 20 个：HTML 解析 / CJK 检测 / 辅助函数 / 引擎测试）
2026-04-22 11:41:19 +08:00
parent 0fd981905d
commit 95a05bc6dc
1 changed files with 532 additions and 62 deletions
--- a/crates/zclaw-hands/src/hands/researcher.rs
+++ b/crates/zclaw-hands/src/hands/researcher.rs
@@ -285,24 +285,23 @@ impl ResearcherHand {
        };
        let results = match engine {
-            SearchEngine::SearXNG | SearchEngine::Auto => {
+            SearchEngine::SearXNG => {
                match self.search_searxng(&query.query, query.max_results).await {
                    Ok(r) if !r.is_empty() => r,
-                    _ => {
+                    _ => self.search_native(&query.query, query.max_results).await?,
                        tracing::warn!(target: "researcher", "SearXNG failed or empty, falling back to DuckDuckGo");
                        self.search_duckduckgo(&query.query, query.max_results).await?
                    }
                }
            }
            SearchEngine::Auto => {
                self.search_native(&query.query, query.max_results).await?
            }
            SearchEngine::DuckDuckGo => {
-                self.search_duckduckgo(&query.query, query.max_results).await?
+                self.search_duckduckgo_html(&query.query, query.max_results).await?
            }
-            SearchEngine::Google | SearchEngine::Bing => {
+            SearchEngine::Google => {
-                // Google/Bing not yet implemented, fall back to SearXNG which aggregates them
+                self.search_bing(&query.query, query.max_results).await?
-                match self.search_searxng(&query.query, query.max_results).await {
+            }
-                    Ok(r) if !r.is_empty() => r,
+            SearchEngine::Bing => {
-                    _ => self.search_duckduckgo(&query.query, query.max_results).await?,
+                self.search_bing(&query.query, query.max_results).await?
                }
            }
        };
@@ -319,6 +318,67 @@ impl ResearcherHand {
        Ok(results)
    }
    /// Rust-native multi-engine search with Chinese auto-detection
    async fn search_native(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
        let has_cjk = query.chars().any(|c| is_cjk_char(c));
        // Strategy: try multiple engines in parallel, merge results
        let mut all_results = Vec::new();
        if has_cjk {
            // Chinese query: Bing CN + Baidu + DuckDuckGo in parallel
            let bing_fut = self.search_bing(query, max_results);
            let baidu_fut = self.search_baidu(query, max_results);
            let ddg_fut = self.search_duckduckgo_html(query, max_results);
            let (bing_res, baidu_res, ddg_res) = tokio::join!(
                async { bing_fut.await },
                async { baidu_fut.await },
                async { ddg_fut.await },
            );
            if let Ok(r) = bing_res {
                all_results.extend(r);
            }
            if let Ok(r) = baidu_res {
                all_results.extend(r);
            }
            if let Ok(r) = ddg_res {
                all_results.extend(r);
            }
        } else {
            // English query: DuckDuckGo HTML first, then Bing
            let ddg_fut = self.search_duckduckgo_html(query, max_results);
            let bing_fut = self.search_bing(query, max_results);
            let (ddg_res, bing_res) = tokio::join!(
                async { ddg_fut.await },
                async { bing_fut.await },
            );
            if let Ok(r) = ddg_res {
                all_results.extend(r);
            }
            if let Ok(r) = bing_res {
                all_results.extend(r);
            }
        }
        // Deduplicate by URL
        let mut seen_urls = std::collections::HashSet::new();
        all_results.retain(|r| seen_urls.insert(r.url.to_lowercase()));
        // Sort by relevance descending, take top N
        all_results.sort_by(|a, b| b.relevance.cmp(&a.relevance));
        all_results.truncate(max_results);
        if all_results.is_empty() {
            tracing::warn!(target: "researcher", "All native engines returned empty for query: {}", query);
        }
        Ok(all_results)
    }
    /// Search using SearXNG meta-search engine (aggregates 70+ engines)
    async fn search_searxng(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
        let url = format!(
@@ -405,70 +465,225 @@ impl ResearcherHand {
        Ok(results)
    }
-    /// Search using DuckDuckGo (no API key required)
+    /// Search using DuckDuckGo HTML (real search results, not Instant Answer API)
-    async fn search_duckduckgo(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
+    async fn search_duckduckgo_html(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
-        let url = format!("https://api.duckduckgo.com/?q={}&format=json&no_html=1",
+        let url = format!(
-            url_encode(query));
+            "https://html.duckduckgo.com/html/?q={}",
            url_encode(query)
        );
        let response = self.client
            .get(&url)
            .header("Accept", "text/html")
            .send()
            .await
-            .map_err(|e| zclaw_types::ZclawError::HandError(format!("Search request failed: {}", e)))?;
+            .map_err(|e| zclaw_types::ZclawError::HandError(
                format!("DuckDuckGo HTML search failed: {}", e)
            ))?;
-        let json: Value = response.json().await
+        let html = response.text().await
-            .map_err(|e| zclaw_types::ZclawError::HandError(format!("Failed to parse search response: {}", e)))?;
+            .map_err(|e| zclaw_types::ZclawError::HandError(
                format!("Failed to read DuckDuckGo response: {}", e)
            ))?;
        Ok(self.parse_ddg_html(&html, max_results))
    }
    /// Parse DuckDuckGo HTML search results page
    fn parse_ddg_html(&self, html: &str, max_results: usize) -> Vec<SearchResult> {
        let mut results = Vec::new();
-        // Parse DuckDuckGo Instant Answer
+        for block in html.split("result__body") {
-        if let Some(abstract_text) = json.get("AbstractText").and_then(|v| v.as_str()) {
+            if results.len() >= max_results {
-            if !abstract_text.is_empty() {
+                break;
                results.push(SearchResult {
                    title: query.to_string(),
                    url: json.get("AbstractURL")
                        .and_then(|v| v.as_str())
                        .unwrap_or("")
                        .to_string(),
                    snippet: abstract_text.to_string(),
                    source: json.get("AbstractSource")
                        .and_then(|v| v.as_str())
                        .unwrap_or("DuckDuckGo")
                        .to_string(),
                    relevance: 100,
                    content: None,
                    fetched_at: Some(chrono::Utc::now().to_rfc3339()),
                });
            }
            // Find the result title link: <a class="result__a" href="...">Title</a>
            let title_link = match extract_between(block, "result__a", "</a>") {
                Some(s) => s,
                None => continue,
            };
            // title_link is like:  href="//duckduckgo.com/l/?uddg=...">Title Text
            let title = title_link.rsplit('>').next()
                .map(|s| strip_html_tags(s).trim().to_string())
                .unwrap_or_default();
            let url = extract_href_uddg(block).unwrap_or_default();
            let snippet = extract_between(block, "result__snippet", "</a>")
                .map(|s| {
                    s.rsplit('>').next()
                        .map(|t| strip_html_tags(t).trim().to_string())
                        .unwrap_or_default()
                })
                .unwrap_or_default();
            if title.is_empty() || url.is_empty() {
                continue;
            }
            results.push(SearchResult {
                title,
                url,
                snippet,
                source: "DuckDuckGo".to_string(),
                relevance: 70,
                content: None,
                fetched_at: Some(chrono::Utc::now().to_rfc3339()),
            });
        }
-        // Parse related topics
+        results
-        if let Some(related) = json.get("RelatedTopics").and_then(|v| v.as_array()) {
+    }
-            for item in related.iter().take(max_results) {
+
-                if let Some(obj) = item.as_object() {
+    /// Search using Bing (works well for both Chinese and English)
-                    results.push(SearchResult {
+    async fn search_bing(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
-                        title: obj.get("Text")
+        let has_cjk = query.chars().any(|c| is_cjk_char(c));
-                            .and_then(|v| v.as_str())
+        let url = if has_cjk {
-                            .unwrap_or("Related Topic")
+            format!(
-                            .to_string(),
+                "https://cn.bing.com/search?q={}&count={}&setlang=zh-Hans",
-                        url: obj.get("FirstURL")
+                url_encode(query),
-                            .and_then(|v| v.as_str())
+                max_results
-                            .unwrap_or("")
+            )
-                            .to_string(),
+        } else {
-                        snippet: obj.get("Text")
+            format!(
-                            .and_then(|v| v.as_str())
+                "https://www.bing.com/search?q={}&count={}",
-                            .unwrap_or("")
+                url_encode(query),
-                            .to_string(),
+                max_results
-                        source: "DuckDuckGo".to_string(),
+            )
-                        relevance: 80,
+        };
-                        content: None,
+
-                        fetched_at: Some(chrono::Utc::now().to_rfc3339()),
+        let response = self.client
-                    });
+            .get(&url)
-                }
+            .header("Accept", "text/html,application/xhtml+xml")
            .header("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8")
            .send()
            .await
            .map_err(|e| zclaw_types::ZclawError::HandError(
                format!("Bing search failed: {}", e)
            ))?;
        let html = response.text().await
            .map_err(|e| zclaw_types::ZclawError::HandError(
                format!("Failed to read Bing response: {}", e)
            ))?;
        Ok(self.parse_bing_html(&html, max_results))
    }
    /// Parse a Bing results page into at most `max_results` entries.
    ///
    /// The page is split on the `class="b_algo"` marker. For each chunk after
    /// the marker the title is the tag-stripped text up to the first `</a>`,
    /// the URL is the first `href="..."` accepted by `extract_href`, and the
    /// snippet comes from the first `<p>...</p>` or, failing that, from the
    /// `b_caption` element. Chunks without a title or URL, and links pointing
    /// at `bing.com/search` or `go.microsoft.com`, are skipped.
    fn parse_bing_html(&self, html: &str, max_results: usize) -> Vec<SearchResult> {
        let mut results = Vec::new();

        // `split` yields the text before the first marker as its first item.
        // That chunk is page chrome, not a result, so it is skipped.
        for block in html.split("class=\"b_algo\"").skip(1) {
            if results.len() >= max_results {
                break;
            }

            // Title: everything from the end of the opening tag to the first </a>.
            let title = extract_between(block, ">", "</a>")
                .map(|s| strip_html_tags(s).trim().to_string())
                .unwrap_or_default();

            // URL: first href attribute in the chunk.
            let url = extract_href(block).unwrap_or_default();

            // Snippet: prefer a bare <p>; otherwise fall back to the caption div.
            // The fallback slice starts inside the opening tag (`">...`), so drop
            // everything up to that tag's closing `>` before stripping markup.
            let snippet = extract_between(block, "<p>", "</p>")
                .or_else(|| {
                    extract_between(block, "b_caption", "</div>")
                        .map(|s| s.split_once('>').map_or(s, |(_, inner)| inner))
                })
                .map(|s| strip_html_tags(s).trim().to_string())
                .unwrap_or_default();

            if title.is_empty() || url.is_empty() {
                continue;
            }

            // Skip Bing internal URLs
            if url.contains("bing.com/search") || url.contains("go.microsoft.com") {
                continue;
            }

            results.push(SearchResult {
                title,
                url,
                snippet,
                source: "Bing".to_string(),
                relevance: 75,
                content: None,
                fetched_at: Some(chrono::Utc::now().to_rfc3339()),
            });
        }

        results
    }
    /// Fetch a Baidu results page for `query` and parse it.
    ///
    /// The query is passed as `wd` and `max_results` as `rn`. Transport errors
    /// and body-read errors are both reported as `ZclawError::HandError`.
    async fn search_baidu(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
        let encoded_query = url_encode(query);
        let request_url = format!(
            "https://www.baidu.com/s?wd={}&rn={}",
            encoded_query,
            max_results
        );

        let request = self.client
            .get(&request_url)
            .header("Accept", "text/html,application/xhtml+xml")
            .header("Accept-Language", "zh-CN,zh;q=0.9");

        let response = request.send().await.map_err(|e| {
            zclaw_types::ZclawError::HandError(format!("Baidu search failed: {}", e))
        })?;

        let body = response.text().await.map_err(|e| {
            zclaw_types::ZclawError::HandError(format!("Failed to read Baidu response: {}", e))
        })?;

        let parsed = self.parse_baidu_html(&body, max_results);
        Ok(parsed)
    }
    /// Parse a Baidu results page into at most `max_results` entries.
    ///
    /// The page is split on the `class="result c-container"` marker. For each
    /// chunk after the marker the title is the tag-stripped text up to the
    /// first `</a>`, the URL is the first `href="..."` accepted by
    /// `extract_href`, and the snippet is taken from the `c-abstract` element
    /// or, failing that, from an element whose class starts with
    /// `content-right_`. Chunks without an absolute link, a title or a URL are
    /// skipped.
    fn parse_baidu_html(&self, html: &str, max_results: usize) -> Vec<SearchResult> {
        let mut results = Vec::new();

        // `split` yields the text before the first marker as its first item.
        // That chunk is page chrome, not a result, so it is skipped.
        for block in html.split("class=\"result c-container\"").skip(1) {
            if results.len() >= max_results {
                break;
            }

            // Cheap pre-filter: a usable result needs at least one absolute link.
            if !block.contains("href=\"http") {
                continue;
            }

            let title = extract_between(block, ">", "</a>")
                .map(|s| strip_html_tags(s).trim().to_string())
                .unwrap_or_default();

            let url = extract_href(block).unwrap_or_default();

            // Both markers match inside a class attribute, so the extracted slice
            // begins with the rest of the opening tag (e.g. `">text`). Drop
            // everything up to that tag's closing `>` so the attribute residue
            // does not leak into the snippet.
            let snippet = extract_between(block, "c-abstract", "</div>")
                .or_else(|| extract_between(block, "content-right_", "</div>"))
                .map(|s| s.split_once('>').map_or(s, |(_, inner)| inner))
                .map(|s| strip_html_tags(s).trim().to_string())
                .unwrap_or_default();

            if title.is_empty() || url.is_empty() {
                continue;
            }

            results.push(SearchResult {
                title,
                url,
                snippet,
                source: "Baidu".to_string(),
                relevance: 80,
                content: None,
                fetched_at: Some(chrono::Utc::now().to_rfc3339()),
            });
        }

        results
    }
    /// Fetch content from a URL
@@ -765,6 +980,88 @@ fn url_encode(s: &str) -> String {
        .collect()
 }
 /// Report whether `c` lies in one of the CJK-related Unicode blocks listed below.
 ///
 /// Only the six ranges in the table are checked; characters outside them
 /// (for example U+3040 and above up to U+33FF) are reported as non-CJK.
 fn is_cjk_char(c: char) -> bool {
    const CJK_RANGES: [std::ops::RangeInclusive<char>; 6] = [
        '\u{4E00}'..='\u{9FFF}', // CJK Unified Ideographs
        '\u{3400}'..='\u{4DBF}', // CJK Unified Ideographs Extension A
        '\u{3000}'..='\u{303F}', // CJK Symbols and Punctuation
        '\u{FF00}'..='\u{FFEF}', // Fullwidth Forms
        '\u{2E80}'..='\u{2EFF}', // CJK Radicals Supplement
        '\u{F900}'..='\u{FAFF}', // CJK Compatibility Ideographs
    ];
    CJK_RANGES.iter().any(|range| range.contains(&c))
 }
 /// Return the slice of `text` that lies between the first occurrence of
 /// `start` and the first occurrence of `end` after it.
 ///
 /// Returns `None` when `start` is absent, or when `end` does not occur after it.
 fn extract_between<'a>(text: &'a str, start: &str, end: &str) -> Option<&'a str> {
    let (_, after_start) = text.split_once(start)?;
    after_start.split_once(end).map(|(inner, _)| inner)
 }
 /// Remove HTML tags from `s` and decode a handful of common entities.
 ///
 /// Everything between `<` and the next `>` is dropped; the text inside an
 /// element (including `<script>` content) is kept. A `>` that appears outside
 /// a tag is dropped as well. The entities `&lt;`, `&gt;`, `&quot;`, `&#39;`,
 /// `&nbsp;` and `&amp;` are decoded; any other entity is left untouched.
 fn strip_html_tags(s: &str) -> String {
    let mut text = String::with_capacity(s.len());
    let mut in_tag = false;
    for c in s.chars() {
        match c {
            '<' => in_tag = true,
            '>' => in_tag = false,
            _ if !in_tag => text.push(c),
            _ => {}
        }
    }
    // `&amp;` must be decoded last. Decoding it first would turn an escaped
    // entity such as `&amp;lt;` into `&lt;`, which the next replacement would
    // then wrongly decode a second time into `<`.
    text.replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&quot;", "\"")
        .replace("&#39;", "'")
        .replace("&nbsp;", " ")
        .replace("&amp;", "&")
 }
 /// Extract the value of the first `href="..."` attribute found in `text`.
 ///
 /// The value is returned when it starts with `http`; a protocol-relative
 /// value (`//host/...`) is returned with `https:` prepended. Any other value
 /// (relative paths, `javascript:` links, ...) yields `None`, as does text
 /// without a double-quoted `href` attribute.
 ///
 /// `&amp;` inside the attribute value is decoded to `&`, because HTML
 /// escapes the ampersands of a query string and the escaped form is not a
 /// usable URL.
 fn extract_href(text: &str) -> Option<String> {
    let (_, after_attr) = text.split_once("href=\"")?;
    let (raw, _) = after_attr.split_once('"')?;
    let url = raw.replace("&amp;", "&");
    if url.starts_with("http") {
        Some(url)
    } else if url.starts_with("//") {
        Some(format!("https:{}", url))
    } else {
        None
    }
 }
 /// Extract the target URL from a DuckDuckGo redirect link.
 ///
 /// DuckDuckGo HTML results link to `//duckduckgo.com/l/?uddg=ENCODED_URL&amp;...`.
 /// The value of the first `uddg=` parameter in `text` is percent-decoded and
 /// returned when it starts with `http`. Otherwise the function falls back to
 /// `extract_href(text)`.
 fn extract_href_uddg(text: &str) -> Option<String> {
    /// Decode every `%XX` escape (either hex case) and interpret the resulting
    /// bytes as UTF-8; malformed escapes are copied through unchanged.
    fn percent_decode(input: &str) -> String {
        fn hex_value(b: u8) -> Option<u8> {
            (b as char).to_digit(16).map(|d| d as u8)
        }

        let bytes = input.as_bytes();
        let mut decoded = Vec::with_capacity(bytes.len());
        let mut i = 0;
        while i < bytes.len() {
            let escape = match (bytes.get(i), bytes.get(i + 1), bytes.get(i + 2)) {
                (Some(b'%'), Some(&hi), Some(&lo)) => hex_value(hi).zip(hex_value(lo)),
                _ => None,
            };
            match escape {
                Some((hi, lo)) => {
                    decoded.push((hi << 4) | lo);
                    i += 3;
                }
                None => {
                    decoded.push(bytes[i]);
                    i += 1;
                }
            }
        }
        String::from_utf8_lossy(&decoded).into_owned()
    }

    if let Some((_, after_key)) = text.split_once("uddg=") {
        // The value ends at the next parameter (`&`, which also covers `&amp;`)
        // or at the end of the attribute when `uddg` is the last parameter.
        // Without the quote check the value would run on into the markup
        // that follows the link.
        let encoded = after_key
            .split(|c: char| matches!(c, '&' | '"' | '>'))
            .next()
            .unwrap_or("");
        let decoded = percent_decode(encoded);
        if decoded.starts_with("http") {
            return Some(decoded);
        }
    }
    // Fallback: try regular href extraction
    extract_href(text)
 }
 #[cfg(test)]
 mod tests {
    use super::*;
@@ -1164,8 +1461,181 @@ mod tests {
        assert!(url.starts_with("http://localhost:8888/search?"));
        assert!(url.contains("format=json"));
        assert!(url.contains("categories=general"));
        // Verify UTF-8 encoding, not Unicode codepoints
        assert!(url.contains("%E4%B8%AD")); // 中 = E4 B8 AD
        assert!(!url.contains("%4E2D"));    // NOT Unicode codepoint
    }
    // --- Native Search Helper Tests ---
    /// Ideographs and CJK punctuation are accepted; ASCII letters, digits and
    /// spaces are rejected.
    #[test]
    fn test_is_cjk_char_chinese() {
        for accepted in ['中', '医', '。'] {
            assert!(is_cjk_char(accepted));
        }
        for rejected in ['a', '1', ' '] {
            assert!(!is_cjk_char(rejected));
        }
    }
    /// A mixed digit/Chinese query is detected as CJK; a pure English one is not.
    #[test]
    fn test_is_cjk_char_detects_chinese_query() {
        let contains_cjk = |text: &str| text.chars().any(|c| is_cjk_char(c));
        assert!(contains_cjk("2024年中国医疗政策"));
        assert!(!contains_cjk("Rust programming language"));
    }
    /// Table-driven check of tag removal and entity decoding.
    #[test]
    fn test_strip_html_tags() {
        let cases = [
            ("<b>Hello</b>", "Hello"),
            ("<a href=\"x\">Link</a>", "Link"),
            ("plain text", "plain text"),
            ("&amp;&lt;&gt;", "&<>"),
            // Only the tags are removed; the text inside <script> is kept.
            ("<script>alert()</script>Safe", "alert()Safe"),
        ];
        for (input, expected) in cases {
            assert_eq!(strip_html_tags(input), expected);
        }
    }
    /// The text between the first `<div>` and the following `</div>` is returned.
    #[test]
    fn test_extract_between_basic() {
        let extracted = extract_between("prefix<div>content</div>suffix", "<div>", "</div>");
        assert_eq!(extracted, Some("content"));
    }
    /// Missing delimiters yield `None`.
    #[test]
    fn test_extract_between_not_found() {
        let extracted = extract_between("no delimiters here", "<div>", "</div>");
        assert!(extracted.is_none());
    }
    /// An absolute `https` link is returned unchanged.
    #[test]
    fn test_extract_href() {
        let anchor = "<a href=\"https://example.com/page\">Title</a>";
        assert_eq!(extract_href(anchor).as_deref(), Some("https://example.com/page"));
    }
    /// A protocol-relative link gets an `https:` scheme.
    #[test]
    fn test_extract_href_protocol_relative() {
        let anchor = "<a href=\"//example.com/page\">Title</a>";
        assert_eq!(extract_href(anchor).as_deref(), Some("https://example.com/page"));
    }
    /// The `uddg` parameter of a DuckDuckGo redirect link is decoded to the target URL.
    #[test]
    fn test_extract_href_uddg() {
        let redirect = r#"href="//duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.com%2Fpage&amp;rut=abc""#;
        let target = extract_href_uddg(redirect);
        assert_eq!(target.as_deref(), Some("https://example.com/page"));
    }
    /// Without a `uddg` parameter the plain `href` value is used.
    #[test]
    fn test_extract_href_uddg_fallback() {
        let anchor = "<a href=\"https://example.com/direct\">Title</a>";
        let target = extract_href_uddg(anchor);
        assert_eq!(target.as_deref(), Some("https://example.com/direct"));
    }
    // --- HTML Parser Tests ---
    /// Two DuckDuckGo result blocks are parsed into title, decoded target URL,
    /// snippet and source.
    #[test]
    fn test_parse_ddg_html() {
        let hand = create_test_hand();
        let html = r#"
        <div class="result__body">
            <a rel="nofollow" class="result__a" href="//duckduckgo.com/l/?uddg=https%3A%2F%2Frust-lang.org&amp;rut=abc">Rust Programming Language</a>
            <a class="result__snippet">A systems programming language focused on safety and speed.</a>
        </div>
        <div class="result__body">
            <a rel="nofollow" class="result__a" href="//duckduckgo.com/l/?uddg=https%3A%2F%2Fdoc.rust-lang.org&amp;rut=def">The Rust Book</a>
            <a class="result__snippet">The official guide to Rust programming.</a>
        </div>
        "#;
        let results = hand.parse_ddg_html(html, 10);
        assert_eq!(results.len(), 2);

        assert_eq!(results[0].title, "Rust Programming Language");
        assert_eq!(results[0].url, "https://rust-lang.org");
        assert_eq!(
            results[0].snippet,
            "A systems programming language focused on safety and speed."
        );
        assert_eq!(results[0].source, "DuckDuckGo");

        // The second block must be parsed independently of the first one.
        assert_eq!(results[1].title, "The Rust Book");
        assert_eq!(results[1].url, "https://doc.rust-lang.org");
        assert_eq!(results[1].snippet, "The official guide to Rust programming.");
    }
    /// Parsing stops once `max_results` entries have been collected.
    #[test]
    fn test_parse_ddg_html_max_results() {
        let hand = create_test_hand();
        let html: String = (0..20)
            .map(|i| {
                format!(
                    r#"<div class="result__body"><a class="result__a" href="//duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.com%2F{}">Result {}</a><a class="result__snippet">Snippet {}</a></div>"#,
                    i, i, i
                )
            })
            .collect();
        let parsed = hand.parse_ddg_html(&html, 5);
        assert_eq!(parsed.len(), 5);
    }
    /// A page without result markers yields no results.
    #[test]
    fn test_parse_ddg_html_empty() {
        let hand = create_test_hand();
        let parsed = hand.parse_ddg_html("<html><body>No results here</body></html>", 10);
        assert_eq!(parsed.len(), 0);
    }
    /// Two Bing `b_algo` blocks are parsed into title, URL, snippet and source.
    #[test]
    fn test_parse_bing_html() {
        let hand = create_test_hand();
        let html = r#"
        <li class="b_algo">
            <h2><a href="https://example.com/result1">Example Result 1</a></h2>
            <div class="b_caption"><p>This is the first result snippet.</p></div>
        </li>
        <li class="b_algo">
            <h2><a href="https://example.com/result2">Example Result 2</a></h2>
            <div class="b_caption"><p>This is the second result snippet.</p></div>
        </li>
        "#;
        let results = hand.parse_bing_html(html, 10);
        assert_eq!(results.len(), 2);

        assert_eq!(results[0].title, "Example Result 1");
        assert_eq!(results[0].url, "https://example.com/result1");
        assert_eq!(results[0].snippet, "This is the first result snippet.");
        assert_eq!(results[0].source, "Bing");

        // The second block must be parsed independently of the first one.
        assert_eq!(results[1].title, "Example Result 2");
        assert_eq!(results[1].url, "https://example.com/result2");
        assert_eq!(results[1].snippet, "This is the second result snippet.");
    }
    /// Links back into Bing's own search pages are dropped; external links are kept.
    #[test]
    fn test_parse_bing_html_skips_internal_urls() {
        let hand = create_test_hand();
        let page = r#"
        <li class="b_algo">
            <h2><a href="https://bing.com/search?q=more">More Results</a></h2>
        </li>
        <li class="b_algo">
            <h2><a href="https://example.com/real">Real Result</a></h2>
        </li>
        "#;
        let parsed = hand.parse_bing_html(page, 10);
        let urls: Vec<&str> = parsed.iter().map(|r| r.url.as_str()).collect();
        assert_eq!(urls, ["https://example.com/real"]);
    }
    /// A page without `b_algo` blocks yields no results.
    #[test]
    fn test_parse_bing_html_empty() {
        let hand = create_test_hand();
        let parsed = hand.parse_bing_html("<html><body>Nothing here</body></html>", 10);
        assert_eq!(parsed.len(), 0);
    }
    /// A single Baidu result block is parsed into title, URL, snippet and source.
    #[test]
    fn test_parse_baidu_html() {
        let hand = create_test_hand();
        let html = r#"
        <div class="result c-container">
            <h3 class="t"><a href="https://www.example.cn/page1">中国医疗政策 2024</a></h3>
            <div class="c-abstract">这是关于医疗政策的摘要信息。</div>
        </div>
        "#;
        let results = hand.parse_baidu_html(html, 10);
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].title, "中国医疗政策 2024");
        assert_eq!(results[0].url, "https://www.example.cn/page1");
        // Only the tail is pinned so the check is independent of how the
        // parser treats the opening tag of the abstract element.
        assert!(results[0].snippet.ends_with("这是关于医疗政策的摘要信息。"));
        assert_eq!(results[0].source, "Baidu");
    }
 }