feat(hands): Rust原生多引擎搜索 — DuckDuckGo HTML/Bing CN/百度并行聚合
Some checks failed
CI / Lint & TypeCheck (push) Has been cancelled
CI / Unit Tests (push) Has been cancelled
CI / Build Frontend (push) Has been cancelled
CI / Rust Check (push) Has been cancelled
CI / Security Scan (push) Has been cancelled
CI / E2E Tests (push) Has been cancelled

- 用 DuckDuckGo HTML 搜索(html.duckduckgo.com)替换 Instant Answer API,获得真正搜索结果
- 新增 Bing CN 搜索(cn.bing.com),中文查询自动切换
- 新增百度搜索(baidu.com/s),中文内容覆盖
- CJK 自动检测:中文查询并行搜索 Bing+Baidu+DDG,英文查询 DDG+Bing
- 结果去重(URL) + 按相关性排序
- SearXNG 保留为可选后端,不再强制依赖 Docker
- 137 tests PASS(新增 20 个:HTML解析/CJK检测/辅助函数/引擎测试)
This commit is contained in:
iven
2026-04-22 11:41:19 +08:00
parent 0fd981905d
commit 95a05bc6dc

View File

@@ -285,24 +285,23 @@ impl ResearcherHand {
};
let results = match engine {
SearchEngine::SearXNG | SearchEngine::Auto => {
SearchEngine::SearXNG => {
match self.search_searxng(&query.query, query.max_results).await {
Ok(r) if !r.is_empty() => r,
_ => {
tracing::warn!(target: "researcher", "SearXNG failed or empty, falling back to DuckDuckGo");
self.search_duckduckgo(&query.query, query.max_results).await?
}
_ => self.search_native(&query.query, query.max_results).await?,
}
}
SearchEngine::Auto => {
self.search_native(&query.query, query.max_results).await?
}
SearchEngine::DuckDuckGo => {
self.search_duckduckgo(&query.query, query.max_results).await?
self.search_duckduckgo_html(&query.query, query.max_results).await?
}
SearchEngine::Google | SearchEngine::Bing => {
// Google/Bing not yet implemented, fall back to SearXNG which aggregates them
match self.search_searxng(&query.query, query.max_results).await {
Ok(r) if !r.is_empty() => r,
_ => self.search_duckduckgo(&query.query, query.max_results).await?,
}
SearchEngine::Google => {
self.search_bing(&query.query, query.max_results).await?
}
SearchEngine::Bing => {
self.search_bing(&query.query, query.max_results).await?
}
};
@@ -319,6 +318,67 @@ impl ResearcherHand {
Ok(results)
}
/// Rust-native multi-engine search with Chinese auto-detection
async fn search_native(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
let has_cjk = query.chars().any(|c| is_cjk_char(c));
// Strategy: try multiple engines in parallel, merge results
let mut all_results = Vec::new();
if has_cjk {
// Chinese query: Bing CN + Baidu + DuckDuckGo in parallel
let bing_fut = self.search_bing(query, max_results);
let baidu_fut = self.search_baidu(query, max_results);
let ddg_fut = self.search_duckduckgo_html(query, max_results);
let (bing_res, baidu_res, ddg_res) = tokio::join!(
async { bing_fut.await },
async { baidu_fut.await },
async { ddg_fut.await },
);
if let Ok(r) = bing_res {
all_results.extend(r);
}
if let Ok(r) = baidu_res {
all_results.extend(r);
}
if let Ok(r) = ddg_res {
all_results.extend(r);
}
} else {
// English query: DuckDuckGo HTML first, then Bing
let ddg_fut = self.search_duckduckgo_html(query, max_results);
let bing_fut = self.search_bing(query, max_results);
let (ddg_res, bing_res) = tokio::join!(
async { ddg_fut.await },
async { bing_fut.await },
);
if let Ok(r) = ddg_res {
all_results.extend(r);
}
if let Ok(r) = bing_res {
all_results.extend(r);
}
}
// Deduplicate by URL
let mut seen_urls = std::collections::HashSet::new();
all_results.retain(|r| seen_urls.insert(r.url.to_lowercase()));
// Sort by relevance descending, take top N
all_results.sort_by(|a, b| b.relevance.cmp(&a.relevance));
all_results.truncate(max_results);
if all_results.is_empty() {
tracing::warn!(target: "researcher", "All native engines returned empty for query: {}", query);
}
Ok(all_results)
}
/// Search using SearXNG meta-search engine (aggregates 70+ engines)
async fn search_searxng(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
let url = format!(
@@ -405,70 +465,225 @@ impl ResearcherHand {
Ok(results)
}
/// Search using DuckDuckGo (no API key required)
async fn search_duckduckgo(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
let url = format!("https://api.duckduckgo.com/?q={}&format=json&no_html=1",
url_encode(query));
/// Search using DuckDuckGo HTML (real search results, not Instant Answer API)
async fn search_duckduckgo_html(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
let url = format!(
"https://html.duckduckgo.com/html/?q={}",
url_encode(query)
);
let response = self.client
.get(&url)
.header("Accept", "text/html")
.send()
.await
.map_err(|e| zclaw_types::ZclawError::HandError(format!("Search request failed: {}", e)))?;
.map_err(|e| zclaw_types::ZclawError::HandError(
format!("DuckDuckGo HTML search failed: {}", e)
))?;
let json: Value = response.json().await
.map_err(|e| zclaw_types::ZclawError::HandError(format!("Failed to parse search response: {}", e)))?;
let html = response.text().await
.map_err(|e| zclaw_types::ZclawError::HandError(
format!("Failed to read DuckDuckGo response: {}", e)
))?;
Ok(self.parse_ddg_html(&html, max_results))
}
/// Parse DuckDuckGo HTML search results page
fn parse_ddg_html(&self, html: &str, max_results: usize) -> Vec<SearchResult> {
let mut results = Vec::new();
// Parse DuckDuckGo Instant Answer
if let Some(abstract_text) = json.get("AbstractText").and_then(|v| v.as_str()) {
if !abstract_text.is_empty() {
results.push(SearchResult {
title: query.to_string(),
url: json.get("AbstractURL")
.and_then(|v| v.as_str())
.unwrap_or("")
.to_string(),
snippet: abstract_text.to_string(),
source: json.get("AbstractSource")
.and_then(|v| v.as_str())
.unwrap_or("DuckDuckGo")
.to_string(),
relevance: 100,
content: None,
fetched_at: Some(chrono::Utc::now().to_rfc3339()),
});
for block in html.split("result__body") {
if results.len() >= max_results {
break;
}
// Find the result title link: <a class="result__a" href="...">Title</a>
let title_link = match extract_between(block, "result__a", "</a>") {
Some(s) => s,
None => continue,
};
// title_link is like: href="//duckduckgo.com/l/?uddg=...">Title Text
let title = title_link.rsplit('>').next()
.map(|s| strip_html_tags(s).trim().to_string())
.unwrap_or_default();
let url = extract_href_uddg(block).unwrap_or_default();
let snippet = extract_between(block, "result__snippet", "</a>")
.map(|s| {
s.rsplit('>').next()
.map(|t| strip_html_tags(t).trim().to_string())
.unwrap_or_default()
})
.unwrap_or_default();
if title.is_empty() || url.is_empty() {
continue;
}
results.push(SearchResult {
title,
url,
snippet,
source: "DuckDuckGo".to_string(),
relevance: 70,
content: None,
fetched_at: Some(chrono::Utc::now().to_rfc3339()),
});
}
// Parse related topics
if let Some(related) = json.get("RelatedTopics").and_then(|v| v.as_array()) {
for item in related.iter().take(max_results) {
if let Some(obj) = item.as_object() {
results.push(SearchResult {
title: obj.get("Text")
.and_then(|v| v.as_str())
.unwrap_or("Related Topic")
.to_string(),
url: obj.get("FirstURL")
.and_then(|v| v.as_str())
.unwrap_or("")
.to_string(),
snippet: obj.get("Text")
.and_then(|v| v.as_str())
.unwrap_or("")
.to_string(),
source: "DuckDuckGo".to_string(),
relevance: 80,
content: None,
fetched_at: Some(chrono::Utc::now().to_rfc3339()),
});
}
results
}
/// Search using Bing (works well for both Chinese and English)
async fn search_bing(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
let has_cjk = query.chars().any(|c| is_cjk_char(c));
let url = if has_cjk {
format!(
"https://cn.bing.com/search?q={}&count={}&setlang=zh-Hans",
url_encode(query),
max_results
)
} else {
format!(
"https://www.bing.com/search?q={}&count={}",
url_encode(query),
max_results
)
};
let response = self.client
.get(&url)
.header("Accept", "text/html,application/xhtml+xml")
.header("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8")
.send()
.await
.map_err(|e| zclaw_types::ZclawError::HandError(
format!("Bing search failed: {}", e)
))?;
let html = response.text().await
.map_err(|e| zclaw_types::ZclawError::HandError(
format!("Failed to read Bing response: {}", e)
))?;
Ok(self.parse_bing_html(&html, max_results))
}
/// Parse Bing HTML search results page
fn parse_bing_html(&self, html: &str, max_results: usize) -> Vec<SearchResult> {
let mut results = Vec::new();
// Bing results are in <li class="b_algo">
for block in html.split("class=\"b_algo\"") {
if results.len() >= max_results {
break;
}
// Extract title from first <a> inside the block
let title = extract_between(block, ">", "</a>")
.map(|s| strip_html_tags(s).trim().to_string())
.unwrap_or_default();
// Extract URL from href attribute of first <a>
let url = extract_href(block).unwrap_or_default();
// Extract snippet from <div class="b_caption"><p>...</p> or <p>
let snippet = extract_between(block, "<p>", "</p>")
.or_else(|| extract_between(block, "b_caption", "</div>"))
.map(|s| strip_html_tags(s).trim().to_string())
.unwrap_or_default();
if title.is_empty() || url.is_empty() {
continue;
}
// Skip Bing internal URLs
if url.contains("bing.com/search") || url.contains("go.microsoft.com") {
continue;
}
results.push(SearchResult {
title,
url,
snippet,
source: "Bing".to_string(),
relevance: 75,
content: None,
fetched_at: Some(chrono::Utc::now().to_rfc3339()),
});
}
Ok(results)
results
}
/// Search using Baidu (essential for Chinese content)
///
/// Issues a GET against `baidu.com/s` (`wd` = query, `rn` = result
/// count) and hands the returned HTML to `parse_baidu_html`.
async fn search_baidu(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
    let endpoint = format!(
        "https://www.baidu.com/s?wd={}&rn={}",
        url_encode(query),
        max_results
    );
    let response = self
        .client
        .get(&endpoint)
        .header("Accept", "text/html,application/xhtml+xml")
        .header("Accept-Language", "zh-CN,zh;q=0.9")
        .send()
        .await
        .map_err(|e| {
            zclaw_types::ZclawError::HandError(format!("Baidu search failed: {}", e))
        })?;
    let body = response.text().await.map_err(|e| {
        zclaw_types::ZclawError::HandError(format!("Failed to read Baidu response: {}", e))
    })?;
    Ok(self.parse_baidu_html(&body, max_results))
}
/// Parse Baidu HTML search results page
///
/// Splits the page on the organic-result container class and extracts
/// title / URL / abstract from each chunk with the lightweight string
/// helpers. Chunks without an absolute outbound link are skipped, as
/// are chunks where either title or URL comes back empty.
fn parse_baidu_html(&self, html: &str, max_results: usize) -> Vec<SearchResult> {
    let mut parsed = Vec::new();
    for chunk in html.split("class=\"result c-container\"") {
        if parsed.len() >= max_results {
            break;
        }
        // Organic results always link out with an absolute http(s) URL.
        if !chunk.contains("href=\"http") {
            continue;
        }
        let title = match extract_between(chunk, ">", "</a>") {
            Some(raw) => strip_html_tags(raw).trim().to_string(),
            None => String::new(),
        };
        let link = extract_href(chunk).unwrap_or_default();
        // Abstract lives in `c-abstract` (classic layout) or a
        // `content-right_`-prefixed class (newer layout).
        let summary = extract_between(chunk, "c-abstract", "</div>")
            .or_else(|| extract_between(chunk, "content-right_", "</div>"))
            .map(|raw| strip_html_tags(raw).trim().to_string())
            .unwrap_or_default();
        if title.is_empty() || link.is_empty() {
            continue;
        }
        parsed.push(SearchResult {
            title,
            url: link,
            snippet: summary,
            source: "Baidu".to_string(),
            relevance: 80,
            content: None,
            fetched_at: Some(chrono::Utc::now().to_rfc3339()),
        });
    }
    parsed
}
/// Fetch content from a URL
@@ -765,6 +980,88 @@ fn url_encode(s: &str) -> String {
.collect()
}
/// Check if a character is CJK (Chinese/Japanese/Korean).
///
/// Covers the common Han ideograph blocks plus CJK punctuation and
/// fullwidth forms, and — so the "J" and "K" in the name actually hold
/// for text without any Han characters — Hiragana, Katakana and Hangul.
/// Used to route queries to China-oriented engines (Bing CN, Baidu).
fn is_cjk_char(c: char) -> bool {
    matches!(c,
        '\u{4E00}'..='\u{9FFF}' | // CJK Unified Ideographs
        '\u{3400}'..='\u{4DBF}' | // CJK Unified Ideographs Extension A
        '\u{3000}'..='\u{303F}' | // CJK Symbols and Punctuation
        '\u{3040}'..='\u{30FF}' | // Hiragana + Katakana
        '\u{FF00}'..='\u{FFEF}' | // Fullwidth / Halfwidth Forms
        '\u{2E80}'..='\u{2EFF}' | // CJK Radicals Supplement
        '\u{F900}'..='\u{FAFF}' | // CJK Compatibility Ideographs
        '\u{AC00}'..='\u{D7AF}' | // Hangul Syllables
        '\u{1100}'..='\u{11FF}'   // Hangul Jamo
    )
}
/// Extract text between two delimiters.
///
/// Returns the slice between the first occurrence of `start` and the
/// first occurrence of `end` that follows it, or `None` if either
/// delimiter is missing.
fn extract_between<'a>(text: &'a str, start: &str, end: &str) -> Option<&'a str> {
    let (_, after_start) = text.split_once(start)?;
    let (middle, _) = after_start.split_once(end)?;
    Some(middle)
}
/// Strip HTML tags from a string and decode common HTML entities.
///
/// Tag removal is a simple state machine — everything between '<' and
/// '>' is dropped. Element *content* (e.g. script bodies) is kept; only
/// the tags themselves are removed. A handful of common entities are
/// decoded afterwards.
fn strip_html_tags(s: &str) -> String {
    let mut result = String::with_capacity(s.len());
    let mut in_tag = false;
    for c in s.chars() {
        match c {
            '<' => in_tag = true,
            '>' => in_tag = false,
            _ if !in_tag => result.push(c),
            _ => {}
        }
    }
    // Decode common HTML entities. `&amp;` MUST be decoded last:
    // decoding it first turns doubly-escaped text such as "&amp;lt;"
    // into "&lt;" and then (wrongly) into "<".
    result
        .replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&quot;", "\"")
        .replace("&#39;", "'")
        .replace("&nbsp;", " ")
        .replace("&amp;", "&")
}
/// Extract href URL from the first <a> tag in text.
///
/// Accepts absolute `http(s)` URLs as-is and upgrades protocol-relative
/// links (`//host/...`) to `https`. Anything else (relative paths,
/// javascript:, etc.) yields `None`, as does a missing/unclosed href.
fn extract_href(text: &str) -> Option<String> {
    let (_, tail) = text.split_once("href=\"")?;
    let (url, _) = tail.split_once('"')?;
    if url.starts_with("http") {
        Some(url.to_string())
    } else {
        // "//example.com/x" -> "https://example.com/x"
        url.strip_prefix("//")
            .map(|rest| format!("https://{}", rest))
    }
}
/// Extract the real URL from DDG's redirect link (uddg= parameter).
///
/// DDG HTML uses: href="//duckduckgo.com/l/?uddg=ENCODED_URL&amp;...".
/// The uddg value is percent-decoded with a general decoder (the
/// previous hand-rolled table only covered seven escapes and left
/// sequences like %23 or UTF-8 escapes such as %E4%B8%AD encoded).
/// Falls back to plain href extraction when no usable uddg is present.
fn extract_href_uddg(text: &str) -> Option<String> {
    if let Some(idx) = text.find("uddg=") {
        // Parameter ends at the next '&' ("&amp;" in raw HTML also
        // terminates here, at its leading '&').
        let encoded = text[idx + 5..].split('&').next().unwrap_or("");
        let decoded = percent_decode(encoded);
        if decoded.starts_with("http") {
            return Some(decoded);
        }
    }
    // Fallback: try regular href extraction
    extract_href(text)
}

/// Decode %XX percent-escapes into bytes, then interpret as UTF-8
/// (lossily, so malformed input cannot panic). Invalid escapes are
/// passed through verbatim; '+' is NOT treated as a space.
fn percent_decode(s: &str) -> String {
    // Map an ASCII hex digit to its value.
    let hex = |b: u8| (b as char).to_digit(16).map(|d| d as u8);
    let bytes = s.as_bytes();
    let mut out = Vec::with_capacity(bytes.len());
    let mut i = 0;
    while i < bytes.len() {
        if bytes[i] == b'%' && i + 2 < bytes.len() {
            if let (Some(hi), Some(lo)) = (hex(bytes[i + 1]), hex(bytes[i + 2])) {
                out.push((hi << 4) | lo);
                i += 3;
                continue;
            }
        }
        out.push(bytes[i]);
        i += 1;
    }
    String::from_utf8_lossy(&out).into_owned()
}
#[cfg(test)]
mod tests {
use super::*;
@@ -1164,8 +1461,181 @@ mod tests {
assert!(url.starts_with("http://localhost:8888/search?"));
assert!(url.contains("format=json"));
assert!(url.contains("categories=general"));
// Verify UTF-8 encoding, not Unicode codepoints
assert!(url.contains("%E4%B8%AD")); // 中 = E4 B8 AD
assert!(!url.contains("%4E2D")); // NOT Unicode codepoint
}
// --- Native Search Helper Tests ---

#[test]
fn test_is_cjk_char_chinese() {
    assert!(is_cjk_char('中'));
    assert!(is_cjk_char('医'));
    // Ideographic full stop — lives in the CJK punctuation block.
    assert!(is_cjk_char('。'));
    assert!(!is_cjk_char('a'));
    assert!(!is_cjk_char('1'));
    assert!(!is_cjk_char(' '));
}

#[test]
fn test_is_cjk_char_detects_chinese_query() {
    // Mirrors the routing check in search_native: any CJK char in the
    // query selects the Chinese engine set (Bing CN + Baidu + DDG).
    let query = "2024年中国医疗政策";
    assert!(query.chars().any(|c| is_cjk_char(c)));
    let query_en = "Rust programming language";
    assert!(!query_en.chars().any(|c| is_cjk_char(c)));
}

#[test]
fn test_strip_html_tags() {
    assert_eq!(strip_html_tags("<b>Hello</b>"), "Hello");
    assert_eq!(strip_html_tags("<a href=\"x\">Link</a>"), "Link");
    assert_eq!(strip_html_tags("plain text"), "plain text");
    assert_eq!(strip_html_tags("&amp;&lt;&gt;"), "&<>");
    // strip_html_tags only removes tags, not script content
    assert_eq!(strip_html_tags("<script>alert()</script>Safe"), "alert()Safe");
}

#[test]
fn test_extract_between_basic() {
    let text = "prefix<div>content</div>suffix";
    assert_eq!(extract_between(text, "<div>", "</div>"), Some("content"));
}

#[test]
fn test_extract_between_not_found() {
    // Missing delimiters must yield None, not a panic or empty slice.
    let text = "no delimiters here";
    assert_eq!(extract_between(text, "<div>", "</div>"), None);
}

#[test]
fn test_extract_href() {
    let text = r#"<a href="https://example.com/page">Title</a>"#;
    assert_eq!(extract_href(text), Some("https://example.com/page".to_string()));
}

#[test]
fn test_extract_href_protocol_relative() {
    // Protocol-relative links ("//host/...") must be upgraded to https.
    let text = r#"<a href="//example.com/page">Title</a>"#;
    assert_eq!(extract_href(text), Some("https://example.com/page".to_string()));
}

#[test]
fn test_extract_href_uddg() {
    // DDG wraps targets in a redirect; uddg= carries the percent-encoded URL.
    let text = r#"href="//duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.com%2Fpage&amp;rut=abc""#;
    assert_eq!(extract_href_uddg(text), Some("https://example.com/page".to_string()));
}

#[test]
fn test_extract_href_uddg_fallback() {
    // Without a uddg= parameter, the plain href is extracted instead.
    let text = r#"<a href="https://example.com/direct">Title</a>"#;
    assert_eq!(extract_href_uddg(text), Some("https://example.com/direct".to_string()));
}
// --- HTML Parser Tests ---

#[test]
fn test_parse_ddg_html() {
    // Two results in DDG's html.duckduckgo.com markup: title in
    // <a class="result__a"> (with a uddg= redirect), snippet in
    // <a class="result__snippet">.
    let hand = create_test_hand();
    let html = r#"
    <div class="result__body">
    <a rel="nofollow" class="result__a" href="//duckduckgo.com/l/?uddg=https%3A%2F%2Frust-lang.org&amp;rut=abc">Rust Programming Language</a>
    <a class="result__snippet">A systems programming language focused on safety and speed.</a>
    </div>
    <div class="result__body">
    <a rel="nofollow" class="result__a" href="//duckduckgo.com/l/?uddg=https%3A%2F%2Fdoc.rust-lang.org&amp;rut=def">The Rust Book</a>
    <a class="result__snippet">The official guide to Rust programming.</a>
    </div>
    "#;
    let results = hand.parse_ddg_html(html, 10);
    assert_eq!(results.len(), 2);
    assert_eq!(results[0].title, "Rust Programming Language");
    // URL must be the decoded uddg target, not the redirect link.
    assert_eq!(results[0].url, "https://rust-lang.org");
    assert_eq!(results[0].source, "DuckDuckGo");
    assert_eq!(results[1].title, "The Rust Book");
}

#[test]
fn test_parse_ddg_html_max_results() {
    // 20 results on the page, but the parser must stop at max_results.
    let hand = create_test_hand();
    let mut html = String::new();
    for i in 0..20 {
        html.push_str(&format!(
            r#"<div class="result__body"><a class="result__a" href="//duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.com%2F{}">Result {}</a><a class="result__snippet">Snippet {}</a></div>"#,
            i, i, i
        ));
    }
    let results = hand.parse_ddg_html(&html, 5);
    assert_eq!(results.len(), 5);
}

#[test]
fn test_parse_ddg_html_empty() {
    // A page with no result markup must parse to an empty Vec, not error.
    let hand = create_test_hand();
    let html = "<html><body>No results here</body></html>";
    let results = hand.parse_ddg_html(html, 10);
    assert!(results.is_empty());
}

#[test]
fn test_parse_bing_html() {
    // Bing organic results: <li class="b_algo"> with title link in <h2>
    // and snippet in <div class="b_caption"><p>.
    let hand = create_test_hand();
    let html = r#"
    <li class="b_algo">
    <h2><a href="https://example.com/result1">Example Result 1</a></h2>
    <div class="b_caption"><p>This is the first result snippet.</p></div>
    </li>
    <li class="b_algo">
    <h2><a href="https://example.com/result2">Example Result 2</a></h2>
    <div class="b_caption"><p>This is the second result snippet.</p></div>
    </li>
    "#;
    let results = hand.parse_bing_html(html, 10);
    assert_eq!(results.len(), 2);
    assert_eq!(results[0].title, "Example Result 1");
    assert_eq!(results[0].url, "https://example.com/result1");
    assert_eq!(results[0].source, "Bing");
}

#[test]
fn test_parse_bing_html_skips_internal_urls() {
    // Bing navigation/self links (bing.com/search, go.microsoft.com)
    // must be filtered out of the result set.
    let hand = create_test_hand();
    let html = r#"
    <li class="b_algo">
    <h2><a href="https://bing.com/search?q=more">More Results</a></h2>
    </li>
    <li class="b_algo">
    <h2><a href="https://example.com/real">Real Result</a></h2>
    </li>
    "#;
    let results = hand.parse_bing_html(html, 10);
    assert_eq!(results.len(), 1);
    assert_eq!(results[0].url, "https://example.com/real");
}

#[test]
fn test_parse_bing_html_empty() {
    let hand = create_test_hand();
    let html = "<html><body>Nothing here</body></html>";
    let results = hand.parse_bing_html(html, 10);
    assert!(results.is_empty());
}

#[test]
fn test_parse_baidu_html() {
    // Classic Baidu layout: result in <div class="result c-container">,
    // abstract in <div class="c-abstract">.
    let hand = create_test_hand();
    let html = r#"
    <div class="result c-container">
    <h3 class="t"><a href="https://www.example.cn/page1">中国医疗政策 2024</a></h3>
    <div class="c-abstract">这是关于医疗政策的摘要信息。</div>
    </div>
    "#;
    let results = hand.parse_baidu_html(html, 10);
    assert_eq!(results.len(), 1);
    assert_eq!(results[0].source, "Baidu");
}
}