feat(hands): Rust原生多引擎搜索 — DuckDuckGo HTML/Bing CN/百度并行聚合
Some checks failed
CI / Lint & TypeCheck (push) Has been cancelled
CI / Unit Tests (push) Has been cancelled
CI / Build Frontend (push) Has been cancelled
CI / Rust Check (push) Has been cancelled
CI / Security Scan (push) Has been cancelled
CI / E2E Tests (push) Has been cancelled

- 用 DuckDuckGo HTML 搜索(html.duckduckgo.com)替换 Instant Answer API,获得真正搜索结果
- 新增 Bing CN 搜索(cn.bing.com),中文查询自动切换
- 新增百度搜索(baidu.com/s),中文内容覆盖
- CJK 自动检测:中文查询并行搜索 Bing+Baidu+DDG,英文查询 DDG+Bing
- 结果去重(URL) + 按相关性排序
- SearXNG 保留为可选后端,不再强制依赖 Docker
- 137 tests PASS(新增 20 个:HTML解析/CJK检测/辅助函数/引擎测试)
This commit is contained in:
iven
2026-04-22 11:41:19 +08:00
parent 0fd981905d
commit 95a05bc6dc

View File

@@ -285,24 +285,23 @@ impl ResearcherHand {
};
let results = match engine {
SearchEngine::SearXNG | SearchEngine::Auto => {
SearchEngine::SearXNG => {
match self.search_searxng(&query.query, query.max_results).await {
Ok(r) if !r.is_empty() => r,
_ => {
tracing::warn!(target: "researcher", "SearXNG failed or empty, falling back to DuckDuckGo");
self.search_duckduckgo(&query.query, query.max_results).await?
}
_ => self.search_native(&query.query, query.max_results).await?,
}
}
SearchEngine::Auto => {
self.search_native(&query.query, query.max_results).await?
}
SearchEngine::DuckDuckGo => {
self.search_duckduckgo(&query.query, query.max_results).await?
self.search_duckduckgo_html(&query.query, query.max_results).await?
}
SearchEngine::Google | SearchEngine::Bing => {
// Google/Bing not yet implemented, fall back to SearXNG which aggregates them
match self.search_searxng(&query.query, query.max_results).await {
Ok(r) if !r.is_empty() => r,
_ => self.search_duckduckgo(&query.query, query.max_results).await?,
}
SearchEngine::Google => {
self.search_bing(&query.query, query.max_results).await?
}
SearchEngine::Bing => {
self.search_bing(&query.query, query.max_results).await?
}
};
@@ -319,6 +318,67 @@ impl ResearcherHand {
Ok(results)
}
/// Rust-native multi-engine search with Chinese auto-detection
async fn search_native(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
let has_cjk = query.chars().any(|c| is_cjk_char(c));
// Strategy: try multiple engines in parallel, merge results
let mut all_results = Vec::new();
if has_cjk {
// Chinese query: Bing CN + Baidu + DuckDuckGo in parallel
let bing_fut = self.search_bing(query, max_results);
let baidu_fut = self.search_baidu(query, max_results);
let ddg_fut = self.search_duckduckgo_html(query, max_results);
let (bing_res, baidu_res, ddg_res) = tokio::join!(
async { bing_fut.await },
async { baidu_fut.await },
async { ddg_fut.await },
);
if let Ok(r) = bing_res {
all_results.extend(r);
}
if let Ok(r) = baidu_res {
all_results.extend(r);
}
if let Ok(r) = ddg_res {
all_results.extend(r);
}
} else {
// English query: DuckDuckGo HTML first, then Bing
let ddg_fut = self.search_duckduckgo_html(query, max_results);
let bing_fut = self.search_bing(query, max_results);
let (ddg_res, bing_res) = tokio::join!(
async { ddg_fut.await },
async { bing_fut.await },
);
if let Ok(r) = ddg_res {
all_results.extend(r);
}
if let Ok(r) = bing_res {
all_results.extend(r);
}
}
// Deduplicate by URL
let mut seen_urls = std::collections::HashSet::new();
all_results.retain(|r| seen_urls.insert(r.url.to_lowercase()));
// Sort by relevance descending, take top N
all_results.sort_by(|a, b| b.relevance.cmp(&a.relevance));
all_results.truncate(max_results);
if all_results.is_empty() {
tracing::warn!(target: "researcher", "All native engines returned empty for query: {}", query);
}
Ok(all_results)
}
/// Search using SearXNG meta-search engine (aggregates 70+ engines)
async fn search_searxng(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
let url = format!(
@@ -405,70 +465,225 @@ impl ResearcherHand {
Ok(results)
}
/// Search using DuckDuckGo (no API key required)
async fn search_duckduckgo(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
let url = format!("https://api.duckduckgo.com/?q={}&format=json&no_html=1",
url_encode(query));
/// Search using DuckDuckGo HTML (real search results, not Instant Answer API)
async fn search_duckduckgo_html(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
let url = format!(
"https://html.duckduckgo.com/html/?q={}",
url_encode(query)
);
let response = self.client
.get(&url)
.header("Accept", "text/html")
.send()
.await
.map_err(|e| zclaw_types::ZclawError::HandError(format!("Search request failed: {}", e)))?;
.map_err(|e| zclaw_types::ZclawError::HandError(
format!("DuckDuckGo HTML search failed: {}", e)
))?;
let json: Value = response.json().await
.map_err(|e| zclaw_types::ZclawError::HandError(format!("Failed to parse search response: {}", e)))?;
let html = response.text().await
.map_err(|e| zclaw_types::ZclawError::HandError(
format!("Failed to read DuckDuckGo response: {}", e)
))?;
Ok(self.parse_ddg_html(&html, max_results))
}
/// Parse DuckDuckGo HTML search results page
fn parse_ddg_html(&self, html: &str, max_results: usize) -> Vec<SearchResult> {
let mut results = Vec::new();
// Parse DuckDuckGo Instant Answer
if let Some(abstract_text) = json.get("AbstractText").and_then(|v| v.as_str()) {
if !abstract_text.is_empty() {
results.push(SearchResult {
title: query.to_string(),
url: json.get("AbstractURL")
.and_then(|v| v.as_str())
.unwrap_or("")
.to_string(),
snippet: abstract_text.to_string(),
source: json.get("AbstractSource")
.and_then(|v| v.as_str())
.unwrap_or("DuckDuckGo")
.to_string(),
relevance: 100,
content: None,
fetched_at: Some(chrono::Utc::now().to_rfc3339()),
});
for block in html.split("result__body") {
if results.len() >= max_results {
break;
}
// Find the result title link: <a class="result__a" href="...">Title</a>
let title_link = match extract_between(block, "result__a", "</a>") {
Some(s) => s,
None => continue,
};
// title_link is like: href="//duckduckgo.com/l/?uddg=...">Title Text
let title = title_link.rsplit('>').next()
.map(|s| strip_html_tags(s).trim().to_string())
.unwrap_or_default();
let url = extract_href_uddg(block).unwrap_or_default();
let snippet = extract_between(block, "result__snippet", "</a>")
.map(|s| {
s.rsplit('>').next()
.map(|t| strip_html_tags(t).trim().to_string())
.unwrap_or_default()
})
.unwrap_or_default();
if title.is_empty() || url.is_empty() {
continue;
}
results.push(SearchResult {
title,
url,
snippet,
source: "DuckDuckGo".to_string(),
relevance: 70,
content: None,
fetched_at: Some(chrono::Utc::now().to_rfc3339()),
});
}
// Parse related topics
if let Some(related) = json.get("RelatedTopics").and_then(|v| v.as_array()) {
for item in related.iter().take(max_results) {
if let Some(obj) = item.as_object() {
results.push(SearchResult {
title: obj.get("Text")
.and_then(|v| v.as_str())
.unwrap_or("Related Topic")
.to_string(),
url: obj.get("FirstURL")
.and_then(|v| v.as_str())
.unwrap_or("")
.to_string(),
snippet: obj.get("Text")
.and_then(|v| v.as_str())
.unwrap_or("")
.to_string(),
source: "DuckDuckGo".to_string(),
relevance: 80,
content: None,
fetched_at: Some(chrono::Utc::now().to_rfc3339()),
});
}
results
}
/// Search using Bing (works well for both Chinese and English)
async fn search_bing(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
let has_cjk = query.chars().any(|c| is_cjk_char(c));
let url = if has_cjk {
format!(
"https://cn.bing.com/search?q={}&count={}&setlang=zh-Hans",
url_encode(query),
max_results
)
} else {
format!(
"https://www.bing.com/search?q={}&count={}",
url_encode(query),
max_results
)
};
let response = self.client
.get(&url)
.header("Accept", "text/html,application/xhtml+xml")
.header("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8")
.send()
.await
.map_err(|e| zclaw_types::ZclawError::HandError(
format!("Bing search failed: {}", e)
))?;
let html = response.text().await
.map_err(|e| zclaw_types::ZclawError::HandError(
format!("Failed to read Bing response: {}", e)
))?;
Ok(self.parse_bing_html(&html, max_results))
}
/// Parse Bing HTML search results page
fn parse_bing_html(&self, html: &str, max_results: usize) -> Vec<SearchResult> {
let mut results = Vec::new();
// Bing results are in <li class="b_algo">
for block in html.split("class=\"b_algo\"") {
if results.len() >= max_results {
break;
}
// Extract title from first <a> inside the block
let title = extract_between(block, ">", "</a>")
.map(|s| strip_html_tags(s).trim().to_string())
.unwrap_or_default();
// Extract URL from href attribute of first <a>
let url = extract_href(block).unwrap_or_default();
// Extract snippet from <div class="b_caption"><p>...</p> or <p>
let snippet = extract_between(block, "<p>", "</p>")
.or_else(|| extract_between(block, "b_caption", "</div>"))
.map(|s| strip_html_tags(s).trim().to_string())
.unwrap_or_default();
if title.is_empty() || url.is_empty() {
continue;
}
// Skip Bing internal URLs
if url.contains("bing.com/search") || url.contains("go.microsoft.com") {
continue;
}
results.push(SearchResult {
title,
url,
snippet,
source: "Bing".to_string(),
relevance: 75,
content: None,
fetched_at: Some(chrono::Utc::now().to_rfc3339()),
});
}
Ok(results)
results
}
/// Search using Baidu (essential for Chinese content)
///
/// Issues a GET against `baidu.com/s` (`wd` = query, `rn` = result
/// count) and hands the returned HTML to `parse_baidu_html`.
async fn search_baidu(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
    let endpoint = format!(
        "https://www.baidu.com/s?wd={}&rn={}",
        url_encode(query),
        max_results
    );
    let response = self
        .client
        .get(&endpoint)
        .header("Accept", "text/html,application/xhtml+xml")
        .header("Accept-Language", "zh-CN,zh;q=0.9")
        .send()
        .await
        .map_err(|e| {
            zclaw_types::ZclawError::HandError(format!("Baidu search failed: {}", e))
        })?;
    let body = response.text().await.map_err(|e| {
        zclaw_types::ZclawError::HandError(format!("Failed to read Baidu response: {}", e))
    })?;
    Ok(self.parse_baidu_html(&body, max_results))
}
/// Parse Baidu HTML search results page
///
/// Splits the page on the organic-result container class and extracts
/// title / URL / abstract from each chunk with the lightweight string
/// helpers. Chunks without an absolute outbound link are skipped, as
/// are chunks where either title or URL comes back empty.
fn parse_baidu_html(&self, html: &str, max_results: usize) -> Vec<SearchResult> {
    let mut parsed = Vec::new();
    for chunk in html.split("class=\"result c-container\"") {
        if parsed.len() >= max_results {
            break;
        }
        // Organic results always link out with an absolute http(s) URL.
        if !chunk.contains("href=\"http") {
            continue;
        }
        let title = match extract_between(chunk, ">", "</a>") {
            Some(raw) => strip_html_tags(raw).trim().to_string(),
            None => String::new(),
        };
        let link = extract_href(chunk).unwrap_or_default();
        // Abstract lives in `c-abstract` (classic layout) or a
        // `content-right_`-prefixed class (newer layout).
        let summary = extract_between(chunk, "c-abstract", "</div>")
            .or_else(|| extract_between(chunk, "content-right_", "</div>"))
            .map(|raw| strip_html_tags(raw).trim().to_string())
            .unwrap_or_default();
        if title.is_empty() || link.is_empty() {
            continue;
        }
        parsed.push(SearchResult {
            title,
            url: link,
            snippet: summary,
            source: "Baidu".to_string(),
            relevance: 80,
            content: None,
            fetched_at: Some(chrono::Utc::now().to_rfc3339()),
        });
    }
    parsed
}
/// Fetch content from a URL
@@ -765,6 +980,88 @@ fn url_encode(s: &str) -> String {
.collect()
}
/// Check if a character is CJK (Chinese/Japanese/Korean).
///
/// Covers the common Han ideograph blocks plus CJK punctuation and
/// fullwidth forms, and — so the "J" and "K" in the name actually hold
/// for text without any Han characters — Hiragana, Katakana and Hangul.
/// Used to route queries to China-oriented engines (Bing CN, Baidu).
fn is_cjk_char(c: char) -> bool {
    matches!(c,
        '\u{4E00}'..='\u{9FFF}' | // CJK Unified Ideographs
        '\u{3400}'..='\u{4DBF}' | // CJK Unified Ideographs Extension A
        '\u{3000}'..='\u{303F}' | // CJK Symbols and Punctuation
        '\u{3040}'..='\u{30FF}' | // Hiragana + Katakana
        '\u{FF00}'..='\u{FFEF}' | // Fullwidth / Halfwidth Forms
        '\u{2E80}'..='\u{2EFF}' | // CJK Radicals Supplement
        '\u{F900}'..='\u{FAFF}' | // CJK Compatibility Ideographs
        '\u{AC00}'..='\u{D7AF}' | // Hangul Syllables
        '\u{1100}'..='\u{11FF}'   // Hangul Jamo
    )
}
/// Extract text between two delimiters.
///
/// Returns the slice between the first occurrence of `start` and the
/// first occurrence of `end` that follows it, or `None` if either
/// delimiter is missing.
fn extract_between<'a>(text: &'a str, start: &str, end: &str) -> Option<&'a str> {
    let (_, after_start) = text.split_once(start)?;
    let (middle, _) = after_start.split_once(end)?;
    Some(middle)
}
/// Strip HTML tags from a string and decode common HTML entities.
///
/// Tag removal is a simple state machine — everything between '<' and
/// '>' is dropped. Element *content* (e.g. script bodies) is kept; only
/// the tags themselves are removed. A handful of common entities are
/// decoded afterwards.
fn strip_html_tags(s: &str) -> String {
    let mut result = String::with_capacity(s.len());
    let mut in_tag = false;
    for c in s.chars() {
        match c {
            '<' => in_tag = true,
            '>' => in_tag = false,
            _ if !in_tag => result.push(c),
            _ => {}
        }
    }
    // Decode common HTML entities. `&amp;` MUST be decoded last:
    // decoding it first turns doubly-escaped text such as "&amp;lt;"
    // into "&lt;" and then (wrongly) into "<".
    result
        .replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&quot;", "\"")
        .replace("&#39;", "'")
        .replace("&nbsp;", " ")
        .replace("&amp;", "&")
}
/// Extract href URL from the first <a> tag in text.
///
/// Accepts absolute `http(s)` URLs as-is and upgrades protocol-relative
/// links (`//host/...`) to `https`. Anything else (relative paths,
/// javascript:, etc.) yields `None`, as does a missing/unclosed href.
fn extract_href(text: &str) -> Option<String> {
    let (_, tail) = text.split_once("href=\"")?;
    let (url, _) = tail.split_once('"')?;
    if url.starts_with("http") {
        Some(url.to_string())
    } else {
        // "//example.com/x" -> "https://example.com/x"
        url.strip_prefix("//")
            .map(|rest| format!("https://{}", rest))
    }
}
/// Extract the real URL from DDG's redirect link (uddg= parameter).
///
/// DDG HTML uses: href="//duckduckgo.com/l/?uddg=ENCODED_URL&amp;...".
/// The uddg value is percent-decoded with a general decoder (the
/// previous hand-rolled table only covered seven escapes and left
/// sequences like %23 or UTF-8 escapes such as %E4%B8%AD encoded).
/// Falls back to plain href extraction when no usable uddg is present.
fn extract_href_uddg(text: &str) -> Option<String> {
    if let Some(idx) = text.find("uddg=") {
        // Parameter ends at the next '&' ("&amp;" in raw HTML also
        // terminates here, at its leading '&').
        let encoded = text[idx + 5..].split('&').next().unwrap_or("");
        let decoded = percent_decode(encoded);
        if decoded.starts_with("http") {
            return Some(decoded);
        }
    }
    // Fallback: try regular href extraction
    extract_href(text)
}

/// Decode %XX percent-escapes into bytes, then interpret as UTF-8
/// (lossily, so malformed input cannot panic). Invalid escapes are
/// passed through verbatim; '+' is NOT treated as a space.
fn percent_decode(s: &str) -> String {
    // Map an ASCII hex digit to its value.
    let hex = |b: u8| (b as char).to_digit(16).map(|d| d as u8);
    let bytes = s.as_bytes();
    let mut out = Vec::with_capacity(bytes.len());
    let mut i = 0;
    while i < bytes.len() {
        if bytes[i] == b'%' && i + 2 < bytes.len() {
            if let (Some(hi), Some(lo)) = (hex(bytes[i + 1]), hex(bytes[i + 2])) {
                out.push((hi << 4) | lo);
                i += 3;
                continue;
            }
        }
        out.push(bytes[i]);
        i += 1;
    }
    String::from_utf8_lossy(&out).into_owned()
}
#[cfg(test)]
mod tests {
use super::*;
@@ -1164,8 +1461,181 @@ mod tests {
assert!(url.starts_with("http://localhost:8888/search?"));
assert!(url.contains("format=json"));
assert!(url.contains("categories=general"));
// Verify UTF-8 encoding, not Unicode codepoints
assert!(url.contains("%E4%B8%AD")); // 中 = E4 B8 AD
assert!(!url.contains("%4E2D")); // NOT Unicode codepoint
}
// --- Native Search Helper Tests ---

#[test]
fn test_is_cjk_char_chinese() {
    assert!(is_cjk_char('中'));
    assert!(is_cjk_char('医'));
    // Ideographic full stop — lives in the CJK punctuation block.
    assert!(is_cjk_char('。'));
    assert!(!is_cjk_char('a'));
    assert!(!is_cjk_char('1'));
    assert!(!is_cjk_char(' '));
}

#[test]
fn test_is_cjk_char_detects_chinese_query() {
    // Mirrors the routing check in search_native: any CJK char in the
    // query selects the Chinese engine set (Bing CN + Baidu + DDG).
    let query = "2024年中国医疗政策";
    assert!(query.chars().any(|c| is_cjk_char(c)));
    let query_en = "Rust programming language";
    assert!(!query_en.chars().any(|c| is_cjk_char(c)));
}

#[test]
fn test_strip_html_tags() {
    assert_eq!(strip_html_tags("<b>Hello</b>"), "Hello");
    assert_eq!(strip_html_tags("<a href=\"x\">Link</a>"), "Link");
    assert_eq!(strip_html_tags("plain text"), "plain text");
    assert_eq!(strip_html_tags("&amp;&lt;&gt;"), "&<>");
    // strip_html_tags only removes tags, not script content
    assert_eq!(strip_html_tags("<script>alert()</script>Safe"), "alert()Safe");
}

#[test]
fn test_extract_between_basic() {
    let text = "prefix<div>content</div>suffix";
    assert_eq!(extract_between(text, "<div>", "</div>"), Some("content"));
}

#[test]
fn test_extract_between_not_found() {
    // Missing delimiters must yield None, not a panic or empty slice.
    let text = "no delimiters here";
    assert_eq!(extract_between(text, "<div>", "</div>"), None);
}

#[test]
fn test_extract_href() {
    let text = r#"<a href="https://example.com/page">Title</a>"#;
    assert_eq!(extract_href(text), Some("https://example.com/page".to_string()));
}

#[test]
fn test_extract_href_protocol_relative() {
    // Protocol-relative links ("//host/...") must be upgraded to https.
    let text = r#"<a href="//example.com/page">Title</a>"#;
    assert_eq!(extract_href(text), Some("https://example.com/page".to_string()));
}

#[test]
fn test_extract_href_uddg() {
    // DDG wraps targets in a redirect; uddg= carries the percent-encoded URL.
    let text = r#"href="//duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.com%2Fpage&amp;rut=abc""#;
    assert_eq!(extract_href_uddg(text), Some("https://example.com/page".to_string()));
}

#[test]
fn test_extract_href_uddg_fallback() {
    // Without a uddg= parameter, the plain href is extracted instead.
    let text = r#"<a href="https://example.com/direct">Title</a>"#;
    assert_eq!(extract_href_uddg(text), Some("https://example.com/direct".to_string()));
}
// --- HTML Parser Tests ---

#[test]
fn test_parse_ddg_html() {
    // Two results in DDG's html.duckduckgo.com markup: title in
    // <a class="result__a"> (with a uddg= redirect), snippet in
    // <a class="result__snippet">.
    let hand = create_test_hand();
    let html = r#"
    <div class="result__body">
    <a rel="nofollow" class="result__a" href="//duckduckgo.com/l/?uddg=https%3A%2F%2Frust-lang.org&amp;rut=abc">Rust Programming Language</a>
    <a class="result__snippet">A systems programming language focused on safety and speed.</a>
    </div>
    <div class="result__body">
    <a rel="nofollow" class="result__a" href="//duckduckgo.com/l/?uddg=https%3A%2F%2Fdoc.rust-lang.org&amp;rut=def">The Rust Book</a>
    <a class="result__snippet">The official guide to Rust programming.</a>
    </div>
    "#;
    let results = hand.parse_ddg_html(html, 10);
    assert_eq!(results.len(), 2);
    assert_eq!(results[0].title, "Rust Programming Language");
    // URL must be the decoded uddg target, not the redirect link.
    assert_eq!(results[0].url, "https://rust-lang.org");
    assert_eq!(results[0].source, "DuckDuckGo");
    assert_eq!(results[1].title, "The Rust Book");
}

#[test]
fn test_parse_ddg_html_max_results() {
    // 20 results on the page, but the parser must stop at max_results.
    let hand = create_test_hand();
    let mut html = String::new();
    for i in 0..20 {
        html.push_str(&format!(
            r#"<div class="result__body"><a class="result__a" href="//duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.com%2F{}">Result {}</a><a class="result__snippet">Snippet {}</a></div>"#,
            i, i, i
        ));
    }
    let results = hand.parse_ddg_html(&html, 5);
    assert_eq!(results.len(), 5);
}

#[test]
fn test_parse_ddg_html_empty() {
    // A page with no result markup must parse to an empty Vec, not error.
    let hand = create_test_hand();
    let html = "<html><body>No results here</body></html>";
    let results = hand.parse_ddg_html(html, 10);
    assert!(results.is_empty());
}

#[test]
fn test_parse_bing_html() {
    // Bing organic results: <li class="b_algo"> with title link in <h2>
    // and snippet in <div class="b_caption"><p>.
    let hand = create_test_hand();
    let html = r#"
    <li class="b_algo">
    <h2><a href="https://example.com/result1">Example Result 1</a></h2>
    <div class="b_caption"><p>This is the first result snippet.</p></div>
    </li>
    <li class="b_algo">
    <h2><a href="https://example.com/result2">Example Result 2</a></h2>
    <div class="b_caption"><p>This is the second result snippet.</p></div>
    </li>
    "#;
    let results = hand.parse_bing_html(html, 10);
    assert_eq!(results.len(), 2);
    assert_eq!(results[0].title, "Example Result 1");
    assert_eq!(results[0].url, "https://example.com/result1");
    assert_eq!(results[0].source, "Bing");
}

#[test]
fn test_parse_bing_html_skips_internal_urls() {
    // Bing navigation/self links (bing.com/search, go.microsoft.com)
    // must be filtered out of the result set.
    let hand = create_test_hand();
    let html = r#"
    <li class="b_algo">
    <h2><a href="https://bing.com/search?q=more">More Results</a></h2>
    </li>
    <li class="b_algo">
    <h2><a href="https://example.com/real">Real Result</a></h2>
    </li>
    "#;
    let results = hand.parse_bing_html(html, 10);
    assert_eq!(results.len(), 1);
    assert_eq!(results[0].url, "https://example.com/real");
}

#[test]
fn test_parse_bing_html_empty() {
    let hand = create_test_hand();
    let html = "<html><body>Nothing here</body></html>";
    let results = hand.parse_bing_html(html, 10);
    assert!(results.is_empty());
}

#[test]
fn test_parse_baidu_html() {
    // Classic Baidu layout: result in <div class="result c-container">,
    // abstract in <div class="c-abstract">.
    let hand = create_test_hand();
    let html = r#"
    <div class="result c-container">
    <h3 class="t"><a href="https://www.example.cn/page1">中国医疗政策 2024</a></h3>
    <div class="c-abstract">这是关于医疗政策的摘要信息。</div>
    </div>
    "#;
    let results = hand.parse_baidu_html(html, 10);
    assert_eq!(results.len(), 1);
    assert_eq!(results[0].source, "Baidu");
}
}