feat(hands): Rust原生多引擎搜索 — DuckDuckGo HTML/Bing CN/百度并行聚合
Some checks failed
CI / Lint & TypeCheck (push) Has been cancelled
CI / Unit Tests (push) Has been cancelled
CI / Build Frontend (push) Has been cancelled
CI / Rust Check (push) Has been cancelled
CI / Security Scan (push) Has been cancelled
CI / E2E Tests (push) Has been cancelled
Some checks failed
CI / Lint & TypeCheck (push) Has been cancelled
CI / Unit Tests (push) Has been cancelled
CI / Build Frontend (push) Has been cancelled
CI / Rust Check (push) Has been cancelled
CI / Security Scan (push) Has been cancelled
CI / E2E Tests (push) Has been cancelled
- 用 DuckDuckGo HTML 搜索(html.duckduckgo.com)替换 Instant Answer API,获得真正搜索结果 - 新增 Bing CN 搜索(cn.bing.com),中文查询自动切换 - 新增百度搜索(baidu.com/s),中文内容覆盖 - CJK 自动检测:中文查询并行搜索 Bing+Baidu+DDG,英文查询 DDG+Bing - 结果去重(URL) + 按相关性排序 - SearXNG 保留为可选后端,不再强制依赖 Docker - 137 tests PASS(新增 20 个:HTML解析/CJK检测/辅助函数/引擎测试)
This commit is contained in:
@@ -285,24 +285,23 @@ impl ResearcherHand {
|
||||
};
|
||||
|
||||
let results = match engine {
|
||||
SearchEngine::SearXNG | SearchEngine::Auto => {
|
||||
SearchEngine::SearXNG => {
|
||||
match self.search_searxng(&query.query, query.max_results).await {
|
||||
Ok(r) if !r.is_empty() => r,
|
||||
_ => {
|
||||
tracing::warn!(target: "researcher", "SearXNG failed or empty, falling back to DuckDuckGo");
|
||||
self.search_duckduckgo(&query.query, query.max_results).await?
|
||||
}
|
||||
_ => self.search_native(&query.query, query.max_results).await?,
|
||||
}
|
||||
}
|
||||
SearchEngine::Auto => {
|
||||
self.search_native(&query.query, query.max_results).await?
|
||||
}
|
||||
SearchEngine::DuckDuckGo => {
|
||||
self.search_duckduckgo(&query.query, query.max_results).await?
|
||||
self.search_duckduckgo_html(&query.query, query.max_results).await?
|
||||
}
|
||||
SearchEngine::Google | SearchEngine::Bing => {
|
||||
// Google/Bing not yet implemented, fall back to SearXNG which aggregates them
|
||||
match self.search_searxng(&query.query, query.max_results).await {
|
||||
Ok(r) if !r.is_empty() => r,
|
||||
_ => self.search_duckduckgo(&query.query, query.max_results).await?,
|
||||
}
|
||||
SearchEngine::Google => {
|
||||
self.search_bing(&query.query, query.max_results).await?
|
||||
}
|
||||
SearchEngine::Bing => {
|
||||
self.search_bing(&query.query, query.max_results).await?
|
||||
}
|
||||
};
|
||||
|
||||
@@ -319,6 +318,67 @@ impl ResearcherHand {
|
||||
Ok(results)
|
||||
}
|
||||
|
||||
/// Rust-native multi-engine search with Chinese auto-detection
|
||||
async fn search_native(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
|
||||
let has_cjk = query.chars().any(|c| is_cjk_char(c));
|
||||
|
||||
// Strategy: try multiple engines in parallel, merge results
|
||||
let mut all_results = Vec::new();
|
||||
|
||||
if has_cjk {
|
||||
// Chinese query: Bing CN + Baidu + DuckDuckGo in parallel
|
||||
let bing_fut = self.search_bing(query, max_results);
|
||||
let baidu_fut = self.search_baidu(query, max_results);
|
||||
let ddg_fut = self.search_duckduckgo_html(query, max_results);
|
||||
|
||||
let (bing_res, baidu_res, ddg_res) = tokio::join!(
|
||||
async { bing_fut.await },
|
||||
async { baidu_fut.await },
|
||||
async { ddg_fut.await },
|
||||
);
|
||||
|
||||
if let Ok(r) = bing_res {
|
||||
all_results.extend(r);
|
||||
}
|
||||
if let Ok(r) = baidu_res {
|
||||
all_results.extend(r);
|
||||
}
|
||||
if let Ok(r) = ddg_res {
|
||||
all_results.extend(r);
|
||||
}
|
||||
} else {
|
||||
// English query: DuckDuckGo HTML first, then Bing
|
||||
let ddg_fut = self.search_duckduckgo_html(query, max_results);
|
||||
let bing_fut = self.search_bing(query, max_results);
|
||||
|
||||
let (ddg_res, bing_res) = tokio::join!(
|
||||
async { ddg_fut.await },
|
||||
async { bing_fut.await },
|
||||
);
|
||||
|
||||
if let Ok(r) = ddg_res {
|
||||
all_results.extend(r);
|
||||
}
|
||||
if let Ok(r) = bing_res {
|
||||
all_results.extend(r);
|
||||
}
|
||||
}
|
||||
|
||||
// Deduplicate by URL
|
||||
let mut seen_urls = std::collections::HashSet::new();
|
||||
all_results.retain(|r| seen_urls.insert(r.url.to_lowercase()));
|
||||
|
||||
// Sort by relevance descending, take top N
|
||||
all_results.sort_by(|a, b| b.relevance.cmp(&a.relevance));
|
||||
all_results.truncate(max_results);
|
||||
|
||||
if all_results.is_empty() {
|
||||
tracing::warn!(target: "researcher", "All native engines returned empty for query: {}", query);
|
||||
}
|
||||
|
||||
Ok(all_results)
|
||||
}
|
||||
|
||||
/// Search using SearXNG meta-search engine (aggregates 70+ engines)
|
||||
async fn search_searxng(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
|
||||
let url = format!(
|
||||
@@ -405,70 +465,225 @@ impl ResearcherHand {
|
||||
Ok(results)
|
||||
}
|
||||
|
||||
/// Search using DuckDuckGo (no API key required)
|
||||
async fn search_duckduckgo(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
|
||||
let url = format!("https://api.duckduckgo.com/?q={}&format=json&no_html=1",
|
||||
url_encode(query));
|
||||
/// Search using DuckDuckGo HTML (real search results, not Instant Answer API)
|
||||
async fn search_duckduckgo_html(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
|
||||
let url = format!(
|
||||
"https://html.duckduckgo.com/html/?q={}",
|
||||
url_encode(query)
|
||||
);
|
||||
|
||||
let response = self.client
|
||||
.get(&url)
|
||||
.header("Accept", "text/html")
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| zclaw_types::ZclawError::HandError(format!("Search request failed: {}", e)))?;
|
||||
.map_err(|e| zclaw_types::ZclawError::HandError(
|
||||
format!("DuckDuckGo HTML search failed: {}", e)
|
||||
))?;
|
||||
|
||||
let json: Value = response.json().await
|
||||
.map_err(|e| zclaw_types::ZclawError::HandError(format!("Failed to parse search response: {}", e)))?;
|
||||
let html = response.text().await
|
||||
.map_err(|e| zclaw_types::ZclawError::HandError(
|
||||
format!("Failed to read DuckDuckGo response: {}", e)
|
||||
))?;
|
||||
|
||||
Ok(self.parse_ddg_html(&html, max_results))
|
||||
}
|
||||
|
||||
/// Parse DuckDuckGo HTML search results page
|
||||
fn parse_ddg_html(&self, html: &str, max_results: usize) -> Vec<SearchResult> {
|
||||
let mut results = Vec::new();
|
||||
|
||||
// Parse DuckDuckGo Instant Answer
|
||||
if let Some(abstract_text) = json.get("AbstractText").and_then(|v| v.as_str()) {
|
||||
if !abstract_text.is_empty() {
|
||||
results.push(SearchResult {
|
||||
title: query.to_string(),
|
||||
url: json.get("AbstractURL")
|
||||
.and_then(|v| v.as_str())
|
||||
.unwrap_or("")
|
||||
.to_string(),
|
||||
snippet: abstract_text.to_string(),
|
||||
source: json.get("AbstractSource")
|
||||
.and_then(|v| v.as_str())
|
||||
.unwrap_or("DuckDuckGo")
|
||||
.to_string(),
|
||||
relevance: 100,
|
||||
content: None,
|
||||
fetched_at: Some(chrono::Utc::now().to_rfc3339()),
|
||||
});
|
||||
for block in html.split("result__body") {
|
||||
if results.len() >= max_results {
|
||||
break;
|
||||
}
|
||||
|
||||
// Find the result title link: <a class="result__a" href="...">Title</a>
|
||||
let title_link = match extract_between(block, "result__a", "</a>") {
|
||||
Some(s) => s,
|
||||
None => continue,
|
||||
};
|
||||
// title_link is like: href="//duckduckgo.com/l/?uddg=...">Title Text
|
||||
let title = title_link.rsplit('>').next()
|
||||
.map(|s| strip_html_tags(s).trim().to_string())
|
||||
.unwrap_or_default();
|
||||
|
||||
let url = extract_href_uddg(block).unwrap_or_default();
|
||||
|
||||
let snippet = extract_between(block, "result__snippet", "</a>")
|
||||
.map(|s| {
|
||||
s.rsplit('>').next()
|
||||
.map(|t| strip_html_tags(t).trim().to_string())
|
||||
.unwrap_or_default()
|
||||
})
|
||||
.unwrap_or_default();
|
||||
|
||||
if title.is_empty() || url.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
results.push(SearchResult {
|
||||
title,
|
||||
url,
|
||||
snippet,
|
||||
source: "DuckDuckGo".to_string(),
|
||||
relevance: 70,
|
||||
content: None,
|
||||
fetched_at: Some(chrono::Utc::now().to_rfc3339()),
|
||||
});
|
||||
}
|
||||
|
||||
// Parse related topics
|
||||
if let Some(related) = json.get("RelatedTopics").and_then(|v| v.as_array()) {
|
||||
for item in related.iter().take(max_results) {
|
||||
if let Some(obj) = item.as_object() {
|
||||
results.push(SearchResult {
|
||||
title: obj.get("Text")
|
||||
.and_then(|v| v.as_str())
|
||||
.unwrap_or("Related Topic")
|
||||
.to_string(),
|
||||
url: obj.get("FirstURL")
|
||||
.and_then(|v| v.as_str())
|
||||
.unwrap_or("")
|
||||
.to_string(),
|
||||
snippet: obj.get("Text")
|
||||
.and_then(|v| v.as_str())
|
||||
.unwrap_or("")
|
||||
.to_string(),
|
||||
source: "DuckDuckGo".to_string(),
|
||||
relevance: 80,
|
||||
content: None,
|
||||
fetched_at: Some(chrono::Utc::now().to_rfc3339()),
|
||||
});
|
||||
}
|
||||
results
|
||||
}
|
||||
|
||||
/// Search using Bing (works well for both Chinese and English)
|
||||
async fn search_bing(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
|
||||
let has_cjk = query.chars().any(|c| is_cjk_char(c));
|
||||
let url = if has_cjk {
|
||||
format!(
|
||||
"https://cn.bing.com/search?q={}&count={}&setlang=zh-Hans",
|
||||
url_encode(query),
|
||||
max_results
|
||||
)
|
||||
} else {
|
||||
format!(
|
||||
"https://www.bing.com/search?q={}&count={}",
|
||||
url_encode(query),
|
||||
max_results
|
||||
)
|
||||
};
|
||||
|
||||
let response = self.client
|
||||
.get(&url)
|
||||
.header("Accept", "text/html,application/xhtml+xml")
|
||||
.header("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8")
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| zclaw_types::ZclawError::HandError(
|
||||
format!("Bing search failed: {}", e)
|
||||
))?;
|
||||
|
||||
let html = response.text().await
|
||||
.map_err(|e| zclaw_types::ZclawError::HandError(
|
||||
format!("Failed to read Bing response: {}", e)
|
||||
))?;
|
||||
|
||||
Ok(self.parse_bing_html(&html, max_results))
|
||||
}
|
||||
|
||||
/// Parse Bing HTML search results page
|
||||
fn parse_bing_html(&self, html: &str, max_results: usize) -> Vec<SearchResult> {
|
||||
let mut results = Vec::new();
|
||||
|
||||
// Bing results are in <li class="b_algo">
|
||||
for block in html.split("class=\"b_algo\"") {
|
||||
if results.len() >= max_results {
|
||||
break;
|
||||
}
|
||||
|
||||
// Extract title from first <a> inside the block
|
||||
let title = extract_between(block, ">", "</a>")
|
||||
.map(|s| strip_html_tags(s).trim().to_string())
|
||||
.unwrap_or_default();
|
||||
|
||||
// Extract URL from href attribute of first <a>
|
||||
let url = extract_href(block).unwrap_or_default();
|
||||
|
||||
// Extract snippet from <div class="b_caption"><p>...</p> or <p>
|
||||
let snippet = extract_between(block, "<p>", "</p>")
|
||||
.or_else(|| extract_between(block, "b_caption", "</div>"))
|
||||
.map(|s| strip_html_tags(s).trim().to_string())
|
||||
.unwrap_or_default();
|
||||
|
||||
if title.is_empty() || url.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip Bing internal URLs
|
||||
if url.contains("bing.com/search") || url.contains("go.microsoft.com") {
|
||||
continue;
|
||||
}
|
||||
|
||||
results.push(SearchResult {
|
||||
title,
|
||||
url,
|
||||
snippet,
|
||||
source: "Bing".to_string(),
|
||||
relevance: 75,
|
||||
content: None,
|
||||
fetched_at: Some(chrono::Utc::now().to_rfc3339()),
|
||||
});
|
||||
}
|
||||
|
||||
Ok(results)
|
||||
results
|
||||
}
|
||||
|
||||
/// Search using Baidu (essential for Chinese content)
|
||||
async fn search_baidu(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
|
||||
let url = format!(
|
||||
"https://www.baidu.com/s?wd={}&rn={}",
|
||||
url_encode(query),
|
||||
max_results
|
||||
);
|
||||
|
||||
let response = self.client
|
||||
.get(&url)
|
||||
.header("Accept", "text/html,application/xhtml+xml")
|
||||
.header("Accept-Language", "zh-CN,zh;q=0.9")
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| zclaw_types::ZclawError::HandError(
|
||||
format!("Baidu search failed: {}", e)
|
||||
))?;
|
||||
|
||||
let html = response.text().await
|
||||
.map_err(|e| zclaw_types::ZclawError::HandError(
|
||||
format!("Failed to read Baidu response: {}", e)
|
||||
))?;
|
||||
|
||||
Ok(self.parse_baidu_html(&html, max_results))
|
||||
}
|
||||
|
||||
/// Parse Baidu HTML search results page
|
||||
fn parse_baidu_html(&self, html: &str, max_results: usize) -> Vec<SearchResult> {
|
||||
let mut results = Vec::new();
|
||||
|
||||
for block in html.split("class=\"result c-container\"") {
|
||||
if results.len() >= max_results {
|
||||
break;
|
||||
}
|
||||
|
||||
if !block.contains("href=\"http") {
|
||||
continue;
|
||||
}
|
||||
|
||||
let title = extract_between(block, ">", "</a>")
|
||||
.map(|s| strip_html_tags(s).trim().to_string())
|
||||
.unwrap_or_default();
|
||||
|
||||
let url = extract_href(block).unwrap_or_default();
|
||||
|
||||
let snippet = extract_between(block, "c-abstract", "</div>")
|
||||
.or_else(|| extract_between(block, "content-right_", "</div>"))
|
||||
.map(|s| strip_html_tags(s).trim().to_string())
|
||||
.unwrap_or_default();
|
||||
|
||||
if title.is_empty() || url.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
results.push(SearchResult {
|
||||
title,
|
||||
url,
|
||||
snippet,
|
||||
source: "Baidu".to_string(),
|
||||
relevance: 80,
|
||||
content: None,
|
||||
fetched_at: Some(chrono::Utc::now().to_rfc3339()),
|
||||
});
|
||||
}
|
||||
|
||||
results
|
||||
}
|
||||
|
||||
/// Fetch content from a URL
|
||||
@@ -765,6 +980,88 @@ fn url_encode(s: &str) -> String {
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Check if a character is CJK (Chinese/Japanese/Korean)
|
||||
fn is_cjk_char(c: char) -> bool {
|
||||
matches!(c,
|
||||
'\u{4E00}'..='\u{9FFF}' | // CJK Unified Ideographs
|
||||
'\u{3400}'..='\u{4DBF}' | // CJK Unified Ideographs Extension A
|
||||
'\u{3000}'..='\u{303F}' | // CJK Symbols and Punctuation
|
||||
'\u{FF00}'..='\u{FFEF}' | // Fullwidth Forms
|
||||
'\u{2E80}'..='\u{2EFF}' | // CJK Radicals Supplement
|
||||
'\u{F900}'..='\u{FAFF}' // CJK Compatibility Ideographs
|
||||
)
|
||||
}
|
||||
|
||||
/// Extract the text between the first occurrence of `start` and the
/// first occurrence of `end` that follows it.
///
/// Returns `None` when either delimiter is absent. The returned slice
/// borrows from `text`, so no allocation occurs.
fn extract_between<'a>(text: &'a str, start: &str, end: &str) -> Option<&'a str> {
    let (_, after_start) = text.split_once(start)?;
    let (inner, _) = after_start.split_once(end)?;
    Some(inner)
}
|
||||
|
||||
/// Strip HTML tags from a string and decode a handful of common HTML
/// entities.
///
/// Tag stripping is a simple state machine — everything between `<` and
/// `>` is dropped — so script/style *content* is preserved; only the
/// tags themselves are removed.
///
/// `&amp;` is decoded LAST so that double-escaped input such as
/// `&amp;lt;` is unescaped exactly once (decoding it first would turn
/// `&amp;lt;` into `<`, a double decode).
fn strip_html_tags(s: &str) -> String {
    let mut result = String::with_capacity(s.len());
    let mut in_tag = false;
    for c in s.chars() {
        match c {
            '<' => in_tag = true,
            '>' => in_tag = false,
            _ if !in_tag => result.push(c),
            _ => {}
        }
    }

    // Decode common HTML entities; `&amp;` must come last (see above).
    result
        .replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&quot;", "\"")
        .replace("&#x27;", "'")
        .replace("&#39;", "'")
        .replace("&nbsp;", " ")
        .replace("&amp;", "&")
}
|
||||
|
||||
/// Extract the href URL from the first `<a>` tag in `text`.
///
/// Absolute `http(s)` URLs are returned as-is; protocol-relative URLs
/// (`//host/...`) are promoted to `https`. Anything else — relative
/// paths, fragments, `javascript:` links, or an unterminated attribute
/// — yields `None`.
fn extract_href(text: &str) -> Option<String> {
    let after_attr = text.split_once("href=\"")?.1;
    let (raw, _) = after_attr.split_once('"')?;

    if raw.starts_with("http") {
        Some(raw.to_string())
    } else if raw.starts_with("//") {
        Some(format!("https:{}", raw))
    } else {
        None
    }
}
|
||||
|
||||
/// Extract the real URL from DDG's redirect link (uddg= parameter)
|
||||
fn extract_href_uddg(text: &str) -> Option<String> {
|
||||
// DDG HTML uses: href="//duckduckgo.com/l/?uddg=ENCODED_URL&..."
|
||||
if let Some(idx) = text.find("uddg=") {
|
||||
let rest = &text[idx + 5..];
|
||||
let url_encoded = rest.split('&').next().unwrap_or("");
|
||||
let decoded = url_encoded.replace("%3A", ":")
|
||||
.replace("%2F", "/")
|
||||
.replace("%3F", "?")
|
||||
.replace("%3D", "=")
|
||||
.replace("%26", "&")
|
||||
.replace("%20", " ")
|
||||
.replace("%25", "%");
|
||||
if decoded.starts_with("http") {
|
||||
return Some(decoded);
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback: try regular href extraction
|
||||
extract_href(text)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
@@ -1164,8 +1461,181 @@ mod tests {
|
||||
assert!(url.starts_with("http://localhost:8888/search?"));
|
||||
assert!(url.contains("format=json"));
|
||||
assert!(url.contains("categories=general"));
|
||||
// Verify UTF-8 encoding, not Unicode codepoints
|
||||
assert!(url.contains("%E4%B8%AD")); // 中 = E4 B8 AD
|
||||
assert!(!url.contains("%4E2D")); // NOT Unicode codepoint
|
||||
}
|
||||
|
||||
// --- Native Search Helper Tests ---
|
||||
|
||||
#[test]
|
||||
fn test_is_cjk_char_chinese() {
|
||||
assert!(is_cjk_char('中'));
|
||||
assert!(is_cjk_char('医'));
|
||||
assert!(is_cjk_char('。'));
|
||||
assert!(!is_cjk_char('a'));
|
||||
assert!(!is_cjk_char('1'));
|
||||
assert!(!is_cjk_char(' '));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_is_cjk_char_detects_chinese_query() {
|
||||
let query = "2024年中国医疗政策";
|
||||
assert!(query.chars().any(|c| is_cjk_char(c)));
|
||||
|
||||
let query_en = "Rust programming language";
|
||||
assert!(!query_en.chars().any(|c| is_cjk_char(c)));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_strip_html_tags() {
|
||||
assert_eq!(strip_html_tags("<b>Hello</b>"), "Hello");
|
||||
assert_eq!(strip_html_tags("<a href=\"x\">Link</a>"), "Link");
|
||||
assert_eq!(strip_html_tags("plain text"), "plain text");
|
||||
assert_eq!(strip_html_tags("&<>"), "&<>");
|
||||
// strip_html_tags only removes tags, not script content
|
||||
assert_eq!(strip_html_tags("<script>alert()</script>Safe"), "alert()Safe");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_extract_between_basic() {
|
||||
let text = "prefix<div>content</div>suffix";
|
||||
assert_eq!(extract_between(text, "<div>", "</div>"), Some("content"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_extract_between_not_found() {
|
||||
let text = "no delimiters here";
|
||||
assert_eq!(extract_between(text, "<div>", "</div>"), None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_extract_href() {
|
||||
let text = r#"<a href="https://example.com/page">Title</a>"#;
|
||||
assert_eq!(extract_href(text), Some("https://example.com/page".to_string()));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_extract_href_protocol_relative() {
|
||||
let text = r#"<a href="//example.com/page">Title</a>"#;
|
||||
assert_eq!(extract_href(text), Some("https://example.com/page".to_string()));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_extract_href_uddg() {
|
||||
let text = r#"href="//duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.com%2Fpage&rut=abc""#;
|
||||
assert_eq!(extract_href_uddg(text), Some("https://example.com/page".to_string()));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_extract_href_uddg_fallback() {
|
||||
let text = r#"<a href="https://example.com/direct">Title</a>"#;
|
||||
assert_eq!(extract_href_uddg(text), Some("https://example.com/direct".to_string()));
|
||||
}
|
||||
|
||||
// --- HTML Parser Tests ---
|
||||
|
||||
#[test]
|
||||
fn test_parse_ddg_html() {
|
||||
let hand = create_test_hand();
|
||||
let html = r#"
|
||||
<div class="result__body">
|
||||
<a rel="nofollow" class="result__a" href="//duckduckgo.com/l/?uddg=https%3A%2F%2Frust-lang.org&rut=abc">Rust Programming Language</a>
|
||||
<a class="result__snippet">A systems programming language focused on safety and speed.</a>
|
||||
</div>
|
||||
<div class="result__body">
|
||||
<a rel="nofollow" class="result__a" href="//duckduckgo.com/l/?uddg=https%3A%2F%2Fdoc.rust-lang.org&rut=def">The Rust Book</a>
|
||||
<a class="result__snippet">The official guide to Rust programming.</a>
|
||||
</div>
|
||||
"#;
|
||||
|
||||
let results = hand.parse_ddg_html(html, 10);
|
||||
assert_eq!(results.len(), 2);
|
||||
assert_eq!(results[0].title, "Rust Programming Language");
|
||||
assert_eq!(results[0].url, "https://rust-lang.org");
|
||||
assert_eq!(results[0].source, "DuckDuckGo");
|
||||
assert_eq!(results[1].title, "The Rust Book");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_ddg_html_max_results() {
|
||||
let hand = create_test_hand();
|
||||
let mut html = String::new();
|
||||
for i in 0..20 {
|
||||
html.push_str(&format!(
|
||||
r#"<div class="result__body"><a class="result__a" href="//duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.com%2F{}">Result {}</a><a class="result__snippet">Snippet {}</a></div>"#,
|
||||
i, i, i
|
||||
));
|
||||
}
|
||||
let results = hand.parse_ddg_html(&html, 5);
|
||||
assert_eq!(results.len(), 5);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_ddg_html_empty() {
|
||||
let hand = create_test_hand();
|
||||
let html = "<html><body>No results here</body></html>";
|
||||
let results = hand.parse_ddg_html(html, 10);
|
||||
assert!(results.is_empty());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_bing_html() {
|
||||
let hand = create_test_hand();
|
||||
let html = r#"
|
||||
<li class="b_algo">
|
||||
<h2><a href="https://example.com/result1">Example Result 1</a></h2>
|
||||
<div class="b_caption"><p>This is the first result snippet.</p></div>
|
||||
</li>
|
||||
<li class="b_algo">
|
||||
<h2><a href="https://example.com/result2">Example Result 2</a></h2>
|
||||
<div class="b_caption"><p>This is the second result snippet.</p></div>
|
||||
</li>
|
||||
"#;
|
||||
|
||||
let results = hand.parse_bing_html(html, 10);
|
||||
assert_eq!(results.len(), 2);
|
||||
assert_eq!(results[0].title, "Example Result 1");
|
||||
assert_eq!(results[0].url, "https://example.com/result1");
|
||||
assert_eq!(results[0].source, "Bing");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_bing_html_skips_internal_urls() {
|
||||
let hand = create_test_hand();
|
||||
let html = r#"
|
||||
<li class="b_algo">
|
||||
<h2><a href="https://bing.com/search?q=more">More Results</a></h2>
|
||||
</li>
|
||||
<li class="b_algo">
|
||||
<h2><a href="https://example.com/real">Real Result</a></h2>
|
||||
</li>
|
||||
"#;
|
||||
|
||||
let results = hand.parse_bing_html(html, 10);
|
||||
assert_eq!(results.len(), 1);
|
||||
assert_eq!(results[0].url, "https://example.com/real");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_bing_html_empty() {
|
||||
let hand = create_test_hand();
|
||||
let html = "<html><body>Nothing here</body></html>";
|
||||
let results = hand.parse_bing_html(html, 10);
|
||||
assert!(results.is_empty());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_baidu_html() {
|
||||
let hand = create_test_hand();
|
||||
let html = r#"
|
||||
<div class="result c-container">
|
||||
<h3 class="t"><a href="https://www.example.cn/page1">中国医疗政策 2024</a></h3>
|
||||
<div class="c-abstract">这是关于医疗政策的摘要信息。</div>
|
||||
</div>
|
||||
"#;
|
||||
|
||||
let results = hand.parse_baidu_html(html, 10);
|
||||
assert_eq!(results.len(), 1);
|
||||
assert_eq!(results[0].source, "Baidu");
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user