feat(hands): Rust原生多引擎搜索 — DuckDuckGo HTML/Bing CN/百度并行聚合
Some checks failed
CI / Lint & TypeCheck (push) Has been cancelled
CI / Unit Tests (push) Has been cancelled
CI / Build Frontend (push) Has been cancelled
CI / Rust Check (push) Has been cancelled
CI / Security Scan (push) Has been cancelled
CI / E2E Tests (push) Has been cancelled
Some checks failed
CI / Lint & TypeCheck (push) Has been cancelled
CI / Unit Tests (push) Has been cancelled
CI / Build Frontend (push) Has been cancelled
CI / Rust Check (push) Has been cancelled
CI / Security Scan (push) Has been cancelled
CI / E2E Tests (push) Has been cancelled
- 用 DuckDuckGo HTML 搜索(html.duckduckgo.com)替换 Instant Answer API,获得真正搜索结果
- 新增 Bing CN 搜索(cn.bing.com),中文查询自动切换
- 新增百度搜索(baidu.com/s),中文内容覆盖
- CJK 自动检测:中文查询并行搜索 Bing+Baidu+DDG,英文查询 DDG+Bing
- 结果去重(URL) + 按相关性排序
- SearXNG 保留为可选后端,不再强制依赖 Docker
- 137 tests PASS(新增 20 个:HTML解析/CJK检测/辅助函数/引擎测试)
This commit is contained in:
@@ -285,24 +285,23 @@ impl ResearcherHand {
|
|||||||
};
|
};
|
||||||
|
|
||||||
let results = match engine {
|
let results = match engine {
|
||||||
SearchEngine::SearXNG | SearchEngine::Auto => {
|
SearchEngine::SearXNG => {
|
||||||
match self.search_searxng(&query.query, query.max_results).await {
|
match self.search_searxng(&query.query, query.max_results).await {
|
||||||
Ok(r) if !r.is_empty() => r,
|
Ok(r) if !r.is_empty() => r,
|
||||||
_ => {
|
_ => self.search_native(&query.query, query.max_results).await?,
|
||||||
tracing::warn!(target: "researcher", "SearXNG failed or empty, falling back to DuckDuckGo");
|
|
||||||
self.search_duckduckgo(&query.query, query.max_results).await?
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
SearchEngine::Auto => {
|
||||||
|
self.search_native(&query.query, query.max_results).await?
|
||||||
|
}
|
||||||
SearchEngine::DuckDuckGo => {
|
SearchEngine::DuckDuckGo => {
|
||||||
self.search_duckduckgo(&query.query, query.max_results).await?
|
self.search_duckduckgo_html(&query.query, query.max_results).await?
|
||||||
}
|
}
|
||||||
SearchEngine::Google | SearchEngine::Bing => {
|
SearchEngine::Google => {
|
||||||
// Google/Bing not yet implemented, fall back to SearXNG which aggregates them
|
self.search_bing(&query.query, query.max_results).await?
|
||||||
match self.search_searxng(&query.query, query.max_results).await {
|
}
|
||||||
Ok(r) if !r.is_empty() => r,
|
SearchEngine::Bing => {
|
||||||
_ => self.search_duckduckgo(&query.query, query.max_results).await?,
|
self.search_bing(&query.query, query.max_results).await?
|
||||||
}
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -319,6 +318,67 @@ impl ResearcherHand {
|
|||||||
Ok(results)
|
Ok(results)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Rust-native multi-engine search with Chinese auto-detection
|
||||||
|
async fn search_native(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
|
||||||
|
let has_cjk = query.chars().any(|c| is_cjk_char(c));
|
||||||
|
|
||||||
|
// Strategy: try multiple engines in parallel, merge results
|
||||||
|
let mut all_results = Vec::new();
|
||||||
|
|
||||||
|
if has_cjk {
|
||||||
|
// Chinese query: Bing CN + Baidu + DuckDuckGo in parallel
|
||||||
|
let bing_fut = self.search_bing(query, max_results);
|
||||||
|
let baidu_fut = self.search_baidu(query, max_results);
|
||||||
|
let ddg_fut = self.search_duckduckgo_html(query, max_results);
|
||||||
|
|
||||||
|
let (bing_res, baidu_res, ddg_res) = tokio::join!(
|
||||||
|
async { bing_fut.await },
|
||||||
|
async { baidu_fut.await },
|
||||||
|
async { ddg_fut.await },
|
||||||
|
);
|
||||||
|
|
||||||
|
if let Ok(r) = bing_res {
|
||||||
|
all_results.extend(r);
|
||||||
|
}
|
||||||
|
if let Ok(r) = baidu_res {
|
||||||
|
all_results.extend(r);
|
||||||
|
}
|
||||||
|
if let Ok(r) = ddg_res {
|
||||||
|
all_results.extend(r);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// English query: DuckDuckGo HTML first, then Bing
|
||||||
|
let ddg_fut = self.search_duckduckgo_html(query, max_results);
|
||||||
|
let bing_fut = self.search_bing(query, max_results);
|
||||||
|
|
||||||
|
let (ddg_res, bing_res) = tokio::join!(
|
||||||
|
async { ddg_fut.await },
|
||||||
|
async { bing_fut.await },
|
||||||
|
);
|
||||||
|
|
||||||
|
if let Ok(r) = ddg_res {
|
||||||
|
all_results.extend(r);
|
||||||
|
}
|
||||||
|
if let Ok(r) = bing_res {
|
||||||
|
all_results.extend(r);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Deduplicate by URL
|
||||||
|
let mut seen_urls = std::collections::HashSet::new();
|
||||||
|
all_results.retain(|r| seen_urls.insert(r.url.to_lowercase()));
|
||||||
|
|
||||||
|
// Sort by relevance descending, take top N
|
||||||
|
all_results.sort_by(|a, b| b.relevance.cmp(&a.relevance));
|
||||||
|
all_results.truncate(max_results);
|
||||||
|
|
||||||
|
if all_results.is_empty() {
|
||||||
|
tracing::warn!(target: "researcher", "All native engines returned empty for query: {}", query);
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(all_results)
|
||||||
|
}
|
||||||
|
|
||||||
/// Search using SearXNG meta-search engine (aggregates 70+ engines)
|
/// Search using SearXNG meta-search engine (aggregates 70+ engines)
|
||||||
async fn search_searxng(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
|
async fn search_searxng(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
|
||||||
let url = format!(
|
let url = format!(
|
||||||
@@ -405,70 +465,225 @@ impl ResearcherHand {
|
|||||||
Ok(results)
|
Ok(results)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Search using DuckDuckGo (no API key required)
|
/// Search using DuckDuckGo HTML (real search results, not Instant Answer API)
|
||||||
async fn search_duckduckgo(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
|
async fn search_duckduckgo_html(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
|
||||||
let url = format!("https://api.duckduckgo.com/?q={}&format=json&no_html=1",
|
let url = format!(
|
||||||
url_encode(query));
|
"https://html.duckduckgo.com/html/?q={}",
|
||||||
|
url_encode(query)
|
||||||
|
);
|
||||||
|
|
||||||
let response = self.client
|
let response = self.client
|
||||||
.get(&url)
|
.get(&url)
|
||||||
|
.header("Accept", "text/html")
|
||||||
.send()
|
.send()
|
||||||
.await
|
.await
|
||||||
.map_err(|e| zclaw_types::ZclawError::HandError(format!("Search request failed: {}", e)))?;
|
.map_err(|e| zclaw_types::ZclawError::HandError(
|
||||||
|
format!("DuckDuckGo HTML search failed: {}", e)
|
||||||
|
))?;
|
||||||
|
|
||||||
let json: Value = response.json().await
|
let html = response.text().await
|
||||||
.map_err(|e| zclaw_types::ZclawError::HandError(format!("Failed to parse search response: {}", e)))?;
|
.map_err(|e| zclaw_types::ZclawError::HandError(
|
||||||
|
format!("Failed to read DuckDuckGo response: {}", e)
|
||||||
|
))?;
|
||||||
|
|
||||||
|
Ok(self.parse_ddg_html(&html, max_results))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Parse DuckDuckGo HTML search results page
|
||||||
|
fn parse_ddg_html(&self, html: &str, max_results: usize) -> Vec<SearchResult> {
|
||||||
let mut results = Vec::new();
|
let mut results = Vec::new();
|
||||||
|
|
||||||
// Parse DuckDuckGo Instant Answer
|
for block in html.split("result__body") {
|
||||||
if let Some(abstract_text) = json.get("AbstractText").and_then(|v| v.as_str()) {
|
if results.len() >= max_results {
|
||||||
if !abstract_text.is_empty() {
|
break;
|
||||||
results.push(SearchResult {
|
|
||||||
title: query.to_string(),
|
|
||||||
url: json.get("AbstractURL")
|
|
||||||
.and_then(|v| v.as_str())
|
|
||||||
.unwrap_or("")
|
|
||||||
.to_string(),
|
|
||||||
snippet: abstract_text.to_string(),
|
|
||||||
source: json.get("AbstractSource")
|
|
||||||
.and_then(|v| v.as_str())
|
|
||||||
.unwrap_or("DuckDuckGo")
|
|
||||||
.to_string(),
|
|
||||||
relevance: 100,
|
|
||||||
content: None,
|
|
||||||
fetched_at: Some(chrono::Utc::now().to_rfc3339()),
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Find the result title link: <a class="result__a" href="...">Title</a>
|
||||||
|
let title_link = match extract_between(block, "result__a", "</a>") {
|
||||||
|
Some(s) => s,
|
||||||
|
None => continue,
|
||||||
|
};
|
||||||
|
// title_link is like: href="//duckduckgo.com/l/?uddg=...">Title Text
|
||||||
|
let title = title_link.rsplit('>').next()
|
||||||
|
.map(|s| strip_html_tags(s).trim().to_string())
|
||||||
|
.unwrap_or_default();
|
||||||
|
|
||||||
|
let url = extract_href_uddg(block).unwrap_or_default();
|
||||||
|
|
||||||
|
let snippet = extract_between(block, "result__snippet", "</a>")
|
||||||
|
.map(|s| {
|
||||||
|
s.rsplit('>').next()
|
||||||
|
.map(|t| strip_html_tags(t).trim().to_string())
|
||||||
|
.unwrap_or_default()
|
||||||
|
})
|
||||||
|
.unwrap_or_default();
|
||||||
|
|
||||||
|
if title.is_empty() || url.is_empty() {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
results.push(SearchResult {
|
||||||
|
title,
|
||||||
|
url,
|
||||||
|
snippet,
|
||||||
|
source: "DuckDuckGo".to_string(),
|
||||||
|
relevance: 70,
|
||||||
|
content: None,
|
||||||
|
fetched_at: Some(chrono::Utc::now().to_rfc3339()),
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
// Parse related topics
|
results
|
||||||
if let Some(related) = json.get("RelatedTopics").and_then(|v| v.as_array()) {
|
}
|
||||||
for item in related.iter().take(max_results) {
|
|
||||||
if let Some(obj) = item.as_object() {
|
/// Search using Bing (works well for both Chinese and English)
|
||||||
results.push(SearchResult {
|
async fn search_bing(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
|
||||||
title: obj.get("Text")
|
let has_cjk = query.chars().any(|c| is_cjk_char(c));
|
||||||
.and_then(|v| v.as_str())
|
let url = if has_cjk {
|
||||||
.unwrap_or("Related Topic")
|
format!(
|
||||||
.to_string(),
|
"https://cn.bing.com/search?q={}&count={}&setlang=zh-Hans",
|
||||||
url: obj.get("FirstURL")
|
url_encode(query),
|
||||||
.and_then(|v| v.as_str())
|
max_results
|
||||||
.unwrap_or("")
|
)
|
||||||
.to_string(),
|
} else {
|
||||||
snippet: obj.get("Text")
|
format!(
|
||||||
.and_then(|v| v.as_str())
|
"https://www.bing.com/search?q={}&count={}",
|
||||||
.unwrap_or("")
|
url_encode(query),
|
||||||
.to_string(),
|
max_results
|
||||||
source: "DuckDuckGo".to_string(),
|
)
|
||||||
relevance: 80,
|
};
|
||||||
content: None,
|
|
||||||
fetched_at: Some(chrono::Utc::now().to_rfc3339()),
|
let response = self.client
|
||||||
});
|
.get(&url)
|
||||||
}
|
.header("Accept", "text/html,application/xhtml+xml")
|
||||||
|
.header("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8")
|
||||||
|
.send()
|
||||||
|
.await
|
||||||
|
.map_err(|e| zclaw_types::ZclawError::HandError(
|
||||||
|
format!("Bing search failed: {}", e)
|
||||||
|
))?;
|
||||||
|
|
||||||
|
let html = response.text().await
|
||||||
|
.map_err(|e| zclaw_types::ZclawError::HandError(
|
||||||
|
format!("Failed to read Bing response: {}", e)
|
||||||
|
))?;
|
||||||
|
|
||||||
|
Ok(self.parse_bing_html(&html, max_results))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Parse Bing HTML search results page
|
||||||
|
fn parse_bing_html(&self, html: &str, max_results: usize) -> Vec<SearchResult> {
|
||||||
|
let mut results = Vec::new();
|
||||||
|
|
||||||
|
// Bing results are in <li class="b_algo">
|
||||||
|
for block in html.split("class=\"b_algo\"") {
|
||||||
|
if results.len() >= max_results {
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Extract title from first <a> inside the block
|
||||||
|
let title = extract_between(block, ">", "</a>")
|
||||||
|
.map(|s| strip_html_tags(s).trim().to_string())
|
||||||
|
.unwrap_or_default();
|
||||||
|
|
||||||
|
// Extract URL from href attribute of first <a>
|
||||||
|
let url = extract_href(block).unwrap_or_default();
|
||||||
|
|
||||||
|
// Extract snippet from <div class="b_caption"><p>...</p> or <p>
|
||||||
|
let snippet = extract_between(block, "<p>", "</p>")
|
||||||
|
.or_else(|| extract_between(block, "b_caption", "</div>"))
|
||||||
|
.map(|s| strip_html_tags(s).trim().to_string())
|
||||||
|
.unwrap_or_default();
|
||||||
|
|
||||||
|
if title.is_empty() || url.is_empty() {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Skip Bing internal URLs
|
||||||
|
if url.contains("bing.com/search") || url.contains("go.microsoft.com") {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
results.push(SearchResult {
|
||||||
|
title,
|
||||||
|
url,
|
||||||
|
snippet,
|
||||||
|
source: "Bing".to_string(),
|
||||||
|
relevance: 75,
|
||||||
|
content: None,
|
||||||
|
fetched_at: Some(chrono::Utc::now().to_rfc3339()),
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(results)
|
results
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Search using Baidu (essential for Chinese content)
|
||||||
|
async fn search_baidu(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
|
||||||
|
let url = format!(
|
||||||
|
"https://www.baidu.com/s?wd={}&rn={}",
|
||||||
|
url_encode(query),
|
||||||
|
max_results
|
||||||
|
);
|
||||||
|
|
||||||
|
let response = self.client
|
||||||
|
.get(&url)
|
||||||
|
.header("Accept", "text/html,application/xhtml+xml")
|
||||||
|
.header("Accept-Language", "zh-CN,zh;q=0.9")
|
||||||
|
.send()
|
||||||
|
.await
|
||||||
|
.map_err(|e| zclaw_types::ZclawError::HandError(
|
||||||
|
format!("Baidu search failed: {}", e)
|
||||||
|
))?;
|
||||||
|
|
||||||
|
let html = response.text().await
|
||||||
|
.map_err(|e| zclaw_types::ZclawError::HandError(
|
||||||
|
format!("Failed to read Baidu response: {}", e)
|
||||||
|
))?;
|
||||||
|
|
||||||
|
Ok(self.parse_baidu_html(&html, max_results))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Parse Baidu HTML search results page
|
||||||
|
fn parse_baidu_html(&self, html: &str, max_results: usize) -> Vec<SearchResult> {
|
||||||
|
let mut results = Vec::new();
|
||||||
|
|
||||||
|
for block in html.split("class=\"result c-container\"") {
|
||||||
|
if results.len() >= max_results {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if !block.contains("href=\"http") {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
let title = extract_between(block, ">", "</a>")
|
||||||
|
.map(|s| strip_html_tags(s).trim().to_string())
|
||||||
|
.unwrap_or_default();
|
||||||
|
|
||||||
|
let url = extract_href(block).unwrap_or_default();
|
||||||
|
|
||||||
|
let snippet = extract_between(block, "c-abstract", "</div>")
|
||||||
|
.or_else(|| extract_between(block, "content-right_", "</div>"))
|
||||||
|
.map(|s| strip_html_tags(s).trim().to_string())
|
||||||
|
.unwrap_or_default();
|
||||||
|
|
||||||
|
if title.is_empty() || url.is_empty() {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
results.push(SearchResult {
|
||||||
|
title,
|
||||||
|
url,
|
||||||
|
snippet,
|
||||||
|
source: "Baidu".to_string(),
|
||||||
|
relevance: 80,
|
||||||
|
content: None,
|
||||||
|
fetched_at: Some(chrono::Utc::now().to_rfc3339()),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
results
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Fetch content from a URL
|
/// Fetch content from a URL
|
||||||
@@ -765,6 +980,88 @@ fn url_encode(s: &str) -> String {
|
|||||||
.collect()
|
.collect()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Report whether `c` falls inside one of the Unicode blocks this module
/// treats as CJK: the unified ideographs (plus Extension A), CJK symbols and
/// punctuation, fullwidth forms, the radicals supplement and the
/// compatibility ideographs.
fn is_cjk_char(c: char) -> bool {
    // Inclusive (first, last) code point of each accepted block.
    const CJK_BLOCKS: [(char, char); 6] = [
        ('\u{4E00}', '\u{9FFF}'), // CJK Unified Ideographs
        ('\u{3400}', '\u{4DBF}'), // CJK Unified Ideographs Extension A
        ('\u{3000}', '\u{303F}'), // CJK Symbols and Punctuation
        ('\u{FF00}', '\u{FFEF}'), // Fullwidth Forms
        ('\u{2E80}', '\u{2EFF}'), // CJK Radicals Supplement
        ('\u{F900}', '\u{FAFF}'), // CJK Compatibility Ideographs
    ];

    CJK_BLOCKS
        .iter()
        .any(|&(first, last)| first <= c && c <= last)
}
|
||||||
|
|
||||||
|
/// Return the slice of `text` that lies between the first occurrence of
/// `start` and the first occurrence of `end` after it.
///
/// Yields `None` when `start` is absent, or when `end` does not occur after
/// `start`.
fn extract_between<'a>(text: &'a str, start: &str, end: &str) -> Option<&'a str> {
    let (_, after_start) = text.split_once(start)?;
    let (inner, _) = after_start.split_once(end)?;
    Some(inner)
}
|
||||||
|
|
||||||
|
/// Remove HTML tags from `s` and decode a handful of common entities.
///
/// Everything between a `<` and the next `>` is dropped; text between tags
/// is kept verbatim, so the contents of elements such as `<script>` survive.
/// Afterwards `&lt;`, `&gt;`, `&quot;`, `&#39;` / `&#x27;` / `&apos;`,
/// `&nbsp;` and `&amp;` are replaced by the characters they stand for.
fn strip_html_tags(s: &str) -> String {
    let mut text = String::with_capacity(s.len());
    let mut in_tag = false;
    for c in s.chars() {
        match c {
            '<' => in_tag = true,
            '>' => in_tag = false,
            _ if !in_tag => text.push(c),
            _ => {}
        }
    }

    // Decode common HTML entities. `&amp;` must be handled last: decoding it
    // first would turn an escaped entity such as `&amp;lt;` into `&lt;` and
    // then into `<`, i.e. decode the text twice.
    text.replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&quot;", "\"")
        .replace("&#39;", "'")
        .replace("&#x27;", "'")
        .replace("&apos;", "'")
        .replace("&nbsp;", " ")
        .replace("&amp;", "&")
}
|
||||||
|
|
||||||
|
/// Pull the value of the first double-quoted `href` attribute out of `text`.
///
/// Values starting with `http` are returned unchanged, protocol-relative
/// values (`//host/…`) are prefixed with `https:`, and anything else —
/// including a missing or unterminated attribute — yields `None`.
fn extract_href(text: &str) -> Option<String> {
    const HREF_OPEN: &str = "href=\"";

    let (_, after_attr) = text.split_once(HREF_OPEN)?;
    let (value, _) = after_attr.split_once('"')?;

    match value {
        absolute if absolute.starts_with("http") => Some(absolute.to_owned()),
        relative if relative.starts_with("//") => Some(format!("https:{}", relative)),
        _ => None,
    }
}
|
||||||
|
|
||||||
|
/// Extract the real URL from DDG's redirect link (uddg= parameter)
|
||||||
|
fn extract_href_uddg(text: &str) -> Option<String> {
|
||||||
|
// DDG HTML uses: href="//duckduckgo.com/l/?uddg=ENCODED_URL&..."
|
||||||
|
if let Some(idx) = text.find("uddg=") {
|
||||||
|
let rest = &text[idx + 5..];
|
||||||
|
let url_encoded = rest.split('&').next().unwrap_or("");
|
||||||
|
let decoded = url_encoded.replace("%3A", ":")
|
||||||
|
.replace("%2F", "/")
|
||||||
|
.replace("%3F", "?")
|
||||||
|
.replace("%3D", "=")
|
||||||
|
.replace("%26", "&")
|
||||||
|
.replace("%20", " ")
|
||||||
|
.replace("%25", "%");
|
||||||
|
if decoded.starts_with("http") {
|
||||||
|
return Some(decoded);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fallback: try regular href extraction
|
||||||
|
extract_href(text)
|
||||||
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::*;
|
use super::*;
|
||||||
@@ -1164,8 +1461,181 @@ mod tests {
|
|||||||
assert!(url.starts_with("http://localhost:8888/search?"));
|
assert!(url.starts_with("http://localhost:8888/search?"));
|
||||||
assert!(url.contains("format=json"));
|
assert!(url.contains("format=json"));
|
||||||
assert!(url.contains("categories=general"));
|
assert!(url.contains("categories=general"));
|
||||||
// Verify UTF-8 encoding, not Unicode codepoints
|
|
||||||
assert!(url.contains("%E4%B8%AD")); // 中 = E4 B8 AD
|
assert!(url.contains("%E4%B8%AD")); // 中 = E4 B8 AD
|
||||||
assert!(!url.contains("%4E2D")); // NOT Unicode codepoint
|
assert!(!url.contains("%4E2D")); // NOT Unicode codepoint
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// --- Native Search Helper Tests ---
|
||||||
|
|
||||||
|
#[test]
fn test_is_cjk_char_chinese() {
    // Ideographs and CJK punctuation are accepted.
    for ch in ['中', '医', '。'].iter() {
        assert!(is_cjk_char(*ch));
    }
    // ASCII letters, digits and spaces are not.
    for ch in ['a', '1', ' '].iter() {
        assert!(!is_cjk_char(*ch));
    }
}
|
||||||
|
|
||||||
|
#[test]
fn test_is_cjk_char_detects_chinese_query() {
    // Same predicate the search code applies to a whole query string.
    let contains_cjk = |text: &str| text.chars().any(|c| is_cjk_char(c));

    assert!(contains_cjk("2024年中国医疗政策"));
    assert!(!contains_cjk("Rust programming language"));
}
|
||||||
|
|
||||||
|
#[test]
fn test_strip_html_tags() {
    assert_eq!(strip_html_tags("<b>Hello</b>"), "Hello");
    assert_eq!(strip_html_tags("<a href=\"x\">Link</a>"), "Link");
    assert_eq!(strip_html_tags("plain text"), "plain text");
    // Entity-encoded input is decoded to the literal characters.
    assert_eq!(strip_html_tags("&amp;&lt;&gt;"), "&<>");
    // strip_html_tags only removes tags, not script content
    assert_eq!(strip_html_tags("<script>alert()</script>Safe"), "alert()Safe");
}
|
||||||
|
|
||||||
|
#[test]
fn test_extract_between_basic() {
    // The text enclosed by the two delimiters is returned without them.
    let extracted = extract_between("prefix<div>content</div>suffix", "<div>", "</div>");
    assert_eq!(extracted, Some("content"));
}
|
||||||
|
|
||||||
|
#[test]
fn test_extract_between_not_found() {
    // No delimiters in the input means there is nothing to extract.
    let extracted = extract_between("no delimiters here", "<div>", "</div>");
    assert!(extracted.is_none());
}
|
||||||
|
|
||||||
|
#[test]
fn test_extract_href() {
    // An absolute link is returned as-is.
    let found = extract_href(r#"<a href="https://example.com/page">Title</a>"#);
    assert_eq!(found.as_deref(), Some("https://example.com/page"));
}
|
||||||
|
|
||||||
|
#[test]
fn test_extract_href_protocol_relative() {
    // A `//host/...` link gains an explicit https scheme.
    let found = extract_href(r#"<a href="//example.com/page">Title</a>"#);
    assert_eq!(found.as_deref(), Some("https://example.com/page"));
}
|
||||||
|
|
||||||
|
#[test]
fn test_extract_href_uddg() {
    // The redirect target is taken from the `uddg` parameter and decoded.
    let found = extract_href_uddg(
        r#"href="//duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.com%2Fpage&rut=abc""#,
    );
    assert_eq!(found.as_deref(), Some("https://example.com/page"));
}
|
||||||
|
|
||||||
|
#[test]
fn test_extract_href_uddg_fallback() {
    // Without a `uddg` parameter the plain href is used.
    let found = extract_href_uddg(r#"<a href="https://example.com/direct">Title</a>"#);
    assert_eq!(found.as_deref(), Some("https://example.com/direct"));
}
|
||||||
|
|
||||||
|
// --- HTML Parser Tests ---
|
||||||
|
|
||||||
|
#[test]
fn test_parse_ddg_html() {
    // Two well-formed DDG result bodies.
    let page = r#"
<div class="result__body">
<a rel="nofollow" class="result__a" href="//duckduckgo.com/l/?uddg=https%3A%2F%2Frust-lang.org&rut=abc">Rust Programming Language</a>
<a class="result__snippet">A systems programming language focused on safety and speed.</a>
</div>
<div class="result__body">
<a rel="nofollow" class="result__a" href="//duckduckgo.com/l/?uddg=https%3A%2F%2Fdoc.rust-lang.org&rut=def">The Rust Book</a>
<a class="result__snippet">The official guide to Rust programming.</a>
</div>
"#;

    let parsed = create_test_hand().parse_ddg_html(page, 10);

    assert_eq!(parsed.len(), 2);
    let first = &parsed[0];
    assert_eq!(first.title, "Rust Programming Language");
    assert_eq!(first.url, "https://rust-lang.org");
    assert_eq!(first.source, "DuckDuckGo");
    assert_eq!(parsed[1].title, "The Rust Book");
}
|
||||||
|
|
||||||
|
#[test]
fn test_parse_ddg_html_max_results() {
    // Twenty result bodies on the page, but only five are requested.
    let page: String = (0..20)
        .map(|i| {
            format!(
                r#"<div class="result__body"><a class="result__a" href="//duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.com%2F{}">Result {}</a><a class="result__snippet">Snippet {}</a></div>"#,
                i, i, i
            )
        })
        .collect();

    let parsed = create_test_hand().parse_ddg_html(&page, 5);
    assert_eq!(parsed.len(), 5);
}
|
||||||
|
|
||||||
|
#[test]
fn test_parse_ddg_html_empty() {
    // A page without any result markers yields nothing.
    let parsed = create_test_hand().parse_ddg_html("<html><body>No results here</body></html>", 10);
    assert!(parsed.is_empty());
}
|
||||||
|
|
||||||
|
#[test]
fn test_parse_bing_html() {
    // Two organic Bing result items.
    let page = r#"
<li class="b_algo">
<h2><a href="https://example.com/result1">Example Result 1</a></h2>
<div class="b_caption"><p>This is the first result snippet.</p></div>
</li>
<li class="b_algo">
<h2><a href="https://example.com/result2">Example Result 2</a></h2>
<div class="b_caption"><p>This is the second result snippet.</p></div>
</li>
"#;

    let parsed = create_test_hand().parse_bing_html(page, 10);

    assert_eq!(parsed.len(), 2);
    let first = &parsed[0];
    assert_eq!(first.title, "Example Result 1");
    assert_eq!(first.url, "https://example.com/result1");
    assert_eq!(first.source, "Bing");
}
|
||||||
|
|
||||||
|
#[test]
fn test_parse_bing_html_skips_internal_urls() {
    // The first item links back into Bing and must be filtered out.
    let page = r#"
<li class="b_algo">
<h2><a href="https://bing.com/search?q=more">More Results</a></h2>
</li>
<li class="b_algo">
<h2><a href="https://example.com/real">Real Result</a></h2>
</li>
"#;

    let parsed = create_test_hand().parse_bing_html(page, 10);

    assert_eq!(parsed.len(), 1);
    assert_eq!(parsed[0].url, "https://example.com/real");
}
|
||||||
|
|
||||||
|
#[test]
fn test_parse_bing_html_empty() {
    // A page without any result markers yields nothing.
    let parsed = create_test_hand().parse_bing_html("<html><body>Nothing here</body></html>", 10);
    assert!(parsed.is_empty());
}
|
||||||
|
|
||||||
|
#[test]
fn test_parse_baidu_html() {
    // A single Baidu result container with a title link and an abstract.
    let page = r#"
<div class="result c-container">
<h3 class="t"><a href="https://www.example.cn/page1">中国医疗政策 2024</a></h3>
<div class="c-abstract">这是关于医疗政策的摘要信息。</div>
</div>
"#;

    let parsed = create_test_hand().parse_baidu_html(page, 10);

    assert_eq!(parsed.len(), 1);
    assert_eq!(parsed[0].source, "Baidu");
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user