fix(hands): 搜索引擎升级 — DDG改POST + Jina Reader内容提取
Some checks failed
CI / Lint & TypeCheck (push) Has been cancelled
CI / Unit Tests (push) Has been cancelled
CI / Build Frontend (push) Has been cancelled
CI / Rust Check (push) Has been cancelled
CI / Security Scan (push) Has been cancelled
CI / E2E Tests (push) Has been cancelled

借鉴DeerFlow(ddgs库)架构改进搜索:
1. DDG搜索从GET改为POST(form-encoded),匹配ddgs库行为
2. 新增Jina Reader API(r.jina.ai)用于网页内容提取,返回干净Markdown
3. Jina失败时自动降级到原有HTML解析
4. 支持 ZCLAW_JINA_API_KEY 环境变量(可选,免费tier无需key)
5. 内容截断4096字符(DeerFlow模式)

验证: 160 tests PASS, 0 warnings, workspace check clean
This commit is contained in:
iven
2026-04-22 12:59:48 +08:00
parent eede45b13d
commit 6d7457de56

View File

@@ -37,6 +37,7 @@ struct SearchConfig {
default_engine: SearchEngine,
searxng_url: String,
timeout_secs: u64,
jina_api_key: Option<String>,
}
impl Default for SearchConfig {
@@ -45,6 +46,7 @@ impl Default for SearchConfig {
default_engine: SearchEngine::Auto,
searxng_url: "http://localhost:8888".to_string(),
timeout_secs: 15,
jina_api_key: None,
}
}
}
@@ -101,6 +103,7 @@ impl SearchConfig {
searxng_url: s.searxng_url
.unwrap_or_else(|| "http://localhost:8888".to_string()),
timeout_secs: s.searxng_timeout.unwrap_or(15),
jina_api_key: std::env::var("ZCLAW_JINA_API_KEY").ok(),
}
}
None => Self::default(),
@@ -492,16 +495,18 @@ impl ResearcherHand {
Ok(results)
}
/// Search using DuckDuckGo HTML (real search results, not Instant Answer API)
/// Search using DuckDuckGo HTML (POST method, matching ddgs library behavior)
async fn search_duckduckgo_html(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
let url = format!(
"https://html.duckduckgo.com/html/?q={}",
url_encode(query)
);
let has_cjk = query.chars().any(|c| is_cjk_char(c));
let region = if has_cjk { "wt-wt" } else { "wt-wt" };
let body = format!("q={}&b=&l={}", url_encode(query), region);
let response = self.client
.get(&url)
.header("Accept", "text/html")
.post("https://html.duckduckgo.com/html/")
.header("Content-Type", "application/x-www-form-urlencoded")
.header("Accept", "text/html,application/xhtml+xml")
.header("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8")
.body(body)
.send()
.await
.map_err(|e| zclaw_types::ZclawError::HandError(
@@ -752,6 +757,7 @@ impl ResearcherHand {
}
/// Fetch content from a URL (with SSRF protection)
/// Tries Jina Reader API first for clean Markdown, falls back to direct fetch
async fn execute_fetch(&self, url: &str) -> Result<SearchResult> {
let start = std::time::Instant::now();
@@ -768,27 +774,13 @@ impl ResearcherHand {
}
}
let response = self.client
.get(url)
.send()
.await
.map_err(|e| zclaw_types::ZclawError::HandError(format!("Fetch request failed: {}", e)))?;
let content_type = response.headers()
.get(reqwest::header::CONTENT_TYPE)
.and_then(|v| v.to_str().ok())
.unwrap_or("");
let content = if content_type.contains("text/html") {
// Extract text from HTML
let html = response.text().await
.map_err(|e| zclaw_types::ZclawError::HandError(format!("Failed to read HTML: {}", e)))?;
self.extract_text_from_html(&html)
} else if content_type.contains("text/") || content_type.contains("application/json") {
response.text().await
.map_err(|e| zclaw_types::ZclawError::HandError(format!("Failed to read text: {}", e)))?
} else {
"[Binary content]".to_string()
// Try Jina Reader API first (returns clean Markdown)
let content = match self.fetch_via_jina(url).await {
Ok(text) => text,
Err(e) => {
tracing::warn!(target: "researcher", error = %e, "Jina Reader failed, falling back to direct fetch");
self.fetch_direct(url).await?
}
};
let result = SearchResult {
@@ -824,6 +816,80 @@ impl ResearcherHand {
Ok(result)
}
/// Fetch a page through the Jina Reader API and return it as Markdown (DeerFlow pattern).
///
/// Sends `POST https://r.jina.ai/` with a JSON body `{"url": <url>}` and the
/// `X-Return-Format: markdown` header. When `search_config.jina_api_key` is set,
/// it is sent as a `Bearer` token in the `Authorization` header.
///
/// On success the response body is truncated to at most 4096 characters
/// (counted as Unicode scalar values, so multi-byte text is never split).
///
/// # Errors
/// Returns `ZclawError::HandError` when the request cannot be sent, when Jina
/// answers with a non-success HTTP status, when the body cannot be read, or
/// when the body is empty / whitespace-only.
async fn fetch_via_jina(&self, url: &str) -> Result<String> {
    // Reuse the shared client instead of building a new one per call: a fresh
    // `reqwest::Client` starts with an empty connection pool, and the previous
    // `unwrap_or_else(|_| Client::new())` fallback silently lost the timeout.
    // The per-request timeout below applies to this request only.
    let mut builder = self.client
        .post("https://r.jina.ai/")
        // Kept above the `X-Timeout: 15` hint sent to Jina (presumably seconds —
        // confirm against the Jina Reader docs) so Jina can answer first.
        .timeout(std::time::Duration::from_secs(20))
        .header("Content-Type", "application/json")
        .header("X-Return-Format", "markdown")
        .header("X-Timeout", "15")
        .json(&serde_json::json!({ "url": url }));

    // Optional API key for higher rate limits
    if let Some(ref key) = self.search_config.jina_api_key {
        builder = builder.header("Authorization", format!("Bearer {}", key));
    }

    let response = builder.send().await
        .map_err(|e| zclaw_types::ZclawError::HandError(
            format!("Jina Reader request failed: {}", e)
        ))?;

    let status = response.status();
    if !status.is_success() {
        return Err(zclaw_types::ZclawError::HandError(
            format!("Jina Reader returned HTTP {}", status)
        ));
    }

    let text = response.text().await
        .map_err(|e| zclaw_types::ZclawError::HandError(
            format!("Failed to read Jina response: {}", e)
        ))?;

    // An empty body is reported as an error so the caller can fall back.
    if text.trim().is_empty() {
        return Err(zclaw_types::ZclawError::HandError(
            "Jina Reader returned empty response".to_string()
        ));
    }

    // Truncate to 4096 chars (DeerFlow pattern)
    let truncated: String = text.chars().take(4096).collect();
    Ok(truncated)
}
/// Fallback fetch used when Jina Reader is unavailable: plain HTTP GET on `url`.
///
/// The result depends on the response's `Content-Type` header:
/// - contains `text/html`: the body is passed through `extract_text_from_html`;
/// - contains `text/` or `application/json`: the body is returned unchanged;
/// - anything else (including a missing or non-UTF-8 header): the literal
///   placeholder `[Binary content]` is returned and the body is not read.
///
/// # Errors
/// Returns `ZclawError::HandError` if the request fails or the body cannot be read.
async fn fetch_direct(&self, url: &str) -> Result<String> {
    let resp = self.client.get(url).send().await.map_err(|err| {
        zclaw_types::ZclawError::HandError(format!("Direct fetch failed: {}", err))
    })?;

    // Classify the payload up front so the header borrow ends before the
    // response is consumed by `text()`.
    let mime = resp
        .headers()
        .get(reqwest::header::CONTENT_TYPE)
        .and_then(|value| value.to_str().ok())
        .unwrap_or("");
    let is_html = mime.contains("text/html");
    let is_textual = is_html || mime.contains("text/") || mime.contains("application/json");

    if !is_textual {
        return Ok("[Binary content]".to_string());
    }

    if is_html {
        let markup = resp.text().await.map_err(|err| {
            zclaw_types::ZclawError::HandError(format!("Failed to read HTML: {}", err))
        })?;
        return Ok(self.extract_text_from_html(&markup));
    }

    resp.text().await.map_err(|err| {
        zclaw_types::ZclawError::HandError(format!("Failed to read text: {}", err))
    })
}
/// Extract readable text from HTML
fn extract_text_from_html(&self, html: &str) -> String {
let html_lower = html.to_lowercase();