fix(hands): 搜索引擎升级 — DDG改POST + Jina Reader内容提取
Some checks failed
CI / Lint & TypeCheck (push) Has been cancelled
CI / Unit Tests (push) Has been cancelled
CI / Build Frontend (push) Has been cancelled
CI / Rust Check (push) Has been cancelled
CI / Security Scan (push) Has been cancelled
CI / E2E Tests (push) Has been cancelled
Some checks failed
CI / Lint & TypeCheck (push) Has been cancelled
CI / Unit Tests (push) Has been cancelled
CI / Build Frontend (push) Has been cancelled
CI / Rust Check (push) Has been cancelled
CI / Security Scan (push) Has been cancelled
CI / E2E Tests (push) Has been cancelled
借鉴DeerFlow(ddgs库)架构改进搜索: 1. DDG搜索从GET改为POST(form-encoded),匹配ddgs库行为 2. 新增Jina Reader API(r.jina.ai)用于网页内容提取,返回干净Markdown 3. Jina失败时自动降级到原有HTML解析 4. 支持 ZCLAW_JINA_API_KEY 环境变量(可选,免费tier无需key) 5. 内容截断4096字符(DeerFlow模式) 验证: 160 tests PASS, 0 warnings, workspace check clean
This commit is contained in:
@@ -37,6 +37,7 @@ struct SearchConfig {
|
||||
default_engine: SearchEngine,
|
||||
searxng_url: String,
|
||||
timeout_secs: u64,
|
||||
jina_api_key: Option<String>,
|
||||
}
|
||||
|
||||
impl Default for SearchConfig {
|
||||
@@ -45,6 +46,7 @@ impl Default for SearchConfig {
|
||||
default_engine: SearchEngine::Auto,
|
||||
searxng_url: "http://localhost:8888".to_string(),
|
||||
timeout_secs: 15,
|
||||
jina_api_key: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -101,6 +103,7 @@ impl SearchConfig {
|
||||
searxng_url: s.searxng_url
|
||||
.unwrap_or_else(|| "http://localhost:8888".to_string()),
|
||||
timeout_secs: s.searxng_timeout.unwrap_or(15),
|
||||
jina_api_key: std::env::var("ZCLAW_JINA_API_KEY").ok(),
|
||||
}
|
||||
}
|
||||
None => Self::default(),
|
||||
@@ -492,16 +495,18 @@ impl ResearcherHand {
|
||||
Ok(results)
|
||||
}
|
||||
|
||||
/// Search using DuckDuckGo HTML (real search results, not Instant Answer API)
|
||||
/// Search using DuckDuckGo HTML (POST method, matching ddgs library behavior)
|
||||
async fn search_duckduckgo_html(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
|
||||
let url = format!(
|
||||
"https://html.duckduckgo.com/html/?q={}",
|
||||
url_encode(query)
|
||||
);
|
||||
let has_cjk = query.chars().any(|c| is_cjk_char(c));
|
||||
let region = if has_cjk { "wt-wt" } else { "wt-wt" };
|
||||
let body = format!("q={}&b=&l={}", url_encode(query), region);
|
||||
|
||||
let response = self.client
|
||||
.get(&url)
|
||||
.header("Accept", "text/html")
|
||||
.post("https://html.duckduckgo.com/html/")
|
||||
.header("Content-Type", "application/x-www-form-urlencoded")
|
||||
.header("Accept", "text/html,application/xhtml+xml")
|
||||
.header("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8")
|
||||
.body(body)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| zclaw_types::ZclawError::HandError(
|
||||
@@ -752,6 +757,7 @@ impl ResearcherHand {
|
||||
}
|
||||
|
||||
/// Fetch content from a URL (with SSRF protection)
|
||||
/// Tries Jina Reader API first for clean Markdown, falls back to direct fetch
|
||||
async fn execute_fetch(&self, url: &str) -> Result<SearchResult> {
|
||||
let start = std::time::Instant::now();
|
||||
|
||||
@@ -768,27 +774,13 @@ impl ResearcherHand {
|
||||
}
|
||||
}
|
||||
|
||||
let response = self.client
|
||||
.get(url)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| zclaw_types::ZclawError::HandError(format!("Fetch request failed: {}", e)))?;
|
||||
|
||||
let content_type = response.headers()
|
||||
.get(reqwest::header::CONTENT_TYPE)
|
||||
.and_then(|v| v.to_str().ok())
|
||||
.unwrap_or("");
|
||||
|
||||
let content = if content_type.contains("text/html") {
|
||||
// Extract text from HTML
|
||||
let html = response.text().await
|
||||
.map_err(|e| zclaw_types::ZclawError::HandError(format!("Failed to read HTML: {}", e)))?;
|
||||
self.extract_text_from_html(&html)
|
||||
} else if content_type.contains("text/") || content_type.contains("application/json") {
|
||||
response.text().await
|
||||
.map_err(|e| zclaw_types::ZclawError::HandError(format!("Failed to read text: {}", e)))?
|
||||
} else {
|
||||
"[Binary content]".to_string()
|
||||
// Try Jina Reader API first (returns clean Markdown)
|
||||
let content = match self.fetch_via_jina(url).await {
|
||||
Ok(text) => text,
|
||||
Err(e) => {
|
||||
tracing::warn!(target: "researcher", error = %e, "Jina Reader failed, falling back to direct fetch");
|
||||
self.fetch_direct(url).await?
|
||||
}
|
||||
};
|
||||
|
||||
let result = SearchResult {
|
||||
@@ -824,6 +816,80 @@ impl ResearcherHand {
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
/// Fetch content via Jina Reader API — returns clean Markdown (DeerFlow pattern)
|
||||
async fn fetch_via_jina(&self, url: &str) -> Result<String> {
|
||||
let client = reqwest::Client::builder()
|
||||
.timeout(std::time::Duration::from_secs(20))
|
||||
.build()
|
||||
.unwrap_or_else(|_| reqwest::Client::new());
|
||||
|
||||
let mut builder = client
|
||||
.post("https://r.jina.ai/")
|
||||
.header("Content-Type", "application/json")
|
||||
.header("X-Return-Format", "markdown")
|
||||
.header("X-Timeout", "15")
|
||||
.json(&serde_json::json!({ "url": url }));
|
||||
|
||||
// Optional API key for higher rate limits
|
||||
if let Some(ref key) = self.search_config.jina_api_key {
|
||||
builder = builder.header("Authorization", format!("Bearer {}", key));
|
||||
}
|
||||
|
||||
let response = builder.send().await
|
||||
.map_err(|e| zclaw_types::ZclawError::HandError(
|
||||
format!("Jina Reader request failed: {}", e)
|
||||
))?;
|
||||
|
||||
let status = response.status();
|
||||
if !status.is_success() {
|
||||
return Err(zclaw_types::ZclawError::HandError(
|
||||
format!("Jina Reader returned HTTP {}", status)
|
||||
));
|
||||
}
|
||||
|
||||
let text = response.text().await
|
||||
.map_err(|e| zclaw_types::ZclawError::HandError(
|
||||
format!("Failed to read Jina response: {}", e)
|
||||
))?;
|
||||
|
||||
if text.trim().is_empty() {
|
||||
return Err(zclaw_types::ZclawError::HandError(
|
||||
"Jina Reader returned empty response".to_string()
|
||||
));
|
||||
}
|
||||
|
||||
// Truncate to 4096 chars (DeerFlow pattern)
|
||||
let truncated: String = text.chars().take(4096).collect();
|
||||
Ok(truncated)
|
||||
}
|
||||
|
||||
/// Direct HTTP fetch with HTML text extraction (fallback when Jina unavailable)
|
||||
async fn fetch_direct(&self, url: &str) -> Result<String> {
|
||||
let response = self.client
|
||||
.get(url)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| zclaw_types::ZclawError::HandError(format!("Direct fetch failed: {}", e)))?;
|
||||
|
||||
let content_type = response.headers()
|
||||
.get(reqwest::header::CONTENT_TYPE)
|
||||
.and_then(|v| v.to_str().ok())
|
||||
.unwrap_or("");
|
||||
|
||||
let content = if content_type.contains("text/html") {
|
||||
let html = response.text().await
|
||||
.map_err(|e| zclaw_types::ZclawError::HandError(format!("Failed to read HTML: {}", e)))?;
|
||||
self.extract_text_from_html(&html)
|
||||
} else if content_type.contains("text/") || content_type.contains("application/json") {
|
||||
response.text().await
|
||||
.map_err(|e| zclaw_types::ZclawError::HandError(format!("Failed to read text: {}", e)))?
|
||||
} else {
|
||||
"[Binary content]".to_string()
|
||||
};
|
||||
|
||||
Ok(content)
|
||||
}
|
||||
|
||||
/// Extract readable text from HTML
|
||||
fn extract_text_from_html(&self, html: &str) -> String {
|
||||
let html_lower = html.to_lowercase();
|
||||
|
||||
Reference in New Issue
Block a user