Files
zclaw_openfang/crates/zclaw-hands/src/hands/researcher.rs
iven 5816f56039
Some checks failed
CI / Lint & TypeCheck (push) Has been cancelled
CI / Unit Tests (push) Has been cancelled
CI / Build Frontend (push) Has been cancelled
CI / Rust Check (push) Has been cancelled
CI / Security Scan (push) Has been cancelled
CI / E2E Tests (push) Has been cancelled
fix(runtime,hands): 搜索功能修复 — glm空参数回退+schema简化
根因: glm-5.1 不理解 oneOf+const 复杂 schema,发送 tool_calls 时
arguments 为空 {}。同时缺少从对话上下文提取用户意图的回退机制。

修复:
1. researcher input_schema 从 oneOf+const 改为扁平化属性 — glm 正确传参
2. loop_runner 增加 empty-input 回退 — 从最近用户消息注入 _fallback_query
3. researcher infer_action 增加 _fallback_query 分支处理
4. 调试日志降级 INFO→DEBUG (openai tool_calls delta, researcher input)
2026-04-22 16:06:47 +08:00

2199 lines
74 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
//! Researcher Hand - Deep research and analysis capabilities
//!
//! This hand provides web search, content fetching, and research synthesis.
use async_trait::async_trait;
use serde::{Deserialize, Serialize};
use serde_json::{json, Value};
use std::collections::HashMap;
use std::net::{IpAddr, Ipv4Addr, Ipv6Addr};
use std::sync::Arc;
use tokio::sync::RwLock;
use url::Url;
use zclaw_types::Result;
use crate::{Hand, HandConfig, HandContext, HandResult};
/// Search backend selector.
///
/// Serialized with lowercase variant names, so the accepted wire values are
/// `searxng`, `google`, `bing`, `duckduckgo` and `auto`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum SearchEngine {
/// SearXNG meta-search instance (base URL comes from the search configuration).
SearXNG,
/// Google; `execute_search` currently downgrades this to Baidu.
Google,
/// Bing (`cn.bing.com` for queries containing CJK characters, `www.bing.com` otherwise).
Bing,
/// DuckDuckGo; `execute_search` currently downgrades this to Baidu.
DuckDuckGo,
/// No explicit preference: `execute_search` substitutes the configured default engine.
Auto,
}
impl Default for SearchEngine {
fn default() -> Self {
Self::Auto
}
}
/// Search configuration loaded from config/config.toml
///
/// Built once in `ResearcherHand::new` via `SearchConfig::load`.
#[derive(Debug, Clone)]
struct SearchConfig {
// Engine used when a query asks for `SearchEngine::Auto`.
default_engine: SearchEngine,
// Base URL of the SearXNG instance; a trailing `/` is stripped before use.
searxng_url: String,
// Per-request timeout (seconds) applied to SearXNG requests.
timeout_secs: u64,
// Optional Jina Reader API key, sent as a `Bearer` token when present.
jina_api_key: Option<String>,
}
impl Default for SearchConfig {
fn default() -> Self {
Self {
default_engine: SearchEngine::Auto,
searxng_url: "http://localhost:8888".to_string(),
timeout_secs: 15,
jina_api_key: None,
}
}
}
impl SearchConfig {
fn load() -> Self {
let path = "config/config.toml";
let content = match std::fs::read_to_string(path) {
Ok(c) => c,
Err(_) => return Self::default(),
};
#[derive(Deserialize)]
struct ToolsWebSearch {
default_engine: Option<String>,
#[allow(dead_code)]
max_results: Option<usize>,
searxng_url: Option<String>,
searxng_timeout: Option<u64>,
}
#[derive(Deserialize)]
struct ToolsWeb {
search: Option<ToolsWebSearch>,
}
#[derive(Deserialize)]
struct Tools {
web: Option<ToolsWeb>,
}
#[derive(Deserialize)]
struct Config {
tools: Option<Tools>,
}
let config: Config = match toml::from_str(&content) {
Ok(c) => c,
Err(_) => return Self::default(),
};
let search = config.tools
.and_then(|t| t.web)
.and_then(|w| w.search);
match search {
Some(s) => {
let engine = s.default_engine
.as_deref()
.and_then(|e| serde_json::from_str(&format!("\"{}\"", e)).ok())
.unwrap_or_default();
Self {
default_engine: engine,
searxng_url: s.searxng_url
.unwrap_or_else(|| "http://localhost:8888".to_string()),
timeout_secs: s.searxng_timeout.unwrap_or(15),
jina_api_key: std::env::var("ZCLAW_JINA_API_KEY").ok(),
}
}
None => Self::default(),
}
}
}
/// Research depth level.
///
/// In the code visible in this file, depth only controls how many of the top
/// search results `execute_report` fetches full content for.
/// Serialized as `quick`, `standard` or `deep`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum ResearchDepth {
Quick, // Report fetches full content for the top 1 result
Standard, // Report fetches full content for the top 3 results
Deep, // Report fetches full content for the top 5 results
}
impl Default for ResearchDepth {
fn default() -> Self {
Self::Standard
}
}
/// Research query configuration.
///
/// Field names are camelCase on the wire (`maxResults`, `includeRelated`,
/// `timeLimitSecs`). Only `query` is required; every other field has a default.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct ResearchQuery {
/// Search query text; must be non-blank and within `MAX_QUERY_LENGTH` (see `validate`)
pub query: String,
/// Search engine to use (`Auto` defers to the configured default engine)
#[serde(default)]
pub engine: SearchEngine,
/// Research depth (controls how many results a report fetches content for)
#[serde(default)]
pub depth: ResearchDepth,
/// Maximum results to return (defaults to 10, rejected above `MAX_RESULTS_CAP`)
#[serde(default = "default_max_results")]
pub max_results: usize,
/// Include related topics.
/// NOTE(review): no code in the visible part of this file reads this flag.
#[serde(default)]
pub include_related: bool,
/// Time limit in seconds (defaults to 60).
/// NOTE(review): no code in the visible part of this file enforces this limit.
#[serde(default = "default_time_limit")]
pub time_limit_secs: u64,
}
/// Serde default for `ResearchQuery::max_results`.
fn default_max_results() -> usize {
    10
}

/// Serde default for `ResearchQuery::time_limit_secs`.
fn default_time_limit() -> u64 {
    60
}
// Upper bound on the query text accepted by `ResearchQuery::validate`.
const MAX_QUERY_LENGTH: usize = 500;
// Largest `max_results` value a query may request.
const MAX_RESULTS_CAP: usize = 50;
// Longest URL (in bytes) accepted by `validate_fetch_url`.
const MAX_URL_LENGTH: usize = 2048;
// Number of fetched pages kept in the in-memory cache before one is evicted.
const CACHE_MAX_ENTRIES: usize = 200;
impl ResearchQuery {
    /// Check that the query is usable before any request is sent.
    ///
    /// # Errors
    /// Returns a user-facing message when the query is blank, longer than
    /// `MAX_QUERY_LENGTH` characters, or asks for more than `MAX_RESULTS_CAP`
    /// results.
    fn validate(&self) -> std::result::Result<(), String> {
        if self.query.trim().is_empty() {
            return Err("搜索查询不能为空".to_string());
        }
        // Count characters, not bytes: the message promises a character limit,
        // and a byte count would reject CJK queries (3 bytes per character in
        // UTF-8) at roughly a third of the advertised length.
        if self.query.chars().count() > MAX_QUERY_LENGTH {
            return Err(format!("查询过长(上限 {} 字符)", MAX_QUERY_LENGTH));
        }
        if self.max_results > MAX_RESULTS_CAP {
            return Err(format!("max_results 上限为 {}", MAX_RESULTS_CAP));
        }
        Ok(())
    }
}
/// Search result item.
///
/// Produced both by the search backends (one per hit, `content` empty) and by
/// `execute_fetch` (one per fetched page, `content` filled in).
/// Field names are camelCase on the wire (`fetchedAt`).
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct SearchResult {
/// Title of the result (the URL itself for fetched pages)
pub title: String,
/// URL
pub url: String,
/// Snippet/summary (first 500 characters of the content for fetched pages)
pub snippet: String,
/// Source name (engine name for search hits, the URL for fetched pages)
pub source: String,
/// Relevance score (0-100); the HTML backends assign a fixed value per engine
#[serde(default)]
pub relevance: u8,
/// Fetched content (if available)
#[serde(default)]
pub content: Option<String>,
/// Timestamp (RFC 3339, UTC) of when the item was produced
#[serde(default)]
pub fetched_at: Option<String>,
}
/// Research report assembled by `execute_report`.
///
/// Field names are camelCase on the wire (`keyFindings`, `relatedTopics`,
/// `researchedAt`, `durationMs`).
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct ResearchReport {
/// Original query text
pub query: String,
/// Search results (the top ones carry fetched `content`)
pub results: Vec<SearchResult>,
/// Synthesized summary built from the snippets of the top results
#[serde(default)]
pub summary: Option<String>,
/// Key findings: leading sentences of the fetched content, one entry per result
#[serde(default)]
pub key_findings: Vec<String>,
/// Related topics discovered (titles of results with a substantial snippet)
#[serde(default)]
pub related_topics: Vec<String>,
/// Research timestamp (RFC 3339, UTC)
pub researched_at: String,
/// Total time spent (ms)
pub duration_ms: u64,
}
/// Researcher action types.
///
/// Internally tagged on the `action` key. Strict deserialization therefore
/// needs both the tag and a correctly shaped payload (for `search`/`report`,
/// `query` must be a `ResearchQuery` object). Inputs that do not match are
/// handed to `ResearcherHand::infer_action` by `execute`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "action")]
pub enum ResearcherAction {
/// Run a web search and return the hits.
#[serde(rename = "search")]
Search { query: ResearchQuery },
/// Fetch the content of a single URL.
#[serde(rename = "fetch")]
Fetch { url: String },
/// Fetch several URLs (`execute` processes at most the first five).
#[serde(rename = "summarize")]
Summarize { urls: Vec<String> },
/// Search, fetch the top results and assemble a `ResearchReport`.
#[serde(rename = "report")]
Report { query: ResearchQuery },
}
/// Researcher Hand implementation.
///
/// Holds the static hand configuration, the search settings loaded at
/// construction time, a shared HTTP client and a cache of fetched pages.
pub struct ResearcherHand {
// Static hand metadata and input schema returned by `Hand::config`.
config: HandConfig,
// Engine selection, SearXNG endpoint/timeout and optional Jina key.
search_config: SearchConfig,
// HTTP client shared by the search backends and the direct-fetch fallback.
client: reqwest::Client,
// Fetched pages keyed by URL; bounded by `CACHE_MAX_ENTRIES`.
cache: Arc<RwLock<HashMap<String, SearchResult>>>,
}
impl ResearcherHand {
/// Create a new researcher hand
pub fn new() -> Self {
Self {
config: HandConfig {
id: "researcher".to_string(),
name: "研究员".to_string(),
description: "深度研究和分析能力,支持网络搜索和内容获取".to_string(),
needs_approval: false,
dependencies: vec!["network".to_string()],
input_schema: Some(serde_json::json!({
"type": "object",
"properties": {
"action": {
"type": "string",
"enum": ["search", "fetch", "report", "summarize"],
"description": "Action to perform: search (web search), fetch (get URL content), report (deep research), summarize (multiple URLs)"
},
"query": {
"type": "string",
"description": "Search query string for search/report actions"
},
"url": {
"type": "string",
"description": "URL to fetch content from"
},
"urls": {
"type": "array",
"items": { "type": "string" },
"description": "List of URLs to summarize"
},
"engine": {
"type": "string",
"enum": ["auto", "searxng", "google", "bing", "duckduckgo"],
"description": "Search engine preference"
}
},
"description": "Provide 'query' for search/report, or 'url' for fetch, or 'urls' for summarize"
})),
tags: vec!["research".to_string(), "web".to_string(), "search".to_string()],
enabled: true,
max_concurrent: 0,
timeout_secs: 0,
},
search_config: SearchConfig::load(),
client: reqwest::Client::builder()
.timeout(std::time::Duration::from_secs(30))
.user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36")
.redirect(reqwest::redirect::Policy::limited(3))
.build()
.unwrap_or_else(|_| reqwest::Client::new()),
cache: Arc::new(RwLock::new(HashMap::new())),
}
}
/// Infer action from input fields when LLM omits the `action` field.
/// Many LLMs (especially non-OpenAI models like glm) call tools without
/// including the enum tag, e.g. sending `{"query": "search terms"}` instead
/// of `{"action": "search", "query": "search terms"}`.
fn infer_action(input: &Value) -> Result<ResearcherAction> {
// Debug: log all keys in the input
let keys: Vec<&str> = input.as_object()
.map(|obj| obj.keys().map(|k| k.as_str()).collect())
.unwrap_or_default();
tracing::debug!(target: "researcher", ?keys, %input, "infer_action examining input");
// Check for action field with wrong value
if let Some(action) = input.get("action").and_then(|v| v.as_str()) {
if action == "search" || action == "report" {
if let Some(query_val) = input.get("query") {
let query = Self::parse_query(query_val);
if !query.query.trim().is_empty() {
return Ok(if action == "report" {
ResearcherAction::Report { query }
} else {
ResearcherAction::Search { query }
});
}
}
}
if action == "fetch" {
if let Some(url) = input.get("url").and_then(|v| v.as_str()) {
return Ok(ResearcherAction::Fetch { url: url.to_string() });
}
}
}
// Has "url" (singular) → fetch
if let Some(url) = input.get("url").and_then(|v| v.as_str()) {
if !url.is_empty() && url.starts_with("http") {
return Ok(ResearcherAction::Fetch { url: url.to_string() });
}
}
// Has "urls" (plural) → summarize
if let Some(urls) = input.get("urls").and_then(|v| v.as_array()) {
let url_list: Vec<String> = urls.iter()
.filter_map(|v| v.as_str().map(|s| s.to_string()))
.collect();
if !url_list.is_empty() {
return Ok(ResearcherAction::Summarize { urls: url_list });
}
}
// Has "query" → search
if let Some(query_val) = input.get("query") {
let query = Self::parse_query(query_val);
if !query.query.trim().is_empty() {
return Ok(ResearcherAction::Search { query });
}
}
// Has "search" or "search_query" → search
for key in &["search", "search_query", "keyword", "keywords", "q", "text"] {
if let Some(val) = input.get(key) {
let query = Self::parse_query(val);
if !query.query.trim().is_empty() {
return Ok(ResearcherAction::Search { query });
}
}
}
// Check for injected fallback query from loop_runner (when LLM sends empty args)
if let Some(fallback) = input.get("_fallback_query").and_then(|v| v.as_str()) {
if !fallback.trim().is_empty() {
tracing::debug!(target: "researcher", query = %fallback, "Using fallback user message as search query");
return Ok(ResearcherAction::Search { query: ResearchQuery {
query: fallback.to_string(),
engine: SearchEngine::Auto,
depth: ResearchDepth::Standard,
max_results: 10,
include_related: false,
time_limit_secs: 60,
}});
}
}
// Last resort: if any string field looks like a search query
if let Some(obj) = input.as_object() {
for (key, val) in obj {
if let Some(s) = val.as_str() {
if s.len() > 2 && !s.starts_with("http") && key != "action" && key != "engine" {
tracing::debug!(target: "researcher", key = %key, value = %s, "Using fallback field as query");
return Ok(ResearcherAction::Search { query: ResearchQuery {
query: s.to_string(),
engine: SearchEngine::Auto,
depth: ResearchDepth::Standard,
max_results: 10,
include_related: false,
time_limit_secs: 60,
}});
}
}
}
}
Err(zclaw_types::ZclawError::HandError(
"无法识别搜索意图:请提供 query搜索或 url获取网页参数".to_string()
))
}
/// Build a [`ResearchQuery`] from a JSON value.
///
/// A string becomes the query text with default settings. Any other value is
/// first deserialized as a full `ResearchQuery`; if that fails, the text is
/// taken from the first present key among `query`, `search`, `q`, `keyword`
/// (empty when that key is absent or not a string) with default settings.
fn parse_query(query_val: &Value) -> ResearchQuery {
    // Query with the given text and every other field at its default.
    let with_defaults = |text: &str| ResearchQuery {
        query: text.to_string(),
        engine: SearchEngine::Auto,
        depth: ResearchDepth::Standard,
        max_results: 10,
        include_related: false,
        time_limit_secs: 60,
    };

    if let Some(text) = query_val.as_str() {
        return with_defaults(text);
    }

    match serde_json::from_value::<ResearchQuery>(query_val.clone()) {
        Ok(parsed) => parsed,
        Err(_) => {
            let text = ["query", "search", "q", "keyword"]
                .iter()
                .find_map(|key| query_val.get(key))
                .and_then(|v| v.as_str())
                .unwrap_or("");
            with_defaults(text)
        }
    }
}
/// Execute a web search — route to the configured backend
async fn execute_search(&self, query: &ResearchQuery) -> Result<Vec<SearchResult>> {
query.validate().map_err(|e| zclaw_types::ZclawError::HandError(e))?;
let max_results = query.max_results.min(MAX_RESULTS_CAP);
let start = std::time::Instant::now();
let engine = match &query.engine {
SearchEngine::Auto => &self.search_config.default_engine,
other => other,
};
let results = match engine {
SearchEngine::SearXNG => {
match self.search_searxng(&query.query, max_results).await {
Ok(r) if !r.is_empty() => r,
_ => self.search_native(&query.query, max_results).await?,
}
}
SearchEngine::Auto => {
self.search_native(&query.query, max_results).await?
}
SearchEngine::DuckDuckGo => {
// DDG在国内不可用降级到百度
tracing::warn!(target: "researcher", "DuckDuckGo在国内不可用降级到百度");
self.search_baidu(&query.query, max_results).await?
}
SearchEngine::Google => {
tracing::warn!(target: "researcher", "Google在国内不可用降级到百度");
self.search_baidu(&query.query, max_results).await?
}
SearchEngine::Bing => {
self.search_bing(&query.query, max_results).await?
}
};
let duration = start.elapsed().as_millis() as u64;
tracing::info!(
target: "researcher",
query = %query.query,
engine = ?engine,
duration_ms = duration,
results_count = results.len(),
"Search completed"
);
Ok(results)
}
/// Rust-native multi-engine search — optimized for China mainland users
/// Priority: Baidu + Bing CN (both always work in China)
/// DuckDuckGo as optional fallback (may be blocked by GFW)
async fn search_native(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
let mut all_results = Vec::new();
// Always use Baidu + Bing CN in parallel (both work in China)
let baidu_fut = self.search_baidu(query, max_results);
let bing_fut = self.search_bing(query, max_results);
let (baidu_res, bing_res) = tokio::join!(
async { baidu_fut.await },
async { bing_fut.await },
);
if let Ok(r) = baidu_res {
all_results.extend(r);
}
if let Ok(r) = bing_res {
all_results.extend(r);
}
// If both primary engines returned nothing, try DDG as last resort
if all_results.is_empty() {
tracing::info!(target: "researcher", "Primary engines empty, trying DuckDuckGo as fallback");
if let Ok(r) = self.search_duckduckgo_html(query, max_results).await {
all_results.extend(r);
}
}
// Deduplicate by URL
let mut seen_urls = std::collections::HashSet::new();
all_results.retain(|r| seen_urls.insert(r.url.to_lowercase()));
// Sort by relevance descending, take top N
all_results.sort_by(|a, b| b.relevance.cmp(&a.relevance));
all_results.truncate(max_results);
if all_results.is_empty() {
tracing::warn!(target: "researcher", "All native engines returned empty for query: {}", query);
}
Ok(all_results)
}
/// Search using SearXNG meta-search engine (aggregates 70+ engines)
async fn search_searxng(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
let url = format!(
"{}/search?q={}&format=json&categories=general&language=auto&pageno=1",
self.search_config.searxng_url.trim_end_matches('/'),
url_encode(query)
);
let response = self.client
.get(&url)
.timeout(std::time::Duration::from_secs(self.search_config.timeout_secs))
.send()
.await
.map_err(|e| zclaw_types::ZclawError::HandError(
format!("SearXNG request failed: {}", e)
))?;
let status = response.status();
if !status.is_success() {
return Err(zclaw_types::ZclawError::HandError(
format!("SearXNG returned HTTP {}", status)
));
}
let json: Value = response.json().await
.map_err(|e| zclaw_types::ZclawError::HandError(
format!("Failed to parse SearXNG response: {}", e)
))?;
let mut results = Vec::new();
if let Some(items) = json.get("results").and_then(|v| v.as_array()) {
for item in items.iter().take(max_results) {
let title = item.get("title")
.and_then(|v| v.as_str())
.unwrap_or("")
.to_string();
let url = item.get("url")
.and_then(|v| v.as_str())
.unwrap_or("")
.to_string();
let snippet = item.get("content")
.and_then(|v| v.as_str())
.unwrap_or("")
.to_string();
let engines = item.get("engines")
.and_then(|v| v.as_array())
.map(|arr| {
arr.iter()
.filter_map(|e| e.as_str())
.collect::<Vec<_>>()
.join(",")
})
.unwrap_or_default();
let score = item.get("score")
.and_then(|v| v.as_f64())
.unwrap_or(0.0);
// Normalize score to 0-100 range
let relevance = if score > 0.0 {
(score.min(10.0) * 10.0) as u8
} else {
50
};
if !title.is_empty() && !url.is_empty() {
results.push(SearchResult {
title,
url,
snippet,
source: if engines.is_empty() {
"SearXNG".to_string()
} else {
format!("SearXNG({})", engines)
},
relevance,
content: None,
fetched_at: Some(chrono::Utc::now().to_rfc3339()),
});
}
}
}
Ok(results)
}
/// Search using the DuckDuckGo HTML endpoint (form POST, matching the
/// behaviour of the `ddgs` library).
///
/// # Errors
/// Returns `HandError` when the request fails, the status is not 2xx, or the
/// body cannot be read.
async fn search_duckduckgo_html(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
    // "wt-wt" means "no region". The original picked it in both branches of a
    // CJK check, so the check was dead work and is gone.
    // NOTE(review): if CJK queries were meant to use a regional code, set it here.
    const REGION: &str = "wt-wt";
    let body = format!("q={}&b=&l={}", url_encode(query), REGION);

    let response = self.client
        .post("https://html.duckduckgo.com/html/")
        .header("Content-Type", "application/x-www-form-urlencoded")
        .header("Accept", "text/html,application/xhtml+xml")
        .header("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8")
        .body(body)
        .send()
        .await
        .map_err(|e| zclaw_types::ZclawError::HandError(
            format!("DuckDuckGo HTML search failed: {}", e)
        ))?;

    let status = response.status();
    if !status.is_success() {
        return Err(zclaw_types::ZclawError::HandError(
            format!("DuckDuckGo returned HTTP {}", status)
        ));
    }

    let html = response.text().await
        .map_err(|e| zclaw_types::ZclawError::HandError(
            format!("Failed to read DuckDuckGo response: {}", e)
        ))?;
    Ok(self.parse_ddg_html(&html, max_results))
}
/// Extract up to `max_results` hits from a DuckDuckGo HTML results page.
///
/// The page is split on the `result__body` class marker. Chunks without a
/// `result__a` title link, with an empty title or URL, or rejected by
/// `is_quality_result` are skipped.
fn parse_ddg_html(&self, html: &str, max_results: usize) -> Vec<SearchResult> {
    let mut hits = Vec::new();
    let mut blocks = html.split("class=\"result__body\"");

    while hits.len() < max_results {
        let block = match blocks.next() {
            Some(block) => block,
            None => break,
        };

        // The title link looks like `href="//duckduckgo.com/l/?uddg=...">Title Text`;
        // the visible title is whatever follows the last '>'.
        let title = match extract_between(block, "result__a", "</a>") {
            Some(link) => link
                .rsplit('>')
                .next()
                .map(|text| strip_html_tags(text).trim().to_string())
                .unwrap_or_default(),
            None => continue,
        };
        let url = extract_href_uddg(block).unwrap_or_default();
        let snippet = extract_between(block, "result__snippet", "</a>")
            .map(|raw| {
                raw.rsplit('>')
                    .next()
                    .map(|text| strip_html_tags(text).trim().to_string())
                    .unwrap_or_default()
            })
            .unwrap_or_default();

        if title.is_empty() || url.is_empty() || !is_quality_result(&title, &snippet, &url) {
            continue;
        }

        hits.push(SearchResult {
            title,
            url,
            snippet,
            source: "DuckDuckGo".to_string(),
            relevance: 70,
            content: None,
            fetched_at: Some(chrono::Utc::now().to_rfc3339()),
        });
    }
    hits
}
/// Search using Bing (works well for both Chinese and English)
async fn search_bing(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
let has_cjk = query.chars().any(|c| is_cjk_char(c));
let url = if has_cjk {
format!(
"https://cn.bing.com/search?q={}&count={}&setlang=zh-Hans",
url_encode(query),
max_results
)
} else {
format!(
"https://www.bing.com/search?q={}&count={}",
url_encode(query),
max_results
)
};
let response = self.client
.get(&url)
.header("Accept", "text/html,application/xhtml+xml")
.header("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8")
.send()
.await
.map_err(|e| zclaw_types::ZclawError::HandError(
format!("Bing search failed: {}", e)
))?;
let status = response.status();
if !status.is_success() {
return Err(zclaw_types::ZclawError::HandError(
format!("Bing returned HTTP {}", status)
));
}
let html = response.text().await
.map_err(|e| zclaw_types::ZclawError::HandError(
format!("Failed to read Bing response: {}", e)
))?;
Ok(self.parse_bing_html(&html, max_results))
}
/// Extract up to `max_results` hits from a Bing HTML results page.
///
/// The page is split on the `b_algo` class marker (Bing wraps each organic
/// result in `<li class="b_algo">`). Chunks with no title or URL, Bing-internal
/// links, and anything rejected by `is_quality_result` are skipped.
fn parse_bing_html(&self, html: &str, max_results: usize) -> Vec<SearchResult> {
    html.split("class=\"b_algo\"")
        .filter_map(|block| {
            // Title: text of the first <a> in the block.
            let title = extract_between(block, ">", "</a>")
                .map(|s| strip_html_tags(s).trim().to_string())
                .unwrap_or_default();
            // URL: href attribute of the first <a>.
            let url = extract_href(block).unwrap_or_default();
            // Snippet: first <p>, else the b_caption container.
            let snippet = extract_between(block, "<p>", "</p>")
                .or_else(|| extract_between(block, "b_caption", "</div>"))
                .map(|s| strip_html_tags(s).trim().to_string())
                .unwrap_or_default();

            if title.is_empty() || url.is_empty() {
                return None;
            }
            // Skip Bing internal URLs
            let internal = url.contains("bing.com/search") || url.contains("go.microsoft.com");
            if internal || !is_quality_result(&title, &snippet, &url) {
                return None;
            }

            Some(SearchResult {
                title,
                url,
                snippet,
                source: "Bing".to_string(),
                relevance: 75,
                content: None,
                fetched_at: Some(chrono::Utc::now().to_rfc3339()),
            })
        })
        .take(max_results)
        .collect()
}
/// Search Baidu by scraping its HTML results page (the main source for
/// Chinese-language content).
async fn search_baidu(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
    let encoded = url_encode(query);
    let url = format!("https://www.baidu.com/s?wd={}&rn={}", encoded, max_results);

    let request = self
        .client
        .get(&url)
        .header("Accept", "text/html,application/xhtml+xml")
        .header("Accept-Language", "zh-CN,zh;q=0.9");
    let response = request.send().await.map_err(|e| {
        zclaw_types::ZclawError::HandError(format!("Baidu search failed: {}", e))
    })?;

    let status = response.status();
    if !status.is_success() {
        return Err(zclaw_types::ZclawError::HandError(format!(
            "Baidu returned HTTP {}",
            status
        )));
    }

    let html = response.text().await.map_err(|e| {
        zclaw_types::ZclawError::HandError(format!("Failed to read Baidu response: {}", e))
    })?;
    Ok(self.parse_baidu_html(&html, max_results))
}
/// Extract up to `max_results` hits from a Baidu HTML results page.
///
/// Baidu uses several container classes ("result c-container",
/// "c-container new-pmd", "result-op c-container"), so the page is split on
/// the shared `c-container` token. The text before the first marker is
/// dropped, as are chunks without an absolute `href`, chunks with an empty
/// title or URL, and anything rejected by `is_quality_result`.
fn parse_baidu_html(&self, html: &str, max_results: usize) -> Vec<SearchResult> {
    html.split("c-container")
        .skip(1)
        .filter(|block| block.contains("href=\"http"))
        .filter_map(|block| {
            let title = extract_between(block, ">", "</a>")
                .map(|s| strip_html_tags(s).trim().to_string())
                .unwrap_or_default();
            let url = extract_href(block).unwrap_or_default();
            let snippet = extract_between(block, "c-abstract", "</div>")
                .or_else(|| extract_between(block, "content-right_", "</div>"))
                .map(|s| strip_html_tags(s).trim().to_string())
                .unwrap_or_default();

            if title.is_empty() || url.is_empty() || !is_quality_result(&title, &snippet, &url) {
                return None;
            }

            Some(SearchResult {
                title,
                url,
                snippet,
                source: "Baidu".to_string(),
                relevance: 80,
                content: None,
                fetched_at: Some(chrono::Utc::now().to_rfc3339()),
            })
        })
        .take(max_results)
        .collect()
}
/// Fetch the content of `url`, with SSRF validation and caching.
///
/// The URL is validated first, then served from the cache when a cached entry
/// with content exists. Otherwise Jina Reader is tried for clean Markdown,
/// with a direct HTTP fetch as fallback, and the result is cached.
///
/// NOTE(review): only the initial URL is validated here. The shared client
/// follows up to three redirects, so confirm whether redirect targets need the
/// same SSRF check.
async fn execute_fetch(&self, url: &str) -> Result<SearchResult> {
    let timer = std::time::Instant::now();

    // SSRF validation
    validate_fetch_url(url)?;

    // Cache lookup; the read guard is released at the end of this statement.
    let cached_hit = self
        .cache
        .read()
        .await
        .get(url)
        .filter(|entry| entry.content.is_some())
        .cloned();
    if let Some(hit) = cached_hit {
        return Ok(hit);
    }

    // Prefer Jina Reader (clean Markdown); fall back to a direct fetch.
    let jina_attempt = self.fetch_via_jina(url).await;
    let content = match jina_attempt {
        Ok(markdown) => markdown,
        Err(e) => {
            tracing::warn!(target: "researcher", error = %e, "Jina Reader failed, falling back to direct fetch");
            self.fetch_direct(url).await?
        }
    };

    let snippet: String = content.chars().take(500).collect();
    let result = SearchResult {
        title: url.to_string(),
        url: url.to_string(),
        snippet,
        source: url.to_string(),
        relevance: 100,
        content: Some(content),
        fetched_at: Some(chrono::Utc::now().to_rfc3339()),
    };

    // Store in the cache, evicting one arbitrary entry when it is full.
    {
        let mut cache = self.cache.write().await;
        if cache.len() >= CACHE_MAX_ENTRIES {
            let victim = cache.keys().next().cloned();
            if let Some(key) = victim {
                cache.remove(&key);
            }
        }
        cache.insert(url.to_string(), result.clone());
    }

    let duration = timer.elapsed().as_millis() as u64;
    tracing::info!(
        target: "researcher",
        url = url,
        duration_ms = duration,
        "Fetch completed"
    );
    Ok(result)
}
/// Fetch `url` through the Jina Reader API, which returns clean Markdown
/// (DeerFlow pattern). The result is cut to the first 4096 characters.
///
/// NOTE(review): a new `reqwest::Client` is built on every call, so
/// connections are not pooled across fetches. Confirm whether the shared
/// client (with a per-request timeout) could be used instead.
///
/// # Errors
/// Returns `HandError` when the request fails, the status is not 2xx, the body
/// cannot be read, or the body is blank.
async fn fetch_via_jina(&self, url: &str) -> Result<String> {
    let client = reqwest::Client::builder()
        .timeout(std::time::Duration::from_secs(20))
        .build()
        .unwrap_or_else(|_| reqwest::Client::new());

    let payload = serde_json::json!({ "url": url });
    let request = client
        .post("https://r.jina.ai/")
        .header("Content-Type", "application/json")
        .header("X-Return-Format", "markdown")
        .header("X-Timeout", "15")
        .json(&payload);

    // Optional API key for higher rate limits
    let request = match self.search_config.jina_api_key {
        Some(ref key) => request.header("Authorization", format!("Bearer {}", key)),
        None => request,
    };

    let response = request.send().await.map_err(|e| {
        zclaw_types::ZclawError::HandError(format!("Jina Reader request failed: {}", e))
    })?;

    let status = response.status();
    if !status.is_success() {
        return Err(zclaw_types::ZclawError::HandError(format!(
            "Jina Reader returned HTTP {}",
            status
        )));
    }

    let text = response.text().await.map_err(|e| {
        zclaw_types::ZclawError::HandError(format!("Failed to read Jina response: {}", e))
    })?;
    if text.trim().is_empty() {
        return Err(zclaw_types::ZclawError::HandError(
            "Jina Reader returned empty response".to_string(),
        ));
    }

    // Truncate to 4096 chars (DeerFlow pattern)
    Ok(text.chars().take(4096).collect::<String>())
}
/// Direct HTTP fetch with HTML text extraction (fallback when Jina is
/// unavailable).
///
/// HTML bodies are reduced to readable text. Other `text/*` and JSON bodies
/// are returned as-is. Anything else yields the placeholder `[Binary content]`.
///
/// # Errors
/// Returns `HandError` when the request fails, the server answers with a
/// non-2xx status, or the body cannot be read.
async fn fetch_direct(&self, url: &str) -> Result<String> {
    let response = self.client
        .get(url)
        .send()
        .await
        .map_err(|e| zclaw_types::ZclawError::HandError(format!("Direct fetch failed: {}", e)))?;

    // Every other request in this file rejects non-success statuses. Without
    // this check a 404/500 error page would be returned, and cached by the
    // caller, as if it were the page content.
    let status = response.status();
    if !status.is_success() {
        return Err(zclaw_types::ZclawError::HandError(
            format!("Direct fetch returned HTTP {}", status)
        ));
    }

    // Media types are case-insensitive. Copy the header into an owned string
    // so the response can be consumed below.
    let content_type = response.headers()
        .get(reqwest::header::CONTENT_TYPE)
        .and_then(|v| v.to_str().ok())
        .unwrap_or("")
        .to_ascii_lowercase();

    let content = if content_type.contains("text/html") {
        let html = response.text().await
            .map_err(|e| zclaw_types::ZclawError::HandError(format!("Failed to read HTML: {}", e)))?;
        self.extract_text_from_html(&html)
    } else if content_type.contains("text/") || content_type.contains("application/json") {
        response.text().await
            .map_err(|e| zclaw_types::ZclawError::HandError(format!("Failed to read text: {}", e)))?
    } else {
        "[Binary content]".to_string()
    };
    Ok(content)
}
/// Extract readable text from HTML.
///
/// Drops everything inside tags and inside `<script>`/`<style>` elements,
/// collapses runs of whitespace into a single space, and limits the output to
/// about 10 000 bytes (cut on a character boundary, followed by `...`).
fn extract_text_from_html(&self, html: &str) -> String {
    const MAX_TEXT_BYTES: usize = 10000;

    // ASCII-only lowercasing keeps byte offsets identical to `html`, so `pos`
    // from `char_indices` is always a valid boundary here. Full Unicode
    // lowercasing can change byte lengths and make the slice below panic.
    let html_lower = html.to_ascii_lowercase();
    let mut text = String::new();
    let mut in_tag = false;
    let mut in_script = false;
    let mut in_style = false;

    for (pos, c) in html.char_indices() {
        match c {
            '<' => {
                let remaining = &html_lower[pos..];
                // Closing tags end script/style mode.
                if remaining.starts_with("</script") {
                    in_script = false;
                } else if remaining.starts_with("</style") {
                    in_style = false;
                }
                // Opening tags start it.
                if remaining.starts_with("<script") {
                    in_script = true;
                } else if remaining.starts_with("<style") {
                    in_style = true;
                }
                in_tag = true;
            }
            '>' => {
                in_tag = false;
            }
            _ if in_tag => {}
            _ if in_script || in_style => {}
            ' ' | '\n' | '\t' | '\r' => {
                // Collapse whitespace and never start the text with a space.
                if !text.ends_with(' ') && !text.is_empty() {
                    text.push(' ');
                }
            }
            _ => text.push(c),
        }
    }

    if text.len() > MAX_TEXT_BYTES {
        // `String::truncate` panics when the cut is inside a multi-byte
        // character, so back up to the nearest boundary first.
        let mut cut = MAX_TEXT_BYTES;
        while !text.is_char_boundary(cut) {
            cut -= 1;
        }
        text.truncate(cut);
        text.push_str("...");
    }
    text.trim().to_string()
}
/// Run a search, fetch the top results and assemble a [`ResearchReport`].
///
/// Depth decides how many results get their content fetched (1, 3 or 5).
/// Fetch failures are logged and leave that result without content.
async fn execute_report(&self, query: &ResearchQuery) -> Result<ResearchReport> {
    let timer = std::time::Instant::now();

    // Step 1: search.
    let mut results = self.execute_search(query).await?;

    // Step 2: fetch full content for the leading results.
    let fetch_limit = match query.depth {
        ResearchDepth::Quick => 1,
        ResearchDepth::Standard => 3,
        ResearchDepth::Deep => 5,
    };
    for result in results
        .iter_mut()
        .take(fetch_limit)
        .filter(|r| !r.url.is_empty())
    {
        let outcome = self.execute_fetch(&result.url).await;
        match outcome {
            Ok(fetched) => {
                result.content = fetched.content;
                result.fetched_at = fetched.fetched_at;
            }
            Err(e) => {
                tracing::warn!(target: "researcher", error = %e, "Failed to fetch content");
            }
        }
    }

    // Step 3: key findings — the first three ". "-separated sentences of each
    // fetched page among the top five results.
    let key_findings: Vec<String> = results
        .iter()
        .take(5)
        .filter_map(|r| r.content.as_deref())
        .map(|content| content.split(". ").take(3).collect::<Vec<_>>().join(". "))
        .collect();

    // Step 4: related topics — titles of results with a substantial snippet.
    let related_topics: Vec<String> = results
        .iter()
        .filter(|r| r.snippet.len() > 50)
        .map(|r| r.title.clone())
        .take(5)
        .collect();

    let duration = timer.elapsed().as_millis() as u64;

    // Step 5: summary from the snippets of the top three results.
    let summary = if results.is_empty() {
        "未找到相关结果,建议调整搜索关键词后重试".to_string()
    } else {
        let top_snippets: Vec<&str> = results
            .iter()
            .take(3)
            .map(|r| r.snippet.trim())
            .filter(|s| !s.is_empty())
            .collect();
        if top_snippets.is_empty() {
            format!("找到 {} 条相关结果,但无摘要信息", results.len())
        } else {
            format!(
                "基于 {} 条搜索结果:{}",
                results.len(),
                top_snippets.join("")
            )
        }
    };

    Ok(ResearchReport {
        query: query.query.clone(),
        results,
        summary: Some(summary),
        key_findings,
        related_topics,
        researched_at: chrono::Utc::now().to_rfc3339(),
        duration_ms: duration,
    })
}
}
impl Default for ResearcherHand {
fn default() -> Self {
Self::new()
}
}
#[async_trait]
impl Hand for ResearcherHand {
    /// Static configuration of this hand.
    fn config(&self) -> &HandConfig {
        &self.config
    }

    /// Run one researcher action described by `input`.
    ///
    /// The input is first deserialized strictly as a [`ResearcherAction`]. If
    /// that fails, the action is inferred from whichever fields are present.
    /// The result is a JSON object echoing the action and its inputs.
    async fn execute(&self, _context: &HandContext, input: Value) -> Result<HandResult> {
        tracing::debug!(target: "researcher", input = %input, "Researcher hand received input");

        // Try strict deserialization first, then fall back to inference.
        let action: ResearcherAction = serde_json::from_value(input.clone()).or_else(|e| {
            tracing::debug!(target: "researcher", error = %e, input = %input, "Strict deserialization failed, trying inference");
            Self::infer_action(&input)
        })?;

        let timer = std::time::Instant::now();
        let payload = match action {
            ResearcherAction::Search { query } => {
                let results = self.execute_search(&query).await?;
                json!({
                    "action": "search",
                    "query": query.query,
                    "results": results,
                    "duration_ms": timer.elapsed().as_millis()
                })
            }
            ResearcherAction::Fetch { url } => {
                let result = self.execute_fetch(&url).await?;
                json!({
                    "action": "fetch",
                    "url": url,
                    "result": result,
                    "duration_ms": timer.elapsed().as_millis()
                })
            }
            ResearcherAction::Summarize { urls } => {
                // Only the first five URLs are fetched; failures are skipped.
                let mut results = Vec::new();
                for url in urls.iter().take(5) {
                    let fetched = self.execute_fetch(url).await;
                    if let Ok(page) = fetched {
                        results.push(page);
                    }
                }
                json!({
                    "action": "summarize",
                    "urls": urls,
                    "results": results,
                    "duration_ms": timer.elapsed().as_millis()
                })
            }
            ResearcherAction::Report { query } => {
                let report = self.execute_report(&query).await?;
                json!({
                    "action": "report",
                    "report": report
                })
            }
        };
        Ok(HandResult::success(payload))
    }

    /// Research operations are generally safe, so no approval is required.
    fn needs_approval(&self) -> bool {
        false
    }

    /// Reports no missing dependencies; network connectivity is only
    /// discovered when a request is made.
    fn check_dependencies(&self) -> Result<Vec<String>> {
        Ok(Vec::new())
    }

    /// Always reports `Idle`.
    fn status(&self) -> crate::HandStatus {
        crate::HandStatus::Idle
    }
}
/// Percent-encode a string for use in a URL query component.
///
/// Works on the UTF-8 bytes of the input: ASCII letters, digits and `-_.~`
/// pass through unchanged; every other byte becomes `%XX` (uppercase hex).
fn url_encode(s: &str) -> String {
    let mut encoded = String::with_capacity(s.len());
    for byte in s.bytes() {
        let unreserved =
            byte.is_ascii_alphanumeric() || matches!(byte, b'-' | b'_' | b'.' | b'~');
        if unreserved {
            encoded.push(char::from(byte));
        } else {
            encoded.push_str(&format!("%{:02X}", byte));
        }
    }
    encoded
}
/// Report whether `c` falls in one of the CJK blocks used to detect
/// Chinese-language queries.
///
/// Covered blocks: CJK Unified Ideographs, Extension A, CJK Symbols and
/// Punctuation, Fullwidth Forms, CJK Radicals Supplement and CJK Compatibility
/// Ideographs. Kana and Hangul are not included.
fn is_cjk_char(c: char) -> bool {
    // Inclusive (first, last) code point of each block.
    const CJK_BLOCKS: [(u32, u32); 6] = [
        (0x4E00, 0x9FFF), // CJK Unified Ideographs
        (0x3400, 0x4DBF), // CJK Unified Ideographs Extension A
        (0x3000, 0x303F), // CJK Symbols and Punctuation
        (0xFF00, 0xFFEF), // Fullwidth Forms
        (0x2E80, 0x2EFF), // CJK Radicals Supplement
        (0xF900, 0xFAFF), // CJK Compatibility Ideographs
    ];
    let code = u32::from(c);
    CJK_BLOCKS
        .iter()
        .any(|&(first, last)| first <= code && code <= last)
}
/// Check that a URL is safe to fetch (SSRF guard).
///
/// Rejects URLs longer than `MAX_URL_LENGTH`, unparsable URLs, schemes other
/// than http/https and URLs without a host. IP-literal hosts go through
/// `validate_ip`; every other host goes through `validate_hostname`.
fn validate_fetch_url(url_str: &str) -> Result<()> {
    if url_str.len() > MAX_URL_LENGTH {
        return Err(zclaw_types::ZclawError::HandError(format!(
            "URL exceeds maximum length of {} characters",
            MAX_URL_LENGTH
        )));
    }

    let url = Url::parse(url_str)
        .map_err(|e| zclaw_types::ZclawError::HandError(format!("Invalid URL: {}", e)))?;

    let scheme = url.scheme();
    if scheme != "http" && scheme != "https" {
        return Err(zclaw_types::ZclawError::HandError(format!(
            "URL scheme '{}' not allowed, only http/https",
            scheme
        )));
    }

    let host = match url.host_str() {
        Some(host) => host,
        None => {
            return Err(zclaw_types::ZclawError::HandError(
                "URL must have a host".into(),
            ))
        }
    };

    // IPv6 literals are bracketed in URLs; drop the brackets before parsing.
    let bare_host = host
        .strip_prefix('[')
        .and_then(|rest| rest.strip_suffix(']'))
        .unwrap_or(host);

    match bare_host.parse::<IpAddr>() {
        Ok(ip) => {
            validate_ip(&ip)?;
        }
        Err(_) => {
            validate_hostname(host)?;
        }
    }
    Ok(())
}
/// Dispatches an IP address to the validator for its address family.
///
/// # Errors
/// Propagates the error of `validate_ipv4` / `validate_ipv6`.
fn validate_ip(ip: &IpAddr) -> Result<()> {
    match *ip {
        IpAddr::V4(ref addr) => validate_ipv4(addr),
        IpAddr::V6(ref addr) => validate_ipv6(addr),
    }
}
/// Rejects IPv4 addresses that must never be fetched (SSRF guard).
///
/// Blocked ranges: loopback (127/8), RFC 1918 private space (10/8,
/// 172.16/12, 192.168/16), link-local (169.254/16, which includes the common
/// cloud metadata address), 0/8, shared address space (100.64/10),
/// multicast (224/4), limited broadcast and the reserved 240/4 block.
///
/// # Errors
/// Returns the error built by `ssrf_err` naming the range that matched.
fn validate_ipv4(ip: &Ipv4Addr) -> Result<()> {
    let [first, second, _, _] = ip.octets();

    // All patterns are disjoint, so the arm order does not affect the outcome
    // (the broadcast guard only refines the message inside 240/4).
    let blocked = match (first, second) {
        (127, _) => Some("loopback"),
        (10, _) => Some("private 10.x.x.x"),
        (172, 16..=31) => Some("private 172.16-31.x.x"),
        (192, 168) => Some("private 192.168.x.x"),
        (169, 254) => Some("link-local/metadata"),
        (0, _) => Some("0.x.x.x"),
        // RFC 6598 shared address space (carrier-grade NAT). It is not
        // publicly routable, and some cloud providers serve instance
        // metadata from it (e.g. 100.100.100.200).
        (100, 64..=127) => Some("shared address space 100.64-127.x.x"),
        (224..=239, _) => Some("multicast"),
        (255, 255) if *ip == Ipv4Addr::BROADCAST => Some("broadcast"),
        // 240.0.0.0/4 is reserved and never a legitimate public target.
        (240..=255, _) => Some("reserved 240-255.x.x.x"),
        _ => None,
    };

    match blocked {
        Some(reason) => Err(ssrf_err(reason)),
        None => Ok(()),
    }
}
/// Rejects IPv6 addresses that must never be fetched (SSRF guard).
///
/// Blocked: loopback (`::1`), unspecified (`::`), link-local (`fe80::/10`),
/// unique local (`fc00::/7`) and multicast (`ff00::/8`). IPv4-mapped
/// addresses (`::ffff:a.b.c.d`) are judged by the IPv4 rules of
/// `validate_ipv4`.
///
/// # Errors
/// Returns the error built by `ssrf_err` (or by `validate_ipv4` for mapped
/// addresses) naming the range that matched.
fn validate_ipv6(ip: &Ipv6Addr) -> Result<()> {
    if *ip == Ipv6Addr::LOCALHOST {
        return Err(ssrf_err("IPv6 loopback"));
    }
    if *ip == Ipv6Addr::UNSPECIFIED {
        return Err(ssrf_err("IPv6 unspecified"));
    }

    let segs = ip.segments();

    // IPv4-mapped means exactly 0:0:0:0:0:ffff:a.b.c.d. The first five
    // segments must be zero; looking at segs[5] alone would misclassify
    // unrelated global addresses that merely contain 0xffff there.
    if segs[..5].iter().all(|&seg| seg == 0) && segs[5] == 0xffff {
        let [a, b] = segs[6].to_be_bytes();
        let [c, d] = segs[7].to_be_bytes();
        // A mapped address cannot match any of the prefixes below (its first
        // segment is zero), so the IPv4 verdict is final.
        return validate_ipv4(&Ipv4Addr::new(a, b, c, d));
    }

    // Link-local fe80::/10
    if (segs[0] & 0xffc0) == 0xfe80 {
        return Err(ssrf_err("IPv6 link-local"));
    }
    // Unique local fc00::/7
    if (segs[0] & 0xfe00) == 0xfc00 {
        return Err(ssrf_err("IPv6 unique local"));
    }
    // Multicast ff00::/8 (the IPv4 validator blocks multicast as well)
    if (segs[0] & 0xff00) == 0xff00 {
        return Err(ssrf_err("IPv6 multicast"));
    }
    Ok(())
}
/// Rejects host names that point at local or infrastructure services
/// (SSRF guard).
///
/// A host is blocked when, ignoring case and any trailing root dot, it equals
/// one of the names in `BLOCKED_HOSTS` or is a subdomain of one. A host made
/// only of ASCII digits is additionally interpreted as a 32-bit IPv4 address
/// (`2130706433` = `127.0.0.1`) and checked with `validate_ipv4`.
///
/// # Errors
/// Returns the error built by `ssrf_err` (or by `validate_ipv4` for the
/// decimal form).
fn validate_hostname(host: &str) -> Result<()> {
    const BLOCKED_HOSTS: [&str; 8] = [
        "localhost",
        "localhost.localdomain",
        "ip6-localhost",
        "ip6-loopback",
        "metadata.google.internal",
        "metadata",
        "kubernetes.default",
        "kubernetes.default.svc",
    ];

    let lowered = host.to_lowercase();
    // "localhost." is the fully-qualified spelling of "localhost" and resolves
    // the same way, so the root dot must not defeat the block list.
    let name = lowered.trim_end_matches('.');

    let is_blocked = BLOCKED_HOSTS.iter().any(|blocked| {
        // Match the name itself (empty prefix) or any subdomain of it (prefix
        // ending in a label separator).
        name.strip_suffix(*blocked)
            .map_or(false, |prefix| prefix.is_empty() || prefix.ends_with('.'))
    });
    if is_blocked {
        return Err(ssrf_err(&format!("blocked host '{}'", host)));
    }

    // Decimal IP bypass: 2130706433 = 127.0.0.1
    if !name.is_empty() && name.bytes().all(|b| b.is_ascii_digit()) {
        if let Ok(num) = name.parse::<u32>() {
            validate_ipv4(&Ipv4Addr::from(num))?;
        }
    }
    Ok(())
}
/// Builds the `HandError` returned when a fetch target is refused; `reason`
/// is appended to the fixed "Access denied: " prefix.
fn ssrf_err(reason: &str) -> zclaw_types::ZclawError {
    let message = format!("Access denied: {}", reason);
    zclaw_types::ZclawError::HandError(message)
}
/// Returns the text between the first occurrence of `start` and the first
/// occurrence of `end` that follows it.
///
/// Yields `None` when `start` is absent, or when `end` does not appear after
/// it. The returned slice borrows from `text`.
fn extract_between<'a>(text: &'a str, start: &str, end: &str) -> Option<&'a str> {
    let (_, after_start) = text.split_once(start)?;
    after_start.split_once(end).map(|(inner, _)| inner)
}
/// Removes HTML tags from `s`, decodes a small set of entities and collapses
/// whitespace.
///
/// * Everything from a `<` up to the next `>` is dropped. Only the tags are
///   removed; text between tags (including script bodies) is kept. A stray
///   `>` outside a tag is dropped as well.
/// * The entities `&lt;`, `&gt;`, `&quot;`, `&#39;`, `&#x27;`, `&#x2F;`,
///   `&nbsp;` and `&amp;` are decoded exactly once.
/// * Runs of whitespace become a single space, and leading/trailing
///   whitespace is removed.
fn strip_html_tags(s: &str) -> String {
    let mut text = String::with_capacity(s.len());
    let mut in_tag = false;
    for c in s.chars() {
        match c {
            '<' => in_tag = true,
            '>' => in_tag = false,
            _ if !in_tag => text.push(c),
            _ => {}
        }
    }

    // `&amp;` must be decoded last. Decoding it first turns an escaped entity
    // such as `&amp;lt;` into `&lt;`, which a later step would then decode a
    // second time into `<`.
    let decoded = text
        .replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&quot;", "\"")
        .replace("&#39;", "'")
        .replace("&nbsp;", " ")
        .replace("&#x27;", "'")
        .replace("&#x2F;", "/")
        .replace("&amp;", "&");

    // Collapse whitespace
    decoded.split_whitespace().collect::<Vec<_>>().join(" ")
}
/// Heuristically decides whether a scraped search hit looks like a genuine
/// result rather than navigation, advertising or script residue.
///
/// A result is rejected when any of the following holds:
/// * the trimmed title has fewer than 2 or more than 300 characters
///   (counted as Unicode scalar values, so CJK titles are measured the same
///   way as ASCII ones),
/// * the lower-cased title starts with `//` or `/*`, or contains one of the
///   `CODE_MARKERS` (JavaScript/CSS/tracking indicators),
/// * the URL contains `javascript:` or `data:`, is a fragment (`#...`), or
///   is a relative path (`/...` but not protocol-relative `//...`),
/// * the snippet looks like code (`function(` together with `return `, or
///   `var ` together with `=`).
fn is_quality_result(title: &str, snippet: &str, url: &str) -> bool {
    const CODE_MARKERS: [&str; 13] = [
        "function(",
        "var ",
        "const ",
        "window.",
        "document.",
        "{",
        "}",
        "cookie",
        "navigator.",
        ".css",
        "stylesheet",
        "google-analytics",
        "gtag",
    ];

    // Title quality checks. Count characters, not bytes: a CJK character
    // takes three bytes in UTF-8, so byte length would let a one-character
    // title through and reject titles longer than 100 characters.
    let title = title.trim();
    let title_chars = title.chars().count();
    if !(2..=300).contains(&title_chars) {
        return false;
    }

    // Reject titles with JavaScript/CSS indicators
    let lower = title.to_lowercase();
    if lower.starts_with("//")
        || lower.starts_with("/*")
        || CODE_MARKERS.iter().any(|marker| lower.contains(*marker))
    {
        return false;
    }

    // URL quality checks
    if url.contains("javascript:") || url.contains("data:") {
        return false;
    }
    // Reject URLs that are just fragments or relative paths
    if url.starts_with('#') || (url.starts_with('/') && !url.starts_with("//")) {
        return false;
    }

    // Snippet quality: if the snippet looks like code, reject
    let snippet_lower = snippet.to_lowercase();
    let looks_like_function =
        snippet_lower.contains("function(") && snippet_lower.contains("return ");
    let looks_like_assignment = snippet_lower.contains("var ") && snippet_lower.contains('=');
    !(looks_like_function || looks_like_assignment)
}
/// Extracts the value of the first `href="..."` attribute found in `text`.
///
/// Only values starting with `http` are returned as-is; protocol-relative
/// values (`//host/...`) are prefixed with `https:`. Any other value, a
/// missing attribute, or a missing closing quote yields `None`.
fn extract_href(text: &str) -> Option<String> {
    let (_, after_attr) = text.split_once("href=\"")?;
    let (link, _) = after_attr.split_once('"')?;

    match link {
        absolute if absolute.starts_with("http") => Some(absolute.to_owned()),
        relative if relative.starts_with("//") => Some(format!("https:{}", relative)),
        _ => None,
    }
}
/// Extracts the target URL from a DuckDuckGo redirect link.
///
/// DuckDuckGo result links carry the real destination percent-encoded in a
/// `uddg=` query parameter. When that parameter is present and decodes to
/// something starting with `http`, the decoded URL is returned. Otherwise the
/// function falls back to `extract_href` on the same text.
fn extract_href_uddg(text: &str) -> Option<String> {
    if let Some((_, tail)) = text.split_once("uddg=") {
        // The value ends at the next parameter separator, but when `uddg` is
        // the last parameter it ends at the attribute's closing quote. None of
        // these delimiters can occur raw inside a percent-encoded value, so
        // stopping at them keeps trailing markup out of the URL.
        let encoded = tail
            .split(|c: char| matches!(c, '&' | '"' | '\'' | '<' | '>') || c.is_whitespace())
            .next()
            .unwrap_or("");
        let decoded = percent_decode(encoded);
        if decoded.starts_with("http") {
            return Some(decoded);
        }
    }
    // Fallback: try regular href extraction
    extract_href(text)
}
/// Decodes `%XX` escapes in `input`.
///
/// Each `%` followed by exactly two hexadecimal digits (either case) is
/// replaced by the byte it denotes. Anything else is copied through
/// unchanged, including malformed escapes such as a trailing `%`, `%2`, or a
/// `%` followed by non-hex characters. The decoded bytes are interpreted as
/// UTF-8; invalid sequences are replaced with U+FFFD.
///
/// The function never panics, whatever the input contains.
fn percent_decode(input: &str) -> String {
    /// Value of a single ASCII hex digit, or `None` for any other byte.
    fn hex_value(digit: u8) -> Option<u8> {
        match digit {
            b'0'..=b'9' => Some(digit - b'0'),
            b'a'..=b'f' => Some(digit - b'a' + 10),
            b'A'..=b'F' => Some(digit - b'A' + 10),
            _ => None,
        }
    }

    let bytes = input.as_bytes();
    let mut decoded = Vec::with_capacity(bytes.len());
    let mut i = 0;
    while i < bytes.len() {
        if bytes[i] == b'%' {
            // Work on bytes rather than slicing the `str`: the two positions
            // after `%` may fall inside a multi-byte character, where a string
            // slice would panic. Checking digits by hand also refuses the
            // sign prefix that integer parsing would accept (`%+1`).
            let high = bytes.get(i + 1).copied().and_then(hex_value);
            let low = bytes.get(i + 2).copied().and_then(hex_value);
            if let (Some(high), Some(low)) = (high, low) {
                decoded.push((high << 4) | low);
                i += 3;
                continue;
            }
        }
        decoded.push(bytes[i]);
        i += 1;
    }
    String::from_utf8_lossy(&decoded).into_owned()
}
// Unit tests for the researcher hand: configuration and serde round-trips,
// URL encoding/decoding, HTML helpers, search-result parsers, SSRF validation
// and query validation. None of the tests performs network I/O.
//
// Multi-line HTML fixtures are raw strings; their content is kept flush left
// so that the fixture text is exactly what the parsers receive.
#[cfg(test)]
mod tests {
    use super::*;

    fn create_test_hand() -> ResearcherHand {
        ResearcherHand::new()
    }

    fn test_context() -> HandContext {
        HandContext::default()
    }

    // --- Config & Type Tests ---

    #[test]
    fn test_config_id() {
        let hand = create_test_hand();
        assert_eq!(hand.config().id, "researcher");
        assert_eq!(hand.config().name, "研究员");
        assert!(hand.config().enabled);
        assert!(!hand.config().needs_approval);
    }

    #[test]
    fn test_search_engine_default_is_auto() {
        let engine = SearchEngine::default();
        assert!(matches!(engine, SearchEngine::Auto));
    }

    #[test]
    fn test_search_engine_searxng_deserialize() {
        let engine: SearchEngine = serde_json::from_str("\"searxng\"").unwrap();
        assert!(matches!(engine, SearchEngine::SearXNG));
    }

    #[test]
    fn test_research_depth_default_is_standard() {
        let depth = ResearchDepth::default();
        assert!(matches!(depth, ResearchDepth::Standard));
    }

    #[test]
    fn test_research_depth_serialize() {
        let json = serde_json::to_string(&ResearchDepth::Deep).unwrap();
        assert_eq!(json, "\"deep\"");
    }

    #[test]
    fn test_research_depth_deserialize() {
        let depth: ResearchDepth = serde_json::from_str("\"quick\"").unwrap();
        assert!(matches!(depth, ResearchDepth::Quick));
    }

    #[test]
    fn test_search_engine_serialize_roundtrip() {
        for engine in [
            SearchEngine::SearXNG,
            SearchEngine::Google,
            SearchEngine::Bing,
            SearchEngine::DuckDuckGo,
            SearchEngine::Auto,
        ] {
            let json = serde_json::to_string(&engine).unwrap();
            let back: SearchEngine = serde_json::from_str(&json).unwrap();
            assert_eq!(json, serde_json::to_string(&back).unwrap());
        }
    }

    // --- Action Deserialization Tests ---

    #[test]
    fn test_action_search_deserialize() {
        let json = json!({
            "action": "search",
            "query": {
                "query": "Rust programming",
                "engine": "duckduckgo",
                "depth": "quick",
                "maxResults": 5
            }
        });
        let action: ResearcherAction = serde_json::from_value(json).unwrap();
        match action {
            ResearcherAction::Search { query } => {
                assert_eq!(query.query, "Rust programming");
                assert!(matches!(query.engine, SearchEngine::DuckDuckGo));
                assert!(matches!(query.depth, ResearchDepth::Quick));
                assert_eq!(query.max_results, 5);
            }
            _ => panic!("Expected Search action"),
        }
    }

    #[test]
    fn test_action_fetch_deserialize() {
        let json = json!({
            "action": "fetch",
            "url": "https://example.com/page"
        });
        let action: ResearcherAction = serde_json::from_value(json).unwrap();
        match action {
            ResearcherAction::Fetch { url } => {
                assert_eq!(url, "https://example.com/page");
            }
            _ => panic!("Expected Fetch action"),
        }
    }

    #[test]
    fn test_action_report_deserialize() {
        let json = json!({
            "action": "report",
            "query": {
                "query": "AI trends 2026",
                "depth": "deep"
            }
        });
        let action: ResearcherAction = serde_json::from_value(json).unwrap();
        match action {
            ResearcherAction::Report { query } => {
                assert_eq!(query.query, "AI trends 2026");
                assert!(matches!(query.depth, ResearchDepth::Deep));
            }
            _ => panic!("Expected Report action"),
        }
    }

    #[test]
    fn test_action_invalid_rejected() {
        let json = json!({
            "action": "unknown_action",
            "data": "whatever"
        });
        let result: std::result::Result<ResearcherAction, _> = serde_json::from_value(json);
        assert!(result.is_err());
    }

    // --- URL Encoding Tests ---

    #[test]
    fn test_url_encode_ascii() {
        assert_eq!(url_encode("hello world"), "hello%20world");
    }

    #[test]
    fn test_url_encode_chinese() {
        // "医" = UTF-8 bytes E5 8C BB → must produce %E5%8C%BB, not %533B
        let encoded = url_encode("医");
        assert_eq!(encoded, "%E5%8C%BB");
        // Full phrase: "中文" = E4 B8 AD E6 96 87
        let encoded = url_encode("中文搜索");
        assert_eq!(&encoded[0..9], "%E4%B8%AD");
        assert!(!encoded.contains("中文"));
    }

    #[test]
    fn test_url_encode_safe_chars() {
        assert_eq!(url_encode("abc123-_."), "abc123-_.".to_string());
    }

    #[test]
    fn test_url_encode_empty() {
        assert_eq!(url_encode(""), "");
    }

    // --- HTML Text Extraction Tests ---

    #[test]
    fn test_extract_text_basic() {
        let hand = create_test_hand();
        let html = "<html><body><h1>Title</h1><p>Content here</p></body></html>";
        let text = hand.extract_text_from_html(html);
        assert!(text.contains("Title"));
        assert!(text.contains("Content here"));
    }

    #[test]
    fn test_extract_text_strips_scripts() {
        let hand = create_test_hand();
        let html = "<html><body><script>alert('xss')</script><p>Safe text</p></body></html>";
        let text = hand.extract_text_from_html(html);
        assert!(!text.contains("alert"));
        assert!(text.contains("Safe text"));
    }

    #[test]
    fn test_extract_text_strips_styles() {
        let hand = create_test_hand();
        let html = "<html><body><style>.class{color:red}</style><p>Visible</p></body></html>";
        let text = hand.extract_text_from_html(html);
        assert!(!text.contains("color"));
        assert!(text.contains("Visible"));
    }

    #[test]
    fn test_extract_text_truncates_long_content() {
        let hand = create_test_hand();
        let long_body: String = "x".repeat(20000);
        let html = format!("<html><body><p>{}</p></body></html>", long_body);
        let text = hand.extract_text_from_html(&html);
        assert!(text.len() <= 10003); // 10000 + "..."
    }

    #[test]
    fn test_extract_text_empty_body() {
        let hand = create_test_hand();
        let html = "<html><body></body></html>";
        let text = hand.extract_text_from_html(html);
        assert!(text.is_empty());
    }

    // --- Hand Trait Tests ---

    #[tokio::test]
    async fn test_needs_approval_is_false() {
        let hand = create_test_hand();
        assert!(!hand.needs_approval());
    }

    #[tokio::test]
    async fn test_status_is_idle() {
        let hand = create_test_hand();
        assert!(matches!(hand.status(), crate::HandStatus::Idle));
    }

    #[tokio::test]
    async fn test_check_dependencies_ok() {
        let hand = create_test_hand();
        let missing = hand.check_dependencies().unwrap();
        // The researcher hand reports no missing dependencies.
        assert!(missing.is_empty());
    }

    // --- Default Values Tests ---

    #[test]
    fn test_research_query_defaults() {
        let json = json!({ "query": "test" });
        let query: ResearchQuery = serde_json::from_value(json).unwrap();
        assert_eq!(query.query, "test");
        assert!(matches!(query.engine, SearchEngine::Auto));
        assert!(matches!(query.depth, ResearchDepth::Standard));
        assert_eq!(query.max_results, 10);
        assert_eq!(query.time_limit_secs, 60);
        assert!(!query.include_related);
    }

    #[test]
    fn test_search_result_serialization() {
        let result = SearchResult {
            title: "Test".to_string(),
            url: "https://example.com".to_string(),
            snippet: "A snippet".to_string(),
            source: "TestSource".to_string(),
            relevance: 90,
            content: None,
            fetched_at: None,
        };
        let json = serde_json::to_string(&result).unwrap();
        assert!(json.contains("Test"));
        assert!(json.contains("https://example.com"));
    }

    #[test]
    fn test_research_report_summary_is_some_when_results() {
        // Verify the struct allows Some value
        let report = ResearchReport {
            query: "test".to_string(),
            results: vec![SearchResult {
                title: "R".to_string(),
                url: "https://r.co".to_string(),
                snippet: "snippet text".to_string(),
                source: "S".to_string(),
                relevance: 80,
                content: None,
                fetched_at: None,
            }],
            summary: Some("基于 1 条搜索结果snippet text".to_string()),
            key_findings: vec![],
            related_topics: vec![],
            researched_at: "2026-01-01T00:00:00Z".to_string(),
            duration_ms: 100,
        };
        assert!(report.summary.is_some());
        assert!(report.summary.unwrap().contains("snippet text"));
    }

    // --- SearchConfig Tests ---

    #[test]
    fn test_search_config_default() {
        let config = SearchConfig::default();
        assert!(matches!(config.default_engine, SearchEngine::Auto));
        assert_eq!(config.searxng_url, "http://localhost:8888");
        assert_eq!(config.timeout_secs, 15);
    }

    #[test]
    fn test_search_config_load_fallback_on_missing_file() {
        // Config loads from config/config.toml which may not exist in test CWD
        let config = SearchConfig::load();
        // Should return a valid config either way
        assert!(!config.searxng_url.is_empty());
    }

    // --- SearXNG Response Parsing Tests ---

    #[test]
    fn test_searxng_response_parse() {
        let mock_response = json!({
            "query": "Rust programming",
            "number_of_results": 42,
            "results": [
                {
                    "url": "https://www.rust-lang.org/",
                    "title": "Rust Programming Language",
                    "content": "A language empowering everyone to build reliable software.",
                    "engine": "google",
                    "engines": ["google", "duckduckgo"],
                    "score": 5.2,
                    "category": "general"
                },
                {
                    "url": "https://doc.rust-lang.org/book/",
                    "title": "The Rust Book",
                    "content": "The official guide to Rust programming.",
                    "engine": "bing",
                    "engines": ["bing"],
                    "score": 3.1,
                    "category": "general"
                }
            ],
            "suggestions": ["rust tutorial", "rust vs go"]
        });
        let results = mock_response.get("results").unwrap().as_array().unwrap();
        assert_eq!(results.len(), 2);
        // Verify first result mapping
        let r0 = &results[0];
        assert_eq!(r0["title"].as_str().unwrap(), "Rust Programming Language");
        assert_eq!(r0["url"].as_str().unwrap(), "https://www.rust-lang.org/");
        assert_eq!(
            r0["content"].as_str().unwrap(),
            "A language empowering everyone to build reliable software."
        );
        let engines: Vec<&str> = r0["engines"]
            .as_array()
            .unwrap()
            .iter()
            .filter_map(|e| e.as_str())
            .collect();
        assert_eq!(engines, vec!["google", "duckduckgo"]);
    }

    #[test]
    fn test_searxng_empty_results() {
        let mock_response = json!({
            "query": "nonexistent xyzzy123",
            "number_of_results": 0,
            "results": [],
            "suggestions": []
        });
        let results = mock_response.get("results").unwrap().as_array().unwrap();
        assert!(results.is_empty());
    }

    #[test]
    fn test_searxng_score_normalization() {
        // Score 5.2 → (5.2 * 10) = 52 → relevance 52
        let score = 5.2_f64;
        let relevance = if score > 0.0 {
            (score.min(10.0) * 10.0) as u8
        } else {
            50
        };
        assert_eq!(relevance, 52);
        // Score 15.0 → clamped to 10.0 → relevance 100
        let score = 15.0_f64;
        let relevance = if score > 0.0 {
            (score.min(10.0) * 10.0) as u8
        } else {
            50
        };
        assert_eq!(relevance, 100);
        // Score 0.0 → default relevance 50
        let score = 0.0_f64;
        let relevance = if score > 0.0 {
            (score.min(10.0) * 10.0) as u8
        } else {
            50
        };
        assert_eq!(relevance, 50);
    }

    #[test]
    fn test_searxng_url_construction() {
        let config = SearchConfig::default();
        let query = "2024年中国医疗政策";
        let url = format!(
            "{}/search?q={}&format=json&categories=general&language=auto&pageno=1",
            config.searxng_url.trim_end_matches('/'),
            url_encode(query)
        );
        assert!(url.starts_with("http://localhost:8888/search?"));
        assert!(url.contains("format=json"));
        assert!(url.contains("categories=general"));
        assert!(url.contains("%E4%B8%AD")); // 中 = E4 B8 AD
        assert!(!url.contains("%4E2D")); // NOT Unicode codepoint
    }

    // --- Native Search Helper Tests ---

    #[test]
    fn test_is_cjk_char_chinese() {
        assert!(is_cjk_char('中'));
        assert!(is_cjk_char('医'));
        assert!(is_cjk_char('。'));
        assert!(!is_cjk_char('a'));
        assert!(!is_cjk_char('1'));
        assert!(!is_cjk_char(' '));
    }

    #[test]
    fn test_is_cjk_char_detects_chinese_query() {
        let query = "2024年中国医疗政策";
        assert!(query.chars().any(is_cjk_char));
        let query_en = "Rust programming language";
        assert!(!query_en.chars().any(is_cjk_char));
    }

    #[test]
    fn test_strip_html_tags() {
        assert_eq!(strip_html_tags("<b>Hello</b>"), "Hello");
        assert_eq!(strip_html_tags("<a href=\"x\">Link</a>"), "Link");
        assert_eq!(strip_html_tags("plain text"), "plain text");
        assert_eq!(strip_html_tags("&amp;&lt;&gt;"), "&<>");
        // strip_html_tags only removes tags, not script content
        assert_eq!(strip_html_tags("<script>alert()</script>Safe"), "alert()Safe");
    }

    #[test]
    fn test_extract_between_basic() {
        let text = "prefix<div>content</div>suffix";
        assert_eq!(extract_between(text, "<div>", "</div>"), Some("content"));
    }

    #[test]
    fn test_extract_between_not_found() {
        let text = "no delimiters here";
        assert_eq!(extract_between(text, "<div>", "</div>"), None);
    }

    #[test]
    fn test_extract_href() {
        let text = r#"<a href="https://example.com/page">Title</a>"#;
        assert_eq!(extract_href(text), Some("https://example.com/page".to_string()));
    }

    #[test]
    fn test_extract_href_protocol_relative() {
        let text = r#"<a href="//example.com/page">Title</a>"#;
        assert_eq!(extract_href(text), Some("https://example.com/page".to_string()));
    }

    #[test]
    fn test_extract_href_uddg() {
        let text = r#"href="//duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.com%2Fpage&amp;rut=abc""#;
        assert_eq!(extract_href_uddg(text), Some("https://example.com/page".to_string()));
    }

    #[test]
    fn test_extract_href_uddg_fallback() {
        let text = r#"<a href="https://example.com/direct">Title</a>"#;
        assert_eq!(extract_href_uddg(text), Some("https://example.com/direct".to_string()));
    }

    // --- HTML Parser Tests ---

    #[test]
    fn test_parse_ddg_html() {
        let hand = create_test_hand();
        let html = r#"
<div class="result__body">
<a rel="nofollow" class="result__a" href="//duckduckgo.com/l/?uddg=https%3A%2F%2Frust-lang.org&amp;rut=abc">Rust Programming Language</a>
<a class="result__snippet">A systems programming language focused on safety and speed.</a>
</div>
<div class="result__body">
<a rel="nofollow" class="result__a" href="//duckduckgo.com/l/?uddg=https%3A%2F%2Fdoc.rust-lang.org&amp;rut=def">The Rust Book</a>
<a class="result__snippet">The official guide to Rust programming.</a>
</div>
"#;
        let results = hand.parse_ddg_html(html, 10);
        assert_eq!(results.len(), 2);
        assert_eq!(results[0].title, "Rust Programming Language");
        assert_eq!(results[0].url, "https://rust-lang.org");
        assert_eq!(results[0].source, "DuckDuckGo");
        assert_eq!(results[1].title, "The Rust Book");
    }

    #[test]
    fn test_parse_ddg_html_max_results() {
        let hand = create_test_hand();
        let mut html = String::new();
        for i in 0..20 {
            html.push_str(&format!(
                r#"<div class="result__body"><a class="result__a" href="//duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.com%2F{}">Result {}</a><a class="result__snippet">Snippet {}</a></div>"#,
                i, i, i
            ));
        }
        let results = hand.parse_ddg_html(&html, 5);
        assert_eq!(results.len(), 5);
    }

    #[test]
    fn test_parse_ddg_html_empty() {
        let hand = create_test_hand();
        let html = "<html><body>No results here</body></html>";
        let results = hand.parse_ddg_html(html, 10);
        assert!(results.is_empty());
    }

    #[test]
    fn test_parse_bing_html() {
        let hand = create_test_hand();
        let html = r#"
<li class="b_algo">
<h2><a href="https://example.com/result1">Example Result 1</a></h2>
<div class="b_caption"><p>This is the first result snippet.</p></div>
</li>
<li class="b_algo">
<h2><a href="https://example.com/result2">Example Result 2</a></h2>
<div class="b_caption"><p>This is the second result snippet.</p></div>
</li>
"#;
        let results = hand.parse_bing_html(html, 10);
        assert_eq!(results.len(), 2);
        assert_eq!(results[0].title, "Example Result 1");
        assert_eq!(results[0].url, "https://example.com/result1");
        assert_eq!(results[0].source, "Bing");
    }

    #[test]
    fn test_parse_bing_html_skips_internal_urls() {
        let hand = create_test_hand();
        let html = r#"
<li class="b_algo">
<h2><a href="https://bing.com/search?q=more">More Results</a></h2>
</li>
<li class="b_algo">
<h2><a href="https://example.com/real">Real Result</a></h2>
</li>
"#;
        let results = hand.parse_bing_html(html, 10);
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].url, "https://example.com/real");
    }

    #[test]
    fn test_parse_bing_html_empty() {
        let hand = create_test_hand();
        let html = "<html><body>Nothing here</body></html>";
        let results = hand.parse_bing_html(html, 10);
        assert!(results.is_empty());
    }

    #[test]
    fn test_parse_baidu_html() {
        let hand = create_test_hand();
        let html = r#"
<div class="result c-container">
<h3 class="t"><a href="https://www.example.cn/page1">中国医疗政策 2024</a></h3>
<div class="c-abstract">这是关于医疗政策的摘要信息。</div>
</div>
<div class="c-container new-pmd">
<h3><a href="https://www.example.cn/page2">第二条结果</a></h3>
</div>
"#;
        let results = hand.parse_baidu_html(html, 10);
        assert!(
            !results.is_empty(),
            "Should find at least 1 result, got {}",
            results.len()
        );
        assert_eq!(results[0].source, "Baidu");
    }

    // --- SSRF Validation Tests ---

    #[test]
    fn test_ssrf_blocks_localhost() {
        assert!(validate_fetch_url("http://localhost:8080/admin").is_err());
        assert!(validate_fetch_url("http://127.0.0.1:5432/db").is_err());
    }

    #[test]
    fn test_ssrf_blocks_private_ip() {
        assert!(validate_fetch_url("http://10.0.0.1/secret").is_err());
        assert!(validate_fetch_url("http://192.168.1.1/router").is_err());
        assert!(validate_fetch_url("http://172.16.0.1/internal").is_err());
    }

    #[test]
    fn test_ssrf_blocks_cloud_metadata() {
        assert!(validate_fetch_url("http://169.254.169.254/metadata").is_err());
    }

    #[test]
    fn test_ssrf_blocks_non_http_scheme() {
        assert!(validate_fetch_url("file:///etc/passwd").is_err());
        assert!(validate_fetch_url("ftp://example.com/file").is_err());
    }

    #[test]
    fn test_ssrf_allows_public_url() {
        assert!(validate_fetch_url("https://www.rust-lang.org/learn").is_ok());
        assert!(validate_fetch_url("https://example.com/page?q=test").is_ok());
    }

    // --- Percent Decode Tests ---

    #[test]
    fn test_percent_decode_basic() {
        assert_eq!(percent_decode("hello%20world"), "hello world");
        assert_eq!(percent_decode("%E4%B8%AD%E6%96%87"), "中文");
    }

    #[test]
    fn test_percent_decode_full_url() {
        assert_eq!(
            percent_decode("https%3A%2F%2Fexample.com%2Fpage%3Fq%3Dtest"),
            "https://example.com/page?q=test"
        );
    }

    #[test]
    fn test_percent_decode_no_encoding() {
        assert_eq!(percent_decode("plain-text_123"), "plain-text_123");
    }

    // --- Input Validation Tests ---

    #[test]
    fn test_research_query_validate_empty() {
        let query = ResearchQuery {
            query: " ".to_string(),
            engine: SearchEngine::Auto,
            depth: ResearchDepth::Standard,
            max_results: 10,
            include_related: false,
            time_limit_secs: 60,
        };
        assert!(query.validate().is_err());
    }

    #[test]
    fn test_research_query_validate_too_long() {
        let query = ResearchQuery {
            query: "x".repeat(501),
            engine: SearchEngine::Auto,
            depth: ResearchDepth::Standard,
            max_results: 10,
            include_related: false,
            time_limit_secs: 60,
        };
        assert!(query.validate().is_err());
    }

    #[test]
    fn test_research_query_validate_max_results_overflow() {
        let query = ResearchQuery {
            query: "test".to_string(),
            engine: SearchEngine::Auto,
            depth: ResearchDepth::Standard,
            max_results: 999,
            include_related: false,
            time_limit_secs: 60,
        };
        assert!(query.validate().is_err());
    }

    #[test]
    fn test_research_query_validate_ok() {
        let query = ResearchQuery {
            query: "Rust programming".to_string(),
            engine: SearchEngine::Auto,
            depth: ResearchDepth::Standard,
            max_results: 10,
            include_related: false,
            time_limit_secs: 60,
        };
        assert!(query.validate().is_ok());
    }

    // --- Quality Filter Tests ---

    #[test]
    fn test_quality_rejects_javascript_title() {
        assert!(!is_quality_result("function(x) { return x; }", "ok", "https://example.com"));
    }

    #[test]
    fn test_quality_rejects_short_title() {
        assert!(!is_quality_result("A", "snippet", "https://example.com"));
    }

    #[test]
    fn test_quality_rejects_css_title() {
        assert!(!is_quality_result(".stylesheet{color:red}", "ok", "https://example.com"));
    }

    #[test]
    fn test_quality_rejects_javascript_url() {
        assert!(!is_quality_result("Title", "snippet", "javascript:alert(1)"));
    }

    #[test]
    fn test_quality_accepts_normal_result() {
        assert!(is_quality_result(
            "2024年中国医疗政策解读",
            "相关政策文件摘要",
            "https://www.gov.cn/policy"
        ));
    }

    #[test]
    fn test_quality_accepts_english_result() {
        assert!(is_quality_result(
            "Rust Programming Language",
            "A systems programming language",
            "https://www.rust-lang.org"
        ));
    }

    #[test]
    fn test_quality_rejects_long_title() {
        let long_title: String = "x".repeat(301);
        assert!(!is_quality_result(&long_title, "ok", "https://example.com"));
    }

    #[test]
    fn test_strip_html_tags_collapses_whitespace() {
        assert_eq!(strip_html_tags("<b>Hello</b> <i>World</i>"), "Hello World");
        assert_eq!(strip_html_tags("a\n\t b"), "a b");
    }
}