Some checks failed
CI / Lint & TypeCheck (push) Has been cancelled
CI / Unit Tests (push) Has been cancelled
CI / Build Frontend (push) Has been cancelled
CI / Rust Check (push) Has been cancelled
CI / Security Scan (push) Has been cancelled
CI / E2E Tests (push) Has been cancelled
根因: glm-5.1 不理解 oneOf+const 复杂 schema,发送 tool_calls 时
arguments 为空 {}。同时缺少从对话上下文提取用户意图的回退机制。
修复:
1. researcher input_schema 从 oneOf+const 改为扁平化属性 — glm 正确传参
2. loop_runner 增加 empty-input 回退 — 从最近用户消息注入 _fallback_query
3. researcher infer_action 增加 _fallback_query 分支处理
4. 调试日志降级 INFO→DEBUG (openai tool_calls delta, researcher input)
2199 lines
74 KiB
Rust
2199 lines
74 KiB
Rust
//! Researcher Hand - Deep research and analysis capabilities
|
||
//!
|
||
//! This hand provides web search, content fetching, and research synthesis.
|
||
|
||
use async_trait::async_trait;
|
||
use serde::{Deserialize, Serialize};
|
||
use serde_json::{json, Value};
|
||
use std::collections::HashMap;
|
||
use std::net::{IpAddr, Ipv4Addr, Ipv6Addr};
|
||
use std::sync::Arc;
|
||
use tokio::sync::RwLock;
|
||
use url::Url;
|
||
use zclaw_types::Result;
|
||
|
||
use crate::{Hand, HandConfig, HandContext, HandResult};
|
||
|
||
/// Search engine options
|
||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||
#[serde(rename_all = "lowercase")]
|
||
pub enum SearchEngine {
|
||
SearXNG,
|
||
Google,
|
||
Bing,
|
||
DuckDuckGo,
|
||
Auto,
|
||
}
|
||
|
||
impl Default for SearchEngine {
|
||
fn default() -> Self {
|
||
Self::Auto
|
||
}
|
||
}
|
||
|
||
/// Search configuration loaded from config/config.toml
|
||
#[derive(Debug, Clone)]
|
||
struct SearchConfig {
|
||
default_engine: SearchEngine,
|
||
searxng_url: String,
|
||
timeout_secs: u64,
|
||
jina_api_key: Option<String>,
|
||
}
|
||
|
||
impl Default for SearchConfig {
|
||
fn default() -> Self {
|
||
Self {
|
||
default_engine: SearchEngine::Auto,
|
||
searxng_url: "http://localhost:8888".to_string(),
|
||
timeout_secs: 15,
|
||
jina_api_key: None,
|
||
}
|
||
}
|
||
}
|
||
|
||
impl SearchConfig {
|
||
fn load() -> Self {
|
||
let path = "config/config.toml";
|
||
let content = match std::fs::read_to_string(path) {
|
||
Ok(c) => c,
|
||
Err(_) => return Self::default(),
|
||
};
|
||
|
||
#[derive(Deserialize)]
|
||
struct ToolsWebSearch {
|
||
default_engine: Option<String>,
|
||
#[allow(dead_code)]
|
||
max_results: Option<usize>,
|
||
searxng_url: Option<String>,
|
||
searxng_timeout: Option<u64>,
|
||
}
|
||
|
||
#[derive(Deserialize)]
|
||
struct ToolsWeb {
|
||
search: Option<ToolsWebSearch>,
|
||
}
|
||
|
||
#[derive(Deserialize)]
|
||
struct Tools {
|
||
web: Option<ToolsWeb>,
|
||
}
|
||
|
||
#[derive(Deserialize)]
|
||
struct Config {
|
||
tools: Option<Tools>,
|
||
}
|
||
|
||
let config: Config = match toml::from_str(&content) {
|
||
Ok(c) => c,
|
||
Err(_) => return Self::default(),
|
||
};
|
||
|
||
let search = config.tools
|
||
.and_then(|t| t.web)
|
||
.and_then(|w| w.search);
|
||
|
||
match search {
|
||
Some(s) => {
|
||
let engine = s.default_engine
|
||
.as_deref()
|
||
.and_then(|e| serde_json::from_str(&format!("\"{}\"", e)).ok())
|
||
.unwrap_or_default();
|
||
Self {
|
||
default_engine: engine,
|
||
searxng_url: s.searxng_url
|
||
.unwrap_or_else(|| "http://localhost:8888".to_string()),
|
||
timeout_secs: s.searxng_timeout.unwrap_or(15),
|
||
jina_api_key: std::env::var("ZCLAW_JINA_API_KEY").ok(),
|
||
}
|
||
}
|
||
None => Self::default(),
|
||
}
|
||
}
|
||
}
|
||
|
||
/// Research depth level
|
||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||
#[serde(rename_all = "lowercase")]
|
||
pub enum ResearchDepth {
|
||
Quick, // Fast search, top 3 results
|
||
Standard, // Normal search, top 10 results
|
||
Deep, // Comprehensive search, multiple sources
|
||
}
|
||
|
||
impl Default for ResearchDepth {
|
||
fn default() -> Self {
|
||
Self::Standard
|
||
}
|
||
}
|
||
|
||
/// Research query configuration
|
||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||
#[serde(rename_all = "camelCase")]
|
||
pub struct ResearchQuery {
|
||
/// Search query
|
||
pub query: String,
|
||
/// Search engine to use
|
||
#[serde(default)]
|
||
pub engine: SearchEngine,
|
||
/// Research depth
|
||
#[serde(default)]
|
||
pub depth: ResearchDepth,
|
||
/// Maximum results to return
|
||
#[serde(default = "default_max_results")]
|
||
pub max_results: usize,
|
||
/// Include related topics
|
||
#[serde(default)]
|
||
pub include_related: bool,
|
||
/// Time limit in seconds
|
||
#[serde(default = "default_time_limit")]
|
||
pub time_limit_secs: u64,
|
||
}
|
||
|
||
/// Serde default for `ResearchQuery::max_results`.
fn default_max_results() -> usize {
    const DEFAULT_MAX_RESULTS: usize = 10;
    DEFAULT_MAX_RESULTS
}
|
||
/// Serde default for `ResearchQuery::time_limit_secs`, in seconds.
fn default_time_limit() -> u64 {
    const DEFAULT_TIME_LIMIT_SECS: u64 = 60;
    DEFAULT_TIME_LIMIT_SECS
}
|
||
|
||
/// Longest search query accepted by `ResearchQuery::validate`.
const MAX_QUERY_LENGTH: usize = 500;

/// Largest `max_results` value accepted by `ResearchQuery::validate`.
const MAX_RESULTS_CAP: usize = 50;

/// Longest URL accepted for fetching.
// NOTE(review): presumably enforced by the URL validation helper defined
// elsewhere in this file — confirm.
const MAX_URL_LENGTH: usize = 2048;

/// Number of fetched pages kept in the in-memory cache before an entry is evicted.
const CACHE_MAX_ENTRIES: usize = 200;
|
||
|
||
impl ResearchQuery {
|
||
fn validate(&self) -> std::result::Result<(), String> {
|
||
if self.query.trim().is_empty() {
|
||
return Err("搜索查询不能为空".to_string());
|
||
}
|
||
if self.query.len() > MAX_QUERY_LENGTH {
|
||
return Err(format!("查询过长(上限 {} 字符)", MAX_QUERY_LENGTH));
|
||
}
|
||
if self.max_results > MAX_RESULTS_CAP {
|
||
return Err(format!("max_results 上限为 {}", MAX_RESULTS_CAP));
|
||
}
|
||
Ok(())
|
||
}
|
||
}
|
||
|
||
/// Search result item
|
||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||
#[serde(rename_all = "camelCase")]
|
||
pub struct SearchResult {
|
||
/// Title of the result
|
||
pub title: String,
|
||
/// URL
|
||
pub url: String,
|
||
/// Snippet/summary
|
||
pub snippet: String,
|
||
/// Source name
|
||
pub source: String,
|
||
/// Relevance score (0-100)
|
||
#[serde(default)]
|
||
pub relevance: u8,
|
||
/// Fetched content (if available)
|
||
#[serde(default)]
|
||
pub content: Option<String>,
|
||
/// Timestamp
|
||
#[serde(default)]
|
||
pub fetched_at: Option<String>,
|
||
}
|
||
|
||
/// Research report
|
||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||
#[serde(rename_all = "camelCase")]
|
||
pub struct ResearchReport {
|
||
/// Original query
|
||
pub query: String,
|
||
/// Search results
|
||
pub results: Vec<SearchResult>,
|
||
/// Synthesized summary
|
||
#[serde(default)]
|
||
pub summary: Option<String>,
|
||
/// Key findings
|
||
#[serde(default)]
|
||
pub key_findings: Vec<String>,
|
||
/// Related topics discovered
|
||
#[serde(default)]
|
||
pub related_topics: Vec<String>,
|
||
/// Research timestamp
|
||
pub researched_at: String,
|
||
/// Total time spent (ms)
|
||
pub duration_ms: u64,
|
||
}
|
||
|
||
/// Researcher action types
|
||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||
#[serde(tag = "action")]
|
||
pub enum ResearcherAction {
|
||
#[serde(rename = "search")]
|
||
Search { query: ResearchQuery },
|
||
#[serde(rename = "fetch")]
|
||
Fetch { url: String },
|
||
#[serde(rename = "summarize")]
|
||
Summarize { urls: Vec<String> },
|
||
#[serde(rename = "report")]
|
||
Report { query: ResearchQuery },
|
||
}
|
||
|
||
/// Researcher Hand implementation
|
||
pub struct ResearcherHand {
|
||
config: HandConfig,
|
||
search_config: SearchConfig,
|
||
client: reqwest::Client,
|
||
cache: Arc<RwLock<HashMap<String, SearchResult>>>,
|
||
}
|
||
|
||
impl ResearcherHand {
|
||
/// Create a new researcher hand
|
||
pub fn new() -> Self {
|
||
Self {
|
||
config: HandConfig {
|
||
id: "researcher".to_string(),
|
||
name: "研究员".to_string(),
|
||
description: "深度研究和分析能力,支持网络搜索和内容获取".to_string(),
|
||
needs_approval: false,
|
||
dependencies: vec!["network".to_string()],
|
||
input_schema: Some(serde_json::json!({
|
||
"type": "object",
|
||
"properties": {
|
||
"action": {
|
||
"type": "string",
|
||
"enum": ["search", "fetch", "report", "summarize"],
|
||
"description": "Action to perform: search (web search), fetch (get URL content), report (deep research), summarize (multiple URLs)"
|
||
},
|
||
"query": {
|
||
"type": "string",
|
||
"description": "Search query string for search/report actions"
|
||
},
|
||
"url": {
|
||
"type": "string",
|
||
"description": "URL to fetch content from"
|
||
},
|
||
"urls": {
|
||
"type": "array",
|
||
"items": { "type": "string" },
|
||
"description": "List of URLs to summarize"
|
||
},
|
||
"engine": {
|
||
"type": "string",
|
||
"enum": ["auto", "searxng", "google", "bing", "duckduckgo"],
|
||
"description": "Search engine preference"
|
||
}
|
||
},
|
||
"description": "Provide 'query' for search/report, or 'url' for fetch, or 'urls' for summarize"
|
||
})),
|
||
tags: vec!["research".to_string(), "web".to_string(), "search".to_string()],
|
||
enabled: true,
|
||
max_concurrent: 0,
|
||
timeout_secs: 0,
|
||
},
|
||
search_config: SearchConfig::load(),
|
||
client: reqwest::Client::builder()
|
||
.timeout(std::time::Duration::from_secs(30))
|
||
.user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36")
|
||
.redirect(reqwest::redirect::Policy::limited(3))
|
||
.build()
|
||
.unwrap_or_else(|_| reqwest::Client::new()),
|
||
cache: Arc::new(RwLock::new(HashMap::new())),
|
||
}
|
||
}
|
||
|
||
/// Infer action from input fields when LLM omits the `action` field.
|
||
/// Many LLMs (especially non-OpenAI models like glm) call tools without
|
||
/// including the enum tag, e.g. sending `{"query": "search terms"}` instead
|
||
/// of `{"action": "search", "query": "search terms"}`.
|
||
fn infer_action(input: &Value) -> Result<ResearcherAction> {
|
||
// Debug: log all keys in the input
|
||
let keys: Vec<&str> = input.as_object()
|
||
.map(|obj| obj.keys().map(|k| k.as_str()).collect())
|
||
.unwrap_or_default();
|
||
tracing::debug!(target: "researcher", ?keys, %input, "infer_action examining input");
|
||
|
||
// Check for action field with wrong value
|
||
if let Some(action) = input.get("action").and_then(|v| v.as_str()) {
|
||
if action == "search" || action == "report" {
|
||
if let Some(query_val) = input.get("query") {
|
||
let query = Self::parse_query(query_val);
|
||
if !query.query.trim().is_empty() {
|
||
return Ok(if action == "report" {
|
||
ResearcherAction::Report { query }
|
||
} else {
|
||
ResearcherAction::Search { query }
|
||
});
|
||
}
|
||
}
|
||
}
|
||
if action == "fetch" {
|
||
if let Some(url) = input.get("url").and_then(|v| v.as_str()) {
|
||
return Ok(ResearcherAction::Fetch { url: url.to_string() });
|
||
}
|
||
}
|
||
}
|
||
|
||
// Has "url" (singular) → fetch
|
||
if let Some(url) = input.get("url").and_then(|v| v.as_str()) {
|
||
if !url.is_empty() && url.starts_with("http") {
|
||
return Ok(ResearcherAction::Fetch { url: url.to_string() });
|
||
}
|
||
}
|
||
// Has "urls" (plural) → summarize
|
||
if let Some(urls) = input.get("urls").and_then(|v| v.as_array()) {
|
||
let url_list: Vec<String> = urls.iter()
|
||
.filter_map(|v| v.as_str().map(|s| s.to_string()))
|
||
.collect();
|
||
if !url_list.is_empty() {
|
||
return Ok(ResearcherAction::Summarize { urls: url_list });
|
||
}
|
||
}
|
||
// Has "query" → search
|
||
if let Some(query_val) = input.get("query") {
|
||
let query = Self::parse_query(query_val);
|
||
if !query.query.trim().is_empty() {
|
||
return Ok(ResearcherAction::Search { query });
|
||
}
|
||
}
|
||
// Has "search" or "search_query" → search
|
||
for key in &["search", "search_query", "keyword", "keywords", "q", "text"] {
|
||
if let Some(val) = input.get(key) {
|
||
let query = Self::parse_query(val);
|
||
if !query.query.trim().is_empty() {
|
||
return Ok(ResearcherAction::Search { query });
|
||
}
|
||
}
|
||
}
|
||
// Check for injected fallback query from loop_runner (when LLM sends empty args)
|
||
if let Some(fallback) = input.get("_fallback_query").and_then(|v| v.as_str()) {
|
||
if !fallback.trim().is_empty() {
|
||
tracing::debug!(target: "researcher", query = %fallback, "Using fallback user message as search query");
|
||
return Ok(ResearcherAction::Search { query: ResearchQuery {
|
||
query: fallback.to_string(),
|
||
engine: SearchEngine::Auto,
|
||
depth: ResearchDepth::Standard,
|
||
max_results: 10,
|
||
include_related: false,
|
||
time_limit_secs: 60,
|
||
}});
|
||
}
|
||
}
|
||
|
||
// Last resort: if any string field looks like a search query
|
||
if let Some(obj) = input.as_object() {
|
||
for (key, val) in obj {
|
||
if let Some(s) = val.as_str() {
|
||
if s.len() > 2 && !s.starts_with("http") && key != "action" && key != "engine" {
|
||
tracing::debug!(target: "researcher", key = %key, value = %s, "Using fallback field as query");
|
||
return Ok(ResearcherAction::Search { query: ResearchQuery {
|
||
query: s.to_string(),
|
||
engine: SearchEngine::Auto,
|
||
depth: ResearchDepth::Standard,
|
||
max_results: 10,
|
||
include_related: false,
|
||
time_limit_secs: 60,
|
||
}});
|
||
}
|
||
}
|
||
}
|
||
}
|
||
Err(zclaw_types::ZclawError::HandError(
|
||
"无法识别搜索意图:请提供 query(搜索)或 url(获取网页)参数".to_string()
|
||
))
|
||
}
|
||
|
||
fn parse_query(query_val: &Value) -> ResearchQuery {
|
||
if query_val.is_string() {
|
||
ResearchQuery {
|
||
query: query_val.as_str().unwrap_or("").to_string(),
|
||
engine: SearchEngine::Auto,
|
||
depth: ResearchDepth::Standard,
|
||
max_results: 10,
|
||
include_related: false,
|
||
time_limit_secs: 60,
|
||
}
|
||
} else {
|
||
serde_json::from_value(query_val.clone()).unwrap_or_else(|_| ResearchQuery {
|
||
query: query_val.get("query")
|
||
.or_else(|| query_val.get("search"))
|
||
.or_else(|| query_val.get("q"))
|
||
.or_else(|| query_val.get("keyword"))
|
||
.and_then(|v| v.as_str())
|
||
.unwrap_or("")
|
||
.to_string(),
|
||
engine: SearchEngine::Auto,
|
||
depth: ResearchDepth::Standard,
|
||
max_results: 10,
|
||
include_related: false,
|
||
time_limit_secs: 60,
|
||
})
|
||
}
|
||
}
|
||
|
||
/// Execute a web search — route to the configured backend
|
||
async fn execute_search(&self, query: &ResearchQuery) -> Result<Vec<SearchResult>> {
|
||
query.validate().map_err(|e| zclaw_types::ZclawError::HandError(e))?;
|
||
|
||
let max_results = query.max_results.min(MAX_RESULTS_CAP);
|
||
let start = std::time::Instant::now();
|
||
|
||
let engine = match &query.engine {
|
||
SearchEngine::Auto => &self.search_config.default_engine,
|
||
other => other,
|
||
};
|
||
|
||
let results = match engine {
|
||
SearchEngine::SearXNG => {
|
||
match self.search_searxng(&query.query, max_results).await {
|
||
Ok(r) if !r.is_empty() => r,
|
||
_ => self.search_native(&query.query, max_results).await?,
|
||
}
|
||
}
|
||
SearchEngine::Auto => {
|
||
self.search_native(&query.query, max_results).await?
|
||
}
|
||
SearchEngine::DuckDuckGo => {
|
||
// DDG在国内不可用,降级到百度
|
||
tracing::warn!(target: "researcher", "DuckDuckGo在国内不可用,降级到百度");
|
||
self.search_baidu(&query.query, max_results).await?
|
||
}
|
||
SearchEngine::Google => {
|
||
tracing::warn!(target: "researcher", "Google在国内不可用,降级到百度");
|
||
self.search_baidu(&query.query, max_results).await?
|
||
}
|
||
SearchEngine::Bing => {
|
||
self.search_bing(&query.query, max_results).await?
|
||
}
|
||
};
|
||
|
||
let duration = start.elapsed().as_millis() as u64;
|
||
tracing::info!(
|
||
target: "researcher",
|
||
query = %query.query,
|
||
engine = ?engine,
|
||
duration_ms = duration,
|
||
results_count = results.len(),
|
||
"Search completed"
|
||
);
|
||
|
||
Ok(results)
|
||
}
|
||
|
||
/// Rust-native multi-engine search — optimized for China mainland users
|
||
/// Priority: Baidu + Bing CN (both always work in China)
|
||
/// DuckDuckGo as optional fallback (may be blocked by GFW)
|
||
async fn search_native(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
|
||
let mut all_results = Vec::new();
|
||
|
||
// Always use Baidu + Bing CN in parallel (both work in China)
|
||
let baidu_fut = self.search_baidu(query, max_results);
|
||
let bing_fut = self.search_bing(query, max_results);
|
||
|
||
let (baidu_res, bing_res) = tokio::join!(
|
||
async { baidu_fut.await },
|
||
async { bing_fut.await },
|
||
);
|
||
|
||
if let Ok(r) = baidu_res {
|
||
all_results.extend(r);
|
||
}
|
||
if let Ok(r) = bing_res {
|
||
all_results.extend(r);
|
||
}
|
||
|
||
// If both primary engines returned nothing, try DDG as last resort
|
||
if all_results.is_empty() {
|
||
tracing::info!(target: "researcher", "Primary engines empty, trying DuckDuckGo as fallback");
|
||
if let Ok(r) = self.search_duckduckgo_html(query, max_results).await {
|
||
all_results.extend(r);
|
||
}
|
||
}
|
||
|
||
// Deduplicate by URL
|
||
let mut seen_urls = std::collections::HashSet::new();
|
||
all_results.retain(|r| seen_urls.insert(r.url.to_lowercase()));
|
||
|
||
// Sort by relevance descending, take top N
|
||
all_results.sort_by(|a, b| b.relevance.cmp(&a.relevance));
|
||
all_results.truncate(max_results);
|
||
|
||
if all_results.is_empty() {
|
||
tracing::warn!(target: "researcher", "All native engines returned empty for query: {}", query);
|
||
}
|
||
|
||
Ok(all_results)
|
||
}
|
||
|
||
/// Search using SearXNG meta-search engine (aggregates 70+ engines)
|
||
async fn search_searxng(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
|
||
let url = format!(
|
||
"{}/search?q={}&format=json&categories=general&language=auto&pageno=1",
|
||
self.search_config.searxng_url.trim_end_matches('/'),
|
||
url_encode(query)
|
||
);
|
||
|
||
let response = self.client
|
||
.get(&url)
|
||
.timeout(std::time::Duration::from_secs(self.search_config.timeout_secs))
|
||
.send()
|
||
.await
|
||
.map_err(|e| zclaw_types::ZclawError::HandError(
|
||
format!("SearXNG request failed: {}", e)
|
||
))?;
|
||
|
||
let status = response.status();
|
||
if !status.is_success() {
|
||
return Err(zclaw_types::ZclawError::HandError(
|
||
format!("SearXNG returned HTTP {}", status)
|
||
));
|
||
}
|
||
|
||
let json: Value = response.json().await
|
||
.map_err(|e| zclaw_types::ZclawError::HandError(
|
||
format!("Failed to parse SearXNG response: {}", e)
|
||
))?;
|
||
|
||
let mut results = Vec::new();
|
||
|
||
if let Some(items) = json.get("results").and_then(|v| v.as_array()) {
|
||
for item in items.iter().take(max_results) {
|
||
let title = item.get("title")
|
||
.and_then(|v| v.as_str())
|
||
.unwrap_or("")
|
||
.to_string();
|
||
let url = item.get("url")
|
||
.and_then(|v| v.as_str())
|
||
.unwrap_or("")
|
||
.to_string();
|
||
let snippet = item.get("content")
|
||
.and_then(|v| v.as_str())
|
||
.unwrap_or("")
|
||
.to_string();
|
||
let engines = item.get("engines")
|
||
.and_then(|v| v.as_array())
|
||
.map(|arr| {
|
||
arr.iter()
|
||
.filter_map(|e| e.as_str())
|
||
.collect::<Vec<_>>()
|
||
.join(",")
|
||
})
|
||
.unwrap_or_default();
|
||
let score = item.get("score")
|
||
.and_then(|v| v.as_f64())
|
||
.unwrap_or(0.0);
|
||
|
||
// Normalize score to 0-100 range
|
||
let relevance = if score > 0.0 {
|
||
(score.min(10.0) * 10.0) as u8
|
||
} else {
|
||
50
|
||
};
|
||
|
||
if !title.is_empty() && !url.is_empty() {
|
||
results.push(SearchResult {
|
||
title,
|
||
url,
|
||
snippet,
|
||
source: if engines.is_empty() {
|
||
"SearXNG".to_string()
|
||
} else {
|
||
format!("SearXNG({})", engines)
|
||
},
|
||
relevance,
|
||
content: None,
|
||
fetched_at: Some(chrono::Utc::now().to_rfc3339()),
|
||
});
|
||
}
|
||
}
|
||
}
|
||
|
||
Ok(results)
|
||
}
|
||
|
||
/// Search using DuckDuckGo HTML (POST method, matching ddgs library behavior)
|
||
async fn search_duckduckgo_html(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
|
||
let has_cjk = query.chars().any(|c| is_cjk_char(c));
|
||
let region = if has_cjk { "wt-wt" } else { "wt-wt" };
|
||
let body = format!("q={}&b=&l={}", url_encode(query), region);
|
||
|
||
let response = self.client
|
||
.post("https://html.duckduckgo.com/html/")
|
||
.header("Content-Type", "application/x-www-form-urlencoded")
|
||
.header("Accept", "text/html,application/xhtml+xml")
|
||
.header("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8")
|
||
.body(body)
|
||
.send()
|
||
.await
|
||
.map_err(|e| zclaw_types::ZclawError::HandError(
|
||
format!("DuckDuckGo HTML search failed: {}", e)
|
||
))?;
|
||
|
||
let status = response.status();
|
||
if !status.is_success() {
|
||
return Err(zclaw_types::ZclawError::HandError(
|
||
format!("DuckDuckGo returned HTTP {}", status)
|
||
));
|
||
}
|
||
|
||
let html = response.text().await
|
||
.map_err(|e| zclaw_types::ZclawError::HandError(
|
||
format!("Failed to read DuckDuckGo response: {}", e)
|
||
))?;
|
||
|
||
Ok(self.parse_ddg_html(&html, max_results))
|
||
}
|
||
|
||
/// Parse DuckDuckGo HTML search results page
|
||
fn parse_ddg_html(&self, html: &str, max_results: usize) -> Vec<SearchResult> {
|
||
let mut results = Vec::new();
|
||
|
||
for block in html.split("class=\"result__body\"") {
|
||
if results.len() >= max_results {
|
||
break;
|
||
}
|
||
|
||
// Find the result title link: <a class="result__a" href="...">Title</a>
|
||
let title_link = match extract_between(block, "result__a", "</a>") {
|
||
Some(s) => s,
|
||
None => continue,
|
||
};
|
||
// title_link is like: href="//duckduckgo.com/l/?uddg=...">Title Text
|
||
let title = title_link.rsplit('>').next()
|
||
.map(|s| strip_html_tags(s).trim().to_string())
|
||
.unwrap_or_default();
|
||
|
||
let url = extract_href_uddg(block).unwrap_or_default();
|
||
|
||
let snippet = extract_between(block, "result__snippet", "</a>")
|
||
.map(|s| {
|
||
s.rsplit('>').next()
|
||
.map(|t| strip_html_tags(t).trim().to_string())
|
||
.unwrap_or_default()
|
||
})
|
||
.unwrap_or_default();
|
||
|
||
if title.is_empty() || url.is_empty() {
|
||
continue;
|
||
}
|
||
|
||
if !is_quality_result(&title, &snippet, &url) {
|
||
continue;
|
||
}
|
||
|
||
results.push(SearchResult {
|
||
title,
|
||
url,
|
||
snippet,
|
||
source: "DuckDuckGo".to_string(),
|
||
relevance: 70,
|
||
content: None,
|
||
fetched_at: Some(chrono::Utc::now().to_rfc3339()),
|
||
});
|
||
}
|
||
|
||
results
|
||
}
|
||
|
||
/// Search using Bing (works well for both Chinese and English)
|
||
async fn search_bing(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
|
||
let has_cjk = query.chars().any(|c| is_cjk_char(c));
|
||
let url = if has_cjk {
|
||
format!(
|
||
"https://cn.bing.com/search?q={}&count={}&setlang=zh-Hans",
|
||
url_encode(query),
|
||
max_results
|
||
)
|
||
} else {
|
||
format!(
|
||
"https://www.bing.com/search?q={}&count={}",
|
||
url_encode(query),
|
||
max_results
|
||
)
|
||
};
|
||
|
||
let response = self.client
|
||
.get(&url)
|
||
.header("Accept", "text/html,application/xhtml+xml")
|
||
.header("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8")
|
||
.send()
|
||
.await
|
||
.map_err(|e| zclaw_types::ZclawError::HandError(
|
||
format!("Bing search failed: {}", e)
|
||
))?;
|
||
|
||
let status = response.status();
|
||
if !status.is_success() {
|
||
return Err(zclaw_types::ZclawError::HandError(
|
||
format!("Bing returned HTTP {}", status)
|
||
));
|
||
}
|
||
|
||
let html = response.text().await
|
||
.map_err(|e| zclaw_types::ZclawError::HandError(
|
||
format!("Failed to read Bing response: {}", e)
|
||
))?;
|
||
|
||
Ok(self.parse_bing_html(&html, max_results))
|
||
}
|
||
|
||
/// Parse Bing HTML search results page
|
||
fn parse_bing_html(&self, html: &str, max_results: usize) -> Vec<SearchResult> {
|
||
let mut results = Vec::new();
|
||
|
||
// Bing results are in <li class="b_algo">
|
||
for block in html.split("class=\"b_algo\"") {
|
||
if results.len() >= max_results {
|
||
break;
|
||
}
|
||
|
||
// Extract title from first <a> inside the block
|
||
let title = extract_between(block, ">", "</a>")
|
||
.map(|s| strip_html_tags(s).trim().to_string())
|
||
.unwrap_or_default();
|
||
|
||
// Extract URL from href attribute of first <a>
|
||
let url = extract_href(block).unwrap_or_default();
|
||
|
||
// Extract snippet from <div class="b_caption"><p>...</p> or <p>
|
||
let snippet = extract_between(block, "<p>", "</p>")
|
||
.or_else(|| extract_between(block, "b_caption", "</div>"))
|
||
.map(|s| strip_html_tags(s).trim().to_string())
|
||
.unwrap_or_default();
|
||
|
||
if title.is_empty() || url.is_empty() {
|
||
continue;
|
||
}
|
||
|
||
// Skip Bing internal URLs
|
||
if url.contains("bing.com/search") || url.contains("go.microsoft.com") {
|
||
continue;
|
||
}
|
||
|
||
if !is_quality_result(&title, &snippet, &url) {
|
||
continue;
|
||
}
|
||
|
||
results.push(SearchResult {
|
||
title,
|
||
url,
|
||
snippet,
|
||
source: "Bing".to_string(),
|
||
relevance: 75,
|
||
content: None,
|
||
fetched_at: Some(chrono::Utc::now().to_rfc3339()),
|
||
});
|
||
}
|
||
|
||
results
|
||
}
|
||
|
||
/// Search using Baidu (essential for Chinese content)
|
||
async fn search_baidu(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
|
||
let url = format!(
|
||
"https://www.baidu.com/s?wd={}&rn={}",
|
||
url_encode(query),
|
||
max_results
|
||
);
|
||
|
||
let response = self.client
|
||
.get(&url)
|
||
.header("Accept", "text/html,application/xhtml+xml")
|
||
.header("Accept-Language", "zh-CN,zh;q=0.9")
|
||
.send()
|
||
.await
|
||
.map_err(|e| zclaw_types::ZclawError::HandError(
|
||
format!("Baidu search failed: {}", e)
|
||
))?;
|
||
|
||
let status = response.status();
|
||
if !status.is_success() {
|
||
return Err(zclaw_types::ZclawError::HandError(
|
||
format!("Baidu returned HTTP {}", status)
|
||
));
|
||
}
|
||
|
||
let html = response.text().await
|
||
.map_err(|e| zclaw_types::ZclawError::HandError(
|
||
format!("Failed to read Baidu response: {}", e)
|
||
))?;
|
||
|
||
Ok(self.parse_baidu_html(&html, max_results))
|
||
}
|
||
|
||
/// Parse Baidu HTML search results page
|
||
fn parse_baidu_html(&self, html: &str, max_results: usize) -> Vec<SearchResult> {
|
||
let mut results = Vec::new();
|
||
|
||
// Baidu uses multiple class patterns: "result c-container", "c-container new-pmd", "result-op c-container"
|
||
let blocks: Vec<&str> = html.split("c-container")
|
||
.enumerate()
|
||
.filter_map(|(i, block)| {
|
||
if i == 0 { return None; }
|
||
if block.contains("href=\"http") { Some(block) } else { None }
|
||
})
|
||
.collect();
|
||
|
||
for block in &blocks {
|
||
if results.len() >= max_results {
|
||
break;
|
||
}
|
||
|
||
let title = extract_between(block, ">", "</a>")
|
||
.map(|s| strip_html_tags(s).trim().to_string())
|
||
.unwrap_or_default();
|
||
|
||
let url = extract_href(block).unwrap_or_default();
|
||
|
||
let snippet = extract_between(block, "c-abstract", "</div>")
|
||
.or_else(|| extract_between(block, "content-right_", "</div>"))
|
||
.map(|s| strip_html_tags(s).trim().to_string())
|
||
.unwrap_or_default();
|
||
|
||
if title.is_empty() || url.is_empty() {
|
||
continue;
|
||
}
|
||
|
||
if !is_quality_result(&title, &snippet, &url) {
|
||
continue;
|
||
}
|
||
|
||
results.push(SearchResult {
|
||
title,
|
||
url,
|
||
snippet,
|
||
source: "Baidu".to_string(),
|
||
relevance: 80,
|
||
content: None,
|
||
fetched_at: Some(chrono::Utc::now().to_rfc3339()),
|
||
});
|
||
}
|
||
|
||
results
|
||
}
|
||
|
||
/// Fetch content from a URL (with SSRF protection)
|
||
/// Tries Jina Reader API first for clean Markdown, falls back to direct fetch
|
||
async fn execute_fetch(&self, url: &str) -> Result<SearchResult> {
|
||
let start = std::time::Instant::now();
|
||
|
||
// SSRF validation
|
||
validate_fetch_url(url)?;
|
||
|
||
// Check cache first
|
||
{
|
||
let cache = self.cache.read().await;
|
||
if let Some(cached) = cache.get(url) {
|
||
if cached.content.is_some() {
|
||
return Ok(cached.clone());
|
||
}
|
||
}
|
||
}
|
||
|
||
// Try Jina Reader API first (returns clean Markdown)
|
||
let content = match self.fetch_via_jina(url).await {
|
||
Ok(text) => text,
|
||
Err(e) => {
|
||
tracing::warn!(target: "researcher", error = %e, "Jina Reader failed, falling back to direct fetch");
|
||
self.fetch_direct(url).await?
|
||
}
|
||
};
|
||
|
||
let result = SearchResult {
|
||
title: url.to_string(),
|
||
url: url.to_string(),
|
||
snippet: content.chars().take(500).collect(),
|
||
source: url.to_string(),
|
||
relevance: 100,
|
||
content: Some(content),
|
||
fetched_at: Some(chrono::Utc::now().to_rfc3339()),
|
||
};
|
||
|
||
// Cache the result (with capacity limit)
|
||
{
|
||
let mut cache = self.cache.write().await;
|
||
if cache.len() >= CACHE_MAX_ENTRIES {
|
||
// Simple eviction: remove first entry
|
||
if let Some(key) = cache.keys().next().cloned() {
|
||
cache.remove(&key);
|
||
}
|
||
}
|
||
cache.insert(url.to_string(), result.clone());
|
||
}
|
||
|
||
let duration = start.elapsed().as_millis() as u64;
|
||
tracing::info!(
|
||
target: "researcher",
|
||
url = url,
|
||
duration_ms = duration,
|
||
"Fetch completed"
|
||
);
|
||
|
||
Ok(result)
|
||
}
|
||
|
||
/// Fetch content via Jina Reader API — returns clean Markdown (DeerFlow pattern)
|
||
async fn fetch_via_jina(&self, url: &str) -> Result<String> {
|
||
let client = reqwest::Client::builder()
|
||
.timeout(std::time::Duration::from_secs(20))
|
||
.build()
|
||
.unwrap_or_else(|_| reqwest::Client::new());
|
||
|
||
let mut builder = client
|
||
.post("https://r.jina.ai/")
|
||
.header("Content-Type", "application/json")
|
||
.header("X-Return-Format", "markdown")
|
||
.header("X-Timeout", "15")
|
||
.json(&serde_json::json!({ "url": url }));
|
||
|
||
// Optional API key for higher rate limits
|
||
if let Some(ref key) = self.search_config.jina_api_key {
|
||
builder = builder.header("Authorization", format!("Bearer {}", key));
|
||
}
|
||
|
||
let response = builder.send().await
|
||
.map_err(|e| zclaw_types::ZclawError::HandError(
|
||
format!("Jina Reader request failed: {}", e)
|
||
))?;
|
||
|
||
let status = response.status();
|
||
if !status.is_success() {
|
||
return Err(zclaw_types::ZclawError::HandError(
|
||
format!("Jina Reader returned HTTP {}", status)
|
||
));
|
||
}
|
||
|
||
let text = response.text().await
|
||
.map_err(|e| zclaw_types::ZclawError::HandError(
|
||
format!("Failed to read Jina response: {}", e)
|
||
))?;
|
||
|
||
if text.trim().is_empty() {
|
||
return Err(zclaw_types::ZclawError::HandError(
|
||
"Jina Reader returned empty response".to_string()
|
||
));
|
||
}
|
||
|
||
// Truncate to 4096 chars (DeerFlow pattern)
|
||
let truncated: String = text.chars().take(4096).collect();
|
||
Ok(truncated)
|
||
}
|
||
|
||
/// Direct HTTP fetch with HTML text extraction (fallback when Jina unavailable)
|
||
async fn fetch_direct(&self, url: &str) -> Result<String> {
|
||
let response = self.client
|
||
.get(url)
|
||
.send()
|
||
.await
|
||
.map_err(|e| zclaw_types::ZclawError::HandError(format!("Direct fetch failed: {}", e)))?;
|
||
|
||
let content_type = response.headers()
|
||
.get(reqwest::header::CONTENT_TYPE)
|
||
.and_then(|v| v.to_str().ok())
|
||
.unwrap_or("");
|
||
|
||
let content = if content_type.contains("text/html") {
|
||
let html = response.text().await
|
||
.map_err(|e| zclaw_types::ZclawError::HandError(format!("Failed to read HTML: {}", e)))?;
|
||
self.extract_text_from_html(&html)
|
||
} else if content_type.contains("text/") || content_type.contains("application/json") {
|
||
response.text().await
|
||
.map_err(|e| zclaw_types::ZclawError::HandError(format!("Failed to read text: {}", e)))?
|
||
} else {
|
||
"[Binary content]".to_string()
|
||
};
|
||
|
||
Ok(content)
|
||
}
|
||
|
||
/// Extract readable text from HTML
|
||
fn extract_text_from_html(&self, html: &str) -> String {
|
||
let html_lower = html.to_lowercase();
|
||
let mut text = String::new();
|
||
let mut in_tag = false;
|
||
let mut in_script = false;
|
||
let mut in_style = false;
|
||
let mut pos: usize = 0;
|
||
|
||
for c in html.chars() {
|
||
let char_len = c.len_utf8();
|
||
match c {
|
||
'<' => {
|
||
// Check for closing tags before entering tag mode
|
||
let remaining = &html_lower[pos..];
|
||
if remaining.starts_with("</script") {
|
||
in_script = false;
|
||
} else if remaining.starts_with("</style") {
|
||
in_style = false;
|
||
}
|
||
// Check for opening tags
|
||
if remaining.starts_with("<script") {
|
||
in_script = true;
|
||
} else if remaining.starts_with("<style") {
|
||
in_style = true;
|
||
}
|
||
in_tag = true;
|
||
}
|
||
'>' => {
|
||
in_tag = false;
|
||
}
|
||
_ if in_tag => {}
|
||
_ if in_script || in_style => {}
|
||
' ' | '\n' | '\t' | '\r' => {
|
||
if !text.ends_with(' ') && !text.is_empty() {
|
||
text.push(' ');
|
||
}
|
||
}
|
||
_ => text.push(c),
|
||
}
|
||
pos += char_len;
|
||
}
|
||
|
||
if text.len() > 10000 {
|
||
text.truncate(10000);
|
||
text.push_str("...");
|
||
}
|
||
|
||
text.trim().to_string()
|
||
}
|
||
|
||
/// Generate a comprehensive research report
|
||
async fn execute_report(&self, query: &ResearchQuery) -> Result<ResearchReport> {
|
||
let start = std::time::Instant::now();
|
||
|
||
// First, execute search
|
||
let mut results = self.execute_search(query).await?;
|
||
|
||
// Fetch content for top results
|
||
let fetch_limit = match query.depth {
|
||
ResearchDepth::Quick => 1,
|
||
ResearchDepth::Standard => 3,
|
||
ResearchDepth::Deep => 5,
|
||
};
|
||
|
||
for result in results.iter_mut().take(fetch_limit) {
|
||
if !result.url.is_empty() {
|
||
match self.execute_fetch(&result.url).await {
|
||
Ok(fetched) => {
|
||
result.content = fetched.content;
|
||
result.fetched_at = fetched.fetched_at;
|
||
}
|
||
Err(e) => {
|
||
tracing::warn!(target: "researcher", error = %e, "Failed to fetch content");
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
// Extract key findings
|
||
let key_findings: Vec<String> = results.iter()
|
||
.take(5)
|
||
.filter_map(|r| {
|
||
r.content.as_ref().map(|c| {
|
||
c.split(". ")
|
||
.take(3)
|
||
.collect::<Vec<_>>()
|
||
.join(". ")
|
||
})
|
||
})
|
||
.collect();
|
||
|
||
// Extract related topics from snippets
|
||
let related_topics: Vec<String> = results.iter()
|
||
.filter_map(|r| {
|
||
if r.snippet.len() > 50 {
|
||
Some(r.title.clone())
|
||
} else {
|
||
None
|
||
}
|
||
})
|
||
.take(5)
|
||
.collect();
|
||
|
||
let duration = start.elapsed().as_millis() as u64;
|
||
|
||
// Generate summary from top results
|
||
let summary = if results.is_empty() {
|
||
"未找到相关结果,建议调整搜索关键词后重试".to_string()
|
||
} else {
|
||
let top_snippets: Vec<&str> = results
|
||
.iter()
|
||
.take(3)
|
||
.filter_map(|r| {
|
||
let s = r.snippet.trim();
|
||
if s.is_empty() { None } else { Some(s) }
|
||
})
|
||
.collect();
|
||
if top_snippets.is_empty() {
|
||
format!("找到 {} 条相关结果,但无摘要信息", results.len())
|
||
} else {
|
||
format!(
|
||
"基于 {} 条搜索结果:{}",
|
||
results.len(),
|
||
top_snippets.join(";")
|
||
)
|
||
}
|
||
};
|
||
|
||
Ok(ResearchReport {
|
||
query: query.query.clone(),
|
||
results,
|
||
summary: Some(summary),
|
||
key_findings,
|
||
related_topics,
|
||
researched_at: chrono::Utc::now().to_rfc3339(),
|
||
duration_ms: duration,
|
||
})
|
||
}
|
||
}
|
||
|
||
impl Default for ResearcherHand {
|
||
fn default() -> Self {
|
||
Self::new()
|
||
}
|
||
}
|
||
|
||
#[async_trait]
|
||
impl Hand for ResearcherHand {
|
||
fn config(&self) -> &HandConfig {
|
||
&self.config
|
||
}
|
||
|
||
async fn execute(&self, _context: &HandContext, input: Value) -> Result<HandResult> {
|
||
tracing::debug!(target: "researcher", input = %input, "Researcher hand received input");
|
||
// Try strict deserialization first, then fall back to inference
|
||
let action: ResearcherAction = match serde_json::from_value(input.clone()) {
|
||
Ok(a) => a,
|
||
Err(e) => {
|
||
tracing::debug!(target: "researcher", error = %e, input = %input, "Strict deserialization failed, trying inference");
|
||
Self::infer_action(&input)?
|
||
}
|
||
};
|
||
|
||
let start = std::time::Instant::now();
|
||
|
||
let result = match action {
|
||
ResearcherAction::Search { query } => {
|
||
let results = self.execute_search(&query).await?;
|
||
json!({
|
||
"action": "search",
|
||
"query": query.query,
|
||
"results": results,
|
||
"duration_ms": start.elapsed().as_millis()
|
||
})
|
||
}
|
||
ResearcherAction::Fetch { url } => {
|
||
let result = self.execute_fetch(&url).await?;
|
||
json!({
|
||
"action": "fetch",
|
||
"url": url,
|
||
"result": result,
|
||
"duration_ms": start.elapsed().as_millis()
|
||
})
|
||
}
|
||
ResearcherAction::Summarize { urls } => {
|
||
let mut results = Vec::new();
|
||
for url in urls.iter().take(5) {
|
||
if let Ok(result) = self.execute_fetch(url).await {
|
||
results.push(result);
|
||
}
|
||
}
|
||
json!({
|
||
"action": "summarize",
|
||
"urls": urls,
|
||
"results": results,
|
||
"duration_ms": start.elapsed().as_millis()
|
||
})
|
||
}
|
||
ResearcherAction::Report { query } => {
|
||
let report = self.execute_report(&query).await?;
|
||
json!({
|
||
"action": "report",
|
||
"report": report
|
||
})
|
||
}
|
||
};
|
||
|
||
Ok(HandResult::success(result))
|
||
}
|
||
|
||
fn needs_approval(&self) -> bool {
|
||
false // Research operations are generally safe
|
||
}
|
||
|
||
fn check_dependencies(&self) -> Result<Vec<String>> {
|
||
// Network connectivity will be checked at runtime
|
||
Ok(Vec::new())
|
||
}
|
||
|
||
fn status(&self) -> crate::HandStatus {
|
||
crate::HandStatus::Idle
|
||
}
|
||
}
|
||
|
||
/// Percent-encode `s` for use inside a URL.
///
/// Encoding is applied to the UTF-8 bytes of the input, not to Unicode code
/// points: ASCII letters, digits and `-`, `_`, `.`, `~` are copied as-is and
/// every other byte is written as `%XX` with uppercase hex digits.
fn url_encode(s: &str) -> String {
    const HEX: &[u8; 16] = b"0123456789ABCDEF";

    let mut encoded = String::with_capacity(s.len());
    for byte in s.bytes() {
        let unreserved =
            byte.is_ascii_alphanumeric() || matches!(byte, b'-' | b'_' | b'.' | b'~');
        if unreserved {
            encoded.push(char::from(byte));
        } else {
            encoded.push('%');
            encoded.push(char::from(HEX[usize::from(byte >> 4)]));
            encoded.push(char::from(HEX[usize::from(byte & 0x0F)]));
        }
    }
    encoded
}
|
||
|
||
/// Report whether `c` lies in one of the CJK code-point ranges listed below
/// (ideographs, CJK punctuation, fullwidth forms, radicals, compatibility
/// ideographs).
fn is_cjk_char(c: char) -> bool {
    // Inclusive (first, last) code-point pairs.
    const CJK_RANGES: [(char, char); 6] = [
        ('\u{4E00}', '\u{9FFF}'), // CJK Unified Ideographs
        ('\u{3400}', '\u{4DBF}'), // CJK Unified Ideographs Extension A
        ('\u{3000}', '\u{303F}'), // CJK Symbols and Punctuation
        ('\u{FF00}', '\u{FFEF}'), // Fullwidth Forms
        ('\u{2E80}', '\u{2EFF}'), // CJK Radicals Supplement
        ('\u{F900}', '\u{FAFF}'), // CJK Compatibility Ideographs
    ];

    CJK_RANGES
        .iter()
        .any(|&(first, last)| (first..=last).contains(&c))
}
|
||
|
||
/// Validate a URL for SSRF safety before fetching
|
||
fn validate_fetch_url(url_str: &str) -> Result<()> {
|
||
if url_str.len() > MAX_URL_LENGTH {
|
||
return Err(zclaw_types::ZclawError::HandError(
|
||
format!("URL exceeds maximum length of {} characters", MAX_URL_LENGTH)
|
||
));
|
||
}
|
||
|
||
let url = Url::parse(url_str)
|
||
.map_err(|e| zclaw_types::ZclawError::HandError(format!("Invalid URL: {}", e)))?;
|
||
|
||
match url.scheme() {
|
||
"http" | "https" => {}
|
||
scheme => {
|
||
return Err(zclaw_types::ZclawError::HandError(
|
||
format!("URL scheme '{}' not allowed, only http/https", scheme)
|
||
));
|
||
}
|
||
}
|
||
|
||
let host = url.host_str()
|
||
.ok_or_else(|| zclaw_types::ZclawError::HandError("URL must have a host".into()))?;
|
||
|
||
// Strip IPv6 brackets for parsing
|
||
let host_for_parsing = if host.starts_with('[') && host.ends_with(']') {
|
||
&host[1..host.len()-1]
|
||
} else {
|
||
host
|
||
};
|
||
|
||
if let Ok(ip) = host_for_parsing.parse::<IpAddr>() {
|
||
validate_ip(&ip)?;
|
||
} else {
|
||
validate_hostname(host)?;
|
||
}
|
||
|
||
Ok(())
|
||
}
|
||
|
||
fn validate_ip(ip: &IpAddr) -> Result<()> {
|
||
match ip {
|
||
IpAddr::V4(v4) => validate_ipv4(v4),
|
||
IpAddr::V6(v6) => validate_ipv6(v6),
|
||
}
|
||
}
|
||
|
||
fn validate_ipv4(ip: &Ipv4Addr) -> Result<()> {
|
||
let o = ip.octets();
|
||
if o[0] == 127 { return Err(ssrf_err("loopback")); }
|
||
if o[0] == 10 { return Err(ssrf_err("private 10.x.x.x")); }
|
||
if o[0] == 172 && (16..=31).contains(&o[1]) { return Err(ssrf_err("private 172.16-31.x.x")); }
|
||
if o[0] == 192 && o[1] == 168 { return Err(ssrf_err("private 192.168.x.x")); }
|
||
if o[0] == 169 && o[1] == 254 { return Err(ssrf_err("link-local/metadata")); }
|
||
if o[0] == 0 { return Err(ssrf_err("0.x.x.x")); }
|
||
if *ip == Ipv4Addr::new(255, 255, 255, 255) { return Err(ssrf_err("broadcast")); }
|
||
if (224..=239).contains(&o[0]) { return Err(ssrf_err("multicast")); }
|
||
Ok(())
|
||
}
|
||
|
||
fn validate_ipv6(ip: &Ipv6Addr) -> Result<()> {
|
||
if *ip == Ipv6Addr::LOCALHOST { return Err(ssrf_err("IPv6 loopback")); }
|
||
if *ip == Ipv6Addr::UNSPECIFIED { return Err(ssrf_err("IPv6 unspecified")); }
|
||
let segs = ip.segments();
|
||
// IPv4-mapped: ::ffff:x.x.x.x
|
||
if segs[5] == 0xffff {
|
||
let v4 = ((segs[6] as u32) << 16) | (segs[7] as u32);
|
||
validate_ipv4(&Ipv4Addr::from(v4))?;
|
||
}
|
||
// Link-local fe80::/10
|
||
if (segs[0] & 0xffc0) == 0xfe80 { return Err(ssrf_err("IPv6 link-local")); }
|
||
// Unique local fc00::/7
|
||
if (segs[0] & 0xfe00) == 0xfc00 { return Err(ssrf_err("IPv6 unique local")); }
|
||
Ok(())
|
||
}
|
||
|
||
fn validate_hostname(host: &str) -> Result<()> {
|
||
let h = host.to_lowercase();
|
||
let blocked = [
|
||
"localhost", "localhost.localdomain", "ip6-localhost",
|
||
"ip6-loopback", "metadata.google.internal", "metadata",
|
||
"kubernetes.default", "kubernetes.default.svc",
|
||
];
|
||
for b in &blocked {
|
||
if h == *b || h.ends_with(&format!(".{}", b)) {
|
||
return Err(ssrf_err(&format!("blocked host '{}'", host)));
|
||
}
|
||
}
|
||
// Decimal IP bypass: 2130706433 = 127.0.0.1
|
||
if h.chars().all(|c| c.is_ascii_digit()) {
|
||
if let Ok(num) = h.parse::<u32>() {
|
||
validate_ipv4(&Ipv4Addr::from(num))?;
|
||
}
|
||
}
|
||
Ok(())
|
||
}
|
||
|
||
/// Build the `HandError` used for every SSRF rejection; the message is
/// `"Access denied: "` followed by `reason`.
fn ssrf_err(reason: &str) -> zclaw_types::ZclawError {
    let message = format!("Access denied: {}", reason);
    zclaw_types::ZclawError::HandError(message)
}
|
||
|
||
/// Return the slice of `text` that sits between the first occurrence of
/// `start` and the next occurrence of `end` after it, or `None` when either
/// delimiter is missing.
fn extract_between<'a>(text: &'a str, start: &str, end: &str) -> Option<&'a str> {
    let (_, after_start) = text.split_once(start)?;
    after_start.split_once(end).map(|(inner, _)| inner)
}
|
||
|
||
/// Strip HTML tags from a string.
///
/// Everything between `<` and `>` is removed (the text content of elements,
/// including `<script>` bodies, is kept). Common HTML entities are then
/// decoded and all whitespace runs are collapsed to single spaces, with
/// leading and trailing whitespace removed.
fn strip_html_tags(s: &str) -> String {
    let mut result = String::with_capacity(s.len());
    let mut in_tag = false;
    for c in s.chars() {
        match c {
            '<' => in_tag = true,
            '>' => in_tag = false,
            _ if !in_tag => result.push(c),
            _ => {}
        }
    }

    // Decode common HTML entities. `&amp;` goes last so that an escaped
    // entity such as `&amp;lt;` becomes the literal text `&lt;` instead of
    // being unescaped twice into `<`.
    let decoded = result
        .replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&quot;", "\"")
        .replace("&#39;", "'")
        .replace("&#x27;", "'")
        .replace("&apos;", "'")
        .replace("&nbsp;", " ")
        .replace("&#x2F;", "/")
        .replace("&amp;", "&");

    // Collapse whitespace
    decoded.split_whitespace().collect::<Vec<_>>().join(" ")
}
|
||
|
||
/// Heuristically decide whether a parsed search hit is a real result rather
/// than navigation, advertising or leaked script/style text.
///
/// A hit is rejected when the trimmed title is shorter than 2 or longer than
/// 300 bytes, when the title contains code/tracking markers, when the URL is
/// a `javascript:` / `data:` link, a fragment or a relative path, or when
/// the snippet looks like JavaScript.
fn is_quality_result(title: &str, snippet: &str, url: &str) -> bool {
    // Substrings that indicate script, style or tracking text in a title.
    const TITLE_MARKERS: [&str; 13] = [
        "function(", "var ", "const ", "window.", "document.", "{", "}",
        "cookie", "navigator.", ".css", "stylesheet", "google-analytics", "gtag",
    ];
    // Titles that start like a code comment.
    const TITLE_COMMENT_PREFIXES: [&str; 2] = ["//", "/*"];

    // Title quality checks
    let title = title.trim();
    if !(2..=300).contains(&title.len()) {
        return false;
    }

    let title_lower = title.to_lowercase();
    let title_is_code = TITLE_MARKERS
        .iter()
        .any(|&marker| title_lower.contains(marker))
        || TITLE_COMMENT_PREFIXES
            .iter()
            .any(|&prefix| title_lower.starts_with(prefix));
    if title_is_code {
        return false;
    }

    // URL quality checks
    let is_pseudo_scheme = url.contains("javascript:") || url.contains("data:");
    let is_fragment = url.starts_with('#');
    // Protocol-relative URLs ("//host/...") are allowed, plain paths are not.
    let is_relative_path = url.starts_with('/') && !url.starts_with("//");
    if is_pseudo_scheme || is_fragment || is_relative_path {
        return false;
    }

    // Snippet quality: reject snippets that read like code
    let snippet_lower = snippet.to_lowercase();
    let has_function_body =
        snippet_lower.contains("function(") && snippet_lower.contains("return ");
    let has_var_assignment = snippet_lower.contains("var ") && snippet_lower.contains("=");

    !(has_function_body || has_var_assignment)
}
|
||
|
||
/// Pull the value of the first `href="..."` attribute out of `text`.
///
/// Values starting with `http` are returned unchanged, protocol-relative
/// values (`//host/...`) get an `https:` prefix, and anything else — or a
/// missing / unterminated attribute — yields `None`.
fn extract_href(text: &str) -> Option<String> {
    let (_, after_attr) = text.split_once("href=\"")?;
    let (link, _) = after_attr.split_once('"')?;

    if link.starts_with("http") {
        return Some(link.to_string());
    }
    if link.starts_with("//") {
        return Some(format!("https:{}", link));
    }
    None
}
|
||
|
||
/// Extract the real URL from DDG's redirect link (uddg= parameter)
|
||
fn extract_href_uddg(text: &str) -> Option<String> {
|
||
if let Some(idx) = text.find("uddg=") {
|
||
let rest = &text[idx + 5..];
|
||
let url_encoded = rest.split('&').next().unwrap_or("");
|
||
// Use standard percent decoding instead of manual replacement
|
||
let decoded = percent_decode(url_encoded);
|
||
if decoded.starts_with("http") {
|
||
return Some(decoded);
|
||
}
|
||
}
|
||
|
||
// Fallback: try regular href extraction
|
||
extract_href(text)
|
||
}
|
||
|
||
/// Percent-decode a URL-encoded string.
///
/// Every `%XX` sequence with two hexadecimal digits is replaced by the byte
/// it encodes; a `%` that is not followed by two hex digits is copied
/// through unchanged. The decoded bytes are interpreted as UTF-8, with
/// invalid sequences replaced by U+FFFD.
///
/// Decoding works on raw bytes, so it cannot panic on inputs where a `%` is
/// followed by multi-byte characters, and sign characters are never
/// mistaken for hex digits.
fn percent_decode(input: &str) -> String {
    /// Value of a single ASCII hex digit, or `None` for any other byte.
    fn hex_value(byte: u8) -> Option<u8> {
        match byte {
            b'0'..=b'9' => Some(byte - b'0'),
            b'a'..=b'f' => Some(byte - b'a' + 10),
            b'A'..=b'F' => Some(byte - b'A' + 10),
            _ => None,
        }
    }

    let bytes = input.as_bytes();
    let mut decoded = Vec::with_capacity(bytes.len());
    let mut i = 0;
    while i < bytes.len() {
        if bytes[i] == b'%' {
            let high = bytes.get(i + 1).copied().and_then(hex_value);
            let low = bytes.get(i + 2).copied().and_then(hex_value);
            if let (Some(high), Some(low)) = (high, low) {
                decoded.push((high << 4) | low);
                i += 3;
                continue;
            }
        }
        decoded.push(bytes[i]);
        i += 1;
    }
    String::from_utf8_lossy(&decoded).into_owned()
}
|
||
|
||
#[cfg(test)]
|
||
mod tests {
|
||
use super::*;
|
||
|
||
fn create_test_hand() -> ResearcherHand {
|
||
ResearcherHand::new()
|
||
}
|
||
|
||
fn test_context() -> HandContext {
|
||
HandContext::default()
|
||
}
|
||
|
||
// --- Config & Type Tests ---
|
||
|
||
#[test]
|
||
fn test_config_id() {
|
||
let hand = create_test_hand();
|
||
assert_eq!(hand.config().id, "researcher");
|
||
assert_eq!(hand.config().name, "研究员");
|
||
assert!(hand.config().enabled);
|
||
assert!(!hand.config().needs_approval);
|
||
}
|
||
|
||
#[test]
|
||
fn test_search_engine_default_is_auto() {
|
||
let engine = SearchEngine::default();
|
||
assert!(matches!(engine, SearchEngine::Auto));
|
||
}
|
||
|
||
#[test]
|
||
fn test_search_engine_searxng_deserialize() {
|
||
let engine: SearchEngine = serde_json::from_str("\"searxng\"").unwrap();
|
||
assert!(matches!(engine, SearchEngine::SearXNG));
|
||
}
|
||
|
||
#[test]
|
||
fn test_research_depth_default_is_standard() {
|
||
let depth = ResearchDepth::default();
|
||
assert!(matches!(depth, ResearchDepth::Standard));
|
||
}
|
||
|
||
#[test]
|
||
fn test_research_depth_serialize() {
|
||
let json = serde_json::to_string(&ResearchDepth::Deep).unwrap();
|
||
assert_eq!(json, "\"deep\"");
|
||
}
|
||
|
||
#[test]
|
||
fn test_research_depth_deserialize() {
|
||
let depth: ResearchDepth = serde_json::from_str("\"quick\"").unwrap();
|
||
assert!(matches!(depth, ResearchDepth::Quick));
|
||
}
|
||
|
||
#[test]
|
||
fn test_search_engine_serialize_roundtrip() {
|
||
for engine in [SearchEngine::SearXNG, SearchEngine::Google, SearchEngine::Bing, SearchEngine::DuckDuckGo, SearchEngine::Auto] {
|
||
let json = serde_json::to_string(&engine).unwrap();
|
||
let back: SearchEngine = serde_json::from_str(&json).unwrap();
|
||
assert_eq!(json, serde_json::to_string(&back).unwrap());
|
||
}
|
||
}
|
||
|
||
// --- Action Deserialization Tests ---
|
||
|
||
#[test]
|
||
fn test_action_search_deserialize() {
|
||
let json = json!({
|
||
"action": "search",
|
||
"query": {
|
||
"query": "Rust programming",
|
||
"engine": "duckduckgo",
|
||
"depth": "quick",
|
||
"maxResults": 5
|
||
}
|
||
});
|
||
let action: ResearcherAction = serde_json::from_value(json).unwrap();
|
||
match action {
|
||
ResearcherAction::Search { query } => {
|
||
assert_eq!(query.query, "Rust programming");
|
||
assert!(matches!(query.engine, SearchEngine::DuckDuckGo));
|
||
assert!(matches!(query.depth, ResearchDepth::Quick));
|
||
assert_eq!(query.max_results, 5);
|
||
}
|
||
_ => panic!("Expected Search action"),
|
||
}
|
||
}
|
||
|
||
#[test]
|
||
fn test_action_fetch_deserialize() {
|
||
let json = json!({
|
||
"action": "fetch",
|
||
"url": "https://example.com/page"
|
||
});
|
||
let action: ResearcherAction = serde_json::from_value(json).unwrap();
|
||
match action {
|
||
ResearcherAction::Fetch { url } => {
|
||
assert_eq!(url, "https://example.com/page");
|
||
}
|
||
_ => panic!("Expected Fetch action"),
|
||
}
|
||
}
|
||
|
||
#[test]
|
||
fn test_action_report_deserialize() {
|
||
let json = json!({
|
||
"action": "report",
|
||
"query": {
|
||
"query": "AI trends 2026",
|
||
"depth": "deep"
|
||
}
|
||
});
|
||
let action: ResearcherAction = serde_json::from_value(json).unwrap();
|
||
match action {
|
||
ResearcherAction::Report { query } => {
|
||
assert_eq!(query.query, "AI trends 2026");
|
||
assert!(matches!(query.depth, ResearchDepth::Deep));
|
||
}
|
||
_ => panic!("Expected Report action"),
|
||
}
|
||
}
|
||
|
||
#[test]
|
||
fn test_action_invalid_rejected() {
|
||
let json = json!({
|
||
"action": "unknown_action",
|
||
"data": "whatever"
|
||
});
|
||
let result: std::result::Result<ResearcherAction, _> = serde_json::from_value(json);
|
||
assert!(result.is_err());
|
||
}
|
||
|
||
// --- URL Encoding Tests ---
|
||
|
||
#[test]
|
||
fn test_url_encode_ascii() {
|
||
assert_eq!(url_encode("hello world"), "hello%20world");
|
||
}
|
||
|
||
#[test]
|
||
fn test_url_encode_chinese() {
|
||
// "医" = UTF-8 bytes E5 8C BB → must produce %E5%8C%BB, not %533B
|
||
let encoded = url_encode("医");
|
||
assert_eq!(encoded, "%E5%8C%BB");
|
||
|
||
// Full phrase: "中文" = E4 B8 AD E6 96 87
|
||
let encoded = url_encode("中文搜索");
|
||
assert_eq!(&encoded[0..9], "%E4%B8%AD");
|
||
assert!(!encoded.contains("中文"));
|
||
}
|
||
|
||
#[test]
|
||
fn test_url_encode_safe_chars() {
|
||
assert_eq!(url_encode("abc123-_."), "abc123-_.".to_string());
|
||
}
|
||
|
||
#[test]
|
||
fn test_url_encode_empty() {
|
||
assert_eq!(url_encode(""), "");
|
||
}
|
||
|
||
// --- HTML Text Extraction Tests ---
|
||
|
||
#[test]
|
||
fn test_extract_text_basic() {
|
||
let hand = create_test_hand();
|
||
let html = "<html><body><h1>Title</h1><p>Content here</p></body></html>";
|
||
let text = hand.extract_text_from_html(html);
|
||
assert!(text.contains("Title"));
|
||
assert!(text.contains("Content here"));
|
||
}
|
||
|
||
#[test]
|
||
fn test_extract_text_strips_scripts() {
|
||
let hand = create_test_hand();
|
||
let html = "<html><body><script>alert('xss')</script><p>Safe text</p></body></html>";
|
||
let text = hand.extract_text_from_html(html);
|
||
assert!(!text.contains("alert"));
|
||
assert!(text.contains("Safe text"));
|
||
}
|
||
|
||
#[test]
|
||
fn test_extract_text_strips_styles() {
|
||
let hand = create_test_hand();
|
||
let html = "<html><body><style>.class{color:red}</style><p>Visible</p></body></html>";
|
||
let text = hand.extract_text_from_html(html);
|
||
assert!(!text.contains("color"));
|
||
assert!(text.contains("Visible"));
|
||
}
|
||
|
||
#[test]
|
||
fn test_extract_text_truncates_long_content() {
|
||
let hand = create_test_hand();
|
||
let long_body: String = "x".repeat(20000);
|
||
let html = format!("<html><body><p>{}</p></body></html>", long_body);
|
||
let text = hand.extract_text_from_html(&html);
|
||
assert!(text.len() <= 10003); // 10000 + "..."
|
||
}
|
||
|
||
#[test]
|
||
fn test_extract_text_empty_body() {
|
||
let hand = create_test_hand();
|
||
let html = "<html><body></body></html>";
|
||
let text = hand.extract_text_from_html(html);
|
||
assert!(text.is_empty());
|
||
}
|
||
|
||
// --- Hand Trait Tests ---
|
||
|
||
#[tokio::test]
|
||
async fn test_needs_approval_is_false() {
|
||
let hand = create_test_hand();
|
||
assert!(!hand.needs_approval());
|
||
}
|
||
|
||
#[tokio::test]
|
||
async fn test_status_is_idle() {
|
||
let hand = create_test_hand();
|
||
assert!(matches!(hand.status(), crate::HandStatus::Idle));
|
||
}
|
||
|
||
#[tokio::test]
|
||
async fn test_check_dependencies_ok() {
|
||
let hand = create_test_hand();
|
||
let missing = hand.check_dependencies().unwrap();
|
||
// Default is_dependency_available returns true for all
|
||
assert!(missing.is_empty());
|
||
}
|
||
|
||
// --- Default Values Tests ---
|
||
|
||
#[test]
|
||
fn test_research_query_defaults() {
|
||
let json = json!({ "query": "test" });
|
||
let query: ResearchQuery = serde_json::from_value(json).unwrap();
|
||
assert_eq!(query.query, "test");
|
||
assert!(matches!(query.engine, SearchEngine::Auto));
|
||
assert!(matches!(query.depth, ResearchDepth::Standard));
|
||
assert_eq!(query.max_results, 10);
|
||
assert_eq!(query.time_limit_secs, 60);
|
||
assert!(!query.include_related);
|
||
}
|
||
|
||
#[test]
|
||
fn test_search_result_serialization() {
|
||
let result = SearchResult {
|
||
title: "Test".to_string(),
|
||
url: "https://example.com".to_string(),
|
||
snippet: "A snippet".to_string(),
|
||
source: "TestSource".to_string(),
|
||
relevance: 90,
|
||
content: None,
|
||
fetched_at: None,
|
||
};
|
||
let json = serde_json::to_string(&result).unwrap();
|
||
assert!(json.contains("Test"));
|
||
assert!(json.contains("https://example.com"));
|
||
}
|
||
|
||
#[test]
|
||
fn test_research_report_summary_is_some_when_results() {
|
||
// Verify the struct allows Some value
|
||
let report = ResearchReport {
|
||
query: "test".to_string(),
|
||
results: vec![SearchResult {
|
||
title: "R".to_string(),
|
||
url: "https://r.co".to_string(),
|
||
snippet: "snippet text".to_string(),
|
||
source: "S".to_string(),
|
||
relevance: 80,
|
||
content: None,
|
||
fetched_at: None,
|
||
}],
|
||
summary: Some("基于 1 条搜索结果:snippet text".to_string()),
|
||
key_findings: vec![],
|
||
related_topics: vec![],
|
||
researched_at: "2026-01-01T00:00:00Z".to_string(),
|
||
duration_ms: 100,
|
||
};
|
||
assert!(report.summary.is_some());
|
||
assert!(report.summary.unwrap().contains("snippet text"));
|
||
}
|
||
|
||
// --- SearchConfig Tests ---
|
||
|
||
#[test]
|
||
fn test_search_config_default() {
|
||
let config = SearchConfig::default();
|
||
assert!(matches!(config.default_engine, SearchEngine::Auto));
|
||
assert_eq!(config.searxng_url, "http://localhost:8888");
|
||
assert_eq!(config.timeout_secs, 15);
|
||
}
|
||
|
||
#[test]
|
||
fn test_search_config_load_fallback_on_missing_file() {
|
||
// Config loads from config/config.toml which may not exist in test CWD
|
||
let config = SearchConfig::load();
|
||
// Should return a valid config either way
|
||
assert!(!config.searxng_url.is_empty());
|
||
}
|
||
|
||
// --- SearXNG Response Parsing Tests ---
|
||
|
||
#[test]
|
||
fn test_searxng_response_parse() {
|
||
let mock_response = json!({
|
||
"query": "Rust programming",
|
||
"number_of_results": 42,
|
||
"results": [
|
||
{
|
||
"url": "https://www.rust-lang.org/",
|
||
"title": "Rust Programming Language",
|
||
"content": "A language empowering everyone to build reliable software.",
|
||
"engine": "google",
|
||
"engines": ["google", "duckduckgo"],
|
||
"score": 5.2,
|
||
"category": "general"
|
||
},
|
||
{
|
||
"url": "https://doc.rust-lang.org/book/",
|
||
"title": "The Rust Book",
|
||
"content": "The official guide to Rust programming.",
|
||
"engine": "bing",
|
||
"engines": ["bing"],
|
||
"score": 3.1,
|
||
"category": "general"
|
||
}
|
||
],
|
||
"suggestions": ["rust tutorial", "rust vs go"]
|
||
});
|
||
|
||
let results = mock_response.get("results").unwrap().as_array().unwrap();
|
||
assert_eq!(results.len(), 2);
|
||
|
||
// Verify first result mapping
|
||
let r0 = &results[0];
|
||
assert_eq!(r0["title"].as_str().unwrap(), "Rust Programming Language");
|
||
assert_eq!(r0["url"].as_str().unwrap(), "https://www.rust-lang.org/");
|
||
assert_eq!(r0["content"].as_str().unwrap(), "A language empowering everyone to build reliable software.");
|
||
|
||
let engines: Vec<&str> = r0["engines"].as_array().unwrap()
|
||
.iter().filter_map(|e| e.as_str()).collect();
|
||
assert_eq!(engines, vec!["google", "duckduckgo"]);
|
||
}
|
||
|
||
#[test]
|
||
fn test_searxng_empty_results() {
|
||
let mock_response = json!({
|
||
"query": "nonexistent xyzzy123",
|
||
"number_of_results": 0,
|
||
"results": [],
|
||
"suggestions": []
|
||
});
|
||
|
||
let results = mock_response.get("results").unwrap().as_array().unwrap();
|
||
assert!(results.is_empty());
|
||
}
|
||
|
||
#[test]
|
||
fn test_searxng_score_normalization() {
|
||
// Score 5.2 → (5.2 * 10) = 52 → relevance 52
|
||
let score = 5.2_f64;
|
||
let relevance = if score > 0.0 {
|
||
(score.min(10.0) * 10.0) as u8
|
||
} else {
|
||
50
|
||
};
|
||
assert_eq!(relevance, 52);
|
||
|
||
// Score 15.0 → clamped to 10.0 → relevance 100
|
||
let score = 15.0_f64;
|
||
let relevance = if score > 0.0 {
|
||
(score.min(10.0) * 10.0) as u8
|
||
} else {
|
||
50
|
||
};
|
||
assert_eq!(relevance, 100);
|
||
|
||
// Score 0.0 → default relevance 50
|
||
let score = 0.0_f64;
|
||
let relevance = if score > 0.0 {
|
||
(score.min(10.0) * 10.0) as u8
|
||
} else {
|
||
50
|
||
};
|
||
assert_eq!(relevance, 50);
|
||
}
|
||
|
||
#[test]
|
||
fn test_searxng_url_construction() {
|
||
let config = SearchConfig::default();
|
||
let query = "2024年中国医疗政策";
|
||
let url = format!(
|
||
"{}/search?q={}&format=json&categories=general&language=auto&pageno=1",
|
||
config.searxng_url.trim_end_matches('/'),
|
||
url_encode(query)
|
||
);
|
||
assert!(url.starts_with("http://localhost:8888/search?"));
|
||
assert!(url.contains("format=json"));
|
||
assert!(url.contains("categories=general"));
|
||
assert!(url.contains("%E4%B8%AD")); // 中 = E4 B8 AD
|
||
assert!(!url.contains("%4E2D")); // NOT Unicode codepoint
|
||
}
|
||
|
||
// --- Native Search Helper Tests ---
|
||
|
||
#[test]
|
||
fn test_is_cjk_char_chinese() {
|
||
assert!(is_cjk_char('中'));
|
||
assert!(is_cjk_char('医'));
|
||
assert!(is_cjk_char('。'));
|
||
assert!(!is_cjk_char('a'));
|
||
assert!(!is_cjk_char('1'));
|
||
assert!(!is_cjk_char(' '));
|
||
}
|
||
|
||
#[test]
|
||
fn test_is_cjk_char_detects_chinese_query() {
|
||
let query = "2024年中国医疗政策";
|
||
assert!(query.chars().any(|c| is_cjk_char(c)));
|
||
|
||
let query_en = "Rust programming language";
|
||
assert!(!query_en.chars().any(|c| is_cjk_char(c)));
|
||
}
|
||
|
||
#[test]
fn test_strip_html_tags() {
    assert_eq!(strip_html_tags("<b>Hello</b>"), "Hello");
    assert_eq!(strip_html_tags("<a href=\"x\">Link</a>"), "Link");
    assert_eq!(strip_html_tags("plain text"), "plain text");
    // Entities are written out in escaped form here; a literal "&<>" would
    // be parsed as the start of a tag and could never round-trip.
    assert_eq!(strip_html_tags("&amp;&lt;&gt;"), "&<>");
    // strip_html_tags only removes tags, not script content
    assert_eq!(strip_html_tags("<script>alert()</script>Safe"), "alert()Safe");
}
|
||
|
||
#[test]
|
||
fn test_extract_between_basic() {
|
||
let text = "prefix<div>content</div>suffix";
|
||
assert_eq!(extract_between(text, "<div>", "</div>"), Some("content"));
|
||
}
|
||
|
||
#[test]
|
||
fn test_extract_between_not_found() {
|
||
let text = "no delimiters here";
|
||
assert_eq!(extract_between(text, "<div>", "</div>"), None);
|
||
}
|
||
|
||
#[test]
|
||
fn test_extract_href() {
|
||
let text = r#"<a href="https://example.com/page">Title</a>"#;
|
||
assert_eq!(extract_href(text), Some("https://example.com/page".to_string()));
|
||
}
|
||
|
||
#[test]
|
||
fn test_extract_href_protocol_relative() {
|
||
let text = r#"<a href="//example.com/page">Title</a>"#;
|
||
assert_eq!(extract_href(text), Some("https://example.com/page".to_string()));
|
||
}
|
||
|
||
#[test]
|
||
fn test_extract_href_uddg() {
|
||
let text = r#"href="//duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.com%2Fpage&rut=abc""#;
|
||
assert_eq!(extract_href_uddg(text), Some("https://example.com/page".to_string()));
|
||
}
|
||
|
||
#[test]
|
||
fn test_extract_href_uddg_fallback() {
|
||
let text = r#"<a href="https://example.com/direct">Title</a>"#;
|
||
assert_eq!(extract_href_uddg(text), Some("https://example.com/direct".to_string()));
|
||
}
|
||
|
||
// --- HTML Parser Tests ---
|
||
|
||
#[test]
|
||
fn test_parse_ddg_html() {
|
||
let hand = create_test_hand();
|
||
let html = r#"
|
||
<div class="result__body">
|
||
<a rel="nofollow" class="result__a" href="//duckduckgo.com/l/?uddg=https%3A%2F%2Frust-lang.org&rut=abc">Rust Programming Language</a>
|
||
<a class="result__snippet">A systems programming language focused on safety and speed.</a>
|
||
</div>
|
||
<div class="result__body">
|
||
<a rel="nofollow" class="result__a" href="//duckduckgo.com/l/?uddg=https%3A%2F%2Fdoc.rust-lang.org&rut=def">The Rust Book</a>
|
||
<a class="result__snippet">The official guide to Rust programming.</a>
|
||
</div>
|
||
"#;
|
||
|
||
let results = hand.parse_ddg_html(html, 10);
|
||
assert_eq!(results.len(), 2);
|
||
assert_eq!(results[0].title, "Rust Programming Language");
|
||
assert_eq!(results[0].url, "https://rust-lang.org");
|
||
assert_eq!(results[0].source, "DuckDuckGo");
|
||
assert_eq!(results[1].title, "The Rust Book");
|
||
}
|
||
|
||
#[test]
|
||
fn test_parse_ddg_html_max_results() {
|
||
let hand = create_test_hand();
|
||
let mut html = String::new();
|
||
for i in 0..20 {
|
||
html.push_str(&format!(
|
||
r#"<div class="result__body"><a class="result__a" href="//duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.com%2F{}">Result {}</a><a class="result__snippet">Snippet {}</a></div>"#,
|
||
i, i, i
|
||
));
|
||
}
|
||
let results = hand.parse_ddg_html(&html, 5);
|
||
assert_eq!(results.len(), 5);
|
||
}
|
||
|
||
#[test]
|
||
fn test_parse_ddg_html_empty() {
|
||
let hand = create_test_hand();
|
||
let html = "<html><body>No results here</body></html>";
|
||
let results = hand.parse_ddg_html(html, 10);
|
||
assert!(results.is_empty());
|
||
}
|
||
|
||
#[test]
|
||
fn test_parse_bing_html() {
|
||
let hand = create_test_hand();
|
||
let html = r#"
|
||
<li class="b_algo">
|
||
<h2><a href="https://example.com/result1">Example Result 1</a></h2>
|
||
<div class="b_caption"><p>This is the first result snippet.</p></div>
|
||
</li>
|
||
<li class="b_algo">
|
||
<h2><a href="https://example.com/result2">Example Result 2</a></h2>
|
||
<div class="b_caption"><p>This is the second result snippet.</p></div>
|
||
</li>
|
||
"#;
|
||
|
||
let results = hand.parse_bing_html(html, 10);
|
||
assert_eq!(results.len(), 2);
|
||
assert_eq!(results[0].title, "Example Result 1");
|
||
assert_eq!(results[0].url, "https://example.com/result1");
|
||
assert_eq!(results[0].source, "Bing");
|
||
}
|
||
|
||
#[test]
|
||
fn test_parse_bing_html_skips_internal_urls() {
|
||
let hand = create_test_hand();
|
||
let html = r#"
|
||
<li class="b_algo">
|
||
<h2><a href="https://bing.com/search?q=more">More Results</a></h2>
|
||
</li>
|
||
<li class="b_algo">
|
||
<h2><a href="https://example.com/real">Real Result</a></h2>
|
||
</li>
|
||
"#;
|
||
|
||
let results = hand.parse_bing_html(html, 10);
|
||
assert_eq!(results.len(), 1);
|
||
assert_eq!(results[0].url, "https://example.com/real");
|
||
}
|
||
|
||
#[test]
|
||
fn test_parse_bing_html_empty() {
|
||
let hand = create_test_hand();
|
||
let html = "<html><body>Nothing here</body></html>";
|
||
let results = hand.parse_bing_html(html, 10);
|
||
assert!(results.is_empty());
|
||
}
|
||
|
||
#[test]
|
||
fn test_parse_baidu_html() {
|
||
let hand = create_test_hand();
|
||
let html = r#"
|
||
<div class="result c-container">
|
||
<h3 class="t"><a href="https://www.example.cn/page1">中国医疗政策 2024</a></h3>
|
||
<div class="c-abstract">这是关于医疗政策的摘要信息。</div>
|
||
</div>
|
||
<div class="c-container new-pmd">
|
||
<h3><a href="https://www.example.cn/page2">第二条结果</a></h3>
|
||
</div>
|
||
"#;
|
||
|
||
let results = hand.parse_baidu_html(html, 10);
|
||
assert!(results.len() >= 1, "Should find at least 1 result, got {}", results.len());
|
||
assert_eq!(results[0].source, "Baidu");
|
||
}
|
||
|
||
// --- SSRF Validation Tests ---
|
||
|
||
/// Loopback targets are rejected, whether given by hostname or by IPv4 literal.
#[test]
fn test_ssrf_blocks_localhost() {
    let by_hostname = validate_fetch_url("http://localhost:8080/admin");
    let by_loopback_ip = validate_fetch_url("http://127.0.0.1:5432/db");

    assert!(by_hostname.is_err());
    assert!(by_loopback_ip.is_err());
}
|
||
|
||
/// Addresses in the private 10.x, 192.168.x and 172.16.x networks are rejected.
#[test]
fn test_ssrf_blocks_private_ip() {
    let ten_net = validate_fetch_url("http://10.0.0.1/secret");
    let home_router = validate_fetch_url("http://192.168.1.1/router");
    let one_seven_two_net = validate_fetch_url("http://172.16.0.1/internal");

    assert!(ten_net.is_err());
    assert!(home_router.is_err());
    assert!(one_seven_two_net.is_err());
}
|
||
|
||
/// The link-local metadata endpoint address (169.254.169.254) is rejected.
#[test]
fn test_ssrf_blocks_cloud_metadata() {
    let metadata_endpoint = "http://169.254.169.254/metadata";
    let verdict = validate_fetch_url(metadata_endpoint);

    assert!(verdict.is_err());
}
|
||
|
||
/// URLs using `file://` or `ftp://` schemes are rejected.
#[test]
fn test_ssrf_blocks_non_http_scheme() {
    let local_file = validate_fetch_url("file:///etc/passwd");
    let ftp_resource = validate_fetch_url("ftp://example.com/file");

    assert!(local_file.is_err());
    assert!(ftp_resource.is_err());
}
|
||
|
||
/// Ordinary public HTTPS URLs, with or without a query string, pass validation.
#[test]
fn test_ssrf_allows_public_url() {
    let plain_path = validate_fetch_url("https://www.rust-lang.org/learn");
    let with_query = validate_fetch_url("https://example.com/page?q=test");

    assert!(plain_path.is_ok());
    assert!(with_query.is_ok());
}
|
||
|
||
// --- Percent Decode Tests ---
|
||
|
||
/// Decodes a single escaped space and a multi-byte UTF-8 sequence.
#[test]
fn test_percent_decode_basic() {
    // (encoded input, expected decoded text)
    let cases = [
        ("hello%20world", "hello world"),
        ("%E4%B8%AD%E6%96%87", "中文"),
    ];

    for (encoded, expected) in cases {
        assert_eq!(percent_decode(encoded), expected);
    }
}
|
||
|
||
/// A fully escaped URL decodes back to its readable form, including the
/// scheme separator, path slashes and query delimiters.
#[test]
fn test_percent_decode_full_url() {
    let encoded = "https%3A%2F%2Fexample.com%2Fpage%3Fq%3Dtest";
    let expected = "https://example.com/page?q=test";

    assert_eq!(percent_decode(encoded), expected);
}
|
||
|
||
/// Input without any percent escapes is returned unchanged.
#[test]
fn test_percent_decode_no_encoding() {
    let untouched = "plain-text_123";

    assert_eq!(percent_decode(untouched), untouched);
}
|
||
|
||
// --- Input Validation Tests ---
|
||
|
||
/// A query consisting only of whitespace fails validation.
#[test]
fn test_research_query_validate_empty() {
    let blank = ResearchQuery {
        query: "  ".to_string(),
        engine: SearchEngine::Auto,
        depth: ResearchDepth::Standard,
        max_results: 10,
        include_related: false,
        time_limit_secs: 60,
    };

    assert!(blank.validate().is_err());
}
|
||
|
||
/// A 501-character query fails validation.
#[test]
fn test_research_query_validate_too_long() {
    // NOTE(review): 501 presumably sits just past a 500-character limit —
    // confirm against `ResearchQuery::validate`.
    let oversized = ResearchQuery {
        query: "x".repeat(501),
        engine: SearchEngine::Auto,
        depth: ResearchDepth::Standard,
        max_results: 10,
        include_related: false,
        time_limit_secs: 60,
    };

    assert!(oversized.validate().is_err());
}
|
||
|
||
/// An otherwise valid query asking for 999 results fails validation.
#[test]
fn test_research_query_validate_max_results_overflow() {
    let greedy = ResearchQuery {
        query: "test".to_string(),
        engine: SearchEngine::Auto,
        depth: ResearchDepth::Standard,
        max_results: 999,
        include_related: false,
        time_limit_secs: 60,
    };

    assert!(greedy.validate().is_err());
}
|
||
|
||
/// A short, non-empty query with a modest result count passes validation.
#[test]
fn test_research_query_validate_ok() {
    let well_formed = ResearchQuery {
        query: "Rust programming".to_string(),
        engine: SearchEngine::Auto,
        depth: ResearchDepth::Standard,
        max_results: 10,
        include_related: false,
        time_limit_secs: 60,
    };

    assert!(well_formed.validate().is_ok());
}
|
||
|
||
// --- Quality Filter Tests ---
|
||
|
||
/// A title that looks like JavaScript source is filtered out.
#[test]
fn test_quality_rejects_javascript_title() {
    let script_like_title = "function(x) { return x; }";
    let accepted = is_quality_result(script_like_title, "ok", "https://example.com");

    assert!(!accepted);
}
|
||
|
||
/// A single-character title is filtered out.
#[test]
fn test_quality_rejects_short_title() {
    let accepted = is_quality_result("A", "snippet", "https://example.com");

    assert!(!accepted);
}
|
||
|
||
/// A title that looks like a CSS rule is filtered out.
#[test]
fn test_quality_rejects_css_title() {
    let css_like_title = ".stylesheet{color:red}";
    let accepted = is_quality_result(css_like_title, "ok", "https://example.com");

    assert!(!accepted);
}
|
||
|
||
/// A result whose URL uses the `javascript:` pseudo-scheme is filtered out.
#[test]
fn test_quality_rejects_javascript_url() {
    let script_url = "javascript:alert(1)";
    let accepted = is_quality_result("Title", "snippet", script_url);

    assert!(!accepted);
}
|
||
|
||
/// A regular Chinese-language title and snippet on an HTTPS URL is accepted.
#[test]
fn test_quality_accepts_normal_result() {
    let title = "2024年中国医疗政策解读";
    let snippet = "相关政策文件摘要";
    let url = "https://www.gov.cn/policy";

    assert!(is_quality_result(title, snippet, url));
}
|
||
|
||
/// A regular English-language title and snippet on an HTTPS URL is accepted.
#[test]
fn test_quality_accepts_english_result() {
    let title = "Rust Programming Language";
    let snippet = "A systems programming language";
    let url = "https://www.rust-lang.org";

    assert!(is_quality_result(title, snippet, url));
}
|
||
|
||
/// A 301-character title is filtered out.
#[test]
fn test_quality_rejects_long_title() {
    // NOTE(review): 301 presumably sits just past a 300-character cap —
    // confirm against `is_quality_result`.
    let oversized_title = "x".repeat(301);
    let accepted = is_quality_result(oversized_title.as_str(), "ok", "https://example.com");

    assert!(!accepted);
}
|
||
|
||
/// Tags are removed and runs of whitespace (spaces, newlines, tabs) collapse
/// to a single space.
#[test]
fn test_strip_html_tags_collapses_whitespace() {
    let tagged = strip_html_tags("<b>Hello</b>  <i>World</i>");
    let mixed_whitespace = strip_html_tags("a\n\t  b");

    assert_eq!(tagged, "Hello World");
    assert_eq!(mixed_whitespace, "a b");
}
|
||
}
|