Files
zclaw_openfang/crates/zclaw-hands/src/hands/researcher.rs
iven 59fc7debd6 feat(hands): add 25 unit tests + fix summary + fix HTML extraction for ResearcherHand
- Add comprehensive test suite: config, types, action deserialization, URL encoding,
  HTML text extraction, hand trait methods
- Fix summary field: generate rule-based summary from top search results (was always None)
- Fix extract_text_from_html: correct position tracking for script/style tag detection

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-01 23:16:57 +08:00

845 lines
27 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

//! Researcher Hand - Deep research and analysis capabilities
//!
//! This hand provides web search, content fetching, and research synthesis.
use async_trait::async_trait;
use serde::{Deserialize, Serialize};
use serde_json::{json, Value};
use std::collections::HashMap;
use std::sync::Arc;
use tokio::sync::RwLock;
use zclaw_types::Result;
use crate::{Hand, HandConfig, HandContext, HandResult};
/// Search engine options
///
/// NOTE(review): only DuckDuckGo is actually wired up today — `execute_search`
/// always calls `search_duckduckgo` regardless of this value.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
#[serde(rename_all = "lowercase")]
pub enum SearchEngine {
    Google,
    Bing,
    DuckDuckGo,
    /// Let the hand pick an engine (the default).
    #[default]
    Auto,
}
/// Research depth level
///
/// In `execute_report` the depth controls how many top results get their
/// full content fetched (Quick = 1, Standard = 3, Deep = 5 pages).
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
#[serde(rename_all = "lowercase")]
pub enum ResearchDepth {
    /// Fast search, fetch content for 1 result
    Quick,
    /// Normal search, fetch content for 3 results (the default)
    #[default]
    Standard,
    /// Comprehensive search, fetch content for 5 results
    Deep,
}
/// Research query configuration
///
/// Deserialized from the hand's JSON input; field names are camelCase on
/// the wire (e.g. `maxResults`, `timeLimitSecs`).
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct ResearchQuery {
    /// Search query text
    pub query: String,
    /// Search engine to use
    /// (currently informational — all searches go through DuckDuckGo)
    #[serde(default)]
    pub engine: SearchEngine,
    /// Research depth; controls how many results get content fetched in reports
    #[serde(default)]
    pub depth: ResearchDepth,
    /// Maximum results to return (default: 10)
    #[serde(default = "default_max_results")]
    pub max_results: usize,
    /// Include related topics
    /// NOTE(review): not read anywhere in this file — confirm it is consumed
    /// elsewhere before relying on it.
    #[serde(default)]
    pub include_related: bool,
    /// Time limit in seconds (default: 60)
    /// NOTE(review): not enforced by the search/fetch paths in this file.
    #[serde(default = "default_time_limit")]
    pub time_limit_secs: u64,
}

/// Serde default for `ResearchQuery::max_results`.
fn default_max_results() -> usize { 10 }
/// Serde default for `ResearchQuery::time_limit_secs`.
fn default_time_limit() -> u64 { 60 }
/// Search result item
///
/// Produced both by searches (instant answers / related topics) and by
/// direct URL fetches; fetched items also populate `content`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct SearchResult {
    /// Title of the result
    pub title: String,
    /// URL (may be empty when the source provided none)
    pub url: String,
    /// Snippet/summary (for fetches: the first 500 chars of `content`)
    pub snippet: String,
    /// Source name (e.g. "DuckDuckGo", or the URL itself for fetches)
    pub source: String,
    /// Relevance score (0-100); 100 for instant answers and fetches,
    /// 80 for related topics
    #[serde(default)]
    pub relevance: u8,
    /// Fetched content (if available)
    #[serde(default)]
    pub content: Option<String>,
    /// RFC 3339 timestamp of when the item was fetched
    #[serde(default)]
    pub fetched_at: Option<String>,
}
/// Research report
///
/// The synthesized output of `execute_report`: raw results plus derived
/// summary, findings and related topics.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct ResearchReport {
    /// Original query text
    pub query: String,
    /// Search results (top entries may carry fetched `content`)
    pub results: Vec<SearchResult>,
    /// Synthesized summary (rule-based; always `Some` when built by
    /// `execute_report`)
    #[serde(default)]
    pub summary: Option<String>,
    /// Key findings (leading sentences of fetched pages)
    #[serde(default)]
    pub key_findings: Vec<String>,
    /// Related topics discovered (titles of results with long snippets)
    #[serde(default)]
    pub related_topics: Vec<String>,
    /// RFC 3339 research timestamp
    pub researched_at: String,
    /// Total time spent (ms)
    pub duration_ms: u64,
}
/// Researcher action types
///
/// Input payloads are discriminated by an `"action"` tag, e.g.
/// `{"action": "search", "query": {...}}`.
/// NOTE(review): the `summarize` variant is accepted here but is missing
/// from the JSON schema advertised in `ResearcherHand::new` — confirm.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "action")]
pub enum ResearcherAction {
    /// Run a web search
    #[serde(rename = "search")]
    Search { query: ResearchQuery },
    /// Fetch and extract the content of a single URL
    #[serde(rename = "fetch")]
    Fetch { url: String },
    /// Fetch up to 5 URLs and return their extracted content
    #[serde(rename = "summarize")]
    Summarize { urls: Vec<String> },
    /// Full pipeline: search, fetch top results, synthesize a report
    #[serde(rename = "report")]
    Report { query: ResearchQuery },
}
/// Researcher Hand implementation
pub struct ResearcherHand {
    // Static hand configuration (id, schema, tags, ...).
    config: HandConfig,
    // Shared HTTP client (30s timeout, custom user agent).
    client: reqwest::Client,
    // URL -> fetched result cache; entries are only reused when they
    // already carry `content` (see `execute_fetch`).
    cache: Arc<RwLock<HashMap<String, SearchResult>>>,
}
impl ResearcherHand {
/// Create a new researcher hand
///
/// Builds the static `HandConfig` (id `"researcher"`, no approval
/// required, depends on `"network"`), a shared HTTP client with a 30s
/// timeout and custom user agent, and an empty in-memory fetch cache.
pub fn new() -> Self {
    Self {
        config: HandConfig {
            id: "researcher".to_string(),
            // User-facing name/description are intentionally in Chinese.
            name: "研究员".to_string(),
            description: "深度研究和分析能力,支持网络搜索和内容获取".to_string(),
            needs_approval: false,
            dependencies: vec!["network".to_string()],
            // JSON Schema describing accepted input payloads.
            // NOTE(review): the enum also accepts a "summarize" action that
            // is missing from this `oneOf`; and the `$ref` below points at
            // "#/properties/query", which does not exist at the schema root
            // (the root only has `oneOf`) — confirm intended schema.
            input_schema: Some(serde_json::json!({
                "type": "object",
                "oneOf": [
                    {
                        "properties": {
                            "action": { "const": "search" },
                            "query": {
                                "type": "object",
                                "properties": {
                                    "query": { "type": "string" },
                                    "engine": { "type": "string", "enum": ["google", "bing", "duckduckgo", "auto"] },
                                    "depth": { "type": "string", "enum": ["quick", "standard", "deep"] },
                                    "maxResults": { "type": "integer" }
                                },
                                "required": ["query"]
                            }
                        },
                        "required": ["action", "query"]
                    },
                    {
                        "properties": {
                            "action": { "const": "fetch" },
                            "url": { "type": "string" }
                        },
                        "required": ["action", "url"]
                    },
                    {
                        "properties": {
                            "action": { "const": "report" },
                            "query": { "$ref": "#/properties/query" }
                        },
                        "required": ["action", "query"]
                    }
                ]
            })),
            tags: vec!["research".to_string(), "web".to_string(), "search".to_string()],
            enabled: true,
        },
        // 30s timeout; fall back to a default client if the builder fails.
        client: reqwest::Client::builder()
            .timeout(std::time::Duration::from_secs(30))
            .user_agent("ZCLAW-Researcher/1.0")
            .build()
            .unwrap_or_else(|_| reqwest::Client::new()),
        cache: Arc::new(RwLock::new(HashMap::new())),
    }
}
/// Execute a web search
///
/// All searches currently go through DuckDuckGo (no API key required),
/// regardless of the engine named in `query`. Logs query, duration and
/// result count on completion.
async fn execute_search(&self, query: &ResearchQuery) -> Result<Vec<SearchResult>> {
    let started_at = std::time::Instant::now();
    // Only the DuckDuckGo backend is wired up today.
    let hits = self.search_duckduckgo(&query.query, query.max_results).await?;
    let elapsed_ms = started_at.elapsed().as_millis() as u64;
    tracing::info!(
        target: "researcher",
        query = %query.query,
        duration_ms = elapsed_ms,
        results_count = hits.len(),
        "Search completed"
    );
    Ok(hits)
}
/// Search using the DuckDuckGo Instant Answer API (no API key required)
///
/// Returns at most `max_results` results: the instant-answer abstract (if
/// present) followed by related topics. Previously the abstract did not
/// count toward the cap, so callers could receive `max_results + 1` items.
async fn search_duckduckgo(&self, query: &str, max_results: usize) -> Result<Vec<SearchResult>> {
    let url = format!("https://api.duckduckgo.com/?q={}&format=json&no_html=1",
        url_encode(query));
    let response = self.client
        .get(&url)
        .send()
        .await
        .map_err(|e| zclaw_types::ZclawError::HandError(format!("Search request failed: {}", e)))?;
    let json: Value = response.json().await
        .map_err(|e| zclaw_types::ZclawError::HandError(format!("Failed to parse search response: {}", e)))?;
    let mut results = Vec::new();
    // Parse the DuckDuckGo Instant Answer abstract; it counts toward
    // `max_results` like any other result.
    if let Some(abstract_text) = json.get("AbstractText").and_then(|v| v.as_str()) {
        if !abstract_text.is_empty() && max_results > 0 {
            results.push(SearchResult {
                title: query.to_string(),
                url: json.get("AbstractURL")
                    .and_then(|v| v.as_str())
                    .unwrap_or("")
                    .to_string(),
                snippet: abstract_text.to_string(),
                source: json.get("AbstractSource")
                    .and_then(|v| v.as_str())
                    .unwrap_or("DuckDuckGo")
                    .to_string(),
                relevance: 100,
                content: None,
                fetched_at: Some(chrono::Utc::now().to_rfc3339()),
            });
        }
    }
    // Parse related topics, leaving room for the abstract so the total
    // never exceeds `max_results`.
    if let Some(related) = json.get("RelatedTopics").and_then(|v| v.as_array()) {
        let remaining = max_results.saturating_sub(results.len());
        for item in related.iter().take(remaining) {
            if let Some(obj) = item.as_object() {
                results.push(SearchResult {
                    title: obj.get("Text")
                        .and_then(|v| v.as_str())
                        .unwrap_or("Related Topic")
                        .to_string(),
                    url: obj.get("FirstURL")
                        .and_then(|v| v.as_str())
                        .unwrap_or("")
                        .to_string(),
                    snippet: obj.get("Text")
                        .and_then(|v| v.as_str())
                        .unwrap_or("")
                        .to_string(),
                    source: "DuckDuckGo".to_string(),
                    relevance: 80,
                    content: None,
                    fetched_at: Some(chrono::Utc::now().to_rfc3339()),
                });
            }
        }
    }
    Ok(results)
}
/// Fetch content from a URL
///
/// Results are cached in memory keyed by URL; a cached entry is reused
/// only when it already carries `content`. HTML responses are reduced to
/// plain text, other text/JSON responses are returned verbatim, and any
/// other content type becomes the placeholder string "[Binary content]".
async fn execute_fetch(&self, url: &str) -> Result<SearchResult> {
    let start = std::time::Instant::now();
    // Check cache first (scoped so the read lock is released before the
    // network request).
    {
        let cache = self.cache.read().await;
        if let Some(cached) = cache.get(url) {
            if cached.content.is_some() {
                return Ok(cached.clone());
            }
        }
    }
    let response = self.client
        .get(url)
        .send()
        .await
        .map_err(|e| zclaw_types::ZclawError::HandError(format!("Fetch request failed: {}", e)))?;
    // Missing/unreadable Content-Type falls through to the binary branch.
    let content_type = response.headers()
        .get(reqwest::header::CONTENT_TYPE)
        .and_then(|v| v.to_str().ok())
        .unwrap_or("");
    let content = if content_type.contains("text/html") {
        // Extract readable text from HTML
        let html = response.text().await
            .map_err(|e| zclaw_types::ZclawError::HandError(format!("Failed to read HTML: {}", e)))?;
        self.extract_text_from_html(&html)
    } else if content_type.contains("text/") || content_type.contains("application/json") {
        response.text().await
            .map_err(|e| zclaw_types::ZclawError::HandError(format!("Failed to read text: {}", e)))?
    } else {
        "[Binary content]".to_string()
    };
    let result = SearchResult {
        title: url.to_string(),
        url: url.to_string(),
        // First 500 chars of the content double as the snippet.
        snippet: content.chars().take(500).collect(),
        source: url.to_string(),
        relevance: 100,
        content: Some(content),
        fetched_at: Some(chrono::Utc::now().to_rfc3339()),
    };
    // Cache the result (scoped write lock).
    {
        let mut cache = self.cache.write().await;
        cache.insert(url.to_string(), result.clone());
    }
    let duration = start.elapsed().as_millis() as u64;
    tracing::info!(
        target: "researcher",
        url = url,
        duration_ms = duration,
        "Fetch completed"
    );
    Ok(result)
}
/// Extract readable text from HTML
///
/// Strips tags, drops the contents of `<script>`/`<style>` elements,
/// collapses runs of whitespace to a single space, and caps the output at
/// roughly 10 KB (plus a "..." marker).
///
/// Fixes two UTF-8 bugs in the previous version: tag detection indexed a
/// `to_lowercase()` copy with byte offsets from the original string (the
/// two can diverge for some Unicode chars, panicking or misaligning), and
/// `truncate(10000)` could panic when byte 10000 was not a char boundary.
fn extract_text_from_html(&self, html: &str) -> String {
    // ASCII-case-insensitive prefix check on the raw bytes; avoids
    // building a lowercased copy whose byte offsets could diverge.
    fn has_prefix_ignore_case(s: &str, prefix: &str) -> bool {
        s.len() >= prefix.len()
            && s.as_bytes()[..prefix.len()].eq_ignore_ascii_case(prefix.as_bytes())
    }
    let mut text = String::new();
    let mut in_tag = false;
    let mut in_script = false;
    let mut in_style = false;
    // `char_indices` yields the byte offset of each char in `html`.
    for (pos, c) in html.char_indices() {
        match c {
            '<' => {
                let remaining = &html[pos..];
                // Check for closing tags before entering tag mode.
                if has_prefix_ignore_case(remaining, "</script") {
                    in_script = false;
                } else if has_prefix_ignore_case(remaining, "</style") {
                    in_style = false;
                }
                // Check for opening tags.
                if has_prefix_ignore_case(remaining, "<script") {
                    in_script = true;
                } else if has_prefix_ignore_case(remaining, "<style") {
                    in_style = true;
                }
                in_tag = true;
            }
            '>' => {
                in_tag = false;
            }
            _ if in_tag => {}
            _ if in_script || in_style => {}
            ' ' | '\n' | '\t' | '\r' => {
                // Collapse whitespace runs to a single space.
                if !text.ends_with(' ') && !text.is_empty() {
                    text.push(' ');
                }
            }
            _ => text.push(c),
        }
    }
    if text.len() > 10000 {
        // Back off to the nearest char boundary so truncate cannot panic
        // on multibyte text.
        let mut cut = 10000;
        while !text.is_char_boundary(cut) {
            cut -= 1;
        }
        text.truncate(cut);
        text.push_str("...");
    }
    text.trim().to_string()
}
/// Generate a comprehensive research report
///
/// Pipeline: search, fetch full content for the top results (1/3/5 pages
/// depending on `query.depth`), then derive key findings, related topics
/// and a rule-based summary from the gathered material. The summary is
/// always `Some` in the returned report.
async fn execute_report(&self, query: &ResearchQuery) -> Result<ResearchReport> {
    let start = std::time::Instant::now();
    // First, execute search
    let mut results = self.execute_search(query).await?;
    // Fetch content for top results; depth controls how many pages.
    let fetch_limit = match query.depth {
        ResearchDepth::Quick => 1,
        ResearchDepth::Standard => 3,
        ResearchDepth::Deep => 5,
    };
    for result in results.iter_mut().take(fetch_limit) {
        if !result.url.is_empty() {
            match self.execute_fetch(&result.url).await {
                Ok(fetched) => {
                    result.content = fetched.content;
                    result.fetched_at = fetched.fetched_at;
                }
                Err(e) => {
                    // Best-effort: a failed fetch just leaves this result
                    // without content.
                    tracing::warn!(target: "researcher", error = %e, "Failed to fetch content");
                }
            }
        }
    }
    // Key findings: the first three sentences of each fetched page
    // (top 5 results only).
    let key_findings: Vec<String> = results.iter()
        .take(5)
        .filter_map(|r| {
            r.content.as_ref().map(|c| {
                c.split(". ")
                    .take(3)
                    .collect::<Vec<_>>()
                    .join(". ")
            })
        })
        .collect();
    // Related topics: titles of up to 5 results with substantial snippets.
    let related_topics: Vec<String> = results.iter()
        .filter_map(|r| {
            if r.snippet.len() > 50 {
                Some(r.title.clone())
            } else {
                None
            }
        })
        .take(5)
        .collect();
    let duration = start.elapsed().as_millis() as u64;
    // Generate a rule-based summary from the top snippets.
    let summary = if results.is_empty() {
        "未找到相关结果,建议调整搜索关键词后重试".to_string()
    } else {
        let top_snippets: Vec<&str> = results
            .iter()
            .take(3)
            .filter_map(|r| {
                let s = r.snippet.trim();
                if s.is_empty() { None } else { Some(s) }
            })
            .collect();
        if top_snippets.is_empty() {
            format!("找到 {} 条相关结果,但无摘要信息", results.len())
        } else {
            // NOTE(review): snippets are joined with an empty separator —
            // confirm whether a delimiter was intended here.
            format!(
                "基于 {} 条搜索结果:{}",
                results.len(),
                top_snippets.join("")
            )
        }
    };
    Ok(ResearchReport {
        query: query.query.clone(),
        results,
        summary: Some(summary),
        key_findings,
        related_topics,
        researched_at: chrono::Utc::now().to_rfc3339(),
        duration_ms: duration,
    })
}
}
impl Default for ResearcherHand {
fn default() -> Self {
Self::new()
}
}
#[async_trait]
impl Hand for ResearcherHand {
    fn config(&self) -> &HandConfig {
        &self.config
    }

    /// Dispatch an incoming JSON action to the matching research operation.
    ///
    /// The input must deserialize to [`ResearcherAction`]; anything else is
    /// rejected with a `HandError`. Each arm returns a JSON object echoing
    /// the action name plus its payload and timing.
    ///
    /// Fix: `input` was previously cloned before deserialization even
    /// though it is never used afterwards — the clone of a potentially
    /// large `Value` has been removed.
    async fn execute(&self, _context: &HandContext, input: Value) -> Result<HandResult> {
        let action: ResearcherAction = serde_json::from_value(input)
            .map_err(|e| zclaw_types::ZclawError::HandError(format!("Invalid action: {}", e)))?;
        let start = std::time::Instant::now();
        let result = match action {
            ResearcherAction::Search { query } => {
                let results = self.execute_search(&query).await?;
                json!({
                    "action": "search",
                    "query": query.query,
                    "results": results,
                    "duration_ms": start.elapsed().as_millis()
                })
            }
            ResearcherAction::Fetch { url } => {
                let result = self.execute_fetch(&url).await?;
                json!({
                    "action": "fetch",
                    "url": url,
                    "result": result,
                    "duration_ms": start.elapsed().as_millis()
                })
            }
            ResearcherAction::Summarize { urls } => {
                // Best-effort: fetch up to 5 URLs, skipping any that fail.
                let mut results = Vec::new();
                for url in urls.iter().take(5) {
                    if let Ok(result) = self.execute_fetch(url).await {
                        results.push(result);
                    }
                }
                json!({
                    "action": "summarize",
                    "urls": urls,
                    "results": results,
                    "duration_ms": start.elapsed().as_millis()
                })
            }
            ResearcherAction::Report { query } => {
                let report = self.execute_report(&query).await?;
                json!({
                    "action": "report",
                    "report": report
                })
            }
        };
        Ok(HandResult::success(result))
    }

    fn needs_approval(&self) -> bool {
        false // Research operations are generally safe
    }

    fn check_dependencies(&self) -> Result<Vec<String>> {
        // Network connectivity will be checked at runtime
        Ok(Vec::new())
    }

    fn status(&self) -> crate::HandStatus {
        crate::HandStatus::Idle
    }
}
/// URL encoding helper (simple implementation)
///
/// Percent-encodes every character outside RFC 3986's unreserved set
/// (ALPHA / DIGIT / "-" / "_" / "." / "~").
///
/// Fix: non-ASCII characters were previously encoded as their Unicode
/// codepoint (`'中'` → `"%4E2D"`), which is not valid percent-encoding.
/// Each character is now encoded as its UTF-8 bytes (`'中'` → `"%E4%B8%AD"`).
fn url_encode(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    for c in s.chars() {
        match c {
            'A'..='Z' | 'a'..='z' | '0'..='9' | '-' | '_' | '.' | '~' => out.push(c),
            _ => {
                // Encode the char's UTF-8 bytes, one %XX escape per byte.
                let mut buf = [0u8; 4];
                for byte in c.encode_utf8(&mut buf).as_bytes() {
                    out.push_str(&format!("%{:02X}", byte));
                }
            }
        }
    }
    out
}
#[cfg(test)]
mod tests {
    use super::*;

    // Fresh hand instance for each test (no shared state between tests).
    fn create_test_hand() -> ResearcherHand {
        ResearcherHand::new()
    }

    // Default execution context; most researcher paths ignore it.
    fn test_context() -> HandContext {
        HandContext::default()
    }

    // --- Config & Type Tests ---

    #[test]
    fn test_config_id() {
        let hand = create_test_hand();
        assert_eq!(hand.config().id, "researcher");
        assert_eq!(hand.config().name, "研究员");
        assert!(hand.config().enabled);
        assert!(!hand.config().needs_approval);
    }

    #[test]
    fn test_search_engine_default_is_auto() {
        let engine = SearchEngine::default();
        assert!(matches!(engine, SearchEngine::Auto));
    }

    #[test]
    fn test_research_depth_default_is_standard() {
        let depth = ResearchDepth::default();
        assert!(matches!(depth, ResearchDepth::Standard));
    }

    #[test]
    fn test_research_depth_serialize() {
        // rename_all = "lowercase" on the enum.
        let json = serde_json::to_string(&ResearchDepth::Deep).unwrap();
        assert_eq!(json, "\"deep\"");
    }

    #[test]
    fn test_research_depth_deserialize() {
        let depth: ResearchDepth = serde_json::from_str("\"quick\"").unwrap();
        assert!(matches!(depth, ResearchDepth::Quick));
    }

    #[test]
    fn test_search_engine_serialize_roundtrip() {
        // Every variant must survive a serialize -> deserialize round trip.
        for engine in [SearchEngine::Google, SearchEngine::Bing, SearchEngine::DuckDuckGo, SearchEngine::Auto] {
            let json = serde_json::to_string(&engine).unwrap();
            let back: SearchEngine = serde_json::from_str(&json).unwrap();
            assert_eq!(json, serde_json::to_string(&back).unwrap());
        }
    }

    // --- Action Deserialization Tests ---

    #[test]
    fn test_action_search_deserialize() {
        let json = json!({
            "action": "search",
            "query": {
                "query": "Rust programming",
                "engine": "duckduckgo",
                "depth": "quick",
                "maxResults": 5
            }
        });
        let action: ResearcherAction = serde_json::from_value(json).unwrap();
        match action {
            ResearcherAction::Search { query } => {
                assert_eq!(query.query, "Rust programming");
                assert!(matches!(query.engine, SearchEngine::DuckDuckGo));
                assert!(matches!(query.depth, ResearchDepth::Quick));
                assert_eq!(query.max_results, 5);
            }
            _ => panic!("Expected Search action"),
        }
    }

    #[test]
    fn test_action_fetch_deserialize() {
        let json = json!({
            "action": "fetch",
            "url": "https://example.com/page"
        });
        let action: ResearcherAction = serde_json::from_value(json).unwrap();
        match action {
            ResearcherAction::Fetch { url } => {
                assert_eq!(url, "https://example.com/page");
            }
            _ => panic!("Expected Fetch action"),
        }
    }

    #[test]
    fn test_action_report_deserialize() {
        // Omitted fields (engine, maxResults, ...) fall back to defaults.
        let json = json!({
            "action": "report",
            "query": {
                "query": "AI trends 2026",
                "depth": "deep"
            }
        });
        let action: ResearcherAction = serde_json::from_value(json).unwrap();
        match action {
            ResearcherAction::Report { query } => {
                assert_eq!(query.query, "AI trends 2026");
                assert!(matches!(query.depth, ResearchDepth::Deep));
            }
            _ => panic!("Expected Report action"),
        }
    }

    #[test]
    fn test_action_invalid_rejected() {
        // Unknown "action" tags must fail deserialization.
        let json = json!({
            "action": "unknown_action",
            "data": "whatever"
        });
        let result: std::result::Result<ResearcherAction, _> = serde_json::from_value(json);
        assert!(result.is_err());
    }

    // --- URL Encoding Tests ---

    #[test]
    fn test_url_encode_ascii() {
        assert_eq!(url_encode("hello world"), "hello%20world");
    }

    #[test]
    fn test_url_encode_chinese() {
        let encoded = url_encode("中文搜索");
        assert!(encoded.contains("%"));
        // Chinese chars should be percent-encoded
        assert!(!encoded.contains("中文"));
    }

    #[test]
    fn test_url_encode_safe_chars() {
        // RFC 3986 unreserved characters pass through untouched.
        assert_eq!(url_encode("abc123-_."), "abc123-_.".to_string());
    }

    #[test]
    fn test_url_encode_empty() {
        assert_eq!(url_encode(""), "");
    }

    // --- HTML Text Extraction Tests ---

    #[test]
    fn test_extract_text_basic() {
        let hand = create_test_hand();
        let html = "<html><body><h1>Title</h1><p>Content here</p></body></html>";
        let text = hand.extract_text_from_html(html);
        assert!(text.contains("Title"));
        assert!(text.contains("Content here"));
    }

    #[test]
    fn test_extract_text_strips_scripts() {
        let hand = create_test_hand();
        let html = "<html><body><script>alert('xss')</script><p>Safe text</p></body></html>";
        let text = hand.extract_text_from_html(html);
        assert!(!text.contains("alert"));
        assert!(text.contains("Safe text"));
    }

    #[test]
    fn test_extract_text_strips_styles() {
        let hand = create_test_hand();
        let html = "<html><body><style>.class{color:red}</style><p>Visible</p></body></html>";
        let text = hand.extract_text_from_html(html);
        assert!(!text.contains("color"));
        assert!(text.contains("Visible"));
    }

    #[test]
    fn test_extract_text_truncates_long_content() {
        let hand = create_test_hand();
        let long_body: String = "x".repeat(20000);
        let html = format!("<html><body><p>{}</p></body></html>", long_body);
        let text = hand.extract_text_from_html(&html);
        assert!(text.len() <= 10003); // 10000 + "..."
    }

    #[test]
    fn test_extract_text_empty_body() {
        let hand = create_test_hand();
        let html = "<html><body></body></html>";
        let text = hand.extract_text_from_html(html);
        assert!(text.is_empty());
    }

    // --- Hand Trait Tests ---

    #[tokio::test]
    async fn test_needs_approval_is_false() {
        let hand = create_test_hand();
        assert!(!hand.needs_approval());
    }

    #[tokio::test]
    async fn test_status_is_idle() {
        let hand = create_test_hand();
        assert!(matches!(hand.status(), crate::HandStatus::Idle));
    }

    #[tokio::test]
    async fn test_check_dependencies_ok() {
        let hand = create_test_hand();
        let missing = hand.check_dependencies().unwrap();
        // Default is_dependency_available returns true for all
        assert!(missing.is_empty());
    }

    // --- Default Values Tests ---

    #[test]
    fn test_research_query_defaults() {
        // A minimal payload exercises every serde default.
        let json = json!({ "query": "test" });
        let query: ResearchQuery = serde_json::from_value(json).unwrap();
        assert_eq!(query.query, "test");
        assert!(matches!(query.engine, SearchEngine::Auto));
        assert!(matches!(query.depth, ResearchDepth::Standard));
        assert_eq!(query.max_results, 10);
        assert_eq!(query.time_limit_secs, 60);
        assert!(!query.include_related);
    }

    #[test]
    fn test_search_result_serialization() {
        let result = SearchResult {
            title: "Test".to_string(),
            url: "https://example.com".to_string(),
            snippet: "A snippet".to_string(),
            source: "TestSource".to_string(),
            relevance: 90,
            content: None,
            fetched_at: None,
        };
        let json = serde_json::to_string(&result).unwrap();
        assert!(json.contains("Test"));
        assert!(json.contains("https://example.com"));
    }

    #[test]
    fn test_research_report_summary_is_some_when_results() {
        // Verify the struct allows Some value
        let report = ResearchReport {
            query: "test".to_string(),
            results: vec![SearchResult {
                title: "R".to_string(),
                url: "https://r.co".to_string(),
                snippet: "snippet text".to_string(),
                source: "S".to_string(),
                relevance: 80,
                content: None,
                fetched_at: None,
            }],
            summary: Some("基于 1 条搜索结果snippet text".to_string()),
            key_findings: vec![],
            related_topics: vec![],
            researched_at: "2026-01-01T00:00:00Z".to_string(),
            duration_ms: 100,
        };
        assert!(report.summary.is_some());
        assert!(report.summary.unwrap().contains("snippet text"));
    }
}