/// Returns `true` when a scraped search result looks like genuine content
/// rather than navigation chrome, ad markup, or leaked script/style text.
///
/// The check is purely heuristic and inspects each field independently:
///
/// * `title` — after trimming, must be 2..=300 characters long and must not
///   contain any of the JavaScript/CSS/tracking markers listed below, nor
///   start with a `//` or `/*` comment opener.
/// * `url` — after trimming, must not use the `javascript:` or `data:` scheme,
///   and must not be a bare fragment (`#...`) or a root-relative path (`/...`).
///   Protocol-relative URLs (`//host/...`) are allowed.
/// * `snippet` — must not look like code: either `function(` together with
///   `return `, or `var ` together with `=`.
///
/// All text comparisons are case-insensitive.
fn is_quality_result(title: &str, snippet: &str, url: &str) -> bool {
    // Title length bounds, counted in characters rather than bytes so that
    // multi-byte scripts (e.g. CJK) are held to the same limits as ASCII.
    const MIN_TITLE_CHARS: usize = 2;
    const MAX_TITLE_CHARS: usize = 300;

    // Substrings that mark a title as leaked JavaScript, CSS, or tracking text.
    const TITLE_CODE_MARKERS: &[&str] = &[
        "function(",
        "var ",
        "const ",
        "window.",
        "document.",
        "{",
        "}",
        "cookie",
        "navigator.",
        ".css",
        "stylesheet",
        "google-analytics",
        "gtag",
    ];

    // --- Title checks ---
    let title = title.trim();
    let title_chars = title.chars().count();
    if !(MIN_TITLE_CHARS..=MAX_TITLE_CHARS).contains(&title_chars) {
        return false;
    }

    let title_lower = title.to_lowercase();
    let title_is_code = title_lower.starts_with("//")
        || title_lower.starts_with("/*")
        || TITLE_CODE_MARKERS
            .iter()
            .any(|&marker| title_lower.contains(marker));
    if title_is_code {
        return false;
    }

    // --- URL checks ---
    let url = url.trim();
    // URI schemes are case-insensitive, and only the leading scheme matters:
    // "data:" appearing inside a path or query string is harmless.
    let url_lower = url.to_ascii_lowercase();
    if url_lower.starts_with("javascript:") || url_lower.starts_with("data:") {
        return false;
    }
    // Bare fragments and root-relative paths cannot be fetched on their own;
    // protocol-relative URLs ("//host/...") still name a host, so keep them.
    if url.starts_with('#') || (url.starts_with('/') && !url.starts_with("//")) {
        return false;
    }

    // --- Snippet checks ---
    let snippet_lower = snippet.to_lowercase();
    let looks_like_function =
        snippet_lower.contains("function(") && snippet_lower.contains("return ");
    let looks_like_assignment = snippet_lower.contains("var ") && snippet_lower.contains('=');

    !(looks_like_function || looks_like_assignment)
}