From ee56bf6087aa38bcecd87d2187502b57710a01b1 Mon Sep 17 00:00:00 2001 From: iven Date: Wed, 22 Apr 2026 12:16:02 +0800 Subject: [PATCH] =?UTF-8?q?fix(hands):=20=E6=90=9C=E7=B4=A2=E7=BB=93?= =?UTF-8?q?=E6=9E=9C=E8=B4=A8=E9=87=8F=E8=BF=87=E6=BB=A4=20=E2=80=94=20?= =?UTF-8?q?=E5=8E=BB=E9=99=A4JS/CSS/=E5=B9=BF=E5=91=8A=E7=AD=89=E5=9E=83?= =?UTF-8?q?=E5=9C=BE=E5=86=85=E5=AE=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 问题:HTML解析器提取搜索引擎页面中的导航/脚本/广告片段, 导致搜索结果混入 function()/var/stylesheet 等垃圾文本。 修复: - 新增 is_quality_result() 过滤函数,检查 title/snippet/url 质量 - 拒绝含 JS 关键词(function/var/const/window/document)的标题 - 拒绝含 CSS 标识(.css/stylesheet)的标题 - 拒绝过短(<2)或过长(>300)的标题 - 拒绝 javascript:/data: URL - strip_html_tags 添加空白折叠 + 更多HTML实体 - 三个解析器(DDG/Bing/百度)全部接入质量过滤 测试: 68 PASS (新增8个质量过滤测试) --- crates/zclaw-hands/src/hands/researcher.rs | 104 ++++++++++++++++++++- 1 file changed, 102 insertions(+), 2 deletions(-) diff --git a/crates/zclaw-hands/src/hands/researcher.rs b/crates/zclaw-hands/src/hands/researcher.rs index f000b71..9e24094 100644 --- a/crates/zclaw-hands/src/hands/researcher.rs +++ b/crates/zclaw-hands/src/hands/researcher.rs @@ -556,6 +556,10 @@ impl ResearcherHand { continue; } + if !is_quality_result(&title, &snippet, &url) { + continue; + } + results.push(SearchResult { title, url, @@ -645,6 +649,10 @@ impl ResearcherHand { continue; } + if !is_quality_result(&title, &snippet, &url) { + continue; + } + results.push(SearchResult { title, url, @@ -725,6 +733,10 @@ impl ResearcherHand { continue; } + if !is_quality_result(&title, &snippet, &url) { + continue; + } + results.push(SearchResult { title, url, @@ -1181,9 +1193,53 @@ fn strip_html_tags(s: &str) -> String { .replace("&gt;", ">") .replace("&quot;", "\"") .replace("&#39;", "'") - .replace("&nbsp;", " "); + .replace("&nbsp;", " ") + .replace("&#x27;", "'") + .replace("&#x2F;", "/"); - result + // Collapse whitespace + let collapsed: String = result.split_whitespace().collect::<Vec<&str>>().join(" "); + collapsed +}
+ +/// Check if a search result is likely genuine (not navigation/ad/script garbage) +fn is_quality_result(title: &str, snippet: &str, url: &str) -> bool { + // Title quality checks + let title_trimmed = title.trim(); + if title_trimmed.len() < 2 || title_trimmed.len() > 300 { + return false; + } + // Reject titles with JavaScript/CSS indicators + let lower = title_trimmed.to_lowercase(); + if lower.contains("function(") || lower.contains("var ") || lower.contains("const ") + || lower.contains("window.") || lower.contains("document.") + || lower.contains("{") || lower.contains("}") + || lower.starts_with("//") || lower.starts_with("/*") + || lower.contains("cookie") || lower.contains("navigator.") + || lower.contains(".css") || lower.contains("stylesheet") + || lower.contains("google-analytics") || lower.contains("gtag") + { + return false; + } + + // URL quality checks + if url.contains("javascript:") || url.contains("data:") { + return false; + } + // Reject URLs that are just fragments or relative paths + if url.starts_with('#') || url.starts_with('/') && !url.starts_with("//") { + return false; + } + + // Snippet quality — if snippet looks like code, reject + let snippet_lower = snippet.to_lowercase(); + if snippet_lower.contains("function(") && snippet_lower.contains("return ") + || snippet_lower.contains("var ") && snippet_lower.contains("=") + { + return false; + } + + true } /// Extract href URL from the first tag in text @@ -1913,4 +1969,48 @@ mod tests { }; assert!(query.validate().is_ok()); } + + // --- Quality Filter Tests --- + + #[test] + fn test_quality_rejects_javascript_title() { + assert!(!is_quality_result("function(x) { return x; }", "ok", "https://example.com")); + } + + #[test] + fn test_quality_rejects_short_title() { + assert!(!is_quality_result("A", "snippet", "https://example.com")); + } + + #[test] + fn test_quality_rejects_css_title() { + assert!(!is_quality_result(".stylesheet{color:red}", "ok", "https://example.com")); + } + + #[test] 
+ fn test_quality_rejects_javascript_url() { + assert!(!is_quality_result("Title", "snippet", "javascript:alert(1)")); + } + + #[test] + fn test_quality_accepts_normal_result() { + assert!(is_quality_result("2024年中国医疗政策解读", "相关政策文件摘要", "https://www.gov.cn/policy")); + } + + #[test] + fn test_quality_accepts_english_result() { + assert!(is_quality_result("Rust Programming Language", "A systems programming language", "https://www.rust-lang.org")); + } + + #[test] + fn test_quality_rejects_long_title() { + let long_title: String = "x".repeat(301); + assert!(!is_quality_result(&long_title, "ok", "https://example.com")); + } + + #[test] + fn test_strip_html_tags_collapses_whitespace() { + assert_eq!(strip_html_tags("Hello World"), "Hello World"); + assert_eq!(strip_html_tags("a\n\t b"), "a b"); + } }