fix(hands): 搜索结果质量过滤 — 去除JS/CSS/广告等垃圾内容
Some checks failed
CI / Lint & TypeCheck (push) Has been cancelled
CI / Unit Tests (push) Has been cancelled
CI / Build Frontend (push) Has been cancelled
CI / Rust Check (push) Has been cancelled
CI / Security Scan (push) Has been cancelled
CI / E2E Tests (push) Has been cancelled

问题:HTML解析器提取搜索引擎页面中的导航/脚本/广告片段,
导致搜索结果混入 function()/var/stylesheet 等垃圾文本。

修复:
- 新增 is_quality_result() 过滤函数,检查 title/snippet/url 质量
- 拒绝含 JS 关键词(function/var/const/window/document)的标题
- 拒绝含 CSS 标识(.css/stylesheet)的标题
- 拒绝过短(<2)或过长(>300)的标题
- 拒绝 javascript:/data: URL
- strip_html_tags 添加空白折叠 + 更多HTML实体
- 三个解析器(DDG/Bing/百度)全部接入质量过滤

测试: 68 PASS (新增8个质量过滤测试)
This commit is contained in:
iven
2026-04-22 12:16:02 +08:00
parent 5a0c652f4f
commit ee56bf6087

View File

@@ -556,6 +556,10 @@ impl ResearcherHand {
continue; continue;
} }
if !is_quality_result(&title, &snippet, &url) {
continue;
}
results.push(SearchResult { results.push(SearchResult {
title, title,
url, url,
@@ -645,6 +649,10 @@ impl ResearcherHand {
continue; continue;
} }
if !is_quality_result(&title, &snippet, &url) {
continue;
}
results.push(SearchResult { results.push(SearchResult {
title, title,
url, url,
@@ -725,6 +733,10 @@ impl ResearcherHand {
continue; continue;
} }
if !is_quality_result(&title, &snippet, &url) {
continue;
}
results.push(SearchResult { results.push(SearchResult {
title, title,
url, url,
@@ -1181,9 +1193,53 @@ fn strip_html_tags(s: &str) -> String {
.replace("&gt;", ">") .replace("&gt;", ">")
.replace("&quot;", "\"") .replace("&quot;", "\"")
.replace("&#39;", "'") .replace("&#39;", "'")
.replace("&nbsp;", " "); .replace("&nbsp;", " ")
.replace("&#x27;", "'")
.replace("&#x2F;", "/");
result // Collapse whitespace
let collapsed: String = result.split_whitespace().collect::<Vec<_>>().join(" ");
collapsed
}
/// Check if a search result is likely genuine (not navigation/ad/script garbage).
///
/// Returns `false` when any of the following holds:
/// - the trimmed title has fewer than 2 or more than 300 characters;
/// - the title contains JavaScript, CSS, cookie-banner or analytics markers,
///   or starts with a code comment (`//`, `/*`);
/// - the URL uses the `javascript:` or `data:` scheme, is a bare fragment
///   (`#...`), or is a root-relative path (`/...`; protocol-relative
///   `//host/...` URLs are allowed);
/// - the snippet looks like source code.
///
/// Otherwise returns `true`.
fn is_quality_result(title: &str, snippet: &str, url: &str) -> bool {
    // Substrings that mark a title as script/style/tracking residue.
    const TITLE_MARKERS: &[&str] = &[
        "function(",
        "window.",
        "document.",
        "{",
        "}",
        "cookie",
        "navigator.",
        ".css",
        "stylesheet",
        "google-analytics",
        "gtag",
    ];

    // True when `keyword` occurs in `text` and is not merely the tail of a
    // longer ASCII identifier/word (so "var " matches in "var x" but not in
    // "bolivar x"). Non-ASCII neighbours (e.g. CJK text) still count as a
    // boundary because such text is written without spaces.
    fn has_keyword(text: &str, keyword: &str) -> bool {
        text.match_indices(keyword).any(|(idx, _)| {
            text[..idx]
                .chars()
                .next_back()
                .map_or(true, |prev| !prev.is_ascii_alphanumeric() && prev != '_')
        })
    }

    // Case-insensitive test that `url` begins with `scheme` (including the colon).
    fn has_scheme(url: &str, scheme: &str) -> bool {
        url.get(..scheme.len())
            .map_or(false, |prefix| prefix.eq_ignore_ascii_case(scheme))
    }

    // Title length is measured in characters, not UTF-8 bytes, so that
    // multi-byte scripts (CJK etc.) get the same 2..=300 limits as ASCII.
    let title = title.trim();
    let title_chars = title.chars().count();
    if !(2..=300).contains(&title_chars) {
        return false;
    }

    // Reject titles with JavaScript/CSS indicators.
    let lower = title.to_lowercase();
    let title_is_garbage = TITLE_MARKERS.iter().any(|&marker| lower.contains(marker))
        || has_keyword(&lower, "var ")
        || has_keyword(&lower, "const ")
        || lower.starts_with("//")
        || lower.starts_with("/*");
    if title_is_garbage {
        return false;
    }

    // URL quality checks: only the scheme matters, so match it as a prefix
    // (case-insensitively) instead of searching the whole URL, which would
    // reject legitimate links that merely contain "data:" in a path or query.
    let url = url.trim();
    if has_scheme(url, "javascript:") || has_scheme(url, "data:") {
        return false;
    }
    // Reject URLs that are just fragments or root-relative paths.
    if url.starts_with('#') || (url.starts_with('/') && !url.starts_with("//")) {
        return false;
    }

    // Snippet quality: a function body or a variable assignment means code.
    let snippet_lower = snippet.to_lowercase();
    let snippet_is_code = (snippet_lower.contains("function(")
        && snippet_lower.contains("return "))
        || (has_keyword(&snippet_lower, "var ") && snippet_lower.contains('='));

    !snippet_is_code
}
/// Extract href URL from the first <a> tag in text /// Extract href URL from the first <a> tag in text
@@ -1913,4 +1969,48 @@ mod tests {
}; };
assert!(query.validate().is_ok()); assert!(query.validate().is_ok());
} }
// --- Quality Filter Tests ---
/// A title consisting of a JavaScript function literal must be filtered out.
#[test]
fn test_quality_rejects_javascript_title() {
    let (title, snippet, url) = ("function(x) { return x; }", "ok", "https://example.com");
    let accepted = is_quality_result(title, snippet, url);
    assert!(!accepted);
}
/// A one-letter title is below the minimum title length and must be rejected.
#[test]
fn test_quality_rejects_short_title() {
    let one_letter_title = "A";
    let accepted = is_quality_result(one_letter_title, "snippet", "https://example.com");
    assert!(!accepted);
}
/// A title that is a CSS rule must be filtered out.
#[test]
fn test_quality_rejects_css_title() {
    let css_rule = ".stylesheet{color:red}";
    let accepted = is_quality_result(css_rule, "ok", "https://example.com");
    assert!(!accepted);
}
/// A `javascript:` URL must cause the result to be rejected even when the
/// title and snippet look fine.
#[test]
fn test_quality_rejects_javascript_url() {
    let script_url = "javascript:alert(1)";
    let accepted = is_quality_result("Title", "snippet", script_url);
    assert!(!accepted);
}
/// An ordinary Chinese-language result with an https URL must pass the filter.
#[test]
fn test_quality_accepts_normal_result() {
    let title = "2024年中国医疗政策解读";
    let snippet = "相关政策文件摘要";
    let url = "https://www.gov.cn/policy";
    let accepted = is_quality_result(title, snippet, url);
    assert!(accepted);
}
/// An ordinary English-language result with an https URL must pass the filter.
#[test]
fn test_quality_accepts_english_result() {
    let title = "Rust Programming Language";
    let snippet = "A systems programming language";
    let url = "https://www.rust-lang.org";
    let accepted = is_quality_result(title, snippet, url);
    assert!(accepted);
}
/// A 301-character title exceeds the maximum title length and must be rejected.
#[test]
fn test_quality_rejects_long_title() {
    let oversized = "x".repeat(301);
    let accepted = is_quality_result(oversized.as_str(), "ok", "https://example.com");
    assert!(!accepted);
}
/// `strip_html_tags` must remove tags and collapse any run of whitespace
/// (spaces, newlines, tabs) into a single space.
#[test]
fn test_strip_html_tags_collapses_whitespace() {
    // (input, expected output) pairs.
    let cases = [
        ("<b>Hello</b> <i>World</i>", "Hello World"),
        ("a\n\t b", "a b"),
    ];
    for &(input, expected) in cases.iter() {
        assert_eq!(strip_html_tags(input), expected);
    }
}
} }