fix(hands): 搜索结果质量过滤 — 去除JS/CSS/广告等垃圾内容
Some checks failed
CI / Lint & TypeCheck (push) Has been cancelled
CI / Unit Tests (push) Has been cancelled
CI / Build Frontend (push) Has been cancelled
CI / Rust Check (push) Has been cancelled
CI / Security Scan (push) Has been cancelled
CI / E2E Tests (push) Has been cancelled
Some checks failed
CI / Lint & TypeCheck (push) Has been cancelled
CI / Unit Tests (push) Has been cancelled
CI / Build Frontend (push) Has been cancelled
CI / Rust Check (push) Has been cancelled
CI / Security Scan (push) Has been cancelled
CI / E2E Tests (push) Has been cancelled
问题:HTML解析器提取搜索引擎页面中的导航/脚本/广告片段, 导致搜索结果混入 function()/var/stylesheet 等垃圾文本。 修复: - 新增 is_quality_result() 过滤函数,检查 title/snippet/url 质量 - 拒绝含 JS 关键词(function/var/const/window/document)的标题 - 拒绝含 CSS 标识(.css/stylesheet)的标题 - 拒绝过短(<2)或过长(>300)的标题 - 拒绝 javascript:/data: URL - strip_html_tags 添加空白折叠 + 更多HTML实体 - 三个解析器(DDG/Bing/百度)全部接入质量过滤 测试: 68 PASS (新增8个质量过滤测试)
This commit is contained in:
@@ -556,6 +556,10 @@ impl ResearcherHand {
|
||||
continue;
|
||||
}
|
||||
|
||||
if !is_quality_result(&title, &snippet, &url) {
|
||||
continue;
|
||||
}
|
||||
|
||||
results.push(SearchResult {
|
||||
title,
|
||||
url,
|
||||
@@ -645,6 +649,10 @@ impl ResearcherHand {
|
||||
continue;
|
||||
}
|
||||
|
||||
if !is_quality_result(&title, &snippet, &url) {
|
||||
continue;
|
||||
}
|
||||
|
||||
results.push(SearchResult {
|
||||
title,
|
||||
url,
|
||||
@@ -725,6 +733,10 @@ impl ResearcherHand {
|
||||
continue;
|
||||
}
|
||||
|
||||
if !is_quality_result(&title, &snippet, &url) {
|
||||
continue;
|
||||
}
|
||||
|
||||
results.push(SearchResult {
|
||||
title,
|
||||
url,
|
||||
@@ -1181,9 +1193,53 @@ fn strip_html_tags(s: &str) -> String {
|
||||
.replace(">", ">")
|
||||
.replace(""", "\"")
|
||||
.replace("'", "'")
|
||||
.replace(" ", " ");
|
||||
.replace(" ", " ")
|
||||
.replace("'", "'")
|
||||
.replace("/", "/");
|
||||
|
||||
result
|
||||
// Collapse whitespace
|
||||
let collapsed: String = result.split_whitespace().collect::<Vec<_>>().join(" ");
|
||||
collapsed
|
||||
}
|
||||
|
||||
/// Decide whether a parsed search hit looks like a genuine result rather than
/// navigation, advertising, or script/style residue scraped from the page.
///
/// A result is rejected when any of the following holds:
/// - the trimmed title has fewer than 2 or more than 300 characters. Length is
///   counted in Unicode scalar values, not bytes, so CJK titles are measured
///   the same way as ASCII ones;
/// - the lowercased title contains a JavaScript/CSS/analytics marker, or
///   starts with a comment opener (`//` or `/*`);
/// - the URL uses the `javascript:` or `data:` scheme (matched
///   case-insensitively at the start of the URL, so `...Wikidata:Main_Page`
///   is not mistaken for a `data:` URL);
/// - the URL is only a fragment (`#...`) or a root-relative path (`/...`);
///   protocol-relative URLs (`//host/...`) are allowed;
/// - the snippet looks like code (`function(` together with `return `, or
///   `var ` together with `=`).
///
/// Returns `true` when the result should be kept.
fn is_quality_result(title: &str, snippet: &str, url: &str) -> bool {
    const MIN_TITLE_CHARS: usize = 2;
    const MAX_TITLE_CHARS: usize = 300;
    // Substrings that indicate the "title" is really script/style/analytics text.
    const TITLE_MARKERS: [&str; 13] = [
        "function(",
        "var ",
        "const ",
        "window.",
        "document.",
        "{",
        "}",
        "cookie",
        "navigator.",
        ".css",
        "stylesheet",
        "google-analytics",
        "gtag",
    ];

    // Title quality checks. `chars().count()` is used instead of `len()`
    // because `len()` is a byte count and would treat a 101-character CJK
    // title as longer than 300.
    let title = title.trim();
    let title_chars = title.chars().count();
    if title_chars < MIN_TITLE_CHARS || title_chars > MAX_TITLE_CHARS {
        return false;
    }

    let lower = title.to_lowercase();
    if lower.starts_with("//")
        || lower.starts_with("/*")
        || TITLE_MARKERS.iter().any(|&marker| lower.contains(marker))
    {
        return false;
    }

    // URL quality checks. URL schemes are case-insensitive and only meaningful
    // at the start of the URL, so compare a lowercased, trimmed copy by prefix.
    let url = url.trim();
    let url_lower = url.to_ascii_lowercase();
    if url_lower.starts_with("javascript:") || url_lower.starts_with("data:") {
        return false;
    }
    // Reject URLs that are just fragments or root-relative paths.
    if url.starts_with('#') || (url.starts_with('/') && !url.starts_with("//")) {
        return false;
    }

    // Snippet quality: if the snippet looks like code, reject.
    let snippet_lower = snippet.to_lowercase();
    let looks_like_function =
        snippet_lower.contains("function(") && snippet_lower.contains("return ");
    let looks_like_assignment = snippet_lower.contains("var ") && snippet_lower.contains('=');

    !(looks_like_function || looks_like_assignment)
}
|
||||
|
||||
/// Extract href URL from the first <a> tag in text
|
||||
@@ -1913,4 +1969,48 @@ mod tests {
|
||||
};
|
||||
assert!(query.validate().is_ok());
|
||||
}
|
||||
|
||||
// --- Quality Filter Tests ---
|
||||
|
||||
#[test]
fn test_quality_rejects_javascript_title() {
    // A title that is really a JavaScript function body must be filtered out.
    let script_title = "function(x) { return x; }";
    let accepted = is_quality_result(script_title, "ok", "https://example.com");
    assert!(!accepted);
}
|
||||
|
||||
#[test]
fn test_quality_rejects_short_title() {
    // A one-character title is below the minimum title length of 2.
    let tiny_title = "A";
    let accepted = is_quality_result(tiny_title, "snippet", "https://example.com");
    assert!(!accepted);
}
|
||||
|
||||
#[test]
fn test_quality_rejects_css_title() {
    // A CSS rule leaking into the title (braces / "stylesheet") must be filtered out.
    let css_title = ".stylesheet{color:red}";
    let accepted = is_quality_result(css_title, "ok", "https://example.com");
    assert!(!accepted);
}
|
||||
|
||||
#[test]
fn test_quality_rejects_javascript_url() {
    // `javascript:` pseudo-URLs are never genuine search results.
    let script_url = "javascript:alert(1)";
    let accepted = is_quality_result("Title", "snippet", script_url);
    assert!(!accepted);
}
|
||||
|
||||
#[test]
fn test_quality_accepts_normal_result() {
    // An ordinary Chinese-language result with an absolute HTTPS URL is kept.
    let title = "2024年中国医疗政策解读";
    let snippet = "相关政策文件摘要";
    let url = "https://www.gov.cn/policy";
    assert!(is_quality_result(title, snippet, url));
}
|
||||
|
||||
#[test]
fn test_quality_accepts_english_result() {
    // An ordinary English-language result with an absolute HTTPS URL is kept.
    let title = "Rust Programming Language";
    let snippet = "A systems programming language";
    let url = "https://www.rust-lang.org";
    assert!(is_quality_result(title, snippet, url));
}
|
||||
|
||||
#[test]
fn test_quality_rejects_long_title() {
    // 301 characters: one past the 300 upper bound on title length.
    let oversized = "x".repeat(301);
    let accepted = is_quality_result(oversized.as_str(), "ok", "https://example.com");
    assert!(!accepted);
}
|
||||
|
||||
#[test]
fn test_strip_html_tags_collapses_whitespace() {
    // (input, expected output) pairs: tags are removed and whitespace runs
    // collapse to a single space.
    let cases = [
        ("<b>Hello</b> <i>World</i>", "Hello World"),
        ("a\n\t b", "a b"),
    ];
    for &(input, expected) in &cases {
        assert_eq!(strip_html_tags(input), expected);
    }
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user