/// HTML/Script 内容清理工具。 /// /// 基于 ammonia(html5ever)剥离所有 HTML 标签,防止存储型 XSS。 /// 覆盖场景:用户名、显示名、邮箱、电话等字符串字段。 /// /// 剥离字符串中的所有 HTML 标签,返回纯文本。 /// /// 使用 ammonia 构建 DOM 树,然后用 tendril 收集文本节点。 /// 比手写字符级解析器更安全,能正确处理所有 HTML 边界情况。 pub fn strip_html_tags(input: &str) -> String { // 使用 ammonia 清理(保留在 span 中的纯文本),然后剥离 span 标签 let doc = ammonia::Builder::new() .tags(std::collections::HashSet::new()) .clean(input) .to_string(); // ammonia 的 clean() 结果可能包含 HTML 实体(如 <),需要解码 // 但由于所有标签已被禁止,结果是纯文本(可能有实体转义) // 使用二次清理:将结果作为纯文本处理 decode_entities(&doc).trim().to_string() } /// 简单解码常见 HTML 实体。 fn decode_entities(input: &str) -> String { input .replace("<", "<") .replace(">", ">") .replace("&", "&") .replace(""", "\"") .replace("'", "'") .replace("'", "'") .replace("/", "/") .replace(" ", " ") } /// 对 Option 类型的字段进行清理。 pub fn sanitize_option(input: Option) -> Option { input.map(|s| strip_html_tags(&s)).filter(|s| !s.is_empty()) } /// 对 String 类型的必填字段进行清理。 pub fn sanitize_string(input: &str) -> String { strip_html_tags(input) } /// 对富文本 HTML 进行安全清理,保留安全的 HTML 标签和内联样式,去除危险元素。 /// 适用于文章内容等需要保留 HTML 排版的场景。 pub fn sanitize_rich_html(input: &str) -> String { use std::collections::{HashMap, HashSet}; let tag_attrs: HashMap<&str, HashSet<&str>> = [ ("div", HashSet::from(["style", "data-w-e-type"])), ("span", HashSet::from(["style"])), ("p", HashSet::from(["style"])), ( "img", HashSet::from(["src", "alt", "style", "width", "height"]), ), ("a", HashSet::from(["href", "target"])), ("td", HashSet::from(["style", "colspan", "rowspan"])), ("th", HashSet::from(["style", "colspan", "rowspan"])), ("blockquote", HashSet::from(["style"])), ] .into_iter() .collect(); ammonia::Builder::new() .tags( [ "p", "br", "span", "div", "strong", "b", "em", "i", "u", "s", "h1", "h2", "h3", "h4", "h5", "h6", "ul", "ol", "li", "blockquote", "pre", "code", "table", "thead", "tbody", "tr", "th", "td", "img", "a", "hr", ] .into_iter() .collect(), ) .tag_attributes(tag_attrs) .generic_attributes(HashSet::from(["style"])) .url_relative(ammonia::UrlRelative::PassThrough) .clean(input) .to_string() } /// 对 Option 的富文本进行安全清理。 pub fn sanitize_rich_html_option(input: Option) -> Option { input .map(|s| sanitize_rich_html(&s)) .filter(|s| !s.trim().is_empty()) } #[cfg(test)] mod tests { use super::*; #[test] fn strips_script_tag() { // script 内容在 HTML 规范中是 raw text,ammonia 正确地将其完全移除 assert_eq!(strip_html_tags(""), ""); } #[test] fn strips_img_onerror() { assert_eq!(strip_html_tags(""), ""); } #[test] fn strips_bold_tags() { assert_eq!(strip_html_tags("Hello World"), "Hello World"); } #[test] fn no_tags_passthrough() { assert_eq!(strip_html_tags("Normal text"), "Normal text"); } #[test] fn nested_tags() { assert_eq!(strip_html_tags("

text

"), "text"); } #[test] fn sanitize_option_some() { assert_eq!( sanitize_option(Some("evil".to_string())), Some("evil".to_string()) ); } #[test] fn sanitize_option_none() { assert_eq!(sanitize_option(None), None); } #[test] fn sanitize_option_becomes_empty() { assert_eq!(sanitize_option(Some("".to_string())), None); } #[test] fn strips_nested_script_attack() { let result = strip_html_tags("ipt>alert(1)ipt>"); assert!(!result.contains("<"), "不应残留 HTML 标签"); } #[test] fn strips_unclosed_tag() { let result = strip_html_tags("text Hello

Green box
Bold"#; let result = sanitize_rich_html(html); assert!(result.contains("

Hello

"), "should preserve

tags"); assert!( result.contains("Bold"), "should preserve " ); assert!( result.contains("background"), "should preserve style attribute" ); } #[test] fn rich_html_removes_script() { let html = r#"

Hello

"#; let result = sanitize_rich_html(html); assert!(!result.contains("script"), "should remove script tags"); assert!(result.contains("Hello")); } #[test] fn rich_html_preserves_styled_block() { let html = r#"
Tip content
"#; let result = sanitize_rich_html(html); assert!( result.contains("styled-block"), "should preserve data-w-e-type" ); assert!(result.contains("Tip content")); } }