Files
hms/crates/erp-core/src/sanitize.rs
iven e00ee69d28 fix(core,health): 文章内容 sanitize 保留安全 HTML 标签 + 血透测试文章种子
- 新增 sanitize_rich_html() 使用 ammonia 白名单保留安全 HTML 标签和内联样式
- 修复文章创建/更新时 content 被 strip_html_tags() 完全剥离的问题
- ammonia 4 不允许手动指定 <a> 的 rel 属性(自动管理),已从 tag_attrs 移除
- 新增 3 个 sanitize_rich_html 单元测试
- 新增 seed-dialysis-articles.mjs 种子脚本(4 篇血透相关富文本文章)
2026-05-11 03:13:43 +08:00

219 lines
6.5 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/// Strips every HTML tag from `input` and returns the remaining plain text.
///
/// The input is cleaned by `ammonia` configured with an empty tag allow-list,
/// so no element is kept. The rendered result is then passed through
/// `decode_entities` and trimmed of leading and trailing whitespace.
///
/// Intended for plain string fields such as user names, display names,
/// e-mail addresses and phone numbers, as a guard against stored XSS.
///
/// Because entities are decoded, the returned value may contain literal `<`,
/// `>` or `&` characters. It is plain text, not HTML: escape it again before
/// embedding it in markup.
pub fn strip_html_tags(input: &str) -> String {
    // An empty allow-list: the cleaner is not permitted to keep any element.
    let allowed_tags = std::collections::HashSet::new();

    let mut cleaner = ammonia::Builder::new();
    cleaner.tags(allowed_tags);

    // The cleaner renders text with HTML entity escaping (e.g. `&lt;`), so
    // the entities are decoded to obtain the plain-text form.
    let escaped_text = cleaner.clean(input).to_string();
    let plain_text = decode_entities(&escaped_text);
    plain_text.trim().to_string()
}
/// Decodes a small, fixed set of common HTML entities in `input`.
///
/// Handled entities: `&lt;`, `&gt;`, `&amp;`, `&quot;`, `&#39;`, `&apos;`,
/// `&#47;` and `&#32;`. Anything else, including a lone `&`, is copied
/// through unchanged.
///
/// Decoding is done in a single left-to-right pass, so already decoded output
/// is never scanned again. This matters for double-escaped text: `&amp;quot;`
/// decodes to the literal text `&quot;`, not to `"`.
///
/// NOTE(review): only the entities listed above are recognised; any other
/// character reference (for example `&nbsp;`) is left as-is. Confirm whether
/// the upstream serializer can emit others.
fn decode_entities(input: &str) -> String {
    const ENTITIES: [(&str, char); 8] = [
        ("&lt;", '<'),
        ("&gt;", '>'),
        ("&amp;", '&'),
        ("&quot;", '"'),
        ("&#39;", '\''),
        ("&apos;", '\''),
        ("&#47;", '/'),
        ("&#32;", ' '),
    ];

    let mut decoded = String::with_capacity(input.len());
    let mut rest = input;

    while let Some(pos) = rest.find('&') {
        // Copy everything before the ampersand verbatim.
        decoded.push_str(&rest[..pos]);
        let tail = &rest[pos..];

        match ENTITIES.iter().find(|(name, _)| tail.starts_with(*name)) {
            Some((name, ch)) => {
                decoded.push(*ch);
                // Skip the whole entity so its replacement is not rescanned.
                rest = &tail[name.len()..];
            }
            None => {
                // Not a known entity: keep the ampersand (one ASCII byte).
                decoded.push('&');
                rest = &tail[1..];
            }
        }
    }

    decoded.push_str(rest);
    decoded
}
/// Sanitises an optional string field.
///
/// `None` is returned unchanged. A `Some` value is passed through
/// `strip_html_tags`; if nothing is left after stripping, the result is
/// `None` rather than `Some("")`.
pub fn sanitize_option(input: Option<String>) -> Option<String> {
    let cleaned = strip_html_tags(input?.as_str());
    if cleaned.is_empty() {
        None
    } else {
        Some(cleaned)
    }
}
/// Sanitises a required string field.
///
/// Delegates directly to `strip_html_tags` and returns its result unchanged.
pub fn sanitize_string(input: &str) -> String {
strip_html_tags(input)
}
/// Sanitises rich-text HTML while keeping an allow-list of formatting tags,
/// a small set of per-tag attributes, and inline `style` attributes.
///
/// Intended for content such as article bodies, where HTML layout has to be
/// preserved. Cleaning is delegated to `ammonia`, configured with the tag and
/// attribute allow-lists declared below.
///
/// NOTE(review): `style` is allowed on every kept element, and this function
/// does not configure any filtering of the CSS inside it. Confirm that is
/// acceptable for the places where the output is rendered.
pub fn sanitize_rich_html(input: &str) -> String {
    use std::collections::{HashMap, HashSet};

    /// Elements that may appear in the cleaned output.
    const ALLOWED_TAGS: &[&str] = &[
        "p", "br", "span", "div", "strong", "b", "em", "i", "u", "s",
        "h1", "h2", "h3", "h4", "h5", "h6",
        "ul", "ol", "li",
        "blockquote", "pre", "code",
        "table", "thead", "tbody", "tr", "th", "td",
        "img", "a", "hr",
    ];

    /// Attributes allowed on specific elements only.
    const TAG_ATTRIBUTES: &[(&str, &[&str])] = &[
        ("div", &["style", "data-w-e-type"]),
        ("span", &["style"]),
        ("p", &["style"]),
        ("img", &["src", "alt", "style", "width", "height"]),
        // `rel` is deliberately not listed for `<a>`.
        ("a", &["href", "target"]),
        ("td", &["style", "colspan", "rowspan"]),
        ("th", &["style", "colspan", "rowspan"]),
        ("blockquote", &["style"]),
    ];

    let allowed_tags: HashSet<&str> = ALLOWED_TAGS.iter().copied().collect();
    let per_tag_attrs: HashMap<&str, HashSet<&str>> = TAG_ATTRIBUTES
        .iter()
        .map(|&(tag, attrs)| (tag, attrs.iter().copied().collect()))
        .collect();

    let mut cleaner = ammonia::Builder::new();
    cleaner
        .tags(allowed_tags)
        .tag_attributes(per_tag_attrs)
        // `style` is accepted on any element that survives cleaning.
        .generic_attributes(HashSet::from(["style"]))
        // Relative URLs are passed through as written.
        .url_relative(ammonia::UrlRelative::PassThrough);

    cleaner.clean(input).to_string()
}
/// Sanitises an optional rich-text HTML field.
///
/// `None` is returned unchanged. A `Some` value is passed through
/// `sanitize_rich_html`; if the cleaned HTML is empty or whitespace-only the
/// result is `None`, otherwise the cleaned HTML is returned untrimmed.
pub fn sanitize_rich_html_option(input: Option<String>) -> Option<String> {
    let cleaned = sanitize_rich_html(input?.as_str());
    if cleaned.trim().is_empty() {
        None
    } else {
        Some(cleaned)
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    // ---- strip_html_tags -------------------------------------------------

    #[test]
    fn strips_script_tag() {
        // The script element is expected to vanish together with its content.
        let cleaned = strip_html_tags("<script>alert('xss')</script>");
        assert_eq!(cleaned, "");
    }

    #[test]
    fn strips_img_onerror() {
        let cleaned = strip_html_tags("<img src=x onerror=alert(1)>");
        assert_eq!(cleaned, "");
    }

    #[test]
    fn strips_bold_tags() {
        let cleaned = strip_html_tags("Hello <b>World</b>");
        assert_eq!(cleaned, "Hello World");
    }

    #[test]
    fn no_tags_passthrough() {
        let plain = "Normal text";
        assert_eq!(strip_html_tags(plain), plain);
    }

    #[test]
    fn nested_tags() {
        let cleaned = strip_html_tags("<div><p>text</p></div>");
        assert_eq!(cleaned, "text");
    }

    // ---- sanitize_option -------------------------------------------------

    #[test]
    fn sanitize_option_some() {
        let cleaned = sanitize_option(Some(String::from("<b>evil</b>")));
        assert_eq!(cleaned, Some(String::from("evil")));
    }

    #[test]
    fn sanitize_option_none() {
        let cleaned = sanitize_option(None);
        assert_eq!(cleaned, None);
    }

    #[test]
    fn sanitize_option_becomes_empty() {
        // A value that is empty after stripping collapses to `None`.
        let cleaned = sanitize_option(Some(String::from("<img>")));
        assert_eq!(cleaned, None);
    }

    // ---- malformed / tricky input ---------------------------------------

    #[test]
    fn strips_nested_script_attack() {
        let attack = "<scr<script>ipt>alert(1)</scr</script>ipt>";
        let cleaned = strip_html_tags(attack);
        assert!(!cleaned.contains("<"), "不应残留 HTML 标签");
    }

    #[test]
    fn strips_unclosed_tag() {
        let cleaned = strip_html_tags("text <img");
        // Either the leading text survives or everything is dropped.
        let acceptable = cleaned.contains("text") || cleaned.is_empty();
        assert!(acceptable);
    }

    #[test]
    fn handles_entities() {
        let cleaned = strip_html_tags("a &lt; b");
        let keeps_both_operands = cleaned.contains("a") && cleaned.contains("b");
        assert!(keeps_both_operands);
    }

    // ---- sanitize_rich_html ---------------------------------------------

    #[test]
    fn rich_html_preserves_safe_tags() {
        let source = r#"<p>Hello</p><div style="background:#f0fdf4;padding:14px">Green box</div><strong>Bold</strong>"#;
        let cleaned = sanitize_rich_html(source);

        assert!(cleaned.contains("<p>Hello</p>"), "should preserve <p> tags");
        assert!(
            cleaned.contains("<strong>Bold</strong>"),
            "should preserve <strong>"
        );
        assert!(
            cleaned.contains("background"),
            "should preserve style attribute"
        );
    }

    #[test]
    fn rich_html_removes_script() {
        let source = r#"<p>Hello</p><script>alert(1)</script>"#;
        let cleaned = sanitize_rich_html(source);

        assert!(!cleaned.contains("script"), "should remove script tags");
        assert!(cleaned.contains("Hello"));
    }

    #[test]
    fn rich_html_preserves_styled_block() {
        let source = r#"<div data-w-e-type="styled-block" style="background:#f0fdf4;border-radius:8px;padding:14px">Tip content</div>"#;
        let cleaned = sanitize_rich_html(source);

        assert!(
            cleaned.contains("styled-block"),
            "should preserve data-w-e-type"
        );
        assert!(cleaned.contains("Tip content"));
    }
}