// hms/crates/erp-core/src/sanitize.rs

/// Strips every HTML tag from `input` and returns the remaining plain text.
///
/// Intended for plain-text fields (user names, display names, e-mail
/// addresses, phone numbers, ...) to prevent stored XSS. Parsing is delegated
/// to ammonia (html5ever) instead of a hand-written character scanner.
///
/// ammonia returns serialized HTML, so text such as `<` comes back as `&lt;`
/// and has to be decoded to obtain plain text. Decoding can *re-create* markup
/// that was not a tag during the first parse, e.g.
/// `&lt;script&gt;alert(1)&lt;/script&gt;` or `<<b>script>alert(1)<<b>/script>`
/// both decode to `<script>alert(1)</script>`. The decoded text is therefore
/// re-scanned until it no longer contains anything the parser treats as a tag.
///
/// The returned text is trimmed. It may still contain a literal `<` that is
/// not part of a tag (for example `a < b`).
pub fn strip_html_tags(input: &str) -> String {
    // Upper bound on re-scan passes so deeply nested input cannot cause
    // unbounded work. Each extra nesting level needs one extra pass.
    const MAX_PASSES: usize = 8;

    // Empty allow-list: every tag is dropped, only text content is kept.
    let mut stripper = ammonia::Builder::new();
    stripper.tags(std::collections::HashSet::new());
    let strip_once = |html: &str| decode_entities(&stripper.clean(html).to_string());

    let mut text = strip_once(input);
    for _ in 0..MAX_PASSES {
        // Without `<` there cannot be a tag, so the text is final.
        if !text.contains('<') {
            return text.trim().to_string();
        }
        // `&` is escaped first so that entity-looking text is kept verbatim
        // and only real tags are removed by the re-scan.
        let rescanned = strip_once(&text.replace('&', "&amp;"));
        if rescanned == text {
            return text.trim().to_string();
        }
        text = rescanned;
    }

    // Not stable within the pass budget: return ammonia's entity-escaped
    // output, which never contains a raw `<`, rather than risk leaking a tag.
    stripper
        .clean(&text.replace('&', "&amp;"))
        .to_string()
        .trim()
        .to_string()
}

/// Decodes a small, fixed set of common HTML entities into their characters.
///
/// Supported entities: `&lt;`, `&gt;`, `&amp;`, `&quot;`, `&#39;`, `&apos;`,
/// `&#47;`, `&#32;` and `&nbsp;` (which the HTML serializer emits for U+00A0).
/// Anything else, including a bare `&`, is copied through unchanged.
///
/// The input is scanned once from left to right and every entity is decoded
/// exactly once, so double-encoded text such as `&amp;quot;` yields the
/// literal text `&quot;` rather than `"`.
fn decode_entities(input: &str) -> String {
    const ENTITIES: [(&str, char); 9] = [
        ("&lt;", '<'),
        ("&gt;", '>'),
        ("&amp;", '&'),
        ("&quot;", '"'),
        ("&#39;", '\''),
        ("&apos;", '\''),
        ("&#47;", '/'),
        ("&#32;", ' '),
        ("&nbsp;", '\u{a0}'),
    ];

    let mut decoded = String::with_capacity(input.len());
    let mut rest = input;
    while let Some(amp) = rest.find('&') {
        decoded.push_str(&rest[..amp]);
        let tail = &rest[amp..];
        match ENTITIES.iter().find(|(name, _)| tail.starts_with(name)) {
            Some(&(name, ch)) => {
                decoded.push(ch);
                rest = &tail[name.len()..];
            }
            None => {
                // Not a known entity: keep the ampersand and continue after it.
                // `&` is one byte, so the slice stays on a char boundary.
                decoded.push('&');
                rest = &tail[1..];
            }
        }
    }
    decoded.push_str(rest);
    decoded
}

/// Sanitizes an optional plain-text field.
///
/// `None` stays `None`. A present value is passed through
/// [`strip_html_tags`]; if nothing is left afterwards the field collapses to
/// `None` instead of `Some("")`.
pub fn sanitize_option(input: Option<String>) -> Option<String> {
    let raw = input?;
    let cleaned = strip_html_tags(&raw);
    if cleaned.is_empty() {
        None
    } else {
        Some(cleaned)
    }
}

/// Sanitizes a required (non-optional) string field.
///
/// Delegates directly to [`strip_html_tags`]. Unlike [`sanitize_option`], an
/// empty result is returned as an empty `String` rather than being dropped.
pub fn sanitize_string(input: &str) -> String {
    strip_html_tags(input)
}

/// Sanitizes rich-text HTML while keeping safe formatting markup.
///
/// Meant for content such as article bodies where HTML layout must survive.
/// Only the elements and attributes listed below are kept (including inline
/// `style`); everything else is removed by ammonia. Relative URLs are passed
/// through unchanged.
pub fn sanitize_rich_html(input: &str) -> String {
    use std::collections::{HashMap, HashSet};

    // Elements that are allowed to remain in the output.
    const ALLOWED_TAGS: &[&str] = &[
        "p",
        "br",
        "span",
        "div",
        "strong",
        "b",
        "em",
        "i",
        "u",
        "s",
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "ul",
        "ol",
        "li",
        "blockquote",
        "pre",
        "code",
        "table",
        "thead",
        "tbody",
        "tr",
        "th",
        "td",
        "img",
        "a",
        "hr",
    ];

    // Attributes allowed on specific elements.
    const TAG_ATTRIBUTES: &[(&str, &[&str])] = &[
        ("div", &["style", "data-w-e-type"]),
        ("span", &["style"]),
        ("p", &["style"]),
        ("img", &["src", "alt", "style", "width", "height"]),
        ("a", &["href", "target"]),
        ("td", &["style", "colspan", "rowspan"]),
        ("th", &["style", "colspan", "rowspan"]),
        ("blockquote", &["style"]),
    ];

    let allowed_tags: HashSet<&str> = ALLOWED_TAGS.iter().copied().collect();
    let per_tag_attributes: HashMap<&str, HashSet<&str>> = TAG_ATTRIBUTES
        .iter()
        .map(|&(tag, attrs)| (tag, attrs.iter().copied().collect::<HashSet<&str>>()))
        .collect();

    // `style` is additionally allowed on every permitted element.
    let mut shared_attributes: HashSet<&str> = HashSet::new();
    shared_attributes.insert("style");

    let mut builder = ammonia::Builder::new();
    builder
        .tags(allowed_tags)
        .tag_attributes(per_tag_attributes)
        .generic_attributes(shared_attributes)
        .url_relative(ammonia::UrlRelative::PassThrough);

    builder.clean(input).to_string()
}

/// Sanitizes an optional rich-text HTML field.
///
/// `None` stays `None`. A present value is cleaned with
/// [`sanitize_rich_html`]; if the cleaned markup is empty or whitespace-only
/// the field becomes `None`. Otherwise the cleaned markup is returned as-is
/// (it is not trimmed).
pub fn sanitize_rich_html_option(input: Option<String>) -> Option<String> {
    let html = input?;
    let cleaned = sanitize_rich_html(&html);
    if cleaned.trim().is_empty() {
        None
    } else {
        Some(cleaned)
    }
}

// Unit tests for the plain-text helpers (`strip_html_tags`, `sanitize_option`)
// and the rich-text helper (`sanitize_rich_html`).
#[cfg(test)]
mod tests {
    use super::*;

    // A complete <script> element must vanish, including its body.
    #[test]
    fn strips_script_tag() {
        // Script content is raw text per the HTML spec; ammonia removes it entirely.
        assert_eq!(strip_html_tags("<script>alert('xss')</script>"), "");
    }

    // A void element carrying an event-handler attribute leaves no text behind.
    #[test]
    fn strips_img_onerror() {
        assert_eq!(strip_html_tags("<img src=x onerror=alert(1)>"), "");
    }

    // Formatting tags are removed but their text content is kept.
    #[test]
    fn strips_bold_tags() {
        assert_eq!(strip_html_tags("Hello <b>World</b>"), "Hello World");
    }

    // Input without markup is returned unchanged.
    #[test]
    fn no_tags_passthrough() {
        assert_eq!(strip_html_tags("Normal text"), "Normal text");
    }

    // Nested block elements collapse to their inner text.
    #[test]
    fn nested_tags() {
        assert_eq!(strip_html_tags("<div><p>text</p></div>"), "text");
    }

    // `Some` input is sanitized and stays `Some` when text remains.
    #[test]
    fn sanitize_option_some() {
        assert_eq!(
            sanitize_option(Some("<b>evil</b>".to_string())),
            Some("evil".to_string())
        );
    }

    // `None` input is passed through untouched.
    #[test]
    fn sanitize_option_none() {
        assert_eq!(sanitize_option(None), None);
    }

    // A value that sanitizes to the empty string is reported as `None`.
    #[test]
    fn sanitize_option_becomes_empty() {
        assert_eq!(sanitize_option(Some("<img>".to_string())), None);
    }

    // Splitting a tag name around another tag must not leave any `<` behind.
    #[test]
    fn strips_nested_script_attack() {
        let result = strip_html_tags("<scr<script>ipt>alert(1)</scr</script>ipt>");
        assert!(!result.contains("<"), "不应残留 HTML 标签");
    }

    // An unterminated tag at end of input: either the leading text survives or
    // the result is empty; the exact parser outcome is deliberately not pinned.
    #[test]
    fn strips_unclosed_tag() {
        let result = strip_html_tags("text <img");
        assert!(result.contains("text") || result.is_empty());
    }

    // Entity-encoded text keeps the surrounding characters.
    #[test]
    fn handles_entities() {
        let result = strip_html_tags("a &lt; b");
        assert!(result.contains("a") && result.contains("b"));
    }

    // Allowed elements and the inline `style` attribute survive rich-text cleaning.
    #[test]
    fn rich_html_preserves_safe_tags() {
        let html = r#"<p>Hello</p><div style="background:#f0fdf4;padding:14px">Green box</div><strong>Bold</strong>"#;
        let result = sanitize_rich_html(html);
        assert!(result.contains("<p>Hello</p>"), "should preserve <p> tags");
        assert!(
            result.contains("<strong>Bold</strong>"),
            "should preserve <strong>"
        );
        assert!(
            result.contains("background"),
            "should preserve style attribute"
        );
    }

    // <script> is not on the allow-list and must be dropped from rich text.
    #[test]
    fn rich_html_removes_script() {
        let html = r#"<p>Hello</p><script>alert(1)</script>"#;
        let result = sanitize_rich_html(html);
        assert!(!result.contains("script"), "should remove script tags");
        assert!(result.contains("Hello"));
    }

    // The `data-w-e-type` attribute on <div> is explicitly allowed and must be kept.
    #[test]
    fn rich_html_preserves_styled_block() {
        let html = r#"<div data-w-e-type="styled-block" style="background:#f0fdf4;border-radius:8px;padding:14px">Tip content</div>"#;
        let result = sanitize_rich_html(html);
        assert!(
            result.contains("styled-block"),
            "should preserve data-w-e-type"
        );
        assert!(result.contains("Tip content"));
    }
}