// zclaw_openfang/crates/zclaw-saas/src/knowledge/extractors.rs

//! 文档处理管线 — PDF/DOCX/Excel 格式提取
//!
//! 核心思想：每种格式输出统一的 NormalizedDocument，后面复用现有管线。
//! Excel 走独立的结构化通道（JSONB 行级存储），不走 RAG。

use calamine::{Reader, Data, Range};

// === 规范化文档 — 所有格式的统一中间表示 ===

/// Result of document extraction (used by the RAG channel).
///
/// This is the unified intermediate representation that every document
/// format is converted into, so the rest of the pipeline can be reused.
pub struct NormalizedDocument {
    /// Document title; the extractors in this file derive it from the file name.
    pub title: String,
    /// Extracted sections, in the order they were found.
    pub sections: Vec<DocumentSection>,
    /// Information about the source file and extraction totals.
    pub metadata: DocumentMetadata,
}

/// One section of a [`NormalizedDocument`]: an optional heading plus body text.
pub struct DocumentSection {
    /// Section heading, if the section has one.
    pub heading: Option<String>,
    /// Body text of the section.
    pub content: String,
    /// Heading level; the extractors in this file always set it to 2.
    pub level: u8,
    /// 1-based page number; set by the PDF extractor, `None` for DOCX.
    pub page_number: Option<u32>,
}

/// Source-file information attached to a [`NormalizedDocument`].
pub struct DocumentMetadata {
    /// Source format tag; the extractors in this file set `"pdf"` or `"docx"`.
    pub source_format: String,
    /// File name exactly as it was passed to the extractor.
    pub file_name: String,
    /// Page count; set by the PDF extractor, `None` for DOCX.
    pub total_pages: Option<u32>,
    /// Number of entries in the document's `sections`.
    pub total_sections: u32,
}

// === 格式路由 ===

/// Determines the processing channel from the file name's extension.
///
/// The extension is the text after the last `.` and is compared
/// case-insensitively.
///
/// Returns `None` when the name contains no `.` at all, or when the extension
/// is not one of the supported ones. A name without a dot (for example a file
/// literally called `pdf`) has no extension and is therefore never mapped to
/// a format.
pub fn detect_format(file_name: &str) -> Option<DocumentFormat> {
    // `rsplit_once` yields `None` for dot-less names, so the whole name is
    // never mistaken for an extension.
    let (_, ext) = file_name.rsplit_once('.')?;
    match ext.to_lowercase().as_str() {
        "pdf" => Some(DocumentFormat::Pdf),
        "docx" | "doc" => Some(DocumentFormat::Docx),
        "xlsx" | "xls" => Some(DocumentFormat::Excel),
        "md" | "txt" | "markdown" => Some(DocumentFormat::Markdown),
        "csv" => Some(DocumentFormat::Csv),
        _ => None,
    }
}

/// File formats recognised by [`detect_format`].
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum DocumentFormat {
    /// `.pdf` files.
    Pdf,
    /// `.docx` and `.doc` files.
    Docx,
    /// `.xlsx` and `.xls` files.
    Excel,
    /// `.csv` files.
    Csv,
    /// `.md`, `.markdown` and `.txt` files.
    Markdown,
}

impl DocumentFormat {
    /// Reports whether this format belongs to the structured (row-level)
    /// channel rather than the document channel.
    ///
    /// Only `Excel` and `Csv` are structured.
    pub fn is_structured(&self) -> bool {
        match self {
            Self::Excel | Self::Csv => true,
            Self::Pdf | Self::Docx | Self::Markdown => false,
        }
    }
}

// === 文件处理结果 ===

/// Outcome of processing an uploaded file, split by channel.
pub enum ProcessedFile {
    /// Document channel (RAG) — PDF/DOCX/Markdown.
    Document(NormalizedDocument),
    /// Structured channel — Excel/CSV row data.
    Structured {
        /// Title of the file.
        title: String,
        /// Names of the sheets in the workbook.
        sheet_names: Vec<String>,
        /// Column headers collected across sheets, without duplicates.
        column_headers: Vec<String>,
        /// One entry per data row. As produced by `extract_excel`, the tuple
        /// holds: sheet name, row index, the headers of that row's sheet, and
        /// the row as a JSON object keyed by header.
        rows: Vec<(Option<String>, i32, Vec<String>, serde_json::Value)>,
    },
}

// === 提取错误 ===

/// Error produced when a file cannot be extracted; wraps a human-readable message.
#[derive(Debug)]
pub struct ExtractError(pub String);

impl std::fmt::Display for ExtractError {
    /// Writes the wrapped message verbatim.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self(message) = self;
        f.write_str(message)
    }
}

impl std::error::Error for ExtractError {}

impl From<ExtractError> for crate::error::SaasError {
    fn from(e: ExtractError) -> Self {
        crate::error::SaasError::InvalidInput(e.0)
    }
}

// === PDF 提取 ===

pub fn extract_pdf(data: &[u8], file_name: &str) -> Result<NormalizedDocument, ExtractError> {
    let text = pdf_extract::extract_text_from_mem(data)
        .map_err(|e| ExtractError(format!("PDF 提取失败: {}", e)))?;

    let pages: Vec<&str> = text.split('\x0c').collect();
    let page_count = pages.len() as u32;

    let mut sections = Vec::new();
    let mut current_content = String::new();

    for (i, page) in pages.iter().enumerate() {
        let page_text = page.trim();
        if page_text.is_empty() {
            continue;
        }

        current_content.push_str(page_text);
        current_content.push('\n');

        if current_content.len() > 2000 || i == pages.len() - 1 {
            let content = current_content.trim().to_string();
            if !content.is_empty() {
                sections.push(DocumentSection {
                    heading: Some(format!("第 {} 页", i + 1)),
                    content,
                    level: 2,
                    page_number: Some((i + 1) as u32),
                });
            }
            current_content.clear();
        }
    }

    let title = extract_title(file_name, ".pdf");
    let total_sections = sections.len() as u32;

    Ok(NormalizedDocument {
        title,
        sections,
        metadata: DocumentMetadata {
            source_format: "pdf".to_string(),
            file_name: file_name.to_string(),
            total_pages: Some(page_count),
            total_sections,
        },
    })
}

// === DOCX 提取 ===

pub fn extract_docx(data: &[u8], file_name: &str) -> Result<NormalizedDocument, ExtractError> {
    let reader = std::io::Cursor::new(data);
    let mut archive = zip::ZipArchive::new(reader)
        .map_err(|e| ExtractError(format!("DOCX 解压失败: {}", e)))?;

    let mut doc_xml = archive.by_name("word/document.xml")
        .map_err(|e| ExtractError(format!("DOCX 中未找到 document.xml: {}", e)))?;

    let mut xml_content = String::new();
    use std::io::Read;
    doc_xml.read_to_string(&mut xml_content)
        .map_err(|e| ExtractError(format!("DOCX 读取失败: {}", e)))?;

    let mut sections = Vec::new();
    let mut current_heading: Option<String> = None;
    let mut current_content = String::new();

    // 简单 XML 解析：提取 <w:t> 文本和 <w:pStyle> 标题层级
    let mut in_text = false;
    let mut paragraph_style = String::new();
    let mut text_buf = String::new();

    let mut reader = quick_xml::Reader::from_str(&xml_content);
    let mut buf = Vec::new();

    loop {
        match reader.read_event_into(&mut buf) {
            Ok(quick_xml::events::Event::Start(e)) => {
                let name = String::from_utf8_lossy(e.local_name().as_ref()).to_string();
                match name.as_str() {
                    "p" => paragraph_style.clear(),
                    "t" => in_text = true,
                    "pStyle" => {
                        for attr in e.attributes().flatten() {
                            if attr.key.local_name().as_ref() == b"val" {
                                paragraph_style = String::from_utf8_lossy(&attr.value).to_string();
                            }
                        }
                    }
                    _ => {}
                }
            }
            Ok(quick_xml::events::Event::Text(t)) => {
                if in_text {
                    text_buf.push_str(&t.unescape().unwrap_or_default());
                }
            }
            Ok(quick_xml::events::Event::End(e)) => {
                let name = String::from_utf8_lossy(e.local_name().as_ref()).to_string();
                match name.as_str() {
                    "p" => {
                        let text = text_buf.trim().to_string();
                        text_buf.clear();
                        if text.is_empty() { continue; }

                        let is_heading = paragraph_style.starts_with("Heading")
                            || paragraph_style.starts_with("heading")
                            || paragraph_style == "Title";

                        if is_heading {
                            if !current_content.is_empty() {
                                sections.push(DocumentSection {
                                    heading: current_heading.take(),
                                    content: current_content.trim().to_string(),
                                    level: 2,
                                    page_number: None,
                                });
                                current_content.clear();
                            }
                            current_heading = Some(text);
                        } else {
                            current_content.push_str(&text);
                            current_content.push('\n');
                        }
                    }
                    "t" => in_text = false,
                    _ => {}
                }
            }
            Ok(quick_xml::events::Event::Eof) => break,
            Err(e) => {
                tracing::warn!("DOCX XML parse warning: {}", e);
                break;
            }
            _ => {}
        }
        buf.clear();
    }

    if !current_content.is_empty() {
        sections.push(DocumentSection {
            heading: current_heading,
            content: current_content.trim().to_string(),
            level: 2,
            page_number: None,
        });
    }

    let title = extract_title(file_name, ".docx");
    let total_sections = sections.len() as u32;

    Ok(NormalizedDocument {
        title,
        sections,
        metadata: DocumentMetadata {
            source_format: "docx".to_string(),
            file_name: file_name.to_string(),
            total_pages: None,
            total_sections,
        },
    })
}

// === Excel 解析 ===

pub fn extract_excel(data: &[u8], file_name: &str) -> Result<ProcessedFile, ExtractError> {
    let cursor = std::io::Cursor::new(data);
    let mut workbook: calamine::Xlsx<_> = calamine::open_workbook_from_rs(cursor)
        .map_err(|e| ExtractError(format!("Excel 解析失败: {}", e)))?;

    let sheet_names = workbook.sheet_names().to_vec();
    let mut all_rows: Vec<(Option<String>, i32, Vec<String>, serde_json::Value)> = Vec::new();
    let mut all_headers: Vec<String> = Vec::new();
    let mut global_row_index = 0i32;

    for sheet_name in &sheet_names {
        if let Ok(range) = workbook.worksheet_range(sheet_name) {
            let mut headers: Vec<String> = Vec::new();
            let mut first_row = true;

            for row in range_as_data_rows(&range) {
                if first_row {
                    headers = row.iter().map(|cell| {
                        cell.to_string().trim().to_string()
                    }).collect();
                    headers.retain(|h| !h.is_empty());
                    if headers.is_empty() { first_row = false; continue; }
                    for h in &headers {
                        if !all_headers.contains(h) {
                            all_headers.push(h.clone());
                        }
                    }
                    first_row = false;
                    continue;
                }

                let mut row_map = serde_json::Map::new();
                for (i, cell) in row.iter().enumerate() {
                    if i >= headers.len() { break; }
                    let value = match cell {
                        Data::Empty => continue,
                        Data::String(s) => serde_json::Value::String(s.clone()),
                        Data::Float(f) => serde_json::json!(f),
                        Data::Int(n) => serde_json::json!(n),
                        Data::Bool(b) => serde_json::Value::Bool(*b),
                        Data::DateTime(dt) => {
                            serde_json::Value::String(dt.to_string())
                        }
                        Data::DateTimeIso(s) => {
                            serde_json::Value::String(s.clone())
                        }
                        Data::DurationIso(s) => {
                            serde_json::Value::String(s.clone())
                        }
                        Data::Error(e) => {
                            serde_json::Value::String(format!("{:?}", e))
                        }
                    };
                    row_map.insert(headers[i].clone(), value);
                }

                if !row_map.is_empty() {
                    all_rows.push((
                        Some(sheet_name.clone()),
                        global_row_index,
                        headers.clone(),
                        serde_json::Value::Object(row_map),
                    ));
                    global_row_index += 1;
                }
            }
        }
    }

    let title = extract_title(file_name, ".xlsx");

    Ok(ProcessedFile::Structured {
        title,
        sheet_names,
        column_headers: all_headers,
        rows: all_rows,
    })
}

// === 工具函数 ===

/// Helper: copies every row of `range` into an owned `Vec<Data>`, which
/// sidesteps a calamine type-inference problem at the call site.
fn range_as_data_rows(range: &Range<Data>) -> Vec<Vec<Data>> {
    let mut owned_rows = Vec::new();
    for cells in range.rows() {
        owned_rows.push(cells.to_vec());
    }
    owned_rows
}

/// Derives a title from a file name.
///
/// Everything up to and including the last path separator (`/` or `\`) is
/// dropped, then `ext` is removed from the end of what remains.
///
/// `ext` is the extension to strip, including its leading dot (e.g. `".pdf"`).
/// It is compared ASCII case-insensitively and removed at most once, so
/// `Report.PDF` yields `Report` and `a.pdf.pdf` yields `a.pdf`. When the name
/// does not end with `ext`, the base name is returned unchanged.
fn extract_title(file_name: &str, ext: &str) -> String {
    // `rsplit` always yields at least one item, so the fallback is never used.
    let base = file_name
        .rsplit(|c| c == '/' || c == '\\')
        .next()
        .unwrap_or(file_name);

    let stem = match base.len().checked_sub(ext.len()) {
        // The boundary check keeps the slice from cutting a multi-byte
        // character in half when the name is non-ASCII.
        Some(split)
            if base.is_char_boundary(split) && base[split..].eq_ignore_ascii_case(ext) =>
        {
            &base[..split]
        }
        _ => base,
    };

    stem.to_string()
}

/// Renders a `NormalizedDocument` as a single Markdown string.
///
/// Each section contributes an optional `## heading` line followed by its
/// content; sections are separated by a blank line and the final result is
/// trimmed of leading and trailing whitespace.
pub fn normalized_to_markdown(doc: &NormalizedDocument) -> String {
    let blocks: Vec<String> = doc
        .sections
        .iter()
        .map(|section| match &section.heading {
            Some(heading) => format!("## {}\n\n{}", heading, section.content),
            None => section.content.clone(),
        })
        .collect();

    // Joining with a blank line and trimming gives the same text as appending
    // "\n\n" after every section and trimming afterwards.
    blocks.join("\n\n").trim().to_string()
}