feat(ai): 知识库 V2 菜单迁移 + 文本切片器 + 前端路由权限
- 新增迁移 000168:在 AI 知识库同级添加「知识库 V2」菜单,绑定 admin 角色
- 新增 document/chunker.rs:固定大小 + overlap 文本切片器(5 单元测试)
- 前端 routeConfig 添加 /health/ai-knowledge-v2 权限声明
- App.tsx validateRouteCoverage 补充 v2 路径
This commit is contained in:
87
crates/erp-ai/src/service/document/chunker.rs
Normal file
87
crates/erp-ai/src/service/document/chunker.rs
Normal file
@@ -0,0 +1,87 @@
|
||||
/// Splits `text` into fixed-size chunks, with consecutive chunks sharing
/// `overlap` characters.
///
/// Sizes are measured in Unicode scalar values (`char`s), not bytes, so
/// multi-byte text (e.g. Chinese) is never cut in the middle of a character.
///
/// # Arguments
///
/// * `text` - The text to split.
/// * `chunk_size` - Maximum number of characters per chunk.
/// * `overlap` - Number of characters each chunk shares with the previous one.
///   If `overlap >= chunk_size` the window could never advance, so the
///   function falls back to non-overlapping chunks instead of looping forever.
///
/// # Returns
///
/// * An empty vector when `text` is empty or `chunk_size` is `0`.
/// * A single element holding `text` unchanged (not trimmed) when the whole
///   text fits into one chunk.
/// * Otherwise one entry per window, each trimmed of surrounding whitespace;
///   windows that consist solely of whitespace are dropped.
pub fn chunk_text(text: &str, chunk_size: usize, overlap: usize) -> Vec<String> {
    // A zero-sized window can never make progress; treat it like empty input.
    if text.is_empty() || chunk_size == 0 {
        return Vec::new();
    }

    // Byte offset of every char boundary plus the end of the string, so that
    // `boundaries[i]..boundaries[j]` is the byte range of chars `i..j`.
    let boundaries: Vec<usize> = text
        .char_indices()
        .map(|(idx, _)| idx)
        .chain(std::iter::once(text.len()))
        .collect();
    let total = boundaries.len() - 1;

    if total <= chunk_size {
        return vec![text.to_string()];
    }

    // Distance between the starts of two consecutive windows. It must be at
    // least 1, otherwise the loop below would never terminate.
    let step = match chunk_size.saturating_sub(overlap) {
        0 => chunk_size,
        n => n,
    };

    let mut chunks = Vec::new();
    let mut start = 0;
    while start < total {
        let end = start.saturating_add(chunk_size).min(total);

        let piece = text[boundaries[start]..boundaries[end]].trim();
        if !piece.is_empty() {
            chunks.push(piece.to_string());
        }

        // The window reached the end of the text: this was the last chunk.
        if end >= total {
            break;
        }
        start += step;
    }

    chunks
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Empty input produces no chunks at all.
    #[test]
    fn test_chunk_empty() {
        assert_eq!(chunk_text("", 100, 20), Vec::<String>::new());
    }

    /// Text shorter than one chunk is returned as a single, unmodified chunk.
    #[test]
    fn test_chunk_small_text() {
        let text = "hello world";
        let chunks = chunk_text(text, 100, 20);
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0], "hello world");
    }

    /// Long text is split into windows of at most `chunk_size` characters.
    #[test]
    fn test_chunk_long_text() {
        let text = "abcdefghij".repeat(100); // 1000 chars
        let chunks = chunk_text(&text, 200, 50);

        // Step is 200 - 50 = 150, so windows start at 0, 150, ..., 900.
        assert_eq!(chunks.len(), 7);
        // First chunk should be 200 chars
        assert_eq!(chars_count(&chunks[0]), 200);
        assert!(chunks.iter().all(|chunk| chars_count(chunk) <= 200));

        // The final window covers chars 900..1000, i.e. the tail of the text.
        assert_eq!(chars_count(&chunks[6]), 100);
        assert!(text.ends_with(chunks[6].as_str()));
    }

    /// Consecutive chunks share exactly `overlap` characters.
    #[test]
    fn test_chunk_with_overlap() {
        let overlap = 20;
        let text = "abcdefghijklmnopqrstuvwxyz".repeat(20); // 520 chars
        let chunks = chunk_text(&text, 100, overlap);
        assert!(chunks.len() > 1);

        for pair in chunks.windows(2) {
            let prev: Vec<char> = pair[0].chars().collect();
            let next: Vec<char> = pair[1].chars().collect();
            // The tail of one chunk must reappear as the head of the next.
            assert_eq!(&prev[prev.len() - overlap..], &next[..overlap]);
        }
    }

    /// Multi-byte (Chinese) text is chunked by character, not by byte.
    #[test]
    fn test_chunk_chinese() {
        let text = "你好世界这是一段中文测试文本。".repeat(30); // 15 * 30 = 450 chars
        let chunks = chunk_text(&text, 100, 20);
        assert!(chunks.len() > 1);
        assert_eq!(chars_count(&chunks[0]), 100);

        // Make sure no Chinese character is cut: every chunk must be a
        // contiguous substring of the input and respect the size limit.
        for chunk in &chunks {
            assert!(!chunk.is_empty());
            assert!(chars_count(chunk) <= 100);
            assert!(text.contains(chunk.as_str()));
        }
    }

    /// Number of Unicode scalar values in `s`.
    fn chars_count(s: &str) -> usize {
        s.chars().count()
    }
}
|
||||
@@ -7,6 +7,7 @@ pub mod chat_session;
|
||||
pub mod comparison;
|
||||
pub mod cost;
|
||||
pub mod dialysis_risk_scorer;
|
||||
pub mod document;
|
||||
pub mod embedding;
|
||||
pub mod feature_flag_service;
|
||||
pub mod insight_service;
|
||||
|
||||
Reference in New Issue
Block a user