Some checks failed
CI / Lint & TypeCheck (push) Has been cancelled
CI / Unit Tests (push) Has been cancelled
CI / Build Frontend (push) Has been cancelled
CI / Rust Check (push) Has been cancelled
CI / Security Scan (push) Has been cancelled
CI / E2E Tests (push) Has been cancelled
重构所有代码和文档中的项目名称,将OpenFang统一更新为ZCLAW。包括: - 配置文件中的项目名称 - 代码注释和文档引用 - 环境变量和路径 - 类型定义和接口名称 - 测试用例和模拟数据 同时优化部分代码结构,移除未使用的模块,并更新相关依赖项。
136 lines
2.5 KiB
TOML
136 lines
2.5 KiB
TOML
# Collector Hand - data collection and aggregation capability pack
#
# ZCLAW Hand configuration
# This Hand provides automated data collection, web scraping, and aggregation capabilities

# Identity and execution defaults for the "collector" Hand.
[hand]
name = "collector"
version = "1.0.0"
description = "数据收集和聚合能力包 - 自动抓取、解析和结构化数据"
author = "ZCLAW Team"

# Hand type
type = "data"

# Whether manual approval is required before execution
requires_approval = false

# Default timeout (seconds)
timeout = 300

# Maximum number of concurrent executions
max_concurrent = 5

# Capability tags
tags = ["data", "scraping", "collection", "aggregation", "web"]

# Collector-specific settings: HTTP requests, pagination, output, and crawl etiquette.
[hand.config]
# Request settings
# NOTE(review): units of request_timeout / retry_delay are not stated here — presumably seconds; confirm.
user_agent = "ZCLAW-Collector/1.0"
request_timeout = 30
retry_count = 3
retry_delay = 5

# Pagination handling
max_pages = 100
pagination_delay = 1 # seconds

# Output settings
default_output_format = "json" # json, csv, xlsx
output_dir = "/tmp/zclaw/collector"

# Anti-crawler settings
respect_robots_txt = true
rate_limit_per_second = 2

# Which trigger kinds are enabled, followed by the event triggers themselves.
[hand.triggers]
# Trigger configuration
manual = true
schedule = true
webhook = true

# Event triggers
[[hand.triggers.events]]
type = "schedule.cron"
pattern = "0 */6 * * *" # every 6 hours
priority = 5

[[hand.triggers.events]]
type = "chat.intent"
# Alternation of Chinese and English keywords (collect / scrape / crawl).
pattern = "收集|抓取|爬取|采集|scrape|collect|crawl"
priority = 5

# Permissions and roles declared for this Hand.
[hand.permissions]
# Required permissions
requires = [
    "web.fetch",
    "file.read",
    "file.write",
]

# Required RBAC roles
roles = ["operator.read", "operator.write"]

# Rate limiting: at most max_requests per window_seconds.
[hand.rate_limit]
max_requests = 50
window_seconds = 3600 # 1 hour

# Audit settings
[hand.audit]
log_inputs = true
log_outputs = true
retention_days = 30

# Parameter definitions: one [[hand.parameters]] entry per input.
[[hand.parameters]]
name = "targetUrl"
label = "目标 URL"
type = "text"
required = true
description = "要抓取的网页 URL"

[[hand.parameters]]
name = "selector"
label = "CSS 选择器"
type = "text"
required = false
description = "要提取的元素 CSS 选择器"

[[hand.parameters]]
name = "outputFormat"
label = "输出格式"
type = "select"
required = false
default = "json"
options = ["json", "csv", "xlsx"]

[[hand.parameters]]
name = "pagination"
label = "跟踪分页"
type = "boolean"
required = false
default = false
description = "是否自动跟踪分页链接"

# Workflow steps, listed in array order: fetch, parse, transform, export.
[[hand.workflow]]
id = "fetch"
name = "获取页面"
description = "下载目标网页内容"

[[hand.workflow]]
id = "parse"
name = "解析内容"
description = "使用选择器提取目标数据"

[[hand.workflow]]
id = "transform"
name = "转换数据"
description = "清理和结构化提取的数据"

[[hand.workflow]]
id = "export"
name = "导出结果"
description = "保存为指定格式的文件"