Files
zclaw_openfang/crates/zclaw-hands/src/hands/speech.rs
iven 13c0b18bbc
Some checks failed
CI / Lint & TypeCheck (push) Has been cancelled
CI / Unit Tests (push) Has been cancelled
CI / Build Frontend (push) Has been cancelled
CI / Rust Check (push) Has been cancelled
CI / Security Scan (push) Has been cancelled
CI / E2E Tests (push) Has been cancelled
feat: Batch 5-9 — GrowthIntegration桥接、验证补全、死代码清理、Pipeline模板、Speech/Twitter真实实现
Batch 5 (P0): GrowthIntegration 接入 Tauri
- Kernel 新增 set_viking()/set_extraction_driver() 桥接 SqliteStorage
- 中间件链共享存储,MemoryExtractor 接入 LLM 驱动

Batch 6 (P1): 输入验证 + Heartbeat
- Relay 验证补全(stream 兼容检查、API key 格式校验)
- UUID 类型校验、SessionId 错误返回
- Heartbeat 默认开启 + 首次聊天自动初始化

Batch 7 (P2): 死代码清理
- zclaw-channels 整体移除(317 行)
- multi-agent 特性门控、admin 方法标注

Batch 8 (P2): Pipeline 模板
- PipelineMetadata 新增 annotations 字段
- pipeline_templates 命令 + 2 个示例模板
- fallback driver base_url 修复(doubao/qwen/deepseek 端点)

Batch 9 (P1): SpeechHand/TwitterHand 真实实现
- SpeechHand: tts_method 字段 + Browser TTS 前端集成 (Web Speech API)
- TwitterHand: 12 个 action 全部替换为 Twitter API v2 真实 HTTP 调用
- chatStore/useAutomationEvents 双路径 TTS 触发
2026-03-30 09:24:50 +08:00

441 lines
14 KiB
Rust

//! Speech Hand - Text-to-Speech synthesis capabilities
//!
//! Provides speech synthesis for teaching:
//! - speak: Convert text to speech
//! - speak_ssml: Advanced speech with SSML markup
//! - pause/resume/stop: Playback control
//! - list_voices: Get available voices
//! - set_voice: Configure voice settings
use async_trait::async_trait;
use serde::{Deserialize, Serialize};
use serde_json::Value;
use std::sync::Arc;
use tokio::sync::RwLock;
use zclaw_types::Result;
use crate::{Hand, HandConfig, HandContext, HandResult, HandStatus};
/// TTS Provider types
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
#[serde(rename_all = "lowercase")]
pub enum TtsProvider {
#[default]
Browser,
Azure,
OpenAI,
ElevenLabs,
Local,
}
/// Speech action types
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "action", rename_all = "snake_case")]
pub enum SpeechAction {
/// Speak text
Speak {
text: String,
#[serde(default)]
voice: Option<String>,
#[serde(default = "default_rate")]
rate: f32,
#[serde(default = "default_pitch")]
pitch: f32,
#[serde(default = "default_volume")]
volume: f32,
#[serde(default)]
language: Option<String>,
},
/// Speak with SSML markup
SpeakSsml {
ssml: String,
#[serde(default)]
voice: Option<String>,
},
/// Pause playback
Pause,
/// Resume playback
Resume,
/// Stop playback
Stop,
/// List available voices
ListVoices {
#[serde(default)]
language: Option<String>,
},
/// Set default voice
SetVoice {
voice: String,
#[serde(default)]
language: Option<String>,
},
/// Set provider
SetProvider {
provider: TtsProvider,
#[serde(default)]
api_key: Option<String>,
#[serde(default)]
region: Option<String>,
},
}
fn default_rate() -> f32 { 1.0 }
fn default_pitch() -> f32 { 1.0 }
fn default_volume() -> f32 { 1.0 }
/// Voice information
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VoiceInfo {
pub id: String,
pub name: String,
pub language: String,
pub gender: String,
#[serde(default)]
pub preview_url: Option<String>,
}
/// Playback state
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub enum PlaybackState {
#[default]
Idle,
Playing,
Paused,
}
/// Speech configuration
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SpeechConfig {
pub provider: TtsProvider,
pub default_voice: Option<String>,
pub default_language: String,
pub default_rate: f32,
pub default_pitch: f32,
pub default_volume: f32,
}
impl Default for SpeechConfig {
fn default() -> Self {
Self {
provider: TtsProvider::Browser,
default_voice: None,
default_language: "zh-CN".to_string(),
default_rate: 1.0,
default_pitch: 1.0,
default_volume: 1.0,
}
}
}
/// Speech state
#[derive(Debug, Clone, Default)]
pub struct SpeechState {
pub config: SpeechConfig,
pub playback: PlaybackState,
pub current_text: Option<String>,
pub position_ms: u64,
pub available_voices: Vec<VoiceInfo>,
}
/// Speech Hand implementation
pub struct SpeechHand {
config: HandConfig,
state: Arc<RwLock<SpeechState>>,
}
impl SpeechHand {
/// Create a new speech hand
pub fn new() -> Self {
Self {
config: HandConfig {
id: "speech".to_string(),
name: "语音合成".to_string(),
description: "文本转语音合成输出".to_string(),
needs_approval: false,
dependencies: vec![],
input_schema: Some(serde_json::json!({
"type": "object",
"properties": {
"action": { "type": "string" },
"text": { "type": "string" },
"voice": { "type": "string" },
"rate": { "type": "number" },
}
})),
tags: vec!["audio".to_string(), "tts".to_string(), "education".to_string(), "demo".to_string()],
enabled: true,
},
state: Arc::new(RwLock::new(SpeechState {
config: SpeechConfig::default(),
playback: PlaybackState::Idle,
available_voices: Self::get_default_voices(),
..Default::default()
})),
}
}
/// Create with custom provider
pub fn with_provider(provider: TtsProvider) -> Self {
let hand = Self::new();
let mut state = hand.state.blocking_write();
state.config.provider = provider;
drop(state);
hand
}
/// Get default voices
fn get_default_voices() -> Vec<VoiceInfo> {
vec![
VoiceInfo {
id: "zh-CN-XiaoxiaoNeural".to_string(),
name: "Xiaoxiao".to_string(),
language: "zh-CN".to_string(),
gender: "female".to_string(),
preview_url: None,
},
VoiceInfo {
id: "zh-CN-YunxiNeural".to_string(),
name: "Yunxi".to_string(),
language: "zh-CN".to_string(),
gender: "male".to_string(),
preview_url: None,
},
VoiceInfo {
id: "en-US-JennyNeural".to_string(),
name: "Jenny".to_string(),
language: "en-US".to_string(),
gender: "female".to_string(),
preview_url: None,
},
VoiceInfo {
id: "en-US-GuyNeural".to_string(),
name: "Guy".to_string(),
language: "en-US".to_string(),
gender: "male".to_string(),
preview_url: None,
},
]
}
/// Execute a speech action
pub async fn execute_action(&self, action: SpeechAction) -> Result<HandResult> {
let mut state = self.state.write().await;
match action {
SpeechAction::Speak { text, voice, rate, pitch, volume, language } => {
let voice_id = voice.or(state.config.default_voice.clone())
.unwrap_or_else(|| "default".to_string());
let lang = language.unwrap_or_else(|| state.config.default_language.clone());
let actual_rate = if rate == 1.0 { state.config.default_rate } else { rate };
let actual_pitch = if pitch == 1.0 { state.config.default_pitch } else { pitch };
let actual_volume = if volume == 1.0 { state.config.default_volume } else { volume };
state.playback = PlaybackState::Playing;
state.current_text = Some(text.clone());
// Determine TTS method based on provider:
// - Browser: frontend uses Web Speech API (zero deps, works offline)
// - OpenAI: frontend calls speech_tts command (high-quality, needs API key)
// - Others: future support
let tts_method = match state.config.provider {
TtsProvider::Browser => "browser",
TtsProvider::OpenAI => "openai_api",
TtsProvider::Azure => "azure_api",
TtsProvider::ElevenLabs => "elevenlabs_api",
TtsProvider::Local => "local_engine",
};
let estimated_duration_ms = (text.chars().count() as f64 / 5.0 * 1000.0) as u64;
Ok(HandResult::success(serde_json::json!({
"status": "speaking",
"tts_method": tts_method,
"text": text,
"voice": voice_id,
"language": lang,
"rate": actual_rate,
"pitch": actual_pitch,
"volume": actual_volume,
"provider": format!("{:?}", state.config.provider).to_lowercase(),
"duration_ms": estimated_duration_ms,
"instruction": "Frontend should play this via TTS engine"
})))
}
SpeechAction::SpeakSsml { ssml, voice } => {
let voice_id = voice.or(state.config.default_voice.clone())
.unwrap_or_else(|| "default".to_string());
state.playback = PlaybackState::Playing;
state.current_text = Some(ssml.clone());
Ok(HandResult::success(serde_json::json!({
"status": "speaking_ssml",
"ssml": ssml,
"voice": voice_id,
"provider": state.config.provider,
})))
}
SpeechAction::Pause => {
state.playback = PlaybackState::Paused;
Ok(HandResult::success(serde_json::json!({
"status": "paused",
"position_ms": state.position_ms,
})))
}
SpeechAction::Resume => {
state.playback = PlaybackState::Playing;
Ok(HandResult::success(serde_json::json!({
"status": "resumed",
"position_ms": state.position_ms,
})))
}
SpeechAction::Stop => {
state.playback = PlaybackState::Idle;
state.current_text = None;
state.position_ms = 0;
Ok(HandResult::success(serde_json::json!({
"status": "stopped",
})))
}
SpeechAction::ListVoices { language } => {
let voices: Vec<_> = state.available_voices.iter()
.filter(|v| {
language.as_ref()
.map(|l| v.language.starts_with(l))
.unwrap_or(true)
})
.cloned()
.collect();
Ok(HandResult::success(serde_json::json!({
"voices": voices,
"count": voices.len(),
})))
}
SpeechAction::SetVoice { voice, language } => {
state.config.default_voice = Some(voice.clone());
if let Some(lang) = language {
state.config.default_language = lang;
}
Ok(HandResult::success(serde_json::json!({
"status": "voice_set",
"voice": voice,
"language": state.config.default_language,
})))
}
SpeechAction::SetProvider { provider, api_key, region: _ } => {
state.config.provider = provider.clone();
// In real implementation, would configure provider
Ok(HandResult::success(serde_json::json!({
"status": "provider_set",
"provider": provider,
"configured": api_key.is_some(),
})))
}
}
}
/// Get current state
pub async fn get_state(&self) -> SpeechState {
self.state.read().await.clone()
}
}
impl Default for SpeechHand {
fn default() -> Self {
Self::new()
}
}
#[async_trait]
impl Hand for SpeechHand {
fn config(&self) -> &HandConfig {
&self.config
}
async fn execute(&self, _context: &HandContext, input: Value) -> Result<HandResult> {
let action: SpeechAction = match serde_json::from_value(input) {
Ok(a) => a,
Err(e) => {
return Ok(HandResult::error(format!("Invalid speech action: {}", e)));
}
};
self.execute_action(action).await
}
fn status(&self) -> HandStatus {
HandStatus::Idle
}
}
#[cfg(test)]
mod tests {
use super::*;
#[tokio::test]
async fn test_speech_creation() {
let hand = SpeechHand::new();
assert_eq!(hand.config().id, "speech");
}
#[tokio::test]
async fn test_speak() {
let hand = SpeechHand::new();
let action = SpeechAction::Speak {
text: "Hello, world!".to_string(),
voice: None,
rate: 1.0,
pitch: 1.0,
volume: 1.0,
language: None,
};
let result = hand.execute_action(action).await.unwrap();
assert!(result.success);
}
#[tokio::test]
async fn test_pause_resume() {
let hand = SpeechHand::new();
// Speak first
hand.execute_action(SpeechAction::Speak {
text: "Test".to_string(),
voice: None, rate: 1.0, pitch: 1.0, volume: 1.0, language: None,
}).await.unwrap();
// Pause
let result = hand.execute_action(SpeechAction::Pause).await.unwrap();
assert!(result.success);
// Resume
let result = hand.execute_action(SpeechAction::Resume).await.unwrap();
assert!(result.success);
}
#[tokio::test]
async fn test_list_voices() {
let hand = SpeechHand::new();
let action = SpeechAction::ListVoices { language: Some("zh".to_string()) };
let result = hand.execute_action(action).await.unwrap();
assert!(result.success);
}
#[tokio::test]
async fn test_set_voice() {
let hand = SpeechHand::new();
let action = SpeechAction::SetVoice {
voice: "zh-CN-XiaoxiaoNeural".to_string(),
language: Some("zh-CN".to_string()),
};
let result = hand.execute_action(action).await.unwrap();
assert!(result.success);
let state = hand.get_state().await;
assert_eq!(state.config.default_voice, Some("zh-CN-XiaoxiaoNeural".to_string()));
}
}