//! Speech Hand - Text-to-Speech synthesis capabilities //! //! Provides speech synthesis for teaching: //! - speak: Convert text to speech //! - speak_ssml: Advanced speech with SSML markup //! - pause/resume/stop: Playback control //! - list_voices: Get available voices //! - set_voice: Configure voice settings use async_trait::async_trait; use serde::{Deserialize, Serialize}; use serde_json::Value; use std::sync::Arc; use tokio::sync::RwLock; use zclaw_types::Result; use crate::{Hand, HandConfig, HandContext, HandResult, HandStatus}; /// TTS Provider types #[derive(Debug, Clone, Serialize, Deserialize, Default)] #[serde(rename_all = "lowercase")] pub enum TtsProvider { #[default] Browser, Azure, OpenAI, ElevenLabs, Local, } /// Speech action types #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(tag = "action", rename_all = "snake_case")] pub enum SpeechAction { /// Speak text Speak { text: String, #[serde(default)] voice: Option, #[serde(default = "default_rate")] rate: f32, #[serde(default = "default_pitch")] pitch: f32, #[serde(default = "default_volume")] volume: f32, #[serde(default)] language: Option, }, /// Speak with SSML markup SpeakSsml { ssml: String, #[serde(default)] voice: Option, }, /// Pause playback Pause, /// Resume playback Resume, /// Stop playback Stop, /// List available voices ListVoices { #[serde(default)] language: Option, }, /// Set default voice SetVoice { voice: String, #[serde(default)] language: Option, }, /// Set provider SetProvider { provider: TtsProvider, #[serde(default)] api_key: Option, #[serde(default)] region: Option, }, } fn default_rate() -> f32 { 1.0 } fn default_pitch() -> f32 { 1.0 } fn default_volume() -> f32 { 1.0 } /// Voice information #[derive(Debug, Clone, Serialize, Deserialize)] pub struct VoiceInfo { pub id: String, pub name: String, pub language: String, pub gender: String, #[serde(default)] pub preview_url: Option, } /// Playback state #[derive(Debug, Clone, Serialize, Deserialize, Default)] pub enum PlaybackState { #[default] Idle, Playing, Paused, } /// Speech configuration #[derive(Debug, Clone, Serialize, Deserialize)] pub struct SpeechConfig { pub provider: TtsProvider, pub default_voice: Option, pub default_language: String, pub default_rate: f32, pub default_pitch: f32, pub default_volume: f32, } impl Default for SpeechConfig { fn default() -> Self { Self { provider: TtsProvider::Browser, default_voice: None, default_language: "zh-CN".to_string(), default_rate: 1.0, default_pitch: 1.0, default_volume: 1.0, } } } /// Speech state #[derive(Debug, Clone, Default)] pub struct SpeechState { pub config: SpeechConfig, pub playback: PlaybackState, pub current_text: Option, pub position_ms: u64, pub available_voices: Vec, } /// Speech Hand implementation pub struct SpeechHand { config: HandConfig, state: Arc>, } impl SpeechHand { /// Create a new speech hand pub fn new() -> Self { Self { config: HandConfig { id: "speech".to_string(), name: "语音合成".to_string(), description: "文本转语音合成输出".to_string(), needs_approval: false, dependencies: vec![], input_schema: Some(serde_json::json!({ "type": "object", "properties": { "action": { "type": "string" }, "text": { "type": "string" }, "voice": { "type": "string" }, "rate": { "type": "number" }, } })), tags: vec!["audio".to_string(), "tts".to_string(), "education".to_string(), "demo".to_string()], enabled: true, max_concurrent: 0, timeout_secs: 0, }, state: Arc::new(RwLock::new(SpeechState { config: SpeechConfig::default(), playback: PlaybackState::Idle, available_voices: Self::get_default_voices(), ..Default::default() })), } } /// Create with custom provider pub fn with_provider(provider: TtsProvider) -> Self { let hand = Self::new(); let mut state = hand.state.blocking_write(); state.config.provider = provider; drop(state); hand } /// Get default voices fn get_default_voices() -> Vec { vec![ VoiceInfo { id: "zh-CN-XiaoxiaoNeural".to_string(), name: "Xiaoxiao".to_string(), language: "zh-CN".to_string(), gender: "female".to_string(), preview_url: None, }, VoiceInfo { id: "zh-CN-YunxiNeural".to_string(), name: "Yunxi".to_string(), language: "zh-CN".to_string(), gender: "male".to_string(), preview_url: None, }, VoiceInfo { id: "en-US-JennyNeural".to_string(), name: "Jenny".to_string(), language: "en-US".to_string(), gender: "female".to_string(), preview_url: None, }, VoiceInfo { id: "en-US-GuyNeural".to_string(), name: "Guy".to_string(), language: "en-US".to_string(), gender: "male".to_string(), preview_url: None, }, ] } /// Execute a speech action pub async fn execute_action(&self, action: SpeechAction) -> Result { let mut state = self.state.write().await; match action { SpeechAction::Speak { text, voice, rate, pitch, volume, language } => { let voice_id = voice.or(state.config.default_voice.clone()) .unwrap_or_else(|| "default".to_string()); let lang = language.unwrap_or_else(|| state.config.default_language.clone()); let actual_rate = if rate == 1.0 { state.config.default_rate } else { rate }; let actual_pitch = if pitch == 1.0 { state.config.default_pitch } else { pitch }; let actual_volume = if volume == 1.0 { state.config.default_volume } else { volume }; state.playback = PlaybackState::Playing; state.current_text = Some(text.clone()); // Determine TTS method based on provider: // - Browser: frontend uses Web Speech API (zero deps, works offline) // - OpenAI: frontend calls speech_tts command (high-quality, needs API key) // - Others: future support let tts_method = match state.config.provider { TtsProvider::Browser => "browser", TtsProvider::OpenAI => "openai_api", TtsProvider::Azure => "azure_api", TtsProvider::ElevenLabs => "elevenlabs_api", TtsProvider::Local => "local_engine", }; let estimated_duration_ms = (text.chars().count() as f64 / 5.0 * 1000.0) as u64; Ok(HandResult::success(serde_json::json!({ "status": "speaking", "tts_method": tts_method, "text": text, "voice": voice_id, "language": lang, "rate": actual_rate, "pitch": actual_pitch, "volume": actual_volume, "provider": format!("{:?}", state.config.provider).to_lowercase(), "duration_ms": estimated_duration_ms, "instruction": "Frontend should play this via TTS engine" }))) } SpeechAction::SpeakSsml { ssml, voice } => { let voice_id = voice.or(state.config.default_voice.clone()) .unwrap_or_else(|| "default".to_string()); state.playback = PlaybackState::Playing; state.current_text = Some(ssml.clone()); Ok(HandResult::success(serde_json::json!({ "status": "speaking_ssml", "ssml": ssml, "voice": voice_id, "provider": state.config.provider, }))) } SpeechAction::Pause => { state.playback = PlaybackState::Paused; Ok(HandResult::success(serde_json::json!({ "status": "paused", "position_ms": state.position_ms, }))) } SpeechAction::Resume => { state.playback = PlaybackState::Playing; Ok(HandResult::success(serde_json::json!({ "status": "resumed", "position_ms": state.position_ms, }))) } SpeechAction::Stop => { state.playback = PlaybackState::Idle; state.current_text = None; state.position_ms = 0; Ok(HandResult::success(serde_json::json!({ "status": "stopped", }))) } SpeechAction::ListVoices { language } => { let voices: Vec<_> = state.available_voices.iter() .filter(|v| { language.as_ref() .map(|l| v.language.starts_with(l)) .unwrap_or(true) }) .cloned() .collect(); Ok(HandResult::success(serde_json::json!({ "voices": voices, "count": voices.len(), }))) } SpeechAction::SetVoice { voice, language } => { state.config.default_voice = Some(voice.clone()); if let Some(lang) = language { state.config.default_language = lang; } Ok(HandResult::success(serde_json::json!({ "status": "voice_set", "voice": voice, "language": state.config.default_language, }))) } SpeechAction::SetProvider { provider, api_key, region: _ } => { state.config.provider = provider.clone(); // In real implementation, would configure provider Ok(HandResult::success(serde_json::json!({ "status": "provider_set", "provider": provider, "configured": api_key.is_some(), }))) } } } /// Get current state pub async fn get_state(&self) -> SpeechState { self.state.read().await.clone() } } impl Default for SpeechHand { fn default() -> Self { Self::new() } } #[async_trait] impl Hand for SpeechHand { fn config(&self) -> &HandConfig { &self.config } async fn execute(&self, _context: &HandContext, input: Value) -> Result { let action: SpeechAction = match serde_json::from_value(input) { Ok(a) => a, Err(e) => { return Ok(HandResult::error(format!("Invalid speech action: {}", e))); } }; self.execute_action(action).await } fn status(&self) -> HandStatus { HandStatus::Idle } } #[cfg(test)] mod tests { use super::*; #[tokio::test] async fn test_speech_creation() { let hand = SpeechHand::new(); assert_eq!(hand.config().id, "speech"); } #[tokio::test] async fn test_speak() { let hand = SpeechHand::new(); let action = SpeechAction::Speak { text: "Hello, world!".to_string(), voice: None, rate: 1.0, pitch: 1.0, volume: 1.0, language: None, }; let result = hand.execute_action(action).await.unwrap(); assert!(result.success); } #[tokio::test] async fn test_pause_resume() { let hand = SpeechHand::new(); // Speak first hand.execute_action(SpeechAction::Speak { text: "Test".to_string(), voice: None, rate: 1.0, pitch: 1.0, volume: 1.0, language: None, }).await.unwrap(); // Pause let result = hand.execute_action(SpeechAction::Pause).await.unwrap(); assert!(result.success); // Resume let result = hand.execute_action(SpeechAction::Resume).await.unwrap(); assert!(result.success); } #[tokio::test] async fn test_list_voices() { let hand = SpeechHand::new(); let action = SpeechAction::ListVoices { language: Some("zh".to_string()) }; let result = hand.execute_action(action).await.unwrap(); assert!(result.success); } #[tokio::test] async fn test_set_voice() { let hand = SpeechHand::new(); let action = SpeechAction::SetVoice { voice: "zh-CN-XiaoxiaoNeural".to_string(), language: Some("zh-CN".to_string()), }; let result = hand.execute_action(action).await.unwrap(); assert!(result.success); let state = hand.get_state().await; assert_eq!(state.config.default_voice, Some("zh-CN-XiaoxiaoNeural".to_string())); } }