zclaw_openfang/crates/zclaw-hands/src/hands/speech.rs

//! Speech Hand - Text-to-Speech synthesis capabilities
//!
//! Provides speech synthesis for teaching:
//! - speak: Convert text to speech
//! - speak_ssml: Advanced speech with SSML markup
//! - pause/resume/stop: Playback control
//! - list_voices: Get available voices
//! - set_voice: Configure voice settings

use async_trait::async_trait;
use serde::{Deserialize, Serialize};
use serde_json::Value;
use std::sync::Arc;
use tokio::sync::RwLock;
use zclaw_types::Result;

use crate::{Hand, HandConfig, HandContext, HandResult, HandStatus};

/// TTS provider types
///
/// Selects which speech backend a request is routed to. Serialized as a
/// lowercase string (`"browser"`, `"azure"`, `"openai"`, `"elevenlabs"`,
/// `"local"`) because of `rename_all = "lowercase"`.
///
/// Derives `PartialEq`/`Eq` so providers can be compared with `==` and
/// `assert_eq!`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default)]
#[serde(rename_all = "lowercase")]
pub enum TtsProvider {
    /// Browser-side synthesis; this is the default provider.
    #[default]
    Browser,
    /// Azure speech service.
    Azure,
    /// OpenAI speech API.
    OpenAI,
    /// ElevenLabs speech API.
    ElevenLabs,
    /// Locally hosted engine.
    Local,
}

/// Speech action types
///
/// Internally tagged by the `action` field, with variant names in
/// snake_case: `speak`, `speak_ssml`, `pause`, `resume`, `stop`,
/// `list_voices`, `set_voice`, `set_provider`. The remaining fields of the
/// JSON object are the fields of the selected variant.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "action", rename_all = "snake_case")]
pub enum SpeechAction {
    /// Speak text
    Speak {
        /// Plain text to speak (required).
        text: String,
        /// Voice id for this request; `None` when omitted.
        #[serde(default)]
        voice: Option<String>,
        /// Speaking rate; 1.0 when omitted.
        #[serde(default = "default_rate")]
        rate: f32,
        /// Pitch; 1.0 when omitted.
        #[serde(default = "default_pitch")]
        pitch: f32,
        /// Volume; 1.0 when omitted.
        #[serde(default = "default_volume")]
        volume: f32,
        /// Language tag for this request; `None` when omitted.
        #[serde(default)]
        language: Option<String>,
    },
    /// Speak with SSML markup
    SpeakSsml {
        /// SSML document to speak (required).
        ssml: String,
        /// Voice id for this request; `None` when omitted.
        #[serde(default)]
        voice: Option<String>,
    },
    /// Pause playback
    Pause,
    /// Resume playback
    Resume,
    /// Stop playback
    Stop,
    /// List available voices
    ListVoices {
        /// Optional language prefix used to filter the voice list.
        #[serde(default)]
        language: Option<String>,
    },
    /// Set default voice
    SetVoice {
        /// Voice id to store as the new default (required).
        voice: String,
        /// When present, also replaces the default language.
        #[serde(default)]
        language: Option<String>,
    },
    /// Set provider
    SetProvider {
        /// Provider to switch to (required).
        provider: TtsProvider,
        /// Optional API key for the provider.
        #[serde(default)]
        api_key: Option<String>,
        /// Optional provider region.
        #[serde(default)]
        region: Option<String>,
    },
}

/// Serde fallback used when a `speak` request omits `rate`: 1.0.
fn default_rate() -> f32 {
    1.0_f32
}

/// Serde fallback used when a `speak` request omits `pitch`: 1.0.
fn default_pitch() -> f32 {
    1.0_f32
}

/// Serde fallback used when a `speak` request omits `volume`: 1.0.
fn default_volume() -> f32 {
    1.0_f32
}

/// Voice information
///
/// Describes one selectable voice as returned by the `list_voices` action.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VoiceInfo {
    /// Voice identifier, e.g. `zh-CN-XiaoxiaoNeural` in the built-in list.
    pub id: String,
    /// Short display name, e.g. `Xiaoxiao`.
    pub name: String,
    /// Language tag of the voice, e.g. `zh-CN`; `list_voices` filters on a
    /// prefix of this value.
    pub language: String,
    /// Gender label; the built-in voices use `"female"` and `"male"`.
    pub gender: String,
    /// Optional URL of an audio preview; `None` when omitted on input.
    #[serde(default)]
    pub preview_url: Option<String>,
}

/// Playback state
///
/// Tracks what the hand last asked the player to do. There is no serde
/// rename attribute, so the variants serialize under their Rust names
/// (`"Idle"`, `"Playing"`, `"Paused"`).
///
/// Derives `PartialEq`/`Eq` so states can be compared with `==` and
/// `assert_eq!`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default)]
pub enum PlaybackState {
    /// Nothing is being spoken; this is the initial state.
    #[default]
    Idle,
    /// A speak request has been issued or playback was resumed.
    Playing,
    /// Playback was paused.
    Paused,
}

/// Speech configuration
///
/// Defaults applied to speech requests that do not carry their own values.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SpeechConfig {
    /// Provider currently selected for synthesis.
    pub provider: TtsProvider,
    /// Voice used when a request names none; `None` means no preference.
    pub default_voice: Option<String>,
    /// Language tag used when a `speak` request names none.
    pub default_language: String,
    /// Rate substituted when a `speak` request carries a rate of exactly 1.0.
    pub default_rate: f32,
    /// Pitch substituted when a `speak` request carries a pitch of exactly 1.0.
    pub default_pitch: f32,
    /// Volume substituted when a `speak` request carries a volume of exactly 1.0.
    pub default_volume: f32,
}

impl Default for SpeechConfig {
    fn default() -> Self {
        Self {
            provider: TtsProvider::Browser,
            default_voice: None,
            default_language: "zh-CN".to_string(),
            default_rate: 1.0,
            default_pitch: 1.0,
            default_volume: 1.0,
        }
    }
}

/// Speech state
///
/// Mutable runtime state of the speech hand; a snapshot is returned by
/// `SpeechHand::get_state`.
#[derive(Debug, Clone, Default)]
pub struct SpeechState {
    /// Active configuration (provider and request defaults).
    pub config: SpeechConfig,
    /// Last playback state recorded by an action.
    pub playback: PlaybackState,
    /// Text or SSML of the most recent speak request; cleared by `stop`.
    pub current_text: Option<String>,
    /// Playback position in milliseconds. In this file it is only ever reset
    /// to 0 (by `stop`) and echoed back by `pause`/`resume`.
    pub position_ms: u64,
    /// Voices offered by the `list_voices` action.
    pub available_voices: Vec<VoiceInfo>,
}

/// Speech Hand implementation
///
/// Pairs the static hand descriptor with the shared, lock-protected runtime
/// state that the speech actions read and update.
pub struct SpeechHand {
    /// Static descriptor (id, name, schema, tags) returned by `Hand::config`.
    config: HandConfig,
    /// Runtime state behind an async read/write lock.
    state: Arc<RwLock<SpeechState>>,
}

impl SpeechHand {
    /// Create a new speech hand
    ///
    /// The hand starts idle with the default [`SpeechConfig`] and the
    /// built-in voice list. Its input schema describes every field accepted
    /// by [`SpeechAction`]: `action` is required and enumerates the action
    /// tags, and `provider` enumerates the serialized [`TtsProvider`] names.
    pub fn new() -> Self {
        // Keep this schema in sync with the fields of `SpeechAction`.
        let input_schema = serde_json::json!({
            "type": "object",
            "required": ["action"],
            "properties": {
                "action": {
                    "type": "string",
                    "enum": [
                        "speak",
                        "speak_ssml",
                        "pause",
                        "resume",
                        "stop",
                        "list_voices",
                        "set_voice",
                        "set_provider"
                    ]
                },
                "text": { "type": "string" },
                "ssml": { "type": "string" },
                "voice": { "type": "string" },
                "rate": { "type": "number" },
                "pitch": { "type": "number" },
                "volume": { "type": "number" },
                "language": { "type": "string" },
                "provider": {
                    "type": "string",
                    "enum": ["browser", "azure", "openai", "elevenlabs", "local"]
                },
                "api_key": { "type": "string" },
                "region": { "type": "string" }
            }
        });

        // `SpeechState::default()` already yields the default config and the
        // idle playback state; only the voice list needs seeding.
        let initial_state = SpeechState {
            available_voices: Self::get_default_voices(),
            ..Default::default()
        };

        Self {
            config: HandConfig {
                id: "speech".to_string(),
                name: "语音合成".to_string(),
                description: "文本转语音合成输出".to_string(),
                needs_approval: false,
                dependencies: vec![],
                input_schema: Some(input_schema),
                tags: vec![
                    "audio".to_string(),
                    "tts".to_string(),
                    "education".to_string(),
                    "demo".to_string(),
                ],
                enabled: true,
            },
            state: Arc::new(RwLock::new(initial_state)),
        }
    }

    /// Create with custom provider
    ///
    /// Behaves like [`SpeechHand::new`] but starts with `provider` selected.
    /// Safe to call from both synchronous and asynchronous code: the state
    /// lock is taken with the non-blocking `try_write`, because
    /// `blocking_write` panics when invoked inside an async runtime.
    pub fn with_provider(provider: TtsProvider) -> Self {
        let hand = Self::new();
        {
            // The hand was created on the previous line and has not been
            // shared, so nothing else can be holding the lock.
            let mut state = hand
                .state
                .try_write()
                .expect("state lock of a freshly created SpeechHand is uncontended");
            state.config.provider = provider;
        }
        hand
    }

    /// Get default voices
    ///
    /// Builds the built-in voice list: two `zh-CN` and two `en-US` voices,
    /// one female and one male each, none with a preview URL.
    fn get_default_voices() -> Vec<VoiceInfo> {
        // Columns: (id, display name, language tag, gender).
        const BUILTIN_VOICES: [(&str, &str, &str, &str); 4] = [
            ("zh-CN-XiaoxiaoNeural", "Xiaoxiao", "zh-CN", "female"),
            ("zh-CN-YunxiNeural", "Yunxi", "zh-CN", "male"),
            ("en-US-JennyNeural", "Jenny", "en-US", "female"),
            ("en-US-GuyNeural", "Guy", "en-US", "male"),
        ];

        BUILTIN_VOICES
            .iter()
            .map(|&(id, name, language, gender)| VoiceInfo {
                id: id.to_owned(),
                name: name.to_owned(),
                language: language.to_owned(),
                gender: gender.to_owned(),
                preview_url: None,
            })
            .collect()
    }

    /// Execute a speech action
    ///
    /// Applies `action` to the shared [`SpeechState`] and returns a JSON
    /// payload describing the outcome. No audio is produced here: the payload
    /// tells the frontend what to play or which setting changed.
    ///
    /// Every action currently yields `Ok` with a successful [`HandResult`].
    ///
    /// Behavior worth knowing:
    /// - `Speak`: a `rate`, `pitch` or `volume` of exactly 1.0 is treated as
    ///   "not specified" and replaced by the configured default.
    /// - `Speak`: `duration_ms` is an estimate of roughly five characters per
    ///   second, scaled by the effective rate.
    /// - `ListVoices`: the language filter is an ASCII case-insensitive prefix
    ///   match, so `"zh"`, `"ZH"` and `"zh-cn"` all match `zh-CN` voices.
    pub async fn execute_action(&self, action: SpeechAction) -> Result<HandResult> {
        let mut state = self.state.write().await;

        match action {
            SpeechAction::Speak { text, voice, rate, pitch, volume, language } => {
                // Only clone the configured default when the request names no voice.
                let voice_id = voice
                    .or_else(|| state.config.default_voice.clone())
                    .unwrap_or_else(|| "default".to_string());
                let lang = language.unwrap_or_else(|| state.config.default_language.clone());
                // 1.0 is what serde fills in for an omitted field, so it doubles
                // as the "use the configured default" marker.
                let actual_rate = if rate == 1.0 { state.config.default_rate } else { rate };
                let actual_pitch = if pitch == 1.0 { state.config.default_pitch } else { pitch };
                let actual_volume = if volume == 1.0 { state.config.default_volume } else { volume };

                state.playback = PlaybackState::Playing;
                state.current_text = Some(text.clone());

                // Determine TTS method based on provider:
                // - Browser: frontend uses Web Speech API (zero deps, works offline)
                // - OpenAI: frontend calls speech_tts command (high-quality, needs API key)
                // - Others: future support
                let tts_method = match state.config.provider {
                    TtsProvider::Browser => "browser",
                    TtsProvider::OpenAI => "openai_api",
                    TtsProvider::Azure => "azure_api",
                    TtsProvider::ElevenLabs => "elevenlabs_api",
                    TtsProvider::Local => "local_engine",
                };

                // Baseline of ~5 characters per second at rate 1.0. Faster or
                // slower speech scales the estimate; a non-positive or
                // non-finite rate falls back to the unscaled baseline.
                let base_ms = text.chars().count() as f64 / 5.0 * 1000.0;
                let rate_factor = f64::from(actual_rate);
                let estimated_duration_ms = if rate_factor.is_finite() && rate_factor > 0.0 {
                    (base_ms / rate_factor) as u64
                } else {
                    base_ms as u64
                };

                Ok(HandResult::success(serde_json::json!({
                    "status": "speaking",
                    "tts_method": tts_method,
                    "text": text,
                    "voice": voice_id,
                    "language": lang,
                    "rate": actual_rate,
                    "pitch": actual_pitch,
                    "volume": actual_volume,
                    // Serialized through serde (lowercase name), the same way
                    // the SSML and set_provider responses report it.
                    "provider": state.config.provider,
                    "duration_ms": estimated_duration_ms,
                    "instruction": "Frontend should play this via TTS engine"
                })))
            }
            SpeechAction::SpeakSsml { ssml, voice } => {
                let voice_id = voice
                    .or_else(|| state.config.default_voice.clone())
                    .unwrap_or_else(|| "default".to_string());

                state.playback = PlaybackState::Playing;
                state.current_text = Some(ssml.clone());

                Ok(HandResult::success(serde_json::json!({
                    "status": "speaking_ssml",
                    "ssml": ssml,
                    "voice": voice_id,
                    "provider": state.config.provider,
                })))
            }
            SpeechAction::Pause => {
                state.playback = PlaybackState::Paused;
                Ok(HandResult::success(serde_json::json!({
                    "status": "paused",
                    "position_ms": state.position_ms,
                })))
            }
            SpeechAction::Resume => {
                state.playback = PlaybackState::Playing;
                Ok(HandResult::success(serde_json::json!({
                    "status": "resumed",
                    "position_ms": state.position_ms,
                })))
            }
            SpeechAction::Stop => {
                state.playback = PlaybackState::Idle;
                state.current_text = None;
                state.position_ms = 0;
                Ok(HandResult::success(serde_json::json!({
                    "status": "stopped",
                })))
            }
            SpeechAction::ListVoices { language } => {
                // Language tags are compared case-insensitively; lower-case the
                // requested prefix once instead of once per voice.
                let wanted = language.map(|tag| tag.to_ascii_lowercase());
                let voices: Vec<_> = state
                    .available_voices
                    .iter()
                    .filter(|v| match &wanted {
                        Some(prefix) => v
                            .language
                            .to_ascii_lowercase()
                            .starts_with(prefix.as_str()),
                        None => true,
                    })
                    .cloned()
                    .collect();

                Ok(HandResult::success(serde_json::json!({
                    "voices": voices,
                    "count": voices.len(),
                })))
            }
            SpeechAction::SetVoice { voice, language } => {
                state.config.default_voice = Some(voice.clone());
                if let Some(lang) = language {
                    state.config.default_language = lang;
                }
                Ok(HandResult::success(serde_json::json!({
                    "status": "voice_set",
                    "voice": voice,
                    "language": state.config.default_language,
                })))
            }
            SpeechAction::SetProvider { provider, api_key, region: _ } => {
                state.config.provider = provider;
                // In real implementation, would configure provider.
                // The key itself is not stored; only its presence is reported.
                Ok(HandResult::success(serde_json::json!({
                    "status": "provider_set",
                    "provider": state.config.provider,
                    "configured": api_key.is_some(),
                })))
            }
        }
    }

    /// Get current state
    pub async fn get_state(&self) -> SpeechState {
        self.state.read().await.clone()
    }
}

impl Default for SpeechHand {
    fn default() -> Self {
        Self::new()
    }
}

#[async_trait]
impl Hand for SpeechHand {
    fn config(&self) -> &HandConfig {
        &self.config
    }

    async fn execute(&self, _context: &HandContext, input: Value) -> Result<HandResult> {
        let action: SpeechAction = match serde_json::from_value(input) {
            Ok(a) => a,
            Err(e) => {
                return Ok(HandResult::error(format!("Invalid speech action: {}", e)));
            }
        };

        self.execute_action(action).await
    }

    fn status(&self) -> HandStatus {
        HandStatus::Idle
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Build a `Speak` action with 1.0 rate/pitch/volume and no overrides.
    fn speak(text: &str) -> SpeechAction {
        SpeechAction::Speak {
            text: text.to_string(),
            voice: None,
            rate: 1.0,
            pitch: 1.0,
            volume: 1.0,
            language: None,
        }
    }

    #[tokio::test]
    async fn test_speech_creation() {
        let hand = SpeechHand::new();
        assert_eq!(hand.config().id, "speech");

        // A new hand is idle, has nothing queued and offers built-in voices.
        let state = hand.get_state().await;
        assert!(matches!(state.playback, PlaybackState::Idle));
        assert!(state.current_text.is_none());
        assert!(!state.available_voices.is_empty());
    }

    #[tokio::test]
    async fn test_speak() {
        let hand = SpeechHand::new();

        let result = hand.execute_action(speak("Hello, world!")).await.unwrap();
        assert!(result.success);

        let state = hand.get_state().await;
        assert!(matches!(state.playback, PlaybackState::Playing));
        assert_eq!(state.current_text.as_deref(), Some("Hello, world!"));
    }

    #[tokio::test]
    async fn test_pause_resume() {
        let hand = SpeechHand::new();

        // Speak first
        hand.execute_action(speak("Test")).await.unwrap();

        // Pause
        let result = hand.execute_action(SpeechAction::Pause).await.unwrap();
        assert!(result.success);
        assert!(matches!(hand.get_state().await.playback, PlaybackState::Paused));

        // Resume
        let result = hand.execute_action(SpeechAction::Resume).await.unwrap();
        assert!(result.success);
        assert!(matches!(hand.get_state().await.playback, PlaybackState::Playing));
    }

    #[tokio::test]
    async fn test_stop_clears_state() {
        let hand = SpeechHand::new();
        hand.execute_action(speak("Test")).await.unwrap();

        let result = hand.execute_action(SpeechAction::Stop).await.unwrap();
        assert!(result.success);

        let state = hand.get_state().await;
        assert!(matches!(state.playback, PlaybackState::Idle));
        assert!(state.current_text.is_none());
        assert_eq!(state.position_ms, 0);
    }

    #[tokio::test]
    async fn test_list_voices() {
        let hand = SpeechHand::new();
        let action = SpeechAction::ListVoices { language: Some("zh".to_string()) };

        let result = hand.execute_action(action).await.unwrap();
        assert!(result.success);
    }

    #[tokio::test]
    async fn test_set_voice() {
        let hand = SpeechHand::new();
        let action = SpeechAction::SetVoice {
            voice: "zh-CN-XiaoxiaoNeural".to_string(),
            language: Some("zh-CN".to_string()),
        };

        let result = hand.execute_action(action).await.unwrap();
        assert!(result.success);

        let state = hand.get_state().await;
        assert_eq!(state.config.default_voice, Some("zh-CN-XiaoxiaoNeural".to_string()));
        assert_eq!(state.config.default_language, "zh-CN");
    }

    #[test]
    fn test_speak_deserialization_defaults() {
        // Only the tag and the required field are supplied; everything else
        // must come from the serde defaults.
        let action: SpeechAction = serde_json::from_value(serde_json::json!({
            "action": "speak",
            "text": "hi"
        }))
        .unwrap();

        match action {
            SpeechAction::Speak { text, voice, rate, pitch, volume, language } => {
                assert_eq!(text, "hi");
                assert!(voice.is_none());
                assert!(language.is_none());
                assert_eq!(rate, 1.0);
                assert_eq!(pitch, 1.0);
                assert_eq!(volume, 1.0);
            }
            other => panic!("expected Speak, got {:?}", other),
        }
    }

    #[test]
    fn test_unknown_action_is_rejected() {
        let parsed = serde_json::from_value::<SpeechAction>(serde_json::json!({
            "action": "sing"
        }));
        assert!(parsed.is_err());
    }
}