Some checks failed
CI / Rust Check (push) Has been cancelled
CI / Security Scan (push) Has been cancelled
CI / Lint & TypeCheck (push) Has been cancelled
CI / Unit Tests (push) Has been cancelled
CI / Build Frontend (push) Has been cancelled
CI / E2E Tests (push) Has been cancelled
M3-04/M3-05 audit fixes: - HandConfig: add max_concurrent (u32) and timeout_secs (u64) with serde defaults - Kernel execute_hand: enforce timeout via tokio::time::timeout, cancel on expiry - All 9 hand implementations: add max_concurrent: 0, timeout_secs: 0 - Agent createClone: pass soul field through to kernel - Fix duplicate soul block in agent_create command
443 lines
14 KiB
Rust
443 lines
14 KiB
Rust
//! Speech Hand - Text-to-Speech synthesis capabilities
|
|
//!
|
|
//! Provides speech synthesis for teaching:
|
|
//! - speak: Convert text to speech
|
|
//! - speak_ssml: Advanced speech with SSML markup
|
|
//! - pause/resume/stop: Playback control
|
|
//! - list_voices: Get available voices
|
|
//! - set_voice: Configure voice settings
|
|
|
|
use async_trait::async_trait;
|
|
use serde::{Deserialize, Serialize};
|
|
use serde_json::Value;
|
|
use std::sync::Arc;
|
|
use tokio::sync::RwLock;
|
|
use zclaw_types::Result;
|
|
|
|
use crate::{Hand, HandConfig, HandContext, HandResult, HandStatus};
|
|
|
|
/// TTS Provider types
|
|
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
|
|
#[serde(rename_all = "lowercase")]
|
|
pub enum TtsProvider {
|
|
#[default]
|
|
Browser,
|
|
Azure,
|
|
OpenAI,
|
|
ElevenLabs,
|
|
Local,
|
|
}
|
|
|
|
/// Speech action types
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
#[serde(tag = "action", rename_all = "snake_case")]
|
|
pub enum SpeechAction {
|
|
/// Speak text
|
|
Speak {
|
|
text: String,
|
|
#[serde(default)]
|
|
voice: Option<String>,
|
|
#[serde(default = "default_rate")]
|
|
rate: f32,
|
|
#[serde(default = "default_pitch")]
|
|
pitch: f32,
|
|
#[serde(default = "default_volume")]
|
|
volume: f32,
|
|
#[serde(default)]
|
|
language: Option<String>,
|
|
},
|
|
/// Speak with SSML markup
|
|
SpeakSsml {
|
|
ssml: String,
|
|
#[serde(default)]
|
|
voice: Option<String>,
|
|
},
|
|
/// Pause playback
|
|
Pause,
|
|
/// Resume playback
|
|
Resume,
|
|
/// Stop playback
|
|
Stop,
|
|
/// List available voices
|
|
ListVoices {
|
|
#[serde(default)]
|
|
language: Option<String>,
|
|
},
|
|
/// Set default voice
|
|
SetVoice {
|
|
voice: String,
|
|
#[serde(default)]
|
|
language: Option<String>,
|
|
},
|
|
/// Set provider
|
|
SetProvider {
|
|
provider: TtsProvider,
|
|
#[serde(default)]
|
|
api_key: Option<String>,
|
|
#[serde(default)]
|
|
region: Option<String>,
|
|
},
|
|
}
|
|
|
|
/// Serde default for the `rate` field of `SpeechAction::Speak`; always `1.0`.
fn default_rate() -> f32 {
    1.0_f32
}
|
|
/// Serde default for the `pitch` field of `SpeechAction::Speak`; always `1.0`.
fn default_pitch() -> f32 {
    1.0_f32
}
|
|
/// Serde default for the `volume` field of `SpeechAction::Speak`; always `1.0`.
fn default_volume() -> f32 {
    1.0_f32
}
|
|
|
|
/// Voice information
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
pub struct VoiceInfo {
|
|
pub id: String,
|
|
pub name: String,
|
|
pub language: String,
|
|
pub gender: String,
|
|
#[serde(default)]
|
|
pub preview_url: Option<String>,
|
|
}
|
|
|
|
/// Playback state
|
|
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
|
|
pub enum PlaybackState {
|
|
#[default]
|
|
Idle,
|
|
Playing,
|
|
Paused,
|
|
}
|
|
|
|
/// Speech configuration
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
pub struct SpeechConfig {
|
|
pub provider: TtsProvider,
|
|
pub default_voice: Option<String>,
|
|
pub default_language: String,
|
|
pub default_rate: f32,
|
|
pub default_pitch: f32,
|
|
pub default_volume: f32,
|
|
}
|
|
|
|
impl Default for SpeechConfig {
|
|
fn default() -> Self {
|
|
Self {
|
|
provider: TtsProvider::Browser,
|
|
default_voice: None,
|
|
default_language: "zh-CN".to_string(),
|
|
default_rate: 1.0,
|
|
default_pitch: 1.0,
|
|
default_volume: 1.0,
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Speech state
|
|
#[derive(Debug, Clone, Default)]
|
|
pub struct SpeechState {
|
|
pub config: SpeechConfig,
|
|
pub playback: PlaybackState,
|
|
pub current_text: Option<String>,
|
|
pub position_ms: u64,
|
|
pub available_voices: Vec<VoiceInfo>,
|
|
}
|
|
|
|
/// Speech Hand implementation
|
|
pub struct SpeechHand {
|
|
config: HandConfig,
|
|
state: Arc<RwLock<SpeechState>>,
|
|
}
|
|
|
|
impl SpeechHand {
|
|
/// Create a new speech hand
|
|
pub fn new() -> Self {
|
|
Self {
|
|
config: HandConfig {
|
|
id: "speech".to_string(),
|
|
name: "语音合成".to_string(),
|
|
description: "文本转语音合成输出".to_string(),
|
|
needs_approval: false,
|
|
dependencies: vec![],
|
|
input_schema: Some(serde_json::json!({
|
|
"type": "object",
|
|
"properties": {
|
|
"action": { "type": "string" },
|
|
"text": { "type": "string" },
|
|
"voice": { "type": "string" },
|
|
"rate": { "type": "number" },
|
|
}
|
|
})),
|
|
tags: vec!["audio".to_string(), "tts".to_string(), "education".to_string(), "demo".to_string()],
|
|
enabled: true,
|
|
max_concurrent: 0,
|
|
timeout_secs: 0,
|
|
},
|
|
state: Arc::new(RwLock::new(SpeechState {
|
|
config: SpeechConfig::default(),
|
|
playback: PlaybackState::Idle,
|
|
available_voices: Self::get_default_voices(),
|
|
..Default::default()
|
|
})),
|
|
}
|
|
}
|
|
|
|
/// Create with custom provider
|
|
pub fn with_provider(provider: TtsProvider) -> Self {
|
|
let hand = Self::new();
|
|
let mut state = hand.state.blocking_write();
|
|
state.config.provider = provider;
|
|
drop(state);
|
|
hand
|
|
}
|
|
|
|
/// Get default voices
|
|
fn get_default_voices() -> Vec<VoiceInfo> {
|
|
vec![
|
|
VoiceInfo {
|
|
id: "zh-CN-XiaoxiaoNeural".to_string(),
|
|
name: "Xiaoxiao".to_string(),
|
|
language: "zh-CN".to_string(),
|
|
gender: "female".to_string(),
|
|
preview_url: None,
|
|
},
|
|
VoiceInfo {
|
|
id: "zh-CN-YunxiNeural".to_string(),
|
|
name: "Yunxi".to_string(),
|
|
language: "zh-CN".to_string(),
|
|
gender: "male".to_string(),
|
|
preview_url: None,
|
|
},
|
|
VoiceInfo {
|
|
id: "en-US-JennyNeural".to_string(),
|
|
name: "Jenny".to_string(),
|
|
language: "en-US".to_string(),
|
|
gender: "female".to_string(),
|
|
preview_url: None,
|
|
},
|
|
VoiceInfo {
|
|
id: "en-US-GuyNeural".to_string(),
|
|
name: "Guy".to_string(),
|
|
language: "en-US".to_string(),
|
|
gender: "male".to_string(),
|
|
preview_url: None,
|
|
},
|
|
]
|
|
}
|
|
|
|
/// Execute a speech action
|
|
pub async fn execute_action(&self, action: SpeechAction) -> Result<HandResult> {
|
|
let mut state = self.state.write().await;
|
|
|
|
match action {
|
|
SpeechAction::Speak { text, voice, rate, pitch, volume, language } => {
|
|
let voice_id = voice.or(state.config.default_voice.clone())
|
|
.unwrap_or_else(|| "default".to_string());
|
|
let lang = language.unwrap_or_else(|| state.config.default_language.clone());
|
|
let actual_rate = if rate == 1.0 { state.config.default_rate } else { rate };
|
|
let actual_pitch = if pitch == 1.0 { state.config.default_pitch } else { pitch };
|
|
let actual_volume = if volume == 1.0 { state.config.default_volume } else { volume };
|
|
|
|
state.playback = PlaybackState::Playing;
|
|
state.current_text = Some(text.clone());
|
|
|
|
// Determine TTS method based on provider:
|
|
// - Browser: frontend uses Web Speech API (zero deps, works offline)
|
|
// - OpenAI: frontend calls speech_tts command (high-quality, needs API key)
|
|
// - Others: future support
|
|
let tts_method = match state.config.provider {
|
|
TtsProvider::Browser => "browser",
|
|
TtsProvider::OpenAI => "openai_api",
|
|
TtsProvider::Azure => "azure_api",
|
|
TtsProvider::ElevenLabs => "elevenlabs_api",
|
|
TtsProvider::Local => "local_engine",
|
|
};
|
|
|
|
let estimated_duration_ms = (text.chars().count() as f64 / 5.0 * 1000.0) as u64;
|
|
|
|
Ok(HandResult::success(serde_json::json!({
|
|
"status": "speaking",
|
|
"tts_method": tts_method,
|
|
"text": text,
|
|
"voice": voice_id,
|
|
"language": lang,
|
|
"rate": actual_rate,
|
|
"pitch": actual_pitch,
|
|
"volume": actual_volume,
|
|
"provider": format!("{:?}", state.config.provider).to_lowercase(),
|
|
"duration_ms": estimated_duration_ms,
|
|
"instruction": "Frontend should play this via TTS engine"
|
|
})))
|
|
}
|
|
SpeechAction::SpeakSsml { ssml, voice } => {
|
|
let voice_id = voice.or(state.config.default_voice.clone())
|
|
.unwrap_or_else(|| "default".to_string());
|
|
|
|
state.playback = PlaybackState::Playing;
|
|
state.current_text = Some(ssml.clone());
|
|
|
|
Ok(HandResult::success(serde_json::json!({
|
|
"status": "speaking_ssml",
|
|
"ssml": ssml,
|
|
"voice": voice_id,
|
|
"provider": state.config.provider,
|
|
})))
|
|
}
|
|
SpeechAction::Pause => {
|
|
state.playback = PlaybackState::Paused;
|
|
Ok(HandResult::success(serde_json::json!({
|
|
"status": "paused",
|
|
"position_ms": state.position_ms,
|
|
})))
|
|
}
|
|
SpeechAction::Resume => {
|
|
state.playback = PlaybackState::Playing;
|
|
Ok(HandResult::success(serde_json::json!({
|
|
"status": "resumed",
|
|
"position_ms": state.position_ms,
|
|
})))
|
|
}
|
|
SpeechAction::Stop => {
|
|
state.playback = PlaybackState::Idle;
|
|
state.current_text = None;
|
|
state.position_ms = 0;
|
|
Ok(HandResult::success(serde_json::json!({
|
|
"status": "stopped",
|
|
})))
|
|
}
|
|
SpeechAction::ListVoices { language } => {
|
|
let voices: Vec<_> = state.available_voices.iter()
|
|
.filter(|v| {
|
|
language.as_ref()
|
|
.map(|l| v.language.starts_with(l))
|
|
.unwrap_or(true)
|
|
})
|
|
.cloned()
|
|
.collect();
|
|
|
|
Ok(HandResult::success(serde_json::json!({
|
|
"voices": voices,
|
|
"count": voices.len(),
|
|
})))
|
|
}
|
|
SpeechAction::SetVoice { voice, language } => {
|
|
state.config.default_voice = Some(voice.clone());
|
|
if let Some(lang) = language {
|
|
state.config.default_language = lang;
|
|
}
|
|
Ok(HandResult::success(serde_json::json!({
|
|
"status": "voice_set",
|
|
"voice": voice,
|
|
"language": state.config.default_language,
|
|
})))
|
|
}
|
|
SpeechAction::SetProvider { provider, api_key, region: _ } => {
|
|
state.config.provider = provider.clone();
|
|
// In real implementation, would configure provider
|
|
Ok(HandResult::success(serde_json::json!({
|
|
"status": "provider_set",
|
|
"provider": provider,
|
|
"configured": api_key.is_some(),
|
|
})))
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Get current state
|
|
pub async fn get_state(&self) -> SpeechState {
|
|
self.state.read().await.clone()
|
|
}
|
|
}
|
|
|
|
impl Default for SpeechHand {
|
|
fn default() -> Self {
|
|
Self::new()
|
|
}
|
|
}
|
|
|
|
#[async_trait]
|
|
impl Hand for SpeechHand {
|
|
fn config(&self) -> &HandConfig {
|
|
&self.config
|
|
}
|
|
|
|
async fn execute(&self, _context: &HandContext, input: Value) -> Result<HandResult> {
|
|
let action: SpeechAction = match serde_json::from_value(input) {
|
|
Ok(a) => a,
|
|
Err(e) => {
|
|
return Ok(HandResult::error(format!("Invalid speech action: {}", e)));
|
|
}
|
|
};
|
|
|
|
self.execute_action(action).await
|
|
}
|
|
|
|
fn status(&self) -> HandStatus {
|
|
HandStatus::Idle
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a `Speak` action for `text` with every tunable left at its default.
    fn speak(text: &str) -> SpeechAction {
        SpeechAction::Speak {
            text: text.to_string(),
            voice: None,
            rate: 1.0,
            pitch: 1.0,
            volume: 1.0,
            language: None,
        }
    }

    #[tokio::test]
    async fn test_speech_creation() {
        let hand = SpeechHand::new();
        assert_eq!(hand.config().id, "speech");

        // A fresh hand is idle, has no text, and exposes the built-in voices.
        let state = hand.get_state().await;
        assert!(matches!(state.playback, PlaybackState::Idle));
        assert!(state.current_text.is_none());
        assert_eq!(state.available_voices.len(), 4);
    }

    #[tokio::test]
    async fn test_speak() {
        let hand = SpeechHand::new();

        let result = hand.execute_action(speak("Hello, world!")).await.unwrap();
        assert!(result.success);

        // Speaking records the text and switches to Playing.
        let state = hand.get_state().await;
        assert!(matches!(state.playback, PlaybackState::Playing));
        assert_eq!(state.current_text.as_deref(), Some("Hello, world!"));
    }

    #[tokio::test]
    async fn test_pause_resume() {
        let hand = SpeechHand::new();

        // Speak first
        hand.execute_action(speak("Test")).await.unwrap();

        // Pause
        let result = hand.execute_action(SpeechAction::Pause).await.unwrap();
        assert!(result.success);
        assert!(matches!(hand.get_state().await.playback, PlaybackState::Paused));

        // Resume
        let result = hand.execute_action(SpeechAction::Resume).await.unwrap();
        assert!(result.success);
        assert!(matches!(hand.get_state().await.playback, PlaybackState::Playing));
    }

    #[tokio::test]
    async fn test_list_voices() {
        let hand = SpeechHand::new();
        let action = SpeechAction::ListVoices { language: Some("zh".to_string()) };

        let result = hand.execute_action(action).await.unwrap();
        assert!(result.success);
    }

    #[tokio::test]
    async fn test_set_voice() {
        let hand = SpeechHand::new();
        let action = SpeechAction::SetVoice {
            voice: "zh-CN-XiaoxiaoNeural".to_string(),
            language: Some("zh-CN".to_string()),
        };

        let result = hand.execute_action(action).await.unwrap();
        assert!(result.success);

        let state = hand.get_state().await;
        assert_eq!(state.config.default_voice, Some("zh-CN-XiaoxiaoNeural".to_string()));
        assert_eq!(state.config.default_language, "zh-CN");
    }
}
|