Files
AI-VideoAssistant/engine/services/tts.py
2026-02-06 14:01:34 +08:00

272 lines
8.8 KiB
Python

"""TTS (Text-to-Speech) Service implementations.
Provides multiple TTS backend options including edge-tts (free)
and placeholder for cloud services.
"""
import os
import io
import asyncio
import struct
from typing import AsyncIterator, Optional
from loguru import logger
from services.base import BaseTTSService, TTSChunk, ServiceState
# Try to import edge-tts
try:
import edge_tts
EDGE_TTS_AVAILABLE = True
except ImportError:
EDGE_TTS_AVAILABLE = False
logger.warning("edge-tts not available - EdgeTTS service will be disabled")
class EdgeTTSService(BaseTTSService):
"""
Microsoft Edge TTS service.
Uses edge-tts library for free, high-quality speech synthesis.
Supports streaming for low-latency playback.
"""
# Voice mapping for common languages
VOICE_MAP = {
"en": "en-US-JennyNeural",
"en-US": "en-US-JennyNeural",
"en-GB": "en-GB-SoniaNeural",
"zh": "zh-CN-XiaoxiaoNeural",
"zh-CN": "zh-CN-XiaoxiaoNeural",
"zh-TW": "zh-TW-HsiaoChenNeural",
"ja": "ja-JP-NanamiNeural",
"ko": "ko-KR-SunHiNeural",
"fr": "fr-FR-DeniseNeural",
"de": "de-DE-KatjaNeural",
"es": "es-ES-ElviraNeural",
}
def __init__(
self,
voice: str = "en-US-JennyNeural",
sample_rate: int = 16000,
speed: float = 1.0
):
"""
Initialize Edge TTS service.
Args:
voice: Voice name (e.g., "en-US-JennyNeural") or language code (e.g., "en")
sample_rate: Target sample rate (will be resampled)
speed: Speech speed multiplier
"""
# Resolve voice from language code if needed
if voice in self.VOICE_MAP:
voice = self.VOICE_MAP[voice]
super().__init__(voice=voice, sample_rate=sample_rate, speed=speed)
self._cancel_event = asyncio.Event()
async def connect(self) -> None:
"""Edge TTS doesn't require explicit connection."""
if not EDGE_TTS_AVAILABLE:
raise RuntimeError("edge-tts package not installed")
self.state = ServiceState.CONNECTED
logger.info(f"Edge TTS service ready: voice={self.voice}")
async def disconnect(self) -> None:
"""Edge TTS doesn't require explicit disconnection."""
self.state = ServiceState.DISCONNECTED
logger.info("Edge TTS service disconnected")
def _get_rate_string(self) -> str:
"""Convert speed to rate string for edge-tts."""
# edge-tts uses percentage format: "+0%", "-10%", "+20%"
percentage = int((self.speed - 1.0) * 100)
if percentage >= 0:
return f"+{percentage}%"
return f"{percentage}%"
async def synthesize(self, text: str) -> bytes:
"""
Synthesize complete audio for text.
Args:
text: Text to synthesize
Returns:
PCM audio data (16-bit, mono, 16kHz)
"""
if not EDGE_TTS_AVAILABLE:
raise RuntimeError("edge-tts not available")
# Collect all chunks
audio_data = b""
async for chunk in self.synthesize_stream(text):
audio_data += chunk.audio
return audio_data
async def synthesize_stream(self, text: str) -> AsyncIterator[TTSChunk]:
"""
Synthesize audio in streaming mode.
Args:
text: Text to synthesize
Yields:
TTSChunk objects with PCM audio
"""
if not EDGE_TTS_AVAILABLE:
raise RuntimeError("edge-tts not available")
self._cancel_event.clear()
try:
communicate = edge_tts.Communicate(
text,
voice=self.voice,
rate=self._get_rate_string()
)
# edge-tts outputs MP3, we need to decode to PCM
# For now, collect MP3 chunks and yield after conversion
mp3_data = b""
async for chunk in communicate.stream():
# Check for cancellation
if self._cancel_event.is_set():
logger.info("TTS synthesis cancelled")
return
if chunk["type"] == "audio":
mp3_data += chunk["data"]
# Convert MP3 to PCM
if mp3_data:
pcm_data = await self._convert_mp3_to_pcm(mp3_data)
if pcm_data:
# Yield in chunks for streaming playback
chunk_size = self.sample_rate * 2 // 10 # 100ms chunks
for i in range(0, len(pcm_data), chunk_size):
if self._cancel_event.is_set():
return
chunk_data = pcm_data[i:i + chunk_size]
yield TTSChunk(
audio=chunk_data,
sample_rate=self.sample_rate,
is_final=(i + chunk_size >= len(pcm_data))
)
except asyncio.CancelledError:
logger.info("TTS synthesis cancelled via asyncio")
raise
except Exception as e:
logger.error(f"TTS synthesis error: {e}")
raise
async def _convert_mp3_to_pcm(self, mp3_data: bytes) -> bytes:
"""
Convert MP3 audio to PCM.
Uses pydub or ffmpeg for conversion.
"""
try:
# Try using pydub (requires ffmpeg)
from pydub import AudioSegment
# Load MP3 from bytes
audio = AudioSegment.from_mp3(io.BytesIO(mp3_data))
# Convert to target format
audio = audio.set_frame_rate(self.sample_rate)
audio = audio.set_channels(1)
audio = audio.set_sample_width(2) # 16-bit
# Export as raw PCM
return audio.raw_data
except ImportError:
logger.warning("pydub not available, trying fallback")
# Fallback: Use subprocess to call ffmpeg directly
return await self._ffmpeg_convert(mp3_data)
except Exception as e:
logger.error(f"Audio conversion error: {e}")
return b""
async def _ffmpeg_convert(self, mp3_data: bytes) -> bytes:
"""Convert MP3 to PCM using ffmpeg subprocess."""
try:
process = await asyncio.create_subprocess_exec(
"ffmpeg",
"-i", "pipe:0",
"-f", "s16le",
"-acodec", "pcm_s16le",
"-ar", str(self.sample_rate),
"-ac", "1",
"pipe:1",
stdin=asyncio.subprocess.PIPE,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.DEVNULL
)
stdout, _ = await process.communicate(input=mp3_data)
return stdout
except Exception as e:
logger.error(f"ffmpeg conversion error: {e}")
return b""
async def cancel(self) -> None:
"""Cancel ongoing synthesis."""
self._cancel_event.set()
class MockTTSService(BaseTTSService):
"""
Mock TTS service for testing without actual synthesis.
Generates silence or simple tones.
"""
def __init__(
self,
voice: str = "mock",
sample_rate: int = 16000,
speed: float = 1.0
):
super().__init__(voice=voice, sample_rate=sample_rate, speed=speed)
async def connect(self) -> None:
self.state = ServiceState.CONNECTED
logger.info("Mock TTS service connected")
async def disconnect(self) -> None:
self.state = ServiceState.DISCONNECTED
logger.info("Mock TTS service disconnected")
async def synthesize(self, text: str) -> bytes:
"""Generate silence based on text length."""
# Approximate: 100ms per word
word_count = len(text.split())
duration_ms = word_count * 100
samples = int(self.sample_rate * duration_ms / 1000)
# Generate silence (zeros)
return bytes(samples * 2) # 16-bit = 2 bytes per sample
async def synthesize_stream(self, text: str) -> AsyncIterator[TTSChunk]:
"""Generate silence chunks."""
audio = await self.synthesize(text)
# Yield in 100ms chunks
chunk_size = self.sample_rate * 2 // 10
for i in range(0, len(audio), chunk_size):
chunk_data = audio[i:i + chunk_size]
yield TTSChunk(
audio=chunk_data,
sample_rate=self.sample_rate,
is_final=(i + chunk_size >= len(audio))
)
await asyncio.sleep(0.05) # Simulate processing time