"""TTS (Text-to-Speech) Service implementations.
|
|
|
|
Provides multiple TTS backend options including edge-tts (free)
|
|
and placeholder for cloud services.
|
|
"""
|
|
|
|
import io
import asyncio
from typing import AsyncIterator

from loguru import logger

from services.base import BaseTTSService, TTSChunk, ServiceState

# Try to import edge-tts
try:
    import edge_tts
    EDGE_TTS_AVAILABLE = True
except ImportError:
    EDGE_TTS_AVAILABLE = False
    logger.warning("edge-tts not available - EdgeTTS service will be disabled")

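# edge-tts is an optional dependency; it is typically installed with
# "pip install edge-tts". When the package is missing, EdgeTTSService.connect()
# raises a RuntimeError and MockTTSService (defined below) can be used instead.

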
class EdgeTTSService(BaseTTSService):
    """
    Microsoft Edge TTS service.

    Uses edge-tts library for free, high-quality speech synthesis.
    Supports streaming for low-latency playback.
    """

    # Voice mapping for common languages
    VOICE_MAP = {
        "en": "en-US-JennyNeural",
        "en-US": "en-US-JennyNeural",
        "en-GB": "en-GB-SoniaNeural",
        "zh": "zh-CN-XiaoxiaoNeural",
        "zh-CN": "zh-CN-XiaoxiaoNeural",
        "zh-TW": "zh-TW-HsiaoChenNeural",
        "ja": "ja-JP-NanamiNeural",
        "ko": "ko-KR-SunHiNeural",
        "fr": "fr-FR-DeniseNeural",
        "de": "de-DE-KatjaNeural",
        "es": "es-ES-ElviraNeural",
    }
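    # NOTE: only a handful of common language codes are mapped above; any full
    # voice name (e.g. "en-GB-SoniaNeural") passed to __init__ is used as-is,
    # and the full catalogue can be listed with the edge-tts CLI
    # ("edge-tts --list-voices").
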
    def __init__(
        self,
        voice: str = "en-US-JennyNeural",
        sample_rate: int = 16000,
        speed: float = 1.0
    ):
        """
        Initialize Edge TTS service.

        Args:
            voice: Voice name (e.g., "en-US-JennyNeural") or language code (e.g., "en")
            sample_rate: Target sample rate (will be resampled)
            speed: Speech speed multiplier
        """
        # Resolve voice from language code if needed
        if voice in self.VOICE_MAP:
            voice = self.VOICE_MAP[voice]

        super().__init__(voice=voice, sample_rate=sample_rate, speed=speed)
        self._cancel_event = asyncio.Event()

    async def connect(self) -> None:
        """Edge TTS doesn't require explicit connection."""
        if not EDGE_TTS_AVAILABLE:
            raise RuntimeError("edge-tts package not installed")
        self.state = ServiceState.CONNECTED
        logger.info(f"Edge TTS service ready: voice={self.voice}")

    async def disconnect(self) -> None:
        """Edge TTS doesn't require explicit disconnection."""
        self.state = ServiceState.DISCONNECTED
        logger.info("Edge TTS service disconnected")

    def _get_rate_string(self) -> str:
        """Convert speed to rate string for edge-tts."""
        # edge-tts uses percentage format: "+0%", "-10%", "+20%"
        # round() avoids float truncation artifacts (e.g. int((1.2 - 1.0) * 100) == 19)
        percentage = round((self.speed - 1.0) * 100)
        if percentage >= 0:
            return f"+{percentage}%"
        return f"{percentage}%"

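    # Examples: speed=1.0 -> "+0%", speed=1.25 -> "+25%", speed=0.75 -> "-25%".
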
    async def synthesize(self, text: str) -> bytes:
        """
        Synthesize complete audio for text.

        Args:
            text: Text to synthesize

        Returns:
            PCM audio data (16-bit, mono, at the configured sample rate)
        """
        if not EDGE_TTS_AVAILABLE:
            raise RuntimeError("edge-tts not available")

        # Collect all streamed chunks into a single buffer
        chunks = []
        async for chunk in self.synthesize_stream(text):
            chunks.append(chunk.audio)

        return b"".join(chunks)

    async def synthesize_stream(self, text: str) -> AsyncIterator[TTSChunk]:
        """
        Synthesize audio in streaming mode.

        Args:
            text: Text to synthesize

        Yields:
            TTSChunk objects with PCM audio
        """
        if not EDGE_TTS_AVAILABLE:
            raise RuntimeError("edge-tts not available")

        self._cancel_event.clear()

        try:
            communicate = edge_tts.Communicate(
                text,
                voice=self.voice,
                rate=self._get_rate_string()
            )

            # edge-tts outputs MP3, we need to decode to PCM
            # For now, collect MP3 chunks and yield after conversion
            mp3_data = b""

            async for chunk in communicate.stream():
                # Check for cancellation
                if self._cancel_event.is_set():
                    logger.info("TTS synthesis cancelled")
                    return

                if chunk["type"] == "audio":
                    mp3_data += chunk["data"]

            # Convert MP3 to PCM
            if mp3_data:
                pcm_data = await self._convert_mp3_to_pcm(mp3_data)
                if pcm_data:
                    # Yield in chunks for streaming playback
                    chunk_size = self.sample_rate * 2 // 10  # 100ms chunks
                    for i in range(0, len(pcm_data), chunk_size):
                        if self._cancel_event.is_set():
                            return

                        chunk_data = pcm_data[i:i + chunk_size]
                        yield TTSChunk(
                            audio=chunk_data,
                            sample_rate=self.sample_rate,
                            is_final=(i + chunk_size >= len(pcm_data))
                        )

        except asyncio.CancelledError:
            logger.info("TTS synthesis cancelled via asyncio")
            raise
        except Exception as e:
            logger.error(f"TTS synthesis error: {e}")
            raise

    async def _convert_mp3_to_pcm(self, mp3_data: bytes) -> bytes:
        """
        Convert MP3 audio to PCM.

        Uses pydub if available (which itself needs ffmpeg or libav for MP3
        decoding), otherwise falls back to invoking ffmpeg directly.
        """
        try:
            # Try using pydub (requires ffmpeg)
            from pydub import AudioSegment

            # Load MP3 from bytes
            audio = AudioSegment.from_mp3(io.BytesIO(mp3_data))

            # Convert to target format
            audio = audio.set_frame_rate(self.sample_rate)
            audio = audio.set_channels(1)
            audio = audio.set_sample_width(2)  # 16-bit

            # Export as raw PCM
            return audio.raw_data

        except ImportError:
            logger.warning("pydub not available, trying fallback")
            # Fallback: Use subprocess to call ffmpeg directly
            return await self._ffmpeg_convert(mp3_data)
        except Exception as e:
            logger.error(f"Audio conversion error: {e}")
            return b""

    async def _ffmpeg_convert(self, mp3_data: bytes) -> bytes:
        """Convert MP3 to PCM using an ffmpeg subprocess."""
        try:
            process = await asyncio.create_subprocess_exec(
                "ffmpeg",
                "-i", "pipe:0",
                "-f", "s16le",
                "-acodec", "pcm_s16le",
                "-ar", str(self.sample_rate),
                "-ac", "1",
                "pipe:1",
                stdin=asyncio.subprocess.PIPE,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.DEVNULL
            )

            stdout, _ = await process.communicate(input=mp3_data)
            if process.returncode != 0:
                logger.error(f"ffmpeg exited with code {process.returncode}")
                return b""
            return stdout

        except Exception as e:
            logger.error(f"ffmpeg conversion error: {e}")
            return b""

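    # The invocation above is equivalent to the shell command
    #   ffmpeg -i input.mp3 -f s16le -acodec pcm_s16le -ar <sample_rate> -ac 1 output.raw
    # which can be run manually to reproduce conversion problems outside the service.
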
    async def cancel(self) -> None:
        """Cancel ongoing synthesis."""
        self._cancel_event.set()


class MockTTSService(BaseTTSService):
    """
    Mock TTS service for testing without actual synthesis.

    Generates silence sized to the input text length.
    """

    def __init__(
        self,
        voice: str = "mock",
        sample_rate: int = 16000,
        speed: float = 1.0
    ):
        super().__init__(voice=voice, sample_rate=sample_rate, speed=speed)

    async def connect(self) -> None:
        self.state = ServiceState.CONNECTED
        logger.info("Mock TTS service connected")

    async def disconnect(self) -> None:
        self.state = ServiceState.DISCONNECTED
        logger.info("Mock TTS service disconnected")

    async def synthesize(self, text: str) -> bytes:
        """Generate silence based on text length."""
        # Approximate: 100ms per word
        word_count = len(text.split())
        duration_ms = word_count * 100
        samples = int(self.sample_rate * duration_ms / 1000)

        # Generate silence (zeros)
        return bytes(samples * 2)  # 16-bit = 2 bytes per sample

    async def synthesize_stream(self, text: str) -> AsyncIterator[TTSChunk]:
        """Generate silence chunks."""
        audio = await self.synthesize(text)

        # Yield in 100ms chunks
        chunk_size = self.sample_rate * 2 // 10
        for i in range(0, len(audio), chunk_size):
            chunk_data = audio[i:i + chunk_size]
            yield TTSChunk(
                audio=chunk_data,
                sample_rate=self.sample_rate,
                is_final=(i + chunk_size >= len(audio))
            )
            await asyncio.sleep(0.05)  # Simulate processing time
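

# ---------------------------------------------------------------------------
# Usage sketch (not part of the service API): a minimal example of how these
# services are expected to be driven, assuming the BaseTTSService lifecycle
# shown above (connect -> synthesize_stream -> disconnect). It prefers
# EdgeTTSService when edge-tts is installed and falls back to MockTTSService,
# so it can be run directly without external dependencies.
# ---------------------------------------------------------------------------
async def _demo(text: str = "Hello from the TTS service") -> None:
    service = EdgeTTSService() if EDGE_TTS_AVAILABLE else MockTTSService()
    await service.connect()
    try:
        total_bytes = 0
        async for chunk in service.synthesize_stream(text):
            total_bytes += len(chunk.audio)
        logger.info(f"Synthesized {total_bytes} bytes of 16-bit PCM "
                    f"({total_bytes / (2 * service.sample_rate):.2f}s at "
                    f"{service.sample_rate} Hz)")
    finally:
        await service.disconnect()


if __name__ == "__main__":
    asyncio.run(_demo())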