I can use text to get audio response and barge in
This commit is contained in:
271
services/tts.py
Normal file
271
services/tts.py
Normal file
@@ -0,0 +1,271 @@
|
||||
"""TTS (Text-to-Speech) Service implementations.

Provides multiple TTS backend options including edge-tts (free)
and a placeholder for cloud services.
"""
|
||||
|
||||
import os
|
||||
import io
|
||||
import asyncio
|
||||
import struct
|
||||
from typing import AsyncIterator, Optional
|
||||
from loguru import logger
|
||||
|
||||
from services.base import BaseTTSService, TTSChunk, ServiceState
|
||||
|
||||
# edge-tts is an optional dependency: record whether it could be imported so
# the Edge-backed service can refuse to run instead of failing at import time.
try:
    import edge_tts
except ImportError:
    EDGE_TTS_AVAILABLE = False
    logger.warning("edge-tts not available - EdgeTTS service will be disabled")
else:
    EDGE_TTS_AVAILABLE = True
|
||||
|
||||
|
||||
class EdgeTTSService(BaseTTSService):
    """
    Microsoft Edge TTS service.

    Uses the edge-tts library for free speech synthesis. edge-tts delivers
    MP3, so the complete utterance is downloaded and decoded to 16-bit mono
    PCM first; the PCM is then handed out in ~100 ms chunks so playback can
    be interrupted (barge-in) between chunks via ``cancel()``.
    """

    # Voice mapping for common languages (language code -> edge-tts voice name)
    VOICE_MAP = {
        "en": "en-US-JennyNeural",
        "en-US": "en-US-JennyNeural",
        "en-GB": "en-GB-SoniaNeural",
        "zh": "zh-CN-XiaoxiaoNeural",
        "zh-CN": "zh-CN-XiaoxiaoNeural",
        "zh-TW": "zh-TW-HsiaoChenNeural",
        "ja": "ja-JP-NanamiNeural",
        "ko": "ko-KR-SunHiNeural",
        "fr": "fr-FR-DeniseNeural",
        "de": "de-DE-KatjaNeural",
        "es": "es-ES-ElviraNeural",
    }

    def __init__(
        self,
        voice: str = "en-US-JennyNeural",
        sample_rate: int = 16000,
        speed: float = 1.0
    ):
        """
        Initialize Edge TTS service.

        Args:
            voice: Voice name (e.g., "en-US-JennyNeural") or a language code
                listed in VOICE_MAP (e.g., "en"); unknown values are passed
                through unchanged.
            sample_rate: Target sample rate in Hz that decoded audio is
                resampled to.
            speed: Speech speed multiplier (1.0 = normal speed).
        """
        # Resolve voice from language code if needed
        voice = self.VOICE_MAP.get(voice, voice)

        # NOTE(review): self.voice / self.sample_rate / self.speed are read
        # below and are presumably set by the base initializer - confirm.
        super().__init__(voice=voice, sample_rate=sample_rate, speed=speed)

        # Only set()/clear()/is_set() are used on this event; nothing waits on it.
        self._cancel_event = asyncio.Event()

    async def connect(self) -> None:
        """Mark the service connected; Edge TTS needs no explicit connection.

        Raises:
            RuntimeError: If the edge-tts package could not be imported.
        """
        if not EDGE_TTS_AVAILABLE:
            raise RuntimeError("edge-tts package not installed")
        self.state = ServiceState.CONNECTED
        logger.info(f"Edge TTS service ready: voice={self.voice}")

    async def disconnect(self) -> None:
        """Mark the service disconnected; Edge TTS needs no explicit disconnection."""
        self.state = ServiceState.DISCONNECTED
        logger.info("Edge TTS service disconnected")

    def _get_rate_string(self) -> str:
        """Convert the speed multiplier to an edge-tts rate string.

        Returns:
            A signed percentage such as "+0%", "-10%" or "+20%".
        """
        # round() rather than int(): (1.2 - 1.0) * 100 evaluates to
        # 19.999999999999996, which truncation would turn into "+19%".
        percentage = round((self.speed - 1.0) * 100)
        return f"{percentage:+d}%"

    async def synthesize(self, text: str) -> bytes:
        """
        Synthesize complete audio for text.

        Args:
            text: Text to synthesize

        Returns:
            PCM audio data (16-bit, mono, at ``self.sample_rate``); empty if
            synthesis was cancelled before any chunk was produced or no audio
            could be decoded.

        Raises:
            RuntimeError: If the edge-tts package could not be imported.
        """
        if not EDGE_TTS_AVAILABLE:
            raise RuntimeError("edge-tts not available")

        # Collect all chunks and join once (avoids quadratic bytes +=).
        pieces = [chunk.audio async for chunk in self.synthesize_stream(text)]
        return b"".join(pieces)

    async def synthesize_stream(self, text: str) -> AsyncIterator[TTSChunk]:
        """
        Synthesize audio and yield it as a stream of PCM chunks.

        The whole utterance is fetched and decoded before the first chunk is
        yielded. Cancellation requested through ``cancel()`` is honoured while
        downloading and between yielded chunks.

        Args:
            text: Text to synthesize. Blank text yields nothing.

        Yields:
            TTSChunk objects with ~100 ms of PCM audio each; the last chunk
            has ``is_final=True``.

        Raises:
            RuntimeError: If the edge-tts package could not be imported.
            Exception: Any error raised by edge-tts is logged and re-raised.
        """
        if not EDGE_TTS_AVAILABLE:
            raise RuntimeError("edge-tts not available")

        self._cancel_event.clear()

        # Nothing to speak: skip the network round trip entirely.
        if not text or not text.strip():
            logger.debug("TTS synthesis skipped: empty text")
            return

        try:
            communicate = edge_tts.Communicate(
                text,
                voice=self.voice,
                rate=self._get_rate_string()
            )

            # edge-tts outputs MP3, we need to decode to PCM.
            # For now, collect MP3 chunks and yield after conversion.
            mp3_parts = []
            async for chunk in communicate.stream():
                # Check for cancellation
                if self._cancel_event.is_set():
                    logger.info("TTS synthesis cancelled")
                    return
                if chunk["type"] == "audio":
                    mp3_parts.append(chunk["data"])

            mp3_data = b"".join(mp3_parts)
            if not mp3_data:
                return

            # Convert MP3 to PCM
            pcm_data = await self._convert_mp3_to_pcm(mp3_data)
            if not pcm_data:
                logger.warning("TTS produced no PCM audio after MP3 decoding")
                return

            # 100 ms of 16-bit mono audio. Computed per sample and doubled so
            # the size is always even (a chunk never splits a 2-byte sample),
            # with a floor of one sample so range() never gets a zero step.
            chunk_size = max(2, (self.sample_rate // 10) * 2)
            total = len(pcm_data)
            for start in range(0, total, chunk_size):
                if self._cancel_event.is_set():
                    return
                end = start + chunk_size
                yield TTSChunk(
                    audio=pcm_data[start:end],
                    sample_rate=self.sample_rate,
                    is_final=(end >= total)
                )

        except asyncio.CancelledError:
            logger.info("TTS synthesis cancelled via asyncio")
            raise
        except Exception as e:
            logger.error(f"TTS synthesis error: {e}")
            raise

    def _decode_mp3_with_pydub(self, audio_segment_cls, mp3_data: bytes) -> bytes:
        """Decode MP3 bytes to raw 16-bit mono PCM using pydub (blocking)."""
        audio = audio_segment_cls.from_mp3(io.BytesIO(mp3_data))
        audio = audio.set_frame_rate(self.sample_rate)
        audio = audio.set_channels(1)
        audio = audio.set_sample_width(2)  # 16-bit
        return audio.raw_data

    async def _convert_mp3_to_pcm(self, mp3_data: bytes) -> bytes:
        """
        Convert MP3 audio to PCM.

        Uses pydub when it can be imported, otherwise falls back to calling
        ffmpeg directly.

        Args:
            mp3_data: Complete MP3 stream.

        Returns:
            Raw 16-bit mono PCM at ``self.sample_rate``, or ``b""`` if the
            conversion failed (the error is logged, not raised).
        """
        try:
            # Try using pydub (requires ffmpeg)
            from pydub import AudioSegment

            # Decoding is blocking work; run it in the default executor so the
            # event loop (and therefore cancel()) stays responsive meanwhile.
            loop = asyncio.get_running_loop()
            return await loop.run_in_executor(
                None, self._decode_mp3_with_pydub, AudioSegment, mp3_data
            )

        except ImportError:
            logger.warning("pydub not available, trying fallback")
            # Fallback: Use subprocess to call ffmpeg directly
            return await self._ffmpeg_convert(mp3_data)
        except asyncio.CancelledError:
            # Never convert task cancellation into an empty result.
            raise
        except Exception as e:
            logger.error(f"Audio conversion error: {e}")
            return b""

    async def _ffmpeg_convert(self, mp3_data: bytes) -> bytes:
        """Convert MP3 to PCM using an ffmpeg subprocess.

        Args:
            mp3_data: Complete MP3 stream, fed to ffmpeg on stdin.

        Returns:
            Raw s16le mono PCM at ``self.sample_rate`` read from ffmpeg's
            stdout, or ``b""`` if ffmpeg could not be run.
        """
        try:
            process = await asyncio.create_subprocess_exec(
                "ffmpeg",
                "-i", "pipe:0",
                "-f", "s16le",
                "-acodec", "pcm_s16le",
                "-ar", str(self.sample_rate),
                "-ac", "1",
                "pipe:1",
                stdin=asyncio.subprocess.PIPE,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.DEVNULL
            )

            try:
                stdout, _ = await process.communicate(input=mp3_data)
            except asyncio.CancelledError:
                # Do not leave an orphaned ffmpeg behind when the task is
                # cancelled mid-conversion.
                try:
                    process.kill()
                except ProcessLookupError:
                    pass  # already exited
                raise

            if process.returncode != 0:
                # Still return whatever was decoded; just make the failure visible.
                logger.warning(f"ffmpeg exited with code {process.returncode}")
            return stdout

        except asyncio.CancelledError:
            raise
        except Exception as e:
            logger.error(f"ffmpeg conversion error: {e}")
            return b""

    async def cancel(self) -> None:
        """Cancel ongoing synthesis.

        Sets the flag that ``synthesize_stream`` polls; the stream stops at
        its next check. The flag is cleared when a new synthesis starts.
        """
        self._cancel_event.set()
|
||||
|
||||
|
||||
class MockTTSService(BaseTTSService):
    """
    Mock TTS service for testing without actual synthesis.

    Generates silence (zeroed 16-bit mono PCM) whose duration is derived from
    the number of words in the text.
    """

    def __init__(
        self,
        voice: str = "mock",
        sample_rate: int = 16000,
        speed: float = 1.0
    ):
        """Initialize the mock service.

        Args:
            voice: Voice label; forwarded to the base class.
            sample_rate: Sample rate in Hz of the generated silence.
            speed: Speed multiplier; forwarded to the base class and not used
                by this class when computing durations.
        """
        super().__init__(voice=voice, sample_rate=sample_rate, speed=speed)

    async def connect(self) -> None:
        """Mark the mock service as connected."""
        self.state = ServiceState.CONNECTED
        logger.info("Mock TTS service connected")

    async def disconnect(self) -> None:
        """Mark the mock service as disconnected."""
        self.state = ServiceState.DISCONNECTED
        logger.info("Mock TTS service disconnected")

    async def synthesize(self, text: str) -> bytes:
        """Generate silence based on text length.

        Args:
            text: Text to "synthesize"; only its whitespace-separated word
                count matters.

        Returns:
            Zero-filled 16-bit mono PCM, 100 ms per word (empty for blank text).
        """
        # Approximate: 100ms per word
        word_count = len(text.split())
        duration_ms = word_count * 100
        samples = int(self.sample_rate * duration_ms / 1000)

        # Generate silence (zeros)
        return bytes(samples * 2)  # 16-bit = 2 bytes per sample

    async def synthesize_stream(self, text: str) -> AsyncIterator[TTSChunk]:
        """Generate silence chunks.

        Args:
            text: Text to "synthesize".

        Yields:
            TTSChunk objects holding ~100 ms of silence each; the last chunk
            has ``is_final=True``. Blank text yields nothing.
        """
        audio = await self.synthesize(text)

        # Yield in 100ms chunks. Computed per sample and doubled so the size
        # is always even (a chunk never splits a 2-byte sample), with a floor
        # of one sample so range() never gets a zero step.
        chunk_size = max(2, (self.sample_rate // 10) * 2)
        total = len(audio)
        for start in range(0, total, chunk_size):
            end = start + chunk_size
            yield TTSChunk(
                audio=audio[start:end],
                sample_rate=self.sample_rate,
                is_final=(end >= total)
            )
            await asyncio.sleep(0.05)  # Simulate processing time
|
||||
Reference in New Issue
Block a user