Support text input for audio responses and barge-in

Xin Wang
2026-01-29 16:25:53 +08:00
parent cd90b4fb37
commit ac0c76e6e8
16 changed files with 3394 additions and 119 deletions

services/__init__.py Normal file

@@ -0,0 +1,42 @@
"""AI Services package.
Provides ASR, LLM, TTS, and Realtime API services for voice conversation.
"""
from services.base import (
ServiceState,
ASRResult,
LLMMessage,
TTSChunk,
BaseASRService,
BaseLLMService,
BaseTTSService,
)
from services.llm import OpenAILLMService, MockLLMService
from services.tts import EdgeTTSService, MockTTSService
from services.asr import BufferedASRService, MockASRService
from services.realtime import RealtimeService, RealtimeConfig, RealtimePipeline
__all__ = [
# Base classes
"ServiceState",
"ASRResult",
"LLMMessage",
"TTSChunk",
"BaseASRService",
"BaseLLMService",
"BaseTTSService",
# LLM
"OpenAILLMService",
"MockLLMService",
# TTS
"EdgeTTSService",
"MockTTSService",
# ASR
"BufferedASRService",
"MockASRService",
# Realtime
"RealtimeService",
"RealtimeConfig",
"RealtimePipeline",
]
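For orientation, a minimal sketch (illustrative, not part of this commit) wiring the exported mock services end to end; it assumes the services package is importable:

import asyncio

from services import LLMMessage, MockLLMService, MockTTSService


async def main() -> None:
    llm = MockLLMService()
    tts = MockTTSService()
    await llm.connect()
    await tts.connect()
    # Generate a canned reply, then "speak" it as silent PCM.
    reply = await llm.generate([LLMMessage(role="user", content="Hello")])
    audio = await tts.synthesize(reply)
    print(f"reply={reply!r} audio_bytes={len(audio)}")
    await tts.disconnect()
    await llm.disconnect()


asyncio.run(main())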

services/asr.py Normal file

@@ -0,0 +1,147 @@
"""ASR (Automatic Speech Recognition) Service implementations.
Provides speech-to-text capabilities with streaming support.
"""
import asyncio
from typing import AsyncIterator
from loguru import logger
from services.base import BaseASRService, ASRResult, ServiceState
# Try to import websockets for streaming ASR
try:
import websockets
WEBSOCKETS_AVAILABLE = True
except ImportError:
WEBSOCKETS_AVAILABLE = False
class BufferedASRService(BaseASRService):
"""
Buffered ASR service that accumulates audio and provides
a simple text accumulator for use with EOU detection.
This is a lightweight implementation that works with the
existing VAD + EOU pattern without requiring external ASR.
"""
def __init__(
self,
sample_rate: int = 16000,
language: str = "en"
):
super().__init__(sample_rate=sample_rate, language=language)
self._audio_buffer: bytes = b""
self._current_text: str = ""
self._transcript_queue: asyncio.Queue[ASRResult] = asyncio.Queue()
async def connect(self) -> None:
"""No connection needed for buffered ASR."""
self.state = ServiceState.CONNECTED
logger.info("Buffered ASR service connected")
async def disconnect(self) -> None:
"""Clear buffers on disconnect."""
self._audio_buffer = b""
self._current_text = ""
self.state = ServiceState.DISCONNECTED
logger.info("Buffered ASR service disconnected")
async def send_audio(self, audio: bytes) -> None:
"""Buffer audio for later processing."""
self._audio_buffer += audio
async def receive_transcripts(self) -> AsyncIterator[ASRResult]:
"""Yield transcription results."""
while True:
try:
result = await asyncio.wait_for(
self._transcript_queue.get(),
timeout=0.1
)
yield result
except asyncio.TimeoutError:
continue
except asyncio.CancelledError:
break
def set_text(self, text: str) -> None:
"""
Set the current transcript text directly.
This allows external integration (e.g., Whisper, other ASR)
to provide transcripts.
"""
self._current_text = text
result = ASRResult(text=text, is_final=False)
asyncio.create_task(self._transcript_queue.put(result))
def get_and_clear_text(self) -> str:
"""Get accumulated text and clear buffer."""
text = self._current_text
self._current_text = ""
self._audio_buffer = b""
return text
def get_audio_buffer(self) -> bytes:
"""Get accumulated audio buffer."""
return self._audio_buffer
def clear_audio_buffer(self) -> None:
"""Clear audio buffer."""
self._audio_buffer = b""
class MockASRService(BaseASRService):
"""
Mock ASR service for testing without actual recognition.
"""
def __init__(self, sample_rate: int = 16000, language: str = "en"):
super().__init__(sample_rate=sample_rate, language=language)
self._transcript_queue: asyncio.Queue[ASRResult] = asyncio.Queue()
self._mock_texts = [
"Hello, how are you?",
"That's interesting.",
"Tell me more about that.",
"I understand.",
]
self._text_index = 0
async def connect(self) -> None:
self.state = ServiceState.CONNECTED
logger.info("Mock ASR service connected")
async def disconnect(self) -> None:
self.state = ServiceState.DISCONNECTED
logger.info("Mock ASR service disconnected")
async def send_audio(self, audio: bytes) -> None:
"""Mock audio processing - generates fake transcripts periodically."""
pass
def trigger_transcript(self) -> None:
"""Manually trigger a transcript (for testing)."""
text = self._mock_texts[self._text_index % len(self._mock_texts)]
self._text_index += 1
result = ASRResult(text=text, is_final=True, confidence=0.95)
asyncio.create_task(self._transcript_queue.put(result))
async def receive_transcripts(self) -> AsyncIterator[ASRResult]:
"""Yield transcription results."""
while True:
try:
result = await asyncio.wait_for(
self._transcript_queue.get(),
timeout=0.1
)
yield result
except asyncio.TimeoutError:
continue
except asyncio.CancelledError:
break
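To illustrate the intended integration, a small sketch (illustrative, not from this commit) of an external recognizer such as Whisper pushing text into BufferedASRService while the end-of-utterance side drains it:

import asyncio

from services.asr import BufferedASRService


async def main() -> None:
    asr = BufferedASRService()
    await asr.connect()
    # Audio frames accumulate in the internal buffer...
    await asr.send_audio(b"\x00" * 320)  # 10 ms of 16 kHz, 16-bit silence
    # ...while the external recognizer pushes partial text.
    asr.set_text("hello there")
    await asyncio.sleep(0)  # let the fire-and-forget queue task run
    async for result in asr.receive_transcripts():
        print(result)  # -> [PARTIAL] hello there
        break
    # At end of utterance, drain the text and audio buffers together.
    print(asr.get_and_clear_text())
    await asr.disconnect()


asyncio.run(main())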

services/base.py Normal file

@@ -0,0 +1,244 @@
"""Base classes for AI services.
Defines abstract interfaces for ASR, LLM, and TTS services,
inspired by pipecat's service architecture and active-call's
StreamEngine pattern.
"""
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from typing import AsyncIterator, Optional, List, Dict, Any
from enum import Enum
class ServiceState(Enum):
"""Service connection state."""
DISCONNECTED = "disconnected"
CONNECTING = "connecting"
CONNECTED = "connected"
ERROR = "error"
@dataclass
class ASRResult:
"""ASR transcription result."""
text: str
is_final: bool = False
confidence: float = 1.0
language: Optional[str] = None
start_time: Optional[float] = None
end_time: Optional[float] = None
def __str__(self) -> str:
status = "FINAL" if self.is_final else "PARTIAL"
return f"[{status}] {self.text}"
@dataclass
class LLMMessage:
"""LLM conversation message."""
role: str # "system", "user", "assistant", "function"
content: str
name: Optional[str] = None # For function calls
function_call: Optional[Dict[str, Any]] = None
def to_dict(self) -> Dict[str, Any]:
"""Convert to API-compatible dict."""
d = {"role": self.role, "content": self.content}
if self.name:
d["name"] = self.name
if self.function_call:
d["function_call"] = self.function_call
return d
@dataclass
class TTSChunk:
"""TTS audio chunk."""
audio: bytes # PCM audio data
sample_rate: int = 16000
channels: int = 1
bits_per_sample: int = 16
is_final: bool = False
text_offset: Optional[int] = None # Character offset in original text
class BaseASRService(ABC):
"""
Abstract base class for ASR (Speech-to-Text) services.
Supports both streaming and non-streaming transcription.
"""
def __init__(self, sample_rate: int = 16000, language: str = "en"):
self.sample_rate = sample_rate
self.language = language
self.state = ServiceState.DISCONNECTED
@abstractmethod
async def connect(self) -> None:
"""Establish connection to ASR service."""
pass
@abstractmethod
async def disconnect(self) -> None:
"""Close connection to ASR service."""
pass
@abstractmethod
async def send_audio(self, audio: bytes) -> None:
"""
Send audio chunk for transcription.
Args:
audio: PCM audio data (16-bit, mono)
"""
pass
@abstractmethod
async def receive_transcripts(self) -> AsyncIterator[ASRResult]:
"""
Receive transcription results.
Yields:
ASRResult objects as they become available
"""
pass
async def transcribe(self, audio: bytes) -> ASRResult:
"""
Transcribe a complete audio buffer (non-streaming).
Args:
audio: Complete PCM audio data
Returns:
Final ASRResult
"""
# Default implementation using streaming
await self.send_audio(audio)
async for result in self.receive_transcripts():
if result.is_final:
return result
return ASRResult(text="", is_final=True)
class BaseLLMService(ABC):
"""
Abstract base class for LLM (Language Model) services.
Supports streaming responses for real-time conversation.
"""
def __init__(self, model: str = "gpt-4"):
self.model = model
self.state = ServiceState.DISCONNECTED
@abstractmethod
async def connect(self) -> None:
"""Initialize LLM service connection."""
pass
@abstractmethod
async def disconnect(self) -> None:
"""Close LLM service connection."""
pass
@abstractmethod
async def generate(
self,
messages: List[LLMMessage],
temperature: float = 0.7,
max_tokens: Optional[int] = None
) -> str:
"""
Generate a complete response.
Args:
messages: Conversation history
temperature: Sampling temperature
max_tokens: Maximum tokens to generate
Returns:
Complete response text
"""
pass
@abstractmethod
async def generate_stream(
self,
messages: List[LLMMessage],
temperature: float = 0.7,
max_tokens: Optional[int] = None
) -> AsyncIterator[str]:
"""
Generate response in streaming mode.
Args:
messages: Conversation history
temperature: Sampling temperature
max_tokens: Maximum tokens to generate
Yields:
Text chunks as they are generated
"""
pass
class BaseTTSService(ABC):
"""
Abstract base class for TTS (Text-to-Speech) services.
Supports streaming audio synthesis for low-latency playback.
"""
def __init__(
self,
voice: str = "default",
sample_rate: int = 16000,
speed: float = 1.0
):
self.voice = voice
self.sample_rate = sample_rate
self.speed = speed
self.state = ServiceState.DISCONNECTED
@abstractmethod
async def connect(self) -> None:
"""Initialize TTS service connection."""
pass
@abstractmethod
async def disconnect(self) -> None:
"""Close TTS service connection."""
pass
@abstractmethod
async def synthesize(self, text: str) -> bytes:
"""
Synthesize complete audio for text (non-streaming).
Args:
text: Text to synthesize
Returns:
Complete PCM audio data
"""
pass
@abstractmethod
async def synthesize_stream(self, text: str) -> AsyncIterator[TTSChunk]:
"""
Synthesize audio in streaming mode.
Args:
text: Text to synthesize
Yields:
TTSChunk objects as audio is generated
"""
pass
async def cancel(self) -> None:
"""Cancel ongoing synthesis (for barge-in support)."""
pass
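A quick demonstration of the dataclass helpers defined above (illustrative only):

from services.base import ASRResult, LLMMessage

msg = LLMMessage(role="user", content="What's the weather?")
print(msg.to_dict())  # {'role': 'user', 'content': "What's the weather?"}

partial = ASRResult(text="what's the wea", is_final=False)
final = ASRResult(text="What's the weather?", is_final=True, confidence=0.93)
print(partial)  # [PARTIAL] what's the wea
print(final)    # [FINAL] What's the weather?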

services/llm.py Normal file

@@ -0,0 +1,239 @@
"""LLM (Large Language Model) Service implementations.
Provides OpenAI-compatible LLM integration with streaming support
for real-time voice conversation.
"""
import os
import asyncio
from typing import AsyncIterator, Optional, List, Dict, Any
from loguru import logger
from services.base import BaseLLMService, LLMMessage, ServiceState
# Try to import openai
try:
from openai import AsyncOpenAI
OPENAI_AVAILABLE = True
except ImportError:
OPENAI_AVAILABLE = False
logger.warning("openai package not available - LLM service will be disabled")
class OpenAILLMService(BaseLLMService):
"""
OpenAI-compatible LLM service.
Supports streaming responses for low-latency voice conversation.
Works with OpenAI API, Azure OpenAI, and compatible APIs.
"""
def __init__(
self,
model: str = "gpt-4o-mini",
api_key: Optional[str] = None,
base_url: Optional[str] = None,
system_prompt: Optional[str] = None
):
"""
Initialize OpenAI LLM service.
Args:
model: Model name (e.g., "gpt-4o-mini", "gpt-4o")
api_key: OpenAI API key (defaults to OPENAI_API_KEY env var)
base_url: Custom API base URL (for Azure or compatible APIs)
system_prompt: Default system prompt for conversations
"""
super().__init__(model=model)
self.api_key = api_key or os.getenv("OPENAI_API_KEY")
self.base_url = base_url or os.getenv("OPENAI_API_URL")
self.system_prompt = system_prompt or (
"You are a helpful, friendly voice assistant. "
"Keep your responses concise and conversational. "
"Respond naturally as if having a phone conversation."
)
self.client: Optional[AsyncOpenAI] = None
self._cancel_event = asyncio.Event()
async def connect(self) -> None:
"""Initialize OpenAI client."""
if not OPENAI_AVAILABLE:
raise RuntimeError("openai package not installed")
if not self.api_key:
raise ValueError("OpenAI API key not provided")
self.client = AsyncOpenAI(
api_key=self.api_key,
base_url=self.base_url
)
self.state = ServiceState.CONNECTED
logger.info(f"OpenAI LLM service connected: model={self.model}")
async def disconnect(self) -> None:
"""Close OpenAI client."""
if self.client:
await self.client.close()
self.client = None
self.state = ServiceState.DISCONNECTED
logger.info("OpenAI LLM service disconnected")
def _prepare_messages(self, messages: List[LLMMessage]) -> List[Dict[str, Any]]:
"""Prepare messages list with system prompt."""
result = []
# Add system prompt if not already present
has_system = any(m.role == "system" for m in messages)
if not has_system and self.system_prompt:
result.append({"role": "system", "content": self.system_prompt})
# Add all messages
for msg in messages:
result.append(msg.to_dict())
return result
async def generate(
self,
messages: List[LLMMessage],
temperature: float = 0.7,
max_tokens: Optional[int] = None
) -> str:
"""
Generate a complete response.
Args:
messages: Conversation history
temperature: Sampling temperature
max_tokens: Maximum tokens to generate
Returns:
Complete response text
"""
if not self.client:
raise RuntimeError("LLM service not connected")
prepared = self._prepare_messages(messages)
try:
response = await self.client.chat.completions.create(
model=self.model,
messages=prepared,
temperature=temperature,
max_tokens=max_tokens
)
content = response.choices[0].message.content or ""
logger.debug(f"LLM response: {content[:100]}...")
return content
except Exception as e:
logger.error(f"LLM generation error: {e}")
raise
async def generate_stream(
self,
messages: List[LLMMessage],
temperature: float = 0.7,
max_tokens: Optional[int] = None
) -> AsyncIterator[str]:
"""
Generate response in streaming mode.
Args:
messages: Conversation history
temperature: Sampling temperature
max_tokens: Maximum tokens to generate
Yields:
Text chunks as they are generated
"""
if not self.client:
raise RuntimeError("LLM service not connected")
prepared = self._prepare_messages(messages)
self._cancel_event.clear()
try:
stream = await self.client.chat.completions.create(
model=self.model,
messages=prepared,
temperature=temperature,
max_tokens=max_tokens,
stream=True
)
async for chunk in stream:
# Check for cancellation
if self._cancel_event.is_set():
logger.info("LLM stream cancelled")
break
if chunk.choices and chunk.choices[0].delta.content:
content = chunk.choices[0].delta.content
yield content
except asyncio.CancelledError:
logger.info("LLM stream cancelled via asyncio")
raise
except Exception as e:
logger.error(f"LLM streaming error: {e}")
raise
def cancel(self) -> None:
"""Cancel ongoing generation."""
self._cancel_event.set()
class MockLLMService(BaseLLMService):
"""
Mock LLM service for testing without API calls.
"""
def __init__(self, response_delay: float = 0.5):
super().__init__(model="mock")
self.response_delay = response_delay
self.responses = [
"Hello! How can I help you today?",
"That's an interesting question. Let me think about it.",
"I understand. Is there anything else you'd like to know?",
"Great! I'm here if you need anything else.",
]
self._response_index = 0
async def connect(self) -> None:
self.state = ServiceState.CONNECTED
logger.info("Mock LLM service connected")
async def disconnect(self) -> None:
self.state = ServiceState.DISCONNECTED
logger.info("Mock LLM service disconnected")
async def generate(
self,
messages: List[LLMMessage],
temperature: float = 0.7,
max_tokens: Optional[int] = None
) -> str:
await asyncio.sleep(self.response_delay)
response = self.responses[self._response_index % len(self.responses)]
self._response_index += 1
return response
async def generate_stream(
self,
messages: List[LLMMessage],
temperature: float = 0.7,
max_tokens: Optional[int] = None
) -> AsyncIterator[str]:
response = await self.generate(messages, temperature, max_tokens)
# Stream word by word
words = response.split()
for i, word in enumerate(words):
if i > 0:
yield " "
yield word
await asyncio.sleep(0.05) # Simulate streaming delay
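For reference, a minimal sketch (illustrative, not part of this commit) of consuming generate_stream() chunk by chunk the way a voice pipeline would; MockLLMService keeps it offline:

import asyncio

from services.base import LLMMessage
from services.llm import MockLLMService


async def main() -> None:
    llm = MockLLMService(response_delay=0.1)
    await llm.connect()
    history = [LLMMessage(role="user", content="Hi")]
    async for chunk in llm.generate_stream(history):
        print(chunk, end="", flush=True)  # words arrive incrementally
    print()
    await llm.disconnect()


asyncio.run(main())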

services/realtime.py Normal file

@@ -0,0 +1,548 @@
"""OpenAI Realtime API Service.
Provides true duplex voice conversation using OpenAI's Realtime API,
similar to active-call's RealtimeProcessor. This bypasses the need for
separate ASR/LLM/TTS services by handling everything server-side.
The Realtime API provides:
- Server-side VAD with turn detection
- Streaming speech-to-text
- Streaming LLM responses
- Streaming text-to-speech
- Function calling support
- Barge-in/interruption handling
"""
import os
import asyncio
import json
import base64
from typing import Optional, Dict, Any, Callable, Awaitable, List
from dataclasses import dataclass, field
from enum import Enum
from loguru import logger
try:
import websockets
WEBSOCKETS_AVAILABLE = True
except ImportError:
WEBSOCKETS_AVAILABLE = False
logger.warning("websockets not available - Realtime API will be disabled")
class RealtimeState(Enum):
"""Realtime API connection state."""
DISCONNECTED = "disconnected"
CONNECTING = "connecting"
CONNECTED = "connected"
ERROR = "error"
@dataclass
class RealtimeConfig:
"""Configuration for OpenAI Realtime API."""
# API Configuration
api_key: Optional[str] = None
model: str = "gpt-4o-realtime-preview"
endpoint: Optional[str] = None # For Azure or custom endpoints
# Voice Configuration
voice: str = "alloy" # alloy, echo, shimmer, etc.
instructions: str = (
"You are a helpful, friendly voice assistant. "
"Keep your responses concise and conversational."
)
# Turn Detection (Server-side VAD)
turn_detection: Optional[Dict[str, Any]] = field(default_factory=lambda: {
"type": "server_vad",
"threshold": 0.5,
"prefix_padding_ms": 300,
"silence_duration_ms": 500
})
# Audio Configuration
input_audio_format: str = "pcm16"
output_audio_format: str = "pcm16"
# Tools/Functions
tools: List[Dict[str, Any]] = field(default_factory=list)
class RealtimeService:
"""
OpenAI Realtime API service for true duplex voice conversation.
This service handles the entire voice conversation pipeline:
1. Audio input → Server-side VAD → Speech-to-text
2. Text → LLM processing → Response generation
3. Response → Text-to-speech → Audio output
Events emitted:
- on_audio: Audio output from the assistant
- on_transcript: Text transcript (user or assistant)
- on_speech_started: User started speaking
- on_speech_stopped: User stopped speaking
- on_response_started: Assistant started responding
- on_response_done: Assistant finished responding
- on_function_call: Function call requested
- on_error: Error occurred
"""
def __init__(self, config: Optional[RealtimeConfig] = None):
"""
Initialize Realtime API service.
Args:
config: Realtime configuration (uses defaults if not provided)
"""
self.config = config or RealtimeConfig()
self.config.api_key = self.config.api_key or os.getenv("OPENAI_API_KEY")
self.state = RealtimeState.DISCONNECTED
self._ws = None
self._receive_task: Optional[asyncio.Task] = None
self._cancel_event = asyncio.Event()
# Event callbacks
self._callbacks: Dict[str, List[Callable]] = {
"on_audio": [],
"on_transcript": [],
"on_speech_started": [],
"on_speech_stopped": [],
"on_response_started": [],
"on_response_done": [],
"on_function_call": [],
"on_error": [],
"on_interrupted": [],
}
logger.debug(f"RealtimeService initialized with model={self.config.model}")
def on(self, event: str, callback: Callable[..., Awaitable[None]]) -> None:
"""
Register event callback.
Args:
event: Event name
callback: Async callback function
"""
if event in self._callbacks:
self._callbacks[event].append(callback)
async def _emit(self, event: str, *args, **kwargs) -> None:
"""Emit event to all registered callbacks."""
for callback in self._callbacks.get(event, []):
try:
await callback(*args, **kwargs)
except Exception as e:
logger.error(f"Event callback error ({event}): {e}")
async def connect(self) -> None:
"""Connect to OpenAI Realtime API."""
if not WEBSOCKETS_AVAILABLE:
raise RuntimeError("websockets package not installed")
if not self.config.api_key:
raise ValueError("OpenAI API key not provided")
self.state = RealtimeState.CONNECTING
# Build URL
if self.config.endpoint:
# Azure or custom endpoint
url = f"{self.config.endpoint}/openai/realtime?api-version=2024-10-01-preview&deployment={self.config.model}"
else:
# OpenAI endpoint
url = f"wss://api.openai.com/v1/realtime?model={self.config.model}"
# Build headers
headers = {}
if self.config.endpoint:
headers["api-key"] = self.config.api_key
else:
headers["Authorization"] = f"Bearer {self.config.api_key}"
headers["OpenAI-Beta"] = "realtime=v1"
try:
logger.info(f"Connecting to Realtime API: {url}")
self._ws = await websockets.connect(url, extra_headers=headers)
# Send session configuration
await self._configure_session()
# Start receive loop
self._receive_task = asyncio.create_task(self._receive_loop())
self.state = RealtimeState.CONNECTED
logger.info("Realtime API connected successfully")
except Exception as e:
self.state = RealtimeState.ERROR
logger.error(f"Realtime API connection failed: {e}")
raise
async def _configure_session(self) -> None:
"""Send session configuration to server."""
session_config = {
"type": "session.update",
"session": {
"modalities": ["text", "audio"],
"instructions": self.config.instructions,
"voice": self.config.voice,
"input_audio_format": self.config.input_audio_format,
"output_audio_format": self.config.output_audio_format,
"turn_detection": self.config.turn_detection,
}
}
if self.config.tools:
session_config["session"]["tools"] = self.config.tools
await self._send(session_config)
logger.debug("Session configuration sent")
async def _send(self, data: Dict[str, Any]) -> None:
"""Send JSON data to server."""
if self._ws:
await self._ws.send(json.dumps(data))
async def send_audio(self, audio_bytes: bytes) -> None:
"""
Send audio to the Realtime API.
Args:
audio_bytes: PCM audio data (16-bit, mono, 24kHz by default)
"""
if self.state != RealtimeState.CONNECTED:
return
# Encode audio as base64
audio_b64 = base64.standard_b64encode(audio_bytes).decode()
await self._send({
"type": "input_audio_buffer.append",
"audio": audio_b64
})
async def send_text(self, text: str) -> None:
"""
Send text input (bypassing audio).
Args:
text: User text input
"""
if self.state != RealtimeState.CONNECTED:
return
# Create a conversation item with user text
await self._send({
"type": "conversation.item.create",
"item": {
"type": "message",
"role": "user",
"content": [{"type": "input_text", "text": text}]
}
})
# Trigger response
await self._send({"type": "response.create"})
async def cancel_response(self) -> None:
"""Cancel the current response (for barge-in)."""
if self.state != RealtimeState.CONNECTED:
return
await self._send({"type": "response.cancel"})
logger.debug("Response cancelled")
async def commit_audio(self) -> None:
"""Commit the audio buffer and trigger response."""
if self.state != RealtimeState.CONNECTED:
return
await self._send({"type": "input_audio_buffer.commit"})
await self._send({"type": "response.create"})
async def clear_audio_buffer(self) -> None:
"""Clear the input audio buffer."""
if self.state != RealtimeState.CONNECTED:
return
await self._send({"type": "input_audio_buffer.clear"})
async def submit_function_result(self, call_id: str, result: str) -> None:
"""
Submit function call result.
Args:
call_id: The function call ID
result: JSON string result
"""
if self.state != RealtimeState.CONNECTED:
return
await self._send({
"type": "conversation.item.create",
"item": {
"type": "function_call_output",
"call_id": call_id,
"output": result
}
})
# Trigger response with the function result
await self._send({"type": "response.create"})
async def _receive_loop(self) -> None:
"""Receive and process messages from the Realtime API."""
if not self._ws:
return
try:
async for message in self._ws:
try:
data = json.loads(message)
await self._handle_event(data)
except json.JSONDecodeError:
logger.warning(f"Invalid JSON received: {message[:100]}")
except asyncio.CancelledError:
logger.debug("Receive loop cancelled")
except websockets.ConnectionClosed as e:
logger.info(f"WebSocket closed: {e}")
self.state = RealtimeState.DISCONNECTED
except Exception as e:
logger.error(f"Receive loop error: {e}")
self.state = RealtimeState.ERROR
async def _handle_event(self, data: Dict[str, Any]) -> None:
"""Handle incoming event from Realtime API."""
event_type = data.get("type", "unknown")
# Audio delta - streaming audio output
if event_type == "response.audio.delta":
if "delta" in data:
audio_bytes = base64.standard_b64decode(data["delta"])
await self._emit("on_audio", audio_bytes)
# Audio transcript delta - streaming text
elif event_type == "response.audio_transcript.delta":
if "delta" in data:
await self._emit("on_transcript", data["delta"], "assistant", False)
# Audio transcript done
elif event_type == "response.audio_transcript.done":
if "transcript" in data:
await self._emit("on_transcript", data["transcript"], "assistant", True)
# Input audio transcript (user speech)
elif event_type == "conversation.item.input_audio_transcription.completed":
if "transcript" in data:
await self._emit("on_transcript", data["transcript"], "user", True)
# Speech started (server VAD detected speech)
elif event_type == "input_audio_buffer.speech_started":
await self._emit("on_speech_started", data.get("audio_start_ms", 0))
# Speech stopped
elif event_type == "input_audio_buffer.speech_stopped":
await self._emit("on_speech_stopped", data.get("audio_end_ms", 0))
# Response started
elif event_type == "response.created":
await self._emit("on_response_started", data.get("response", {}))
# Response done
elif event_type == "response.done":
await self._emit("on_response_done", data.get("response", {}))
# Function call
elif event_type == "response.function_call_arguments.done":
call_id = data.get("call_id")
name = data.get("name")
arguments = data.get("arguments", "{}")
await self._emit("on_function_call", call_id, name, arguments)
# Error
elif event_type == "error":
error = data.get("error", {})
logger.error(f"Realtime API error: {error}")
await self._emit("on_error", error)
# Session events
elif event_type == "session.created":
logger.info("Session created")
elif event_type == "session.updated":
logger.debug("Session updated")
else:
logger.debug(f"Unhandled event type: {event_type}")
async def disconnect(self) -> None:
"""Disconnect from Realtime API."""
self._cancel_event.set()
if self._receive_task:
self._receive_task.cancel()
try:
await self._receive_task
except asyncio.CancelledError:
pass
if self._ws:
await self._ws.close()
self._ws = None
self.state = RealtimeState.DISCONNECTED
logger.info("Realtime API disconnected")
class RealtimePipeline:
"""
Pipeline adapter for RealtimeService.
Provides a compatible interface with DuplexPipeline but uses
OpenAI Realtime API for all processing.
"""
def __init__(
self,
transport,
session_id: str,
config: Optional[RealtimeConfig] = None
):
"""
Initialize Realtime pipeline.
Args:
transport: Transport for sending audio/events
session_id: Session identifier
config: Realtime configuration
"""
self.transport = transport
self.session_id = session_id
self.service = RealtimeService(config)
# Register callbacks
self.service.on("on_audio", self._on_audio)
self.service.on("on_transcript", self._on_transcript)
self.service.on("on_speech_started", self._on_speech_started)
self.service.on("on_speech_stopped", self._on_speech_stopped)
self.service.on("on_response_started", self._on_response_started)
self.service.on("on_response_done", self._on_response_done)
self.service.on("on_error", self._on_error)
self._is_speaking = False
self._running = True
logger.info(f"RealtimePipeline initialized for session {session_id}")
async def start(self) -> None:
"""Start the pipeline."""
await self.service.connect()
async def process_audio(self, pcm_bytes: bytes) -> None:
"""
Process incoming audio.
Note: Realtime API expects 24kHz audio by default.
You may need to resample from 16kHz.
"""
if not self._running:
return
# TODO: Resample from 16kHz to 24kHz if needed
await self.service.send_audio(pcm_bytes)
async def process_text(self, text: str) -> None:
"""Process text input."""
if not self._running:
return
await self.service.send_text(text)
async def interrupt(self) -> None:
"""Interrupt current response."""
await self.service.cancel_response()
await self.transport.send_event({
"event": "interrupt",
"trackId": self.session_id,
"timestamp": self._get_timestamp_ms()
})
async def cleanup(self) -> None:
"""Cleanup resources."""
self._running = False
await self.service.disconnect()
# Event handlers
async def _on_audio(self, audio_bytes: bytes) -> None:
"""Handle audio output."""
await self.transport.send_audio(audio_bytes)
async def _on_transcript(self, text: str, role: str, is_final: bool) -> None:
"""Handle transcript."""
logger.info(f"[{role.upper()}] {text[:50]}..." if len(text) > 50 else f"[{role.upper()}] {text}")
async def _on_speech_started(self, start_ms: int) -> None:
"""Handle user speech start."""
self._is_speaking = True
await self.transport.send_event({
"event": "speaking",
"trackId": self.session_id,
"timestamp": self._get_timestamp_ms(),
"startTime": start_ms
})
# Cancel any ongoing response (barge-in)
await self.service.cancel_response()
async def _on_speech_stopped(self, end_ms: int) -> None:
"""Handle user speech stop."""
self._is_speaking = False
await self.transport.send_event({
"event": "silence",
"trackId": self.session_id,
"timestamp": self._get_timestamp_ms(),
"duration": end_ms
})
async def _on_response_started(self, response: Dict) -> None:
"""Handle response start."""
await self.transport.send_event({
"event": "trackStart",
"trackId": self.session_id,
"timestamp": self._get_timestamp_ms()
})
async def _on_response_done(self, response: Dict) -> None:
"""Handle response complete."""
await self.transport.send_event({
"event": "trackEnd",
"trackId": self.session_id,
"timestamp": self._get_timestamp_ms()
})
async def _on_error(self, error: Dict) -> None:
"""Handle error."""
await self.transport.send_event({
"event": "error",
"trackId": self.session_id,
"timestamp": self._get_timestamp_ms(),
"sender": "realtime",
"error": str(error)
})
def _get_timestamp_ms(self) -> int:
"""Get current timestamp in milliseconds."""
import time
return int(time.time() * 1000)
@property
def is_speaking(self) -> bool:
"""Check if user is speaking."""
return self._is_speaking
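A usage sketch for RealtimeService exercising the text-input and barge-in paths this commit adds (illustrative; requires a valid OPENAI_API_KEY and network access, and the two-second pause is arbitrary):

import asyncio

from services.realtime import RealtimeConfig, RealtimeService


async def main() -> None:
    service = RealtimeService(RealtimeConfig(voice="alloy"))
    audio_chunks: list[bytes] = []

    async def on_audio(pcm: bytes) -> None:
        audio_chunks.append(pcm)

    async def on_transcript(text: str, role: str, is_final: bool) -> None:
        if is_final:
            print(f"[{role}] {text}")

    service.on("on_audio", on_audio)
    service.on("on_transcript", on_transcript)

    await service.connect()
    await service.send_text("Tell me a short joke.")
    await asyncio.sleep(2)  # collect some streamed audio
    await service.cancel_response()  # barge-in: cancel mid-response
    await service.disconnect()
    print(f"received {len(audio_chunks)} audio chunks before cancelling")


asyncio.run(main())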

services/siliconflow_tts.py Normal file

@@ -0,0 +1,255 @@
"""SiliconFlow TTS Service with streaming support.
Uses SiliconFlow's CosyVoice2 or MOSS-TTSD models for low-latency
text-to-speech synthesis with streaming.
API Docs: https://docs.siliconflow.cn/cn/api-reference/audio/create-speech
"""
import os
import asyncio
import aiohttp
from typing import AsyncIterator, Optional
from loguru import logger
from services.base import BaseTTSService, TTSChunk, ServiceState
class SiliconFlowTTSService(BaseTTSService):
"""
SiliconFlow TTS service with streaming support.
Supports CosyVoice2-0.5B and MOSS-TTSD-v0.5 models.
"""
# Available voices
VOICES = {
"alex": "FunAudioLLM/CosyVoice2-0.5B:alex",
"anna": "FunAudioLLM/CosyVoice2-0.5B:anna",
"bella": "FunAudioLLM/CosyVoice2-0.5B:bella",
"benjamin": "FunAudioLLM/CosyVoice2-0.5B:benjamin",
"charles": "FunAudioLLM/CosyVoice2-0.5B:charles",
"claire": "FunAudioLLM/CosyVoice2-0.5B:claire",
"david": "FunAudioLLM/CosyVoice2-0.5B:david",
"diana": "FunAudioLLM/CosyVoice2-0.5B:diana",
}
def __init__(
self,
api_key: Optional[str] = None,
voice: str = "anna",
model: str = "FunAudioLLM/CosyVoice2-0.5B",
sample_rate: int = 16000,
speed: float = 1.0
):
"""
Initialize SiliconFlow TTS service.
Args:
api_key: SiliconFlow API key (defaults to SILICONFLOW_API_KEY env var)
voice: Voice name (alex, anna, bella, benjamin, charles, claire, david, diana)
model: Model name
sample_rate: Output sample rate (8000, 16000, 24000, 32000, 44100)
speed: Speech speed (0.25 to 4.0)
"""
# Resolve voice name
if voice in self.VOICES:
full_voice = self.VOICES[voice]
else:
full_voice = voice
super().__init__(voice=full_voice, sample_rate=sample_rate, speed=speed)
self.api_key = api_key or os.getenv("SILICONFLOW_API_KEY")
self.model = model
self.api_url = "https://api.siliconflow.cn/v1/audio/speech"
self._session: Optional[aiohttp.ClientSession] = None
self._cancel_event = asyncio.Event()
async def connect(self) -> None:
"""Initialize HTTP session."""
if not self.api_key:
raise ValueError("SiliconFlow API key not provided. Set SILICONFLOW_API_KEY env var.")
self._session = aiohttp.ClientSession(
headers={
"Authorization": f"Bearer {self.api_key}",
"Content-Type": "application/json"
}
)
self.state = ServiceState.CONNECTED
logger.info(f"SiliconFlow TTS service ready: voice={self.voice}, model={self.model}")
async def disconnect(self) -> None:
"""Close HTTP session."""
if self._session:
await self._session.close()
self._session = None
self.state = ServiceState.DISCONNECTED
logger.info("SiliconFlow TTS service disconnected")
async def synthesize(self, text: str) -> bytes:
"""Synthesize complete audio for text."""
audio_data = b""
async for chunk in self.synthesize_stream(text):
audio_data += chunk.audio
return audio_data
async def synthesize_stream(self, text: str) -> AsyncIterator[TTSChunk]:
"""
Synthesize audio in streaming mode.
Args:
text: Text to synthesize
Yields:
TTSChunk objects with PCM audio
"""
if not self._session:
raise RuntimeError("TTS service not connected")
if not text.strip():
return
self._cancel_event.clear()
payload = {
"model": self.model,
"input": text,
"voice": self.voice,
"response_format": "pcm",
"sample_rate": self.sample_rate,
"stream": True,
"speed": self.speed
}
try:
async with self._session.post(self.api_url, json=payload) as response:
if response.status != 200:
error_text = await response.text()
logger.error(f"SiliconFlow TTS error: {response.status} - {error_text}")
return
# Stream audio chunks
chunk_size = self.sample_rate * 2 // 10 # 100ms chunks
buffer = b""
async for chunk in response.content.iter_any():
if self._cancel_event.is_set():
logger.info("TTS synthesis cancelled")
return
buffer += chunk
# Yield complete chunks
while len(buffer) >= chunk_size:
audio_chunk = buffer[:chunk_size]
buffer = buffer[chunk_size:]
yield TTSChunk(
audio=audio_chunk,
sample_rate=self.sample_rate,
is_final=False
)
# Yield remaining buffer
if buffer:
yield TTSChunk(
audio=buffer,
sample_rate=self.sample_rate,
is_final=True
)
except asyncio.CancelledError:
logger.info("TTS synthesis cancelled via asyncio")
raise
except Exception as e:
logger.error(f"TTS synthesis error: {e}")
raise
async def cancel(self) -> None:
"""Cancel ongoing synthesis."""
self._cancel_event.set()
class StreamingTTSAdapter:
"""
Adapter for streaming LLM text to TTS with sentence-level chunking.
This reduces latency by starting TTS as soon as a complete sentence
is received from the LLM, rather than waiting for the full response.
"""
# Sentence delimiters
    SENTENCE_ENDS = {'.', '!', '?', '。', '！', '？', '；', '\n'}  # ASCII and full-width CJK enders
def __init__(self, tts_service: BaseTTSService, transport, session_id: str):
self.tts_service = tts_service
self.transport = transport
self.session_id = session_id
self._buffer = ""
self._cancel_event = asyncio.Event()
self._is_speaking = False
async def process_text_chunk(self, text_chunk: str) -> None:
"""
Process a text chunk from LLM and trigger TTS when sentence is complete.
Args:
text_chunk: Text chunk from LLM streaming
"""
if self._cancel_event.is_set():
return
self._buffer += text_chunk
# Check for sentence completion
for i, char in enumerate(self._buffer):
if char in self.SENTENCE_ENDS:
# Found sentence end, synthesize up to this point
sentence = self._buffer[:i+1].strip()
self._buffer = self._buffer[i+1:]
if sentence:
await self._speak_sentence(sentence)
break
async def flush(self) -> None:
"""Flush remaining buffer."""
if self._buffer.strip() and not self._cancel_event.is_set():
await self._speak_sentence(self._buffer.strip())
self._buffer = ""
async def _speak_sentence(self, text: str) -> None:
"""Synthesize and send a sentence."""
if not text or self._cancel_event.is_set():
return
self._is_speaking = True
try:
async for chunk in self.tts_service.synthesize_stream(text):
if self._cancel_event.is_set():
break
await self.transport.send_audio(chunk.audio)
await asyncio.sleep(0.01) # Prevent flooding
except Exception as e:
logger.error(f"TTS speak error: {e}")
finally:
self._is_speaking = False
def cancel(self) -> None:
"""Cancel ongoing speech."""
self._cancel_event.set()
self._buffer = ""
def reset(self) -> None:
"""Reset for new turn."""
self._cancel_event.clear()
self._buffer = ""
self._is_speaking = False
@property
def is_speaking(self) -> bool:
return self._is_speaking
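To show the sentence-level chunking in action, a sketch (illustrative only; importing this module also requires aiohttp) that drives StreamingTTSAdapter with fake LLM chunks and a stub transport, using MockTTSService in place of SiliconFlowTTSService so it runs offline:

import asyncio

from services.siliconflow_tts import StreamingTTSAdapter
from services.tts import MockTTSService


class CountingTransport:
    """Stub transport that counts outgoing audio bytes."""

    def __init__(self) -> None:
        self.bytes_sent = 0

    async def send_audio(self, audio: bytes) -> None:
        self.bytes_sent += len(audio)


async def main() -> None:
    tts = MockTTSService()
    await tts.connect()
    transport = CountingTransport()
    adapter = StreamingTTSAdapter(tts, transport, session_id="demo")
    # Simulate LLM streaming: synthesis starts at the first sentence end.
    for chunk in ["Hello there", ". How are", " you today?"]:
        await adapter.process_text_chunk(chunk)
    await adapter.flush()  # speak whatever is left in the buffer
    print(f"sent {transport.bytes_sent} bytes of audio")
    await tts.disconnect()


asyncio.run(main())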

services/tts.py Normal file

@@ -0,0 +1,271 @@
"""TTS (Text-to-Speech) Service implementations.
Provides multiple TTS backend options including edge-tts (free)
and placeholder for cloud services.
"""
import io
import asyncio
from typing import AsyncIterator
from loguru import logger
from services.base import BaseTTSService, TTSChunk, ServiceState
# Try to import edge-tts
try:
import edge_tts
EDGE_TTS_AVAILABLE = True
except ImportError:
EDGE_TTS_AVAILABLE = False
logger.warning("edge-tts not available - EdgeTTS service will be disabled")
class EdgeTTSService(BaseTTSService):
"""
Microsoft Edge TTS service.
Uses edge-tts library for free, high-quality speech synthesis.
Supports streaming for low-latency playback.
"""
# Voice mapping for common languages
VOICE_MAP = {
"en": "en-US-JennyNeural",
"en-US": "en-US-JennyNeural",
"en-GB": "en-GB-SoniaNeural",
"zh": "zh-CN-XiaoxiaoNeural",
"zh-CN": "zh-CN-XiaoxiaoNeural",
"zh-TW": "zh-TW-HsiaoChenNeural",
"ja": "ja-JP-NanamiNeural",
"ko": "ko-KR-SunHiNeural",
"fr": "fr-FR-DeniseNeural",
"de": "de-DE-KatjaNeural",
"es": "es-ES-ElviraNeural",
}
def __init__(
self,
voice: str = "en-US-JennyNeural",
sample_rate: int = 16000,
speed: float = 1.0
):
"""
Initialize Edge TTS service.
Args:
voice: Voice name (e.g., "en-US-JennyNeural") or language code (e.g., "en")
sample_rate: Target sample rate (will be resampled)
speed: Speech speed multiplier
"""
# Resolve voice from language code if needed
if voice in self.VOICE_MAP:
voice = self.VOICE_MAP[voice]
super().__init__(voice=voice, sample_rate=sample_rate, speed=speed)
self._cancel_event = asyncio.Event()
async def connect(self) -> None:
"""Edge TTS doesn't require explicit connection."""
if not EDGE_TTS_AVAILABLE:
raise RuntimeError("edge-tts package not installed")
self.state = ServiceState.CONNECTED
logger.info(f"Edge TTS service ready: voice={self.voice}")
async def disconnect(self) -> None:
"""Edge TTS doesn't require explicit disconnection."""
self.state = ServiceState.DISCONNECTED
logger.info("Edge TTS service disconnected")
def _get_rate_string(self) -> str:
"""Convert speed to rate string for edge-tts."""
# edge-tts uses percentage format: "+0%", "-10%", "+20%"
percentage = int((self.speed - 1.0) * 100)
if percentage >= 0:
return f"+{percentage}%"
return f"{percentage}%"
async def synthesize(self, text: str) -> bytes:
"""
Synthesize complete audio for text.
Args:
text: Text to synthesize
Returns:
PCM audio data (16-bit, mono, 16kHz)
"""
if not EDGE_TTS_AVAILABLE:
raise RuntimeError("edge-tts not available")
# Collect all chunks
audio_data = b""
async for chunk in self.synthesize_stream(text):
audio_data += chunk.audio
return audio_data
async def synthesize_stream(self, text: str) -> AsyncIterator[TTSChunk]:
"""
Synthesize audio in streaming mode.
Args:
text: Text to synthesize
Yields:
TTSChunk objects with PCM audio
"""
if not EDGE_TTS_AVAILABLE:
raise RuntimeError("edge-tts not available")
self._cancel_event.clear()
try:
communicate = edge_tts.Communicate(
text,
voice=self.voice,
rate=self._get_rate_string()
)
# edge-tts outputs MP3, we need to decode to PCM
# For now, collect MP3 chunks and yield after conversion
mp3_data = b""
async for chunk in communicate.stream():
# Check for cancellation
if self._cancel_event.is_set():
logger.info("TTS synthesis cancelled")
return
if chunk["type"] == "audio":
mp3_data += chunk["data"]
# Convert MP3 to PCM
if mp3_data:
pcm_data = await self._convert_mp3_to_pcm(mp3_data)
if pcm_data:
# Yield in chunks for streaming playback
chunk_size = self.sample_rate * 2 // 10 # 100ms chunks
for i in range(0, len(pcm_data), chunk_size):
if self._cancel_event.is_set():
return
chunk_data = pcm_data[i:i + chunk_size]
yield TTSChunk(
audio=chunk_data,
sample_rate=self.sample_rate,
is_final=(i + chunk_size >= len(pcm_data))
)
except asyncio.CancelledError:
logger.info("TTS synthesis cancelled via asyncio")
raise
except Exception as e:
logger.error(f"TTS synthesis error: {e}")
raise
async def _convert_mp3_to_pcm(self, mp3_data: bytes) -> bytes:
"""
Convert MP3 audio to PCM.
Uses pydub or ffmpeg for conversion.
"""
try:
# Try using pydub (requires ffmpeg)
from pydub import AudioSegment
# Load MP3 from bytes
audio = AudioSegment.from_mp3(io.BytesIO(mp3_data))
# Convert to target format
audio = audio.set_frame_rate(self.sample_rate)
audio = audio.set_channels(1)
audio = audio.set_sample_width(2) # 16-bit
# Export as raw PCM
return audio.raw_data
except ImportError:
logger.warning("pydub not available, trying fallback")
# Fallback: Use subprocess to call ffmpeg directly
return await self._ffmpeg_convert(mp3_data)
except Exception as e:
logger.error(f"Audio conversion error: {e}")
return b""
async def _ffmpeg_convert(self, mp3_data: bytes) -> bytes:
"""Convert MP3 to PCM using ffmpeg subprocess."""
try:
process = await asyncio.create_subprocess_exec(
"ffmpeg",
"-i", "pipe:0",
"-f", "s16le",
"-acodec", "pcm_s16le",
"-ar", str(self.sample_rate),
"-ac", "1",
"pipe:1",
stdin=asyncio.subprocess.PIPE,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.DEVNULL
)
stdout, _ = await process.communicate(input=mp3_data)
return stdout
except Exception as e:
logger.error(f"ffmpeg conversion error: {e}")
return b""
async def cancel(self) -> None:
"""Cancel ongoing synthesis."""
self._cancel_event.set()
class MockTTSService(BaseTTSService):
"""
Mock TTS service for testing without actual synthesis.
Generates silence or simple tones.
"""
def __init__(
self,
voice: str = "mock",
sample_rate: int = 16000,
speed: float = 1.0
):
super().__init__(voice=voice, sample_rate=sample_rate, speed=speed)
async def connect(self) -> None:
self.state = ServiceState.CONNECTED
logger.info("Mock TTS service connected")
async def disconnect(self) -> None:
self.state = ServiceState.DISCONNECTED
logger.info("Mock TTS service disconnected")
async def synthesize(self, text: str) -> bytes:
"""Generate silence based on text length."""
# Approximate: 100ms per word
word_count = len(text.split())
duration_ms = word_count * 100
samples = int(self.sample_rate * duration_ms / 1000)
# Generate silence (zeros)
return bytes(samples * 2) # 16-bit = 2 bytes per sample
async def synthesize_stream(self, text: str) -> AsyncIterator[TTSChunk]:
"""Generate silence chunks."""
audio = await self.synthesize(text)
# Yield in 100ms chunks
chunk_size = self.sample_rate * 2 // 10
for i in range(0, len(audio), chunk_size):
chunk_data = audio[i:i + chunk_size]
yield TTSChunk(
audio=chunk_data,
sample_rate=self.sample_rate,
is_final=(i + chunk_size >= len(audio))
)
await asyncio.sleep(0.05) # Simulate processing time
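Finally, a sketch (illustrative; needs the edge-tts package plus pydub or ffmpeg, and hello.wav is an arbitrary output name) that saves EdgeTTSService output as a playable WAV file:

import asyncio
import wave

from services.tts import EdgeTTSService


async def main() -> None:
    tts = EdgeTTSService(voice="en", sample_rate=16000)  # "en" maps to en-US-JennyNeural
    await tts.connect()
    pcm = await tts.synthesize("Hello from Edge TTS.")
    await tts.disconnect()

    with wave.open("hello.wav", "wb") as wav:
        wav.setnchannels(1)   # mono
        wav.setsampwidth(2)   # 16-bit samples
        wav.setframerate(16000)
        wav.writeframes(pcm)
    print(f"wrote {len(pcm)} bytes of PCM to hello.wav")


asyncio.run(main())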