Refactor project structure and enhance backend integration

- Expanded package inclusion in `pyproject.toml` to support new modules.
- Introduced new `adapters` and `protocol` packages for better organization.
- Added backend adapter implementations for control plane integration.
- Updated main application imports to reflect new package structure.
- Removed deprecated core components and adjusted documentation accordingly.
- Enhanced architecture documentation to clarify the new runtime and integration layers.
This commit is contained in:
Xin Wang
2026-03-06 09:51:56 +08:00
parent 4e2450e800
commit 7e0b777923
75 changed files with 274 additions and 688 deletions

View File

@@ -0,0 +1 @@
"""Providers package."""

View File

@@ -0,0 +1 @@
"""ASR providers."""

View File

@@ -0,0 +1,147 @@
"""ASR (Automatic Speech Recognition) Service implementations.
Provides speech-to-text capabilities with streaming support.
"""
import os
import asyncio
import json
from typing import AsyncIterator, Optional
from loguru import logger
from providers.common.base import BaseASRService, ASRResult, ServiceState
# Try to import websockets for streaming ASR
try:
import websockets
WEBSOCKETS_AVAILABLE = True
except ImportError:
WEBSOCKETS_AVAILABLE = False
class BufferedASRService(BaseASRService):
    """
    Buffered ASR service that accumulates audio and provides
    a simple text accumulator for use with EOU detection.

    This is a lightweight implementation that works with the
    existing VAD + EOU pattern without requiring external ASR.
    """

    def __init__(
        self,
        sample_rate: int = 16000,
        language: str = "en"
    ):
        super().__init__(sample_rate=sample_rate, language=language)
        # Raw PCM bytes accumulated since the last clear.
        self._audio_buffer: bytes = b""
        # Latest transcript text supplied via set_text().
        self._current_text: str = ""
        # Queue drained by receive_transcripts().
        self._transcript_queue: asyncio.Queue[ASRResult] = asyncio.Queue()

    async def connect(self) -> None:
        """No connection needed for buffered ASR."""
        self.state = ServiceState.CONNECTED
        logger.info("Buffered ASR service connected")

    async def disconnect(self) -> None:
        """Clear buffers on disconnect."""
        self._audio_buffer = b""
        self._current_text = ""
        self.state = ServiceState.DISCONNECTED
        logger.info("Buffered ASR service disconnected")

    async def send_audio(self, audio: bytes) -> None:
        """Buffer audio for later processing."""
        self._audio_buffer += audio

    async def receive_transcripts(self) -> AsyncIterator[ASRResult]:
        """Yield transcription results as they are queued."""
        while True:
            try:
                result = await asyncio.wait_for(
                    self._transcript_queue.get(),
                    timeout=0.1
                )
                yield result
            except asyncio.TimeoutError:
                # Poll again so cancellation is observed promptly.
                continue
            except asyncio.CancelledError:
                break

    def set_text(self, text: str) -> None:
        """
        Set the current transcript text directly.

        This allows external integration (e.g., Whisper, other ASR)
        to provide transcripts.

        Args:
            text: Interim transcript text to publish.
        """
        self._current_text = text
        result = ASRResult(text=text, is_final=False)
        # put_nowait never raises QueueFull on an unbounded queue and,
        # unlike asyncio.create_task, neither requires a running event
        # loop nor risks the un-referenced task being garbage-collected
        # before it runs.
        self._transcript_queue.put_nowait(result)

    def get_and_clear_text(self) -> str:
        """Get accumulated text and clear both text and audio buffers."""
        text = self._current_text
        self._current_text = ""
        self._audio_buffer = b""
        return text

    def get_audio_buffer(self) -> bytes:
        """Get accumulated audio buffer."""
        return self._audio_buffer

    def clear_audio_buffer(self) -> None:
        """Clear audio buffer."""
        self._audio_buffer = b""
class MockASRService(BaseASRService):
    """
    Mock ASR service for testing without actual recognition.

    Transcripts are produced on demand via trigger_transcript(),
    cycling through a fixed list of canned phrases.
    """

    def __init__(self, sample_rate: int = 16000, language: str = "en"):
        super().__init__(sample_rate=sample_rate, language=language)
        self._transcript_queue: asyncio.Queue[ASRResult] = asyncio.Queue()
        # Canned phrases cycled by trigger_transcript().
        self._mock_texts = [
            "Hello, how are you?",
            "That's interesting.",
            "Tell me more about that.",
            "I understand.",
        ]
        self._text_index = 0

    async def connect(self) -> None:
        """Mark the service connected (no real connection is made)."""
        self.state = ServiceState.CONNECTED
        logger.info("Mock ASR service connected")

    async def disconnect(self) -> None:
        """Mark the service disconnected."""
        self.state = ServiceState.DISCONNECTED
        logger.info("Mock ASR service disconnected")

    async def send_audio(self, audio: bytes) -> None:
        """Mock audio processing - incoming audio is ignored."""
        pass

    def trigger_transcript(self) -> None:
        """Manually trigger a transcript (for testing)."""
        text = self._mock_texts[self._text_index % len(self._mock_texts)]
        self._text_index += 1
        result = ASRResult(text=text, is_final=True, confidence=0.95)
        # put_nowait is safe on an unbounded queue and avoids the
        # asyncio.create_task pitfalls (needs a running loop; the
        # un-referenced task may be garbage-collected before running).
        self._transcript_queue.put_nowait(result)

    async def receive_transcripts(self) -> AsyncIterator[ASRResult]:
        """Yield transcription results as they are queued."""
        while True:
            try:
                result = await asyncio.wait_for(
                    self._transcript_queue.get(),
                    timeout=0.1
                )
                yield result
            except asyncio.TimeoutError:
                continue
            except asyncio.CancelledError:
                break

View File

@@ -0,0 +1,353 @@
"""OpenAI-compatible ASR (Automatic Speech Recognition) Service.
Uses the SiliconFlow API for speech-to-text transcription.
API: https://docs.siliconflow.cn/cn/api-reference/audio/create-audio-transcriptions
"""
import asyncio
import io
import os
import wave
from typing import AsyncIterator, Optional, Callable, Awaitable
from urllib.parse import urlparse, urlunparse
from loguru import logger
try:
import aiohttp
AIOHTTP_AVAILABLE = True
except ImportError:
AIOHTTP_AVAILABLE = False
logger.warning("aiohttp not available - OpenAICompatibleASRService will not work")
from providers.common.base import BaseASRService, ASRResult, ServiceState
class OpenAICompatibleASRService(BaseASRService):
    """
    OpenAI-compatible ASR service for speech-to-text transcription.

    Features:
    - Buffers incoming audio chunks
    - Provides interim transcriptions periodically (for streaming to client)
    - Final transcription on EOU

    API Details:
    - Endpoint: POST https://api.siliconflow.cn/v1/audio/transcriptions
    - Models: FunAudioLLM/SenseVoiceSmall (default), TeleAI/TeleSpeechASR
    - Input: Audio file (multipart/form-data)
    - Output: {"text": "transcribed text"}
    """

    # Supported models: short alias -> full model id.
    MODELS = {
        "sensevoice": "FunAudioLLM/SenseVoiceSmall",
        "telespeech": "TeleAI/TeleSpeechASR",
    }
    # Default endpoint when neither the argument nor ASR_API_URL is set.
    API_URL = "https://api.siliconflow.cn/v1/audio/transcriptions"

    def __init__(
        self,
        api_key: Optional[str] = None,
        api_url: Optional[str] = None,
        model: str = "FunAudioLLM/SenseVoiceSmall",
        sample_rate: int = 16000,
        language: str = "auto",
        interim_interval_ms: int = 500,  # How often to send interim results
        min_audio_for_interim_ms: int = 300,  # Min audio before first interim
        on_transcript: Optional[Callable[[str, bool], Awaitable[None]]] = None
    ):
        """
        Initialize OpenAI-compatible ASR service.

        Args:
            api_key: Provider API key
            api_url: Provider API URL (defaults to SiliconFlow endpoint);
                may be a base URL or the full transcriptions endpoint
            model: ASR model name or alias (see MODELS)
            sample_rate: Audio sample rate (16000 recommended)
            language: Language code (auto for automatic detection)
            interim_interval_ms: How often to generate interim transcriptions
            min_audio_for_interim_ms: Minimum audio duration before first interim
            on_transcript: Callback for transcription results (text, is_final)

        Raises:
            RuntimeError: when aiohttp is not importable.
        """
        super().__init__(sample_rate=sample_rate, language=language)
        if not AIOHTTP_AVAILABLE:
            raise RuntimeError("aiohttp is required for OpenAICompatibleASRService")
        self.api_key = api_key
        # URL precedence: explicit argument > ASR_API_URL env var > default.
        raw_api_url = api_url or os.getenv("ASR_API_URL") or self.API_URL
        self.api_url = self._resolve_transcriptions_endpoint(raw_api_url)
        # Accept either an alias (case-insensitive) or a full model id.
        self.model = self.MODELS.get(model.lower(), model)
        self.interim_interval_ms = interim_interval_ms
        self.min_audio_for_interim_ms = min_audio_for_interim_ms
        self.on_transcript = on_transcript
        # Session (created in connect(), closed in disconnect())
        self._session: Optional[aiohttp.ClientSession] = None
        # Audio buffer: raw PCM bytes, assumed 16-bit mono at sample_rate
        self._audio_buffer: bytes = b""
        self._current_text: str = ""
        self._last_interim_time: float = 0
        # Transcript queue for async iteration
        self._transcript_queue: asyncio.Queue[ASRResult] = asyncio.Queue()
        # Background task for interim results
        self._interim_task: Optional[asyncio.Task] = None
        self._running = False
        logger.info(f"OpenAICompatibleASRService initialized with model: {self.model}")

    @staticmethod
    def _resolve_transcriptions_endpoint(api_url: str) -> str:
        """
        Accept either:
        - base URL: https://host/v1
        - full endpoint: https://host/v1/audio/transcriptions
        and always return the final transcriptions endpoint URL.
        """
        raw = str(api_url or "").strip()
        if not raw:
            # Nothing configured: fall back to the class default.
            return OpenAICompatibleASRService.API_URL
        parsed = urlparse(raw)
        path = (parsed.path or "").rstrip("/")
        if path.endswith("/audio/transcriptions"):
            # Already a full endpoint; return as given.
            return raw
        if not path:
            new_path = "/audio/transcriptions"
        else:
            new_path = f"{path}/audio/transcriptions"
        # Preserve scheme/host/query via the parsed components.
        return urlunparse(parsed._replace(path=new_path))

    async def connect(self) -> None:
        """Create the aiohttp session used for transcription requests.

        Raises:
            ValueError: when no API key was configured.
        """
        if not self.api_key:
            raise ValueError("ASR API key not provided. Configure agent.asr.api_key in YAML.")
        self._session = aiohttp.ClientSession(
            headers={
                "Authorization": f"Bearer {self.api_key}"
            }
        )
        self._running = True
        self.state = ServiceState.CONNECTED
        logger.info("OpenAICompatibleASRService connected")

    async def disconnect(self) -> None:
        """Disconnect and cleanup."""
        self._running = False
        # Stop the interim task first so it cannot use the session
        # while the session is being closed.
        if self._interim_task:
            self._interim_task.cancel()
            try:
                await self._interim_task
            except asyncio.CancelledError:
                pass
            self._interim_task = None
        if self._session:
            await self._session.close()
            self._session = None
        self._audio_buffer = b""
        self._current_text = ""
        self.state = ServiceState.DISCONNECTED
        logger.info("OpenAICompatibleASRService disconnected")

    async def send_audio(self, audio: bytes) -> None:
        """
        Buffer incoming audio data.

        Args:
            audio: PCM audio data (16-bit, mono)
        """
        self._audio_buffer += audio

    async def transcribe_buffer(self, is_final: bool = False) -> Optional[str]:
        """
        Transcribe current audio buffer.

        Args:
            is_final: Whether this is the final transcription

        Returns:
            Transcribed text, or None if not enough audio / on error
        """
        if not self._session:
            logger.warning("ASR session not connected")
            return None
        # Check minimum audio duration (bytes -> ms at 2 bytes/sample mono).
        audio_duration_ms = len(self._audio_buffer) / (self.sample_rate * 2) * 1000
        if not is_final and audio_duration_ms < self.min_audio_for_interim_ms:
            return None
        if audio_duration_ms < 100:  # Less than 100ms - too short
            return None
        try:
            # Convert PCM to WAV in memory
            wav_buffer = io.BytesIO()
            with wave.open(wav_buffer, 'wb') as wav_file:
                wav_file.setnchannels(1)
                wav_file.setsampwidth(2)  # 16-bit
                wav_file.setframerate(self.sample_rate)
                wav_file.writeframes(self._audio_buffer)
            wav_buffer.seek(0)
            wav_data = wav_buffer.read()
            # Send to API as multipart/form-data
            form_data = aiohttp.FormData()
            form_data.add_field(
                'file',
                wav_data,
                filename='audio.wav',
                content_type='audio/wav'
            )
            form_data.add_field('model', self.model)
            async with self._session.post(self.api_url, data=form_data) as response:
                if response.status == 200:
                    result = await response.json()
                    text = result.get("text", "").strip()
                    if text:
                        self._current_text = text
                        # Notify via callback
                        if self.on_transcript:
                            await self.on_transcript(text, is_final)
                        # Queue result
                        await self._transcript_queue.put(
                            ASRResult(text=text, is_final=is_final)
                        )
                    logger.debug(f"ASR {'final' if is_final else 'interim'}: {text[:50]}...")
                    return text
                else:
                    error_text = await response.text()
                    logger.error(f"ASR API error {response.status}: {error_text}")
                    return None
        except Exception as e:
            # Best-effort: log and let the caller fall back (see
            # get_final_transcription).
            logger.error(f"ASR transcription error: {e}")
            return None

    async def get_final_transcription(self) -> str:
        """
        Get final transcription and clear buffer.
        Call this when EOU is detected.

        Returns:
            Final transcribed text (falls back to the last interim text
            when the final request fails or yields nothing)
        """
        # Transcribe full buffer as final
        text = await self.transcribe_buffer(is_final=True)
        # Clear buffer
        result = text or self._current_text
        self._audio_buffer = b""
        self._current_text = ""
        return result

    def get_and_clear_text(self) -> str:
        """
        Get accumulated text and clear buffer.
        Compatible with BufferedASRService interface.
        """
        text = self._current_text
        self._current_text = ""
        self._audio_buffer = b""
        return text

    def get_audio_buffer(self) -> bytes:
        """Get current audio buffer."""
        return self._audio_buffer

    def get_audio_duration_ms(self) -> float:
        """Get current audio buffer duration in milliseconds."""
        # 2 bytes per sample (16-bit PCM, mono).
        return len(self._audio_buffer) / (self.sample_rate * 2) * 1000

    def clear_buffer(self) -> None:
        """Clear audio and text buffers."""
        self._audio_buffer = b""
        self._current_text = ""

    async def receive_transcripts(self) -> AsyncIterator[ASRResult]:
        """
        Async iterator for transcription results.

        Yields:
            ASRResult with text and is_final flag
        """
        while self._running:
            try:
                result = await asyncio.wait_for(
                    self._transcript_queue.get(),
                    timeout=0.1
                )
                yield result
            except asyncio.TimeoutError:
                # Re-check _running so the iterator stops after disconnect.
                continue
            except asyncio.CancelledError:
                break

    async def start_interim_transcription(self) -> None:
        """
        Start background task for interim transcriptions.

        This periodically transcribes buffered audio for
        real-time feedback to the user.
        """
        if self._interim_task and not self._interim_task.done():
            # Already running; do not spawn a second loop.
            return
        self._interim_task = asyncio.create_task(self._interim_loop())

    async def stop_interim_transcription(self) -> None:
        """Stop interim transcription task."""
        if self._interim_task:
            self._interim_task.cancel()
            try:
                await self._interim_task
            except asyncio.CancelledError:
                pass
            self._interim_task = None

    async def _interim_loop(self) -> None:
        """Background loop for interim transcriptions."""
        import time
        while self._running:
            try:
                await asyncio.sleep(self.interim_interval_ms / 1000)
                # Check if we have enough new audio
                current_time = time.time()
                time_since_last = (current_time - self._last_interim_time) * 1000
                if time_since_last >= self.interim_interval_ms:
                    audio_duration = self.get_audio_duration_ms()
                    if audio_duration >= self.min_audio_for_interim_ms:
                        await self.transcribe_buffer(is_final=False)
                        self._last_interim_time = current_time
            except asyncio.CancelledError:
                break
            except Exception as e:
                # Keep the loop alive on transient errors.
                logger.error(f"Interim transcription error: {e}")


# Backward-compatible alias
SiliconFlowASRService = OpenAICompatibleASRService

View File

@@ -0,0 +1,8 @@
"""Backward-compatible imports for legacy siliconflow_asr module."""
from providers.asr.openai_compatible import OpenAICompatibleASRService
# Backward-compatible alias
SiliconFlowASRService = OpenAICompatibleASRService
__all__ = ["OpenAICompatibleASRService", "SiliconFlowASRService"]

View File

@@ -0,0 +1 @@
"""Common provider types."""

View File

@@ -0,0 +1,253 @@
"""Base classes for AI services.
Defines abstract interfaces for ASR, LLM, and TTS services,
inspired by pipecat's service architecture and active-call's
StreamEngine pattern.
"""
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from typing import AsyncIterator, Optional, List, Dict, Any, Literal
from enum import Enum
class ServiceState(Enum):
    """Service connection state."""
    DISCONNECTED = "disconnected"  # no active connection
    CONNECTING = "connecting"      # connection attempt in progress
    CONNECTED = "connected"        # ready to serve requests
    ERROR = "error"                # last operation failed
@dataclass
class ASRResult:
    """One (partial or final) recognizer hypothesis.

    Carries the transcript text plus optional confidence, language and
    timing metadata.
    """
    text: str
    is_final: bool = False
    confidence: float = 1.0
    language: Optional[str] = None
    start_time: Optional[float] = None
    end_time: Optional[float] = None

    def __str__(self) -> str:
        # Render as "[FINAL] ..." or "[PARTIAL] ..." for log readability.
        label = "FINAL" if self.is_final else "PARTIAL"
        return "[{}] {}".format(label, self.text)
@dataclass
class LLMMessage:
    """LLM conversation message."""
    role: str  # "system", "user", "assistant", "function"
    content: str
    name: Optional[str] = None  # For function calls
    function_call: Optional[Dict[str, Any]] = None

    def to_dict(self) -> Dict[str, Any]:
        """Convert to API-compatible dict (optional fields omitted when unset)."""
        payload: Dict[str, Any] = {"role": self.role, "content": self.content}
        for key, value in (("name", self.name), ("function_call", self.function_call)):
            if value:
                payload[key] = value
        return payload
@dataclass
class LLMStreamEvent:
    """Structured LLM stream event."""
    # "text_delta": incremental text in `text`;
    # "tool_call": a tool invocation payload in `tool_call`;
    # "done": end of the stream.
    type: Literal["text_delta", "tool_call", "done"]
    text: Optional[str] = None
    tool_call: Optional[Dict[str, Any]] = None
@dataclass
class TTSChunk:
    """TTS audio chunk."""
    audio: bytes  # PCM audio data
    sample_rate: int = 16000
    channels: int = 1
    bits_per_sample: int = 16
    is_final: bool = False  # True on the last chunk of an utterance
    text_offset: Optional[int] = None  # Character offset in original text
class BaseASRService(ABC):
    """
    Abstract base class for ASR (Speech-to-Text) services.

    Supports both streaming and non-streaming transcription.
    Implementations manage their connection lifecycle and report it
    via ``self.state``.
    """

    def __init__(self, sample_rate: int = 16000, language: str = "en"):
        # Expected input format: PCM 16-bit mono at `sample_rate`
        # (see send_audio).
        self.sample_rate = sample_rate
        self.language = language
        self.state = ServiceState.DISCONNECTED

    @abstractmethod
    async def connect(self) -> None:
        """Establish connection to ASR service."""
        pass

    @abstractmethod
    async def disconnect(self) -> None:
        """Close connection to ASR service."""
        pass

    @abstractmethod
    async def send_audio(self, audio: bytes) -> None:
        """
        Send audio chunk for transcription.

        Args:
            audio: PCM audio data (16-bit, mono)
        """
        pass

    @abstractmethod
    async def receive_transcripts(self) -> AsyncIterator[ASRResult]:
        """
        Receive transcription results.

        Yields:
            ASRResult objects as they become available
        """
        pass

    async def transcribe(self, audio: bytes) -> ASRResult:
        """
        Transcribe a complete audio buffer (non-streaming).

        Args:
            audio: Complete PCM audio data

        Returns:
            Final ASRResult

        NOTE(review): relies on the implementation eventually yielding a
        final result; if it never does, this awaits indefinitely.
        """
        # Default implementation using streaming
        await self.send_audio(audio)
        async for result in self.receive_transcripts():
            if result.is_final:
                return result
        return ASRResult(text="", is_final=True)
class BaseLLMService(ABC):
    """
    Abstract base class for LLM (Language Model) services.

    Supports streaming responses for real-time conversation.
    """

    def __init__(self, model: str = "gpt-4"):
        self.model = model
        self.state = ServiceState.DISCONNECTED

    @abstractmethod
    async def connect(self) -> None:
        """Initialize LLM service connection."""
        pass

    @abstractmethod
    async def disconnect(self) -> None:
        """Close LLM service connection."""
        pass

    @abstractmethod
    async def generate(
        self,
        messages: List[LLMMessage],
        temperature: float = 0.7,
        max_tokens: Optional[int] = None
    ) -> str:
        """
        Generate a complete response.

        Args:
            messages: Conversation history
            temperature: Sampling temperature
            max_tokens: Maximum tokens to generate

        Returns:
            Complete response text
        """
        pass

    @abstractmethod
    async def generate_stream(
        self,
        messages: List[LLMMessage],
        temperature: float = 0.7,
        max_tokens: Optional[int] = None
    ) -> AsyncIterator[LLMStreamEvent]:
        """
        Generate response in streaming mode.

        Args:
            messages: Conversation history
            temperature: Sampling temperature
            max_tokens: Maximum tokens to generate

        Yields:
            Stream events (text delta/tool call/done)
        """
        pass
class BaseTTSService(ABC):
    """
    Abstract base class for TTS (Text-to-Speech) services.

    Supports streaming audio synthesis for low-latency playback.
    """

    def __init__(
        self,
        voice: str = "default",
        sample_rate: int = 16000,
        speed: float = 1.0
    ):
        self.voice = voice
        self.sample_rate = sample_rate
        # Playback speed multiplier (1.0 = normal).
        self.speed = speed
        self.state = ServiceState.DISCONNECTED

    @abstractmethod
    async def connect(self) -> None:
        """Initialize TTS service connection."""
        pass

    @abstractmethod
    async def disconnect(self) -> None:
        """Close TTS service connection."""
        pass

    @abstractmethod
    async def synthesize(self, text: str) -> bytes:
        """
        Synthesize complete audio for text (non-streaming).

        Args:
            text: Text to synthesize

        Returns:
            Complete PCM audio data
        """
        pass

    @abstractmethod
    async def synthesize_stream(self, text: str) -> AsyncIterator[TTSChunk]:
        """
        Synthesize audio in streaming mode.

        Args:
            text: Text to synthesize

        Yields:
            TTSChunk objects as audio is generated
        """
        pass

    async def cancel(self) -> None:
        """Cancel ongoing synthesis (for barge-in support).

        Default implementation is a no-op; override where the backend
        supports mid-stream cancellation.
        """
        pass

View File

@@ -0,0 +1,86 @@
"""Shared text chunking helpers for streaming TTS."""
from typing import Optional
def is_non_sentence_period(text: str, idx: int) -> bool:
    """Check whether the '.' at *idx* should NOT end a sentence.

    Returns True for periods inside decimal/version numbers (1.2, v1.2.3)
    and for the "No." number abbreviation (No.1, No. 1); False otherwise,
    including when *idx* is out of range or not a period.
    """
    if not (0 <= idx < len(text)) or text[idx] != ".":
        return False
    # Decimal / version segment: digit on both sides of the dot.
    if 0 < idx < len(text) - 1 and text[idx - 1].isdigit() and text[idx + 1].isdigit():
        return True
    # "No." abbreviation: alphabetic run before the dot spells "no"
    # and a digit follows (optionally after whitespace).
    start = idx
    while start > 0 and text[start - 1].isalpha():
        start -= 1
    if text[start:idx].lower() == "no":
        nxt = idx + 1
        while nxt < len(text) and text[nxt].isspace():
            nxt += 1
        if nxt < len(text) and text[nxt].isdigit():
            return True
    return False
def has_spoken_content(text: str) -> bool:
    """Check whether text contains pronounceable content (not punctuation-only)."""
    for char in text:
        if char.isalnum():
            return True
    return False
def extract_tts_sentence(
    text_buffer: str,
    *,
    end_chars: frozenset[str],
    trailing_chars: frozenset[str],
    closers: frozenset[str],
    min_split_spoken_chars: int = 0,
    hold_trailing_at_buffer_end: bool = False,
    force: bool = False,
) -> Optional[tuple[str, str]]:
    """Extract one TTS sentence from text buffer.

    Args:
        text_buffer: Accumulated streaming text to split.
        end_chars: Sentence-ending delimiters (e.g. '.', '!', '?').
        trailing_chars: Characters absorbed immediately after the delimiter.
        closers: Closing quotes/brackets absorbed after trailing chars.
        min_split_spoken_chars: If > 0, delay the split until the candidate
            sentence has at least this many alphanumeric characters.
        hold_trailing_at_buffer_end: When True, do not split if the sentence
            ends exactly at the buffer end (more punctuation may still arrive).
        force: Override the two holds above and split anyway.

    Returns:
        (sentence, remainder) on a split, or None when no split should
        happen yet.
    """
    if not text_buffer:
        return None
    search_start = 0
    while True:
        # Find the next sentence delimiter, skipping periods that are part
        # of numbers/abbreviations (see is_non_sentence_period).
        split_idx = -1
        for idx in range(search_start, len(text_buffer)):
            char = text_buffer[idx]
            if char == "." and is_non_sentence_period(text_buffer, idx):
                continue
            if char in end_chars:
                split_idx = idx
                break
        if split_idx == -1:
            return None
        # Absorb trailing punctuation first, then closing quotes/brackets.
        end_idx = split_idx + 1
        while end_idx < len(text_buffer) and text_buffer[end_idx] in trailing_chars:
            end_idx += 1
        while end_idx < len(text_buffer) and text_buffer[end_idx] in closers:
            end_idx += 1
        # Sentence ends at the buffer edge: hold, more may follow.
        if hold_trailing_at_buffer_end and not force and end_idx >= len(text_buffer):
            return None
        sentence = text_buffer[:end_idx].strip()
        spoken_chars = sum(1 for ch in sentence if ch.isalnum())
        # Candidate too short to speak on its own: move the search past this
        # delimiter so the sentence keeps accumulating.
        if (
            not force
            and min_split_spoken_chars > 0
            and 0 < spoken_chars < min_split_spoken_chars
            and end_idx < len(text_buffer)
        ):
            search_start = end_idx
            continue
        remainder = text_buffer[end_idx:]
        return sentence, remainder

View File

@@ -0,0 +1 @@
"""Provider factories."""

View File

@@ -0,0 +1,112 @@
"""Default runtime service factory implementing core extension ports."""
from __future__ import annotations
from typing import Any
from loguru import logger
from runtime.ports import (
ASRPort,
ASRServiceSpec,
LLMPort,
LLMServiceSpec,
RealtimeServiceFactory,
TTSPort,
TTSServiceSpec,
)
from providers.asr.buffered import BufferedASRService
from providers.tts.dashscope import DashScopeTTSService
from providers.llm.openai import MockLLMService, OpenAILLMService
from providers.asr.openai_compatible import OpenAICompatibleASRService
from providers.tts.openai_compatible import OpenAICompatibleTTSService
from providers.tts.mock import MockTTSService
_OPENAI_COMPATIBLE_PROVIDERS = {"openai_compatible", "openai-compatible", "siliconflow"}
_SUPPORTED_LLM_PROVIDERS = {"openai", *_OPENAI_COMPATIBLE_PROVIDERS}
class DefaultRealtimeServiceFactory(RealtimeServiceFactory):
    """Build concrete runtime services from normalized specs.

    Each create_* method dispatches on the normalized provider name and
    falls back to a mock/buffered implementation when the provider is
    unsupported or no API key is configured.
    """

    _DEFAULT_DASHSCOPE_TTS_REALTIME_URL = "wss://dashscope.aliyuncs.com/api-ws/v1/realtime"
    _DEFAULT_DASHSCOPE_TTS_MODEL = "qwen3-tts-flash-realtime"
    _DEFAULT_OPENAI_COMPATIBLE_TTS_MODEL = "FunAudioLLM/CosyVoice2-0.5B"
    _DEFAULT_OPENAI_COMPATIBLE_ASR_MODEL = "FunAudioLLM/SenseVoiceSmall"

    @staticmethod
    def _normalize_provider(provider: Any) -> str:
        # None and non-string values normalize to "".
        return str(provider or "").strip().lower()

    @staticmethod
    def _resolve_dashscope_mode(raw_mode: Any) -> str:
        # Anything other than "commit"/"server_commit" falls back to "commit".
        candidate = str(raw_mode or "commit").strip().lower()
        return candidate if candidate in {"commit", "server_commit"} else "commit"

    def create_llm_service(self, spec: LLMServiceSpec) -> LLMPort:
        """Return an OpenAI-compatible LLM service, or the mock fallback."""
        provider = self._normalize_provider(spec.provider)
        if provider not in _SUPPORTED_LLM_PROVIDERS or not spec.api_key:
            logger.warning(
                "LLM provider unsupported or API key missing (provider={}); using mock LLM",
                provider or "-",
            )
            return MockLLMService()
        return OpenAILLMService(
            api_key=spec.api_key,
            base_url=spec.base_url,
            model=spec.model,
            system_prompt=spec.system_prompt,
            knowledge_config=spec.knowledge_config,
            knowledge_searcher=spec.knowledge_searcher,
        )

    def create_tts_service(self, spec: TTSServiceSpec) -> TTSPort:
        """Return the provider-specific TTS service, or the mock fallback."""
        provider = self._normalize_provider(spec.provider)
        if spec.api_key:
            if provider == "dashscope":
                return DashScopeTTSService(
                    api_key=spec.api_key,
                    api_url=spec.api_url or self._DEFAULT_DASHSCOPE_TTS_REALTIME_URL,
                    voice=spec.voice,
                    model=spec.model or self._DEFAULT_DASHSCOPE_TTS_MODEL,
                    mode=self._resolve_dashscope_mode(spec.mode),
                    sample_rate=spec.sample_rate,
                    speed=spec.speed,
                )
            if provider in _OPENAI_COMPATIBLE_PROVIDERS:
                return OpenAICompatibleTTSService(
                    api_key=spec.api_key,
                    api_url=spec.api_url,
                    voice=spec.voice,
                    model=spec.model or self._DEFAULT_OPENAI_COMPATIBLE_TTS_MODEL,
                    sample_rate=spec.sample_rate,
                    speed=spec.speed,
                )
        logger.warning(
            "TTS provider unsupported or API key missing (provider={}); using mock TTS",
            provider or "-",
        )
        return MockTTSService(sample_rate=spec.sample_rate)

    def create_asr_service(self, spec: ASRServiceSpec) -> ASRPort:
        """Return an OpenAI-compatible ASR service, or the buffered fallback."""
        provider = self._normalize_provider(spec.provider)
        if spec.api_key and provider in _OPENAI_COMPATIBLE_PROVIDERS:
            return OpenAICompatibleASRService(
                api_key=spec.api_key,
                api_url=spec.api_url,
                model=spec.model or self._DEFAULT_OPENAI_COMPATIBLE_ASR_MODEL,
                sample_rate=spec.sample_rate,
                language=spec.language,
                interim_interval_ms=spec.interim_interval_ms,
                min_audio_for_interim_ms=spec.min_audio_for_interim_ms,
                on_transcript=spec.on_transcript,
            )
        logger.info("Using buffered ASR service (provider={})", provider or "-")
        return BufferedASRService(sample_rate=spec.sample_rate, language=spec.language)

View File

@@ -0,0 +1 @@
"""LLM providers."""

View File

@@ -0,0 +1,449 @@
"""LLM (Large Language Model) Service implementations.
Provides OpenAI-compatible LLM integration with streaming support
for real-time voice conversation.
"""
import os
import asyncio
import uuid
from typing import AsyncIterator, Optional, List, Dict, Any, Callable, Awaitable
from loguru import logger
from adapters.control_plane.backend import build_backend_adapter_from_settings
from providers.common.base import BaseLLMService, LLMMessage, LLMStreamEvent, ServiceState
# Try to import openai
try:
from openai import AsyncOpenAI
OPENAI_AVAILABLE = True
except ImportError:
OPENAI_AVAILABLE = False
logger.warning("openai package not available - LLM service will be disabled")
class OpenAILLMService(BaseLLMService):
"""
OpenAI-compatible LLM service.
Supports streaming responses for low-latency voice conversation.
Works with OpenAI API, Azure OpenAI, and compatible APIs.
"""
def __init__(
    self,
    model: str = "gpt-4o-mini",
    api_key: Optional[str] = None,
    base_url: Optional[str] = None,
    system_prompt: Optional[str] = None,
    knowledge_config: Optional[Dict[str, Any]] = None,
    knowledge_searcher: Optional[Callable[..., Awaitable[List[Dict[str, Any]]]]] = None,
):
    """
    Initialize OpenAI LLM service.

    Args:
        model: Model name (e.g., "gpt-4o-mini", "gpt-4o")
        api_key: Provider API key
        base_url: Custom API base URL (for Azure or compatible APIs)
        system_prompt: Default system prompt for conversations
        knowledge_config: Runtime RAG config dict (kbId / nResults /
            enabled keys; see _with_knowledge_context)
        knowledge_searcher: Async callable used to fetch knowledge-base
            snippets; defaults to the backend control-plane adapter
    """
    super().__init__(model=model)
    self.api_key = api_key
    # URL precedence: explicit argument > LLM_API_URL > OPENAI_API_URL.
    self.base_url = base_url or os.getenv("LLM_API_URL") or os.getenv("OPENAI_API_URL")
    self.system_prompt = system_prompt or (
        "You are a helpful, friendly voice assistant. "
        "Keep your responses concise and conversational. "
        "Respond naturally as if having a phone conversation."
    )
    # Created in connect(); None while disconnected.
    self.client: Optional[AsyncOpenAI] = None
    # Set to abort an in-flight generate_stream (barge-in support).
    self._cancel_event = asyncio.Event()
    self._knowledge_config: Dict[str, Any] = knowledge_config or {}
    if knowledge_searcher is None:
        # NOTE(review): builds a backend adapter as a construction side
        # effect — confirm this is cheap/idempotent at call sites.
        adapter = build_backend_adapter_from_settings()
        self._knowledge_searcher = adapter.search_knowledge_context
    else:
        self._knowledge_searcher = knowledge_searcher
    self._tool_schemas: List[Dict[str, Any]] = []

# RAG retrieval limits (used by _with_knowledge_context and
# _build_knowledge_prompt).
_RAG_DEFAULT_RESULTS = 5
_RAG_MAX_RESULTS = 8
_RAG_MAX_CONTEXT_CHARS = 4000
async def connect(self) -> None:
    """Create the AsyncOpenAI client and mark the service connected.

    Raises:
        RuntimeError: if the openai package is not importable.
        ValueError: if no API key was configured.
    """
    if not OPENAI_AVAILABLE:
        raise RuntimeError("openai package not installed")
    if not self.api_key:
        raise ValueError("OpenAI API key not provided")
    self.client = AsyncOpenAI(api_key=self.api_key, base_url=self.base_url)
    self.state = ServiceState.CONNECTED
    logger.info(f"OpenAI LLM service connected: model={self.model}")
async def disconnect(self) -> None:
    """Release the OpenAI client (if any) and mark the service disconnected."""
    client, self.client = self.client, None
    if client:
        await client.close()
    self.state = ServiceState.DISCONNECTED
    logger.info("OpenAI LLM service disconnected")
def _prepare_messages(self, messages: List[LLMMessage]) -> List[Dict[str, Any]]:
    """Serialize messages, injecting the default system prompt when absent."""
    prepared: List[Dict[str, Any]] = []
    # Only prepend our prompt when the history carries no system turn.
    if self.system_prompt and not any(m.role == "system" for m in messages):
        prepared.append({"role": "system", "content": self.system_prompt})
    prepared.extend(msg.to_dict() for msg in messages)
    return prepared
def set_knowledge_config(self, config: Optional[Dict[str, Any]]) -> None:
    """Update runtime knowledge retrieval config."""
    # Falsy configs (None or empty dict) normalize to an empty dict.
    self._knowledge_config = config if config else {}
def set_tool_schemas(self, schemas: Optional[List[Dict[str, Any]]]) -> None:
    """Update runtime tool schemas.

    Accepts entries already in OpenAI tool format ({"function": {...}})
    as-is, and wraps bare function descriptions ({"name": ...}) into the
    OpenAI format. Everything else is silently dropped.
    """
    self._tool_schemas = []
    if not isinstance(schemas, list):
        return
    for entry in schemas:
        if not isinstance(entry, dict):
            continue
        fn = entry.get("function")
        if isinstance(fn, dict) and fn.get("name"):
            # Already in OpenAI tool format.
            self._tool_schemas.append(entry)
        elif entry.get("name"):
            # Bare function description: wrap it.
            wrapped = {
                "type": "function",
                "function": {
                    "name": str(entry.get("name")),
                    "description": str(entry.get("description") or ""),
                    "parameters": entry.get("parameters") or {"type": "object", "properties": {}},
                },
            }
            self._tool_schemas.append(wrapped)
@staticmethod
def _coerce_int(value: Any, default: int) -> int:
    """Best-effort int() conversion; fall back to default on failure."""
    try:
        coerced = int(value)
    except (TypeError, ValueError):
        return default
    return coerced
def _resolve_kb_id(self) -> Optional[str]:
    """Return the configured knowledge-base id, or None when unset."""
    cfg = self._knowledge_config if isinstance(self._knowledge_config, dict) else {}
    # First truthy value among the accepted key spellings wins.
    raw = cfg.get("kbId") or cfg.get("knowledgeBaseId") or cfg.get("knowledge_base_id") or ""
    kb_id = str(raw).strip()
    return kb_id if kb_id else None
def _build_knowledge_prompt(self, results: List[Dict[str, Any]]) -> Optional[str]:
    """Format retrieved KB snippets into a system-prompt block.

    Snippets are numbered, annotated with doc/chunk/distance metadata
    when available, and truncated to the _RAG_MAX_CONTEXT_CHARS budget.
    Returns None when there is nothing usable to inject.
    """
    if not results:
        return None
    lines = [
        "You have retrieved the following knowledge base snippets.",
        "Use them only when relevant to the latest user request.",
        "If snippets are insufficient, say you are not sure instead of guessing.",
        "",
    ]
    used_chars = 0  # total snippet characters injected so far
    used_count = 0  # running index for snippet numbering
    for item in results:
        content = str(item.get("content") or "").strip()
        if not content:
            continue
        # Stop once the character budget is exhausted.
        if used_chars >= self._RAG_MAX_CONTEXT_CHARS:
            break
        metadata = item.get("metadata") if isinstance(item.get("metadata"), dict) else {}
        doc_id = metadata.get("document_id")
        chunk_index = metadata.get("chunk_index")
        distance = item.get("distance")
        source_parts = []
        if doc_id:
            source_parts.append(f"doc={doc_id}")
        if chunk_index is not None:
            source_parts.append(f"chunk={chunk_index}")
        source = f" ({', '.join(source_parts)})" if source_parts else ""
        distance_text = ""
        try:
            if distance is not None:
                distance_text = f", distance={float(distance):.4f}"
        except (TypeError, ValueError):
            # Non-numeric distance: omit rather than fail.
            distance_text = ""
        # Truncate the snippet to whatever budget remains.
        remaining = self._RAG_MAX_CONTEXT_CHARS - used_chars
        snippet = content[:remaining].strip()
        if not snippet:
            continue
        used_count += 1
        lines.append(f"[{used_count}{source}{distance_text}] {snippet}")
        used_chars += len(snippet)
    if used_count == 0:
        return None
    return "\n".join(lines)
async def _with_knowledge_context(self, messages: List[LLMMessage]) -> List[LLMMessage]:
    """Inject a RAG system message built from knowledge-base search results.

    Returns the original list unchanged when retrieval is disabled,
    unconfigured, there is no user turn to query with, or the search
    yields nothing usable.
    """
    cfg = self._knowledge_config if isinstance(self._knowledge_config, dict) else {}
    enabled = cfg.get("enabled", True)
    # String flags like "false"/"0"/"off"/"no" disable retrieval.
    if isinstance(enabled, str):
        enabled = enabled.strip().lower() not in {"false", "0", "off", "no"}
    if not enabled:
        return messages
    kb_id = self._resolve_kb_id()
    if not kb_id:
        return messages
    # The most recent user turn is the retrieval query.
    latest_user = ""
    for msg in reversed(messages):
        if msg.role == "user":
            latest_user = (msg.content or "").strip()
            break
    if not latest_user:
        return messages
    # Clamp the requested result count to [1, _RAG_MAX_RESULTS].
    n_results = self._coerce_int(cfg.get("nResults"), self._RAG_DEFAULT_RESULTS)
    n_results = max(1, min(n_results, self._RAG_MAX_RESULTS))
    results = await self._knowledge_searcher(
        kb_id=kb_id,
        query=latest_user,
        n_results=n_results,
    )
    prompt = self._build_knowledge_prompt(results)
    if not prompt:
        return messages
    logger.debug(f"RAG context injected (kb_id={kb_id}, chunks={len(results)})")
    rag_system = LLMMessage(role="system", content=prompt)
    # Keep the primary system prompt first; RAG context goes right after it.
    if messages and messages[0].role == "system":
        return [messages[0], rag_system, *messages[1:]]
    return [rag_system, *messages]
async def generate(
    self,
    messages: List[LLMMessage],
    temperature: float = 0.7,
    max_tokens: Optional[int] = None
) -> str:
    """
    Run one non-streaming chat completion and return the full text.

    Args:
        messages: Conversation history
        temperature: Sampling temperature
        max_tokens: Maximum tokens to generate

    Returns:
        Complete response text (empty string when the model returns none)

    Raises:
        RuntimeError: If the service has not been connected.
    """
    if not self.client:
        raise RuntimeError("LLM service not connected")

    enriched = await self._with_knowledge_context(messages)
    payload = self._prepare_messages(enriched)
    try:
        result = await self.client.chat.completions.create(
            model=self.model,
            messages=payload,
            temperature=temperature,
            max_tokens=max_tokens
        )
        text = result.choices[0].message.content or ""
        logger.debug(f"LLM response: {text[:100]}...")
        return text
    except Exception as e:
        logger.error(f"LLM generation error: {e}")
        raise
async def generate_stream(
    self,
    messages: List[LLMMessage],
    temperature: float = 0.7,
    max_tokens: Optional[int] = None
) -> AsyncIterator[LLMStreamEvent]:
    """
    Generate response in streaming mode.

    Emits ``text_delta`` events as content arrives, ``tool_call`` events once
    the model finishes a round of function calls, and a final ``done`` event
    when generation completes normally. Generation can be aborted early via
    :meth:`cancel`.

    Args:
        messages: Conversation history
        temperature: Sampling temperature
        max_tokens: Maximum tokens to generate

    Yields:
        Structured stream events

    Raises:
        RuntimeError: If the service has not been connected.
    """
    if not self.client:
        raise RuntimeError("LLM service not connected")
    rag_messages = await self._with_knowledge_context(messages)
    prepared = self._prepare_messages(rag_messages)
    self._cancel_event.clear()
    # Accumulates partial tool-call deltas, keyed by their stream index.
    tool_accumulator: Dict[int, Dict[str, str]] = {}
    openai_tools = self._tool_schemas or None
    try:
        create_args: Dict[str, Any] = dict(
            model=self.model,
            messages=prepared,
            temperature=temperature,
            max_tokens=max_tokens,
            stream=True,
        )
        if openai_tools:
            create_args["tools"] = openai_tools
            create_args["tool_choice"] = "auto"
        stream = await self.client.chat.completions.create(**create_args)
        async for chunk in stream:
            # Check for cancellation
            # NOTE(review): breaking here ends the stream without a "done"
            # event — confirm consumers handle cancellation-terminated streams.
            if self._cancel_event.is_set():
                logger.info("LLM stream cancelled")
                break
            if not chunk.choices:
                continue
            choice = chunk.choices[0]
            delta = getattr(choice, "delta", None)
            if delta and getattr(delta, "content", None):
                content = delta.content
                yield LLMStreamEvent(type="text_delta", text=content)
            # OpenAI streams function calls via incremental tool_calls deltas.
            tool_calls = getattr(delta, "tool_calls", None) if delta else None
            if tool_calls:
                for tc in tool_calls:
                    index = getattr(tc, "index", 0) or 0
                    item = tool_accumulator.setdefault(
                        int(index),
                        {"id": "", "name": "", "arguments": ""},
                    )
                    tc_id = getattr(tc, "id", None)
                    if tc_id:
                        item["id"] = str(tc_id)
                    fn = getattr(tc, "function", None)
                    if fn:
                        fn_name = getattr(fn, "name", None)
                        if fn_name:
                            item["name"] = str(fn_name)
                        fn_args = getattr(fn, "arguments", None)
                        if fn_args:
                            # Argument JSON arrives in fragments; concatenate them.
                            item["arguments"] += str(fn_args)
            finish_reason = getattr(choice, "finish_reason", None)
            if finish_reason == "tool_calls" and tool_accumulator:
                # Flush all accumulated calls in stream-index order.
                for _, payload in sorted(tool_accumulator.items(), key=lambda row: row[0]):
                    call_name = payload.get("name", "").strip()
                    if not call_name:
                        continue
                    # The API may omit the id; synthesize one so callers can correlate results.
                    call_id = payload.get("id", "").strip() or f"call_{uuid.uuid4().hex[:10]}"
                    yield LLMStreamEvent(
                        type="tool_call",
                        tool_call={
                            "id": call_id,
                            "type": "function",
                            "function": {
                                "name": call_name,
                                "arguments": payload.get("arguments", "") or "{}",
                            },
                        },
                    )
                yield LLMStreamEvent(type="done")
                return
            if finish_reason in {"stop", "length", "content_filter"}:
                yield LLMStreamEvent(type="done")
                return
    except asyncio.CancelledError:
        logger.info("LLM stream cancelled via asyncio")
        raise
    except Exception as e:
        logger.error(f"LLM streaming error: {e}")
        raise
def cancel(self) -> None:
    """Request cancellation of any in-flight streaming generation.

    Sets the shared cancel event; the ``generate_stream`` loop checks it on
    every incoming chunk and stops as soon as it is set.
    """
    self._cancel_event.set()
class MockLLMService(BaseLLMService):
    """
    Mock LLM service for testing without API calls.

    Cycles through a fixed set of canned replies and emulates streaming by
    emitting one word at a time.
    """

    def __init__(self, response_delay: float = 0.5):
        super().__init__(model="mock")
        self.response_delay = response_delay
        # Canned replies, handed out round-robin.
        self.responses = [
            "Hello! How can I help you today?",
            "That's an interesting question. Let me think about it.",
            "I understand. Is there anything else you'd like to know?",
            "Great! I'm here if you need anything else.",
        ]
        self._response_index = 0

    async def connect(self) -> None:
        self.state = ServiceState.CONNECTED
        logger.info("Mock LLM service connected")

    async def disconnect(self) -> None:
        self.state = ServiceState.DISCONNECTED
        logger.info("Mock LLM service disconnected")

    async def generate(
        self,
        messages: List[LLMMessage],
        temperature: float = 0.7,
        max_tokens: Optional[int] = None
    ) -> str:
        # Simulate latency, then hand out the next canned reply.
        await asyncio.sleep(self.response_delay)
        reply = self.responses[self._response_index % len(self.responses)]
        self._response_index += 1
        return reply

    async def generate_stream(
        self,
        messages: List[LLMMessage],
        temperature: float = 0.7,
        max_tokens: Optional[int] = None
    ) -> AsyncIterator[LLMStreamEvent]:
        full_text = await self.generate(messages, temperature, max_tokens)
        # Emit word by word, re-inserting the separating spaces.
        for position, token in enumerate(full_text.split()):
            if position:
                yield LLMStreamEvent(type="text_delta", text=" ")
            yield LLMStreamEvent(type="text_delta", text=token)
            await asyncio.sleep(0.05)  # Simulate streaming delay
        yield LLMStreamEvent(type="done")

View File

@@ -0,0 +1 @@
"""Realtime providers."""

View File

@@ -0,0 +1,546 @@
"""OpenAI Realtime API Service.
Provides true duplex voice conversation using OpenAI's Realtime API,
similar to active-call's RealtimeProcessor. This bypasses the need for
separate ASR/LLM/TTS services by handling everything server-side.
The Realtime API provides:
- Server-side VAD with turn detection
- Streaming speech-to-text
- Streaming LLM responses
- Streaming text-to-speech
- Function calling support
- Barge-in/interruption handling
"""
import asyncio
import json
import base64
from typing import Optional, Dict, Any, Callable, Awaitable, List
from dataclasses import dataclass, field
from enum import Enum
from loguru import logger
try:
import websockets
WEBSOCKETS_AVAILABLE = True
except ImportError:
WEBSOCKETS_AVAILABLE = False
logger.warning("websockets not available - Realtime API will be disabled")
class RealtimeState(Enum):
    """Connection lifecycle states for the Realtime API client."""

    DISCONNECTED = "disconnected"  # no websocket open
    CONNECTING = "connecting"      # handshake in progress
    CONNECTED = "connected"        # session established and usable
    ERROR = "error"                # connection failed or dropped with an error
@dataclass
class RealtimeConfig:
    """Configuration for OpenAI Realtime API sessions.

    Defaults target the public OpenAI endpoint; set ``endpoint`` for Azure
    or other compatible deployments.
    """
    # API Configuration
    api_key: Optional[str] = None
    model: str = "gpt-4o-realtime-preview"
    endpoint: Optional[str] = None  # For Azure or custom endpoints

    # Voice Configuration
    voice: str = "alloy"  # alloy, echo, shimmer, etc.
    instructions: str = (
        "You are a helpful, friendly voice assistant. "
        "Keep your responses concise and conversational."
    )

    # Turn Detection (Server-side VAD); sent verbatim in session.update.
    turn_detection: Optional[Dict[str, Any]] = field(default_factory=lambda: {
        "type": "server_vad",
        "threshold": 0.5,
        "prefix_padding_ms": 300,
        "silence_duration_ms": 500
    })

    # Audio Configuration ("pcm16" is the API's default raw PCM format)
    input_audio_format: str = "pcm16"
    output_audio_format: str = "pcm16"

    # Tools/Functions exposed to the model (Realtime API tool schema dicts)
    tools: List[Dict[str, Any]] = field(default_factory=list)
class RealtimeService:
    """
    OpenAI Realtime API service for true duplex voice conversation.

    This service handles the entire voice conversation pipeline:
    1. Audio input → Server-side VAD → Speech-to-text
    2. Text → LLM processing → Response generation
    3. Response → Text-to-speech → Audio output

    Events emitted:
    - on_audio: Audio output from the assistant
    - on_transcript: Text transcript (user or assistant)
    - on_speech_started: User started speaking
    - on_speech_stopped: User stopped speaking
    - on_response_started: Assistant started responding
    - on_response_done: Assistant finished responding
    - on_function_call: Function call requested
    - on_error: Error occurred
    - on_interrupted: registered but currently never emitted by this class
    """

    def __init__(self, config: Optional[RealtimeConfig] = None):
        """
        Initialize Realtime API service.

        Args:
            config: Realtime configuration (uses defaults if not provided)
        """
        self.config = config or RealtimeConfig()
        self.state = RealtimeState.DISCONNECTED
        # Active websocket connection (None while disconnected).
        self._ws = None
        # Background task draining server events; created in connect().
        self._receive_task: Optional[asyncio.Task] = None
        self._cancel_event = asyncio.Event()
        # Event callbacks keyed by event name; each entry holds async callables.
        self._callbacks: Dict[str, List[Callable]] = {
            "on_audio": [],
            "on_transcript": [],
            "on_speech_started": [],
            "on_speech_stopped": [],
            "on_response_started": [],
            "on_response_done": [],
            "on_function_call": [],
            "on_error": [],
            "on_interrupted": [],
        }
        logger.debug(f"RealtimeService initialized with model={self.config.model}")

    def on(self, event: str, callback: Callable[..., Awaitable[None]]) -> None:
        """
        Register event callback.

        Unknown event names are silently ignored.

        Args:
            event: Event name (one of the keys listed in the class docstring)
            callback: Async callback function
        """
        if event in self._callbacks:
            self._callbacks[event].append(callback)

    async def _emit(self, event: str, *args, **kwargs) -> None:
        """Emit event to all registered callbacks.

        Callback exceptions are logged and swallowed so one bad handler
        cannot break event dispatch for the others.
        """
        for callback in self._callbacks.get(event, []):
            try:
                await callback(*args, **kwargs)
            except Exception as e:
                logger.error(f"Event callback error ({event}): {e}")

    async def connect(self) -> None:
        """Connect to OpenAI Realtime API.

        Builds the endpoint URL and auth headers (Azure-style `api-key`
        header when a custom endpoint is set, Bearer token otherwise),
        opens the websocket, sends the session configuration, and starts
        the background receive loop.

        Raises:
            RuntimeError: if the websockets package is not installed.
            ValueError: if no API key is configured.
        """
        if not WEBSOCKETS_AVAILABLE:
            raise RuntimeError("websockets package not installed")
        if not self.config.api_key:
            raise ValueError("OpenAI API key not provided")
        self.state = RealtimeState.CONNECTING
        # Build URL
        if self.config.endpoint:
            # Azure or custom endpoint
            url = f"{self.config.endpoint}/openai/realtime?api-version=2024-10-01-preview&deployment={self.config.model}"
        else:
            # OpenAI endpoint
            url = f"wss://api.openai.com/v1/realtime?model={self.config.model}"
        # Build headers
        headers = {}
        if self.config.endpoint:
            headers["api-key"] = self.config.api_key
        else:
            headers["Authorization"] = f"Bearer {self.config.api_key}"
            headers["OpenAI-Beta"] = "realtime=v1"
        try:
            logger.info(f"Connecting to Realtime API: {url}")
            # NOTE(review): websockets>=14 renamed `extra_headers` to
            # `additional_headers` — confirm the pinned websockets version
            # still accepts `extra_headers`.
            self._ws = await websockets.connect(url, extra_headers=headers)
            # Send session configuration
            await self._configure_session()
            # Start receive loop
            self._receive_task = asyncio.create_task(self._receive_loop())
            self.state = RealtimeState.CONNECTED
            logger.info("Realtime API connected successfully")
        except Exception as e:
            self.state = RealtimeState.ERROR
            logger.error(f"Realtime API connection failed: {e}")
            raise

    async def _configure_session(self) -> None:
        """Send session configuration (voice, formats, VAD, tools) to server."""
        session_config = {
            "type": "session.update",
            "session": {
                "modalities": ["text", "audio"],
                "instructions": self.config.instructions,
                "voice": self.config.voice,
                "input_audio_format": self.config.input_audio_format,
                "output_audio_format": self.config.output_audio_format,
                "turn_detection": self.config.turn_detection,
            }
        }
        if self.config.tools:
            session_config["session"]["tools"] = self.config.tools
        await self._send(session_config)
        logger.debug("Session configuration sent")

    async def _send(self, data: Dict[str, Any]) -> None:
        """Send JSON data to server; silently no-ops when not connected."""
        if self._ws:
            await self._ws.send(json.dumps(data))

    async def send_audio(self, audio_bytes: bytes) -> None:
        """
        Send audio to the Realtime API.

        The payload is base64-encoded into an input_audio_buffer.append event.

        Args:
            audio_bytes: PCM audio data (16-bit, mono, 24kHz by default)
        """
        if self.state != RealtimeState.CONNECTED:
            return
        # Encode audio as base64
        audio_b64 = base64.standard_b64encode(audio_bytes).decode()
        await self._send({
            "type": "input_audio_buffer.append",
            "audio": audio_b64
        })

    async def send_text(self, text: str) -> None:
        """
        Send text input (bypassing audio) and trigger a response.

        Args:
            text: User text input
        """
        if self.state != RealtimeState.CONNECTED:
            return
        # Create a conversation item with user text
        await self._send({
            "type": "conversation.item.create",
            "item": {
                "type": "message",
                "role": "user",
                "content": [{"type": "input_text", "text": text}]
            }
        })
        # Trigger response
        await self._send({"type": "response.create"})

    async def cancel_response(self) -> None:
        """Cancel the current response (for barge-in)."""
        if self.state != RealtimeState.CONNECTED:
            return
        await self._send({"type": "response.cancel"})
        logger.debug("Response cancelled")

    async def commit_audio(self) -> None:
        """Commit the audio buffer and trigger response.

        Only needed when server-side VAD turn detection is disabled.
        """
        if self.state != RealtimeState.CONNECTED:
            return
        await self._send({"type": "input_audio_buffer.commit"})
        await self._send({"type": "response.create"})

    async def clear_audio_buffer(self) -> None:
        """Clear the (uncommitted) input audio buffer on the server."""
        if self.state != RealtimeState.CONNECTED:
            return
        await self._send({"type": "input_audio_buffer.clear"})

    async def submit_function_result(self, call_id: str, result: str) -> None:
        """
        Submit function call result and ask the model to continue.

        Args:
            call_id: The function call ID
            result: JSON string result
        """
        if self.state != RealtimeState.CONNECTED:
            return
        await self._send({
            "type": "conversation.item.create",
            "item": {
                "type": "function_call_output",
                "call_id": call_id,
                "output": result
            }
        })
        # Trigger response with the function result
        await self._send({"type": "response.create"})

    async def _receive_loop(self) -> None:
        """Receive and process messages from the Realtime API.

        Runs until the websocket closes, the task is cancelled, or an
        unexpected error occurs; updates `self.state` accordingly.
        """
        if not self._ws:
            return
        try:
            async for message in self._ws:
                try:
                    data = json.loads(message)
                    await self._handle_event(data)
                except json.JSONDecodeError:
                    logger.warning(f"Invalid JSON received: {message[:100]}")
        except asyncio.CancelledError:
            logger.debug("Receive loop cancelled")
        except websockets.ConnectionClosed as e:
            logger.info(f"WebSocket closed: {e}")
            self.state = RealtimeState.DISCONNECTED
        except Exception as e:
            logger.error(f"Receive loop error: {e}")
            self.state = RealtimeState.ERROR

    async def _handle_event(self, data: Dict[str, Any]) -> None:
        """Dispatch one incoming Realtime API event to registered callbacks."""
        event_type = data.get("type", "unknown")
        # Audio delta - streaming audio output
        if event_type == "response.audio.delta":
            if "delta" in data:
                audio_bytes = base64.standard_b64decode(data["delta"])
                await self._emit("on_audio", audio_bytes)
        # Audio transcript delta - streaming text (is_final=False)
        elif event_type == "response.audio_transcript.delta":
            if "delta" in data:
                await self._emit("on_transcript", data["delta"], "assistant", False)
        # Audio transcript done (is_final=True)
        elif event_type == "response.audio_transcript.done":
            if "transcript" in data:
                await self._emit("on_transcript", data["transcript"], "assistant", True)
        # Input audio transcript (user speech)
        elif event_type == "conversation.item.input_audio_transcription.completed":
            if "transcript" in data:
                await self._emit("on_transcript", data["transcript"], "user", True)
        # Speech started (server VAD detected speech)
        elif event_type == "input_audio_buffer.speech_started":
            await self._emit("on_speech_started", data.get("audio_start_ms", 0))
        # Speech stopped
        elif event_type == "input_audio_buffer.speech_stopped":
            await self._emit("on_speech_stopped", data.get("audio_end_ms", 0))
        # Response started
        elif event_type == "response.created":
            await self._emit("on_response_started", data.get("response", {}))
        # Response done
        elif event_type == "response.done":
            await self._emit("on_response_done", data.get("response", {}))
        # Function call (arguments fully accumulated server-side)
        elif event_type == "response.function_call_arguments.done":
            call_id = data.get("call_id")
            name = data.get("name")
            arguments = data.get("arguments", "{}")
            await self._emit("on_function_call", call_id, name, arguments)
        # Error
        elif event_type == "error":
            error = data.get("error", {})
            logger.error(f"Realtime API error: {error}")
            await self._emit("on_error", error)
        # Session events
        elif event_type == "session.created":
            logger.info("Session created")
        elif event_type == "session.updated":
            logger.debug("Session updated")
        else:
            logger.debug(f"Unhandled event type: {event_type}")

    async def disconnect(self) -> None:
        """Disconnect from Realtime API.

        Cancels the receive loop, closes the websocket, and resets state.
        Safe to call when already disconnected.
        """
        self._cancel_event.set()
        if self._receive_task:
            self._receive_task.cancel()
            try:
                await self._receive_task
            except asyncio.CancelledError:
                pass
        if self._ws:
            await self._ws.close()
            self._ws = None
        self.state = RealtimeState.DISCONNECTED
        logger.info("Realtime API disconnected")
class RealtimePipeline:
    """
    Pipeline adapter for RealtimeService.

    Provides a compatible interface with DuplexPipeline but uses
    OpenAI Realtime API for all processing. Translates RealtimeService
    events into transport-level events (speaking/silence/trackStart/
    trackEnd/error) and forwards assistant audio to the transport.
    """

    def __init__(
        self,
        transport,
        session_id: str,
        config: Optional[RealtimeConfig] = None
    ):
        """
        Initialize Realtime pipeline.

        Args:
            transport: Transport for sending audio/events (must expose
                async send_audio(bytes) and send_event(dict))
            session_id: Session identifier (also used as trackId)
            config: Realtime configuration
        """
        self.transport = transport
        self.session_id = session_id
        self.service = RealtimeService(config)
        # Register callbacks
        self.service.on("on_audio", self._on_audio)
        self.service.on("on_transcript", self._on_transcript)
        self.service.on("on_speech_started", self._on_speech_started)
        self.service.on("on_speech_stopped", self._on_speech_stopped)
        self.service.on("on_response_started", self._on_response_started)
        self.service.on("on_response_done", self._on_response_done)
        self.service.on("on_error", self._on_error)
        self._is_speaking = False
        self._running = True
        logger.info(f"RealtimePipeline initialized for session {session_id}")

    async def start(self) -> None:
        """Start the pipeline by connecting the underlying service."""
        await self.service.connect()

    async def process_audio(self, pcm_bytes: bytes) -> None:
        """
        Process incoming audio.

        Note: Realtime API expects 24kHz audio by default.
        You may need to resample from 16kHz.
        """
        if not self._running:
            return
        # TODO: Resample from 16kHz to 24kHz if needed
        await self.service.send_audio(pcm_bytes)

    async def process_text(self, text: str) -> None:
        """Process text input (bypasses the audio path)."""
        if not self._running:
            return
        await self.service.send_text(text)

    async def interrupt(self) -> None:
        """Interrupt current response and notify the transport."""
        await self.service.cancel_response()
        await self.transport.send_event({
            "event": "interrupt",
            "trackId": self.session_id,
            "timestamp": self._get_timestamp_ms()
        })

    async def cleanup(self) -> None:
        """Cleanup resources; stops accepting input and disconnects."""
        self._running = False
        await self.service.disconnect()

    # Event handlers

    async def _on_audio(self, audio_bytes: bytes) -> None:
        """Handle audio output by forwarding it to the transport."""
        await self.transport.send_audio(audio_bytes)

    async def _on_transcript(self, text: str, role: str, is_final: bool) -> None:
        """Handle transcript. Currently log-only; no transport event is sent."""
        logger.info(f"[{role.upper()}] {text[:50]}..." if len(text) > 50 else f"[{role.upper()}] {text}")

    async def _on_speech_started(self, start_ms: int) -> None:
        """Handle user speech start: emit 'speaking' and barge in."""
        self._is_speaking = True
        await self.transport.send_event({
            "event": "speaking",
            "trackId": self.session_id,
            "timestamp": self._get_timestamp_ms(),
            "startTime": start_ms
        })
        # Cancel any ongoing response (barge-in)
        await self.service.cancel_response()

    async def _on_speech_stopped(self, end_ms: int) -> None:
        """Handle user speech stop: emit 'silence'."""
        self._is_speaking = False
        await self.transport.send_event({
            "event": "silence",
            "trackId": self.session_id,
            "timestamp": self._get_timestamp_ms(),
            "duration": end_ms
        })

    async def _on_response_started(self, response: Dict) -> None:
        """Handle response start: emit 'trackStart'."""
        await self.transport.send_event({
            "event": "trackStart",
            "trackId": self.session_id,
            "timestamp": self._get_timestamp_ms()
        })

    async def _on_response_done(self, response: Dict) -> None:
        """Handle response complete: emit 'trackEnd'."""
        await self.transport.send_event({
            "event": "trackEnd",
            "trackId": self.session_id,
            "timestamp": self._get_timestamp_ms()
        })

    async def _on_error(self, error: Dict) -> None:
        """Handle error: surface it to the transport as an 'error' event."""
        await self.transport.send_event({
            "event": "error",
            "trackId": self.session_id,
            "timestamp": self._get_timestamp_ms(),
            "sender": "realtime",
            "error": str(error)
        })

    def _get_timestamp_ms(self) -> int:
        """Get current timestamp in milliseconds (wall clock)."""
        import time
        return int(time.time() * 1000)

    @property
    def is_speaking(self) -> bool:
        """Check if user is speaking (tracked from server VAD events)."""
        return self._is_speaking

View File

@@ -0,0 +1 @@
"""TTS providers."""

View File

@@ -0,0 +1,352 @@
"""DashScope realtime TTS service.
Implements DashScope's Qwen realtime TTS protocol via the official SDK.
"""
import asyncio
import audioop
import base64
import json
import os
from typing import Any, AsyncIterator, Dict, Optional, Tuple
from loguru import logger
from providers.common.base import BaseTTSService, ServiceState, TTSChunk
try:
import dashscope
from dashscope.audio.qwen_tts_realtime import AudioFormat, QwenTtsRealtime, QwenTtsRealtimeCallback
DASHSCOPE_SDK_AVAILABLE = True
except ImportError:
dashscope = None # type: ignore[assignment]
AudioFormat = None # type: ignore[assignment]
QwenTtsRealtime = None # type: ignore[assignment]
DASHSCOPE_SDK_AVAILABLE = False
class QwenTtsRealtimeCallback: # type: ignore[no-redef]
"""Fallback callback base when DashScope SDK is unavailable."""
pass
class _RealtimeEventCallback(QwenTtsRealtimeCallback):
    """Bridge SDK callback events into an asyncio queue.

    The DashScope SDK fires these callbacks from its own worker thread, so
    every payload is handed to the service's event loop thread-safely.
    """

    def __init__(self, loop: asyncio.AbstractEventLoop, queue: "asyncio.Queue[Dict[str, Any]]"):
        super().__init__()
        self._loop = loop
        self._queue = queue

    def _push(self, event: Dict[str, Any]) -> None:
        # call_soon_threadsafe raises RuntimeError once the loop has been
        # closed; at that point events are simply dropped.
        try:
            self._loop.call_soon_threadsafe(self._queue.put_nowait, event)
        except RuntimeError:
            return

    def on_open(self) -> None:
        self._push({"type": "session.open"})

    def on_close(self, code: int, reason: str) -> None:
        self._push({"type": "__close__", "code": code, "reason": reason})

    def on_error(self, message: str) -> None:
        self._push({"type": "error", "error": {"message": str(message)}})

    def on_event(self, event: Any) -> None:
        if isinstance(event, dict):
            self._push(event)
            return
        if isinstance(event, str):
            # The SDK may deliver events as raw JSON strings.
            try:
                self._push(json.loads(event))
            except json.JSONDecodeError:
                self._push({"type": "raw", "message": event})
            return
        self._push({"type": "raw", "message": str(event)})

    def on_data(self, data: bytes) -> None:
        # Some SDK versions provide audio via on_data directly.
        self._push({"type": "response.audio.delta.raw", "audio": data})
class DashScopeTTSService(BaseTTSService):
    """DashScope realtime TTS service using Qwen Realtime protocol.

    Audio is produced at the provider's fixed 24 kHz rate and resampled to
    the configured output rate when they differ. SDK callbacks (worker
    thread) are bridged into an asyncio queue and consumed by the
    synthesis coroutines.
    """

    DEFAULT_WS_URL = "wss://dashscope.aliyuncs.com/api-ws/v1/realtime"
    DEFAULT_MODEL = "qwen3-tts-flash-realtime"
    # Fixed output rate of the provider; see _resample_if_needed.
    PROVIDER_SAMPLE_RATE = 24000

    def __init__(
        self,
        api_key: Optional[str] = None,
        api_url: Optional[str] = None,
        voice: str = "Cherry",
        model: Optional[str] = None,
        mode: str = "commit",
        sample_rate: int = 16000,
        speed: float = 1.0,
    ):
        """
        Args:
            api_key: DashScope API key (required before connect()).
            api_url: Websocket endpoint; falls back to env vars then default.
            voice: Provider voice name.
            model: Realtime TTS model; falls back to env var then default.
            mode: "commit" (explicit commit per utterance) or "server_commit";
                unknown values fall back to "server_commit" with a warning.
            sample_rate: Desired output PCM rate; audio is resampled from 24 kHz.
            speed: Playback speed passed to the base service.
        """
        super().__init__(voice=voice, sample_rate=sample_rate, speed=speed)
        self.api_key = api_key
        self.api_url = (
            api_url
            or os.getenv("DASHSCOPE_TTS_API_URL")
            or os.getenv("TTS_API_URL")
            or self.DEFAULT_WS_URL
        )
        self.model = model or os.getenv("DASHSCOPE_TTS_MODEL") or self.DEFAULT_MODEL
        normalized_mode = str(mode or "").strip().lower()
        if normalized_mode not in {"server_commit", "commit"}:
            logger.warning(f"Unknown DashScope mode '{mode}', fallback to server_commit")
            normalized_mode = "server_commit"
        self.mode = normalized_mode
        self._client: Optional[Any] = None
        # Events bridged from the SDK callback thread.
        self._event_queue: "asyncio.Queue[Dict[str, Any]]" = asyncio.Queue()
        self._callback: Optional[_RealtimeEventCallback] = None
        self._cancel_event = asyncio.Event()
        # Serializes synthesize_stream calls over the single websocket.
        self._synthesis_lock = asyncio.Lock()

    async def connect(self) -> None:
        """Open the realtime websocket and configure the session.

        Raises:
            RuntimeError: if the dashscope SDK is not installed.
            ValueError: if no API key is configured.
        """
        if not DASHSCOPE_SDK_AVAILABLE:
            raise RuntimeError("dashscope package not installed; install with `pip install dashscope`")
        if not self.api_key:
            raise ValueError("DashScope API key not provided. Configure agent.tts.api_key in YAML.")
        loop = asyncio.get_running_loop()
        self._callback = _RealtimeEventCallback(loop=loop, queue=self._event_queue)
        # The official Python SDK docs set key via global `dashscope.api_key`;
        # some SDK versions do not accept `api_key=` in QwenTtsRealtime ctor.
        if dashscope is not None:
            dashscope.api_key = self.api_key
        self._client = self._create_realtime_client(self._callback)
        # The SDK client is blocking; run its calls in a worker thread.
        await asyncio.to_thread(self._client.connect)
        await asyncio.to_thread(
            self._client.update_session,
            voice=self.voice,
            response_format=AudioFormat.PCM_24000HZ_MONO_16BIT,
            mode=self.mode,
        )
        await self._wait_for_session_ready()
        self.state = ServiceState.CONNECTED
        logger.info(
            "DashScope realtime TTS service ready: "
            f"voice={self.voice}, model={self.model}, mode={self.mode}"
        )

    def _create_realtime_client(self, callback: _RealtimeEventCallback) -> Any:
        """Build the SDK client, tolerating ctors without `api_key` support."""
        init_kwargs = {
            "model": self.model,
            "callback": callback,
            "url": self.api_url,
        }
        try:
            return QwenTtsRealtime(  # type: ignore[misc]
                api_key=self.api_key,
                **init_kwargs,
            )
        except TypeError as exc:
            # Only swallow the specific "unexpected keyword api_key" failure.
            if "api_key" not in str(exc):
                raise
            logger.debug(
                "QwenTtsRealtime does not support `api_key` ctor arg; "
                "falling back to global dashscope.api_key auth"
            )
            return QwenTtsRealtime(**init_kwargs)  # type: ignore[misc]

    async def disconnect(self) -> None:
        """Close the websocket (if any) and drop all buffered events."""
        self._cancel_event.set()
        if self._client:
            close_fn = getattr(self._client, "close", None)
            if callable(close_fn):
                await asyncio.to_thread(close_fn)
            self._client = None
        self._drain_event_queue()
        self.state = ServiceState.DISCONNECTED
        logger.info("DashScope realtime TTS service disconnected")

    async def synthesize(self, text: str) -> bytes:
        """Synthesize `text` and return the concatenated PCM audio."""
        audio = b""
        async for chunk in self.synthesize_stream(text):
            audio += chunk.audio
        return audio

    async def synthesize_stream(self, text: str) -> AsyncIterator[TTSChunk]:
        """Stream synthesized audio as ~100ms TTSChunk objects.

        Yielding is delayed by one chunk (`pending_chunk`) so the last full
        chunk can be flagged `is_final=True` when no remainder follows.

        Raises:
            RuntimeError: if not connected, on a provider error event, or
                when the websocket closes mid-synthesis.
            asyncio.TimeoutError: if no event arrives within the timeout.
        """
        if not self._client:
            raise RuntimeError("DashScope TTS service not connected")
        if not text.strip():
            return
        async with self._synthesis_lock:
            self._cancel_event.clear()
            self._drain_event_queue()
            await self._clear_appended_text()
            await asyncio.to_thread(self._client.append_text, text)
            if self.mode == "commit":
                await asyncio.to_thread(self._client.commit)
            chunk_size = max(1, self.sample_rate * 2 // 10)  # 100ms
            buffer = b""
            pending_chunk: Optional[bytes] = None
            resample_state: Any = None
            while True:
                # Use a shorter timeout once cancelled so we don't linger.
                timeout = 8.0 if self._cancel_event.is_set() else 20.0
                event = await self._next_event(timeout=timeout)
                event_type = str(event.get("type") or "").strip()
                if event_type in {"response.audio.delta", "response.audio.delta.raw"}:
                    if self._cancel_event.is_set():
                        # Keep draining events after cancel, but drop audio.
                        continue
                    pcm = self._decode_audio_event(event)
                    if not pcm:
                        continue
                    pcm, resample_state = self._resample_if_needed(pcm, resample_state)
                    if not pcm:
                        continue
                    buffer += pcm
                    while len(buffer) >= chunk_size:
                        audio_chunk = buffer[:chunk_size]
                        buffer = buffer[chunk_size:]
                        if pending_chunk is not None:
                            yield TTSChunk(
                                audio=pending_chunk,
                                sample_rate=self.sample_rate,
                                is_final=False,
                            )
                        pending_chunk = audio_chunk
                    continue
                if event_type == "response.done":
                    break
                if event_type == "error":
                    raise RuntimeError(self._format_error_event(event))
                if event_type == "__close__":
                    reason = str(event.get("reason") or "unknown")
                    raise RuntimeError(f"DashScope TTS websocket closed unexpectedly: {reason}")
            if self._cancel_event.is_set():
                return
            # Flush held-back chunk and any sub-chunk remainder, marking the
            # true last piece as final.
            if pending_chunk is not None:
                if buffer:
                    yield TTSChunk(audio=pending_chunk, sample_rate=self.sample_rate, is_final=False)
                    pending_chunk = None
                else:
                    yield TTSChunk(audio=pending_chunk, sample_rate=self.sample_rate, is_final=True)
                    pending_chunk = None
            if buffer:
                yield TTSChunk(audio=buffer, sample_rate=self.sample_rate, is_final=True)

    async def cancel(self) -> None:
        """Abort the in-flight synthesis (best-effort on the provider side)."""
        self._cancel_event.set()
        if self.mode == "commit":
            await self._clear_appended_text()
            return
        if not self._client:
            return
        # SDK method name differs between versions.
        cancel_fn = (
            getattr(self._client, "cancel_response", None)
            or getattr(self._client, "cancel", None)
        )
        if callable(cancel_fn):
            try:
                await asyncio.to_thread(cancel_fn)
            except Exception as exc:
                logger.debug(f"DashScope cancel failed: {exc}")

    async def _wait_for_session_ready(self) -> None:
        """Wait for the session ack event; a timeout is tolerated."""
        try:
            while True:
                event = await self._next_event(timeout=8.0)
                event_type = str(event.get("type") or "").strip()
                if event_type in {"session.updated", "session.open"}:
                    return
                if event_type == "error":
                    raise RuntimeError(self._format_error_event(event))
        except asyncio.TimeoutError:
            logger.debug("DashScope session update event timeout; continuing with active websocket")

    async def _clear_appended_text(self) -> None:
        """Clear provider-side buffered text; commit mode only, best-effort."""
        if self.mode != "commit":
            return
        if not self._client:
            return
        clear_fn = getattr(self._client, "clear_appended_text", None)
        if callable(clear_fn):
            try:
                await asyncio.to_thread(clear_fn)
            except Exception as exc:
                logger.debug(f"DashScope clear_appended_text failed: {exc}")

    async def _next_event(self, timeout: float) -> Dict[str, Any]:
        """Pop the next bridged event, normalizing non-dict payloads."""
        event = await asyncio.wait_for(self._event_queue.get(), timeout=timeout)
        if isinstance(event, dict):
            return event
        return {"type": "raw", "message": str(event)}

    def _drain_event_queue(self) -> None:
        """Discard all queued events (e.g. leftovers from a prior utterance)."""
        while True:
            try:
                self._event_queue.get_nowait()
            except asyncio.QueueEmpty:
                break

    def _decode_audio_event(self, event: Dict[str, Any]) -> bytes:
        """Extract PCM bytes from an audio event; returns b"" when absent/invalid."""
        event_type = str(event.get("type") or "")
        if event_type == "response.audio.delta.raw":
            # Raw bytes delivered via the SDK's on_data path.
            audio = event.get("audio")
            if isinstance(audio, (bytes, bytearray)):
                return bytes(audio)
            return b""
        delta = event.get("delta")
        if isinstance(delta, str):
            # Protocol events carry base64-encoded PCM.
            try:
                return base64.b64decode(delta)
            except Exception as exc:
                logger.warning(f"Failed to decode DashScope audio delta: {exc}")
                return b""
        if isinstance(delta, (bytes, bytearray)):
            return bytes(delta)
        return b""

    def _resample_if_needed(self, pcm: bytes, state: Any) -> Tuple[bytes, Any]:
        """Resample 24 kHz provider audio to the configured rate.

        `state` is audioop.ratecv's carry-over state threaded between calls
        so chunk boundaries stay seamless.

        NOTE(review): the stdlib `audioop` module is removed in Python 3.13 —
        confirm the supported interpreter range or vendor a replacement.
        """
        if self.sample_rate == self.PROVIDER_SAMPLE_RATE:
            return pcm, state
        try:
            converted, next_state = audioop.ratecv(
                pcm,
                2,  # 16-bit PCM
                1,  # mono
                self.PROVIDER_SAMPLE_RATE,
                self.sample_rate,
                state,
            )
            return converted, next_state
        except Exception as exc:
            # Degrade gracefully: callers still get audio, just at 24 kHz.
            logger.warning(f"DashScope audio resample failed: {exc}; returning original sample rate data")
            return pcm, state

    @staticmethod
    def _format_error_event(event: Dict[str, Any]) -> str:
        """Render a provider error event as a single diagnostic string."""
        err = event.get("error")
        if isinstance(err, dict):
            code = str(err.get("code") or "").strip()
            message = str(err.get("message") or "").strip()
            if code and message:
                return f"{code}: {message}"
            return message or str(err)
        return str(err or "DashScope realtime TTS error")

View File

@@ -0,0 +1,49 @@
"""TTS service implementations used by the engine runtime."""
import asyncio
from typing import AsyncIterator
from loguru import logger
from providers.common.base import BaseTTSService, TTSChunk, ServiceState
class MockTTSService(BaseTTSService):
    """Mock TTS service for tests and no-provider fallback.

    Produces silence whose duration scales with the word count (100 ms per
    word) instead of calling a real synthesis backend.
    """

    def __init__(
        self,
        voice: str = "mock",
        sample_rate: int = 16000,
        speed: float = 1.0,
    ):
        super().__init__(voice=voice, sample_rate=sample_rate, speed=speed)

    async def connect(self) -> None:
        self.state = ServiceState.CONNECTED
        logger.info("Mock TTS service connected")

    async def disconnect(self) -> None:
        self.state = ServiceState.DISCONNECTED
        logger.info("Mock TTS service disconnected")

    async def synthesize(self, text: str) -> bytes:
        """Generate silence based on text length."""
        duration_ms = 100 * len(text.split())  # 100 ms of audio per word
        sample_count = self.sample_rate * duration_ms // 1000
        return bytes(sample_count * 2)  # 16-bit PCM: two zero bytes per sample

    async def synthesize_stream(self, text: str) -> AsyncIterator[TTSChunk]:
        """Generate silence chunks to emulate streaming synthesis."""
        pcm = await self.synthesize(text)
        step = self.sample_rate * 2 // 10  # bytes per 100 ms of 16-bit mono
        for offset in range(0, len(pcm), step):
            yield TTSChunk(
                audio=pcm[offset : offset + step],
                sample_rate=self.sample_rate,
                is_final=offset + step >= len(pcm),
            )
            await asyncio.sleep(0.05)

View File

@@ -0,0 +1,352 @@
"""OpenAI-compatible TTS Service with streaming support.
Uses SiliconFlow's CosyVoice2 or MOSS-TTSD models for low-latency
text-to-speech synthesis with streaming.
API Docs: https://docs.siliconflow.cn/cn/api-reference/audio/create-speech
"""
import os
import asyncio
import aiohttp
from typing import AsyncIterator, Optional
from urllib.parse import urlparse, urlunparse
from loguru import logger
from providers.common.base import BaseTTSService, TTSChunk, ServiceState
from providers.tts.streaming_adapter import StreamingTTSAdapter # backward-compatible re-export
class OpenAICompatibleTTSService(BaseTTSService):
    """
    OpenAI-compatible TTS service with streaming support.
    Supports CosyVoice2-0.5B and MOSS-TTSD-v0.5 models.
    """

    # Available voices: short alias -> fully qualified "model:voice" id expected by the API.
    VOICES = {
        "alex": "FunAudioLLM/CosyVoice2-0.5B:alex",
        "anna": "FunAudioLLM/CosyVoice2-0.5B:anna",
        "bella": "FunAudioLLM/CosyVoice2-0.5B:bella",
        "benjamin": "FunAudioLLM/CosyVoice2-0.5B:benjamin",
        "charles": "FunAudioLLM/CosyVoice2-0.5B:charles",
        "claire": "FunAudioLLM/CosyVoice2-0.5B:claire",
        "david": "FunAudioLLM/CosyVoice2-0.5B:david",
        "diana": "FunAudioLLM/CosyVoice2-0.5B:diana",
    }

    def __init__(
        self,
        api_key: Optional[str] = None,
        api_url: Optional[str] = None,
        voice: str = "anna",
        model: str = "FunAudioLLM/CosyVoice2-0.5B",
        sample_rate: int = 16000,
        speed: float = 1.0
    ):
        """
        Initialize OpenAI-compatible TTS service.

        Args:
            api_key: Provider API key
            api_url: Provider API URL (defaults to SiliconFlow endpoint)
            voice: Voice name (alex, anna, bella, benjamin, charles, claire, david, diana)
            model: Model name
            sample_rate: Output sample rate (8000, 16000, 24000, 32000, 44100)
            speed: Speech speed (0.25 to 4.0)
        """
        # Resolve voice name (case-insensitive), and normalize "model:VoiceId" suffix.
        resolved_voice = (voice or "").strip()
        voice_lookup = resolved_voice.lower()
        if voice_lookup in self.VOICES:
            full_voice = self.VOICES[voice_lookup]
        elif ":" in resolved_voice:
            # Caller passed a qualified "model:voice" string; lower-case the voice part.
            model_part, voice_part = resolved_voice.split(":", 1)
            normalized_voice_part = voice_part.strip().lower()
            if normalized_voice_part in self.VOICES:
                # An empty model part falls back to the `model` argument.
                full_voice = f"{(model_part or model).strip()}:{normalized_voice_part}"
            else:
                # Unknown voice id: pass through verbatim and let the API validate it.
                full_voice = resolved_voice
        else:
            full_voice = resolved_voice
        super().__init__(voice=full_voice, sample_rate=sample_rate, speed=speed)
        self.api_key = api_key
        self.model = model
        # Precedence: explicit argument > TTS_API_URL env var > SiliconFlow default.
        raw_api_url = api_url or os.getenv("TTS_API_URL") or "https://api.siliconflow.cn/v1/audio/speech"
        self.api_url = self._resolve_speech_endpoint(raw_api_url)
        self._session: Optional[aiohttp.ClientSession] = None
        # Set by cancel() to abort an in-flight synthesize_stream() between chunks.
        self._cancel_event = asyncio.Event()

    @staticmethod
    def _resolve_speech_endpoint(api_url: str) -> str:
        """
        Accept either:
          - base URL: https://host/v1
          - full speech endpoint: https://host/v1/audio/speech
        and always return the final speech endpoint URL.
        """
        raw = str(api_url or "").strip()
        if not raw:
            # Nothing usable configured; fall back to the public default endpoint.
            return "https://api.siliconflow.cn/v1/audio/speech"
        parsed = urlparse(raw)
        path = (parsed.path or "").rstrip("/")
        if path.endswith("/audio/speech"):
            return raw
        if not path:
            new_path = "/audio/speech"
        else:
            new_path = f"{path}/audio/speech"
        # Preserve scheme/host/query while swapping in the speech path.
        return urlunparse(parsed._replace(path=new_path))

    async def connect(self) -> None:
        """Initialize HTTP session.

        Raises:
            ValueError: If no API key was configured.
        """
        if not self.api_key:
            raise ValueError("TTS API key not provided. Configure agent.tts.api_key in YAML.")
        # Auth and content-type headers are fixed for the session's lifetime.
        self._session = aiohttp.ClientSession(
            headers={
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json"
            }
        )
        self.state = ServiceState.CONNECTED
        logger.info(f"SiliconFlow TTS service ready: voice={self.voice}, model={self.model}")

    async def disconnect(self) -> None:
        """Close HTTP session."""
        if self._session:
            await self._session.close()
            self._session = None
        self.state = ServiceState.DISCONNECTED
        logger.info("SiliconFlow TTS service disconnected")

    async def synthesize(self, text: str) -> bytes:
        """Synthesize complete audio for text (non-streaming convenience wrapper)."""
        audio_data = b""
        async for chunk in self.synthesize_stream(text):
            audio_data += chunk.audio
        return audio_data

    async def synthesize_stream(self, text: str) -> AsyncIterator[TTSChunk]:
        """
        Synthesize audio in streaming mode.

        Args:
            text: Text to synthesize

        Yields:
            TTSChunk objects with PCM audio

        Raises:
            RuntimeError: If connect() has not been called.
        """
        if not self._session:
            raise RuntimeError("TTS service not connected")
        if not text.strip():
            # Nothing speakable; yield no chunks.
            return
        self._cancel_event.clear()
        payload = {
            "model": self.model,
            "input": text,
            "voice": self.voice,
            "response_format": "pcm",
            "sample_rate": self.sample_rate,
            "stream": True,
            "speed": self.speed
        }
        try:
            async with self._session.post(self.api_url, json=payload) as response:
                if response.status != 200:
                    # Best-effort: log and yield nothing rather than raising mid-turn.
                    error_text = await response.text()
                    logger.error(f"SiliconFlow TTS error: {response.status} - {error_text}")
                    return
                # Stream audio chunks
                chunk_size = self.sample_rate * 2 // 10  # 100ms chunks
                buffer = b""
                pending_chunk = None
                async for chunk in response.content.iter_any():
                    if self._cancel_event.is_set():
                        logger.info("TTS synthesis cancelled")
                        return
                    buffer += chunk
                    # Yield complete chunks
                    while len(buffer) >= chunk_size:
                        audio_chunk = buffer[:chunk_size]
                        buffer = buffer[chunk_size:]
                        # Keep one full chunk buffered so we can always tag the true
                        # last full chunk as final when stream length is an exact multiple.
                        if pending_chunk is not None:
                            yield TTSChunk(
                                audio=pending_chunk,
                                sample_rate=self.sample_rate,
                                is_final=False
                            )
                        pending_chunk = audio_chunk
                # Flush pending chunk(s) and remaining tail.
                if pending_chunk is not None:
                    if buffer:
                        # A partial tail remains, so the pending chunk is not the last one.
                        yield TTSChunk(
                            audio=pending_chunk,
                            sample_rate=self.sample_rate,
                            is_final=False
                        )
                        pending_chunk = None
                    else:
                        # Stream length was an exact multiple of chunk_size.
                        yield TTSChunk(
                            audio=pending_chunk,
                            sample_rate=self.sample_rate,
                            is_final=True
                        )
                        pending_chunk = None
                if buffer:
                    yield TTSChunk(
                        audio=buffer,
                        sample_rate=self.sample_rate,
                        is_final=True
                    )
        except asyncio.CancelledError:
            logger.info("TTS synthesis cancelled via asyncio")
            raise
        except Exception as e:
            logger.error(f"TTS synthesis error: {e}")
            raise

    async def cancel(self) -> None:
        """Cancel ongoing synthesis (takes effect between streamed chunks)."""
        self._cancel_event.set()
class StreamingTTSAdapter:
    """
    Adapter for streaming LLM text to TTS with sentence-level chunking.
    This reduces latency by starting TTS as soon as a complete sentence
    is received from the LLM, rather than waiting for the full response.

    NOTE(review): this local definition shadows the StreamingTTSAdapter
    imported from providers.tts.streaming_adapter at the top of this
    module -- confirm which implementation is intended to win.
    """

    # Sentence delimiters.
    # NOTE(review): some delimiter literals appear empty here -- they may be
    # CJK punctuation lost in transcoding; verify against the original file.
    SENTENCE_ENDS = {'', '', '', '', '.', '!', '?', '\n'}

    def __init__(self, tts_service: BaseTTSService, transport, session_id: str):
        self.tts_service = tts_service
        self.transport = transport
        self.session_id = session_id
        self._buffer = ""  # text accumulated but not yet spoken
        self._cancel_event = asyncio.Event()  # set on barge-in to stop speaking
        self._is_speaking = False

    def _is_non_sentence_period(self, text: str, idx: int) -> bool:
        """Check whether '.' should NOT be treated as a sentence delimiter."""
        if text[idx] != ".":
            return False
        # Decimal/version segment: 1.2, v1.2.3
        if idx > 0 and idx < len(text) - 1 and text[idx - 1].isdigit() and text[idx + 1].isdigit():
            return True
        # Number abbreviations: No.1 / No. 1
        left_start = idx - 1
        while left_start >= 0 and text[left_start].isalpha():
            left_start -= 1
        left_token = text[left_start + 1:idx].lower()
        if left_token == "no":
            # Skip whitespace after the period; a following digit means "No. 1" usage.
            j = idx + 1
            while j < len(text) and text[j].isspace():
                j += 1
            if j < len(text) and text[j].isdigit():
                return True
        return False

    async def process_text_chunk(self, text_chunk: str) -> None:
        """
        Process a text chunk from LLM and trigger TTS when sentence is complete.

        Args:
            text_chunk: Text chunk from LLM streaming
        """
        if self._cancel_event.is_set():
            # Turn was cancelled (e.g. barge-in); drop further text.
            return
        self._buffer += text_chunk
        # Check for sentence completion
        while True:
            split_idx = -1
            for i, char in enumerate(self._buffer):
                if char == "." and self._is_non_sentence_period(self._buffer, i):
                    continue
                if char in self.SENTENCE_ENDS:
                    split_idx = i
                    break
            if split_idx < 0:
                break
            # Consume any run of trailing delimiters (e.g. "?!", "...").
            end_idx = split_idx + 1
            while end_idx < len(self._buffer) and self._buffer[end_idx] in self.SENTENCE_ENDS:
                end_idx += 1
            sentence = self._buffer[:end_idx].strip()
            self._buffer = self._buffer[end_idx:]
            # Skip punctuation-only fragments; speak anything with real content.
            if sentence and any(ch.isalnum() for ch in sentence):
                await self._speak_sentence(sentence)

    async def flush(self) -> None:
        """Flush remaining buffer (speak the trailing partial sentence, if any)."""
        if self._buffer.strip() and not self._cancel_event.is_set():
            await self._speak_sentence(self._buffer.strip())
        self._buffer = ""

    async def _speak_sentence(self, text: str) -> None:
        """Synthesize and send a sentence."""
        if not text or self._cancel_event.is_set():
            return
        self._is_speaking = True
        try:
            async for chunk in self.tts_service.synthesize_stream(text):
                if self._cancel_event.is_set():
                    break
                await self.transport.send_audio(chunk.audio)
                await asyncio.sleep(0.01)  # Prevent flooding
        except Exception as e:
            # Best-effort: log and keep the pipeline alive for the next sentence.
            logger.error(f"TTS speak error: {e}")
        finally:
            self._is_speaking = False

    def cancel(self) -> None:
        """Cancel ongoing speech."""
        self._cancel_event.set()
        self._buffer = ""

    def reset(self) -> None:
        """Reset for new turn."""
        self._cancel_event.clear()
        self._buffer = ""
        self._is_speaking = False

    @property
    def is_speaking(self) -> bool:
        # True while _speak_sentence is actively streaming audio.
        return self._is_speaking
# Backward-compatible alias: keep the historical SiliconFlow-specific
# class name importable for existing call sites.
SiliconFlowTTSService = OpenAICompatibleTTSService

View File

@@ -0,0 +1,8 @@
"""Backward-compatible imports for legacy siliconflow_tts module."""
from providers.tts.openai_compatible import OpenAICompatibleTTSService, StreamingTTSAdapter
# Backward-compatible alias
SiliconFlowTTSService = OpenAICompatibleTTSService
__all__ = ["OpenAICompatibleTTSService", "SiliconFlowTTSService", "StreamingTTSAdapter"]

View File

@@ -0,0 +1,95 @@
"""Backend-agnostic streaming adapter from LLM text to TTS audio."""
import asyncio
from loguru import logger
from providers.common.base import BaseTTSService
from providers.common.streaming_text import extract_tts_sentence, has_spoken_content
class StreamingTTSAdapter:
"""
Adapter for streaming LLM text to TTS with sentence-level chunking.
This reduces latency by starting TTS as soon as a complete sentence
is received from the LLM, rather than waiting for the full response.
"""
SENTENCE_ENDS = {"", "", "", ".", "!", "?", "\n"}
SENTENCE_CLOSERS = frozenset()
def __init__(self, tts_service: BaseTTSService, transport, session_id: str):
    """Bind the adapter to a TTS backend, an audio transport, and a session id."""
    self.tts_service = tts_service
    self.transport = transport
    self.session_id = session_id
    # Mutable per-turn state: pending text, cancellation flag, speaking marker.
    self._is_speaking = False
    self._cancel_event = asyncio.Event()
    self._buffer = ""
async def process_text_chunk(self, text_chunk: str) -> None:
    """
    Process a text chunk from LLM and trigger TTS when sentence is complete.

    Args:
        text_chunk: Text chunk from LLM streaming
    """
    if self._cancel_event.is_set():
        # Turn was cancelled; ignore any further streamed text.
        return
    self._buffer += text_chunk
    # Drain every complete sentence currently sitting in the buffer.
    delimiters = frozenset(self.SENTENCE_ENDS)
    while True:
        extracted = extract_tts_sentence(
            self._buffer,
            end_chars=delimiters,
            trailing_chars=delimiters,
            closers=self.SENTENCE_CLOSERS,
            force=False,
        )
        if not extracted:
            break
        sentence, self._buffer = extracted
        if sentence and has_spoken_content(sentence):
            await self._speak_sentence(sentence)
async def flush(self) -> None:
    """Speak whatever partial text remains at the end of a turn."""
    leftover = self._buffer.strip()
    if leftover and not self._cancel_event.is_set():
        await self._speak_sentence(leftover)
    self._buffer = ""
async def _speak_sentence(self, text: str) -> None:
    """Stream one sentence through TTS and forward the audio to the transport."""
    if not text or self._cancel_event.is_set():
        return
    self._is_speaking = True
    try:
        async for piece in self.tts_service.synthesize_stream(text):
            if self._cancel_event.is_set():
                # Barge-in: stop forwarding mid-sentence.
                break
            await self.transport.send_audio(piece.audio)
            # Small pause between chunks to avoid flooding the transport.
            await asyncio.sleep(0.01)
    except Exception as e:
        # Best-effort: log and keep the pipeline alive for the next sentence.
        logger.error(f"TTS speak error: {e}")
    finally:
        self._is_speaking = False
def cancel(self) -> None:
    """Abort the current utterance and discard any pending text."""
    self._buffer = ""
    self._cancel_event.set()
def reset(self) -> None:
    """Clear all per-turn state so the adapter is ready for the next turn."""
    self._is_speaking = False
    self._buffer = ""
    self._cancel_event.clear()
@property
def is_speaking(self) -> bool:
    # True while _speak_sentence is actively streaming audio to the transport.
    return self._is_speaking