Refactor backend integration and service architecture

- Removed the backend client compatibility wrapper and its associated methods to streamline backend integration.
- Updated session management to use control-plane gateways and runtime configuration providers.
- Reworked the TTS service implementations, removing the EdgeTTS service and simplifying service dependencies.
- Updated documentation to reflect the new backend integration and service architecture.
- Removed deprecated TTS provider options from the configuration files and clarified the available settings.
Author: Xin Wang
Date:   2026-03-06 09:00:43 +08:00
parent 6b589a1b7c
commit 4e2450e800

22 changed files with 632 additions and 452 deletions
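In short, provider selection moves out of `DuplexPipeline` and behind a `RealtimeServiceFactory` port. A minimal wiring sketch of the intended composition (not part of the diff; `transport` stands in for any `BaseTransport` instance, and module paths follow the imports visible below):

```python
# Sketch only - illustrates the intended composition after this commit.
from app.service_factory import DefaultRealtimeServiceFactory
from core.duplex_pipeline import DuplexPipeline

def build_pipeline(transport, session_id: str) -> DuplexPipeline:
    # Concrete ASR/LLM/TTS classes are no longer imported here; the
    # pipeline resolves them in start() through the injected factory
    # and the frozen *ServiceSpec dataclasses introduced below.
    return DuplexPipeline(
        transport=transport,
        session_id=session_id,
        service_factory=DefaultRealtimeServiceFactory(),
    )
```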

engine/core/duplex_pipeline.py

@@ -26,21 +26,25 @@ import aiohttp
 from loguru import logger
 from app.config import settings
+from app.service_factory import DefaultRealtimeServiceFactory
 from core.conversation import ConversationManager, ConversationState
 from core.events import get_event_bus
+from core.ports import (
+    ASRPort,
+    ASRServiceSpec,
+    LLMPort,
+    LLMServiceSpec,
+    RealtimeServiceFactory,
+    TTSPort,
+    TTSServiceSpec,
+)
 from core.tool_executor import execute_server_tool
 from core.transports import BaseTransport
 from models.ws_v1 import ev
 from processors.eou import EouDetector
 from processors.vad import SileroVAD, VADProcessor
-from services.asr import BufferedASRService
-from services.base import BaseASRService, BaseLLMService, BaseTTSService, LLMMessage, LLMStreamEvent
-from services.dashscope_tts import DashScopeTTSService
-from services.llm import MockLLMService, OpenAILLMService
-from services.openai_compatible_asr import OpenAICompatibleASRService
-from services.openai_compatible_tts import OpenAICompatibleTTSService
+from services.base import LLMMessage, LLMStreamEvent
 from services.streaming_text import extract_tts_sentence, has_spoken_content
-from services.tts import EdgeTTSService, MockTTSService

 class DuplexPipeline:
@@ -258,9 +262,9 @@ class DuplexPipeline:
         self,
         transport: BaseTransport,
         session_id: str,
-        llm_service: Optional[BaseLLMService] = None,
-        tts_service: Optional[BaseTTSService] = None,
-        asr_service: Optional[BaseASRService] = None,
+        llm_service: Optional[LLMPort] = None,
+        tts_service: Optional[TTSPort] = None,
+        asr_service: Optional[ASRPort] = None,
         system_prompt: Optional[str] = None,
         greeting: Optional[str] = None,
         knowledge_searcher: Optional[
@@ -272,6 +276,7 @@ class DuplexPipeline:
         server_tool_executor: Optional[
             Callable[[Dict[str, Any]], Awaitable[Dict[str, Any]]]
         ] = None,
+        service_factory: Optional[RealtimeServiceFactory] = None,
     ):
         """
         Initialize duplex pipeline.
@@ -279,8 +284,8 @@ class DuplexPipeline:
         Args:
             transport: Transport for sending audio/events
             session_id: Session identifier
-            llm_service: LLM service (defaults to OpenAI)
-            tts_service: TTS service (defaults to EdgeTTS)
+            llm_service: Optional injected LLM port implementation
+            tts_service: Optional injected TTS port implementation
             asr_service: ASR service (optional)
             system_prompt: System prompt for LLM
             greeting: Optional greeting to speak on start
@@ -312,6 +317,7 @@ class DuplexPipeline:
         self.llm_service = llm_service
         self.tts_service = tts_service
         self.asr_service = asr_service  # Will be initialized in start()
+        self._service_factory = service_factory or DefaultRealtimeServiceFactory()
         self._knowledge_searcher = knowledge_searcher
         self._tool_resource_resolver = tool_resource_resolver
         self._server_tool_executor = server_tool_executor
@@ -776,21 +782,11 @@ class DuplexPipeline:
             return False
         return None

-    @staticmethod
-    def _is_openai_compatible_provider(provider: Any) -> bool:
-        normalized = str(provider or "").strip().lower()
-        return normalized in {"openai_compatible", "openai-compatible", "siliconflow"}
-
-    @staticmethod
-    def _is_dashscope_tts_provider(provider: Any) -> bool:
-        normalized = str(provider or "").strip().lower()
-        return normalized == "dashscope"
-
-    @staticmethod
-    def _is_llm_provider_supported(provider: Any) -> bool:
-        normalized = str(provider or "").strip().lower()
-        return normalized in {"openai", "openai_compatible", "openai-compatible", "siliconflow"}

     @staticmethod
     def _default_llm_base_url(provider: Any) -> Optional[str]:
         normalized = str(provider or "").strip().lower()
@@ -798,10 +794,6 @@ class DuplexPipeline:
             return "https://api.siliconflow.cn/v1"
         return None

-    @staticmethod
-    def _default_dashscope_tts_realtime_url() -> str:
-        return "wss://dashscope.aliyuncs.com/api-ws/v1/realtime"
-
-    @staticmethod
-    def _default_dashscope_tts_model() -> str:
-        return "qwen3-tts-flash-realtime"
@@ -900,18 +892,18 @@ class DuplexPipeline:
             or self._default_llm_base_url(llm_provider)
         )
         llm_model = self._runtime_llm.get("model") or settings.llm_model
-        if self._is_llm_provider_supported(llm_provider) and llm_api_key:
-            self.llm_service = OpenAILLMService(
-                api_key=llm_api_key,
-                base_url=llm_base_url,
-                model=llm_model,
+        self.llm_service = self._service_factory.create_llm_service(
+            LLMServiceSpec(
+                provider=llm_provider,
+                model=str(llm_model),
+                api_key=str(llm_api_key).strip() if llm_api_key else None,
+                base_url=str(llm_base_url).strip() if llm_base_url else None,
                 system_prompt=self.conversation.system_prompt,
                 temperature=settings.llm_temperature,
                 knowledge_config=self._resolved_knowledge_config(),
                 knowledge_searcher=self._knowledge_searcher,
             )
-        else:
-            logger.warning("LLM provider unsupported or API key missing - using mock LLM")
-            self.llm_service = MockLLMService()
+        )
         if hasattr(self.llm_service, "set_knowledge_config"):
             self.llm_service.set_knowledge_config(self._resolved_knowledge_config())
@@ -938,41 +930,29 @@ class DuplexPipeline:
                 "services.tts.mode is DashScope-only and will be ignored "
                 f"for provider={tts_provider}"
             )
-        if self._is_dashscope_tts_provider(tts_provider) and tts_api_key:
-            self.tts_service = DashScopeTTSService(
-                api_key=tts_api_key,
-                api_url=tts_api_url or self._default_dashscope_tts_realtime_url(),
-                voice=tts_voice,
-                model=tts_model or self._default_dashscope_tts_model(),
+        self.tts_service = self._service_factory.create_tts_service(
+            TTSServiceSpec(
+                provider=tts_provider,
+                api_key=str(tts_api_key).strip() if tts_api_key else None,
+                api_url=str(tts_api_url).strip() if tts_api_url else None,
+                voice=str(tts_voice),
+                model=str(tts_model).strip() if tts_model else None,
+                sample_rate=settings.sample_rate,
+                speed=tts_speed,
+                mode=str(tts_mode),
-                sample_rate=settings.sample_rate,
-                speed=tts_speed
             )
-            logger.info("Using DashScope realtime TTS service")
-        elif self._is_openai_compatible_provider(tts_provider) and tts_api_key:
-            self.tts_service = OpenAICompatibleTTSService(
-                api_key=tts_api_key,
-                api_url=tts_api_url,
-                voice=tts_voice,
-                model=tts_model or "FunAudioLLM/CosyVoice2-0.5B",
-                sample_rate=settings.sample_rate,
-                speed=tts_speed
-            )
-            logger.info(f"Using OpenAI-compatible TTS service (provider={tts_provider})")
-        else:
-            self.tts_service = EdgeTTSService(
-                voice=tts_voice,
-                sample_rate=settings.sample_rate
-            )
-            logger.info("Using Edge TTS service")
+        )
         try:
             await self.tts_service.connect()
         except Exception as e:
-            logger.warning(f"TTS backend unavailable ({e}); falling back to MockTTS")
-            self.tts_service = MockTTSService(
-                sample_rate=settings.sample_rate
+            logger.warning(f"TTS backend unavailable ({e}); falling back to default TTS adapter")
+            self.tts_service = self._service_factory.create_tts_service(
+                TTSServiceSpec(
+                    provider="mock",
+                    voice="mock",
+                    sample_rate=settings.sample_rate,
+                )
             )
             await self.tts_service.connect()
         else:
@@ -988,22 +968,19 @@ class DuplexPipeline:
         asr_interim_interval = int(self._runtime_asr.get("interimIntervalMs") or settings.asr_interim_interval_ms)
         asr_min_audio_ms = int(self._runtime_asr.get("minAudioMs") or settings.asr_min_audio_ms)
-        if self._is_openai_compatible_provider(asr_provider) and asr_api_key:
-            self.asr_service = OpenAICompatibleASRService(
-                api_key=asr_api_key,
-                api_url=asr_api_url,
-                model=asr_model or "FunAudioLLM/SenseVoiceSmall",
+        self.asr_service = self._service_factory.create_asr_service(
+            ASRServiceSpec(
+                provider=asr_provider,
                 sample_rate=settings.sample_rate,
                 language="auto",
+                api_key=str(asr_api_key).strip() if asr_api_key else None,
+                api_url=str(asr_api_url).strip() if asr_api_url else None,
+                model=str(asr_model).strip() if asr_model else None,
                 interim_interval_ms=asr_interim_interval,
                 min_audio_for_interim_ms=asr_min_audio_ms,
-                on_transcript=self._on_transcript_callback
+                on_transcript=self._on_transcript_callback,
             )
-            logger.info(f"Using OpenAI-compatible ASR service (provider={asr_provider})")
-        else:
-            self.asr_service = BufferedASRService(
-                sample_rate=settings.sample_rate
-            )
-            logger.info("Using Buffered ASR service (no real transcription)")
+        )
         await self.asr_service.connect()

engine/core/history_bridge.py

@@ -5,10 +5,12 @@ from __future__ import annotations
 import asyncio
 import time
 from dataclasses import dataclass
-from typing import Any, Optional
+from typing import Optional
 from loguru import logger
+from core.ports import ConversationHistoryStore

 @dataclass
 class _HistoryTranscriptJob:
@@ -29,7 +31,7 @@ class SessionHistoryBridge:
     def __init__(
         self,
         *,
-        history_writer: Any,
+        history_writer: ConversationHistoryStore | None,
         enabled: bool,
         queue_max_size: int,
         retry_max_attempts: int,

engine/core/ports/__init__.py

@@ -1,17 +1,32 @@
 """Port interfaces for engine-side integration boundaries."""
-from core.ports.backend import (
-    AssistantConfigProvider,
-    BackendGateway,
-    HistoryWriter,
-    KnowledgeSearcher,
-    ToolResourceResolver,
+from core.ports.asr import ASRBufferControl, ASRInterimControl, ASRPort, ASRServiceSpec
+from core.ports.control_plane import (
+    AssistantRuntimeConfigProvider,
+    ControlPlaneGateway,
+    ConversationHistoryStore,
+    KnowledgeRetriever,
+    ToolCatalog,
 )
+from core.ports.llm import LLMCancellable, LLMPort, LLMRuntimeConfigurable, LLMServiceSpec
+from core.ports.service_factory import RealtimeServiceFactory
+from core.ports.tts import TTSPort, TTSServiceSpec

 __all__ = [
-    "AssistantConfigProvider",
-    "BackendGateway",
-    "HistoryWriter",
-    "KnowledgeSearcher",
-    "ToolResourceResolver",
+    "ASRPort",
+    "ASRServiceSpec",
+    "ASRInterimControl",
+    "ASRBufferControl",
+    "AssistantRuntimeConfigProvider",
+    "ControlPlaneGateway",
+    "ConversationHistoryStore",
+    "KnowledgeRetriever",
+    "ToolCatalog",
+    "LLMCancellable",
+    "LLMPort",
+    "LLMRuntimeConfigurable",
+    "LLMServiceSpec",
+    "RealtimeServiceFactory",
+    "TTSPort",
+    "TTSServiceSpec",
 ]

engine/core/ports/asr.py (new file)

@@ -0,0 +1,64 @@
"""ASR extension port contracts."""

from __future__ import annotations

from dataclasses import dataclass
from typing import AsyncIterator, Awaitable, Callable, Optional, Protocol

from services.base import ASRResult

TranscriptCallback = Callable[[str, bool], Awaitable[None]]


@dataclass(frozen=True)
class ASRServiceSpec:
    """Resolved runtime configuration for ASR service creation."""

    provider: str
    sample_rate: int
    language: str = "auto"
    api_key: Optional[str] = None
    api_url: Optional[str] = None
    model: Optional[str] = None
    interim_interval_ms: int = 500
    min_audio_for_interim_ms: int = 300
    on_transcript: Optional[TranscriptCallback] = None


class ASRPort(Protocol):
    """Port for speech recognition providers."""

    async def connect(self) -> None:
        """Establish connection to ASR provider."""

    async def disconnect(self) -> None:
        """Release ASR resources."""

    async def send_audio(self, audio: bytes) -> None:
        """Push one PCM audio chunk for recognition."""

    async def receive_transcripts(self) -> AsyncIterator[ASRResult]:
        """Stream partial/final recognition results."""


class ASRInterimControl(Protocol):
    """Optional extension for explicit interim transcription control."""

    async def start_interim_transcription(self) -> None:
        """Start interim transcription loop if supported."""

    async def stop_interim_transcription(self) -> None:
        """Stop interim transcription loop if supported."""


class ASRBufferControl(Protocol):
    """Optional extension for explicit ASR buffer lifecycle control."""

    def clear_buffer(self) -> None:
        """Clear provider-side ASR buffer."""

    async def get_final_transcription(self) -> str:
        """Return final transcription for the current utterance."""

    def get_and_clear_text(self) -> str:
        """Return buffered text and clear internal state."""

engine/core/ports/control_plane.py

@@ -1,7 +1,7 @@
-"""Backend integration ports.
+"""Control-plane integration ports.

 These interfaces define the boundary between engine runtime logic and
-backend-side capabilities (config lookup, history persistence, retrieval,
+control-plane capabilities (config lookup, history persistence, retrieval,
 and tool resource discovery).
 """
@@ -10,14 +10,14 @@ from __future__ import annotations
 from typing import Any, Dict, List, Optional, Protocol

-class AssistantConfigProvider(Protocol):
+class AssistantRuntimeConfigProvider(Protocol):
     """Port for loading trusted assistant runtime configuration."""

     async def fetch_assistant_config(self, assistant_id: str) -> Optional[Dict[str, Any]]:
         """Fetch assistant configuration payload."""

-class HistoryWriter(Protocol):
+class ConversationHistoryStore(Protocol):
     """Port for persisting call and transcript history."""

     async def create_call_record(
@@ -27,7 +27,7 @@ class HistoryWriter:
         assistant_id: Optional[str],
         source: str = "debug",
     ) -> Optional[str]:
-        """Create a call record and return backend call ID."""
+        """Create a call record and return control-plane call ID."""

     async def add_transcript(
         self,
@@ -53,7 +53,7 @@ class HistoryWriter:
         """Finalize a call record."""

-class KnowledgeSearcher(Protocol):
+class KnowledgeRetriever(Protocol):
     """Port for RAG / knowledge retrieval operations."""

     async def search_knowledge_context(
@@ -66,19 +66,18 @@ class KnowledgeSearcher:
         """Search a knowledge source and return ranked snippets."""

-class ToolResourceResolver(Protocol):
+class ToolCatalog(Protocol):
     """Port for resolving tool metadata/configuration."""

     async def fetch_tool_resource(self, tool_id: str) -> Optional[Dict[str, Any]]:
         """Fetch tool resource configuration."""

-class BackendGateway(
-    AssistantConfigProvider,
-    HistoryWriter,
-    KnowledgeSearcher,
-    ToolResourceResolver,
+class ControlPlaneGateway(
+    AssistantRuntimeConfigProvider,
+    ConversationHistoryStore,
+    KnowledgeRetriever,
+    ToolCatalog,
     Protocol,
 ):
-    """Composite backend gateway interface used by engine services."""
+    """Composite control-plane gateway used by engine services."""

engine/core/ports/llm.py (new file)

@@ -0,0 +1,67 @@
"""LLM extension port contracts."""

from __future__ import annotations

from dataclasses import dataclass, field
from typing import Any, AsyncIterator, Awaitable, Callable, Dict, List, Optional, Protocol

from services.base import LLMMessage, LLMStreamEvent

KnowledgeRetrieverFn = Callable[..., Awaitable[List[Dict[str, Any]]]]


@dataclass(frozen=True)
class LLMServiceSpec:
    """Resolved runtime configuration for LLM service creation."""

    provider: str
    model: str
    api_key: Optional[str] = None
    base_url: Optional[str] = None
    system_prompt: Optional[str] = None
    temperature: float = 0.7
    knowledge_config: Dict[str, Any] = field(default_factory=dict)
    knowledge_searcher: Optional[KnowledgeRetrieverFn] = None


class LLMPort(Protocol):
    """Port for LLM providers."""

    async def connect(self) -> None:
        """Establish connection to LLM provider."""

    async def disconnect(self) -> None:
        """Release LLM resources."""

    async def generate(
        self,
        messages: List[LLMMessage],
        temperature: float = 0.7,
        max_tokens: Optional[int] = None,
    ) -> str:
        """Generate a complete assistant response."""

    async def generate_stream(
        self,
        messages: List[LLMMessage],
        temperature: float = 0.7,
        max_tokens: Optional[int] = None,
    ) -> AsyncIterator[LLMStreamEvent]:
        """Generate streaming assistant response events."""


class LLMCancellable(Protocol):
    """Optional extension for interrupting in-flight LLM generation."""

    def cancel(self) -> None:
        """Cancel an in-flight generation request."""


class LLMRuntimeConfigurable(Protocol):
    """Optional extension for runtime config updates."""

    def set_knowledge_config(self, config: Optional[Dict[str, Any]]) -> None:
        """Apply runtime knowledge retrieval settings."""

    def set_tool_schemas(self, schemas: Optional[List[Dict[str, Any]]]) -> None:
        """Apply runtime tool schemas used for tool calling."""

engine/core/ports/service_factory.py (new file)

@@ -0,0 +1,22 @@
"""Factory port for creating runtime ASR/LLM/TTS services."""

from __future__ import annotations

from typing import Protocol

from core.ports.asr import ASRPort, ASRServiceSpec
from core.ports.llm import LLMPort, LLMServiceSpec
from core.ports.tts import TTSPort, TTSServiceSpec


class RealtimeServiceFactory(Protocol):
    """Port for provider-specific service construction."""

    def create_llm_service(self, spec: LLMServiceSpec) -> LLMPort:
        """Create an LLM service instance from a resolved spec."""

    def create_tts_service(self, spec: TTSServiceSpec) -> TTSPort:
        """Create a TTS service instance from a resolved spec."""

    def create_asr_service(self, spec: ASRServiceSpec) -> ASRPort:
        """Create an ASR service instance from a resolved spec."""

engine/core/ports/tts.py (new file)

@@ -0,0 +1,41 @@
"""TTS extension port contracts."""

from __future__ import annotations

from dataclasses import dataclass
from typing import AsyncIterator, Optional, Protocol

from services.base import TTSChunk


@dataclass(frozen=True)
class TTSServiceSpec:
    """Resolved runtime configuration for TTS service creation."""

    provider: str
    voice: str
    sample_rate: int
    speed: float = 1.0
    api_key: Optional[str] = None
    api_url: Optional[str] = None
    model: Optional[str] = None
    mode: str = "commit"


class TTSPort(Protocol):
    """Port for speech synthesis providers."""

    async def connect(self) -> None:
        """Establish connection to TTS provider."""

    async def disconnect(self) -> None:
        """Release TTS resources."""

    async def synthesize(self, text: str) -> bytes:
        """Synthesize complete PCM payload for text."""

    async def synthesize_stream(self, text: str) -> AsyncIterator[TTSChunk]:
        """Stream synthesized PCM chunks for text."""

    async def cancel(self) -> None:
        """Cancel an in-flight synthesis request."""


@@ -11,6 +11,13 @@ from loguru import logger
 from app.backend_adapters import build_backend_adapter_from_settings
 from core.transports import BaseTransport
+from core.ports import (
+    AssistantRuntimeConfigProvider,
+    ControlPlaneGateway,
+    ConversationHistoryStore,
+    KnowledgeRetriever,
+    ToolCatalog,
+)
 from core.duplex_pipeline import DuplexPipeline
 from core.conversation import ConversationTurn
 from core.history_bridge import SessionHistoryBridge
@@ -97,7 +104,11 @@ class Session:
         session_id: str,
         transport: BaseTransport,
         use_duplex: bool = None,
-        backend_gateway: Optional[Any] = None,
+        control_plane_gateway: Optional[ControlPlaneGateway] = None,
+        runtime_config_provider: Optional[AssistantRuntimeConfigProvider] = None,
+        history_store: Optional[ConversationHistoryStore] = None,
+        knowledge_retriever: Optional[KnowledgeRetriever] = None,
+        tool_catalog: Optional[ToolCatalog] = None,
         assistant_id: Optional[str] = None,
     ):
         """
@@ -107,15 +118,24 @@ class Session:
             session_id: Unique session identifier
             transport: Transport instance for communication
             use_duplex: Whether to use duplex pipeline (defaults to settings.duplex_enabled)
+            control_plane_gateway: Optional composite control-plane dependency
+            runtime_config_provider: Optional assistant runtime config provider
+            history_store: Optional conversation history store
+            knowledge_retriever: Optional knowledge retrieval dependency
+            tool_catalog: Optional tool resource catalog
         """
         self.id = session_id
         self.transport = transport
         self.use_duplex = use_duplex if use_duplex is not None else settings.duplex_enabled
         self.audio_frame_bytes = self._compute_audio_frame_bytes()
         self._assistant_id = str(assistant_id or "").strip() or None
-        self._backend_gateway = backend_gateway or build_backend_adapter_from_settings()
+        self._control_plane_gateway = control_plane_gateway or build_backend_adapter_from_settings()
+        self._runtime_config_provider = runtime_config_provider or self._control_plane_gateway
+        self._history_store = history_store or self._control_plane_gateway
+        self._knowledge_retriever = knowledge_retriever or self._control_plane_gateway
+        self._tool_catalog = tool_catalog or self._control_plane_gateway
         self._history_bridge = SessionHistoryBridge(
-            history_writer=self._backend_gateway,
+            history_writer=self._history_store,
             enabled=settings.history_enabled,
             queue_max_size=settings.history_queue_max_size,
             retry_max_attempts=settings.history_retry_max_attempts,
@@ -128,8 +148,8 @@ class Session:
             session_id=session_id,
             system_prompt=settings.duplex_system_prompt,
             greeting=settings.duplex_greeting,
-            knowledge_searcher=getattr(self._backend_gateway, "search_knowledge_context", None),
-            tool_resource_resolver=getattr(self._backend_gateway, "fetch_tool_resource", None),
+            knowledge_searcher=getattr(self._knowledge_retriever, "search_knowledge_context", None),
+            tool_resource_resolver=getattr(self._tool_catalog, "fetch_tool_resource", None),
         )

         # Session state
@@ -935,18 +955,18 @@ class Session:
         self,
         assistant_id: str,
     ) -> tuple[Dict[str, Any], Optional[Dict[str, str]]]:
-        """Load trusted runtime metadata from backend assistant config."""
+        """Load trusted runtime metadata from control-plane assistant config."""
         if not assistant_id:
             return {}, {
                 "code": "protocol.assistant_id_required",
                 "message": "Missing required query parameter assistant_id",
             }

-        provider = getattr(self._backend_gateway, "fetch_assistant_config", None)
+        provider = getattr(self._runtime_config_provider, "fetch_assistant_config", None)
         if not callable(provider):
             return {}, {
                 "code": "assistant.config_unavailable",
-                "message": "Assistant config backend unavailable",
+                "message": "Assistant config control plane unavailable",
             }

         payload = await provider(str(assistant_id).strip())