Refactor project structure and enhance backend integration

- Expanded package inclusion in `pyproject.toml` to support new modules.
- Introduced new `adapters` and `protocol` packages for better organization.
- Added backend adapter implementations for control plane integration.
- Updated main application imports to reflect new package structure.
- Removed deprecated core components and adjusted documentation accordingly.
- Enhanced architecture documentation to clarify the new runtime and integration layers.
This commit is contained in:
Xin Wang
2026-03-06 09:51:56 +08:00
parent 4e2450e800
commit 7e0b777923
75 changed files with 274 additions and 688 deletions

View File

@@ -0,0 +1 @@
"""Common provider types."""

View File

@@ -0,0 +1,253 @@
"""Base classes for AI services.
Defines abstract interfaces for ASR, LLM, and TTS services,
inspired by pipecat's service architecture and active-call's
StreamEngine pattern.
"""
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from typing import AsyncIterator, Optional, List, Dict, Any, Literal
from enum import Enum
class ServiceState(Enum):
    """Service connection state.

    Each member's value is the lowercase form of its name. The service base
    classes in this module initialise their ``state`` attribute to
    ``DISCONNECTED``; nothing in this module moves a service to another
    state, so transitions are presumably driven by concrete
    implementations.
    """

    DISCONNECTED = "disconnected"
    CONNECTING = "connecting"
    CONNECTED = "connected"
    ERROR = "error"
@dataclass
class ASRResult:
    """One transcription result produced by an ASR service.

    Only ``text`` is required. A result is treated as partial unless
    ``is_final`` is set, and ``confidence`` defaults to ``1.0``.
    """

    text: str
    is_final: bool = False
    confidence: float = 1.0
    language: Optional[str] = None
    # NOTE(review): units/origin of the two timestamps are not defined in
    # this module -- confirm with the producing ASR implementation.
    start_time: Optional[float] = None
    end_time: Optional[float] = None

    def __str__(self) -> str:
        """Render as ``[FINAL] <text>`` or ``[PARTIAL] <text>``."""
        if self.is_final:
            label = "FINAL"
        else:
            label = "PARTIAL"
        return "[{}] {}".format(label, self.text)
@dataclass
class LLMMessage:
    """A single message in an LLM conversation."""

    role: str  # "system", "user", "assistant", "function"
    content: str
    name: Optional[str] = None  # For function calls
    function_call: Optional[Dict[str, Any]] = None

    def to_dict(self) -> Dict[str, Any]:
        """Build the API-compatible dict form of this message.

        ``role`` and ``content`` are always present. ``name`` and
        ``function_call`` are added, in that order, only when they are
        truthy, so ``None``, an empty string, or an empty dict is left out.
        """
        payload: Dict[str, Any] = {"role": self.role, "content": self.content}
        extras = (
            ("name", self.name),
            ("function_call", self.function_call),
        )
        for key, value in extras:
            if value:
                payload[key] = value
        return payload
@dataclass
class LLMStreamEvent:
    """Structured LLM stream event.

    ``type`` is the only required field; ``text`` and ``tool_call`` are
    optional payloads that default to ``None``.
    """

    # Event discriminator: "text_delta", "tool_call" or "done".
    type: Literal["text_delta", "tool_call", "done"]
    # Presumably the text fragment carried by "text_delta" events -- confirm
    # against the producing LLM service.
    text: Optional[str] = None
    # Presumably the payload of "tool_call" events; its schema is not
    # defined in this module.
    tool_call: Optional[Dict[str, Any]] = None
@dataclass
class TTSChunk:
    """TTS audio chunk.

    ``audio`` is the only required field. The format fields default to
    16000 Hz, 1 channel and 16 bits per sample, and a chunk is not marked
    final unless ``is_final`` is set.
    """

    audio: bytes  # PCM audio data
    sample_rate: int = 16000
    channels: int = 1
    bits_per_sample: int = 16
    is_final: bool = False
    text_offset: Optional[int] = None  # Character offset in original text
class BaseASRService(ABC):
    """
    Abstract base class for ASR (Speech-to-Text) services.

    Supports both streaming and non-streaming transcription. Subclasses
    implement the connection lifecycle plus ``send_audio`` and
    ``receive_transcripts``; ``transcribe`` is a default one-shot helper
    built on those two streaming primitives.

    Attributes:
        sample_rate: Sample rate passed to the constructor (default 16000).
        language: Language passed to the constructor (default ``"en"``).
        state: Initialised to ``ServiceState.DISCONNECTED``. The base class
            never changes it afterwards.
    """

    def __init__(self, sample_rate: int = 16000, language: str = "en"):
        self.sample_rate = sample_rate
        self.language = language
        self.state = ServiceState.DISCONNECTED

    @abstractmethod
    async def connect(self) -> None:
        """Establish connection to ASR service."""

    @abstractmethod
    async def disconnect(self) -> None:
        """Close connection to ASR service."""

    @abstractmethod
    async def send_audio(self, audio: bytes) -> None:
        """
        Send audio chunk for transcription.

        Args:
            audio: PCM audio data (16-bit, mono)
        """

    @abstractmethod
    async def receive_transcripts(self) -> AsyncIterator[ASRResult]:
        """
        Receive transcription results.

        Implementations are async generators: calling the method must
        return an object usable directly in ``async for`` (this is how
        ``transcribe`` consumes it), not a coroutine to be awaited.

        Yields:
            ASRResult objects as they become available
        """
        # The unreachable ``yield`` turns this declaration into an async
        # generator function. Without it, ``async def`` would declare a
        # coroutine *returning* an iterator, which contradicts the
        # ``async for`` in ``transcribe`` and mistypes subclass overrides.
        if False:  # pragma: no cover
            yield ASRResult(text="")

    async def transcribe(self, audio: bytes) -> ASRResult:
        """
        Transcribe a complete audio buffer (non-streaming).

        Default implementation on top of the streaming API: the buffer is
        sent with ``send_audio`` and results are read from
        ``receive_transcripts`` until the first final one.

        Args:
            audio: Complete PCM audio data

        Returns:
            The first result whose ``is_final`` is true, or an empty final
            ASRResult if the stream ends without producing one.
        """
        await self.send_audio(audio)
        async for result in self.receive_transcripts():
            if result.is_final:
                return result
        return ASRResult(text="", is_final=True)
class BaseLLMService(ABC):
    """
    Abstract base class for LLM (Language Model) services.

    Supports streaming responses for real-time conversation. Subclasses
    implement the connection lifecycle, one-shot ``generate`` and
    streaming ``generate_stream``.

    Attributes:
        model: Model name passed to the constructor (default ``"gpt-4"``).
        state: Initialised to ``ServiceState.DISCONNECTED``. The base class
            never changes it afterwards.
    """

    def __init__(self, model: str = "gpt-4"):
        self.model = model
        self.state = ServiceState.DISCONNECTED

    @abstractmethod
    async def connect(self) -> None:
        """Initialize LLM service connection."""

    @abstractmethod
    async def disconnect(self) -> None:
        """Close LLM service connection."""

    @abstractmethod
    async def generate(
        self,
        messages: List[LLMMessage],
        temperature: float = 0.7,
        max_tokens: Optional[int] = None
    ) -> str:
        """
        Generate a complete response.

        Args:
            messages: Conversation history
            temperature: Sampling temperature
            max_tokens: Maximum tokens to generate

        Returns:
            Complete response text
        """

    @abstractmethod
    async def generate_stream(
        self,
        messages: List[LLMMessage],
        temperature: float = 0.7,
        max_tokens: Optional[int] = None
    ) -> AsyncIterator[LLMStreamEvent]:
        """
        Generate response in streaming mode.

        Implementations are async generators: calling the method must
        return an object usable directly in ``async for``, not a coroutine
        to be awaited.

        Args:
            messages: Conversation history
            temperature: Sampling temperature
            max_tokens: Maximum tokens to generate

        Yields:
            Stream events (text delta/tool call/done)
        """
        # The unreachable ``yield`` turns this declaration into an async
        # generator function. Without it, ``async def`` would declare a
        # coroutine *returning* an iterator, which does not match the
        # documented "Yields" contract or async-generator overrides.
        if False:  # pragma: no cover
            yield LLMStreamEvent(type="done")
class BaseTTSService(ABC):
    """
    Abstract base class for TTS (Text-to-Speech) services.

    Supports streaming audio synthesis for low-latency playback.
    Subclasses implement the connection lifecycle, one-shot ``synthesize``
    and streaming ``synthesize_stream``; ``cancel`` is an optional hook.

    Attributes:
        voice: Voice name passed to the constructor (default ``"default"``).
        sample_rate: Sample rate passed to the constructor (default 16000).
        speed: Speed factor passed to the constructor (default 1.0).
        state: Initialised to ``ServiceState.DISCONNECTED``. The base class
            never changes it afterwards.
    """

    def __init__(
        self,
        voice: str = "default",
        sample_rate: int = 16000,
        speed: float = 1.0
    ):
        self.voice = voice
        self.sample_rate = sample_rate
        self.speed = speed
        self.state = ServiceState.DISCONNECTED

    @abstractmethod
    async def connect(self) -> None:
        """Initialize TTS service connection."""

    @abstractmethod
    async def disconnect(self) -> None:
        """Close TTS service connection."""

    @abstractmethod
    async def synthesize(self, text: str) -> bytes:
        """
        Synthesize complete audio for text (non-streaming).

        Args:
            text: Text to synthesize

        Returns:
            Complete PCM audio data
        """

    @abstractmethod
    async def synthesize_stream(self, text: str) -> AsyncIterator[TTSChunk]:
        """
        Synthesize audio in streaming mode.

        Implementations are async generators: calling the method must
        return an object usable directly in ``async for``, not a coroutine
        to be awaited.

        Args:
            text: Text to synthesize

        Yields:
            TTSChunk objects as audio is generated
        """
        # The unreachable ``yield`` turns this declaration into an async
        # generator function. Without it, ``async def`` would declare a
        # coroutine *returning* an iterator, which does not match the
        # documented "Yields" contract or async-generator overrides.
        if False:  # pragma: no cover
            yield TTSChunk(audio=b"")

    async def cancel(self) -> None:
        """Cancel ongoing synthesis (for barge-in support).

        The base implementation does nothing; it is not abstract, so
        subclasses override it only if they support cancellation.
        """

View File

@@ -0,0 +1,86 @@
"""Shared text chunking helpers for streaming TTS."""
from typing import Optional
def is_non_sentence_period(text: str, idx: int) -> bool:
    """Tell whether the '.' at ``text[idx]`` is NOT a sentence delimiter.

    Two patterns are recognised:

    * a period with a digit on both sides (``1.2``, ``v1.2.3``);
    * the abbreviation "no" (any case) directly before the period, followed
      by optional whitespace and then a digit (``No.1``, ``No. 1``).

    Returns ``False`` when ``idx`` is out of range or does not point at a
    period.
    """
    size = len(text)
    if not (0 <= idx < size and text[idx] == "."):
        return False

    after = idx + 1

    # Decimal / version segment: digits on both sides of the period.
    if idx >= 1 and after < size and text[idx - 1].isdigit() and text[after].isdigit():
        return True

    # Walk left to the start of the run of letters that ends at the period.
    word_start = idx
    while word_start > 0 and text[word_start - 1].isalpha():
        word_start -= 1
    if text[word_start:idx].lower() != "no":
        return False

    # "No." only counts as an abbreviation when a number follows it.
    while after < size and text[after].isspace():
        after += 1
    return after < size and text[after].isdigit()
def has_spoken_content(text: str) -> bool:
    """Report whether ``text`` has at least one alphanumeric character.

    Text made up only of punctuation and/or whitespace (or empty text)
    yields ``False``.
    """
    for symbol in text:
        if symbol.isalnum():
            return True
    return False
def extract_tts_sentence(
    text_buffer: str,
    *,
    end_chars: frozenset[str],
    trailing_chars: frozenset[str],
    closers: frozenset[str],
    min_split_spoken_chars: int = 0,
    hold_trailing_at_buffer_end: bool = False,
    force: bool = False,
) -> Optional[tuple[str, str]]:
    """Split the first complete sentence off the front of ``text_buffer``.

    Args:
        text_buffer: Accumulated text to scan.
        end_chars: Characters that terminate a sentence. A '.' for which
            ``is_non_sentence_period`` returns true is skipped.
        trailing_chars: Characters absorbed into the sentence when they
            directly follow the terminator.
        closers: Characters absorbed after the trailing run.
        min_split_spoken_chars: When positive, a candidate with at least
            one but fewer than this many alphanumeric characters is not
            split off if more text follows it; the scan continues to the
            next terminator and the candidate grows.
        hold_trailing_at_buffer_end: When true, a candidate that reaches
            the very end of the buffer is not returned.
        force: Disables both the hold-at-end and the minimum-length rule.
            A terminator is still required.

    Returns:
        ``(sentence, remainder)``, where ``sentence`` is whitespace-stripped
        and ``remainder`` is the untouched text after the split point, or
        ``None`` if the buffer is empty, has no usable terminator, or the
        candidate is being held back.
    """
    if not text_buffer:
        return None

    total = len(text_buffer)
    scan_from = 0
    while True:
        # Find the next real terminator; give up if none is left.
        for boundary in range(scan_from, total):
            symbol = text_buffer[boundary]
            if symbol == "." and is_non_sentence_period(text_buffer, boundary):
                continue
            if symbol in end_chars:
                break
        else:
            return None

        # Absorb the trailing run first, then the closers run.
        cut = boundary + 1
        for absorbable in (trailing_chars, closers):
            while cut < total and text_buffer[cut] in absorbable:
                cut += 1

        reaches_end = cut >= total
        if reaches_end and hold_trailing_at_buffer_end and not force:
            return None

        sentence = text_buffer[:cut].strip()
        if not force and min_split_spoken_chars > 0 and not reaches_end:
            spoken = sum(map(str.isalnum, sentence))
            if 0 < spoken < min_split_spoken_chars:
                # Too short to speak alone: extend to the next terminator.
                scan_from = cut
                continue

        return sentence, text_buffer[cut:]