Refactor project structure and enhance backend integration

- Expanded package inclusion in `pyproject.toml` to support new modules.
- Introduced new `adapters` and `protocol` packages for better organization.
- Added backend adapter implementations for control plane integration.
- Updated main application imports to reflect new package structure.
- Removed deprecated core components and adjusted documentation accordingly.
- Enhanced architecture documentation to clarify the new runtime and integration layers.
2026-03-06 09:51:56 +08:00
parent 4e2450e800
commit 7e0b777923
75 changed files with 274 additions and 688 deletions
--- a/engine/providers/common/__init__.py
+++ b/engine/providers/common/__init__.py
@@ -0,0 +1 @@
+"""Common provider types."""
--- a/engine/providers/common/base.py
+++ b/engine/providers/common/base.py
@@ -0,0 +1,253 @@
+"""Base classes for AI services.
+
+Defines abstract interfaces for ASR, LLM, and TTS services,
+inspired by pipecat's service architecture and active-call's
+StreamEngine pattern.
+"""
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from typing import AsyncIterator, Optional, List, Dict, Any, Literal
+from enum import Enum
+
+
+class ServiceState(Enum):
+    """Service connection state.
+
+    The ASR, LLM and TTS base classes in this module each set their
+    ``state`` attribute to ``DISCONNECTED`` in ``__init__``.
+    """
+    # Each member's value is the lowercase spelling of its name.
+    DISCONNECTED = "disconnected"
+    CONNECTING = "connecting"
+    CONNECTED = "connected"
+    ERROR = "error"
+
+
+@dataclass
+class ASRResult:
+    """One transcription result produced by an ASR service.
+
+    Only ``text`` is required; every other field has a default.
+    """
+    text: str
+    is_final: bool = False  # False marks a partial (interim) result
+    confidence: float = 1.0
+    language: Optional[str] = None
+    start_time: Optional[float] = None
+    end_time: Optional[float] = None
+
+    def __str__(self) -> str:
+        """Return the text prefixed with ``[FINAL]`` or ``[PARTIAL]``."""
+        if self.is_final:
+            label = "FINAL"
+        else:
+            label = "PARTIAL"
+        return f"[{label}] {self.text}"
+
+
+@dataclass
+class LLMMessage:
+    """A single message in an LLM conversation."""
+    role: str  # "system", "user", "assistant", "function"
+    content: str
+    name: Optional[str] = None  # For function calls
+    function_call: Optional[Dict[str, Any]] = None
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Build the API-compatible dict form of this message.
+
+        ``role`` and ``content`` are always present. ``name`` and
+        ``function_call`` are added, in that order, only when truthy.
+        """
+        payload: Dict[str, Any] = {"role": self.role, "content": self.content}
+        optional_fields = (
+            ("name", self.name),
+            ("function_call", self.function_call),
+        )
+        for key, value in optional_fields:
+            if value:
+                payload[key] = value
+        return payload
+
+
+@dataclass
+class LLMStreamEvent:
+    """Structured LLM stream event.
+
+    ``type`` selects the event kind; ``text`` and ``tool_call`` are
+    optional payload fields that both default to ``None``.
+    """
+
+    # Event discriminator: one of "text_delta", "tool_call" or "done".
+    type: Literal["text_delta", "tool_call", "done"]
+    # Presumably the text fragment carried by "text_delta" events -- confirm against producers.
+    text: Optional[str] = None
+    # Presumably the payload of "tool_call" events; its schema is not shown here.
+    tool_call: Optional[Dict[str, Any]] = None
+
+
+@dataclass
+class TTSChunk:
+    """TTS audio chunk.
+
+    Only ``audio`` is required; the format fields and flags below all
+    have defaults.
+    """
+    audio: bytes  # PCM audio data
+    sample_rate: int = 16000  # presumably in Hz -- TODO confirm unit
+    channels: int = 1  # channel count
+    bits_per_sample: int = 16  # sample width in bits
+    is_final: bool = False  # presumably marks the last chunk of a synthesis -- confirm with producers
+    text_offset: Optional[int] = None  # Character offset in original text
+
+
+class BaseASRService(ABC):
+    """Abstract interface for ASR (speech-to-text) providers.
+
+    Subclasses supply the connection lifecycle and the streaming
+    send/receive pair. ``transcribe`` is a non-streaming helper built
+    on top of those streaming methods.
+    """
+
+    def __init__(self, sample_rate: int = 16000, language: str = "en"):
+        """Store the audio settings and start in the DISCONNECTED state.
+
+        Args:
+            sample_rate: Stored as ``self.sample_rate``.
+            language: Stored as ``self.language``.
+        """
+        self.state = ServiceState.DISCONNECTED
+        self.language = language
+        self.sample_rate = sample_rate
+
+    @abstractmethod
+    async def connect(self) -> None:
+        """Open the connection to the ASR service."""
+
+    @abstractmethod
+    async def disconnect(self) -> None:
+        """Shut down the connection to the ASR service."""
+
+    @abstractmethod
+    async def send_audio(self, audio: bytes) -> None:
+        """Submit one chunk of audio for transcription.
+
+        Args:
+            audio: PCM audio data (16-bit, mono)
+        """
+
+    @abstractmethod
+    async def receive_transcripts(self) -> AsyncIterator[ASRResult]:
+        """Produce transcription results as they become available.
+
+        ``transcribe`` consumes the return value of this method with
+        ``async for``, so an override must return an async iterator
+        (for example by being written as an async generator).
+
+        Yields:
+            ASRResult objects, partial or final.
+        """
+
+    async def transcribe(self, audio: bytes) -> ASRResult:
+        """Transcribe a whole audio buffer in one call (non-streaming).
+
+        Sends the buffer through ``send_audio`` and then reads
+        ``receive_transcripts`` until the first final result appears.
+
+        Args:
+            audio: Complete PCM audio data
+
+        Returns:
+            The first result flagged ``is_final``; if the stream ends
+            without one, an empty-text result marked final.
+        """
+        await self.send_audio(audio)
+        async for candidate in self.receive_transcripts():
+            if not candidate.is_final:
+                # Partial results are skipped; only a final one ends the wait.
+                continue
+            return candidate
+        return ASRResult(text="", is_final=True)
+
+
+class BaseLLMService(ABC):
+    """Abstract interface for LLM (language model) providers.
+
+    Declares a complete-response call and a streaming call so that
+    implementations can serve real-time conversation.
+    """
+
+    def __init__(self, model: str = "gpt-4"):
+        """Store the model name and start in the DISCONNECTED state.
+
+        Args:
+            model: Stored as ``self.model``.
+        """
+        self.state = ServiceState.DISCONNECTED
+        self.model = model
+
+    @abstractmethod
+    async def connect(self) -> None:
+        """Set up the LLM service connection."""
+
+    @abstractmethod
+    async def disconnect(self) -> None:
+        """Tear down the LLM service connection."""
+
+    @abstractmethod
+    async def generate(
+        self,
+        messages: List[LLMMessage],
+        temperature: float = 0.7,
+        max_tokens: Optional[int] = None
+    ) -> str:
+        """Produce one complete response for the conversation.
+
+        Args:
+            messages: Conversation history
+            temperature: Sampling temperature
+            max_tokens: Maximum tokens to generate
+
+        Returns:
+            Complete response text
+        """
+
+    @abstractmethod
+    async def generate_stream(
+        self,
+        messages: List[LLMMessage],
+        temperature: float = 0.7,
+        max_tokens: Optional[int] = None
+    ) -> AsyncIterator[LLMStreamEvent]:
+        """Produce the response incrementally as stream events.
+
+        Args:
+            messages: Conversation history
+            temperature: Sampling temperature
+            max_tokens: Maximum tokens to generate
+
+        Yields:
+            Stream events (text delta/tool call/done)
+        """
+
+
+class BaseTTSService(ABC):
+    """Abstract interface for TTS (text-to-speech) providers.
+
+    Declares both a whole-buffer and a streaming synthesis call; the
+    streaming form exists for low-latency playback.
+    """
+
+    def __init__(
+        self,
+        voice: str = "default",
+        sample_rate: int = 16000,
+        speed: float = 1.0
+    ):
+        """Store the synthesis settings and start in the DISCONNECTED state.
+
+        Args:
+            voice: Stored as ``self.voice``.
+            sample_rate: Stored as ``self.sample_rate``.
+            speed: Stored as ``self.speed``.
+        """
+        self.state = ServiceState.DISCONNECTED
+        self.speed = speed
+        self.sample_rate = sample_rate
+        self.voice = voice
+
+    @abstractmethod
+    async def connect(self) -> None:
+        """Set up the TTS service connection."""
+
+    @abstractmethod
+    async def disconnect(self) -> None:
+        """Tear down the TTS service connection."""
+
+    @abstractmethod
+    async def synthesize(self, text: str) -> bytes:
+        """Synthesize the whole text in one call (non-streaming).
+
+        Args:
+            text: Text to synthesize
+
+        Returns:
+            Complete PCM audio data
+        """
+
+    @abstractmethod
+    async def synthesize_stream(self, text: str) -> AsyncIterator[TTSChunk]:
+        """Synthesize the text incrementally.
+
+        Args:
+            text: Text to synthesize
+
+        Yields:
+            TTSChunk objects as audio is generated
+        """
+
+    async def cancel(self) -> None:
+        """Cancel ongoing synthesis (for barge-in support).
+
+        The base implementation does nothing; it is not abstract, so
+        subclasses override it only when they can interrupt synthesis.
+        """
--- a/engine/providers/common/streaming_text.py
+++ b/engine/providers/common/streaming_text.py
@@ -0,0 +1,86 @@
+"""Shared text chunking helpers for streaming TTS."""
+
+from typing import Optional
+
+
+def is_non_sentence_period(text: str, idx: int) -> bool:
+    """Tell whether the '.' at ``text[idx]`` is NOT a sentence delimiter.
+
+    Two patterns are recognised:
+
+    * a period with a digit directly on both sides (``1.2``, ``v1.2.3``);
+    * a period directly after the word "no" (any letter case) whose
+      next non-whitespace character is a digit (``No.1``, ``No. 1``).
+
+    Returns ``False`` when ``idx`` is out of range or ``text[idx]`` is
+    not a period.
+    """
+    size = len(text)
+    if not (0 <= idx < size) or text[idx] != ".":
+        return False
+
+    # Decimal/version segment: digit on each side of the period.
+    if 0 < idx < size - 1 and text[idx - 1].isdigit() and text[idx + 1].isdigit():
+        return True
+
+    # Walk left over the run of letters that ends at the period.
+    word_start = idx
+    while word_start > 0 and text[word_start - 1].isalpha():
+        word_start -= 1
+    if text[word_start:idx].lower() != "no":
+        return False
+
+    # "No." counts as an abbreviation only when a digit follows the
+    # period, optionally after whitespace.
+    cursor = idx + 1
+    while cursor < size and text[cursor].isspace():
+        cursor += 1
+    return cursor < size and text[cursor].isdigit()
+
+
+def has_spoken_content(text: str) -> bool:
+    """Report whether ``text`` has at least one alphanumeric character.
+
+    Text made only of punctuation and/or whitespace, or empty text,
+    counts as having nothing pronounceable.
+    """
+    for char in text:
+        if char.isalnum():
+            return True
+    return False
+
+
+def extract_tts_sentence(
+    text_buffer: str,
+    *,
+    end_chars: frozenset[str],
+    trailing_chars: frozenset[str],
+    closers: frozenset[str],
+    min_split_spoken_chars: int = 0,
+    hold_trailing_at_buffer_end: bool = False,
+    force: bool = False,
+) -> Optional[tuple[str, str]]:
+    """Split the first complete TTS sentence off the front of a buffer.
+
+    Args:
+        text_buffer: Accumulated text to scan.
+        end_chars: Characters that terminate a sentence. A '.' that
+            ``is_non_sentence_period`` flags is skipped.
+        trailing_chars: Characters absorbed into the sentence when they
+            directly follow the terminator.
+        closers: Characters absorbed after the trailing characters.
+        min_split_spoken_chars: When positive, a candidate sentence with
+            fewer alphanumeric characters than this (but at least one)
+            is merged with the following text instead of being emitted,
+            as long as more text follows it.
+        hold_trailing_at_buffer_end: When true, a sentence that reaches
+            the very end of the buffer is withheld.
+        force: Disables both the hold-at-end and the minimum-length
+            rules. It does not create a split when no terminator exists.
+
+    Returns:
+        ``(sentence, remainder)`` with the sentence stripped of
+        surrounding whitespace, or ``None`` when nothing can be
+        extracted yet.
+    """
+    if not text_buffer:
+        return None
+
+    total = len(text_buffer)
+    scan_from = 0
+    while True:
+        # Find the next real sentence terminator at or after scan_from.
+        for pos in range(scan_from, total):
+            ch = text_buffer[pos]
+            if ch not in end_chars:
+                continue
+            if ch == "." and is_non_sentence_period(text_buffer, pos):
+                continue
+            break
+        else:
+            return None
+
+        # Absorb trailing punctuation first, then closing characters.
+        cut = pos + 1
+        for absorbable in (trailing_chars, closers):
+            while cut < total and text_buffer[cut] in absorbable:
+                cut += 1
+
+        reaches_end = cut >= total
+        if hold_trailing_at_buffer_end and not force and reaches_end:
+            return None
+
+        sentence = text_buffer[:cut].strip()
+
+        if not force and min_split_spoken_chars > 0 and not reaches_end:
+            spoken = sum(map(str.isalnum, sentence))
+            if 0 < spoken < min_split_spoken_chars:
+                # Too short to speak alone: keep scanning so the next
+                # terminator closes a longer, merged sentence.
+                scan_from = cut
+                continue
+
+        return sentence, text_buffer[cut:]