Init commit
This commit is contained in:
51
services/__init__.py
Normal file
51
services/__init__.py
Normal file
@@ -0,0 +1,51 @@
|
||||
"""AI Services package.
|
||||
|
||||
Provides ASR, LLM, TTS, and Realtime API services for voice conversation.
|
||||
"""
|
||||
|
||||
from services.base import (
|
||||
ServiceState,
|
||||
ASRResult,
|
||||
LLMMessage,
|
||||
TTSChunk,
|
||||
BaseASRService,
|
||||
BaseLLMService,
|
||||
BaseTTSService,
|
||||
)
|
||||
from services.llm import OpenAILLMService, MockLLMService
|
||||
from services.tts import EdgeTTSService, MockTTSService
|
||||
from services.asr import BufferedASRService, MockASRService
|
||||
from services.openai_compatible_asr import OpenAICompatibleASRService, SiliconFlowASRService
|
||||
from services.openai_compatible_tts import OpenAICompatibleTTSService, SiliconFlowTTSService
|
||||
from services.streaming_tts_adapter import StreamingTTSAdapter
|
||||
from services.realtime import RealtimeService, RealtimeConfig, RealtimePipeline
|
||||
|
||||
# Public API of the services package; star-imports and API docs rely on this list.
__all__ = [
    # Base classes
    "ServiceState",
    "ASRResult",
    "LLMMessage",
    "TTSChunk",
    "BaseASRService",
    "BaseLLMService",
    "BaseTTSService",
    # LLM
    "OpenAILLMService",
    "MockLLMService",
    # TTS
    "EdgeTTSService",
    "MockTTSService",
    # ASR
    "BufferedASRService",
    "MockASRService",
    "OpenAICompatibleASRService",
    "SiliconFlowASRService",
    # TTS (SiliconFlow)
    "OpenAICompatibleTTSService",
    "SiliconFlowTTSService",
    "StreamingTTSAdapter",
    # Realtime
    "RealtimeService",
    "RealtimeConfig",
    "RealtimePipeline",
]
|
||||
147
services/asr.py
Normal file
147
services/asr.py
Normal file
@@ -0,0 +1,147 @@
|
||||
"""ASR (Automatic Speech Recognition) Service implementations.
|
||||
|
||||
Provides speech-to-text capabilities with streaming support.
|
||||
"""
|
||||
|
||||
import os
|
||||
import asyncio
|
||||
import json
|
||||
from typing import AsyncIterator, Optional
|
||||
from loguru import logger
|
||||
|
||||
from services.base import BaseASRService, ASRResult, ServiceState
|
||||
|
||||
# Try to import websockets for streaming ASR
|
||||
try:
|
||||
import websockets
|
||||
WEBSOCKETS_AVAILABLE = True
|
||||
except ImportError:
|
||||
WEBSOCKETS_AVAILABLE = False
|
||||
|
||||
|
||||
class BufferedASRService(BaseASRService):
    """
    Buffered ASR service that accumulates audio and provides
    a simple text accumulator for use with EOU detection.

    This is a lightweight implementation that works with the
    existing VAD + EOU pattern without requiring external ASR.
    """

    def __init__(
        self,
        sample_rate: int = 16000,
        language: str = "en"
    ):
        """
        Initialize the buffered ASR service.

        Args:
            sample_rate: Expected PCM sample rate of incoming audio
            language: Language code hint (not used for recognition here)
        """
        super().__init__(sample_rate=sample_rate, language=language)

        self._audio_buffer: bytes = b""   # raw PCM accumulated via send_audio()
        self._current_text: str = ""      # latest transcript set via set_text()
        self._transcript_queue: asyncio.Queue[ASRResult] = asyncio.Queue()

    async def connect(self) -> None:
        """No connection needed for buffered ASR."""
        self.state = ServiceState.CONNECTED
        logger.info("Buffered ASR service connected")

    async def disconnect(self) -> None:
        """Clear buffers on disconnect."""
        self._audio_buffer = b""
        self._current_text = ""
        self.state = ServiceState.DISCONNECTED
        logger.info("Buffered ASR service disconnected")

    async def send_audio(self, audio: bytes) -> None:
        """Buffer audio for later processing."""
        self._audio_buffer += audio

    async def receive_transcripts(self) -> AsyncIterator[ASRResult]:
        """Yield transcription results as they are queued by set_text()."""
        while True:
            try:
                result = await asyncio.wait_for(
                    self._transcript_queue.get(),
                    timeout=0.1
                )
                yield result
            except asyncio.TimeoutError:
                # Short poll so task cancellation is observed promptly.
                continue
            except asyncio.CancelledError:
                break

    def set_text(self, text: str) -> None:
        """
        Set the current transcript text directly.

        This allows external integration (e.g., Whisper, other ASR)
        to provide transcripts.
        """
        self._current_text = text
        result = ASRResult(text=text, is_final=False)
        # put_nowait instead of asyncio.create_task(queue.put(...)):
        # the queue is unbounded so it never raises QueueFull, it works
        # even when no event loop is running, and it preserves enqueue
        # order (fire-and-forget tasks do not).
        self._transcript_queue.put_nowait(result)

    def get_and_clear_text(self) -> str:
        """Get accumulated text and clear both text and audio buffers."""
        text = self._current_text
        self._current_text = ""
        self._audio_buffer = b""
        return text

    def get_audio_buffer(self) -> bytes:
        """Get accumulated audio buffer."""
        return self._audio_buffer

    def clear_audio_buffer(self) -> None:
        """Clear audio buffer."""
        self._audio_buffer = b""
|
||||
|
||||
|
||||
class MockASRService(BaseASRService):
    """
    Mock ASR service for testing without actual recognition.

    Transcripts are produced on demand via trigger_transcript(),
    cycling through a fixed list of canned utterances.
    """

    def __init__(self, sample_rate: int = 16000, language: str = "en"):
        super().__init__(sample_rate=sample_rate, language=language)
        self._transcript_queue: asyncio.Queue[ASRResult] = asyncio.Queue()
        # Canned utterances, served round-robin by trigger_transcript().
        self._mock_texts = [
            "Hello, how are you?",
            "That's interesting.",
            "Tell me more about that.",
            "I understand.",
        ]
        self._text_index = 0

    async def connect(self) -> None:
        self.state = ServiceState.CONNECTED
        logger.info("Mock ASR service connected")

    async def disconnect(self) -> None:
        self.state = ServiceState.DISCONNECTED
        logger.info("Mock ASR service disconnected")

    async def send_audio(self, audio: bytes) -> None:
        """Mock audio processing - generates fake transcripts periodically."""
        pass

    def trigger_transcript(self) -> None:
        """Manually trigger a transcript (for testing)."""
        text = self._mock_texts[self._text_index % len(self._mock_texts)]
        self._text_index += 1

        result = ASRResult(text=text, is_final=True, confidence=0.95)
        # put_nowait instead of asyncio.create_task(queue.put(...)):
        # the queue is unbounded so it never raises QueueFull, it works
        # without a running event loop, and it preserves enqueue order.
        self._transcript_queue.put_nowait(result)

    async def receive_transcripts(self) -> AsyncIterator[ASRResult]:
        """Yield transcription results queued by trigger_transcript()."""
        while True:
            try:
                result = await asyncio.wait_for(
                    self._transcript_queue.get(),
                    timeout=0.1
                )
                yield result
            except asyncio.TimeoutError:
                # Short poll so task cancellation is observed promptly.
                continue
            except asyncio.CancelledError:
                break
|
||||
253
services/base.py
Normal file
253
services/base.py
Normal file
@@ -0,0 +1,253 @@
|
||||
"""Base classes for AI services.
|
||||
|
||||
Defines abstract interfaces for ASR, LLM, and TTS services,
|
||||
inspired by pipecat's service architecture and active-call's
|
||||
StreamEngine pattern.
|
||||
"""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass, field
|
||||
from typing import AsyncIterator, Optional, List, Dict, Any, Literal
|
||||
from enum import Enum
|
||||
|
||||
|
||||
class ServiceState(Enum):
    """Service connection state shared by ASR/LLM/TTS services."""
    DISCONNECTED = "disconnected"
    CONNECTING = "connecting"
    CONNECTED = "connected"
    ERROR = "error"
|
||||
|
||||
|
||||
@dataclass
class ASRResult:
    """Result of a speech-to-text transcription, partial or final."""
    text: str
    is_final: bool = False
    confidence: float = 1.0
    language: Optional[str] = None
    start_time: Optional[float] = None
    end_time: Optional[float] = None

    def __str__(self) -> str:
        # Prefix the transcript with its finality marker.
        if self.is_final:
            return f"[FINAL] {self.text}"
        return f"[PARTIAL] {self.text}"
|
||||
|
||||
|
||||
@dataclass
class LLMMessage:
    """One message of an LLM conversation history."""
    role: str  # "system", "user", "assistant", "function"
    content: str
    name: Optional[str] = None  # For function calls
    function_call: Optional[Dict[str, Any]] = None

    def to_dict(self) -> Dict[str, Any]:
        """Convert to an API-compatible dict, omitting empty optional fields."""
        payload: Dict[str, Any] = {"role": self.role, "content": self.content}
        for key, value in (("name", self.name), ("function_call", self.function_call)):
            if value:
                payload[key] = value
        return payload
|
||||
|
||||
|
||||
@dataclass
class LLMStreamEvent:
    """Structured LLM stream event."""

    # Discriminator: "text_delta" carries `text`, "tool_call" carries
    # `tool_call`, and "done" marks end of stream (see OpenAILLMService).
    type: Literal["text_delta", "tool_call", "done"]
    text: Optional[str] = None
    tool_call: Optional[Dict[str, Any]] = None
|
||||
|
||||
|
||||
@dataclass
class TTSChunk:
    """TTS audio chunk emitted by streaming synthesis."""
    audio: bytes  # PCM audio data
    sample_rate: int = 16000
    channels: int = 1
    bits_per_sample: int = 16
    is_final: bool = False  # presumably marks the last chunk of an utterance — confirm with producers
    text_offset: Optional[int] = None  # Character offset in original text
|
||||
|
||||
|
||||
class BaseASRService(ABC):
    """
    Abstract base class for ASR (Speech-to-Text) services.

    Supports both streaming and non-streaming transcription.
    """

    def __init__(self, sample_rate: int = 16000, language: str = "en"):
        self.sample_rate = sample_rate  # expected PCM sample rate of input audio
        self.language = language        # language code hint for recognition
        self.state = ServiceState.DISCONNECTED

    @abstractmethod
    async def connect(self) -> None:
        """Establish connection to ASR service."""
        pass

    @abstractmethod
    async def disconnect(self) -> None:
        """Close connection to ASR service."""
        pass

    @abstractmethod
    async def send_audio(self, audio: bytes) -> None:
        """
        Send audio chunk for transcription.

        Args:
            audio: PCM audio data (16-bit, mono)
        """
        pass

    @abstractmethod
    async def receive_transcripts(self) -> AsyncIterator[ASRResult]:
        """
        Receive transcription results.

        Yields:
            ASRResult objects as they become available
        """
        pass

    async def transcribe(self, audio: bytes) -> ASRResult:
        """
        Transcribe a complete audio buffer (non-streaming).

        Args:
            audio: Complete PCM audio data

        Returns:
            Final ASRResult (empty text if the stream ends without a final)
        """
        # Default implementation using streaming.
        # NOTE(review): this assumes receive_transcripts() eventually yields
        # a final result or exhausts; queue-polling implementations that loop
        # forever would block here — confirm per subclass.
        await self.send_audio(audio)
        async for result in self.receive_transcripts():
            if result.is_final:
                return result
        return ASRResult(text="", is_final=True)
|
||||
|
||||
|
||||
class BaseLLMService(ABC):
    """
    Abstract base class for LLM (Language Model) services.

    Supports streaming responses for real-time conversation.
    """

    def __init__(self, model: str = "gpt-4"):
        self.model = model  # model identifier passed to the provider
        self.state = ServiceState.DISCONNECTED

    @abstractmethod
    async def connect(self) -> None:
        """Initialize LLM service connection."""
        pass

    @abstractmethod
    async def disconnect(self) -> None:
        """Close LLM service connection."""
        pass

    @abstractmethod
    async def generate(
        self,
        messages: List[LLMMessage],
        temperature: float = 0.7,
        max_tokens: Optional[int] = None
    ) -> str:
        """
        Generate a complete response.

        Args:
            messages: Conversation history
            temperature: Sampling temperature
            max_tokens: Maximum tokens to generate

        Returns:
            Complete response text
        """
        pass

    @abstractmethod
    async def generate_stream(
        self,
        messages: List[LLMMessage],
        temperature: float = 0.7,
        max_tokens: Optional[int] = None
    ) -> AsyncIterator[LLMStreamEvent]:
        """
        Generate response in streaming mode.

        Args:
            messages: Conversation history
            temperature: Sampling temperature
            max_tokens: Maximum tokens to generate

        Yields:
            Stream events (text delta/tool call/done)
        """
        pass
|
||||
|
||||
|
||||
class BaseTTSService(ABC):
    """
    Abstract base class for TTS (Text-to-Speech) services.

    Supports streaming audio synthesis for low-latency playback.
    """

    def __init__(
        self,
        voice: str = "default",
        sample_rate: int = 16000,
        speed: float = 1.0
    ):
        self.voice = voice              # voice identifier for the provider
        self.sample_rate = sample_rate  # output PCM sample rate
        self.speed = speed              # playback speed multiplier
        self.state = ServiceState.DISCONNECTED

    @abstractmethod
    async def connect(self) -> None:
        """Initialize TTS service connection."""
        pass

    @abstractmethod
    async def disconnect(self) -> None:
        """Close TTS service connection."""
        pass

    @abstractmethod
    async def synthesize(self, text: str) -> bytes:
        """
        Synthesize complete audio for text (non-streaming).

        Args:
            text: Text to synthesize

        Returns:
            Complete PCM audio data
        """
        pass

    @abstractmethod
    async def synthesize_stream(self, text: str) -> AsyncIterator[TTSChunk]:
        """
        Synthesize audio in streaming mode.

        Args:
            text: Text to synthesize

        Yields:
            TTSChunk objects as audio is generated
        """
        pass

    async def cancel(self) -> None:
        """Cancel ongoing synthesis (for barge-in support).

        Default implementation is a no-op; subclasses override as needed.
        """
        pass
|
||||
443
services/llm.py
Normal file
443
services/llm.py
Normal file
@@ -0,0 +1,443 @@
|
||||
"""LLM (Large Language Model) Service implementations.
|
||||
|
||||
Provides OpenAI-compatible LLM integration with streaming support
|
||||
for real-time voice conversation.
|
||||
"""
|
||||
|
||||
import os
|
||||
import asyncio
|
||||
import uuid
|
||||
from typing import AsyncIterator, Optional, List, Dict, Any
|
||||
from loguru import logger
|
||||
|
||||
from app.backend_client import search_knowledge_context
|
||||
from services.base import BaseLLMService, LLMMessage, LLMStreamEvent, ServiceState
|
||||
|
||||
# Try to import openai
|
||||
try:
|
||||
from openai import AsyncOpenAI
|
||||
OPENAI_AVAILABLE = True
|
||||
except ImportError:
|
||||
OPENAI_AVAILABLE = False
|
||||
logger.warning("openai package not available - LLM service will be disabled")
|
||||
|
||||
|
||||
class OpenAILLMService(BaseLLMService):
    """
    OpenAI-compatible LLM service.

    Supports streaming responses for low-latency voice conversation.
    Works with OpenAI API, Azure OpenAI, and compatible APIs.

    Optionally injects retrieved knowledge-base snippets (RAG) as an extra
    system message before each generation (see _with_knowledge_context).
    """

    def __init__(
        self,
        model: str = "gpt-4o-mini",
        api_key: Optional[str] = None,
        base_url: Optional[str] = None,
        system_prompt: Optional[str] = None,
        knowledge_config: Optional[Dict[str, Any]] = None,
    ):
        """
        Initialize OpenAI LLM service.

        Args:
            model: Model name (e.g., "gpt-4o-mini", "gpt-4o")
            api_key: OpenAI API key (defaults to OPENAI_API_KEY env var)
            base_url: Custom API base URL (for Azure or compatible APIs)
            system_prompt: Default system prompt for conversations
            knowledge_config: Optional RAG settings dict; recognized keys
                include "enabled", "kbId"/"knowledgeBaseId"/"knowledge_base_id",
                and "nResults"
        """
        super().__init__(model=model)

        self.api_key = api_key or os.getenv("OPENAI_API_KEY")
        self.base_url = base_url or os.getenv("OPENAI_API_URL")
        self.system_prompt = system_prompt or (
            "You are a helpful, friendly voice assistant. "
            "Keep your responses concise and conversational. "
            "Respond naturally as if having a phone conversation."
        )

        self.client: Optional[AsyncOpenAI] = None
        # Set by cancel() to stop an in-flight generate_stream().
        self._cancel_event = asyncio.Event()
        self._knowledge_config: Dict[str, Any] = knowledge_config or {}
        self._tool_schemas: List[Dict[str, Any]] = []

    # Class-level RAG limits: default/maximum retrieved chunks and the cap
    # on total characters injected into the prompt.
    _RAG_DEFAULT_RESULTS = 5
    _RAG_MAX_RESULTS = 8
    _RAG_MAX_CONTEXT_CHARS = 4000

    async def connect(self) -> None:
        """Initialize OpenAI client.

        Raises:
            RuntimeError: if the openai package is not installed
            ValueError: if no API key is available
        """
        if not OPENAI_AVAILABLE:
            raise RuntimeError("openai package not installed")

        if not self.api_key:
            raise ValueError("OpenAI API key not provided")

        self.client = AsyncOpenAI(
            api_key=self.api_key,
            base_url=self.base_url
        )
        self.state = ServiceState.CONNECTED
        logger.info(f"OpenAI LLM service connected: model={self.model}")

    async def disconnect(self) -> None:
        """Close OpenAI client."""
        if self.client:
            await self.client.close()
            self.client = None
        self.state = ServiceState.DISCONNECTED
        logger.info("OpenAI LLM service disconnected")

    def _prepare_messages(self, messages: List[LLMMessage]) -> List[Dict[str, Any]]:
        """Prepare API-ready message dicts, prepending the default system prompt
        when the history does not already contain a system message."""
        result = []

        # Add system prompt if not already present
        has_system = any(m.role == "system" for m in messages)
        if not has_system and self.system_prompt:
            result.append({"role": "system", "content": self.system_prompt})

        # Add all messages
        for msg in messages:
            result.append(msg.to_dict())

        return result

    def set_knowledge_config(self, config: Optional[Dict[str, Any]]) -> None:
        """Update runtime knowledge retrieval config."""
        self._knowledge_config = config or {}

    def set_tool_schemas(self, schemas: Optional[List[Dict[str, Any]]]) -> None:
        """Update runtime tool schemas.

        Accepts either full OpenAI tool dicts ({"type": "function",
        "function": {...}}) or bare function specs ({"name", "description",
        "parameters"}), normalizing the latter to the OpenAI shape.
        Anything else is silently ignored.
        """
        self._tool_schemas = []
        if not isinstance(schemas, list):
            return
        for item in schemas:
            if not isinstance(item, dict):
                continue
            fn = item.get("function")
            if isinstance(fn, dict) and fn.get("name"):
                # Already in OpenAI tool format.
                self._tool_schemas.append(item)
            elif item.get("name"):
                # Bare function spec: wrap it in the OpenAI tool envelope.
                self._tool_schemas.append(
                    {
                        "type": "function",
                        "function": {
                            "name": str(item.get("name")),
                            "description": str(item.get("description") or ""),
                            "parameters": item.get("parameters") or {"type": "object", "properties": {}},
                        },
                    }
                )

    @staticmethod
    def _coerce_int(value: Any, default: int) -> int:
        """Best-effort int() conversion, falling back to *default*."""
        try:
            return int(value)
        except (TypeError, ValueError):
            return default

    def _resolve_kb_id(self) -> Optional[str]:
        """Return the configured knowledge-base id, or None if unset.

        Accepts any of the key spellings "kbId", "knowledgeBaseId",
        "knowledge_base_id".
        """
        cfg = self._knowledge_config if isinstance(self._knowledge_config, dict) else {}
        kb_id = str(
            cfg.get("kbId")
            or cfg.get("knowledgeBaseId")
            or cfg.get("knowledge_base_id")
            or ""
        ).strip()
        return kb_id or None

    def _build_knowledge_prompt(self, results: List[Dict[str, Any]]) -> Optional[str]:
        """Format retrieved snippets into a system-prompt string.

        Returns None when there is nothing usable. Total snippet text is
        capped at _RAG_MAX_CONTEXT_CHARS; each entry is tagged with its
        ordinal, source (doc/chunk) and retrieval distance when available.
        """
        if not results:
            return None

        lines = [
            "You have retrieved the following knowledge base snippets.",
            "Use them only when relevant to the latest user request.",
            "If snippets are insufficient, say you are not sure instead of guessing.",
            "",
        ]

        used_chars = 0
        used_count = 0
        for item in results:
            content = str(item.get("content") or "").strip()
            if not content:
                continue
            if used_chars >= self._RAG_MAX_CONTEXT_CHARS:
                break

            metadata = item.get("metadata") if isinstance(item.get("metadata"), dict) else {}
            doc_id = metadata.get("document_id")
            chunk_index = metadata.get("chunk_index")
            distance = item.get("distance")

            source_parts = []
            if doc_id:
                source_parts.append(f"doc={doc_id}")
            if chunk_index is not None:
                source_parts.append(f"chunk={chunk_index}")
            source = f" ({', '.join(source_parts)})" if source_parts else ""

            distance_text = ""
            try:
                if distance is not None:
                    distance_text = f", distance={float(distance):.4f}"
            except (TypeError, ValueError):
                # Non-numeric distance: omit it rather than fail.
                distance_text = ""

            # Truncate the snippet to the remaining character budget.
            remaining = self._RAG_MAX_CONTEXT_CHARS - used_chars
            snippet = content[:remaining].strip()
            if not snippet:
                continue

            used_count += 1
            lines.append(f"[{used_count}{source}{distance_text}] {snippet}")
            used_chars += len(snippet)

        if used_count == 0:
            return None

        return "\n".join(lines)

    async def _with_knowledge_context(self, messages: List[LLMMessage]) -> List[LLMMessage]:
        """Return *messages* with a RAG system message inserted when enabled.

        Skips retrieval (returning the input unchanged) when RAG is disabled,
        no kb id is configured, there is no user message to query with, or
        retrieval yields nothing usable. The RAG message is inserted after an
        existing leading system message, otherwise prepended.
        """
        cfg = self._knowledge_config if isinstance(self._knowledge_config, dict) else {}
        enabled = cfg.get("enabled", True)
        if isinstance(enabled, str):
            # Treat common "falsey" strings as disabled.
            enabled = enabled.strip().lower() not in {"false", "0", "off", "no"}
        if not enabled:
            return messages

        kb_id = self._resolve_kb_id()
        if not kb_id:
            return messages

        # Query with the most recent non-empty user message.
        latest_user = ""
        for msg in reversed(messages):
            if msg.role == "user":
                latest_user = (msg.content or "").strip()
                break
        if not latest_user:
            return messages

        n_results = self._coerce_int(cfg.get("nResults"), self._RAG_DEFAULT_RESULTS)
        n_results = max(1, min(n_results, self._RAG_MAX_RESULTS))

        results = await search_knowledge_context(
            kb_id=kb_id,
            query=latest_user,
            n_results=n_results,
        )
        prompt = self._build_knowledge_prompt(results)
        if not prompt:
            return messages

        logger.debug(f"RAG context injected (kb_id={kb_id}, chunks={len(results)})")
        rag_system = LLMMessage(role="system", content=prompt)
        if messages and messages[0].role == "system":
            return [messages[0], rag_system, *messages[1:]]
        return [rag_system, *messages]

    async def generate(
        self,
        messages: List[LLMMessage],
        temperature: float = 0.7,
        max_tokens: Optional[int] = None
    ) -> str:
        """
        Generate a complete response.

        Args:
            messages: Conversation history
            temperature: Sampling temperature
            max_tokens: Maximum tokens to generate

        Returns:
            Complete response text

        Raises:
            RuntimeError: if connect() has not been called
        """
        if not self.client:
            raise RuntimeError("LLM service not connected")

        rag_messages = await self._with_knowledge_context(messages)
        prepared = self._prepare_messages(rag_messages)

        try:
            response = await self.client.chat.completions.create(
                model=self.model,
                messages=prepared,
                temperature=temperature,
                max_tokens=max_tokens
            )

            content = response.choices[0].message.content or ""
            logger.debug(f"LLM response: {content[:100]}...")
            return content

        except Exception as e:
            logger.error(f"LLM generation error: {e}")
            raise

    async def generate_stream(
        self,
        messages: List[LLMMessage],
        temperature: float = 0.7,
        max_tokens: Optional[int] = None
    ) -> AsyncIterator[LLMStreamEvent]:
        """
        Generate response in streaming mode.

        Args:
            messages: Conversation history
            temperature: Sampling temperature
            max_tokens: Maximum tokens to generate

        Yields:
            Structured stream events: "text_delta" for each content chunk,
            "tool_call" for each completed tool invocation, and a final
            "done" when the provider reports a finish reason.

        Raises:
            RuntimeError: if connect() has not been called
        """
        if not self.client:
            raise RuntimeError("LLM service not connected")

        rag_messages = await self._with_knowledge_context(messages)
        prepared = self._prepare_messages(rag_messages)
        self._cancel_event.clear()
        # Tool-call fragments arrive incrementally, keyed by index; each
        # entry accumulates id/name/arguments across chunks.
        tool_accumulator: Dict[int, Dict[str, str]] = {}
        openai_tools = self._tool_schemas or None

        try:
            create_args: Dict[str, Any] = dict(
                model=self.model,
                messages=prepared,
                temperature=temperature,
                max_tokens=max_tokens,
                stream=True,
            )
            if openai_tools:
                create_args["tools"] = openai_tools
                create_args["tool_choice"] = "auto"
            stream = await self.client.chat.completions.create(**create_args)

            async for chunk in stream:
                # Check for cancellation (set via cancel(), e.g. on barge-in)
                if self._cancel_event.is_set():
                    logger.info("LLM stream cancelled")
                    break

                if not chunk.choices:
                    continue

                choice = chunk.choices[0]
                delta = getattr(choice, "delta", None)
                if delta and getattr(delta, "content", None):
                    content = delta.content
                    yield LLMStreamEvent(type="text_delta", text=content)

                # OpenAI streams function calls via incremental tool_calls deltas.
                tool_calls = getattr(delta, "tool_calls", None) if delta else None
                if tool_calls:
                    for tc in tool_calls:
                        index = getattr(tc, "index", 0) or 0
                        item = tool_accumulator.setdefault(
                            int(index),
                            {"id": "", "name": "", "arguments": ""},
                        )
                        tc_id = getattr(tc, "id", None)
                        if tc_id:
                            item["id"] = str(tc_id)
                        fn = getattr(tc, "function", None)
                        if fn:
                            fn_name = getattr(fn, "name", None)
                            if fn_name:
                                item["name"] = str(fn_name)
                            fn_args = getattr(fn, "arguments", None)
                            if fn_args:
                                # Arguments arrive as JSON text fragments; concatenate.
                                item["arguments"] += str(fn_args)

                finish_reason = getattr(choice, "finish_reason", None)
                if finish_reason == "tool_calls" and tool_accumulator:
                    # Emit accumulated tool calls in index order, then finish.
                    for _, payload in sorted(tool_accumulator.items(), key=lambda row: row[0]):
                        call_name = payload.get("name", "").strip()
                        if not call_name:
                            continue
                        # Synthesize an id if the provider did not supply one.
                        call_id = payload.get("id", "").strip() or f"call_{uuid.uuid4().hex[:10]}"
                        yield LLMStreamEvent(
                            type="tool_call",
                            tool_call={
                                "id": call_id,
                                "type": "function",
                                "function": {
                                    "name": call_name,
                                    "arguments": payload.get("arguments", "") or "{}",
                                },
                            },
                        )
                    yield LLMStreamEvent(type="done")
                    return

                if finish_reason in {"stop", "length", "content_filter"}:
                    yield LLMStreamEvent(type="done")
                    return

        except asyncio.CancelledError:
            logger.info("LLM stream cancelled via asyncio")
            raise
        except Exception as e:
            logger.error(f"LLM streaming error: {e}")
            raise

    def cancel(self) -> None:
        """Cancel ongoing generation (checked per-chunk in generate_stream)."""
        self._cancel_event.set()
|
||||
|
||||
|
||||
class MockLLMService(BaseLLMService):
    """
    Mock LLM service that serves canned replies, for testing without API calls.
    """

    def __init__(self, response_delay: float = 0.5):
        """
        Args:
            response_delay: Artificial latency (seconds) before each reply
        """
        super().__init__(model="mock")
        self.response_delay = response_delay
        # Canned replies, served round-robin across calls.
        self.responses = [
            "Hello! How can I help you today?",
            "That's an interesting question. Let me think about it.",
            "I understand. Is there anything else you'd like to know?",
            "Great! I'm here if you need anything else.",
        ]
        self._response_index = 0

    async def connect(self) -> None:
        # Nothing to set up; just flip the state flag.
        self.state = ServiceState.CONNECTED
        logger.info("Mock LLM service connected")

    async def disconnect(self) -> None:
        self.state = ServiceState.DISCONNECTED
        logger.info("Mock LLM service disconnected")

    async def generate(
        self,
        messages: List[LLMMessage],
        temperature: float = 0.7,
        max_tokens: Optional[int] = None
    ) -> str:
        """Return the next canned reply after the configured delay."""
        await asyncio.sleep(self.response_delay)
        index = self._response_index
        self._response_index = index + 1
        return self.responses[index % len(self.responses)]

    async def generate_stream(
        self,
        messages: List[LLMMessage],
        temperature: float = 0.7,
        max_tokens: Optional[int] = None
    ) -> AsyncIterator[LLMStreamEvent]:
        """Stream the canned reply word by word, then emit "done"."""
        reply = await self.generate(messages, temperature, max_tokens)

        # Emit a separating space before every word except the first,
        # so the concatenated deltas reproduce the reply exactly.
        emitted_any = False
        for word in reply.split():
            if emitted_any:
                yield LLMStreamEvent(type="text_delta", text=" ")
            yield LLMStreamEvent(type="text_delta", text=word)
            emitted_any = True
            await asyncio.sleep(0.05)  # Simulate streaming delay
        yield LLMStreamEvent(type="done")
|
||||
321
services/openai_compatible_asr.py
Normal file
321
services/openai_compatible_asr.py
Normal file
@@ -0,0 +1,321 @@
|
||||
"""OpenAI-compatible ASR (Automatic Speech Recognition) Service.
|
||||
|
||||
Uses the SiliconFlow API for speech-to-text transcription.
|
||||
API: https://docs.siliconflow.cn/cn/api-reference/audio/create-audio-transcriptions
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import io
|
||||
import wave
|
||||
from typing import AsyncIterator, Optional, Callable, Awaitable
|
||||
from loguru import logger
|
||||
|
||||
try:
|
||||
import aiohttp
|
||||
AIOHTTP_AVAILABLE = True
|
||||
except ImportError:
|
||||
AIOHTTP_AVAILABLE = False
|
||||
logger.warning("aiohttp not available - OpenAICompatibleASRService will not work")
|
||||
|
||||
from services.base import BaseASRService, ASRResult, ServiceState
|
||||
|
||||
|
||||
class OpenAICompatibleASRService(BaseASRService):
|
||||
"""
|
||||
OpenAI-compatible ASR service for speech-to-text transcription.
|
||||
|
||||
Features:
|
||||
- Buffers incoming audio chunks
|
||||
- Provides interim transcriptions periodically (for streaming to client)
|
||||
- Final transcription on EOU
|
||||
|
||||
API Details:
|
||||
- Endpoint: POST https://api.siliconflow.cn/v1/audio/transcriptions
|
||||
- Models: FunAudioLLM/SenseVoiceSmall (default), TeleAI/TeleSpeechASR
|
||||
- Input: Audio file (multipart/form-data)
|
||||
- Output: {"text": "transcribed text"}
|
||||
"""
|
||||
|
||||
# Supported models
|
||||
MODELS = {
|
||||
"sensevoice": "FunAudioLLM/SenseVoiceSmall",
|
||||
"telespeech": "TeleAI/TeleSpeechASR",
|
||||
}
|
||||
|
||||
API_URL = "https://api.siliconflow.cn/v1/audio/transcriptions"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
api_key: str,
|
||||
model: str = "FunAudioLLM/SenseVoiceSmall",
|
||||
sample_rate: int = 16000,
|
||||
language: str = "auto",
|
||||
interim_interval_ms: int = 500, # How often to send interim results
|
||||
min_audio_for_interim_ms: int = 300, # Min audio before first interim
|
||||
on_transcript: Optional[Callable[[str, bool], Awaitable[None]]] = None
|
||||
):
|
||||
"""
|
||||
Initialize OpenAI-compatible ASR service.
|
||||
|
||||
Args:
|
||||
api_key: Provider API key
|
||||
model: ASR model name or alias
|
||||
sample_rate: Audio sample rate (16000 recommended)
|
||||
language: Language code (auto for automatic detection)
|
||||
interim_interval_ms: How often to generate interim transcriptions
|
||||
min_audio_for_interim_ms: Minimum audio duration before first interim
|
||||
on_transcript: Callback for transcription results (text, is_final)
|
||||
"""
|
||||
super().__init__(sample_rate=sample_rate, language=language)
|
||||
|
||||
if not AIOHTTP_AVAILABLE:
|
||||
raise RuntimeError("aiohttp is required for OpenAICompatibleASRService")
|
||||
|
||||
self.api_key = api_key
|
||||
self.model = self.MODELS.get(model.lower(), model)
|
||||
self.interim_interval_ms = interim_interval_ms
|
||||
self.min_audio_for_interim_ms = min_audio_for_interim_ms
|
||||
self.on_transcript = on_transcript
|
||||
|
||||
# Session
|
||||
self._session: Optional[aiohttp.ClientSession] = None
|
||||
|
||||
# Audio buffer
|
||||
self._audio_buffer: bytes = b""
|
||||
self._current_text: str = ""
|
||||
self._last_interim_time: float = 0
|
||||
|
||||
# Transcript queue for async iteration
|
||||
self._transcript_queue: asyncio.Queue[ASRResult] = asyncio.Queue()
|
||||
|
||||
# Background task for interim results
|
||||
self._interim_task: Optional[asyncio.Task] = None
|
||||
self._running = False
|
||||
|
||||
logger.info(f"OpenAICompatibleASRService initialized with model: {self.model}")
|
||||
|
||||
async def connect(self) -> None:
    """Open the HTTP session used for transcription requests."""
    auth_headers = {"Authorization": f"Bearer {self.api_key}"}
    self._session = aiohttp.ClientSession(headers=auth_headers)
    self._running = True
    self.state = ServiceState.CONNECTED
    logger.info("OpenAICompatibleASRService connected")
async def disconnect(self) -> None:
    """Stop background work, close the HTTP session, and reset buffers."""
    self._running = False

    # Tear down the interim-transcription task first so it cannot use the
    # session while we are closing it.
    task = self._interim_task
    if task:
        task.cancel()
        try:
            await task
        except asyncio.CancelledError:
            pass
        self._interim_task = None

    session = self._session
    if session:
        await session.close()
        self._session = None

    self._audio_buffer = b""
    self._current_text = ""
    self.state = ServiceState.DISCONNECTED
    logger.info("OpenAICompatibleASRService disconnected")
async def send_audio(self, audio: bytes) -> None:
    """
    Append incoming audio to the internal buffer.

    Args:
        audio: PCM audio data (16-bit, mono)
    """
    self._audio_buffer = self._audio_buffer + audio
async def transcribe_buffer(self, is_final: bool = False) -> Optional[str]:
    """
    Transcribe the currently buffered audio via the HTTP API.

    The PCM buffer is wrapped in an in-memory WAV container and uploaded
    as multipart form data. On success the text is cached, the optional
    ``on_transcript`` callback is invoked, and an ``ASRResult`` is queued
    for ``receive_transcripts()`` consumers.

    Args:
        is_final: Whether this is the final transcription for the turn

    Returns:
        Transcribed text, or None when there is not enough audio, the
        session is not connected, or the request fails
    """
    if not self._session:
        logger.warning("ASR session not connected")
        return None

    # Consistency fix: compute duration via the shared helper instead of
    # duplicating the PCM16 byte math inline.
    audio_duration_ms = self.get_audio_duration_ms()

    # Interim requests wait until enough audio has accumulated.
    if not is_final and audio_duration_ms < self.min_audio_for_interim_ms:
        return None

    if audio_duration_ms < 100:  # Less than 100ms - too short
        return None

    try:
        # Convert PCM to WAV in memory (16-bit mono at self.sample_rate).
        wav_buffer = io.BytesIO()
        with wave.open(wav_buffer, 'wb') as wav_file:
            wav_file.setnchannels(1)
            wav_file.setsampwidth(2)  # 16-bit
            wav_file.setframerate(self.sample_rate)
            wav_file.writeframes(self._audio_buffer)

        wav_buffer.seek(0)
        wav_data = wav_buffer.read()

        # Send to API as multipart form data (OpenAI audio API shape).
        form_data = aiohttp.FormData()
        form_data.add_field(
            'file',
            wav_data,
            filename='audio.wav',
            content_type='audio/wav'
        )
        form_data.add_field('model', self.model)

        async with self._session.post(self.API_URL, data=form_data) as response:
            if response.status == 200:
                result = await response.json()
                text = result.get("text", "").strip()

                if text:
                    self._current_text = text

                    # Notify via callback
                    if self.on_transcript:
                        await self.on_transcript(text, is_final)

                    # Queue result for receive_transcripts()
                    await self._transcript_queue.put(
                        ASRResult(text=text, is_final=is_final)
                    )

                    logger.debug(f"ASR {'final' if is_final else 'interim'}: {text[:50]}...")
                return text
            else:
                error_text = await response.text()
                logger.error(f"ASR API error {response.status}: {error_text}")
                return None

    except Exception as e:
        logger.error(f"ASR transcription error: {e}")
        return None
async def get_final_transcription(self) -> str:
    """
    Run a final transcription over the full buffer, then reset state.

    Intended to be called once end-of-utterance is detected.

    Returns:
        The final transcribed text (falls back to the latest interim
        text when the final request yields nothing).
    """
    final_text = await self.transcribe_buffer(is_final=True)

    # Prefer the fresh final result; otherwise reuse the last interim text.
    outcome = final_text or self._current_text
    self._audio_buffer = b""
    self._current_text = ""

    return outcome
def get_and_clear_text(self) -> str:
    """
    Return the accumulated text and reset both text and audio buffers.

    Compatible with the BufferedASRService interface.
    """
    pending = self._current_text
    self._current_text = ""
    self._audio_buffer = b""
    return pending
def get_audio_buffer(self) -> bytes:
    """Return the raw buffered PCM audio without clearing it."""
    return self._audio_buffer
def get_audio_duration_ms(self) -> float:
    """Return the buffered audio duration in milliseconds.

    Assumes 16-bit (2-byte) mono samples at ``self.sample_rate``.
    """
    bytes_per_second = self.sample_rate * 2
    return len(self._audio_buffer) / bytes_per_second * 1000
def clear_buffer(self) -> None:
    """Drop any buffered audio and pending interim text."""
    self._current_text = ""
    self._audio_buffer = b""
async def receive_transcripts(self) -> AsyncIterator[ASRResult]:
    """
    Yield transcription results as they are produced.

    Polls the internal queue with a short timeout so the generator can
    notice ``self._running`` going False and terminate promptly.

    Yields:
        ASRResult with text and is_final flag
    """
    while self._running:
        try:
            item = await asyncio.wait_for(
                self._transcript_queue.get(),
                timeout=0.1
            )
        except asyncio.TimeoutError:
            continue
        except asyncio.CancelledError:
            break
        yield item
async def start_interim_transcription(self) -> None:
    """
    Launch the background interim-transcription loop (idempotent).

    Periodically transcribes buffered audio so the user gets real-time
    feedback; does nothing if a previous loop is still running.
    """
    existing = self._interim_task
    if existing is not None and not existing.done():
        return

    self._interim_task = asyncio.create_task(self._interim_loop())
async def stop_interim_transcription(self) -> None:
    """Cancel the interim-transcription loop and wait for it to finish."""
    task = self._interim_task
    if not task:
        return
    task.cancel()
    try:
        await task
    except asyncio.CancelledError:
        pass
    self._interim_task = None
async def _interim_loop(self) -> None:
    """Background loop producing interim transcriptions at a fixed cadence."""
    import time

    while self._running:
        try:
            await asyncio.sleep(self.interim_interval_ms / 1000)

            # Throttle: skip if an interim ran too recently.
            now = time.time()
            elapsed_ms = (now - self._last_interim_time) * 1000
            if elapsed_ms < self.interim_interval_ms:
                continue

            # Only transcribe once enough audio has accumulated.
            if self.get_audio_duration_ms() >= self.min_audio_for_interim_ms:
                await self.transcribe_buffer(is_final=False)
                self._last_interim_time = now

        except asyncio.CancelledError:
            break
        except Exception as e:
            logger.error(f"Interim transcription error: {e}")
# Backward-compatible alias: older code imported the provider-specific name.
SiliconFlowASRService = OpenAICompatibleASRService
324
services/openai_compatible_tts.py
Normal file
324
services/openai_compatible_tts.py
Normal file
@@ -0,0 +1,324 @@
|
||||
"""OpenAI-compatible TTS Service with streaming support.
|
||||
|
||||
Uses SiliconFlow's CosyVoice2 or MOSS-TTSD models for low-latency
|
||||
text-to-speech synthesis with streaming.
|
||||
|
||||
API Docs: https://docs.siliconflow.cn/cn/api-reference/audio/create-speech
|
||||
"""
|
||||
|
||||
import os
|
||||
import asyncio
|
||||
import aiohttp
|
||||
from typing import AsyncIterator, Optional
|
||||
from loguru import logger
|
||||
|
||||
from services.base import BaseTTSService, TTSChunk, ServiceState
|
||||
from services.streaming_tts_adapter import StreamingTTSAdapter # backward-compatible re-export
|
||||
|
||||
|
||||
class OpenAICompatibleTTSService(BaseTTSService):
    """
    OpenAI-compatible TTS service with streaming support.

    Supports CosyVoice2-0.5B and MOSS-TTSD-v0.5 models.
    """

    # Available voices: short names mapped to fully-qualified "model:voice" ids.
    VOICES = {
        "alex": "FunAudioLLM/CosyVoice2-0.5B:alex",
        "anna": "FunAudioLLM/CosyVoice2-0.5B:anna",
        "bella": "FunAudioLLM/CosyVoice2-0.5B:bella",
        "benjamin": "FunAudioLLM/CosyVoice2-0.5B:benjamin",
        "charles": "FunAudioLLM/CosyVoice2-0.5B:charles",
        "claire": "FunAudioLLM/CosyVoice2-0.5B:claire",
        "david": "FunAudioLLM/CosyVoice2-0.5B:david",
        "diana": "FunAudioLLM/CosyVoice2-0.5B:diana",
    }

    def __init__(
        self,
        api_key: Optional[str] = None,
        voice: str = "anna",
        model: str = "FunAudioLLM/CosyVoice2-0.5B",
        sample_rate: int = 16000,
        speed: float = 1.0
    ):
        """
        Initialize OpenAI-compatible TTS service.

        Args:
            api_key: Provider API key (defaults to SILICONFLOW_API_KEY env var)
            voice: Voice name (alex, anna, bella, benjamin, charles, claire, david, diana)
            model: Model name
            sample_rate: Output sample rate (8000, 16000, 24000, 32000, 44100)
            speed: Speech speed (0.25 to 4.0)
        """
        # Resolve voice name (case-insensitive), and normalize "model:VoiceId" suffix.
        resolved_voice = (voice or "").strip()
        voice_lookup = resolved_voice.lower()
        if voice_lookup in self.VOICES:
            # Plain short name ("anna") -> fully-qualified id from the table.
            full_voice = self.VOICES[voice_lookup]
        elif ":" in resolved_voice:
            # Already "model:voice" form; lowercase the voice part if it is known.
            model_part, voice_part = resolved_voice.split(":", 1)
            normalized_voice_part = voice_part.strip().lower()
            if normalized_voice_part in self.VOICES:
                full_voice = f"{(model_part or model).strip()}:{normalized_voice_part}"
            else:
                # Unknown voice id: pass through untouched.
                full_voice = resolved_voice
        else:
            # Unknown bare name: pass through and let the API validate it.
            full_voice = resolved_voice

        super().__init__(voice=full_voice, sample_rate=sample_rate, speed=speed)

        self.api_key = api_key or os.getenv("SILICONFLOW_API_KEY")
        self.model = model
        self.api_url = "https://api.siliconflow.cn/v1/audio/speech"

        self._session: Optional[aiohttp.ClientSession] = None
        # Set by cancel() to abort an in-flight synthesize_stream().
        self._cancel_event = asyncio.Event()

    async def connect(self) -> None:
        """Initialize HTTP session.

        Raises:
            ValueError: If no API key was provided or found in the environment.
        """
        if not self.api_key:
            raise ValueError("SiliconFlow API key not provided. Set SILICONFLOW_API_KEY env var.")

        self._session = aiohttp.ClientSession(
            headers={
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json"
            }
        )
        self.state = ServiceState.CONNECTED
        logger.info(f"SiliconFlow TTS service ready: voice={self.voice}, model={self.model}")

    async def disconnect(self) -> None:
        """Close HTTP session."""
        if self._session:
            await self._session.close()
            self._session = None
        self.state = ServiceState.DISCONNECTED
        logger.info("SiliconFlow TTS service disconnected")

    async def synthesize(self, text: str) -> bytes:
        """Synthesize complete audio for text.

        Convenience wrapper that drains synthesize_stream() into one buffer.
        """
        audio_data = b""
        async for chunk in self.synthesize_stream(text):
            audio_data += chunk.audio
        return audio_data

    async def synthesize_stream(self, text: str) -> AsyncIterator[TTSChunk]:
        """
        Synthesize audio in streaming mode.

        Re-chunks the network stream into fixed-size PCM pieces (100ms of
        16-bit mono audio each) and tags the last chunk with is_final=True.

        Args:
            text: Text to synthesize

        Yields:
            TTSChunk objects with PCM audio
        """
        if not self._session:
            raise RuntimeError("TTS service not connected")

        if not text.strip():
            return

        self._cancel_event.clear()

        payload = {
            "model": self.model,
            "input": text,
            "voice": self.voice,
            "response_format": "pcm",
            "sample_rate": self.sample_rate,
            "stream": True,
            "speed": self.speed
        }

        try:
            async with self._session.post(self.api_url, json=payload) as response:
                if response.status != 200:
                    error_text = await response.text()
                    logger.error(f"SiliconFlow TTS error: {response.status} - {error_text}")
                    return

                # Stream audio chunks
                chunk_size = self.sample_rate * 2 // 10  # 100ms chunks
                buffer = b""
                pending_chunk = None

                async for chunk in response.content.iter_any():
                    if self._cancel_event.is_set():
                        logger.info("TTS synthesis cancelled")
                        return

                    buffer += chunk

                    # Yield complete chunks
                    while len(buffer) >= chunk_size:
                        audio_chunk = buffer[:chunk_size]
                        buffer = buffer[chunk_size:]

                        # Keep one full chunk buffered so we can always tag the true
                        # last full chunk as final when stream length is an exact multiple.
                        if pending_chunk is not None:
                            yield TTSChunk(
                                audio=pending_chunk,
                                sample_rate=self.sample_rate,
                                is_final=False
                            )
                        pending_chunk = audio_chunk

                # Flush pending chunk(s) and remaining tail.
                if pending_chunk is not None:
                    if buffer:
                        # A partial tail remains, so the held chunk is not final.
                        yield TTSChunk(
                            audio=pending_chunk,
                            sample_rate=self.sample_rate,
                            is_final=False
                        )
                        pending_chunk = None
                    else:
                        # Exact multiple of chunk_size: the held chunk is final.
                        yield TTSChunk(
                            audio=pending_chunk,
                            sample_rate=self.sample_rate,
                            is_final=True
                        )
                        pending_chunk = None

                if buffer:
                    # Partial tail carries the final flag.
                    yield TTSChunk(
                        audio=buffer,
                        sample_rate=self.sample_rate,
                        is_final=True
                    )

        except asyncio.CancelledError:
            logger.info("TTS synthesis cancelled via asyncio")
            raise
        except Exception as e:
            logger.error(f"TTS synthesis error: {e}")
            raise

    async def cancel(self) -> None:
        """Cancel ongoing synthesis."""
        self._cancel_event.set()
class StreamingTTSAdapter:
    """
    Bridges streaming LLM text output to sentence-level TTS.

    Speech starts as soon as one full sentence has arrived from the LLM,
    instead of waiting for the complete response, which reduces latency.
    """

    # Characters treated as sentence boundaries (CJK and ASCII).
    SENTENCE_ENDS = {',', '。', '!', '?', '.', '!', '?', '\n'}

    def __init__(self, tts_service: BaseTTSService, transport, session_id: str):
        self.tts_service = tts_service
        self.transport = transport
        self.session_id = session_id
        self._buffer = ""
        self._cancel_event = asyncio.Event()
        self._is_speaking = False

    def _is_non_sentence_period(self, text: str, idx: int) -> bool:
        """Return True when the '.' at ``idx`` is NOT a sentence delimiter.

        Recognizes decimal/version numbers (1.2, v1.2.3) and the "No."
        abbreviation followed by a number (No.1, No. 1).
        """
        if text[idx] != ".":
            return False

        # Digit on both sides -> decimal point or version separator.
        if 0 < idx < len(text) - 1 and text[idx - 1].isdigit() and text[idx + 1].isdigit():
            return True

        # Collect the alphabetic token immediately before the period.
        start = idx - 1
        while start >= 0 and text[start].isalpha():
            start -= 1
        token = text[start + 1:idx].lower()
        if token == "no":
            # "No." counts only when a digit follows (optionally after spaces).
            after = idx + 1
            while after < len(text) and text[after].isspace():
                after += 1
            if after < len(text) and text[after].isdigit():
                return True

        return False

    def _find_sentence_boundary(self) -> int:
        """Index of the first real sentence delimiter in the buffer, or -1."""
        for i, ch in enumerate(self._buffer):
            if ch == "." and self._is_non_sentence_period(self._buffer, i):
                continue
            if ch in self.SENTENCE_ENDS:
                return i
        return -1

    async def process_text_chunk(self, text_chunk: str) -> None:
        """
        Accumulate LLM text and speak every completed sentence.

        Args:
            text_chunk: Text chunk from LLM streaming
        """
        if self._cancel_event.is_set():
            return

        self._buffer += text_chunk

        while True:
            boundary = self._find_sentence_boundary()
            if boundary < 0:
                break

            # Swallow any run of trailing delimiters (e.g. "?!").
            end = boundary + 1
            while end < len(self._buffer) and self._buffer[end] in self.SENTENCE_ENDS:
                end += 1

            sentence = self._buffer[:end].strip()
            self._buffer = self._buffer[end:]

            # Skip delimiter-only fragments.
            if sentence and any(ch.isalnum() for ch in sentence):
                await self._speak_sentence(sentence)

    async def flush(self) -> None:
        """Speak whatever text remains in the buffer."""
        leftover = self._buffer.strip()
        if leftover and not self._cancel_event.is_set():
            await self._speak_sentence(leftover)
            self._buffer = ""

    async def _speak_sentence(self, text: str) -> None:
        """Synthesize one sentence and stream its audio to the transport."""
        if not text or self._cancel_event.is_set():
            return

        self._is_speaking = True
        try:
            async for chunk in self.tts_service.synthesize_stream(text):
                if self._cancel_event.is_set():
                    break
                await self.transport.send_audio(chunk.audio)
                await asyncio.sleep(0.01)  # Prevent flooding
        except Exception as e:
            logger.error(f"TTS speak error: {e}")
        finally:
            self._is_speaking = False

    def cancel(self) -> None:
        """Abort ongoing speech and drop buffered text."""
        self._cancel_event.set()
        self._buffer = ""

    def reset(self) -> None:
        """Prepare the adapter for a new conversation turn."""
        self._buffer = ""
        self._is_speaking = False
        self._cancel_event.clear()

    @property
    def is_speaking(self) -> bool:
        """Whether a sentence is currently being synthesized/sent."""
        return self._is_speaking
# Backward-compatible alias: older code imported the provider-specific name.
SiliconFlowTTSService = OpenAICompatibleTTSService
548
services/realtime.py
Normal file
548
services/realtime.py
Normal file
@@ -0,0 +1,548 @@
|
||||
"""OpenAI Realtime API Service.
|
||||
|
||||
Provides true duplex voice conversation using OpenAI's Realtime API,
|
||||
similar to active-call's RealtimeProcessor. This bypasses the need for
|
||||
separate ASR/LLM/TTS services by handling everything server-side.
|
||||
|
||||
The Realtime API provides:
|
||||
- Server-side VAD with turn detection
|
||||
- Streaming speech-to-text
|
||||
- Streaming LLM responses
|
||||
- Streaming text-to-speech
|
||||
- Function calling support
|
||||
- Barge-in/interruption handling
|
||||
"""
|
||||
|
||||
import os
|
||||
import asyncio
|
||||
import json
|
||||
import base64
|
||||
from typing import Optional, Dict, Any, Callable, Awaitable, List
|
||||
from dataclasses import dataclass, field
|
||||
from enum import Enum
|
||||
from loguru import logger
|
||||
|
||||
try:
|
||||
import websockets
|
||||
WEBSOCKETS_AVAILABLE = True
|
||||
except ImportError:
|
||||
WEBSOCKETS_AVAILABLE = False
|
||||
logger.warning("websockets not available - Realtime API will be disabled")
|
||||
|
||||
|
||||
class RealtimeState(Enum):
    """Realtime API connection state."""
    # No socket open.
    DISCONNECTED = "disconnected"
    # Handshake / session setup in progress.
    CONNECTING = "connecting"
    # Session established; events flowing.
    CONNECTED = "connected"
    # Unrecoverable failure; a reconnect is required.
    ERROR = "error"
@dataclass
class RealtimeConfig:
    """Configuration for OpenAI Realtime API."""

    # API Configuration
    api_key: Optional[str] = None  # falls back to OPENAI_API_KEY when omitted
    model: str = "gpt-4o-realtime-preview"
    endpoint: Optional[str] = None  # For Azure or custom endpoints

    # Voice Configuration
    voice: str = "alloy"  # alloy, echo, shimmer, etc.
    instructions: str = (
        "You are a helpful, friendly voice assistant. "
        "Keep your responses concise and conversational."
    )

    # Turn Detection (Server-side VAD); set to None to disable server VAD.
    turn_detection: Optional[Dict[str, Any]] = field(default_factory=lambda: {
        "type": "server_vad",
        "threshold": 0.5,
        "prefix_padding_ms": 300,
        "silence_duration_ms": 500
    })

    # Audio Configuration (PCM16 in both directions)
    input_audio_format: str = "pcm16"
    output_audio_format: str = "pcm16"

    # Tools/Functions exposed to the model for function calling
    tools: List[Dict[str, Any]] = field(default_factory=list)
class RealtimeService:
    """
    OpenAI Realtime API service for true duplex voice conversation.

    This service handles the entire voice conversation pipeline:
    1. Audio input → Server-side VAD → Speech-to-text
    2. Text → LLM processing → Response generation
    3. Response → Text-to-speech → Audio output

    Events emitted:
    - on_audio: Audio output from the assistant
    - on_transcript: Text transcript (user or assistant)
    - on_speech_started: User started speaking
    - on_speech_stopped: User stopped speaking
    - on_response_started: Assistant started responding
    - on_response_done: Assistant finished responding
    - on_function_call: Function call requested
    - on_error: Error occurred
    """

    def __init__(self, config: Optional[RealtimeConfig] = None):
        """
        Initialize Realtime API service.

        Args:
            config: Realtime configuration (uses defaults if not provided)
        """
        self.config = config or RealtimeConfig()
        # Fall back to the environment variable when no key was configured.
        self.config.api_key = self.config.api_key or os.getenv("OPENAI_API_KEY")

        self.state = RealtimeState.DISCONNECTED
        self._ws = None  # active websocket connection, if any
        self._receive_task: Optional[asyncio.Task] = None
        self._cancel_event = asyncio.Event()

        # Event callbacks, keyed by event name (see class docstring).
        self._callbacks: Dict[str, List[Callable]] = {
            "on_audio": [],
            "on_transcript": [],
            "on_speech_started": [],
            "on_speech_stopped": [],
            "on_response_started": [],
            "on_response_done": [],
            "on_function_call": [],
            "on_error": [],
            "on_interrupted": [],
        }

        logger.debug(f"RealtimeService initialized with model={self.config.model}")

    def on(self, event: str, callback: Callable[..., Awaitable[None]]) -> None:
        """
        Register event callback.

        Unknown event names are silently ignored.

        Args:
            event: Event name
            callback: Async callback function
        """
        if event in self._callbacks:
            self._callbacks[event].append(callback)

    async def _emit(self, event: str, *args, **kwargs) -> None:
        """Emit event to all registered callbacks.

        A failing callback is logged but does not stop the others.
        """
        for callback in self._callbacks.get(event, []):
            try:
                await callback(*args, **kwargs)
            except Exception as e:
                logger.error(f"Event callback error ({event}): {e}")

    async def connect(self) -> None:
        """Connect to OpenAI Realtime API.

        Raises:
            RuntimeError: If the websockets package is not installed.
            ValueError: If no API key is configured.
        """
        if not WEBSOCKETS_AVAILABLE:
            raise RuntimeError("websockets package not installed")

        if not self.config.api_key:
            raise ValueError("OpenAI API key not provided")

        self.state = RealtimeState.CONNECTING

        # Build URL
        if self.config.endpoint:
            # Azure or custom endpoint
            url = f"{self.config.endpoint}/openai/realtime?api-version=2024-10-01-preview&deployment={self.config.model}"
        else:
            # OpenAI endpoint
            url = f"wss://api.openai.com/v1/realtime?model={self.config.model}"

        # Build headers (Azure uses an api-key header; OpenAI uses Bearer auth).
        headers = {}
        if self.config.endpoint:
            headers["api-key"] = self.config.api_key
        else:
            headers["Authorization"] = f"Bearer {self.config.api_key}"
            headers["OpenAI-Beta"] = "realtime=v1"

        try:
            logger.info(f"Connecting to Realtime API: {url}")
            # NOTE(review): `extra_headers` was renamed `additional_headers`
            # in websockets >= 14 — confirm the pinned websockets version.
            self._ws = await websockets.connect(url, extra_headers=headers)

            # Send session configuration
            await self._configure_session()

            # Start receive loop
            self._receive_task = asyncio.create_task(self._receive_loop())

            self.state = RealtimeState.CONNECTED
            logger.info("Realtime API connected successfully")

        except Exception as e:
            self.state = RealtimeState.ERROR
            logger.error(f"Realtime API connection failed: {e}")
            raise

    async def _configure_session(self) -> None:
        """Send session configuration to server."""
        session_config = {
            "type": "session.update",
            "session": {
                "modalities": ["text", "audio"],
                "instructions": self.config.instructions,
                "voice": self.config.voice,
                "input_audio_format": self.config.input_audio_format,
                "output_audio_format": self.config.output_audio_format,
                "turn_detection": self.config.turn_detection,
            }
        }

        # Tools are optional; only include the key when configured.
        if self.config.tools:
            session_config["session"]["tools"] = self.config.tools

        await self._send(session_config)
        logger.debug("Session configuration sent")

    async def _send(self, data: Dict[str, Any]) -> None:
        """Send JSON data to server (no-op when the socket is not open)."""
        if self._ws:
            await self._ws.send(json.dumps(data))

    async def send_audio(self, audio_bytes: bytes) -> None:
        """
        Send audio to the Realtime API.

        Args:
            audio_bytes: PCM audio data (16-bit, mono, 24kHz by default)
        """
        if self.state != RealtimeState.CONNECTED:
            return

        # Encode audio as base64 (the API carries audio inside JSON).
        audio_b64 = base64.standard_b64encode(audio_bytes).decode()

        await self._send({
            "type": "input_audio_buffer.append",
            "audio": audio_b64
        })

    async def send_text(self, text: str) -> None:
        """
        Send text input (bypassing audio).

        Args:
            text: User text input
        """
        if self.state != RealtimeState.CONNECTED:
            return

        # Create a conversation item with user text
        await self._send({
            "type": "conversation.item.create",
            "item": {
                "type": "message",
                "role": "user",
                "content": [{"type": "input_text", "text": text}]
            }
        })

        # Trigger response
        await self._send({"type": "response.create"})

    async def cancel_response(self) -> None:
        """Cancel the current response (for barge-in)."""
        if self.state != RealtimeState.CONNECTED:
            return

        await self._send({"type": "response.cancel"})
        logger.debug("Response cancelled")

    async def commit_audio(self) -> None:
        """Commit the audio buffer and trigger response."""
        if self.state != RealtimeState.CONNECTED:
            return

        await self._send({"type": "input_audio_buffer.commit"})
        await self._send({"type": "response.create"})

    async def clear_audio_buffer(self) -> None:
        """Clear the input audio buffer."""
        if self.state != RealtimeState.CONNECTED:
            return

        await self._send({"type": "input_audio_buffer.clear"})

    async def submit_function_result(self, call_id: str, result: str) -> None:
        """
        Submit function call result.

        Args:
            call_id: The function call ID
            result: JSON string result
        """
        if self.state != RealtimeState.CONNECTED:
            return

        await self._send({
            "type": "conversation.item.create",
            "item": {
                "type": "function_call_output",
                "call_id": call_id,
                "output": result
            }
        })

        # Trigger response with the function result
        await self._send({"type": "response.create"})

    async def _receive_loop(self) -> None:
        """Receive and process messages from the Realtime API.

        Runs until the websocket closes, the task is cancelled, or an
        unexpected error occurs; updates self.state accordingly.
        """
        if not self._ws:
            return

        try:
            async for message in self._ws:
                try:
                    data = json.loads(message)
                    await self._handle_event(data)
                except json.JSONDecodeError:
                    logger.warning(f"Invalid JSON received: {message[:100]}")

        except asyncio.CancelledError:
            logger.debug("Receive loop cancelled")
        except websockets.ConnectionClosed as e:
            logger.info(f"WebSocket closed: {e}")
            self.state = RealtimeState.DISCONNECTED
        except Exception as e:
            logger.error(f"Receive loop error: {e}")
            self.state = RealtimeState.ERROR

    async def _handle_event(self, data: Dict[str, Any]) -> None:
        """Dispatch one incoming Realtime API event to the matching emit."""
        event_type = data.get("type", "unknown")

        # Audio delta - streaming audio output
        if event_type == "response.audio.delta":
            if "delta" in data:
                audio_bytes = base64.standard_b64decode(data["delta"])
                await self._emit("on_audio", audio_bytes)

        # Audio transcript delta - streaming text
        elif event_type == "response.audio_transcript.delta":
            if "delta" in data:
                await self._emit("on_transcript", data["delta"], "assistant", False)

        # Audio transcript done
        elif event_type == "response.audio_transcript.done":
            if "transcript" in data:
                await self._emit("on_transcript", data["transcript"], "assistant", True)

        # Input audio transcript (user speech)
        elif event_type == "conversation.item.input_audio_transcription.completed":
            if "transcript" in data:
                await self._emit("on_transcript", data["transcript"], "user", True)

        # Speech started (server VAD detected speech)
        elif event_type == "input_audio_buffer.speech_started":
            await self._emit("on_speech_started", data.get("audio_start_ms", 0))

        # Speech stopped
        elif event_type == "input_audio_buffer.speech_stopped":
            await self._emit("on_speech_stopped", data.get("audio_end_ms", 0))

        # Response started
        elif event_type == "response.created":
            await self._emit("on_response_started", data.get("response", {}))

        # Response done
        elif event_type == "response.done":
            await self._emit("on_response_done", data.get("response", {}))

        # Function call
        elif event_type == "response.function_call_arguments.done":
            call_id = data.get("call_id")
            name = data.get("name")
            arguments = data.get("arguments", "{}")
            await self._emit("on_function_call", call_id, name, arguments)

        # Error
        elif event_type == "error":
            error = data.get("error", {})
            logger.error(f"Realtime API error: {error}")
            await self._emit("on_error", error)

        # Session events
        elif event_type == "session.created":
            logger.info("Session created")
        elif event_type == "session.updated":
            logger.debug("Session updated")

        else:
            logger.debug(f"Unhandled event type: {event_type}")

    async def disconnect(self) -> None:
        """Disconnect from Realtime API and stop the receive loop."""
        self._cancel_event.set()

        if self._receive_task:
            self._receive_task.cancel()
            try:
                await self._receive_task
            except asyncio.CancelledError:
                pass

        if self._ws:
            await self._ws.close()
            self._ws = None

        self.state = RealtimeState.DISCONNECTED
        logger.info("Realtime API disconnected")
class RealtimePipeline:
|
||||
"""
|
||||
Pipeline adapter for RealtimeService.
|
||||
|
||||
Provides a compatible interface with DuplexPipeline but uses
|
||||
OpenAI Realtime API for all processing.
|
||||
"""
|
||||
|
||||
def __init__(
    self,
    transport,
    session_id: str,
    config: Optional[RealtimeConfig] = None
):
    """
    Initialize Realtime pipeline.

    Args:
        transport: Transport used to push audio/events to the client
        session_id: Session identifier
        config: Realtime configuration
    """
    self.transport = transport
    self.session_id = session_id

    self.service = RealtimeService(config)

    # Wire service events to the pipeline's handlers.
    handlers = (
        ("on_audio", self._on_audio),
        ("on_transcript", self._on_transcript),
        ("on_speech_started", self._on_speech_started),
        ("on_speech_stopped", self._on_speech_stopped),
        ("on_response_started", self._on_response_started),
        ("on_response_done", self._on_response_done),
        ("on_error", self._on_error),
    )
    for event_name, handler in handlers:
        self.service.on(event_name, handler)

    self._is_speaking = False
    self._running = True

    logger.info(f"RealtimePipeline initialized for session {session_id}")
async def start(self) -> None:
|
||||
"""Start the pipeline."""
|
||||
await self.service.connect()
|
||||
|
||||
async def process_audio(self, pcm_bytes: bytes) -> None:
|
||||
"""
|
||||
Process incoming audio.
|
||||
|
||||
Note: Realtime API expects 24kHz audio by default.
|
||||
You may need to resample from 16kHz.
|
||||
"""
|
||||
if not self._running:
|
||||
return
|
||||
|
||||
# TODO: Resample from 16kHz to 24kHz if needed
|
||||
await self.service.send_audio(pcm_bytes)
|
||||
|
||||
async def process_text(self, text: str) -> None:
|
||||
"""Process text input."""
|
||||
if not self._running:
|
||||
return
|
||||
|
||||
await self.service.send_text(text)
|
||||
|
||||
async def interrupt(self) -> None:
|
||||
"""Interrupt current response."""
|
||||
await self.service.cancel_response()
|
||||
await self.transport.send_event({
|
||||
"event": "interrupt",
|
||||
"trackId": self.session_id,
|
||||
"timestamp": self._get_timestamp_ms()
|
||||
})
|
||||
|
||||
async def cleanup(self) -> None:
|
||||
"""Cleanup resources."""
|
||||
self._running = False
|
||||
await self.service.disconnect()
|
||||
|
||||
# Event handlers
|
||||
|
||||
async def _on_audio(self, audio_bytes: bytes) -> None:
|
||||
"""Handle audio output."""
|
||||
await self.transport.send_audio(audio_bytes)
|
||||
|
||||
async def _on_transcript(self, text: str, role: str, is_final: bool) -> None:
|
||||
"""Handle transcript."""
|
||||
logger.info(f"[{role.upper()}] {text[:50]}..." if len(text) > 50 else f"[{role.upper()}] {text}")
|
||||
|
||||
async def _on_speech_started(self, start_ms: int) -> None:
|
||||
"""Handle user speech start."""
|
||||
self._is_speaking = True
|
||||
await self.transport.send_event({
|
||||
"event": "speaking",
|
||||
"trackId": self.session_id,
|
||||
"timestamp": self._get_timestamp_ms(),
|
||||
"startTime": start_ms
|
||||
})
|
||||
|
||||
# Cancel any ongoing response (barge-in)
|
||||
await self.service.cancel_response()
|
||||
|
||||
async def _on_speech_stopped(self, end_ms: int) -> None:
|
||||
"""Handle user speech stop."""
|
||||
self._is_speaking = False
|
||||
await self.transport.send_event({
|
||||
"event": "silence",
|
||||
"trackId": self.session_id,
|
||||
"timestamp": self._get_timestamp_ms(),
|
||||
"duration": end_ms
|
||||
})
|
||||
|
||||
async def _on_response_started(self, response: Dict) -> None:
|
||||
"""Handle response start."""
|
||||
await self.transport.send_event({
|
||||
"event": "trackStart",
|
||||
"trackId": self.session_id,
|
||||
"timestamp": self._get_timestamp_ms()
|
||||
})
|
||||
|
||||
async def _on_response_done(self, response: Dict) -> None:
|
||||
"""Handle response complete."""
|
||||
await self.transport.send_event({
|
||||
"event": "trackEnd",
|
||||
"trackId": self.session_id,
|
||||
"timestamp": self._get_timestamp_ms()
|
||||
})
|
||||
|
||||
async def _on_error(self, error: Dict) -> None:
|
||||
"""Handle error."""
|
||||
await self.transport.send_event({
|
||||
"event": "error",
|
||||
"trackId": self.session_id,
|
||||
"timestamp": self._get_timestamp_ms(),
|
||||
"sender": "realtime",
|
||||
"error": str(error)
|
||||
})
|
||||
|
||||
def _get_timestamp_ms(self) -> int:
|
||||
"""Get current timestamp in milliseconds."""
|
||||
import time
|
||||
return int(time.time() * 1000)
|
||||
|
||||
@property
|
||||
def is_speaking(self) -> bool:
|
||||
"""Check if user is speaking."""
|
||||
return self._is_speaking
|
||||
8
services/siliconflow_asr.py
Normal file
8
services/siliconflow_asr.py
Normal file
@@ -0,0 +1,8 @@
|
||||
"""Backward-compatible imports for legacy siliconflow_asr module."""
|
||||
|
||||
from services.openai_compatible_asr import OpenAICompatibleASRService
|
||||
|
||||
# Backward-compatible alias
|
||||
SiliconFlowASRService = OpenAICompatibleASRService
|
||||
|
||||
__all__ = ["OpenAICompatibleASRService", "SiliconFlowASRService"]
|
||||
8
services/siliconflow_tts.py
Normal file
8
services/siliconflow_tts.py
Normal file
@@ -0,0 +1,8 @@
|
||||
"""Backward-compatible imports for legacy siliconflow_tts module."""
|
||||
|
||||
from services.openai_compatible_tts import OpenAICompatibleTTSService, StreamingTTSAdapter
|
||||
|
||||
# Backward-compatible alias
|
||||
SiliconFlowTTSService = OpenAICompatibleTTSService
|
||||
|
||||
__all__ = ["OpenAICompatibleTTSService", "SiliconFlowTTSService", "StreamingTTSAdapter"]
|
||||
86
services/streaming_text.py
Normal file
86
services/streaming_text.py
Normal file
@@ -0,0 +1,86 @@
|
||||
"""Shared text chunking helpers for streaming TTS."""
|
||||
|
||||
from typing import Optional
|
||||
|
||||
|
||||
def is_non_sentence_period(text: str, idx: int) -> bool:
    """Return True when the '.' at *idx* should NOT end a sentence."""
    if not (0 <= idx < len(text)) or text[idx] != ".":
        return False

    prev_char = text[idx - 1] if idx > 0 else ""
    next_char = text[idx + 1] if idx + 1 < len(text) else ""

    # Decimal or version segment: "1.2", "v1.2.3".
    if prev_char.isdigit() and next_char.isdigit():
        return True

    # "No." style abbreviation followed by a number: "No.1" / "No. 1".
    word_start = idx
    while word_start > 0 and text[word_start - 1].isalpha():
        word_start -= 1
    if text[word_start:idx].lower() == "no":
        after = idx + 1
        while after < len(text) and text[after].isspace():
            after += 1
        if after < len(text) and text[after].isdigit():
            return True

    return False
|
||||
|
||||
|
||||
def has_spoken_content(text: str) -> bool:
    """Return True if *text* contains at least one pronounceable (alphanumeric) character."""
    for character in text:
        if character.isalnum():
            return True
    return False
|
||||
|
||||
|
||||
def extract_tts_sentence(
    text_buffer: str,
    *,
    end_chars: frozenset[str],
    trailing_chars: frozenset[str],
    closers: frozenset[str],
    min_split_spoken_chars: int = 0,
    hold_trailing_at_buffer_end: bool = False,
    force: bool = False,
) -> Optional[tuple[str, str]]:
    """Extract one TTS sentence from text buffer.

    Returns a ``(sentence, remainder)`` pair, or ``None`` when the buffer
    does not yet hold a complete sentence.
    """
    if not text_buffer:
        return None

    buffer_len = len(text_buffer)
    scan_from = 0
    while True:
        # Locate the next terminator, ignoring periods that belong to
        # numbers/abbreviations rather than sentence ends.
        cut = next(
            (
                i
                for i in range(scan_from, buffer_len)
                if text_buffer[i] in end_chars
                and not (text_buffer[i] == "." and is_non_sentence_period(text_buffer, i))
            ),
            -1,
        )
        if cut == -1:
            return None

        # Absorb any run of extra terminators, then closing punctuation.
        stop = cut + 1
        while stop < buffer_len and text_buffer[stop] in trailing_chars:
            stop += 1
        while stop < buffer_len and text_buffer[stop] in closers:
            stop += 1

        # Optionally wait for more text when the split lands at the very end
        # of the buffer (more trailing punctuation may still be streaming in).
        if hold_trailing_at_buffer_end and not force and stop >= buffer_len:
            return None

        candidate = text_buffer[:stop].strip()
        spoken = sum(ch.isalnum() for ch in candidate)

        # Too short to speak on its own: keep scanning for a later split.
        if (
            not force
            and min_split_spoken_chars > 0
            and 0 < spoken < min_split_spoken_chars
            and stop < buffer_len
        ):
            scan_from = stop
            continue

        return candidate, text_buffer[stop:]
|
||||
95
services/streaming_tts_adapter.py
Normal file
95
services/streaming_tts_adapter.py
Normal file
@@ -0,0 +1,95 @@
|
||||
"""Backend-agnostic streaming adapter from LLM text to TTS audio."""
|
||||
|
||||
import asyncio
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from services.base import BaseTTSService
|
||||
from services.streaming_text import extract_tts_sentence, has_spoken_content
|
||||
|
||||
|
||||
class StreamingTTSAdapter:
    """
    Adapter for streaming LLM text to TTS with sentence-level chunking.

    This reduces latency by starting TTS as soon as a complete sentence
    is received from the LLM, rather than waiting for the full response.
    """

    # Characters that terminate a sentence (CJK and ASCII punctuation, newline).
    SENTENCE_ENDS = {"。", "!", "?", ".", "!", "?", "\n"}
    # No closing brackets/quotes are absorbed after a terminator.
    SENTENCE_CLOSERS = frozenset()

    def __init__(self, tts_service: BaseTTSService, transport, session_id: str):
        """
        Args:
            tts_service: Backend used to synthesize each sentence.
            transport: Transport used to send synthesized audio chunks.
            session_id: Session identifier (for logging/routing).
        """
        self.tts_service = tts_service
        self.transport = transport
        self.session_id = session_id
        self._buffer = ""  # accumulated, not-yet-spoken LLM text
        self._cancel_event = asyncio.Event()  # set when current speech is cancelled
        self._is_speaking = False

    async def process_text_chunk(self, text_chunk: str) -> None:
        """
        Process a text chunk from LLM and trigger TTS when sentence is complete.

        Args:
            text_chunk: Text chunk from LLM streaming
        """
        if self._cancel_event.is_set():
            return

        self._buffer += text_chunk

        # Hoisted out of the loop: the end-char set is loop-invariant, and
        # extract_tts_sentence expects frozensets.
        end_chars = frozenset(self.SENTENCE_ENDS)

        # Speak every complete sentence currently in the buffer.
        while True:
            split_result = extract_tts_sentence(
                self._buffer,
                end_chars=end_chars,
                trailing_chars=end_chars,
                closers=self.SENTENCE_CLOSERS,
                force=False,
            )
            if not split_result:
                break

            sentence, self._buffer = split_result
            # Skip punctuation-only fragments: nothing to pronounce.
            if sentence and has_spoken_content(sentence):
                await self._speak_sentence(sentence)

    async def flush(self) -> None:
        """Speak whatever remains in the buffer (end of the LLM response)."""
        if self._buffer.strip() and not self._cancel_event.is_set():
            await self._speak_sentence(self._buffer.strip())
        self._buffer = ""

    async def _speak_sentence(self, text: str) -> None:
        """Synthesize one sentence and stream its audio to the transport."""
        if not text or self._cancel_event.is_set():
            return

        self._is_speaking = True

        try:
            async for chunk in self.tts_service.synthesize_stream(text):
                if self._cancel_event.is_set():
                    break
                await self.transport.send_audio(chunk.audio)
                await asyncio.sleep(0.01)  # Prevent flooding the transport
        except Exception as e:
            # Best-effort: one failed sentence must not kill the whole stream.
            logger.error(f"TTS speak error: {e}")
        finally:
            self._is_speaking = False

    def cancel(self) -> None:
        """Cancel ongoing speech and drop any pending text."""
        self._cancel_event.set()
        self._buffer = ""

    def reset(self) -> None:
        """Reset for a new conversation turn."""
        self._cancel_event.clear()
        self._buffer = ""
        self._is_speaking = False

    @property
    def is_speaking(self) -> bool:
        # True while a sentence is being synthesized/sent.
        return self._is_speaking
|
||||
271
services/tts.py
Normal file
271
services/tts.py
Normal file
@@ -0,0 +1,271 @@
|
||||
"""TTS (Text-to-Speech) Service implementations.
|
||||
|
||||
Provides multiple TTS backend options including edge-tts (free)
|
||||
and placeholder for cloud services.
|
||||
"""
|
||||
|
||||
import os
|
||||
import io
|
||||
import asyncio
|
||||
import struct
|
||||
from typing import AsyncIterator, Optional
|
||||
from loguru import logger
|
||||
|
||||
from services.base import BaseTTSService, TTSChunk, ServiceState
|
||||
|
||||
# Optional dependency: edge-tts provides free Microsoft Edge speech synthesis.
# Import failure is tolerated here so the rest of the package stays importable;
# EdgeTTSService.connect() raises at runtime when the flag is False.
try:
    import edge_tts
    EDGE_TTS_AVAILABLE = True
except ImportError:
    EDGE_TTS_AVAILABLE = False
    logger.warning("edge-tts not available - EdgeTTS service will be disabled")
|
||||
|
||||
|
||||
class EdgeTTSService(BaseTTSService):
    """
    Microsoft Edge TTS service.

    Uses edge-tts library for free, high-quality speech synthesis.
    Supports streaming for low-latency playback.

    Output format: 16-bit mono PCM at ``sample_rate`` (edge-tts emits MP3,
    which is decoded via pydub or an ffmpeg subprocess fallback).
    """

    # Voice mapping for common languages: short language codes are resolved
    # to a full edge-tts neural voice name in __init__.
    VOICE_MAP = {
        "en": "en-US-JennyNeural",
        "en-US": "en-US-JennyNeural",
        "en-GB": "en-GB-SoniaNeural",
        "zh": "zh-CN-XiaoxiaoNeural",
        "zh-CN": "zh-CN-XiaoxiaoNeural",
        "zh-TW": "zh-TW-HsiaoChenNeural",
        "ja": "ja-JP-NanamiNeural",
        "ko": "ko-KR-SunHiNeural",
        "fr": "fr-FR-DeniseNeural",
        "de": "de-DE-KatjaNeural",
        "es": "es-ES-ElviraNeural",
    }

    def __init__(
        self,
        voice: str = "en-US-JennyNeural",
        sample_rate: int = 16000,
        speed: float = 1.0
    ):
        """
        Initialize Edge TTS service.

        Args:
            voice: Voice name (e.g., "en-US-JennyNeural") or language code (e.g., "en")
            sample_rate: Target sample rate (will be resampled)
            speed: Speech speed multiplier (1.0 = normal; mapped to a percentage)
        """
        # Resolve voice from language code if needed
        if voice in self.VOICE_MAP:
            voice = self.VOICE_MAP[voice]

        super().__init__(voice=voice, sample_rate=sample_rate, speed=speed)
        # Set by cancel() to abort an in-flight synthesize_stream().
        self._cancel_event = asyncio.Event()

    async def connect(self) -> None:
        """Edge TTS doesn't require explicit connection."""
        # Fail fast here (rather than at first synthesis) when the optional
        # edge-tts dependency is missing.
        if not EDGE_TTS_AVAILABLE:
            raise RuntimeError("edge-tts package not installed")
        self.state = ServiceState.CONNECTED
        logger.info(f"Edge TTS service ready: voice={self.voice}")

    async def disconnect(self) -> None:
        """Edge TTS doesn't require explicit disconnection."""
        self.state = ServiceState.DISCONNECTED
        logger.info("Edge TTS service disconnected")

    def _get_rate_string(self) -> str:
        """Convert speed to rate string for edge-tts."""
        # edge-tts uses percentage format: "+0%", "-10%", "+20%"
        percentage = int((self.speed - 1.0) * 100)
        if percentage >= 0:
            return f"+{percentage}%"
        return f"{percentage}%"

    async def synthesize(self, text: str) -> bytes:
        """
        Synthesize complete audio for text.

        Args:
            text: Text to synthesize

        Returns:
            PCM audio data (16-bit, mono, 16kHz)
        """
        if not EDGE_TTS_AVAILABLE:
            raise RuntimeError("edge-tts not available")

        # Collect all chunks — delegates to the streaming path and
        # concatenates the PCM pieces.
        audio_data = b""
        async for chunk in self.synthesize_stream(text):
            audio_data += chunk.audio

        return audio_data

    async def synthesize_stream(self, text: str) -> AsyncIterator[TTSChunk]:
        """
        Synthesize audio in streaming mode.

        Args:
            text: Text to synthesize

        Yields:
            TTSChunk objects with PCM audio

        Raises:
            RuntimeError: if edge-tts is not installed.

        NOTE(review): the full MP3 is buffered before conversion, so the first
        chunk is only yielded once edge-tts finishes the whole utterance —
        "streaming" here refers to chunked delivery, not incremental synthesis.
        """
        if not EDGE_TTS_AVAILABLE:
            raise RuntimeError("edge-tts not available")

        # Fresh cancellation flag for this utterance.
        self._cancel_event.clear()

        try:
            communicate = edge_tts.Communicate(
                text,
                voice=self.voice,
                rate=self._get_rate_string()
            )

            # edge-tts outputs MP3, we need to decode to PCM
            # For now, collect MP3 chunks and yield after conversion
            mp3_data = b""

            async for chunk in communicate.stream():
                # Check for cancellation
                if self._cancel_event.is_set():
                    logger.info("TTS synthesis cancelled")
                    return

                # Non-"audio" chunk types (e.g. word boundaries) are ignored.
                if chunk["type"] == "audio":
                    mp3_data += chunk["data"]

            # Convert MP3 to PCM
            if mp3_data:
                pcm_data = await self._convert_mp3_to_pcm(mp3_data)
                if pcm_data:
                    # Yield in chunks for streaming playback
                    # 16-bit mono: sample_rate*2 bytes/sec, so //10 = 100ms
                    chunk_size = self.sample_rate * 2 // 10  # 100ms chunks
                    for i in range(0, len(pcm_data), chunk_size):
                        if self._cancel_event.is_set():
                            return

                        chunk_data = pcm_data[i:i + chunk_size]
                        yield TTSChunk(
                            audio=chunk_data,
                            sample_rate=self.sample_rate,
                            is_final=(i + chunk_size >= len(pcm_data))
                        )

        except asyncio.CancelledError:
            logger.info("TTS synthesis cancelled via asyncio")
            raise
        except Exception as e:
            logger.error(f"TTS synthesis error: {e}")
            raise

    async def _convert_mp3_to_pcm(self, mp3_data: bytes) -> bytes:
        """
        Convert MP3 audio to PCM.

        Uses pydub or ffmpeg for conversion.

        Returns b"" (best-effort) when neither conversion path succeeds,
        which makes synthesize_stream yield nothing for this utterance.
        """
        try:
            # Try using pydub (requires ffmpeg)
            from pydub import AudioSegment

            # Load MP3 from bytes
            audio = AudioSegment.from_mp3(io.BytesIO(mp3_data))

            # Convert to target format
            audio = audio.set_frame_rate(self.sample_rate)
            audio = audio.set_channels(1)
            audio = audio.set_sample_width(2)  # 16-bit

            # Export as raw PCM
            return audio.raw_data

        except ImportError:
            logger.warning("pydub not available, trying fallback")
            # Fallback: Use subprocess to call ffmpeg directly
            return await self._ffmpeg_convert(mp3_data)
        except Exception as e:
            logger.error(f"Audio conversion error: {e}")
            return b""

    async def _ffmpeg_convert(self, mp3_data: bytes) -> bytes:
        """Convert MP3 to PCM using ffmpeg subprocess.

        Pipes the MP3 bytes to ffmpeg stdin and reads raw signed 16-bit
        little-endian mono PCM at self.sample_rate from stdout.
        Returns b"" on any failure (e.g. ffmpeg not installed).
        """
        try:
            process = await asyncio.create_subprocess_exec(
                "ffmpeg",
                "-i", "pipe:0",
                "-f", "s16le",
                "-acodec", "pcm_s16le",
                "-ar", str(self.sample_rate),
                "-ac", "1",
                "pipe:1",
                stdin=asyncio.subprocess.PIPE,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.DEVNULL
            )

            stdout, _ = await process.communicate(input=mp3_data)
            return stdout

        except Exception as e:
            logger.error(f"ffmpeg conversion error: {e}")
            return b""

    async def cancel(self) -> None:
        """Cancel ongoing synthesis."""
        # synthesize_stream checks this flag between chunks and stops early.
        self._cancel_event.set()
|
||||
|
||||
|
||||
class MockTTSService(BaseTTSService):
    """
    Mock TTS service for testing without actual synthesis.

    Produces silent PCM whose length is proportional to the word count,
    so pipelines can be exercised without a real audio backend.
    """

    def __init__(
        self,
        voice: str = "mock",
        sample_rate: int = 16000,
        speed: float = 1.0
    ):
        super().__init__(voice=voice, sample_rate=sample_rate, speed=speed)

    async def connect(self) -> None:
        """Mark the service as connected (there is no real backend to reach)."""
        self.state = ServiceState.CONNECTED
        logger.info("Mock TTS service connected")

    async def disconnect(self) -> None:
        """Mark the service as disconnected."""
        self.state = ServiceState.DISCONNECTED
        logger.info("Mock TTS service disconnected")

    async def synthesize(self, text: str) -> bytes:
        """Return 16-bit silence, roughly 100ms per whitespace-separated word."""
        n_words = len(text.split())
        n_samples = int(self.sample_rate * (n_words * 100) / 1000)

        # Silence is just zero bytes; 16-bit mono = 2 bytes per sample.
        return bytes(2 * n_samples)

    async def synthesize_stream(self, text: str) -> AsyncIterator[TTSChunk]:
        """Yield the silence in 100ms chunks, pacing like a real engine."""
        pcm = await self.synthesize(text)

        step = self.sample_rate * 2 // 10  # bytes per 100ms of 16-bit audio
        offset = 0
        while offset < len(pcm):
            piece = pcm[offset:offset + step]
            offset += step
            yield TTSChunk(
                audio=piece,
                sample_rate=self.sample_rate,
                is_final=offset >= len(pcm)
            )
            await asyncio.sleep(0.05)  # Simulate processing time
|
||||
Reference in New Issue
Block a user