Add opener audio functionality to Assistant model and related schemas, enabling audio generation and playback features. Update API routes and frontend components to support opener audio management, including status retrieval and generation controls.
This commit is contained in:
@@ -12,12 +12,17 @@ event-driven design.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import audioop
|
||||
import io
|
||||
import json
|
||||
import time
|
||||
import uuid
|
||||
import wave
|
||||
from pathlib import Path
|
||||
from typing import Any, Awaitable, Callable, Dict, List, Optional, Tuple
|
||||
|
||||
import numpy as np
|
||||
import aiohttp
|
||||
from loguru import logger
|
||||
|
||||
from app.config import settings
|
||||
@@ -203,6 +208,7 @@ class DuplexPipeline:
|
||||
self._runtime_first_turn_mode: str = "bot_first"
|
||||
self._runtime_greeting: Optional[str] = None
|
||||
self._runtime_generated_opener_enabled: Optional[bool] = None
|
||||
self._runtime_opener_audio: Dict[str, Any] = {}
|
||||
self._runtime_barge_in_enabled: Optional[bool] = None
|
||||
self._runtime_barge_in_min_duration_ms: Optional[int] = None
|
||||
self._runtime_knowledge: Dict[str, Any] = {}
|
||||
@@ -320,6 +326,9 @@ class DuplexPipeline:
|
||||
knowledge = metadata.get("knowledge")
|
||||
if isinstance(knowledge, dict):
|
||||
self._runtime_knowledge = knowledge
|
||||
opener_audio = metadata.get("openerAudio")
|
||||
if isinstance(opener_audio, dict):
|
||||
self._runtime_opener_audio = dict(opener_audio)
|
||||
kb_id = str(knowledge.get("kbId") or knowledge.get("knowledgeBaseId") or "").strip()
|
||||
if kb_id:
|
||||
self._runtime_knowledge_base_id = kb_id
|
||||
@@ -770,10 +779,117 @@ class DuplexPipeline:
|
||||
)
|
||||
await self.conversation.add_assistant_turn(greeting_to_speak)
|
||||
|
||||
if self._tts_output_enabled():
|
||||
used_preloaded_audio = await self._play_preloaded_opener_audio()
|
||||
if self._tts_output_enabled() and not used_preloaded_audio:
|
||||
# Keep opener text ahead of opener voice start.
|
||||
await self._speak(greeting_to_speak, audio_event_priority=30)
|
||||
|
||||
async def _play_preloaded_opener_audio(self) -> bool:
|
||||
"""
|
||||
Play opener audio from runtime metadata cache or YAML-configured local file.
|
||||
|
||||
Returns True when preloaded audio is played successfully.
|
||||
"""
|
||||
if not self._tts_output_enabled():
|
||||
return False
|
||||
|
||||
pcm_bytes = await self._load_preloaded_opener_pcm()
|
||||
if not pcm_bytes:
|
||||
return False
|
||||
|
||||
try:
|
||||
self._drop_outbound_audio = False
|
||||
self._start_tts()
|
||||
await self._send_event(
|
||||
{
|
||||
**ev(
|
||||
"output.audio.start",
|
||||
trackId=self.track_audio_out,
|
||||
)
|
||||
},
|
||||
priority=30,
|
||||
)
|
||||
|
||||
self._is_bot_speaking = True
|
||||
await self._send_audio(pcm_bytes, priority=50)
|
||||
await self._flush_audio_out_frames(priority=50)
|
||||
await self._send_event(
|
||||
{
|
||||
**ev(
|
||||
"output.audio.end",
|
||||
trackId=self.track_audio_out,
|
||||
)
|
||||
},
|
||||
priority=30,
|
||||
)
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to play preloaded opener audio, fallback to TTS: {e}")
|
||||
return False
|
||||
finally:
|
||||
self._is_bot_speaking = False
|
||||
|
||||
async def _load_preloaded_opener_pcm(self) -> Optional[bytes]:
|
||||
# 1) Runtime metadata from backend config
|
||||
opener_audio = self._runtime_opener_audio if isinstance(self._runtime_opener_audio, dict) else {}
|
||||
if bool(opener_audio.get("enabled")) and bool(opener_audio.get("ready")):
|
||||
pcm_url = str(opener_audio.get("pcmUrl") or "").strip()
|
||||
if pcm_url:
|
||||
resolved_url = pcm_url
|
||||
if pcm_url.startswith("/"):
|
||||
backend_url = str(settings.backend_url or "").strip().rstrip("/")
|
||||
if backend_url:
|
||||
resolved_url = f"{backend_url}{pcm_url}"
|
||||
try:
|
||||
timeout = aiohttp.ClientTimeout(total=10)
|
||||
async with aiohttp.ClientSession(timeout=timeout) as session:
|
||||
async with session.get(resolved_url) as resp:
|
||||
resp.raise_for_status()
|
||||
payload = await resp.read()
|
||||
if payload:
|
||||
return payload
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to fetch opener audio from backend ({resolved_url}): {e}")
|
||||
|
||||
# 2) Standalone fallback via YAML
|
||||
opener_audio_file = str(settings.duplex_opener_audio_file or "").strip()
|
||||
if not opener_audio_file:
|
||||
return None
|
||||
path = Path(opener_audio_file)
|
||||
if not path.is_absolute():
|
||||
path = (Path.cwd() / path).resolve()
|
||||
if not path.exists() or not path.is_file():
|
||||
logger.warning(f"Configured opener audio file does not exist: {path}")
|
||||
return None
|
||||
try:
|
||||
raw = path.read_bytes()
|
||||
suffix = path.suffix.lower()
|
||||
if suffix == ".wav":
|
||||
pcm, _ = self._wav_to_pcm16_mono_16k(raw)
|
||||
return pcm
|
||||
# .pcm raw pcm_s16le 16k mono
|
||||
return raw
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to read opener audio file {path}: {e}")
|
||||
return None
|
||||
|
||||
def _wav_to_pcm16_mono_16k(self, wav_bytes: bytes) -> Tuple[bytes, int]:
|
||||
with wave.open(io.BytesIO(wav_bytes), "rb") as wav_file:
|
||||
channels = wav_file.getnchannels()
|
||||
sample_width = wav_file.getsampwidth()
|
||||
sample_rate = wav_file.getframerate()
|
||||
nframes = wav_file.getnframes()
|
||||
raw = wav_file.readframes(nframes)
|
||||
|
||||
if sample_width != 2:
|
||||
raise ValueError(f"Unsupported WAV sample width: {sample_width * 8}bit")
|
||||
if channels > 1:
|
||||
raw = audioop.tomono(raw, sample_width, 0.5, 0.5)
|
||||
if sample_rate != 16000:
|
||||
raw, _ = audioop.ratecv(raw, sample_width, 1, sample_rate, 16000, None)
|
||||
duration_ms = int((len(raw) / (16000 * 2)) * 1000)
|
||||
return raw, duration_ms
|
||||
|
||||
async def _enqueue_outbound(self, kind: str, payload: Any, priority: int) -> None:
|
||||
"""Queue outbound message with priority ordering."""
|
||||
self._outbound_seq += 1
|
||||
|
||||
@@ -59,6 +59,7 @@ class Session:
|
||||
"bargeIn",
|
||||
"knowledge",
|
||||
"knowledgeBaseId",
|
||||
"openerAudio",
|
||||
"history",
|
||||
"userId",
|
||||
"assistantId",
|
||||
@@ -840,6 +841,7 @@ class Session:
|
||||
"bargeIn",
|
||||
"knowledgeBaseId",
|
||||
"knowledge",
|
||||
"openerAudio",
|
||||
"history",
|
||||
"userId",
|
||||
"source",
|
||||
|
||||
Reference in New Issue
Block a user