Add opener audio functionality to Assistant model and related schemas, enabling audio generation and playback features. Update API routes and frontend components to support opener audio management, including status retrieval and generation controls.

This commit is contained in:
Xin Wang
2026-02-26 14:31:50 +08:00
parent 833cb0d4c4
commit fb95e2abe2
9 changed files with 551 additions and 4 deletions

View File

@@ -12,12 +12,17 @@ event-driven design.
"""
import asyncio
import audioop
import io
import json
import time
import uuid
import wave
from pathlib import Path
from typing import Any, Awaitable, Callable, Dict, List, Optional, Tuple
import numpy as np
import aiohttp
from loguru import logger
from app.config import settings
@@ -203,6 +208,7 @@ class DuplexPipeline:
self._runtime_first_turn_mode: str = "bot_first"
self._runtime_greeting: Optional[str] = None
self._runtime_generated_opener_enabled: Optional[bool] = None
self._runtime_opener_audio: Dict[str, Any] = {}
self._runtime_barge_in_enabled: Optional[bool] = None
self._runtime_barge_in_min_duration_ms: Optional[int] = None
self._runtime_knowledge: Dict[str, Any] = {}
@@ -320,6 +326,9 @@ class DuplexPipeline:
knowledge = metadata.get("knowledge")
if isinstance(knowledge, dict):
self._runtime_knowledge = knowledge
opener_audio = metadata.get("openerAudio")
if isinstance(opener_audio, dict):
self._runtime_opener_audio = dict(opener_audio)
kb_id = str(knowledge.get("kbId") or knowledge.get("knowledgeBaseId") or "").strip()
if kb_id:
self._runtime_knowledge_base_id = kb_id
@@ -770,10 +779,117 @@ class DuplexPipeline:
)
await self.conversation.add_assistant_turn(greeting_to_speak)
if self._tts_output_enabled():
used_preloaded_audio = await self._play_preloaded_opener_audio()
if self._tts_output_enabled() and not used_preloaded_audio:
# Keep opener text ahead of opener voice start.
await self._speak(greeting_to_speak, audio_event_priority=30)
async def _play_preloaded_opener_audio(self) -> bool:
"""
Play opener audio from runtime metadata cache or YAML-configured local file.
Returns True when preloaded audio is played successfully.
"""
if not self._tts_output_enabled():
return False
pcm_bytes = await self._load_preloaded_opener_pcm()
if not pcm_bytes:
return False
try:
self._drop_outbound_audio = False
self._start_tts()
await self._send_event(
{
**ev(
"output.audio.start",
trackId=self.track_audio_out,
)
},
priority=30,
)
self._is_bot_speaking = True
await self._send_audio(pcm_bytes, priority=50)
await self._flush_audio_out_frames(priority=50)
await self._send_event(
{
**ev(
"output.audio.end",
trackId=self.track_audio_out,
)
},
priority=30,
)
return True
except Exception as e:
logger.warning(f"Failed to play preloaded opener audio, fallback to TTS: {e}")
return False
finally:
self._is_bot_speaking = False
async def _load_preloaded_opener_pcm(self) -> Optional[bytes]:
# 1) Runtime metadata from backend config
opener_audio = self._runtime_opener_audio if isinstance(self._runtime_opener_audio, dict) else {}
if bool(opener_audio.get("enabled")) and bool(opener_audio.get("ready")):
pcm_url = str(opener_audio.get("pcmUrl") or "").strip()
if pcm_url:
resolved_url = pcm_url
if pcm_url.startswith("/"):
backend_url = str(settings.backend_url or "").strip().rstrip("/")
if backend_url:
resolved_url = f"{backend_url}{pcm_url}"
try:
timeout = aiohttp.ClientTimeout(total=10)
async with aiohttp.ClientSession(timeout=timeout) as session:
async with session.get(resolved_url) as resp:
resp.raise_for_status()
payload = await resp.read()
if payload:
return payload
except Exception as e:
logger.warning(f"Failed to fetch opener audio from backend ({resolved_url}): {e}")
# 2) Standalone fallback via YAML
opener_audio_file = str(settings.duplex_opener_audio_file or "").strip()
if not opener_audio_file:
return None
path = Path(opener_audio_file)
if not path.is_absolute():
path = (Path.cwd() / path).resolve()
if not path.exists() or not path.is_file():
logger.warning(f"Configured opener audio file does not exist: {path}")
return None
try:
raw = path.read_bytes()
suffix = path.suffix.lower()
if suffix == ".wav":
pcm, _ = self._wav_to_pcm16_mono_16k(raw)
return pcm
# .pcm raw pcm_s16le 16k mono
return raw
except Exception as e:
logger.warning(f"Failed to read opener audio file {path}: {e}")
return None
def _wav_to_pcm16_mono_16k(self, wav_bytes: bytes) -> Tuple[bytes, int]:
with wave.open(io.BytesIO(wav_bytes), "rb") as wav_file:
channels = wav_file.getnchannels()
sample_width = wav_file.getsampwidth()
sample_rate = wav_file.getframerate()
nframes = wav_file.getnframes()
raw = wav_file.readframes(nframes)
if sample_width != 2:
raise ValueError(f"Unsupported WAV sample width: {sample_width * 8}bit")
if channels > 1:
raw = audioop.tomono(raw, sample_width, 0.5, 0.5)
if sample_rate != 16000:
raw, _ = audioop.ratecv(raw, sample_width, 1, sample_rate, 16000, None)
duration_ms = int((len(raw) / (16000 * 2)) * 1000)
return raw, duration_ms
async def _enqueue_outbound(self, kind: str, payload: Any, priority: int) -> None:
"""Queue outbound message with priority ordering."""
self._outbound_seq += 1

View File

@@ -59,6 +59,7 @@ class Session:
"bargeIn",
"knowledge",
"knowledgeBaseId",
"openerAudio",
"history",
"userId",
"assistantId",
@@ -840,6 +841,7 @@ class Session:
"bargeIn",
"knowledgeBaseId",
"knowledge",
"openerAudio",
"history",
"userId",
"source",