Add opener audio functionality to Assistant model and related schemas, enabling audio generation and playback features. Update API routes and frontend components to support opener audio management, including status retrieval and generation controls.

2026-02-26 14:31:50 +08:00
parent 833cb0d4c4
commit fb95e2abe2
9 changed files with 551 additions and 4 deletions
--- a/engine/core/duplex_pipeline.py
+++ b/engine/core/duplex_pipeline.py
@@ -12,12 +12,17 @@ event-driven design.
 """

 import asyncio
+import audioop
+import io
 import json
 import time
 import uuid
+import wave
+from pathlib import Path
 from typing import Any, Awaitable, Callable, Dict, List, Optional, Tuple

 import numpy as np
+import aiohttp
 from loguru import logger

 from app.config import settings
@@ -203,6 +208,7 @@ class DuplexPipeline:
        self._runtime_first_turn_mode: str = "bot_first"
        self._runtime_greeting: Optional[str] = None
        self._runtime_generated_opener_enabled: Optional[bool] = None
+        self._runtime_opener_audio: Dict[str, Any] = {}
        self._runtime_barge_in_enabled: Optional[bool] = None
        self._runtime_barge_in_min_duration_ms: Optional[int] = None
        self._runtime_knowledge: Dict[str, Any] = {}
@@ -320,6 +326,9 @@ class DuplexPipeline:
        knowledge = metadata.get("knowledge")
        if isinstance(knowledge, dict):
            self._runtime_knowledge = knowledge
+        opener_audio = metadata.get("openerAudio")
+        if isinstance(opener_audio, dict):
+            self._runtime_opener_audio = dict(opener_audio)
            kb_id = str(knowledge.get("kbId") or knowledge.get("knowledgeBaseId") or "").strip()
            if kb_id:
                self._runtime_knowledge_base_id = kb_id
@@ -770,10 +779,117 @@ class DuplexPipeline:
        )
        await self.conversation.add_assistant_turn(greeting_to_speak)

-        if self._tts_output_enabled():
+        used_preloaded_audio = await self._play_preloaded_opener_audio()
+        if self._tts_output_enabled() and not used_preloaded_audio:
            # Keep opener text ahead of opener voice start.
            await self._speak(greeting_to_speak, audio_event_priority=30)

+    async def _play_preloaded_opener_audio(self) -> bool:
+        """
+        Play opener audio from runtime metadata cache or YAML-configured local file.
+
+        Returns True when preloaded audio is played successfully.
+        """
+        if not self._tts_output_enabled():
+            return False
+
+        pcm_bytes = await self._load_preloaded_opener_pcm()
+        if not pcm_bytes:
+            return False
+
+        try:
+            self._drop_outbound_audio = False
+            self._start_tts()
+            await self._send_event(
+                {
+                    **ev(
+                        "output.audio.start",
+                        trackId=self.track_audio_out,
+                    )
+                },
+                priority=30,
+            )
+
+            self._is_bot_speaking = True
+            await self._send_audio(pcm_bytes, priority=50)
+            await self._flush_audio_out_frames(priority=50)
+            await self._send_event(
+                {
+                    **ev(
+                        "output.audio.end",
+                        trackId=self.track_audio_out,
+                    )
+                },
+                priority=30,
+            )
+            return True
+        except Exception as e:
+            logger.warning(f"Failed to play preloaded opener audio, fallback to TTS: {e}")
+            return False
+        finally:
+            self._is_bot_speaking = False
+
+    async def _load_preloaded_opener_pcm(self) -> Optional[bytes]:
+        # 1) Runtime metadata from backend config
+        opener_audio = self._runtime_opener_audio if isinstance(self._runtime_opener_audio, dict) else {}
+        if bool(opener_audio.get("enabled")) and bool(opener_audio.get("ready")):
+            pcm_url = str(opener_audio.get("pcmUrl") or "").strip()
+            if pcm_url:
+                resolved_url = pcm_url
+                if pcm_url.startswith("/"):
+                    backend_url = str(settings.backend_url or "").strip().rstrip("/")
+                    if backend_url:
+                        resolved_url = f"{backend_url}{pcm_url}"
+                try:
+                    timeout = aiohttp.ClientTimeout(total=10)
+                    async with aiohttp.ClientSession(timeout=timeout) as session:
+                        async with session.get(resolved_url) as resp:
+                            resp.raise_for_status()
+                            payload = await resp.read()
+                            if payload:
+                                return payload
+                except Exception as e:
+                    logger.warning(f"Failed to fetch opener audio from backend ({resolved_url}): {e}")
+
+        # 2) Standalone fallback via YAML
+        opener_audio_file = str(settings.duplex_opener_audio_file or "").strip()
+        if not opener_audio_file:
+            return None
+        path = Path(opener_audio_file)
+        if not path.is_absolute():
+            path = (Path.cwd() / path).resolve()
+        if not path.exists() or not path.is_file():
+            logger.warning(f"Configured opener audio file does not exist: {path}")
+            return None
+        try:
+            raw = path.read_bytes()
+            suffix = path.suffix.lower()
+            if suffix == ".wav":
+                pcm, _ = self._wav_to_pcm16_mono_16k(raw)
+                return pcm
+            # .pcm raw pcm_s16le 16k mono
+            return raw
+        except Exception as e:
+            logger.warning(f"Failed to read opener audio file {path}: {e}")
+            return None
+
+    def _wav_to_pcm16_mono_16k(self, wav_bytes: bytes) -> Tuple[bytes, int]:
+        with wave.open(io.BytesIO(wav_bytes), "rb") as wav_file:
+            channels = wav_file.getnchannels()
+            sample_width = wav_file.getsampwidth()
+            sample_rate = wav_file.getframerate()
+            nframes = wav_file.getnframes()
+            raw = wav_file.readframes(nframes)
+
+        if sample_width != 2:
+            raise ValueError(f"Unsupported WAV sample width: {sample_width * 8}bit")
+        if channels > 1:
+            raw = audioop.tomono(raw, sample_width, 0.5, 0.5)
+        if sample_rate != 16000:
+            raw, _ = audioop.ratecv(raw, sample_width, 1, sample_rate, 16000, None)
+        duration_ms = int((len(raw) / (16000 * 2)) * 1000)
+        return raw, duration_ms
+
    async def _enqueue_outbound(self, kind: str, payload: Any, priority: int) -> None:
        """Queue outbound message with priority ordering."""
        self._outbound_seq += 1
--- a/engine/core/session.py
+++ b/engine/core/session.py
@@ -59,6 +59,7 @@ class Session:
        "bargeIn",
        "knowledge",
        "knowledgeBaseId",
+        "openerAudio",
        "history",
        "userId",
        "assistantId",
@@ -840,6 +841,7 @@ class Session:
            "bargeIn",
            "knowledgeBaseId",
            "knowledge",
+            "openerAudio",
            "history",
            "userId",
            "source",