diff --git a/api/app/models.py b/api/app/models.py
index 7b6c03f..3aa5bf6 100644
--- a/api/app/models.py
+++ b/api/app/models.py
@@ -138,6 +138,25 @@ class Assistant(Base):
user = relationship("User")
call_records = relationship("CallRecord", back_populates="assistant")
+ opener_audio = relationship("AssistantOpenerAudio", back_populates="assistant", uselist=False, cascade="all, delete-orphan")
+
+
+class AssistantOpenerAudio(Base):
+ __tablename__ = "assistant_opener_audio"
+
+ assistant_id: Mapped[str] = mapped_column(String(64), ForeignKey("assistants.id"), primary_key=True)
+ enabled: Mapped[bool] = mapped_column(default=False)
+ file_path: Mapped[Optional[str]] = mapped_column(String(512), nullable=True)
+ encoding: Mapped[str] = mapped_column(String(32), default="pcm_s16le")
+ sample_rate_hz: Mapped[int] = mapped_column(Integer, default=16000)
+ channels: Mapped[int] = mapped_column(Integer, default=1)
+ duration_ms: Mapped[int] = mapped_column(Integer, default=0)
+ text_hash: Mapped[Optional[str]] = mapped_column(String(128), nullable=True)
+ tts_fingerprint: Mapped[Optional[str]] = mapped_column(String(256), nullable=True)
+ created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow)
+ updated_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow)
+
+ assistant = relationship("Assistant", back_populates="opener_audio")
# ============ Knowledge Base ============
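Note on the relationship wiring above: `uselist=False` plus `cascade="all, delete-orphan"` gives each assistant at most one opener-audio row whose lifecycle follows the parent. A minimal sketch of the resulting ORM behavior (the `SessionLocal` factory and the assistant id are illustrative, not part of this patch):

```python
from app.db import SessionLocal  # hypothetical session factory, not in this diff
from app.models import Assistant, AssistantOpenerAudio

with SessionLocal() as db:
    assistant = db.query(Assistant).filter(Assistant.id == "asst_demo").first()

    # uselist=False makes opener_audio a scalar attribute, not a list.
    if assistant.opener_audio is None:
        db.add(AssistantOpenerAudio(assistant_id=assistant.id, enabled=True))
        db.flush()

    # delete-orphan: detaching the child deletes its row on the next flush,
    # and deleting the assistant cascades to the audio row as well.
    assistant.opener_audio = None
    db.commit()
```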
diff --git a/api/app/routers/assistants.py b/api/app/routers/assistants.py
index 09f338f..fcb1932 100644
--- a/api/app/routers/assistants.py
+++ b/api/app/routers/assistants.py
@@ -1,18 +1,33 @@
+import audioop
+import hashlib
+import io
+import os
+import wave
+from pathlib import Path
+import httpx
from fastapi import APIRouter, Depends, HTTPException
+from fastapi.responses import FileResponse
from sqlalchemy.orm import Session
from typing import Any, Dict, List, Optional
import uuid
from datetime import datetime
from ..db import get_db
-from ..models import Assistant, LLMModel, ASRModel, Voice
+from ..models import Assistant, AssistantOpenerAudio, LLMModel, ASRModel, Voice
from ..schemas import (
- AssistantCreate, AssistantUpdate, AssistantOut, AssistantEngineConfigResponse
+ AssistantCreate,
+ AssistantUpdate,
+ AssistantOut,
+ AssistantEngineConfigResponse,
+ AssistantOpenerAudioGenerateRequest,
+ AssistantOpenerAudioOut,
)
router = APIRouter(prefix="/assistants", tags=["Assistants"])
OPENAI_COMPATIBLE_DEFAULT_MODEL = "FunAudioLLM/CosyVoice2-0.5B"
+OPENAI_COMPATIBLE_DEFAULT_BASE_URL = "https://api.siliconflow.cn/v1"
+OPENER_AUDIO_DIR = Path(__file__).resolve().parents[2] / "data" / "opener_audio"
OPENAI_COMPATIBLE_KNOWN_VOICES = {
"alex",
"anna",
@@ -163,6 +178,19 @@ def _resolve_runtime_metadata(db: Session, assistant: Assistant) -> tuple[Dict[s
"kbId": assistant.knowledge_base_id,
"nResults": 5,
}
+ opener_audio = assistant.opener_audio
+ opener_audio_ready = bool(opener_audio and opener_audio.file_path and Path(opener_audio.file_path).exists())
+ metadata["openerAudio"] = {
+ "enabled": bool(opener_audio.enabled) if opener_audio else False,
+ "ready": opener_audio_ready,
+ "encoding": opener_audio.encoding if opener_audio else "pcm_s16le",
+ "sampleRateHz": int(opener_audio.sample_rate_hz) if opener_audio else 16000,
+ "channels": int(opener_audio.channels) if opener_audio else 1,
+ "durationMs": int(opener_audio.duration_ms) if opener_audio else 0,
+ "textHash": opener_audio.text_hash if opener_audio else None,
+ "ttsFingerprint": opener_audio.tts_fingerprint if opener_audio else None,
+ "pcmUrl": f"/api/assistants/{assistant.id}/opener-audio/pcm" if opener_audio_ready else None,
+ }
return metadata, warnings
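For orientation, this is the shape of the `openerAudio` entry the engine receives in runtime metadata once audio exists on disk (keys mirror the code above; values are made up):

```python
# Illustrative payload only; the hashes and id are placeholders.
opener_audio_metadata = {
    "enabled": True,
    "ready": True,                        # record exists and the PCM file is on disk
    "encoding": "pcm_s16le",
    "sampleRateHz": 16000,
    "channels": 1,
    "durationMs": 2480,
    "textHash": "9f86d081884c7d65...",    # sha256 of the opener text
    "ttsFingerprint": "a665a45920422f...",  # sha256 over provider/model/voice/speed/text
    "pcmUrl": "/api/assistants/asst_demo/opener-audio/pcm",
}
```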
@@ -189,6 +217,8 @@ def _build_engine_assistant_config(db: Session, assistant: Assistant) -> Dict[st
def assistant_to_dict(assistant: Assistant) -> dict:
+ opener_audio = assistant.opener_audio
+ opener_audio_ready = bool(opener_audio and opener_audio.file_path and Path(opener_audio.file_path).exists())
return {
"id": assistant.id,
"name": assistant.name,
@@ -196,6 +226,10 @@ def assistant_to_dict(assistant: Assistant) -> dict:
"firstTurnMode": assistant.first_turn_mode or "bot_first",
"opener": assistant.opener or "",
"generatedOpenerEnabled": bool(assistant.generated_opener_enabled),
+ "openerAudioEnabled": bool(opener_audio.enabled) if opener_audio else False,
+ "openerAudioReady": opener_audio_ready,
+ "openerAudioDurationMs": int(opener_audio.duration_ms) if opener_audio else 0,
+ "openerAudioUpdatedAt": opener_audio.updated_at if opener_audio else None,
"prompt": assistant.prompt or "",
"knowledgeBaseId": assistant.knowledge_base_id,
"language": assistant.language,
@@ -238,6 +272,114 @@ def _apply_assistant_update(assistant: Assistant, update_data: dict) -> None:
setattr(assistant, field_map.get(field, field), value)
+def _ensure_assistant_opener_audio(db: Session, assistant: Assistant) -> AssistantOpenerAudio:
+ record = assistant.opener_audio
+ if record:
+ return record
+ record = AssistantOpenerAudio(assistant_id=assistant.id, enabled=False)
+ db.add(record)
+ db.flush()
+ return record
+
+
+def _resolve_tts_runtime_for_assistant(db: Session, assistant: Assistant) -> tuple[Dict[str, Any], Optional[Voice]]:
+ metadata, _ = _resolve_runtime_metadata(db, assistant)
+ services = metadata.get("services") if isinstance(metadata.get("services"), dict) else {}
+ tts = services.get("tts") if isinstance(services, dict) and isinstance(services.get("tts"), dict) else {}
+ voice = db.query(Voice).filter(Voice.id == assistant.voice).first() if assistant.voice else None
+ return tts, voice
+
+
+def _tts_fingerprint(tts_cfg: Dict[str, Any], opener_text: str) -> str:
+ identity = {
+ "provider": tts_cfg.get("provider"),
+ "model": tts_cfg.get("model"),
+ "voice": tts_cfg.get("voice"),
+ "speed": tts_cfg.get("speed"),
+ "text": opener_text,
+ }
+ # str(dict) is deterministic here because the key order above is fixed.
+ return hashlib.sha256(str(identity).encode("utf-8")).hexdigest()
+
+
+def _synthesize_openai_compatible_wav(
+ *,
+ text: str,
+ model: str,
+ voice_key: str,
+ speed: float,
+ api_key: str,
+ base_url: str,
+) -> bytes:
+ payload = {
+ "model": model or OPENAI_COMPATIBLE_DEFAULT_MODEL,
+ "input": text,
+ "voice": voice_key,
+ "response_format": "wav",
+ "speed": speed,
+ }
+ with httpx.Client(timeout=45.0) as client:
+ response = client.post(
+ f"{base_url.rstrip('/')}/audio/speech",
+ headers={"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"},
+ json=payload,
+ )
+ if response.status_code != 200:
+ detail = response.text
+ try:
+ detail_json = response.json()
+ detail = detail_json.get("error", {}).get("message") or detail_json.get("detail") or detail
+ except Exception:
+ pass
+ raise HTTPException(status_code=502, detail=f"TTS vendor error: {detail}")
+ return response.content
+
+
+def _wav_to_pcm16_mono_16k(wav_bytes: bytes) -> tuple[bytes, int]:
+ with wave.open(io.BytesIO(wav_bytes), "rb") as wav_file:
+ channels = wav_file.getnchannels()
+ sample_width = wav_file.getsampwidth()
+ sample_rate = wav_file.getframerate()
+ frames = wav_file.getnframes()
+ raw = wav_file.readframes(frames)
+
+ if sample_width != 2:
+ raise HTTPException(status_code=400, detail=f"Unsupported WAV sample width: {sample_width * 8}bit")
+
+ if channels > 1:
+ raw = audioop.tomono(raw, sample_width, 0.5, 0.5)
+
+ if sample_rate != 16000:
+ raw, _ = audioop.ratecv(raw, sample_width, 1, sample_rate, 16000, None)
+
+ duration_ms = int((len(raw) / (16000 * 2)) * 1000)
+ return raw, duration_ms
+
+
+def _persist_opener_audio_pcm(assistant_id: str, pcm_bytes: bytes) -> str:
+ OPENER_AUDIO_DIR.mkdir(parents=True, exist_ok=True)
+ file_path = OPENER_AUDIO_DIR / f"{assistant_id}.pcm"
+ with open(file_path, "wb") as f:
+ f.write(pcm_bytes)
+ return str(file_path)
+
+
+def _opener_audio_out(record: Optional[AssistantOpenerAudio]) -> AssistantOpenerAudioOut:
+ if not record:
+ return AssistantOpenerAudioOut()
+ ready = bool(record.file_path and Path(record.file_path).exists())
+ return AssistantOpenerAudioOut(
+ enabled=bool(record.enabled),
+ ready=ready,
+ encoding=record.encoding,
+ sample_rate_hz=record.sample_rate_hz,
+ channels=record.channels,
+ duration_ms=record.duration_ms,
+ updated_at=record.updated_at,
+ text_hash=record.text_hash,
+ tts_fingerprint=record.tts_fingerprint,
+ )
+
+
# ============ Assistants ============
@router.get("")
def list_assistants(
@@ -316,9 +458,132 @@ def create_assistant(data: AssistantCreate, db: Session = Depends(get_db)):
db.add(assistant)
db.commit()
db.refresh(assistant)
+ opener_audio = _ensure_assistant_opener_audio(db, assistant)
+ opener_audio.enabled = bool(data.openerAudioEnabled)
+ opener_audio.updated_at = datetime.utcnow()
+ db.commit()
+ db.refresh(assistant)
return assistant_to_dict(assistant)
+@router.get("/{id}/opener-audio", response_model=AssistantOpenerAudioOut)
+def get_assistant_opener_audio(id: str, db: Session = Depends(get_db)):
+ assistant = db.query(Assistant).filter(Assistant.id == id).first()
+ if not assistant:
+ raise HTTPException(status_code=404, detail="Assistant not found")
+ return _opener_audio_out(assistant.opener_audio)
+
+
+@router.get("/{id}/opener-audio/pcm")
+def get_assistant_opener_audio_pcm(id: str, db: Session = Depends(get_db)):
+ assistant = db.query(Assistant).filter(Assistant.id == id).first()
+ if not assistant:
+ raise HTTPException(status_code=404, detail="Assistant not found")
+ record = assistant.opener_audio
+ if not record or not record.file_path:
+ raise HTTPException(status_code=404, detail="Opener audio not generated")
+ file_path = Path(record.file_path)
+ if not file_path.exists():
+ raise HTTPException(status_code=404, detail="Opener audio file missing")
+ return FileResponse(
+ str(file_path),
+ media_type="application/octet-stream",
+ filename=f"{assistant.id}.pcm",
+ )
+
+
+@router.post("/{id}/opener-audio/generate", response_model=AssistantOpenerAudioOut)
+def generate_assistant_opener_audio(
+ id: str,
+ data: AssistantOpenerAudioGenerateRequest,
+ db: Session = Depends(get_db),
+):
+ assistant = db.query(Assistant).filter(Assistant.id == id).first()
+ if not assistant:
+ raise HTTPException(status_code=404, detail="Assistant not found")
+ if not assistant.voice_output_enabled:
+ raise HTTPException(status_code=400, detail="Voice output is disabled")
+
+ opener_text = (data.text if data.text is not None else assistant.opener or "").strip()
+ if not opener_text:
+ raise HTTPException(status_code=400, detail="Opener text is empty")
+
+ tts_cfg, voice = _resolve_tts_runtime_for_assistant(db, assistant)
+ provider = str(tts_cfg.get("provider") or "").strip().lower()
+ if provider not in {"openai_compatible", "dashscope"}:
+ raise HTTPException(status_code=400, detail=f"Unsupported provider for preloaded opener audio: {provider or 'unknown'}")
+
+ speed = float(tts_cfg.get("speed") or assistant.speed or 1.0)
+ voice_key = str(tts_cfg.get("voice") or "").strip()
+ model = str(tts_cfg.get("model") or "").strip() or OPENAI_COMPATIBLE_DEFAULT_MODEL
+ api_key = str(tts_cfg.get("apiKey") or "").strip()
+ base_url = str(tts_cfg.get("baseUrl") or "").strip()
+
+ if provider == "openai_compatible":
+ if not api_key:
+ if voice and voice.api_key:
+ api_key = voice.api_key.strip()
+ if not api_key:
+ api_key = (os.getenv("SILICONFLOW_API_KEY", "") or os.getenv("TTS_API_KEY", "")).strip()
+ if not api_key:
+ raise HTTPException(status_code=400, detail="TTS API key is missing")
+ if not base_url:
+ base_url = OPENAI_COMPATIBLE_DEFAULT_BASE_URL
+ wav_bytes = _synthesize_openai_compatible_wav(
+ text=opener_text,
+ model=model,
+ voice_key=voice_key,
+ speed=speed,
+ api_key=api_key,
+ base_url=base_url,
+ )
+ else:
+ from .voices import _synthesize_dashscope_preview, DASHSCOPE_DEFAULT_BASE_URL, DASHSCOPE_DEFAULT_MODEL, DASHSCOPE_DEFAULT_VOICE_KEY
+ if not api_key:
+ if voice and voice.api_key:
+ api_key = voice.api_key.strip()
+ if not api_key:
+ api_key = (os.getenv("DASHSCOPE_API_KEY", "") or os.getenv("TTS_API_KEY", "")).strip()
+ if not api_key:
+ raise HTTPException(status_code=400, detail="DashScope API key is missing")
+ if not base_url:
+ base_url = DASHSCOPE_DEFAULT_BASE_URL
+ if not model:
+ model = DASHSCOPE_DEFAULT_MODEL
+ if not voice_key:
+ voice_key = DASHSCOPE_DEFAULT_VOICE_KEY
+ try:
+ wav_bytes = _synthesize_dashscope_preview(
+ text=opener_text,
+ api_key=api_key,
+ base_url=base_url,
+ model=model,
+ voice_key=voice_key,
+ speed=speed,
+ )
+ except Exception as exc:
+ raise HTTPException(status_code=502, detail=f"DashScope opener audio generation failed: {exc}") from exc
+
+ pcm_bytes, duration_ms = _wav_to_pcm16_mono_16k(wav_bytes)
+ record = _ensure_assistant_opener_audio(db, assistant)
+ record.enabled = True
+ record.file_path = _persist_opener_audio_pcm(assistant.id, pcm_bytes)
+ record.encoding = "pcm_s16le"
+ record.sample_rate_hz = 16000
+ record.channels = 1
+ record.duration_ms = duration_ms
+ record.text_hash = hashlib.sha256(opener_text.encode("utf-8")).hexdigest()
+ record.tts_fingerprint = _tts_fingerprint(tts_cfg, opener_text)
+ now = datetime.utcnow()
+ if not record.created_at:
+ record.created_at = now
+ record.updated_at = now
+ assistant.updated_at = now
+ db.commit()
+ db.refresh(assistant)
+ return _opener_audio_out(assistant.opener_audio)
+
+
@router.put("/{id}")
def update_assistant(id: str, data: AssistantUpdate, db: Session = Depends(get_db)):
"""更新助手"""
@@ -327,7 +592,12 @@ def update_assistant(id: str, data: AssistantUpdate, db: Session = Depends(get_d
raise HTTPException(status_code=404, detail="Assistant not found")
update_data = data.model_dump(exclude_unset=True)
+ opener_audio_enabled = update_data.pop("openerAudioEnabled", None)
_apply_assistant_update(assistant, update_data)
+ if opener_audio_enabled is not None:
+ record = _ensure_assistant_opener_audio(db, assistant)
+ record.enabled = bool(opener_audio_enabled)
+ record.updated_at = datetime.utcnow()
assistant.updated_at = datetime.utcnow()
db.commit()
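End to end, the three new endpoints compose like this. A client sketch assuming the router is mounted under `/api` on localhost (base URL and assistant id are placeholders):

```python
import httpx

BASE = "http://localhost:8000/api"   # assumed deployment URL
ASSISTANT = "asst_demo"              # placeholder id

with httpx.Client(timeout=60.0) as client:
    # Synthesize with the assistant's current TTS config and persist the PCM.
    # Omitting "text" (or sending null) falls back to the stored opener.
    resp = client.post(f"{BASE}/assistants/{ASSISTANT}/opener-audio/generate", json={})
    resp.raise_for_status()
    status = resp.json()
    print(status["ready"], status["duration_ms"])

    # Read the cached state without regenerating.
    print(client.get(f"{BASE}/assistants/{ASSISTANT}/opener-audio").json())

    # Download the raw pcm_s16le / 16 kHz / mono bytes the engine streams verbatim.
    pcm = client.get(f"{BASE}/assistants/{ASSISTANT}/opener-audio/pcm")
    pcm.raise_for_status()
    with open(f"{ASSISTANT}.pcm", "wb") as f:
        f.write(pcm.content)
```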
diff --git a/api/app/schemas.py b/api/app/schemas.py
index 8a69287..f81efc8 100644
--- a/api/app/schemas.py
+++ b/api/app/schemas.py
@@ -275,6 +275,7 @@ class AssistantBase(BaseModel):
firstTurnMode: str = "bot_first"
opener: str = ""
generatedOpenerEnabled: bool = False
+ openerAudioEnabled: bool = False
prompt: str = ""
knowledgeBaseId: Optional[str] = None
language: str = "zh"
@@ -304,6 +305,7 @@ class AssistantUpdate(BaseModel):
firstTurnMode: Optional[str] = None
opener: Optional[str] = None
generatedOpenerEnabled: Optional[bool] = None
+ openerAudioEnabled: Optional[bool] = None
prompt: Optional[str] = None
knowledgeBaseId: Optional[str] = None
language: Optional[str] = None
@@ -349,6 +351,7 @@ class AssistantRuntimeMetadata(BaseModel):
knowledgeBaseId: Optional[str] = None
knowledge: Dict[str, Any] = Field(default_factory=dict)
history: Dict[str, Any] = Field(default_factory=dict)
+ openerAudio: Dict[str, Any] = Field(default_factory=dict)
assistantId: Optional[str] = None
configVersionId: Optional[str] = None
@@ -362,6 +365,22 @@ class AssistantEngineConfigResponse(BaseModel):
warnings: List[str] = Field(default_factory=list)
+class AssistantOpenerAudioGenerateRequest(BaseModel):
+ text: Optional[str] = None
+
+
+class AssistantOpenerAudioOut(BaseModel):
+ enabled: bool = False
+ ready: bool = False
+ encoding: str = "pcm_s16le"
+ sample_rate_hz: int = 16000
+ channels: int = 1
+ duration_ms: int = 0
+ updated_at: Optional[datetime] = None
+ text_hash: Optional[str] = None
+ tts_fingerprint: Optional[str] = None
+
+
class AssistantStats(BaseModel):
assistant_id: str
total_calls: int = 0
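A quick round-trip of the new schemas shows that the defaults keep a never-generated record serializable (values illustrative):

```python
from datetime import datetime
from app.schemas import AssistantOpenerAudioGenerateRequest, AssistantOpenerAudioOut

AssistantOpenerAudioGenerateRequest()  # text=None -> regenerate the stored opener
print(AssistantOpenerAudioOut().model_dump())
# {'enabled': False, 'ready': False, 'encoding': 'pcm_s16le',
#  'sample_rate_hz': 16000, 'channels': 1, 'duration_ms': 0,
#  'updated_at': None, 'text_hash': None, 'tts_fingerprint': None}

out = AssistantOpenerAudioOut(enabled=True, ready=True, duration_ms=2480,
                              updated_at=datetime.utcnow())
print(out.model_dump()["duration_ms"])  # 2480
```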
diff --git a/engine/app/config.py b/engine/app/config.py
index a441359..ba191f0 100644
--- a/engine/app/config.py
+++ b/engine/app/config.py
@@ -60,6 +60,7 @@ _AGENT_SECTION_KEY_MAP: Dict[str, Dict[str, str]] = {
"enabled": "duplex_enabled",
"greeting": "duplex_greeting",
"system_prompt": "duplex_system_prompt",
+ "opener_audio_file": "duplex_opener_audio_file",
},
"barge_in": {
"min_duration_ms": "barge_in_min_duration_ms",
@@ -96,6 +97,7 @@ _AGENT_SETTING_KEYS = {
"duplex_enabled",
"duplex_greeting",
"duplex_system_prompt",
+ "duplex_opener_audio_file",
"barge_in_min_duration_ms",
"barge_in_silence_tolerance_ms",
"tools",
@@ -452,6 +454,10 @@ class Settings(BaseSettings):
default="You are a helpful, friendly voice assistant. Keep your responses concise and conversational.",
description="System prompt for LLM"
)
+ duplex_opener_audio_file: Optional[str] = Field(
+ default=None,
+ description="Optional opener audio file path for standalone engine mode (.pcm or .wav)"
+ )
# Barge-in (interruption) Configuration
barge_in_min_duration_ms: int = Field(
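The new key rides the existing agent-config flattening: a nested YAML entry maps onto the flat `duplex_opener_audio_file` setting. A standalone sketch of that resolution (the `duplex` section name is inferred from the sibling keys and is an assumption, as is the demo file path):

```python
# Assumed YAML shape:
#   duplex:
#     opener_audio_file: data/opener_audio/demo.pcm
section_key_map = {
    "duplex": {
        "enabled": "duplex_enabled",
        "greeting": "duplex_greeting",
        "system_prompt": "duplex_system_prompt",
        "opener_audio_file": "duplex_opener_audio_file",
    },
}

agent_cfg = {"duplex": {"opener_audio_file": "data/opener_audio/demo.pcm"}}

# Flatten section.key pairs into settings-attribute names.
flat = {
    mapped: value
    for section, entries in agent_cfg.items()
    for key, value in entries.items()
    if (mapped := section_key_map.get(section, {}).get(key))
}
print(flat)  # {'duplex_opener_audio_file': 'data/opener_audio/demo.pcm'}
```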
diff --git a/engine/core/duplex_pipeline.py b/engine/core/duplex_pipeline.py
index 93ca614..e360951 100644
--- a/engine/core/duplex_pipeline.py
+++ b/engine/core/duplex_pipeline.py
@@ -12,12 +12,17 @@ event-driven design.
"""
import asyncio
+import audioop
+import io
import json
import time
import uuid
+import wave
+from pathlib import Path
from typing import Any, Awaitable, Callable, Dict, List, Optional, Tuple
import numpy as np
+import aiohttp
from loguru import logger
from app.config import settings
@@ -203,6 +208,7 @@ class DuplexPipeline:
self._runtime_first_turn_mode: str = "bot_first"
self._runtime_greeting: Optional[str] = None
self._runtime_generated_opener_enabled: Optional[bool] = None
+ self._runtime_opener_audio: Dict[str, Any] = {}
self._runtime_barge_in_enabled: Optional[bool] = None
self._runtime_barge_in_min_duration_ms: Optional[int] = None
self._runtime_knowledge: Dict[str, Any] = {}
@@ -320,6 +326,9 @@ class DuplexPipeline:
knowledge = metadata.get("knowledge")
if isinstance(knowledge, dict):
self._runtime_knowledge = knowledge
+ opener_audio = metadata.get("openerAudio")
+ if isinstance(opener_audio, dict):
+ self._runtime_opener_audio = dict(opener_audio)
kb_id = str(knowledge.get("kbId") or knowledge.get("knowledgeBaseId") or "").strip()
if kb_id:
self._runtime_knowledge_base_id = kb_id
@@ -770,10 +779,117 @@ class DuplexPipeline:
)
await self.conversation.add_assistant_turn(greeting_to_speak)
- if self._tts_output_enabled():
+ used_preloaded_audio = await self._play_preloaded_opener_audio()
+ if self._tts_output_enabled() and not used_preloaded_audio:
# Keep opener text ahead of opener voice start.
await self._speak(greeting_to_speak, audio_event_priority=30)
+ async def _play_preloaded_opener_audio(self) -> bool:
+ """
+ Play opener audio from the runtime metadata cache or a YAML-configured local file.
+
+ Returns True when preloaded audio is played successfully.
+ """
+ if not self._tts_output_enabled():
+ return False
+
+ pcm_bytes = await self._load_preloaded_opener_pcm()
+ if not pcm_bytes:
+ return False
+
+ try:
+ self._drop_outbound_audio = False
+ self._start_tts()
+ await self._send_event(
+ {
+ **ev(
+ "output.audio.start",
+ trackId=self.track_audio_out,
+ )
+ },
+ priority=30,
+ )
+
+ self._is_bot_speaking = True
+ await self._send_audio(pcm_bytes, priority=50)
+ await self._flush_audio_out_frames(priority=50)
+ await self._send_event(
+ {
+ **ev(
+ "output.audio.end",
+ trackId=self.track_audio_out,
+ )
+ },
+ priority=30,
+ )
+ return True
+ except Exception as e:
+ logger.warning(f"Failed to play preloaded opener audio, fallback to TTS: {e}")
+ return False
+ finally:
+ self._is_bot_speaking = False
+
+ async def _load_preloaded_opener_pcm(self) -> Optional[bytes]:
+ # 1) Runtime metadata from backend config
+ opener_audio = self._runtime_opener_audio if isinstance(self._runtime_opener_audio, dict) else {}
+ if bool(opener_audio.get("enabled")) and bool(opener_audio.get("ready")):
+ pcm_url = str(opener_audio.get("pcmUrl") or "").strip()
+ if pcm_url:
+ resolved_url = pcm_url
+ if pcm_url.startswith("/"):
+ backend_url = str(settings.backend_url or "").strip().rstrip("/")
+ if backend_url:
+ resolved_url = f"{backend_url}{pcm_url}"
+ try:
+ timeout = aiohttp.ClientTimeout(total=10)
+ async with aiohttp.ClientSession(timeout=timeout) as session:
+ async with session.get(resolved_url) as resp:
+ resp.raise_for_status()
+ payload = await resp.read()
+ if payload:
+ return payload
+ except Exception as e:
+ logger.warning(f"Failed to fetch opener audio from backend ({resolved_url}): {e}")
+
+ # 2) Standalone fallback via YAML
+ opener_audio_file = str(settings.duplex_opener_audio_file or "").strip()
+ if not opener_audio_file:
+ return None
+ path = Path(opener_audio_file)
+ if not path.is_absolute():
+ path = (Path.cwd() / path).resolve()
+ if not path.exists() or not path.is_file():
+ logger.warning(f"Configured opener audio file does not exist: {path}")
+ return None
+ try:
+ raw = path.read_bytes()
+ suffix = path.suffix.lower()
+ if suffix == ".wav":
+ pcm, _ = self._wav_to_pcm16_mono_16k(raw)
+ return pcm
+ # Anything else is treated as raw pcm_s16le, 16 kHz, mono (.pcm convention).
+ return raw
+ except Exception as e:
+ logger.warning(f"Failed to read opener audio file {path}: {e}")
+ return None
+
+ def _wav_to_pcm16_mono_16k(self, wav_bytes: bytes) -> Tuple[bytes, int]:
+ with wave.open(io.BytesIO(wav_bytes), "rb") as wav_file:
+ channels = wav_file.getnchannels()
+ sample_width = wav_file.getsampwidth()
+ sample_rate = wav_file.getframerate()
+ nframes = wav_file.getnframes()
+ raw = wav_file.readframes(nframes)
+
+ if sample_width != 2:
+ raise ValueError(f"Unsupported WAV sample width: {sample_width * 8}bit")
+ if channels > 1:
+ raw = audioop.tomono(raw, sample_width, 0.5, 0.5)
+ if sample_rate != 16000:
+ raw, _ = audioop.ratecv(raw, sample_width, 1, sample_rate, 16000, None)
+ duration_ms = int((len(raw) / (16000 * 2)) * 1000)
+ return raw, duration_ms
+
async def _enqueue_outbound(self, kind: str, payload: Any, priority: int) -> None:
"""Queue outbound message with priority ordering."""
self._outbound_seq += 1
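A standalone check of the same audioop conversion path, using one second of synthetic 44.1 kHz stereo silence. Note that `audioop` was removed in Python 3.13, so this, like the patch itself, assumes Python <= 3.12:

```python
import audioop
import io
import wave

# Build 1 s of 16-bit stereo silence at 44.1 kHz.
buf = io.BytesIO()
with wave.open(buf, "wb") as w:
    w.setnchannels(2)
    w.setsampwidth(2)
    w.setframerate(44100)
    w.writeframes(b"\x00\x00" * 2 * 44100)  # 2 bytes x 2 channels x 44100 frames

with wave.open(io.BytesIO(buf.getvalue()), "rb") as r:
    raw = r.readframes(r.getnframes())

raw = audioop.tomono(raw, 2, 0.5, 0.5)                   # downmix stereo -> mono
raw, _ = audioop.ratecv(raw, 2, 1, 44100, 16000, None)   # resample -> 16 kHz

duration_ms = int(len(raw) / (16000 * 2) * 1000)         # 2 bytes per 16-bit sample
print(duration_ms)  # ~1000
```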
diff --git a/engine/core/session.py b/engine/core/session.py
index 3b51c48..dba4ca3 100644
--- a/engine/core/session.py
+++ b/engine/core/session.py
@@ -59,6 +59,7 @@ class Session:
"bargeIn",
"knowledge",
"knowledgeBaseId",
+ "openerAudio",
"history",
"userId",
"assistantId",
@@ -840,6 +841,7 @@ class Session:
"bargeIn",
"knowledgeBaseId",
"knowledge",
+ "openerAudio",
"history",
"userId",
"source",
diff --git a/web/pages/Assistants.tsx b/web/pages/Assistants.tsx
index 98c818d..5529d68 100644
--- a/web/pages/Assistants.tsx
+++ b/web/pages/Assistants.tsx
@@ -3,7 +3,7 @@ import React, { useState, useEffect, useMemo, useRef } from 'react';
import { Plus, Search, Play, Copy, Trash2, Mic, MessageSquare, Save, Video, PhoneOff, Camera, ArrowLeftRight, Send, Phone, Rocket, AlertTriangle, PhoneCall, CameraOff, Image, Images, CloudSun, Calendar, TrendingUp, Coins, Wrench, Globe, Terminal, X, ClipboardCheck, Sparkles, Volume2, Timer, ChevronDown, Database, Server, Zap, ExternalLink, Key, BrainCircuit, Ear, Book, Filter } from 'lucide-react';
import { Button, Input, Badge, Drawer, Dialog } from '../components/UI';
import { ASRModel, Assistant, KnowledgeBase, LLMModel, TabValue, Tool, Voice } from '../types';
-import { createAssistant, deleteAssistant, fetchASRModels, fetchAssistants, fetchKnowledgeBases, fetchLLMModels, fetchTools, fetchVoices, updateAssistant as updateAssistantApi } from '../services/backendApi';
+import { createAssistant, deleteAssistant, fetchASRModels, fetchAssistants, fetchKnowledgeBases, fetchLLMModels, fetchTools, fetchVoices, generateAssistantOpenerAudio, updateAssistant as updateAssistantApi } from '../services/backendApi';
const isOpenAICompatibleVendor = (vendor?: string) => {
const normalized = String(vendor || '').trim().toLowerCase();
@@ -108,6 +108,7 @@ export const AssistantsPage: React.FC = () => {
const [isLoading, setIsLoading] = useState(true);
const [persistedAssistantSnapshotById, setPersistedAssistantSnapshotById] = useState
+                Status:
+                {selectedAssistant.openerAudioReady
+                  ? `Generated (${Math.round((selectedAssistant.openerAudioDurationMs || 0) / 1000)}s)`
+                  : 'Not generated'}
+
+                Generates with the current TTS configuration and saves the audio to the backend; the engine can play it back directly to reduce first-packet latency.
+