diff --git a/api/app/models.py b/api/app/models.py index 7b6c03f..3aa5bf6 100644 --- a/api/app/models.py +++ b/api/app/models.py @@ -138,6 +138,25 @@ class Assistant(Base): user = relationship("User") call_records = relationship("CallRecord", back_populates="assistant") + opener_audio = relationship("AssistantOpenerAudio", back_populates="assistant", uselist=False, cascade="all, delete-orphan") + + +class AssistantOpenerAudio(Base): + __tablename__ = "assistant_opener_audio" + + assistant_id: Mapped[str] = mapped_column(String(64), ForeignKey("assistants.id"), primary_key=True) + enabled: Mapped[bool] = mapped_column(default=False) + file_path: Mapped[Optional[str]] = mapped_column(String(512), nullable=True) + encoding: Mapped[str] = mapped_column(String(32), default="pcm_s16le") + sample_rate_hz: Mapped[int] = mapped_column(Integer, default=16000) + channels: Mapped[int] = mapped_column(Integer, default=1) + duration_ms: Mapped[int] = mapped_column(Integer, default=0) + text_hash: Mapped[Optional[str]] = mapped_column(String(128), nullable=True) + tts_fingerprint: Mapped[Optional[str]] = mapped_column(String(256), nullable=True) + created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow) + updated_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow) + + assistant = relationship("Assistant", back_populates="opener_audio") # ============ Knowledge Base ============ diff --git a/api/app/routers/assistants.py b/api/app/routers/assistants.py index 09f338f..fcb1932 100644 --- a/api/app/routers/assistants.py +++ b/api/app/routers/assistants.py @@ -1,18 +1,33 @@ +import audioop +import hashlib +import io +import os +import wave +from pathlib import Path +import httpx from fastapi import APIRouter, Depends, HTTPException +from fastapi.responses import FileResponse from sqlalchemy.orm import Session from typing import Any, Dict, List, Optional import uuid from datetime import datetime from ..db import get_db -from ..models import Assistant, LLMModel, ASRModel, Voice +from ..models import Assistant, AssistantOpenerAudio, LLMModel, ASRModel, Voice from ..schemas import ( - AssistantCreate, AssistantUpdate, AssistantOut, AssistantEngineConfigResponse + AssistantCreate, + AssistantUpdate, + AssistantOut, + AssistantEngineConfigResponse, + AssistantOpenerAudioGenerateRequest, + AssistantOpenerAudioOut, ) router = APIRouter(prefix="/assistants", tags=["Assistants"]) OPENAI_COMPATIBLE_DEFAULT_MODEL = "FunAudioLLM/CosyVoice2-0.5B" +OPENAI_COMPATIBLE_DEFAULT_BASE_URL = "https://api.siliconflow.cn/v1" +OPENER_AUDIO_DIR = Path(__file__).resolve().parents[2] / "data" / "opener_audio" OPENAI_COMPATIBLE_KNOWN_VOICES = { "alex", "anna", @@ -163,6 +178,19 @@ def _resolve_runtime_metadata(db: Session, assistant: Assistant) -> tuple[Dict[s "kbId": assistant.knowledge_base_id, "nResults": 5, } + opener_audio = assistant.opener_audio + opener_audio_ready = bool(opener_audio and opener_audio.file_path and Path(opener_audio.file_path).exists()) + metadata["openerAudio"] = { + "enabled": bool(opener_audio.enabled) if opener_audio else False, + "ready": opener_audio_ready, + "encoding": opener_audio.encoding if opener_audio else "pcm_s16le", + "sampleRateHz": int(opener_audio.sample_rate_hz) if opener_audio else 16000, + "channels": int(opener_audio.channels) if opener_audio else 1, + "durationMs": int(opener_audio.duration_ms) if opener_audio else 0, + "textHash": opener_audio.text_hash if opener_audio else None, + "ttsFingerprint": opener_audio.tts_fingerprint if opener_audio 
else None, + "pcmUrl": f"/api/assistants/{assistant.id}/opener-audio/pcm" if opener_audio_ready else None, + } return metadata, warnings @@ -189,6 +217,8 @@ def _build_engine_assistant_config(db: Session, assistant: Assistant) -> Dict[st def assistant_to_dict(assistant: Assistant) -> dict: + opener_audio = assistant.opener_audio + opener_audio_ready = bool(opener_audio and opener_audio.file_path and Path(opener_audio.file_path).exists()) return { "id": assistant.id, "name": assistant.name, @@ -196,6 +226,10 @@ def assistant_to_dict(assistant: Assistant) -> dict: "firstTurnMode": assistant.first_turn_mode or "bot_first", "opener": assistant.opener or "", "generatedOpenerEnabled": bool(assistant.generated_opener_enabled), + "openerAudioEnabled": bool(opener_audio.enabled) if opener_audio else False, + "openerAudioReady": opener_audio_ready, + "openerAudioDurationMs": int(opener_audio.duration_ms) if opener_audio else 0, + "openerAudioUpdatedAt": opener_audio.updated_at if opener_audio else None, "prompt": assistant.prompt or "", "knowledgeBaseId": assistant.knowledge_base_id, "language": assistant.language, @@ -238,6 +272,114 @@ def _apply_assistant_update(assistant: Assistant, update_data: dict) -> None: setattr(assistant, field_map.get(field, field), value) +def _ensure_assistant_opener_audio(db: Session, assistant: Assistant) -> AssistantOpenerAudio: + record = assistant.opener_audio + if record: + return record + record = AssistantOpenerAudio(assistant_id=assistant.id, enabled=False) + db.add(record) + db.flush() + return record + + +def _resolve_tts_runtime_for_assistant(db: Session, assistant: Assistant) -> tuple[Dict[str, Any], Optional[Voice]]: + metadata, _ = _resolve_runtime_metadata(db, assistant) + services = metadata.get("services") if isinstance(metadata.get("services"), dict) else {} + tts = services.get("tts") if isinstance(services, dict) and isinstance(services.get("tts"), dict) else {} + voice = db.query(Voice).filter(Voice.id == assistant.voice).first() if assistant.voice else None + return tts, voice + + +def _tts_fingerprint(tts_cfg: Dict[str, Any], opener_text: str) -> str: + identity = { + "provider": tts_cfg.get("provider"), + "model": tts_cfg.get("model"), + "voice": tts_cfg.get("voice"), + "speed": tts_cfg.get("speed"), + "text": opener_text, + } + return hashlib.sha256(str(identity).encode("utf-8")).hexdigest() + + +def _synthesize_openai_compatible_wav( + *, + text: str, + model: str, + voice_key: str, + speed: float, + api_key: str, + base_url: str, +) -> bytes: + payload = { + "model": model or OPENAI_COMPATIBLE_DEFAULT_MODEL, + "input": text, + "voice": voice_key, + "response_format": "wav", + "speed": speed, + } + with httpx.Client(timeout=45.0) as client: + response = client.post( + f"{base_url.rstrip('/')}/audio/speech", + headers={"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}, + json=payload, + ) + if response.status_code != 200: + detail = response.text + try: + detail_json = response.json() + detail = detail_json.get("error", {}).get("message") or detail_json.get("detail") or detail + except Exception: + pass + raise HTTPException(status_code=502, detail=f"TTS vendor error: {detail}") + return response.content + + +def _wav_to_pcm16_mono_16k(wav_bytes: bytes) -> tuple[bytes, int]: + with wave.open(io.BytesIO(wav_bytes), "rb") as wav_file: + channels = wav_file.getnchannels() + sample_width = wav_file.getsampwidth() + sample_rate = wav_file.getframerate() + frames = wav_file.getnframes() + raw = wav_file.readframes(frames) + + 
if sample_width != 2: + raise HTTPException(status_code=400, detail=f"Unsupported WAV sample width: {sample_width * 8}bit") + + if channels > 1: + raw = audioop.tomono(raw, sample_width, 0.5, 0.5) + + if sample_rate != 16000: + raw, _ = audioop.ratecv(raw, sample_width, 1, sample_rate, 16000, None) + + duration_ms = int((len(raw) / (16000 * 2)) * 1000) + return raw, duration_ms + + +def _persist_opener_audio_pcm(assistant_id: str, pcm_bytes: bytes) -> str: + OPENER_AUDIO_DIR.mkdir(parents=True, exist_ok=True) + file_path = OPENER_AUDIO_DIR / f"{assistant_id}.pcm" + with open(file_path, "wb") as f: + f.write(pcm_bytes) + return str(file_path) + + +def _opener_audio_out(record: Optional[AssistantOpenerAudio]) -> AssistantOpenerAudioOut: + if not record: + return AssistantOpenerAudioOut() + ready = bool(record.file_path and Path(record.file_path).exists()) + return AssistantOpenerAudioOut( + enabled=bool(record.enabled), + ready=ready, + encoding=record.encoding, + sample_rate_hz=record.sample_rate_hz, + channels=record.channels, + duration_ms=record.duration_ms, + updated_at=record.updated_at, + text_hash=record.text_hash, + tts_fingerprint=record.tts_fingerprint, + ) + + # ============ Assistants ============ @router.get("") def list_assistants( @@ -316,9 +458,132 @@ def create_assistant(data: AssistantCreate, db: Session = Depends(get_db)): db.add(assistant) db.commit() db.refresh(assistant) + opener_audio = _ensure_assistant_opener_audio(db, assistant) + opener_audio.enabled = bool(data.openerAudioEnabled) + opener_audio.updated_at = datetime.utcnow() + db.commit() + db.refresh(assistant) return assistant_to_dict(assistant) +@router.get("/{id}/opener-audio", response_model=AssistantOpenerAudioOut) +def get_assistant_opener_audio(id: str, db: Session = Depends(get_db)): + assistant = db.query(Assistant).filter(Assistant.id == id).first() + if not assistant: + raise HTTPException(status_code=404, detail="Assistant not found") + return _opener_audio_out(assistant.opener_audio) + + +@router.get("/{id}/opener-audio/pcm") +def get_assistant_opener_audio_pcm(id: str, db: Session = Depends(get_db)): + assistant = db.query(Assistant).filter(Assistant.id == id).first() + if not assistant: + raise HTTPException(status_code=404, detail="Assistant not found") + record = assistant.opener_audio + if not record or not record.file_path: + raise HTTPException(status_code=404, detail="Opener audio not generated") + file_path = Path(record.file_path) + if not file_path.exists(): + raise HTTPException(status_code=404, detail="Opener audio file missing") + return FileResponse( + str(file_path), + media_type="application/octet-stream", + filename=f"{assistant.id}.pcm", + ) + + +@router.post("/{id}/opener-audio/generate", response_model=AssistantOpenerAudioOut) +def generate_assistant_opener_audio( + id: str, + data: AssistantOpenerAudioGenerateRequest, + db: Session = Depends(get_db), +): + assistant = db.query(Assistant).filter(Assistant.id == id).first() + if not assistant: + raise HTTPException(status_code=404, detail="Assistant not found") + if not assistant.voice_output_enabled: + raise HTTPException(status_code=400, detail="Voice output is disabled") + + opener_text = (data.text if data.text is not None else assistant.opener or "").strip() + if not opener_text: + raise HTTPException(status_code=400, detail="Opener text is empty") + + tts_cfg, voice = _resolve_tts_runtime_for_assistant(db, assistant) + provider = str(tts_cfg.get("provider") or "").strip().lower() + if provider not in {"openai_compatible", 
"dashscope"}: + raise HTTPException(status_code=400, detail=f"Unsupported provider for preloaded opener audio: {provider or 'unknown'}") + + speed = float(tts_cfg.get("speed") or assistant.speed or 1.0) + voice_key = str(tts_cfg.get("voice") or "").strip() + model = str(tts_cfg.get("model") or "").strip() or OPENAI_COMPATIBLE_DEFAULT_MODEL + api_key = str(tts_cfg.get("apiKey") or "").strip() + base_url = str(tts_cfg.get("baseUrl") or "").strip() + + if provider == "openai_compatible": + if not api_key: + if voice and voice.api_key: + api_key = voice.api_key.strip() + if not api_key: + api_key = (os.getenv("SILICONFLOW_API_KEY", "") or os.getenv("TTS_API_KEY", "")).strip() + if not api_key: + raise HTTPException(status_code=400, detail="TTS API key is missing") + if not base_url: + base_url = OPENAI_COMPATIBLE_DEFAULT_BASE_URL + wav_bytes = _synthesize_openai_compatible_wav( + text=opener_text, + model=model, + voice_key=voice_key, + speed=speed, + api_key=api_key, + base_url=base_url, + ) + else: + from .voices import _synthesize_dashscope_preview, DASHSCOPE_DEFAULT_BASE_URL, DASHSCOPE_DEFAULT_MODEL, DASHSCOPE_DEFAULT_VOICE_KEY + if not api_key: + if voice and voice.api_key: + api_key = voice.api_key.strip() + if not api_key: + api_key = (os.getenv("DASHSCOPE_API_KEY", "") or os.getenv("TTS_API_KEY", "")).strip() + if not api_key: + raise HTTPException(status_code=400, detail="DashScope API key is missing") + if not base_url: + base_url = DASHSCOPE_DEFAULT_BASE_URL + if not model: + model = DASHSCOPE_DEFAULT_MODEL + if not voice_key: + voice_key = DASHSCOPE_DEFAULT_VOICE_KEY + try: + wav_bytes = _synthesize_dashscope_preview( + text=opener_text, + api_key=api_key, + base_url=base_url, + model=model, + voice_key=voice_key, + speed=speed, + ) + except Exception as exc: + raise HTTPException(status_code=502, detail=f"DashScope opener audio generation failed: {exc}") from exc + + pcm_bytes, duration_ms = _wav_to_pcm16_mono_16k(wav_bytes) + record = _ensure_assistant_opener_audio(db, assistant) + record.enabled = True + record.file_path = _persist_opener_audio_pcm(assistant.id, pcm_bytes) + record.encoding = "pcm_s16le" + record.sample_rate_hz = 16000 + record.channels = 1 + record.duration_ms = duration_ms + record.text_hash = hashlib.sha256(opener_text.encode("utf-8")).hexdigest() + record.tts_fingerprint = _tts_fingerprint(tts_cfg, opener_text) + now = datetime.utcnow() + if not record.created_at: + record.created_at = now + record.updated_at = now + assistant.updated_at = now + db.commit() + db.refresh(assistant) + return _opener_audio_out(assistant.opener_audio) + + @router.put("/{id}") def update_assistant(id: str, data: AssistantUpdate, db: Session = Depends(get_db)): """更新助手""" @@ -327,7 +592,12 @@ def update_assistant(id: str, data: AssistantUpdate, db: Session = Depends(get_d raise HTTPException(status_code=404, detail="Assistant not found") update_data = data.model_dump(exclude_unset=True) + opener_audio_enabled = update_data.pop("openerAudioEnabled", None) _apply_assistant_update(assistant, update_data) + if opener_audio_enabled is not None: + record = _ensure_assistant_opener_audio(db, assistant) + record.enabled = bool(opener_audio_enabled) + record.updated_at = datetime.utcnow() assistant.updated_at = datetime.utcnow() db.commit() diff --git a/api/app/schemas.py b/api/app/schemas.py index 8a69287..f81efc8 100644 --- a/api/app/schemas.py +++ b/api/app/schemas.py @@ -275,6 +275,7 @@ class AssistantBase(BaseModel): firstTurnMode: str = "bot_first" opener: str = "" 
generatedOpenerEnabled: bool = False + openerAudioEnabled: bool = False prompt: str = "" knowledgeBaseId: Optional[str] = None language: str = "zh" @@ -304,6 +305,7 @@ class AssistantUpdate(BaseModel): firstTurnMode: Optional[str] = None opener: Optional[str] = None generatedOpenerEnabled: Optional[bool] = None + openerAudioEnabled: Optional[bool] = None prompt: Optional[str] = None knowledgeBaseId: Optional[str] = None language: Optional[str] = None @@ -349,6 +351,7 @@ class AssistantRuntimeMetadata(BaseModel): knowledgeBaseId: Optional[str] = None knowledge: Dict[str, Any] = Field(default_factory=dict) history: Dict[str, Any] = Field(default_factory=dict) + openerAudio: Dict[str, Any] = Field(default_factory=dict) assistantId: Optional[str] = None configVersionId: Optional[str] = None @@ -362,6 +365,22 @@ class AssistantEngineConfigResponse(BaseModel): warnings: List[str] = Field(default_factory=list) +class AssistantOpenerAudioGenerateRequest(BaseModel): + text: Optional[str] = None + + +class AssistantOpenerAudioOut(BaseModel): + enabled: bool = False + ready: bool = False + encoding: str = "pcm_s16le" + sample_rate_hz: int = 16000 + channels: int = 1 + duration_ms: int = 0 + updated_at: Optional[datetime] = None + text_hash: Optional[str] = None + tts_fingerprint: Optional[str] = None + + class AssistantStats(BaseModel): assistant_id: str total_calls: int = 0 diff --git a/engine/app/config.py b/engine/app/config.py index a441359..ba191f0 100644 --- a/engine/app/config.py +++ b/engine/app/config.py @@ -60,6 +60,7 @@ _AGENT_SECTION_KEY_MAP: Dict[str, Dict[str, str]] = { "enabled": "duplex_enabled", "greeting": "duplex_greeting", "system_prompt": "duplex_system_prompt", + "opener_audio_file": "duplex_opener_audio_file", }, "barge_in": { "min_duration_ms": "barge_in_min_duration_ms", @@ -96,6 +97,7 @@ _AGENT_SETTING_KEYS = { "duplex_enabled", "duplex_greeting", "duplex_system_prompt", + "duplex_opener_audio_file", "barge_in_min_duration_ms", "barge_in_silence_tolerance_ms", "tools", @@ -452,6 +454,10 @@ class Settings(BaseSettings): default="You are a helpful, friendly voice assistant. Keep your responses concise and conversational.", description="System prompt for LLM" ) + duplex_opener_audio_file: Optional[str] = Field( + default=None, + description="Optional opener audio file path for standalone engine mode (.pcm or .wav)" + ) # Barge-in (interruption) Configuration barge_in_min_duration_ms: int = Field( diff --git a/engine/core/duplex_pipeline.py b/engine/core/duplex_pipeline.py index 93ca614..e360951 100644 --- a/engine/core/duplex_pipeline.py +++ b/engine/core/duplex_pipeline.py @@ -12,12 +12,17 @@ event-driven design. 
""" import asyncio +import audioop +import io import json import time import uuid +import wave +from pathlib import Path from typing import Any, Awaitable, Callable, Dict, List, Optional, Tuple import numpy as np +import aiohttp from loguru import logger from app.config import settings @@ -203,6 +208,7 @@ class DuplexPipeline: self._runtime_first_turn_mode: str = "bot_first" self._runtime_greeting: Optional[str] = None self._runtime_generated_opener_enabled: Optional[bool] = None + self._runtime_opener_audio: Dict[str, Any] = {} self._runtime_barge_in_enabled: Optional[bool] = None self._runtime_barge_in_min_duration_ms: Optional[int] = None self._runtime_knowledge: Dict[str, Any] = {} @@ -320,6 +326,9 @@ class DuplexPipeline: knowledge = metadata.get("knowledge") if isinstance(knowledge, dict): self._runtime_knowledge = knowledge + opener_audio = metadata.get("openerAudio") + if isinstance(opener_audio, dict): + self._runtime_opener_audio = dict(opener_audio) kb_id = str(knowledge.get("kbId") or knowledge.get("knowledgeBaseId") or "").strip() if kb_id: self._runtime_knowledge_base_id = kb_id @@ -770,10 +779,117 @@ class DuplexPipeline: ) await self.conversation.add_assistant_turn(greeting_to_speak) - if self._tts_output_enabled(): + used_preloaded_audio = await self._play_preloaded_opener_audio() + if self._tts_output_enabled() and not used_preloaded_audio: # Keep opener text ahead of opener voice start. await self._speak(greeting_to_speak, audio_event_priority=30) + async def _play_preloaded_opener_audio(self) -> bool: + """ + Play opener audio from runtime metadata cache or YAML-configured local file. + + Returns True when preloaded audio is played successfully. + """ + if not self._tts_output_enabled(): + return False + + pcm_bytes = await self._load_preloaded_opener_pcm() + if not pcm_bytes: + return False + + try: + self._drop_outbound_audio = False + self._start_tts() + await self._send_event( + { + **ev( + "output.audio.start", + trackId=self.track_audio_out, + ) + }, + priority=30, + ) + + self._is_bot_speaking = True + await self._send_audio(pcm_bytes, priority=50) + await self._flush_audio_out_frames(priority=50) + await self._send_event( + { + **ev( + "output.audio.end", + trackId=self.track_audio_out, + ) + }, + priority=30, + ) + return True + except Exception as e: + logger.warning(f"Failed to play preloaded opener audio, fallback to TTS: {e}") + return False + finally: + self._is_bot_speaking = False + + async def _load_preloaded_opener_pcm(self) -> Optional[bytes]: + # 1) Runtime metadata from backend config + opener_audio = self._runtime_opener_audio if isinstance(self._runtime_opener_audio, dict) else {} + if bool(opener_audio.get("enabled")) and bool(opener_audio.get("ready")): + pcm_url = str(opener_audio.get("pcmUrl") or "").strip() + if pcm_url: + resolved_url = pcm_url + if pcm_url.startswith("/"): + backend_url = str(settings.backend_url or "").strip().rstrip("/") + if backend_url: + resolved_url = f"{backend_url}{pcm_url}" + try: + timeout = aiohttp.ClientTimeout(total=10) + async with aiohttp.ClientSession(timeout=timeout) as session: + async with session.get(resolved_url) as resp: + resp.raise_for_status() + payload = await resp.read() + if payload: + return payload + except Exception as e: + logger.warning(f"Failed to fetch opener audio from backend ({resolved_url}): {e}") + + # 2) Standalone fallback via YAML + opener_audio_file = str(settings.duplex_opener_audio_file or "").strip() + if not opener_audio_file: + return None + path = Path(opener_audio_file) + 
if not path.is_absolute(): + path = (Path.cwd() / path).resolve() + if not path.exists() or not path.is_file(): + logger.warning(f"Configured opener audio file does not exist: {path}") + return None + try: + raw = path.read_bytes() + suffix = path.suffix.lower() + if suffix == ".wav": + pcm, _ = self._wav_to_pcm16_mono_16k(raw) + return pcm + # .pcm raw pcm_s16le 16k mono + return raw + except Exception as e: + logger.warning(f"Failed to read opener audio file {path}: {e}") + return None + + def _wav_to_pcm16_mono_16k(self, wav_bytes: bytes) -> Tuple[bytes, int]: + with wave.open(io.BytesIO(wav_bytes), "rb") as wav_file: + channels = wav_file.getnchannels() + sample_width = wav_file.getsampwidth() + sample_rate = wav_file.getframerate() + nframes = wav_file.getnframes() + raw = wav_file.readframes(nframes) + + if sample_width != 2: + raise ValueError(f"Unsupported WAV sample width: {sample_width * 8}bit") + if channels > 1: + raw = audioop.tomono(raw, sample_width, 0.5, 0.5) + if sample_rate != 16000: + raw, _ = audioop.ratecv(raw, sample_width, 1, sample_rate, 16000, None) + duration_ms = int((len(raw) / (16000 * 2)) * 1000) + return raw, duration_ms + async def _enqueue_outbound(self, kind: str, payload: Any, priority: int) -> None: """Queue outbound message with priority ordering.""" self._outbound_seq += 1 diff --git a/engine/core/session.py b/engine/core/session.py index 3b51c48..dba4ca3 100644 --- a/engine/core/session.py +++ b/engine/core/session.py @@ -59,6 +59,7 @@ class Session: "bargeIn", "knowledge", "knowledgeBaseId", + "openerAudio", "history", "userId", "assistantId", @@ -840,6 +841,7 @@ class Session: "bargeIn", "knowledgeBaseId", "knowledge", + "openerAudio", "history", "userId", "source", diff --git a/web/pages/Assistants.tsx b/web/pages/Assistants.tsx index 98c818d..5529d68 100644 --- a/web/pages/Assistants.tsx +++ b/web/pages/Assistants.tsx @@ -3,7 +3,7 @@ import React, { useState, useEffect, useMemo, useRef } from 'react'; import { Plus, Search, Play, Copy, Trash2, Mic, MessageSquare, Save, Video, PhoneOff, Camera, ArrowLeftRight, Send, Phone, Rocket, AlertTriangle, PhoneCall, CameraOff, Image, Images, CloudSun, Calendar, TrendingUp, Coins, Wrench, Globe, Terminal, X, ClipboardCheck, Sparkles, Volume2, Timer, ChevronDown, Database, Server, Zap, ExternalLink, Key, BrainCircuit, Ear, Book, Filter } from 'lucide-react'; import { Button, Input, Badge, Drawer, Dialog } from '../components/UI'; import { ASRModel, Assistant, KnowledgeBase, LLMModel, TabValue, Tool, Voice } from '../types'; -import { createAssistant, deleteAssistant, fetchASRModels, fetchAssistants, fetchKnowledgeBases, fetchLLMModels, fetchTools, fetchVoices, updateAssistant as updateAssistantApi } from '../services/backendApi'; +import { createAssistant, deleteAssistant, fetchASRModels, fetchAssistants, fetchKnowledgeBases, fetchLLMModels, fetchTools, fetchVoices, generateAssistantOpenerAudio, updateAssistant as updateAssistantApi } from '../services/backendApi'; const isOpenAICompatibleVendor = (vendor?: string) => { const normalized = String(vendor || '').trim().toLowerCase(); @@ -108,6 +108,7 @@ export const AssistantsPage: React.FC = () => { const [isLoading, setIsLoading] = useState(true); const [persistedAssistantSnapshotById, setPersistedAssistantSnapshotById] = useState>({}); const [unsavedDebugConfirmOpen, setUnsavedDebugConfirmOpen] = useState(false); + const [openerAudioGenerating, setOpenerAudioGenerating] = useState(false); const selectedAssistant = assistants.find(a => a.id === selectedId) || 
null; const serializeAssistant = (assistant: Assistant) => JSON.stringify(assistant); @@ -164,6 +165,7 @@ export const AssistantsPage: React.FC = () => { firstTurnMode: 'bot_first', opener: '', generatedOpenerEnabled: false, + openerAudioEnabled: false, prompt: '', knowledgeBaseId: '', language: 'zh', @@ -269,6 +271,31 @@ export const AssistantsPage: React.FC = () => { setDebugOpen(true); }; + const handleGenerateOpenerAudio = async () => { + if (!selectedAssistant) return; + setOpenerAudioGenerating(true); + try { + const status = await generateAssistantOpenerAudio(selectedAssistant.id, { + text: selectedAssistant.opener || '', + }); + setAssistants((prev) => prev.map((item) => { + if (item.id !== selectedAssistant.id) return item; + return { + ...item, + openerAudioEnabled: status.enabled, + openerAudioReady: status.ready, + openerAudioDurationMs: status.duration_ms, + openerAudioUpdatedAt: status.updated_at || '', + }; + })); + } catch (error) { + console.error(error); + alert((error as Error)?.message || '生成开场白预加载音频失败'); + } finally { + setOpenerAudioGenerating(false); + } + }; + const handleConfirmOpenDebug = () => { setUnsavedDebugConfirmOpen(false); setDebugOpen(true); @@ -676,6 +703,58 @@ export const AssistantsPage: React.FC = () => { ? '通话接通后将根据提示词自动生成开场白。' : '接通通话后的第一句话。'}

              </div>
+              <div className="mt-4 space-y-2">
+                <div className="flex items-center justify-between">
+                  <label className="flex items-center gap-2 text-sm">
+                    <input
+                      type="checkbox"
+                      checked={Boolean(selectedAssistant.openerAudioEnabled)}
+                      onChange={(e) => setAssistants((prev) => prev.map((item) => (
+                        item.id === selectedAssistant.id
+                          ? { ...item, openerAudioEnabled: e.target.checked }
+                          : item
+                      )))}
+                    />
+                    预加载开场白音频
+                  </label>
+                  <Button
+                    disabled={openerAudioGenerating || !selectedAssistant.opener}
+                    onClick={handleGenerateOpenerAudio}
+                  >
+                    {openerAudioGenerating ? '生成中…' : '生成音频'}
+                  </Button>
+                </div>
+                <div className="text-xs text-gray-500">
+                  状态:
+                  {selectedAssistant.openerAudioReady
+                    ? `已生成 (${Math.round((selectedAssistant.openerAudioDurationMs || 0) / 1000)}s)`
+                    : '未生成'}
+                </div>
+                <div className="text-xs text-gray-400">
+                  使用当前 TTS 配置生成并保存到后端;引擎可直接播放以降低首包延迟。
+                </div>
+              </div>
            )}

diff --git a/web/services/backendApi.ts b/web/services/backendApi.ts
index f57a7a1..8404e9b 100644
--- a/web/services/backendApi.ts
+++ b/web/services/backendApi.ts
@@ -36,6 +36,10 @@ const mapAssistant = (raw: AnyRecord): Assistant => ({
   firstTurnMode: readField(raw, ['firstTurnMode', 'first_turn_mode'], 'bot_first') as 'bot_first' | 'user_first',
   opener: readField(raw, ['opener'], ''),
   generatedOpenerEnabled: Boolean(readField(raw, ['generatedOpenerEnabled', 'generated_opener_enabled'], false)),
+  openerAudioEnabled: Boolean(readField(raw, ['openerAudioEnabled', 'opener_audio_enabled'], false)),
+  openerAudioReady: Boolean(readField(raw, ['openerAudioReady', 'opener_audio_ready'], false)),
+  openerAudioDurationMs: Number(readField(raw, ['openerAudioDurationMs', 'opener_audio_duration_ms'], 0)),
+  openerAudioUpdatedAt: readField(raw, ['openerAudioUpdatedAt', 'opener_audio_updated_at'], ''),
   prompt: readField(raw, ['prompt'], ''),
   knowledgeBaseId: readField(raw, ['knowledgeBaseId', 'knowledge_base_id'], ''),
   language: readField(raw, ['language'], 'zh') as 'zh' | 'en',
@@ -228,6 +232,7 @@ export const createAssistant = async (data: Partial<Assistant>): Promise<Assistant> => {
     firstTurnMode: data.firstTurnMode,
     opener: data.opener,
     generatedOpenerEnabled: data.generatedOpenerEnabled,
+    openerAudioEnabled: data.openerAudioEnabled,
     prompt: data.prompt,
     knowledgeBaseId: data.knowledgeBaseId,
     language: data.language,
@@ -295,10 +301,36 @@ export interface AssistantRuntimeConfigResponse {
   warnings?: string[];
 }
 
+export interface AssistantOpenerAudioStatus {
+  enabled: boolean;
+  ready: boolean;
+  encoding: string;
+  sample_rate_hz: number;
+  channels: number;
+  duration_ms: number;
+  updated_at?: string | null;
+  text_hash?: string | null;
+  tts_fingerprint?: string | null;
+}
+
 export const fetchAssistantRuntimeConfig = async (assistantId: string): Promise<AssistantRuntimeConfigResponse> => {
   return apiRequest<AssistantRuntimeConfigResponse>(`/assistants/${assistantId}/config`);
 };
 
+export const fetchAssistantOpenerAudioStatus = async (assistantId: string): Promise<AssistantOpenerAudioStatus> => {
+  return apiRequest<AssistantOpenerAudioStatus>(`/assistants/${assistantId}/opener-audio`);
+};
+
+export const generateAssistantOpenerAudio = async (
+  assistantId: string,
+  payload?: { text?: string }
+): Promise<AssistantOpenerAudioStatus> => {
+  return apiRequest<AssistantOpenerAudioStatus>(`/assistants/${assistantId}/opener-audio/generate`, {
+    method: 'POST',
+    body: payload || {},
+  });
+};
+
 export const fetchVoices = async (): Promise<Voice[]> => {
   const response = await apiRequest<{ list?: AnyRecord[] } | AnyRecord[]>(withLimit('/voices'));
   const list = Array.isArray(response) ? response : (response.list || []);
diff --git a/web/types.ts b/web/types.ts
index 069a829..c41cdd9 100644
--- a/web/types.ts
+++ b/web/types.ts
@@ -6,6 +6,10 @@ export interface Assistant {
   firstTurnMode?: 'bot_first' | 'user_first';
   opener: string;
   generatedOpenerEnabled?: boolean;
+  openerAudioEnabled?: boolean;
+  openerAudioReady?: boolean;
+  openerAudioDurationMs?: number;
+  openerAudioUpdatedAt?: string;
   prompt: string;
   knowledgeBaseId: string;
   language: 'zh' | 'en';
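
Note (not part of the patch): `duplex_opener_audio_file` accepts either a `.wav` or a raw `.pcm`, and a `.pcm` is read verbatim as pcm_s16le / 16 kHz / mono, so a mis-encoded file will play as noise rather than fail loudly. A minimal offline converter mirroring the patch's `_wav_to_pcm16_mono_16k` helper; file names are placeholders, and stdlib `audioop` is deprecated since Python 3.11 and removed in 3.13, so this sketch assumes an older interpreter:

    import audioop  # deprecated in 3.11, removed in 3.13 (PEP 594)
    import io
    import sys
    import wave
    from pathlib import Path

    def wav_to_pcm16_mono_16k(wav_bytes: bytes) -> tuple[bytes, int]:
        """Return (raw pcm_s16le @ 16 kHz mono, duration in ms)."""
        with wave.open(io.BytesIO(wav_bytes), "rb") as wav_file:
            channels = wav_file.getnchannels()
            sample_width = wav_file.getsampwidth()
            sample_rate = wav_file.getframerate()
            raw = wav_file.readframes(wav_file.getnframes())
        if sample_width != 2:
            raise ValueError(f"Unsupported WAV sample width: {sample_width * 8}-bit")
        if channels == 2:
            # audioop.tomono only understands stereo input.
            raw = audioop.tomono(raw, sample_width, 0.5, 0.5)
        elif channels != 1:
            raise ValueError(f"Unsupported channel count: {channels}")
        if sample_rate != 16000:
            raw, _ = audioop.ratecv(raw, sample_width, 1, sample_rate, 16000, None)
        return raw, int(len(raw) / (16000 * 2) * 1000)

    if __name__ == "__main__":
        src = Path(sys.argv[1])  # e.g. opener.wav (placeholder)
        pcm, ms = wav_to_pcm16_mono_16k(src.read_bytes())
        src.with_suffix(".pcm").write_bytes(pcm)
        print(f"wrote {src.with_suffix('.pcm')} ({ms} ms)")

Point `duplex_opener_audio_file` at the resulting `.pcm` for standalone engine mode.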
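
Note (not part of the patch): `_tts_fingerprint` hashes `str(identity)`, which couples the digest to Python's dict ordering and value reprs; it works, but a sorted JSON serialization is the more conventional, language-neutral choice. The staleness helper below is hypothetical — the patch always overwrites the fingerprint on generate — but something like it would let call setup skip playing audio that no longer matches the opener text or TTS config:

    import hashlib
    import json
    from typing import Any, Dict

    def tts_fingerprint(tts_cfg: Dict[str, Any], opener_text: str) -> str:
        """Digest independent of dict insertion order and repr formatting."""
        identity = {
            "provider": tts_cfg.get("provider"),
            "model": tts_cfg.get("model"),
            "voice": tts_cfg.get("voice"),
            "speed": tts_cfg.get("speed"),
            "text": opener_text,
        }
        blob = json.dumps(identity, sort_keys=True, ensure_ascii=False)
        return hashlib.sha256(blob.encode("utf-8")).hexdigest()

    def opener_audio_is_stale(record: Any, tts_cfg: Dict[str, Any], opener_text: str) -> bool:
        """True when a stored AssistantOpenerAudio row no longer matches text/config."""
        if record is None or not record.file_path:
            return True
        return record.tts_fingerprint != tts_fingerprint(tts_cfg, opener_text)

With a check like this, `_resolve_runtime_metadata` could report stale audio as not ready instead of letting the engine play an outdated opener.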
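
Note (not part of the patch): a smoke test for the new routes, assuming the backend is mounted under `/api` on localhost:8000 (the same prefix `_resolve_runtime_metadata` bakes into `pcmUrl`); the assistant id is a placeholder:

    import httpx

    BASE_URL = "http://localhost:8000/api"  # placeholder deployment URL
    ASSISTANT_ID = "your-assistant-id"      # placeholder

    with httpx.Client(timeout=60.0) as client:
        # Generate (or regenerate) the cached opener PCM; omitting "text"
        # falls back to the assistant's stored opener field.
        resp = client.post(
            f"{BASE_URL}/assistants/{ASSISTANT_ID}/opener-audio/generate",
            json={},
        )
        resp.raise_for_status()
        status = resp.json()
        print(status["ready"], status["duration_ms"], status["sample_rate_hz"])

        # Fetch the raw pcm_s16le bytes the engine would play.
        pcm = client.get(f"{BASE_URL}/assistants/{ASSISTANT_ID}/opener-audio/pcm")
        pcm.raise_for_status()
        # 16 kHz * 2 bytes per sample = 32 bytes per millisecond; duration_ms
        # is truncated to an int, so allow up to one millisecond of slack.
        assert abs(len(pcm.content) - status["duration_ms"] * 32) < 32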