Add tts/text output schema
This commit is contained in:
@@ -1,4 +1,4 @@
|
|||||||
from sqlalchemy import create_engine
|
from sqlalchemy import create_engine, text
|
||||||
from sqlalchemy.orm import sessionmaker, DeclarativeBase
|
from sqlalchemy.orm import sessionmaker, DeclarativeBase
|
||||||
import os
|
import os
|
||||||
|
|
||||||
@@ -14,6 +14,32 @@ class Base(DeclarativeBase):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def ensure_schema_compatibility() -> None:
|
||||||
|
"""Best-effort lightweight migrations for SQLite deployments."""
|
||||||
|
if engine.dialect.name != "sqlite":
|
||||||
|
return
|
||||||
|
|
||||||
|
with engine.begin() as conn:
|
||||||
|
columns = {
|
||||||
|
row[1]
|
||||||
|
for row in conn.execute(text("PRAGMA table_info(assistants)"))
|
||||||
|
}
|
||||||
|
if "voice_output_enabled" not in columns:
|
||||||
|
conn.execute(
|
||||||
|
text(
|
||||||
|
"ALTER TABLE assistants "
|
||||||
|
"ADD COLUMN voice_output_enabled BOOLEAN DEFAULT 1"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
conn.execute(
|
||||||
|
text(
|
||||||
|
"UPDATE assistants "
|
||||||
|
"SET voice_output_enabled = 1 "
|
||||||
|
"WHERE voice_output_enabled IS NULL"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def get_db():
|
def get_db():
|
||||||
db = SessionLocal()
|
db = SessionLocal()
|
||||||
try:
|
try:
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ from fastapi.middleware.cors import CORSMiddleware
|
|||||||
from contextlib import asynccontextmanager
|
from contextlib import asynccontextmanager
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from .db import Base, engine
|
from .db import Base, engine, ensure_schema_compatibility
|
||||||
from .routers import assistants, voices, workflows, history, knowledge, llm, asr, tools
|
from .routers import assistants, voices, workflows, history, knowledge, llm, asr, tools
|
||||||
|
|
||||||
|
|
||||||
@@ -11,6 +11,7 @@ from .routers import assistants, voices, workflows, history, knowledge, llm, asr
|
|||||||
async def lifespan(app: FastAPI):
|
async def lifespan(app: FastAPI):
|
||||||
# 启动时创建表
|
# 启动时创建表
|
||||||
Base.metadata.create_all(bind=engine)
|
Base.metadata.create_all(bind=engine)
|
||||||
|
ensure_schema_compatibility()
|
||||||
yield
|
yield
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -112,6 +112,7 @@ class Assistant(Base):
|
|||||||
prompt: Mapped[str] = mapped_column(Text, default="")
|
prompt: Mapped[str] = mapped_column(Text, default="")
|
||||||
knowledge_base_id: Mapped[Optional[str]] = mapped_column(String(64), nullable=True)
|
knowledge_base_id: Mapped[Optional[str]] = mapped_column(String(64), nullable=True)
|
||||||
language: Mapped[str] = mapped_column(String(16), default="zh")
|
language: Mapped[str] = mapped_column(String(16), default="zh")
|
||||||
|
voice_output_enabled: Mapped[bool] = mapped_column(default=True)
|
||||||
voice: Mapped[Optional[str]] = mapped_column(String(64), nullable=True)
|
voice: Mapped[Optional[str]] = mapped_column(String(64), nullable=True)
|
||||||
speed: Mapped[float] = mapped_column(Float, default=1.0)
|
speed: Mapped[float] = mapped_column(Float, default=1.0)
|
||||||
hotwords: Mapped[dict] = mapped_column(JSON, default=list)
|
hotwords: Mapped[dict] = mapped_column(JSON, default=list)
|
||||||
|
|||||||
@@ -21,6 +21,7 @@ def _resolve_runtime_metadata(db: Session, assistant: Assistant) -> dict:
|
|||||||
metadata = {
|
metadata = {
|
||||||
"systemPrompt": assistant.prompt or "",
|
"systemPrompt": assistant.prompt or "",
|
||||||
"greeting": assistant.opener or "",
|
"greeting": assistant.opener or "",
|
||||||
|
"output": {"mode": "audio" if assistant.voice_output_enabled else "text"},
|
||||||
"services": {},
|
"services": {},
|
||||||
}
|
}
|
||||||
warnings = []
|
warnings = []
|
||||||
@@ -49,11 +50,14 @@ def _resolve_runtime_metadata(db: Session, assistant: Assistant) -> dict:
|
|||||||
else:
|
else:
|
||||||
warnings.append(f"ASR model not found: {assistant.asr_model_id}")
|
warnings.append(f"ASR model not found: {assistant.asr_model_id}")
|
||||||
|
|
||||||
if assistant.voice:
|
if not assistant.voice_output_enabled:
|
||||||
|
metadata["services"]["tts"] = {"enabled": False}
|
||||||
|
elif assistant.voice:
|
||||||
voice = db.query(Voice).filter(Voice.id == assistant.voice).first()
|
voice = db.query(Voice).filter(Voice.id == assistant.voice).first()
|
||||||
if voice:
|
if voice:
|
||||||
tts_provider = "siliconflow" if _is_siliconflow_vendor(voice.vendor) else "edge"
|
tts_provider = "siliconflow" if _is_siliconflow_vendor(voice.vendor) else "edge"
|
||||||
metadata["services"]["tts"] = {
|
metadata["services"]["tts"] = {
|
||||||
|
"enabled": True,
|
||||||
"provider": tts_provider,
|
"provider": tts_provider,
|
||||||
"model": voice.model,
|
"model": voice.model,
|
||||||
"apiKey": voice.api_key if tts_provider == "siliconflow" else None,
|
"apiKey": voice.api_key if tts_provider == "siliconflow" else None,
|
||||||
@@ -63,6 +67,7 @@ def _resolve_runtime_metadata(db: Session, assistant: Assistant) -> dict:
|
|||||||
else:
|
else:
|
||||||
# Keep assistant.voice as direct voice identifier fallback
|
# Keep assistant.voice as direct voice identifier fallback
|
||||||
metadata["services"]["tts"] = {
|
metadata["services"]["tts"] = {
|
||||||
|
"enabled": True,
|
||||||
"voice": assistant.voice,
|
"voice": assistant.voice,
|
||||||
"speed": assistant.speed or 1.0,
|
"speed": assistant.speed or 1.0,
|
||||||
}
|
}
|
||||||
@@ -98,6 +103,7 @@ def assistant_to_dict(assistant: Assistant) -> dict:
|
|||||||
"prompt": assistant.prompt or "",
|
"prompt": assistant.prompt or "",
|
||||||
"knowledgeBaseId": assistant.knowledge_base_id,
|
"knowledgeBaseId": assistant.knowledge_base_id,
|
||||||
"language": assistant.language,
|
"language": assistant.language,
|
||||||
|
"voiceOutputEnabled": assistant.voice_output_enabled,
|
||||||
"voice": assistant.voice,
|
"voice": assistant.voice,
|
||||||
"speed": assistant.speed,
|
"speed": assistant.speed,
|
||||||
"hotwords": assistant.hotwords or [],
|
"hotwords": assistant.hotwords or [],
|
||||||
@@ -120,6 +126,7 @@ def _apply_assistant_update(assistant: Assistant, update_data: dict) -> None:
|
|||||||
"knowledgeBaseId": "knowledge_base_id",
|
"knowledgeBaseId": "knowledge_base_id",
|
||||||
"interruptionSensitivity": "interruption_sensitivity",
|
"interruptionSensitivity": "interruption_sensitivity",
|
||||||
"configMode": "config_mode",
|
"configMode": "config_mode",
|
||||||
|
"voiceOutputEnabled": "voice_output_enabled",
|
||||||
"apiUrl": "api_url",
|
"apiUrl": "api_url",
|
||||||
"apiKey": "api_key",
|
"apiKey": "api_key",
|
||||||
"llmModelId": "llm_model_id",
|
"llmModelId": "llm_model_id",
|
||||||
@@ -180,6 +187,7 @@ def create_assistant(data: AssistantCreate, db: Session = Depends(get_db)):
|
|||||||
prompt=data.prompt,
|
prompt=data.prompt,
|
||||||
knowledge_base_id=data.knowledgeBaseId,
|
knowledge_base_id=data.knowledgeBaseId,
|
||||||
language=data.language,
|
language=data.language,
|
||||||
|
voice_output_enabled=data.voiceOutputEnabled,
|
||||||
voice=data.voice,
|
voice=data.voice,
|
||||||
speed=data.speed,
|
speed=data.speed,
|
||||||
hotwords=data.hotwords,
|
hotwords=data.hotwords,
|
||||||
|
|||||||
@@ -268,6 +268,7 @@ class AssistantBase(BaseModel):
|
|||||||
prompt: str = ""
|
prompt: str = ""
|
||||||
knowledgeBaseId: Optional[str] = None
|
knowledgeBaseId: Optional[str] = None
|
||||||
language: str = "zh"
|
language: str = "zh"
|
||||||
|
voiceOutputEnabled: bool = True
|
||||||
voice: Optional[str] = None
|
voice: Optional[str] = None
|
||||||
speed: float = 1.0
|
speed: float = 1.0
|
||||||
hotwords: List[str] = []
|
hotwords: List[str] = []
|
||||||
@@ -293,6 +294,7 @@ class AssistantUpdate(BaseModel):
|
|||||||
prompt: Optional[str] = None
|
prompt: Optional[str] = None
|
||||||
knowledgeBaseId: Optional[str] = None
|
knowledgeBaseId: Optional[str] = None
|
||||||
language: Optional[str] = None
|
language: Optional[str] = None
|
||||||
|
voiceOutputEnabled: Optional[bool] = None
|
||||||
voice: Optional[str] = None
|
voice: Optional[str] = None
|
||||||
speed: Optional[float] = None
|
speed: Optional[float] = None
|
||||||
hotwords: Optional[List[str]] = None
|
hotwords: Optional[List[str]] = None
|
||||||
|
|||||||
@@ -85,6 +85,7 @@ def sample_assistant_data():
|
|||||||
"opener": "Hello, welcome!",
|
"opener": "Hello, welcome!",
|
||||||
"prompt": "You are a helpful assistant.",
|
"prompt": "You are a helpful assistant.",
|
||||||
"language": "zh",
|
"language": "zh",
|
||||||
|
"voiceOutputEnabled": True,
|
||||||
"speed": 1.0,
|
"speed": 1.0,
|
||||||
"hotwords": ["test", "hello"],
|
"hotwords": ["test", "hello"],
|
||||||
"tools": [],
|
"tools": [],
|
||||||
|
|||||||
@@ -23,6 +23,7 @@ class TestAssistantAPI:
|
|||||||
assert data["opener"] == sample_assistant_data["opener"]
|
assert data["opener"] == sample_assistant_data["opener"]
|
||||||
assert data["prompt"] == sample_assistant_data["prompt"]
|
assert data["prompt"] == sample_assistant_data["prompt"]
|
||||||
assert data["language"] == sample_assistant_data["language"]
|
assert data["language"] == sample_assistant_data["language"]
|
||||||
|
assert data["voiceOutputEnabled"] is True
|
||||||
assert "id" in data
|
assert "id" in data
|
||||||
assert data["callCount"] == 0
|
assert data["callCount"] == 0
|
||||||
|
|
||||||
@@ -61,7 +62,8 @@ class TestAssistantAPI:
|
|||||||
update_data = {
|
update_data = {
|
||||||
"name": "Updated Assistant",
|
"name": "Updated Assistant",
|
||||||
"prompt": "You are an updated assistant.",
|
"prompt": "You are an updated assistant.",
|
||||||
"speed": 1.5
|
"speed": 1.5,
|
||||||
|
"voiceOutputEnabled": False,
|
||||||
}
|
}
|
||||||
response = client.put(f"/api/assistants/{assistant_id}", json=update_data)
|
response = client.put(f"/api/assistants/{assistant_id}", json=update_data)
|
||||||
assert response.status_code == 200
|
assert response.status_code == 200
|
||||||
@@ -69,6 +71,7 @@ class TestAssistantAPI:
|
|||||||
assert data["name"] == "Updated Assistant"
|
assert data["name"] == "Updated Assistant"
|
||||||
assert data["prompt"] == "You are an updated assistant."
|
assert data["prompt"] == "You are an updated assistant."
|
||||||
assert data["speed"] == 1.5
|
assert data["speed"] == 1.5
|
||||||
|
assert data["voiceOutputEnabled"] is False
|
||||||
|
|
||||||
def test_delete_assistant(self, client, sample_assistant_data):
|
def test_delete_assistant(self, client, sample_assistant_data):
|
||||||
"""Test deleting an assistant"""
|
"""Test deleting an assistant"""
|
||||||
@@ -210,3 +213,15 @@ class TestAssistantAPI:
|
|||||||
assert metadata["services"]["llm"]["model"] == sample_llm_model_data["model_name"]
|
assert metadata["services"]["llm"]["model"] == sample_llm_model_data["model_name"]
|
||||||
assert metadata["services"]["asr"]["model"] == sample_asr_model_data["model_name"]
|
assert metadata["services"]["asr"]["model"] == sample_asr_model_data["model_name"]
|
||||||
assert metadata["services"]["tts"]["voice"] == sample_voice_data["voice_key"]
|
assert metadata["services"]["tts"]["voice"] == sample_voice_data["voice_key"]
|
||||||
|
|
||||||
|
def test_runtime_config_text_mode_when_voice_output_disabled(self, client, sample_assistant_data):
|
||||||
|
sample_assistant_data["voiceOutputEnabled"] = False
|
||||||
|
assistant_resp = client.post("/api/assistants", json=sample_assistant_data)
|
||||||
|
assert assistant_resp.status_code == 200
|
||||||
|
assistant_id = assistant_resp.json()["id"]
|
||||||
|
|
||||||
|
runtime_resp = client.get(f"/api/assistants/{assistant_id}/runtime-config")
|
||||||
|
assert runtime_resp.status_code == 200
|
||||||
|
metadata = runtime_resp.json()["sessionStartMetadata"]
|
||||||
|
assert metadata["output"]["mode"] == "text"
|
||||||
|
assert metadata["services"]["tts"]["enabled"] is False
|
||||||
|
|||||||
@@ -211,6 +211,7 @@ class DuplexPipeline:
|
|||||||
self._runtime_llm: Dict[str, Any] = {}
|
self._runtime_llm: Dict[str, Any] = {}
|
||||||
self._runtime_asr: Dict[str, Any] = {}
|
self._runtime_asr: Dict[str, Any] = {}
|
||||||
self._runtime_tts: Dict[str, Any] = {}
|
self._runtime_tts: Dict[str, Any] = {}
|
||||||
|
self._runtime_output: Dict[str, Any] = {}
|
||||||
self._runtime_system_prompt: Optional[str] = None
|
self._runtime_system_prompt: Optional[str] = None
|
||||||
self._runtime_greeting: Optional[str] = None
|
self._runtime_greeting: Optional[str] = None
|
||||||
self._runtime_knowledge: Dict[str, Any] = {}
|
self._runtime_knowledge: Dict[str, Any] = {}
|
||||||
@@ -257,6 +258,9 @@ class DuplexPipeline:
|
|||||||
self._runtime_asr = services["asr"]
|
self._runtime_asr = services["asr"]
|
||||||
if isinstance(services.get("tts"), dict):
|
if isinstance(services.get("tts"), dict):
|
||||||
self._runtime_tts = services["tts"]
|
self._runtime_tts = services["tts"]
|
||||||
|
output = metadata.get("output") or {}
|
||||||
|
if isinstance(output, dict):
|
||||||
|
self._runtime_output = output
|
||||||
|
|
||||||
knowledge_base_id = metadata.get("knowledgeBaseId")
|
knowledge_base_id = metadata.get("knowledgeBaseId")
|
||||||
if knowledge_base_id is not None:
|
if knowledge_base_id is not None:
|
||||||
@@ -283,6 +287,31 @@ class DuplexPipeline:
|
|||||||
if self.llm_service and hasattr(self.llm_service, "set_tool_schemas"):
|
if self.llm_service and hasattr(self.llm_service, "set_tool_schemas"):
|
||||||
self.llm_service.set_tool_schemas(self._resolved_tool_schemas())
|
self.llm_service.set_tool_schemas(self._resolved_tool_schemas())
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _coerce_bool(value: Any) -> Optional[bool]:
|
||||||
|
if isinstance(value, bool):
|
||||||
|
return value
|
||||||
|
if isinstance(value, (int, float)):
|
||||||
|
return bool(value)
|
||||||
|
if isinstance(value, str):
|
||||||
|
normalized = value.strip().lower()
|
||||||
|
if normalized in {"1", "true", "yes", "on", "enabled"}:
|
||||||
|
return True
|
||||||
|
if normalized in {"0", "false", "no", "off", "disabled"}:
|
||||||
|
return False
|
||||||
|
return None
|
||||||
|
|
||||||
|
def _tts_output_enabled(self) -> bool:
|
||||||
|
enabled = self._coerce_bool(self._runtime_tts.get("enabled"))
|
||||||
|
if enabled is not None:
|
||||||
|
return enabled
|
||||||
|
|
||||||
|
output_mode = str(self._runtime_output.get("mode") or "").strip().lower()
|
||||||
|
if output_mode in {"text", "text_only", "text-only"}:
|
||||||
|
return False
|
||||||
|
|
||||||
|
return True
|
||||||
|
|
||||||
async def start(self) -> None:
|
async def start(self) -> None:
|
||||||
"""Start the pipeline and connect services."""
|
"""Start the pipeline and connect services."""
|
||||||
try:
|
try:
|
||||||
@@ -311,38 +340,44 @@ class DuplexPipeline:
|
|||||||
|
|
||||||
await self.llm_service.connect()
|
await self.llm_service.connect()
|
||||||
|
|
||||||
# Connect TTS service
|
tts_output_enabled = self._tts_output_enabled()
|
||||||
if not self.tts_service:
|
|
||||||
tts_provider = (self._runtime_tts.get("provider") or settings.tts_provider).lower()
|
|
||||||
tts_api_key = self._runtime_tts.get("apiKey") or settings.siliconflow_api_key
|
|
||||||
tts_voice = self._runtime_tts.get("voice") or settings.tts_voice
|
|
||||||
tts_model = self._runtime_tts.get("model") or settings.siliconflow_tts_model
|
|
||||||
tts_speed = float(self._runtime_tts.get("speed") or settings.tts_speed)
|
|
||||||
|
|
||||||
if tts_provider == "siliconflow" and tts_api_key:
|
# Connect TTS service only when audio output is enabled.
|
||||||
self.tts_service = SiliconFlowTTSService(
|
if tts_output_enabled:
|
||||||
api_key=tts_api_key,
|
if not self.tts_service:
|
||||||
voice=tts_voice,
|
tts_provider = (self._runtime_tts.get("provider") or settings.tts_provider).lower()
|
||||||
model=tts_model,
|
tts_api_key = self._runtime_tts.get("apiKey") or settings.siliconflow_api_key
|
||||||
sample_rate=settings.sample_rate,
|
tts_voice = self._runtime_tts.get("voice") or settings.tts_voice
|
||||||
speed=tts_speed
|
tts_model = self._runtime_tts.get("model") or settings.siliconflow_tts_model
|
||||||
)
|
tts_speed = float(self._runtime_tts.get("speed") or settings.tts_speed)
|
||||||
logger.info("Using SiliconFlow TTS service")
|
|
||||||
else:
|
if tts_provider == "siliconflow" and tts_api_key:
|
||||||
self.tts_service = EdgeTTSService(
|
self.tts_service = SiliconFlowTTSService(
|
||||||
voice=tts_voice,
|
api_key=tts_api_key,
|
||||||
|
voice=tts_voice,
|
||||||
|
model=tts_model,
|
||||||
|
sample_rate=settings.sample_rate,
|
||||||
|
speed=tts_speed
|
||||||
|
)
|
||||||
|
logger.info("Using SiliconFlow TTS service")
|
||||||
|
else:
|
||||||
|
self.tts_service = EdgeTTSService(
|
||||||
|
voice=tts_voice,
|
||||||
|
sample_rate=settings.sample_rate
|
||||||
|
)
|
||||||
|
logger.info("Using Edge TTS service")
|
||||||
|
|
||||||
|
try:
|
||||||
|
await self.tts_service.connect()
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"TTS backend unavailable ({e}); falling back to MockTTS")
|
||||||
|
self.tts_service = MockTTSService(
|
||||||
sample_rate=settings.sample_rate
|
sample_rate=settings.sample_rate
|
||||||
)
|
)
|
||||||
logger.info("Using Edge TTS service")
|
await self.tts_service.connect()
|
||||||
|
else:
|
||||||
try:
|
self.tts_service = None
|
||||||
await self.tts_service.connect()
|
logger.info("TTS output disabled by runtime metadata")
|
||||||
except Exception as e:
|
|
||||||
logger.warning(f"TTS backend unavailable ({e}); falling back to MockTTS")
|
|
||||||
self.tts_service = MockTTSService(
|
|
||||||
sample_rate=settings.sample_rate
|
|
||||||
)
|
|
||||||
await self.tts_service.connect()
|
|
||||||
|
|
||||||
# Connect ASR service
|
# Connect ASR service
|
||||||
if not self.asr_service:
|
if not self.asr_service:
|
||||||
@@ -375,7 +410,7 @@ class DuplexPipeline:
|
|||||||
self._outbound_task = asyncio.create_task(self._outbound_loop())
|
self._outbound_task = asyncio.create_task(self._outbound_loop())
|
||||||
|
|
||||||
# Speak greeting if configured
|
# Speak greeting if configured
|
||||||
if self.conversation.greeting:
|
if self.conversation.greeting and tts_output_enabled:
|
||||||
await self._speak(self.conversation.greeting)
|
await self._speak(self.conversation.greeting)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@@ -932,7 +967,7 @@ class DuplexPipeline:
|
|||||||
pending_punctuation = sentence
|
pending_punctuation = sentence
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if not self._interrupt_event.is_set():
|
if self._tts_output_enabled() and not self._interrupt_event.is_set():
|
||||||
if not first_audio_sent:
|
if not first_audio_sent:
|
||||||
await self._send_event(
|
await self._send_event(
|
||||||
{
|
{
|
||||||
@@ -952,7 +987,12 @@ class DuplexPipeline:
|
|||||||
)
|
)
|
||||||
|
|
||||||
remaining_text = f"{pending_punctuation}{sentence_buffer}".strip()
|
remaining_text = f"{pending_punctuation}{sentence_buffer}".strip()
|
||||||
if remaining_text and has_spoken_content(remaining_text) and not self._interrupt_event.is_set():
|
if (
|
||||||
|
self._tts_output_enabled()
|
||||||
|
and remaining_text
|
||||||
|
and has_spoken_content(remaining_text)
|
||||||
|
and not self._interrupt_event.is_set()
|
||||||
|
):
|
||||||
if not first_audio_sent:
|
if not first_audio_sent:
|
||||||
await self._send_event(
|
await self._send_event(
|
||||||
{
|
{
|
||||||
@@ -1066,7 +1106,10 @@ class DuplexPipeline:
|
|||||||
fade_in_ms: Fade-in duration for sentence start chunks
|
fade_in_ms: Fade-in duration for sentence start chunks
|
||||||
fade_out_ms: Fade-out duration for sentence end chunks
|
fade_out_ms: Fade-out duration for sentence end chunks
|
||||||
"""
|
"""
|
||||||
if not text.strip() or self._interrupt_event.is_set():
|
if not self._tts_output_enabled():
|
||||||
|
return
|
||||||
|
|
||||||
|
if not text.strip() or self._interrupt_event.is_set() or not self.tts_service:
|
||||||
return
|
return
|
||||||
|
|
||||||
logger.info(f"[TTS] split sentence: {text!r}")
|
logger.info(f"[TTS] split sentence: {text!r}")
|
||||||
@@ -1153,7 +1196,10 @@ class DuplexPipeline:
|
|||||||
Args:
|
Args:
|
||||||
text: Text to speak
|
text: Text to speak
|
||||||
"""
|
"""
|
||||||
if not text.strip():
|
if not self._tts_output_enabled():
|
||||||
|
return
|
||||||
|
|
||||||
|
if not text.strip() or not self.tts_service:
|
||||||
return
|
return
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|||||||
@@ -53,6 +53,9 @@ Rules:
|
|||||||
},
|
},
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"client": "web-debug",
|
"client": "web-debug",
|
||||||
|
"output": {
|
||||||
|
"mode": "audio"
|
||||||
|
},
|
||||||
"systemPrompt": "You are concise.",
|
"systemPrompt": "You are concise.",
|
||||||
"greeting": "Hi, how can I help?",
|
"greeting": "Hi, how can I help?",
|
||||||
"services": {
|
"services": {
|
||||||
@@ -70,6 +73,7 @@ Rules:
|
|||||||
"minAudioMs": 300
|
"minAudioMs": 300
|
||||||
},
|
},
|
||||||
"tts": {
|
"tts": {
|
||||||
|
"enabled": true,
|
||||||
"provider": "siliconflow",
|
"provider": "siliconflow",
|
||||||
"model": "FunAudioLLM/CosyVoice2-0.5B",
|
"model": "FunAudioLLM/CosyVoice2-0.5B",
|
||||||
"apiKey": "sf-...",
|
"apiKey": "sf-...",
|
||||||
@@ -83,6 +87,10 @@ Rules:
|
|||||||
|
|
||||||
`metadata.services` is optional. If omitted, server defaults to environment configuration.
|
`metadata.services` is optional. If omitted, server defaults to environment configuration.
|
||||||
|
|
||||||
|
Text-only mode:
|
||||||
|
- Set `metadata.output.mode = "text"` or `metadata.services.tts.enabled = false`. If both are present, `metadata.services.tts.enabled` takes precedence.
|
||||||
|
- In this mode the server still sends `assistant.response.delta` and `assistant.response.final` events, but it does not emit audio frames or the `output.audio.start` / `output.audio.end` events.
|
||||||
|
|
||||||
### `input.text`
|
### `input.text`
|
||||||
|
|
||||||
```json
|
```json
|
||||||
|
|||||||
@@ -125,6 +125,36 @@ async def test_turn_without_tool_keeps_streaming(monkeypatch):
|
|||||||
assert "assistant.tool_call" not in event_types
|
assert "assistant.tool_call" not in event_types
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"metadata",
|
||||||
|
[
|
||||||
|
{"output": {"mode": "text"}},
|
||||||
|
{"services": {"tts": {"enabled": False}}},
|
||||||
|
],
|
||||||
|
)
|
||||||
|
async def test_text_output_mode_skips_audio_events(monkeypatch, metadata):
|
||||||
|
pipeline, events = _build_pipeline(
|
||||||
|
monkeypatch,
|
||||||
|
[
|
||||||
|
[
|
||||||
|
LLMStreamEvent(type="text_delta", text="hello "),
|
||||||
|
LLMStreamEvent(type="text_delta", text="world."),
|
||||||
|
LLMStreamEvent(type="done"),
|
||||||
|
]
|
||||||
|
],
|
||||||
|
)
|
||||||
|
pipeline.apply_runtime_overrides(metadata)
|
||||||
|
|
||||||
|
await pipeline._handle_turn("hi")
|
||||||
|
|
||||||
|
event_types = [e.get("type") for e in events]
|
||||||
|
assert "assistant.response.delta" in event_types
|
||||||
|
assert "assistant.response.final" in event_types
|
||||||
|
assert "output.audio.start" not in event_types
|
||||||
|
assert "output.audio.end" not in event_types
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_turn_with_tool_call_then_results(monkeypatch):
|
async def test_turn_with_tool_call_then_results(monkeypatch):
|
||||||
pipeline, events = _build_pipeline(
|
pipeline, events = _build_pipeline(
|
||||||
|
|||||||
@@ -118,6 +118,7 @@ export const AssistantsPage: React.FC = () => {
|
|||||||
prompt: '',
|
prompt: '',
|
||||||
knowledgeBaseId: '',
|
knowledgeBaseId: '',
|
||||||
language: 'zh',
|
language: 'zh',
|
||||||
|
voiceOutputEnabled: true,
|
||||||
voice: voices[0]?.id || '',
|
voice: voices[0]?.id || '',
|
||||||
speed: 1,
|
speed: 1,
|
||||||
hotwords: [],
|
hotwords: [],
|
||||||
@@ -531,6 +532,7 @@ export const AssistantsPage: React.FC = () => {
|
|||||||
placeholder="设定小助手的人设、语气、行为规范以及业务逻辑..."
|
placeholder="设定小助手的人设、语气、行为规范以及业务逻辑..."
|
||||||
/>
|
/>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
</div>
|
</div>
|
||||||
)}
|
)}
|
||||||
|
|
||||||
@@ -624,15 +626,32 @@ export const AssistantsPage: React.FC = () => {
|
|||||||
</p>
|
</p>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
<div className="space-y-2">
|
||||||
|
<label className="text-sm font-medium text-white flex items-center">
|
||||||
|
<Volume2 className="w-4 h-4 mr-2 text-primary" /> 启用语音输出
|
||||||
|
</label>
|
||||||
|
<label className="flex h-12 items-center justify-between rounded-xl border border-white/10 bg-white/5 px-4 text-sm">
|
||||||
|
<span className="text-foreground">TTS 输出</span>
|
||||||
|
<input
|
||||||
|
type="checkbox"
|
||||||
|
checked={selectedAssistant.voiceOutputEnabled !== false}
|
||||||
|
onChange={(e) => updateAssistant('voiceOutputEnabled', e.target.checked)}
|
||||||
|
className="accent-primary"
|
||||||
|
/>
|
||||||
|
</label>
|
||||||
|
<p className="text-xs text-muted-foreground">关闭后将进入纯文本输出模式,不会产生语音音频。</p>
|
||||||
|
</div>
|
||||||
|
|
||||||
<div className="space-y-2">
|
<div className="space-y-2">
|
||||||
<label className="text-sm font-medium text-white flex items-center">
|
<label className="text-sm font-medium text-white flex items-center">
|
||||||
<Volume2 className="w-4 h-4 mr-2 text-primary"/> 选择音色 (From Voice Library)
|
<Volume2 className="w-4 h-4 mr-2 text-primary"/> 选择音色 (From Voice Library)
|
||||||
</label>
|
</label>
|
||||||
<div className="relative group">
|
<div className="relative group">
|
||||||
<select
|
<select
|
||||||
className="flex h-12 w-full rounded-xl border border-white/10 bg-white/5 px-4 py-1 text-sm shadow-sm transition-all focus-visible:outline-none focus-visible:ring-1 focus-visible:ring-primary/50 [&>option]:bg-card text-foreground appearance-none cursor-pointer"
|
className="flex h-12 w-full rounded-xl border border-white/10 bg-white/5 px-4 py-1 text-sm shadow-sm transition-all focus-visible:outline-none focus-visible:ring-1 focus-visible:ring-primary/50 [&>option]:bg-card text-foreground appearance-none cursor-pointer disabled:opacity-50 disabled:cursor-not-allowed"
|
||||||
value={selectedAssistant.voice}
|
value={selectedAssistant.voice}
|
||||||
onChange={(e) => updateAssistant('voice', e.target.value)}
|
onChange={(e) => updateAssistant('voice', e.target.value)}
|
||||||
|
disabled={selectedAssistant.voiceOutputEnabled === false}
|
||||||
>
|
>
|
||||||
<option value="" disabled>请选择声音库中的声音...</option>
|
<option value="" disabled>请选择声音库中的声音...</option>
|
||||||
{voices.map(voice => (
|
{voices.map(voice => (
|
||||||
@@ -645,7 +664,9 @@ export const AssistantsPage: React.FC = () => {
|
|||||||
</div>
|
</div>
|
||||||
<p className="text-xs text-muted-foreground flex items-center mt-1">
|
<p className="text-xs text-muted-foreground flex items-center mt-1">
|
||||||
<Sparkles className="w-3 h-3 mr-1 text-primary opacity-70" />
|
<Sparkles className="w-3 h-3 mr-1 text-primary opacity-70" />
|
||||||
音色配置同步自声音库。如需添加更多音色,请前往“声音库”模块。
|
{selectedAssistant.voiceOutputEnabled === false
|
||||||
|
? '启用语音输出后才可选择音色。'
|
||||||
|
: '音色配置同步自声音库。如需添加更多音色,请前往“声音库”模块。'}
|
||||||
</p>
|
</p>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
@@ -916,6 +937,7 @@ export const AssistantsPage: React.FC = () => {
|
|||||||
llmModels={llmModels}
|
llmModels={llmModels}
|
||||||
asrModels={asrModels}
|
asrModels={asrModels}
|
||||||
tools={tools}
|
tools={tools}
|
||||||
|
textTtsEnabled={selectedAssistant.voiceOutputEnabled !== false}
|
||||||
/>
|
/>
|
||||||
)}
|
)}
|
||||||
|
|
||||||
@@ -1030,6 +1052,7 @@ export const DebugDrawer: React.FC<{
|
|||||||
llmModels?: LLMModel[];
|
llmModels?: LLMModel[];
|
||||||
asrModels?: ASRModel[];
|
asrModels?: ASRModel[];
|
||||||
tools?: Tool[];
|
tools?: Tool[];
|
||||||
|
textTtsEnabled: boolean;
|
||||||
sessionMetadataExtras?: Record<string, any>;
|
sessionMetadataExtras?: Record<string, any>;
|
||||||
onProtocolEvent?: (event: Record<string, any>) => void;
|
onProtocolEvent?: (event: Record<string, any>) => void;
|
||||||
}> = ({
|
}> = ({
|
||||||
@@ -1040,6 +1063,7 @@ export const DebugDrawer: React.FC<{
|
|||||||
llmModels = [],
|
llmModels = [],
|
||||||
asrModels = [],
|
asrModels = [],
|
||||||
tools = [],
|
tools = [],
|
||||||
|
textTtsEnabled,
|
||||||
sessionMetadataExtras,
|
sessionMetadataExtras,
|
||||||
onProtocolEvent,
|
onProtocolEvent,
|
||||||
}) => {
|
}) => {
|
||||||
@@ -1117,7 +1141,6 @@ export const DebugDrawer: React.FC<{
|
|||||||
const [selectedCamera, setSelectedCamera] = useState<string>('');
|
const [selectedCamera, setSelectedCamera] = useState<string>('');
|
||||||
const [selectedMic, setSelectedMic] = useState<string>('');
|
const [selectedMic, setSelectedMic] = useState<string>('');
|
||||||
const [isSwapped, setIsSwapped] = useState(false);
|
const [isSwapped, setIsSwapped] = useState(false);
|
||||||
const [textTtsEnabled, setTextTtsEnabled] = useState(true);
|
|
||||||
const [aecEnabled, setAecEnabled] = useState<boolean>(() => localStorage.getItem('debug_audio_aec') !== '0');
|
const [aecEnabled, setAecEnabled] = useState<boolean>(() => localStorage.getItem('debug_audio_aec') !== '0');
|
||||||
const [nsEnabled, setNsEnabled] = useState<boolean>(() => localStorage.getItem('debug_audio_ns') !== '0');
|
const [nsEnabled, setNsEnabled] = useState<boolean>(() => localStorage.getItem('debug_audio_ns') !== '0');
|
||||||
const [agcEnabled, setAgcEnabled] = useState<boolean>(() => localStorage.getItem('debug_audio_agc') !== '0');
|
const [agcEnabled, setAgcEnabled] = useState<boolean>(() => localStorage.getItem('debug_audio_agc') !== '0');
|
||||||
@@ -1546,6 +1569,7 @@ export const DebugDrawer: React.FC<{
|
|||||||
const buildLocalResolvedRuntime = () => {
|
const buildLocalResolvedRuntime = () => {
|
||||||
const warnings: string[] = [];
|
const warnings: string[] = [];
|
||||||
const services: Record<string, any> = {};
|
const services: Record<string, any> = {};
|
||||||
|
const ttsEnabled = Boolean(textTtsEnabled);
|
||||||
const isExternalLlm = assistant.configMode === 'dify' || assistant.configMode === 'fastgpt';
|
const isExternalLlm = assistant.configMode === 'dify' || assistant.configMode === 'fastgpt';
|
||||||
const knowledgeBaseId = String(assistant.knowledgeBaseId || '').trim();
|
const knowledgeBaseId = String(assistant.knowledgeBaseId || '').trim();
|
||||||
const knowledge = knowledgeBaseId
|
const knowledge = knowledgeBaseId
|
||||||
@@ -1597,6 +1621,7 @@ export const DebugDrawer: React.FC<{
|
|||||||
if (voice) {
|
if (voice) {
|
||||||
const ttsProvider = isSiliconflowVendor(voice.vendor) ? 'siliconflow' : 'edge';
|
const ttsProvider = isSiliconflowVendor(voice.vendor) ? 'siliconflow' : 'edge';
|
||||||
services.tts = {
|
services.tts = {
|
||||||
|
enabled: ttsEnabled,
|
||||||
provider: ttsProvider,
|
provider: ttsProvider,
|
||||||
model: voice.model,
|
model: voice.model,
|
||||||
apiKey: ttsProvider === 'siliconflow' ? voice.apiKey : null,
|
apiKey: ttsProvider === 'siliconflow' ? voice.apiKey : null,
|
||||||
@@ -1605,17 +1630,25 @@ export const DebugDrawer: React.FC<{
|
|||||||
};
|
};
|
||||||
} else {
|
} else {
|
||||||
services.tts = {
|
services.tts = {
|
||||||
|
enabled: ttsEnabled,
|
||||||
voice: assistant.voice,
|
voice: assistant.voice,
|
||||||
speed: assistant.speed || 1.0,
|
speed: assistant.speed || 1.0,
|
||||||
};
|
};
|
||||||
warnings.push(`Voice resource not found in loaded list: ${assistant.voice}`);
|
warnings.push(`Voice resource not found in loaded list: ${assistant.voice}`);
|
||||||
}
|
}
|
||||||
|
} else if (!ttsEnabled) {
|
||||||
|
services.tts = {
|
||||||
|
enabled: false,
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
const localResolved = {
|
const localResolved = {
|
||||||
assistantId: assistant.id,
|
assistantId: assistant.id,
|
||||||
warnings,
|
warnings,
|
||||||
sessionStartMetadata: {
|
sessionStartMetadata: {
|
||||||
|
output: {
|
||||||
|
mode: ttsEnabled ? 'audio' : 'text',
|
||||||
|
},
|
||||||
systemPrompt: assistant.prompt || '',
|
systemPrompt: assistant.prompt || '',
|
||||||
greeting: assistant.opener || '',
|
greeting: assistant.opener || '',
|
||||||
knowledgeBaseId,
|
knowledgeBaseId,
|
||||||
@@ -2007,15 +2040,9 @@ export const DebugDrawer: React.FC<{
|
|||||||
</div>
|
</div>
|
||||||
<div className="flex items-center justify-between gap-2">
|
<div className="flex items-center justify-between gap-2">
|
||||||
<Badge variant="outline" className="text-xs">WS: {wsStatus}</Badge>
|
<Badge variant="outline" className="text-xs">WS: {wsStatus}</Badge>
|
||||||
<label className="inline-flex items-center gap-1 text-xs text-muted-foreground px-2 py-1 rounded border border-white/10">
|
<Badge variant={textTtsEnabled ? 'outline' : 'secondary'} className="text-xs">
|
||||||
<input
|
TTS: {textTtsEnabled ? 'ON' : 'OFF'}
|
||||||
type="checkbox"
|
</Badge>
|
||||||
checked={textTtsEnabled}
|
|
||||||
onChange={(e) => setTextTtsEnabled(e.target.checked)}
|
|
||||||
className="accent-primary"
|
|
||||||
/>
|
|
||||||
TTS
|
|
||||||
</label>
|
|
||||||
</div>
|
</div>
|
||||||
<div className="rounded-md border border-white/10 bg-black/20 p-2 space-y-2">
|
<div className="rounded-md border border-white/10 bg-black/20 p-2 space-y-2">
|
||||||
<p className="text-[10px] uppercase tracking-widest text-muted-foreground">Audio 3A</p>
|
<p className="text-[10px] uppercase tracking-widest text-muted-foreground">Audio 3A</p>
|
||||||
|
|||||||
@@ -33,6 +33,7 @@ const mapAssistant = (raw: AnyRecord): Assistant => ({
|
|||||||
prompt: readField(raw, ['prompt'], ''),
|
prompt: readField(raw, ['prompt'], ''),
|
||||||
knowledgeBaseId: readField(raw, ['knowledgeBaseId', 'knowledge_base_id'], ''),
|
knowledgeBaseId: readField(raw, ['knowledgeBaseId', 'knowledge_base_id'], ''),
|
||||||
language: readField(raw, ['language'], 'zh') as 'zh' | 'en',
|
language: readField(raw, ['language'], 'zh') as 'zh' | 'en',
|
||||||
|
voiceOutputEnabled: Boolean(readField(raw, ['voiceOutputEnabled', 'voice_output_enabled'], true)),
|
||||||
voice: readField(raw, ['voice'], ''),
|
voice: readField(raw, ['voice'], ''),
|
||||||
speed: Number(readField(raw, ['speed'], 1)),
|
speed: Number(readField(raw, ['speed'], 1)),
|
||||||
hotwords: readField(raw, ['hotwords'], []),
|
hotwords: readField(raw, ['hotwords'], []),
|
||||||
@@ -210,6 +211,7 @@ export const createAssistant = async (data: Partial<Assistant>): Promise<Assista
|
|||||||
prompt: data.prompt || '',
|
prompt: data.prompt || '',
|
||||||
knowledgeBaseId: data.knowledgeBaseId || '',
|
knowledgeBaseId: data.knowledgeBaseId || '',
|
||||||
language: data.language || 'zh',
|
language: data.language || 'zh',
|
||||||
|
voiceOutputEnabled: data.voiceOutputEnabled ?? true,
|
||||||
voice: data.voice || '',
|
voice: data.voice || '',
|
||||||
speed: data.speed ?? 1,
|
speed: data.speed ?? 1,
|
||||||
hotwords: data.hotwords || [],
|
hotwords: data.hotwords || [],
|
||||||
@@ -234,6 +236,7 @@ export const updateAssistant = async (id: string, data: Partial<Assistant>): Pro
|
|||||||
prompt: data.prompt,
|
prompt: data.prompt,
|
||||||
knowledgeBaseId: data.knowledgeBaseId,
|
knowledgeBaseId: data.knowledgeBaseId,
|
||||||
language: data.language,
|
language: data.language,
|
||||||
|
voiceOutputEnabled: data.voiceOutputEnabled,
|
||||||
voice: data.voice,
|
voice: data.voice,
|
||||||
speed: data.speed,
|
speed: data.speed,
|
||||||
hotwords: data.hotwords,
|
hotwords: data.hotwords,
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ export interface Assistant {
|
|||||||
prompt: string;
|
prompt: string;
|
||||||
knowledgeBaseId: string;
|
knowledgeBaseId: string;
|
||||||
language: 'zh' | 'en';
|
language: 'zh' | 'en';
|
||||||
|
voiceOutputEnabled?: boolean;
|
||||||
voice: string; // This will now store the ID of the voice from Voice Library
|
voice: string; // This will now store the ID of the voice from Voice Library
|
||||||
speed: number;
|
speed: number;
|
||||||
hotwords: string[];
|
hotwords: string[];
|
||||||
|
|||||||
Reference in New Issue
Block a user