From 15523d9ec2cce7ee45b63c96576ea8ae9cc932f0 Mon Sep 17 00:00:00 2001
From: Xin Wang <wx44wx@XindeMac-mini.local>
Date: Wed, 11 Feb 2026 09:50:46 +0800
Subject: [PATCH] Add tts/text output schema

---
 api/app/db.py                       |  28 ++++++-
 api/app/main.py                     |   3 +-
 api/app/models.py                   |   1 +
 api/app/routers/assistants.py       |  10 ++-
 api/app/schemas.py                  |   2 +
 api/tests/conftest.py               |   1 +
 api/tests/test_assistants.py        |  17 ++++-
 engine/core/duplex_pipeline.py      | 114 +++++++++++++++++++---------
 engine/docs/ws_v1_schema.md         |   8 ++
 engine/tests/test_tool_call_flow.py |  30 ++++++++
 web/pages/Assistants.tsx            |  51 ++++++++++---
 web/services/backendApi.ts          |   3 +
 web/types.ts                        |   1 +
 13 files changed, 219 insertions(+), 50 deletions(-)

diff --git a/api/app/db.py b/api/app/db.py
index 3b04bc8..22a25af 100644
--- a/api/app/db.py
+++ b/api/app/db.py
@@ -1,4 +1,4 @@
-from sqlalchemy import create_engine
+from sqlalchemy import create_engine, text
 from sqlalchemy.orm import sessionmaker, DeclarativeBase
 import os
 
@@ -14,6 +14,32 @@ class Base(DeclarativeBase):
     pass
 
 
+def ensure_schema_compatibility() -> None:
+    """Best-effort lightweight migrations for SQLite deployments."""
+    if engine.dialect.name != "sqlite":
+        return
+
+    with engine.begin() as conn:
+        columns = {
+            row[1]
+            for row in conn.execute(text("PRAGMA table_info(assistants)"))
+        }
+        if "voice_output_enabled" not in columns:
+            conn.execute(
+                text(
+                    "ALTER TABLE assistants "
+                    "ADD COLUMN voice_output_enabled BOOLEAN DEFAULT 1"
+                )
+            )
+            conn.execute(
+                text(
+                    "UPDATE assistants "
+                    "SET voice_output_enabled = 1 "
+                    "WHERE voice_output_enabled IS NULL"
+                )
+            )
+
+
 def get_db():
     db = SessionLocal()
     try:
diff --git a/api/app/main.py b/api/app/main.py
index a193ff9..055d53c 100644
--- a/api/app/main.py
+++ b/api/app/main.py
@@ -3,7 +3,7 @@ from fastapi.middleware.cors import CORSMiddleware
 from contextlib import asynccontextmanager
 import os
 
-from .db import Base, engine
+from .db import Base, engine, ensure_schema_compatibility
 from .routers import assistants, voices, workflows, history, knowledge, llm, asr, tools
 
 
@@ -11,6 +11,7 @@ from .routers import assistants, voices, workflows, history, knowledge, llm, asr
 async def lifespan(app: FastAPI):
     # 启动时创建表
     Base.metadata.create_all(bind=engine)
+    ensure_schema_compatibility()
     yield
 
 
diff --git a/api/app/models.py b/api/app/models.py
index c69bd78..cc253aa 100644
--- a/api/app/models.py
+++ b/api/app/models.py
@@ -112,6 +112,7 @@ class Assistant(Base):
     prompt: Mapped[str] = mapped_column(Text, default="")
     knowledge_base_id: Mapped[Optional[str]] = mapped_column(String(64), nullable=True)
     language: Mapped[str] = mapped_column(String(16), default="zh")
+    voice_output_enabled: Mapped[bool] = mapped_column(default=True)
     voice: Mapped[Optional[str]] = mapped_column(String(64), nullable=True)
     speed: Mapped[float] = mapped_column(Float, default=1.0)
     hotwords: Mapped[dict] = mapped_column(JSON, default=list)
diff --git a/api/app/routers/assistants.py b/api/app/routers/assistants.py
index 0e5227e..2368cee 100644
--- a/api/app/routers/assistants.py
+++ b/api/app/routers/assistants.py
@@ -21,6 +21,7 @@ def _resolve_runtime_metadata(db: Session, assistant: Assistant) -> dict:
     metadata = {
         "systemPrompt": assistant.prompt or "",
         "greeting": assistant.opener or "",
+        "output": {"mode": "audio" if assistant.voice_output_enabled else "text"},
         "services": {},
     }
     warnings = []
@@ -49,11 +50,14 @@ def _resolve_runtime_metadata(db: Session, assistant: Assistant) -> dict:
         else:
             warnings.append(f"ASR model not found: {assistant.asr_model_id}")
 
-    if assistant.voice:
+    if not assistant.voice_output_enabled:
+        metadata["services"]["tts"] = {"enabled": False}
+    elif assistant.voice:
         voice = db.query(Voice).filter(Voice.id == assistant.voice).first()
         if voice:
             tts_provider = "siliconflow" if _is_siliconflow_vendor(voice.vendor) else "edge"
             metadata["services"]["tts"] = {
+                "enabled": True,
                 "provider": tts_provider,
                 "model": voice.model,
                 "apiKey": voice.api_key if tts_provider == "siliconflow" else None,
@@ -63,6 +67,7 @@ def _resolve_runtime_metadata(db: Session, assistant: Assistant) -> dict:
         else:
             # Keep assistant.voice as direct voice identifier fallback
             metadata["services"]["tts"] = {
+                "enabled": True,
                 "voice": assistant.voice,
                 "speed": assistant.speed or 1.0,
             }
@@ -98,6 +103,7 @@ def assistant_to_dict(assistant: Assistant) -> dict:
         "prompt": assistant.prompt or "",
         "knowledgeBaseId": assistant.knowledge_base_id,
         "language": assistant.language,
+        "voiceOutputEnabled": assistant.voice_output_enabled,
         "voice": assistant.voice,
         "speed": assistant.speed,
         "hotwords": assistant.hotwords or [],
@@ -120,6 +126,7 @@ def _apply_assistant_update(assistant: Assistant, update_data: dict) -> None:
         "knowledgeBaseId": "knowledge_base_id",
         "interruptionSensitivity": "interruption_sensitivity",
         "configMode": "config_mode",
+        "voiceOutputEnabled": "voice_output_enabled",
         "apiUrl": "api_url",
         "apiKey": "api_key",
         "llmModelId": "llm_model_id",
@@ -180,6 +187,7 @@ def create_assistant(data: AssistantCreate, db: Session = Depends(get_db)):
         prompt=data.prompt,
         knowledge_base_id=data.knowledgeBaseId,
         language=data.language,
+        voice_output_enabled=data.voiceOutputEnabled,
         voice=data.voice,
         speed=data.speed,
         hotwords=data.hotwords,
diff --git a/api/app/schemas.py b/api/app/schemas.py
index 457476b..6b7645e 100644
--- a/api/app/schemas.py
+++ b/api/app/schemas.py
@@ -268,6 +268,7 @@ class AssistantBase(BaseModel):
     prompt: str = ""
     knowledgeBaseId: Optional[str] = None
     language: str = "zh"
+    voiceOutputEnabled: bool = True
     voice: Optional[str] = None
     speed: float = 1.0
     hotwords: List[str] = []
@@ -293,6 +294,7 @@ class AssistantUpdate(BaseModel):
     prompt: Optional[str] = None
     knowledgeBaseId: Optional[str] = None
     language: Optional[str] = None
+    voiceOutputEnabled: Optional[bool] = None
     voice: Optional[str] = None
     speed: Optional[float] = None
     hotwords: Optional[List[str]] = None
diff --git a/api/tests/conftest.py b/api/tests/conftest.py
index 015cef7..9435444 100644
--- a/api/tests/conftest.py
+++ b/api/tests/conftest.py
@@ -85,6 +85,7 @@ def sample_assistant_data():
         "opener": "Hello, welcome!",
         "prompt": "You are a helpful assistant.",
         "language": "zh",
+        "voiceOutputEnabled": True,
         "speed": 1.0,
         "hotwords": ["test", "hello"],
         "tools": [],
diff --git a/api/tests/test_assistants.py b/api/tests/test_assistants.py
index 106ff10..a0140d1 100644
--- a/api/tests/test_assistants.py
+++ b/api/tests/test_assistants.py
@@ -23,6 +23,7 @@ class TestAssistantAPI:
         assert data["opener"] == sample_assistant_data["opener"]
         assert data["prompt"] == sample_assistant_data["prompt"]
         assert data["language"] == sample_assistant_data["language"]
+        assert data["voiceOutputEnabled"] is True
         assert "id" in data
         assert data["callCount"] == 0
 
@@ -61,7 +62,8 @@ class TestAssistantAPI:
         update_data = {
             "name": "Updated Assistant",
             "prompt": "You are an updated assistant.",
-            "speed": 1.5
+            "speed": 1.5,
+            "voiceOutputEnabled": False,
         }
         response = client.put(f"/api/assistants/{assistant_id}", json=update_data)
         assert response.status_code == 200
@@ -69,6 +71,7 @@ class TestAssistantAPI:
         assert data["name"] == "Updated Assistant"
         assert data["prompt"] == "You are an updated assistant."
         assert data["speed"] == 1.5
+        assert data["voiceOutputEnabled"] is False
 
     def test_delete_assistant(self, client, sample_assistant_data):
         """Test deleting an assistant"""
@@ -210,3 +213,15 @@ class TestAssistantAPI:
         assert metadata["services"]["llm"]["model"] == sample_llm_model_data["model_name"]
         assert metadata["services"]["asr"]["model"] == sample_asr_model_data["model_name"]
         assert metadata["services"]["tts"]["voice"] == sample_voice_data["voice_key"]
+
+    def test_runtime_config_text_mode_when_voice_output_disabled(self, client, sample_assistant_data):
+        sample_assistant_data["voiceOutputEnabled"] = False
+        assistant_resp = client.post("/api/assistants", json=sample_assistant_data)
+        assert assistant_resp.status_code == 200
+        assistant_id = assistant_resp.json()["id"]
+
+        runtime_resp = client.get(f"/api/assistants/{assistant_id}/runtime-config")
+        assert runtime_resp.status_code == 200
+        metadata = runtime_resp.json()["sessionStartMetadata"]
+        assert metadata["output"]["mode"] == "text"
+        assert metadata["services"]["tts"]["enabled"] is False
diff --git a/engine/core/duplex_pipeline.py b/engine/core/duplex_pipeline.py
index 2fa2948..8527144 100644
--- a/engine/core/duplex_pipeline.py
+++ b/engine/core/duplex_pipeline.py
@@ -211,6 +211,7 @@ class DuplexPipeline:
         self._runtime_llm: Dict[str, Any] = {}
         self._runtime_asr: Dict[str, Any] = {}
         self._runtime_tts: Dict[str, Any] = {}
+        self._runtime_output: Dict[str, Any] = {}
         self._runtime_system_prompt: Optional[str] = None
         self._runtime_greeting: Optional[str] = None
         self._runtime_knowledge: Dict[str, Any] = {}
@@ -257,6 +258,9 @@ class DuplexPipeline:
                 self._runtime_asr = services["asr"]
             if isinstance(services.get("tts"), dict):
                 self._runtime_tts = services["tts"]
+        output = metadata.get("output") or {}
+        if isinstance(output, dict):
+            self._runtime_output = output
 
         knowledge_base_id = metadata.get("knowledgeBaseId")
         if knowledge_base_id is not None:
@@ -283,6 +287,31 @@ class DuplexPipeline:
         if self.llm_service and hasattr(self.llm_service, "set_tool_schemas"):
             self.llm_service.set_tool_schemas(self._resolved_tool_schemas())
 
+    @staticmethod
+    def _coerce_bool(value: Any) -> Optional[bool]:
+        if isinstance(value, bool):
+            return value
+        if isinstance(value, (int, float)):
+            return bool(value)
+        if isinstance(value, str):
+            normalized = value.strip().lower()
+            if normalized in {"1", "true", "yes", "on", "enabled"}:
+                return True
+            if normalized in {"0", "false", "no", "off", "disabled"}:
+                return False
+        return None
+
+    def _tts_output_enabled(self) -> bool:
+        enabled = self._coerce_bool(self._runtime_tts.get("enabled"))
+        if enabled is not None:
+            return enabled
+
+        output_mode = str(self._runtime_output.get("mode") or "").strip().lower()
+        if output_mode in {"text", "text_only", "text-only"}:
+            return False
+
+        return True
+
     async def start(self) -> None:
         """Start the pipeline and connect services."""
         try:
@@ -311,38 +340,44 @@ class DuplexPipeline:
 
             await self.llm_service.connect()
 
-            # Connect TTS service
-            if not self.tts_service:
-                tts_provider = (self._runtime_tts.get("provider") or settings.tts_provider).lower()
-                tts_api_key = self._runtime_tts.get("apiKey") or settings.siliconflow_api_key
-                tts_voice = self._runtime_tts.get("voice") or settings.tts_voice
-                tts_model = self._runtime_tts.get("model") or settings.siliconflow_tts_model
-                tts_speed = float(self._runtime_tts.get("speed") or settings.tts_speed)
+            tts_output_enabled = self._tts_output_enabled()
 
-                if tts_provider == "siliconflow" and tts_api_key:
-                    self.tts_service = SiliconFlowTTSService(
-                        api_key=tts_api_key,
-                        voice=tts_voice,
-                        model=tts_model,
-                        sample_rate=settings.sample_rate,
-                        speed=tts_speed
-                    )
-                    logger.info("Using SiliconFlow TTS service")
-                else:
-                    self.tts_service = EdgeTTSService(
-                        voice=tts_voice,
+            # Connect TTS service only when audio output is enabled.
+            if tts_output_enabled:
+                if not self.tts_service:
+                    tts_provider = (self._runtime_tts.get("provider") or settings.tts_provider).lower()
+                    tts_api_key = self._runtime_tts.get("apiKey") or settings.siliconflow_api_key
+                    tts_voice = self._runtime_tts.get("voice") or settings.tts_voice
+                    tts_model = self._runtime_tts.get("model") or settings.siliconflow_tts_model
+                    tts_speed = float(self._runtime_tts.get("speed") or settings.tts_speed)
+
+                    if tts_provider == "siliconflow" and tts_api_key:
+                        self.tts_service = SiliconFlowTTSService(
+                            api_key=tts_api_key,
+                            voice=tts_voice,
+                            model=tts_model,
+                            sample_rate=settings.sample_rate,
+                            speed=tts_speed
+                        )
+                        logger.info("Using SiliconFlow TTS service")
+                    else:
+                        self.tts_service = EdgeTTSService(
+                            voice=tts_voice,
+                            sample_rate=settings.sample_rate
+                        )
+                        logger.info("Using Edge TTS service")
+
+                try:
+                    await self.tts_service.connect()
+                except Exception as e:
+                    logger.warning(f"TTS backend unavailable ({e}); falling back to MockTTS")
+                    self.tts_service = MockTTSService(
                         sample_rate=settings.sample_rate
                     )
-                    logger.info("Using Edge TTS service")
-
-            try:
-                await self.tts_service.connect()
-            except Exception as e:
-                logger.warning(f"TTS backend unavailable ({e}); falling back to MockTTS")
-                self.tts_service = MockTTSService(
-                    sample_rate=settings.sample_rate
-                )
-                await self.tts_service.connect()
+                    await self.tts_service.connect()
+            else:
+                self.tts_service = None
+                logger.info("TTS output disabled by runtime metadata")
 
             # Connect ASR service
             if not self.asr_service:
@@ -375,7 +410,7 @@ class DuplexPipeline:
                 self._outbound_task = asyncio.create_task(self._outbound_loop())
 
             # Speak greeting if configured
-            if self.conversation.greeting:
+            if self.conversation.greeting and tts_output_enabled:
                 await self._speak(self.conversation.greeting)
 
         except Exception as e:
@@ -932,7 +967,7 @@ class DuplexPipeline:
                             pending_punctuation = sentence
                             continue
 
-                        if not self._interrupt_event.is_set():
+                        if self._tts_output_enabled() and not self._interrupt_event.is_set():
                             if not first_audio_sent:
                                 await self._send_event(
                                     {
@@ -952,7 +987,12 @@ class DuplexPipeline:
                             )
 
                 remaining_text = f"{pending_punctuation}{sentence_buffer}".strip()
-                if remaining_text and has_spoken_content(remaining_text) and not self._interrupt_event.is_set():
+                if (
+                    self._tts_output_enabled()
+                    and remaining_text
+                    and has_spoken_content(remaining_text)
+                    and not self._interrupt_event.is_set()
+                ):
                     if not first_audio_sent:
                         await self._send_event(
                             {
@@ -1066,7 +1106,10 @@ class DuplexPipeline:
             fade_in_ms: Fade-in duration for sentence start chunks
             fade_out_ms: Fade-out duration for sentence end chunks
         """
-        if not text.strip() or self._interrupt_event.is_set():
+        if not self._tts_output_enabled():
+            return
+
+        if not text.strip() or self._interrupt_event.is_set() or not self.tts_service:
             return
             
         logger.info(f"[TTS] split sentence: {text!r}")
@@ -1153,7 +1196,10 @@ class DuplexPipeline:
         Args:
             text: Text to speak
         """
-        if not text.strip():
+        if not self._tts_output_enabled():
+            return
+
+        if not text.strip() or not self.tts_service:
             return
 
         try:
diff --git a/engine/docs/ws_v1_schema.md b/engine/docs/ws_v1_schema.md
index 0d21432..88ac755 100644
--- a/engine/docs/ws_v1_schema.md
+++ b/engine/docs/ws_v1_schema.md
@@ -53,6 +53,9 @@ Rules:
   },
   "metadata": {
     "client": "web-debug",
+    "output": {
+      "mode": "audio"
+    },
     "systemPrompt": "You are concise.",
     "greeting": "Hi, how can I help?",
     "services": {
@@ -70,6 +73,7 @@ Rules:
         "minAudioMs": 300
       },
       "tts": {
+        "enabled": true,
         "provider": "siliconflow",
         "model": "FunAudioLLM/CosyVoice2-0.5B",
         "apiKey": "sf-...",
@@ -83,6 +87,10 @@ Rules:
 
 `metadata.services` is optional. If omitted, server defaults to environment configuration.
 
+Text-only mode:
+- Set `metadata.output.mode = "text"` OR `metadata.services.tts.enabled = false`.
+- In this mode server still sends `assistant.response.delta/final`, but will not emit audio frames or `output.audio.start/end`.
+
 ### `input.text`
 
 ```json
diff --git a/engine/tests/test_tool_call_flow.py b/engine/tests/test_tool_call_flow.py
index d21e73a..e5f241b 100644
--- a/engine/tests/test_tool_call_flow.py
+++ b/engine/tests/test_tool_call_flow.py
@@ -125,6 +125,36 @@ async def test_turn_without_tool_keeps_streaming(monkeypatch):
     assert "assistant.tool_call" not in event_types
 
 
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "metadata",
+    [
+        {"output": {"mode": "text"}},
+        {"services": {"tts": {"enabled": False}}},
+    ],
+)
+async def test_text_output_mode_skips_audio_events(monkeypatch, metadata):
+    pipeline, events = _build_pipeline(
+        monkeypatch,
+        [
+            [
+                LLMStreamEvent(type="text_delta", text="hello "),
+                LLMStreamEvent(type="text_delta", text="world."),
+                LLMStreamEvent(type="done"),
+            ]
+        ],
+    )
+    pipeline.apply_runtime_overrides(metadata)
+
+    await pipeline._handle_turn("hi")
+
+    event_types = [e.get("type") for e in events]
+    assert "assistant.response.delta" in event_types
+    assert "assistant.response.final" in event_types
+    assert "output.audio.start" not in event_types
+    assert "output.audio.end" not in event_types
+
+
 @pytest.mark.asyncio
 async def test_turn_with_tool_call_then_results(monkeypatch):
     pipeline, events = _build_pipeline(
diff --git a/web/pages/Assistants.tsx b/web/pages/Assistants.tsx
index 87646a7..58b09ab 100644
--- a/web/pages/Assistants.tsx
+++ b/web/pages/Assistants.tsx
@@ -118,6 +118,7 @@ export const AssistantsPage: React.FC = () => {
       prompt: '',
       knowledgeBaseId: '',
       language: 'zh',
+      voiceOutputEnabled: true,
       voice: voices[0]?.id || '',
       speed: 1,
       hotwords: [],
@@ -531,6 +532,7 @@ export const AssistantsPage: React.FC = () => {
                                         placeholder="设定小助手的人设、语气、行为规范以及业务逻辑..."
                                     />
                                 </div>
+
                             </div>
                         )}
 
@@ -624,15 +626,32 @@ export const AssistantsPage: React.FC = () => {
                                     </p>
                                 </div>
 
+                                <div className="space-y-2">
+                                    <label className="text-sm font-medium text-white flex items-center">
+                                        <Volume2 className="w-4 h-4 mr-2 text-primary" /> 启用语音输出
+                                    </label>
+                                    <label className="flex h-12 items-center justify-between rounded-xl border border-white/10 bg-white/5 px-4 text-sm">
+                                        <span className="text-foreground">TTS 输出</span>
+                                        <input
+                                          type="checkbox"
+                                          checked={selectedAssistant.voiceOutputEnabled !== false}
+                                          onChange={(e) => updateAssistant('voiceOutputEnabled', e.target.checked)}
+                                          className="accent-primary"
+                                        />
+                                    </label>
+                                    <p className="text-xs text-muted-foreground">关闭后将进入纯文本输出模式，不会产生语音音频。</p>
+                                </div>
+
                                 <div className="space-y-2">
                                     <label className="text-sm font-medium text-white flex items-center">
                                         <Volume2 className="w-4 h-4 mr-2 text-primary"/> 选择音色 (From Voice Library)
                                     </label>
                                     <div className="relative group">
                                         <select 
-                                            className="flex h-12 w-full rounded-xl border border-white/10 bg-white/5 px-4 py-1 text-sm shadow-sm transition-all focus-visible:outline-none focus-visible:ring-1 focus-visible:ring-primary/50 [&>option]:bg-card text-foreground appearance-none cursor-pointer"
+                                            className="flex h-12 w-full rounded-xl border border-white/10 bg-white/5 px-4 py-1 text-sm shadow-sm transition-all focus-visible:outline-none focus-visible:ring-1 focus-visible:ring-primary/50 [&>option]:bg-card text-foreground appearance-none cursor-pointer disabled:opacity-50 disabled:cursor-not-allowed"
                                             value={selectedAssistant.voice}
                                             onChange={(e) => updateAssistant('voice', e.target.value)}
+                                            disabled={selectedAssistant.voiceOutputEnabled === false}
                                         >
                                             <option value="" disabled>请选择声音库中的声音...</option>
                                             {voices.map(voice => (
@@ -645,7 +664,9 @@ export const AssistantsPage: React.FC = () => {
                                     </div>
                                     <p className="text-xs text-muted-foreground flex items-center mt-1">
                                         <Sparkles className="w-3 h-3 mr-1 text-primary opacity-70" />
-                                        音色配置同步自声音库。如需添加更多音色，请前往“声音库”模块。
+                                        {selectedAssistant.voiceOutputEnabled === false
+                                          ? '启用语音输出后才可选择音色。'
+                                          : '音色配置同步自声音库。如需添加更多音色，请前往“声音库”模块。'}
                                     </p>
                                 </div>
 
@@ -916,6 +937,7 @@ export const AssistantsPage: React.FC = () => {
            llmModels={llmModels}
            asrModels={asrModels}
            tools={tools}
+           textTtsEnabled={selectedAssistant.voiceOutputEnabled !== false}
          />
       )}
 
@@ -1030,6 +1052,7 @@ export const DebugDrawer: React.FC<{
   llmModels?: LLMModel[];
   asrModels?: ASRModel[];
   tools?: Tool[];
+  textTtsEnabled: boolean;
   sessionMetadataExtras?: Record<string, any>;
   onProtocolEvent?: (event: Record<string, any>) => void;
 }> = ({
@@ -1040,6 +1063,7 @@ export const DebugDrawer: React.FC<{
   llmModels = [],
   asrModels = [],
   tools = [],
+  textTtsEnabled,
   sessionMetadataExtras,
   onProtocolEvent,
 }) => {
@@ -1117,7 +1141,6 @@ export const DebugDrawer: React.FC<{
   const [selectedCamera, setSelectedCamera] = useState<string>('');
   const [selectedMic, setSelectedMic] = useState<string>('');
   const [isSwapped, setIsSwapped] = useState(false); 
-  const [textTtsEnabled, setTextTtsEnabled] = useState(true);
   const [aecEnabled, setAecEnabled] = useState<boolean>(() => localStorage.getItem('debug_audio_aec') !== '0');
   const [nsEnabled, setNsEnabled] = useState<boolean>(() => localStorage.getItem('debug_audio_ns') !== '0');
   const [agcEnabled, setAgcEnabled] = useState<boolean>(() => localStorage.getItem('debug_audio_agc') !== '0');
@@ -1546,6 +1569,7 @@ export const DebugDrawer: React.FC<{
   const buildLocalResolvedRuntime = () => {
     const warnings: string[] = [];
     const services: Record<string, any> = {};
+    const ttsEnabled = Boolean(textTtsEnabled);
     const isExternalLlm = assistant.configMode === 'dify' || assistant.configMode === 'fastgpt';
     const knowledgeBaseId = String(assistant.knowledgeBaseId || '').trim();
     const knowledge = knowledgeBaseId
@@ -1597,6 +1621,7 @@ export const DebugDrawer: React.FC<{
       if (voice) {
         const ttsProvider = isSiliconflowVendor(voice.vendor) ? 'siliconflow' : 'edge';
         services.tts = {
+          enabled: ttsEnabled,
           provider: ttsProvider,
           model: voice.model,
           apiKey: ttsProvider === 'siliconflow' ? voice.apiKey : null,
@@ -1605,17 +1630,25 @@ export const DebugDrawer: React.FC<{
         };
       } else {
         services.tts = {
+          enabled: ttsEnabled,
           voice: assistant.voice,
           speed: assistant.speed || 1.0,
         };
         warnings.push(`Voice resource not found in loaded list: ${assistant.voice}`);
       }
+    } else if (!ttsEnabled) {
+      services.tts = {
+        enabled: false,
+      };
     }
 
     const localResolved = {
       assistantId: assistant.id,
       warnings,
       sessionStartMetadata: {
+        output: {
+          mode: ttsEnabled ? 'audio' : 'text',
+        },
         systemPrompt: assistant.prompt || '',
         greeting: assistant.opener || '',
         knowledgeBaseId,
@@ -2007,15 +2040,9 @@ export const DebugDrawer: React.FC<{
         </div>
         <div className="flex items-center justify-between gap-2">
           <Badge variant="outline" className="text-xs">WS: {wsStatus}</Badge>
-          <label className="inline-flex items-center gap-1 text-xs text-muted-foreground px-2 py-1 rounded border border-white/10">
-            <input
-              type="checkbox"
-              checked={textTtsEnabled}
-              onChange={(e) => setTextTtsEnabled(e.target.checked)}
-              className="accent-primary"
-            />
-            TTS
-          </label>
+          <Badge variant={textTtsEnabled ? 'outline' : 'secondary'} className="text-xs">
+            TTS: {textTtsEnabled ? 'ON' : 'OFF'}
+          </Badge>
         </div>
         <div className="rounded-md border border-white/10 bg-black/20 p-2 space-y-2">
           <p className="text-[10px] uppercase tracking-widest text-muted-foreground">Audio 3A</p>
diff --git a/web/services/backendApi.ts b/web/services/backendApi.ts
index 72c5720..5ea47c3 100644
--- a/web/services/backendApi.ts
+++ b/web/services/backendApi.ts
@@ -33,6 +33,7 @@ const mapAssistant = (raw: AnyRecord): Assistant => ({
   prompt: readField(raw, ['prompt'], ''),
   knowledgeBaseId: readField(raw, ['knowledgeBaseId', 'knowledge_base_id'], ''),
   language: readField(raw, ['language'], 'zh') as 'zh' | 'en',
+  voiceOutputEnabled: Boolean(readField(raw, ['voiceOutputEnabled', 'voice_output_enabled'], true)),
   voice: readField(raw, ['voice'], ''),
   speed: Number(readField(raw, ['speed'], 1)),
   hotwords: readField(raw, ['hotwords'], []),
@@ -210,6 +211,7 @@ export const createAssistant = async (data: Partial<Assistant>): Promise<Assista
     prompt: data.prompt || '',
     knowledgeBaseId: data.knowledgeBaseId || '',
     language: data.language || 'zh',
+    voiceOutputEnabled: data.voiceOutputEnabled ?? true,
     voice: data.voice || '',
     speed: data.speed ?? 1,
     hotwords: data.hotwords || [],
@@ -234,6 +236,7 @@ export const updateAssistant = async (id: string, data: Partial<Assistant>): Pro
     prompt: data.prompt,
     knowledgeBaseId: data.knowledgeBaseId,
     language: data.language,
+    voiceOutputEnabled: data.voiceOutputEnabled,
     voice: data.voice,
     speed: data.speed,
     hotwords: data.hotwords,
diff --git a/web/types.ts b/web/types.ts
index 9862e73..697f15d 100644
--- a/web/types.ts
+++ b/web/types.ts
@@ -7,6 +7,7 @@ export interface Assistant {
   prompt: string;
   knowledgeBaseId: string;
   language: 'zh' | 'en';
+  voiceOutputEnabled?: boolean;
   voice: string; // This will now store the ID of the voice from Voice Library
   speed: number;
   hotwords: string[];