Integrate product-ws voice demo on port 8000 alongside REST API.

Add src/voice Pipecat pipeline, browser demo at /voice-demo, and config/voice.json. Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-22 16:26:06 +08:00
parent 0b6b40aba4
commit bc2aa5b133
20 changed files with 3726 additions and 4 deletions
--- a/config/voice.json
+++ b/config/voice.json
@@ -0,0 +1,75 @@
+{
+  "server": {
+    "host": "0.0.0.0",
+    "port": 8000,
+    "cors_origins": ["http://localhost:3000", "http://localhost:8080"],
+    "serve_webpage": true,
+    "webpage_mount": "/voice-demo"
+  },
+  "audio": {
+    "sample_rate_hz": 16000,
+    "channels": 1,
+    "frame_ms": 20
+  },
+  "session": {
+    "inactivity_timeout_sec": 60
+  },
+  "turn": {
+    "vad": {
+      "confidence": 0.7,
+      "start_secs": 0.2,
+      "stop_secs": 0.4,
+      "min_volume": 0.6
+    },
+    "interruption_min_chars": 3,
+    "interruption_use_interim": true,
+    "interruption_short_replies": [
+      "是",
+      "是的",
+      "对",
+      "对的",
+      "嗯",
+      "好",
+      "好的",
+      "行",
+      "可以",
+      "没问题",
+      "不是",
+      "不",
+      "不行",
+      "不用",
+      "不要",
+      "没有",
+      "否"
+    ],
+    "user_speech_timeout_sec": 0.8
+  },
+  "agent": {
+    "system_prompt": "You are a helpful, friendly voice assistant. Keep responses concise and natural for spoken conversation.",
+    "greeting": "Please introduce yourself briefly.",
+    "greeting_mode": "generated"
+  },
+  "services": {
+    "stt": {
+      "provider": "openai",
+      "api_key": "",
+      "base_url": null,
+      "model": "gpt-4o-mini-transcribe",
+      "language": "en"
+    },
+    "llm": {
+      "provider": "openai",
+      "api_key": "",
+      "base_url": null,
+      "model": "gpt-4o-mini",
+      "temperature": 0.7
+    },
+    "tts": {
+      "provider": "openai",
+      "api_key": "",
+      "base_url": null,
+      "model": "gpt-4o-mini-tts",
+      "voice": "alloy"
+    }
+  }
+}
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,7 @@
 fastapi>=0.104.0
-uvicorn>=0.24.0
+uvicorn[standard]>=0.24.0
+pipecat-ai[websocket,openai,silero]
+websockets>=13.1,<16.0
 pydantic>=2.4.2
 python-dotenv>=1.0.0
 httpx>=0.25.0
@@ -12,7 +14,7 @@ pydantic-settings==2.1.0
 python-multipart==0.0.6
 python-jose[cryptography]==3.3.0
 passlib[bcrypt]==1.7.4
-openai==1.55.3
+openai>=1.74.0,<3
 loguru>=0.7.0
 pandas
 requests
--- a/src/main.py
+++ b/src/main.py
@@ -1,8 +1,8 @@
 from fastapi import FastAPI
-import sys
 from .api.endpoints import router as api_router
 from .core.fastgpt_client import lifespan
 from .core.logging_config import setup_logging
+from .voice.routes import register_voice

 # Setup logging first
 setup_logging()
@@ -18,4 +18,5 @@ app = FastAPI(
 def read_root():
    return {"message": "Server is running."}

-app.include_router(api_router)
+app.include_router(api_router)
+register_voice(app)
--- a/src/voice/init.py
+++ b/src/voice/init.py
@@ -0,0 +1 @@
+"""Voice websocket demo (product-ws / va.ws.v1) powered by Pipecat."""
--- a/src/voice/config.py
+++ b/src/voice/config.py
@@ -0,0 +1,202 @@
+from __future__ import annotations
+
+import json
+from dataclasses import dataclass, field
+from pathlib import Path
+
+PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
+DEFAULT_VOICE_CONFIG = PROJECT_ROOT / "config" / "voice.json"
+
+
+@dataclass(frozen=True)
+class ServerConfig:
+    host: str = "0.0.0.0"
+    port: int = 8000
+    cors_origins: list[str] = field(default_factory=list)
+    serve_webpage: bool = True
+    webpage_mount: str = "/voice-demo"
+
+
+@dataclass(frozen=True)
+class AudioConfig:
+    sample_rate_hz: int = 16000
+    channels: int = 1
+    frame_ms: int = 20
+
+    @property
+    def frame_bytes(self) -> int:
+        return int(self.sample_rate_hz * self.frame_ms / 1000) * self.channels * 2
+
+
+@dataclass(frozen=True)
+class SessionConfig:
+    inactivity_timeout_sec: int = 60
+
+
+@dataclass(frozen=True)
+class VADConfig:
+    confidence: float = 0.7
+    start_secs: float = 0.2
+    stop_secs: float = 0.6
+    min_volume: float = 0.6
+
+
+@dataclass(frozen=True)
+class TurnConfig:
+    vad: VADConfig = field(default_factory=VADConfig)
+    user_speech_timeout_sec: float = 1.0
+    interruption_min_chars: int = 3
+    interruption_use_interim: bool = True
+    interruption_short_replies: list[str] = field(
+        default_factory=lambda: [
+            "是",
+            "是的",
+            "对",
+            "对的",
+            "嗯",
+            "好",
+            "好的",
+            "行",
+            "可以",
+            "没问题",
+            "不是",
+            "不",
+            "不行",
+            "不用",
+            "不要",
+            "没有",
+            "否",
+            "no",
+            "yes",
+            "ok",
+            "okay",
+        ]
+    )
+
+
+@dataclass(frozen=True)
+class AgentConfig:
+    system_prompt: str = "You are a helpful, friendly voice assistant."
+    greeting: str | None = None
+    greeting_mode: str = "generated"
+
+
+@dataclass(frozen=True)
+class LLMConfig:
+    provider: str = "openai"
+    api_key: str = ""
+    base_url: str | None = None
+    model: str = "gpt-4o-mini"
+    temperature: float | None = 0.7
+
+
+@dataclass(frozen=True)
+class STTConfig:
+    provider: str = "openai"
+    app_id: str = ""
+    api_key: str = ""
+    api_secret: str = ""
+    base_url: str | None = None
+    model: str = "gpt-4o-mini-transcribe"
+    language: str | None = "en"
+    domain: str = "iat"
+    accent: str = "mandarin"
+    encoding: str = "raw"
+    frame_size: int = 1280
+    timeout_sec: float = 10.0
+    dynamic_correction: bool = False
+
+
+@dataclass(frozen=True)
+class TTSConfig:
+    provider: str = "openai"
+    app_id: str = ""
+    api_key: str = ""
+    api_secret: str = ""
+    base_url: str | None = None
+    model: str = "gpt-4o-mini-tts"
+    voice: str = "alloy"
+    aue: str = "raw"
+    tte: str = "UTF8"
+    speed: int = 50
+    volume: int = 50
+    pitch: int = 50
+    timeout_sec: float = 30.0
+    source_sample_rate_hz: int | None = None
+
+
+@dataclass(frozen=True)
+class ServicesConfig:
+    llm: LLMConfig = field(default_factory=LLMConfig)
+    stt: STTConfig = field(default_factory=STTConfig)
+    tts: TTSConfig = field(default_factory=TTSConfig)
+
+
+@dataclass(frozen=True)
+class EngineConfig:
+    server: ServerConfig = field(default_factory=ServerConfig)
+    audio: AudioConfig = field(default_factory=AudioConfig)
+    session: SessionConfig = field(default_factory=SessionConfig)
+    turn: TurnConfig = field(default_factory=TurnConfig)
+    agent: AgentConfig = field(default_factory=AgentConfig)
+    services: ServicesConfig = field(default_factory=ServicesConfig)
+
+
+def load_config(path: str | Path | None = None) -> EngineConfig:
+    config_path = Path(path) if path is not None else DEFAULT_VOICE_CONFIG
+    if not config_path.is_absolute():
+        config_path = PROJECT_ROOT / config_path
+    data = json.loads(config_path.read_text(encoding="utf-8"))
+    if not isinstance(data, dict):
+        raise ValueError(f"Config file must contain a JSON object: {config_path}")
+    return config_from_dict(data)
+
+
+def config_from_dict(data: dict) -> EngineConfig:
+    services = _dict(data.get("services"))
+    agent = _dict(data.get("agent"))
+    if agent.get("greeting") == "":
+        agent["greeting"] = None
+    if agent.get("greeting_mode") not in (None, "generated", "fixed", "off"):
+        raise ValueError("agent.greeting_mode must be one of: generated, fixed, off")
+
+    stt = _dict(services.get("stt") or services.get("asr"))
+    if stt.get("language") == "":
+        stt["language"] = None
+
+    turn = _dict(data.get("turn"))
+    vad = _dict(turn.get("vad"))
+
+    return EngineConfig(
+        server=ServerConfig(**_dict(data.get("server"))),
+        audio=AudioConfig(**_dict(data.get("audio"))),
+        session=SessionConfig(**_dict(data.get("session"))),
+        turn=TurnConfig(
+            vad=VADConfig(**vad),
+            user_speech_timeout_sec=float(
+                turn.get("user_speech_timeout_sec", TurnConfig().user_speech_timeout_sec)
+            ),
+            interruption_min_chars=int(
+                turn.get("interruption_min_chars", TurnConfig().interruption_min_chars)
+            ),
+            interruption_use_interim=bool(
+                turn.get("interruption_use_interim", TurnConfig().interruption_use_interim)
+            ),
+            interruption_short_replies=list(
+                turn.get(
+                    "interruption_short_replies",
+                    TurnConfig().interruption_short_replies,
+                )
+            ),
+        ),
+        agent=AgentConfig(**agent),
+        services=ServicesConfig(
+            llm=LLMConfig(**_dict(services.get("llm"))),
+            stt=STTConfig(**stt),
+            tts=TTSConfig(**_dict(services.get("tts"))),
+        ),
+    )
+
+
+def _dict(value: object) -> dict:
+    return dict(value) if isinstance(value, dict) else {}
--- a/src/voice/pipeline.py
+++ b/src/voice/pipeline.py
@@ -0,0 +1,179 @@
+from __future__ import annotations
+
+from loguru import logger
+
+from pipecat.audio.vad.silero import SileroVADAnalyzer
+from pipecat.audio.vad.vad_analyzer import VADParams
+from pipecat.frames.frames import (
+    LLMRunFrame,
+    OutputTransportMessageUrgentFrame,
+    TTSSpeakFrame,
+)
+from pipecat.pipeline.pipeline import Pipeline
+from pipecat.pipeline.runner import PipelineRunner
+from pipecat.pipeline.task import PipelineParams, PipelineTask
+from pipecat.processors.aggregators.llm_context import LLMContext
+from pipecat.processors.aggregators.llm_response_universal import (
+    AssistantTurnStoppedMessage,
+    LLMContextAggregatorPair,
+    LLMUserAggregatorParams,
+    UserTurnStoppedMessage,
+)
+from pipecat.serializers.base_serializer import FrameSerializer
+from pipecat.transports.websocket.fastapi import (
+    FastAPIWebsocketParams,
+    FastAPIWebsocketTransport,
+)
+from pipecat.turns.user_stop.speech_timeout_user_turn_stop_strategy import (
+    SpeechTimeoutUserTurnStopStrategy,
+)
+from pipecat.turns.user_turn_strategies import UserTurnStrategies
+
+from .config import EngineConfig
+from .protocol import ProductWebsocketSerializer
+from .services import create_llm_service, create_stt_service, create_tts_service
+from .text_input import ProductTextInputProcessor
+from .text_stream import ProductTextStreamProcessor
+from .transcript_stream import ProductTranscriptStreamProcessor
+from .turn_start import InterruptionGateUserTurnStartStrategy
+
+
+async def run_product_voice_pipeline(websocket, config: EngineConfig) -> None:
+    await run_pipeline_with_serializer(
+        websocket,
+        config,
+        serializer=ProductWebsocketSerializer(
+            sample_rate=config.audio.sample_rate_hz,
+            channels=config.audio.channels,
+        ),
+        client_label="Product JSON",
+    )
+
+
+async def run_pipeline_with_serializer(
+    websocket,
+    config: EngineConfig,
+    *,
+    serializer: FrameSerializer,
+    client_label: str,
+) -> None:
+    transport = FastAPIWebsocketTransport(
+        websocket=websocket,
+        params=FastAPIWebsocketParams(
+            audio_in_enabled=True,
+            audio_out_enabled=True,
+            audio_in_sample_rate=config.audio.sample_rate_hz,
+            audio_out_sample_rate=config.audio.sample_rate_hz,
+            audio_in_channels=config.audio.channels,
+            audio_out_channels=config.audio.channels,
+            serializer=serializer,
+            session_timeout=None,
+        ),
+    )
+
+    stt = create_stt_service(config.services.stt, config.audio)
+    llm = create_llm_service(config.services.llm)
+    tts = create_tts_service(config.services.tts, config.audio)
+
+    messages = [{"role": "system", "content": config.agent.system_prompt}]
+    if config.agent.greeting and config.agent.greeting_mode == "generated":
+        messages.append({"role": "system", "content": config.agent.greeting})
+
+    context = LLMContext(messages)
+
+    vad_params = VADParams(
+        confidence=config.turn.vad.confidence,
+        start_secs=config.turn.vad.start_secs,
+        stop_secs=config.turn.vad.stop_secs,
+        min_volume=config.turn.vad.min_volume,
+    )
+    user_turn_strategies = UserTurnStrategies(
+        start=[
+            InterruptionGateUserTurnStartStrategy(
+                min_chars_when_bot_speaking=config.turn.interruption_min_chars,
+                allowed_short_replies=config.turn.interruption_short_replies,
+                use_interim=config.turn.interruption_use_interim,
+            ),
+        ],
+        stop=[
+            SpeechTimeoutUserTurnStopStrategy(
+                user_speech_timeout=config.turn.user_speech_timeout_sec,
+            ),
+        ],
+    )
+    user_aggregator, assistant_aggregator = LLMContextAggregatorPair(
+        context,
+        user_params=LLMUserAggregatorParams(
+            vad_analyzer=SileroVADAnalyzer(params=vad_params),
+            user_turn_strategies=user_turn_strategies,
+        ),
+    )
+
+    pipeline = Pipeline(
+        [
+            transport.input(),
+            ProductTextInputProcessor(),
+            stt,
+            ProductTranscriptStreamProcessor(),
+            user_aggregator,
+            llm,
+            ProductTextStreamProcessor(),
+            tts,
+            transport.output(),
+            assistant_aggregator,
+        ]
+    )
+
+    task = PipelineTask(
+        pipeline,
+        params=PipelineParams(
+            audio_in_sample_rate=config.audio.sample_rate_hz,
+            audio_out_sample_rate=config.audio.sample_rate_hz,
+            enable_metrics=True,
+            enable_usage_metrics=True,
+            enable_heartbeats=True,
+        ),
+        idle_timeout_secs=config.session.inactivity_timeout_sec,
+    )
+
+    @transport.event_handler("on_client_connected")
+    async def on_client_connected(_transport, _client):
+        logger.info(f"{client_label} websocket client connected")
+        if config.agent.greeting_mode == "fixed" and config.agent.greeting:
+            await task.queue_frames([TTSSpeakFrame(config.agent.greeting)])
+        elif config.agent.greeting_mode == "generated":
+            await task.queue_frames([LLMRunFrame()])
+
+    @transport.event_handler("on_client_disconnected")
+    async def on_client_disconnected(_transport, _client):
+        logger.info(f"{client_label} websocket client disconnected")
+        await task.cancel()
+
+    @transport.event_handler("on_session_timeout")
+    async def on_session_timeout(_transport, _client):
+        logger.info(f"{client_label} websocket session timed out")
+        await task.cancel()
+
+    @user_aggregator.event_handler("on_user_turn_stopped")
+    async def on_user_turn_stopped(_aggregator, _strategy, message: UserTurnStoppedMessage):
+        logger.info(f"User: {message.content}")
+        text = (message.content or "").strip()
+        if not text:
+            return
+        await task.queue_frame(
+            OutputTransportMessageUrgentFrame(
+                message={
+                    "type": "input.transcript.final",
+                    "text": text,
+                    "user_id": message.user_id,
+                    "timestamp": message.timestamp,
+                }
+            )
+        )
+
+    @assistant_aggregator.event_handler("on_assistant_turn_stopped")
+    async def on_assistant_turn_stopped(_aggregator, message: AssistantTurnStoppedMessage):
+        logger.info(f"Assistant: {message.content}")
+
+    runner = PipelineRunner(handle_sigint=False)
+    await runner.run(task)
--- a/src/voice/protocol.py
+++ b/src/voice/protocol.py
@@ -0,0 +1,160 @@
+from __future__ import annotations
+
+import base64
+import json
+from typing import Any
+
+from loguru import logger
+
+from pipecat.frames.frames import (
+    CancelFrame,
+    BotStartedSpeakingFrame,
+    BotStoppedSpeakingFrame,
+    EndFrame,
+    Frame,
+    InputAudioRawFrame,
+    InputTransportMessageFrame,
+    OutputAudioRawFrame,
+    OutputTransportMessageFrame,
+    OutputTransportMessageUrgentFrame,
+    TextFrame,
+    TranscriptionFrame,
+)
+from pipecat.serializers.base_serializer import FrameSerializer
+
+
+class ProductWebsocketSerializer(FrameSerializer):
+    """Stable app-facing JSON/base64 protocol adapter for Pipecat websocket transport."""
+
+    protocol = "va.ws.v1"
+
+    def __init__(self, *, sample_rate: int, channels: int):
+        super().__init__()
+        self._sample_rate = sample_rate
+        self._channels = channels
+        self._sequence = 0
+
+    async def serialize(self, frame: Frame) -> str | bytes | None:
+        if isinstance(frame, OutputAudioRawFrame):
+            return self._event(
+                "response.audio.delta",
+                audio=base64.b64encode(frame.audio).decode("ascii"),
+                bytes=len(frame.audio),
+                sample_rate=frame.sample_rate,
+                channels=frame.num_channels,
+            )
+
+        if isinstance(frame, BotStartedSpeakingFrame):
+            return self._event("response.audio.started")
+
+        if isinstance(frame, BotStoppedSpeakingFrame):
+            return self._event("response.audio.stopped")
+
+        if isinstance(frame, TranscriptionFrame):
+            return self._event(
+                "input.transcript.final",
+                text=frame.text,
+                user_id=frame.user_id,
+                timestamp=frame.timestamp,
+            )
+
+        if isinstance(frame, TextFrame):
+            return self._event("response.text.delta", text=frame.text)
+
+        if isinstance(frame, (OutputTransportMessageFrame, OutputTransportMessageUrgentFrame)):
+            if self.should_ignore_frame(frame):
+                return None
+            message = frame.message
+            if isinstance(message, dict) and isinstance(message.get("type"), str):
+                event_type = message["type"]
+                payload = {k: v for k, v in message.items() if k != "type"}
+                return self._event(event_type, **payload)
+            return self._event("transport.message", message=message)
+
+        return None
+
+    async def deserialize(self, data: str | bytes) -> Frame | None:
+        if isinstance(data, bytes):
+            return InputAudioRawFrame(
+                audio=data,
+                sample_rate=self._sample_rate,
+                num_channels=self._channels,
+            )
+
+        try:
+            message = json.loads(data)
+        except json.JSONDecodeError as exc:
+            logger.warning(f"Invalid product websocket JSON: {exc}")
+            return None
+
+        if not isinstance(message, dict):
+            logger.warning("Product websocket message must be a JSON object")
+            return None
+
+        message_type = message.get("type")
+        if message_type == "session.start":
+            return InputTransportMessageFrame(
+                message={
+                    "type": "session.started",
+                    "protocol": self.protocol,
+                    "audio": {
+                        "encoding": "pcm_s16le",
+                        "sample_rate": self._sample_rate,
+                        "channels": self._channels,
+                    },
+                }
+            )
+
+        if message_type == "session.stop":
+            return EndFrame()
+
+        if message_type == "response.cancel":
+            return CancelFrame(reason="client_cancelled")
+
+        if message_type == "input.audio":
+            audio = message.get("audio") or message.get("data")
+            if not isinstance(audio, str):
+                logger.warning("input.audio requires base64 'audio' or 'data'")
+                return None
+            try:
+                pcm = base64.b64decode(audio)
+            except ValueError as exc:
+                logger.warning(f"Invalid input.audio base64: {exc}")
+                return None
+            return InputAudioRawFrame(
+                audio=pcm,
+                sample_rate=int(message.get("sample_rate") or self._sample_rate),
+                num_channels=int(message.get("channels") or self._channels),
+            )
+
+        if message_type == "input.text":
+            text = message.get("text")
+            if not isinstance(text, str) or not text.strip():
+                logger.warning("input.text requires non-empty 'text'")
+                return None
+            return InputTransportMessageFrame(
+                message={
+                    "type": "input.text",
+                    "text": text,
+                    "interrupt": bool(message.get("interrupt", True)),
+                }
+            )
+
+        if message_type == "transport.message":
+            payload = message.get("message")
+            return InputTransportMessageFrame(message=payload if isinstance(payload, dict) else message)
+
+        logger.warning(f"Unsupported product websocket message type: {message_type!r}")
+        return None
+
+    def _event(self, event_type: str, **payload: Any) -> str:
+        self._sequence += 1
+        return json.dumps(
+            {
+                "type": event_type,
+                "protocol": self.protocol,
+                "seq": self._sequence,
+                **payload,
+            },
+            ensure_ascii=False,
+        )
--- a/src/voice/routes.py
+++ b/src/voice/routes.py
@@ -0,0 +1,92 @@
+from __future__ import annotations
+
+from functools import lru_cache
+from pathlib import Path
+
+from fastapi import APIRouter, FastAPI, WebSocket
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.staticfiles import StaticFiles
+from loguru import logger
+
+from .config import DEFAULT_VOICE_CONFIG, EngineConfig, load_config
+from .pipeline import run_product_voice_pipeline
+
+PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
+VOICE_DEMO_DIR = PROJECT_ROOT / "static" / "voice-demo"
+
+router = APIRouter(tags=["voice"])
+
+
+@lru_cache(maxsize=1)
+def get_voice_config() -> EngineConfig:
+    return load_config(DEFAULT_VOICE_CONFIG)
+
+
+def _normalize_mount_path(path: str) -> str:
+    normalized = path.strip() or "/voice-demo"
+    if not normalized.startswith("/"):
+        normalized = f"/{normalized}"
+    return normalized.rstrip("/") or "/"
+
+
+@router.get("/voice/health")
+async def voice_health() -> dict[str, object]:
+    config = get_voice_config()
+    mount = (
+        _normalize_mount_path(config.server.webpage_mount)
+        if config.server.serve_webpage
+        else None
+    )
+    return {
+        "status": "healthy",
+        "protocols": {
+            "/ws-product": "va.ws.v1.json_base64",
+        },
+        "features": {
+            "product_text_input": True,
+            "product_text_interrupt": True,
+        },
+        "demo": mount,
+        "llm_provider": config.services.llm.provider,
+        "stt_provider": config.services.stt.provider,
+        "tts_provider": config.services.tts.provider,
+    }
+
+
+@router.websocket("/ws-product")
+async def product_websocket_endpoint(websocket: WebSocket) -> None:
+    await websocket.accept()
+    config = get_voice_config()
+    await run_product_voice_pipeline(websocket, config)
+
+
+def register_voice(app: FastAPI) -> None:
+    """Mount voice websocket routes and optional browser demo static files."""
+    if not DEFAULT_VOICE_CONFIG.exists():
+        logger.warning(f"Voice config not found at {DEFAULT_VOICE_CONFIG}; voice demo disabled")
+        return
+
+    config = get_voice_config()
+    app.include_router(router)
+
+    if config.server.cors_origins:
+        app.add_middleware(
+            CORSMiddleware,
+            allow_origins=config.server.cors_origins,
+            allow_credentials=True,
+            allow_methods=["*"],
+            allow_headers=["*"],
+        )
+
+    if config.server.serve_webpage and VOICE_DEMO_DIR.is_dir():
+        mount = _normalize_mount_path(config.server.webpage_mount)
+        app.mount(
+            mount,
+            StaticFiles(directory=str(VOICE_DEMO_DIR), html=True),
+            name="voice-demo",
+        )
+        logger.info(f"Voice demo mounted at {mount}")
+    else:
+        logger.info("Voice demo static page disabled or missing")
+
+    logger.info("Voice websocket registered at /ws-product")
--- a/src/voice/services.py
+++ b/src/voice/services.py
@@ -0,0 +1,165 @@
+from __future__ import annotations
+
+from collections.abc import AsyncGenerator
+
+from openai import BadRequestError
+from openai import NOT_GIVEN
+
+from pipecat.frames.frames import ErrorFrame, Frame, TTSAudioRawFrame
+from pipecat.services.openai._constants import OPENAI_SAMPLE_RATE
+from pipecat.services.openai.llm import OpenAILLMService
+from pipecat.services.openai.stt import OpenAISTTService
+from pipecat.services.openai.tts import VALID_VOICES, OpenAITTSService
+from pipecat.transcriptions.language import Language
+
+from .config import AudioConfig, LLMConfig, STTConfig, TTSConfig
+from .xfyun_asr import DEFAULT_XFYUN_ASR_URL, XfyunASRService
+from .xfyun_tts import DEFAULT_XFYUN_TTS_URL, XfyunTTSService
+
+
+def create_stt_service(config: STTConfig, audio: AudioConfig | None = None):
+    if config.provider == "xfyun":
+        sample_rate = audio.sample_rate_hz if audio else 16000
+        return XfyunASRService(
+            app_id=config.app_id,
+            api_key=config.api_key or "",
+            api_secret=config.api_secret,
+            url=config.base_url or DEFAULT_XFYUN_ASR_URL,
+            language=config.language or "zh_cn",
+            domain=config.domain,
+            accent=config.accent,
+            sample_rate=sample_rate,
+            encoding=config.encoding,
+            frame_size=config.frame_size,
+            open_timeout=config.timeout_sec,
+            dynamic_correction=config.dynamic_correction,
+        )
+
+    _require_provider(config.provider, "openai", "stt")
+    return OpenAISTTService(
+        api_key=config.api_key or None,
+        base_url=config.base_url,
+        settings=OpenAISTTService.Settings(
+            model=config.model,
+            language=_language(config.language),
+        ),
+    )
+
+
+def create_llm_service(config: LLMConfig):
+    _require_provider(config.provider, "openai", "llm")
+    return OpenAILLMService(
+        api_key=config.api_key or None,
+        base_url=config.base_url,
+        settings=OpenAILLMService.Settings(
+            model=config.model,
+            temperature=config.temperature if config.temperature is not None else NOT_GIVEN,
+        ),
+    )
+
+
+def create_tts_service(config: TTSConfig, audio: AudioConfig):
+    if config.provider == "xfyun":
+        source_sample_rate = config.source_sample_rate_hz or audio.sample_rate_hz
+        if source_sample_rate not in (8000, 16000):
+            raise ValueError("Xfyun TTS source_sample_rate_hz must be 8000 or 16000")
+        return XfyunTTSService(
+            app_id=config.app_id,
+            api_key=config.api_key or "",
+            api_secret=config.api_secret,
+            voice=config.voice,
+            url=config.base_url or DEFAULT_XFYUN_TTS_URL,
+            sample_rate=audio.sample_rate_hz,
+            source_sample_rate=source_sample_rate,
+            encoding=config.aue,
+            text_encoding=config.tte,
+            speed=config.speed,
+            volume=config.volume,
+            pitch=config.pitch,
+            timeout=config.timeout_sec,
+        )
+
+    _require_provider(config.provider, "openai", "tts")
+    service_class = OpenAITTSService if config.voice in VALID_VOICES else OpenAICompatibleTTSService
+    return service_class(
+        api_key=config.api_key or None,
+        base_url=config.base_url,
+        sample_rate=audio.sample_rate_hz,
+        source_sample_rate=config.source_sample_rate_hz,
+        settings=OpenAITTSService.Settings(
+            model=config.model,
+            voice=config.voice,
+        ),
+    )
+
+
+class OpenAICompatibleTTSService(OpenAITTSService):
+    """OpenAI-compatible TTS service that permits provider-specific voice ids."""
+
+    def __init__(self, *, source_sample_rate: int | None = None, **kwargs):
+        super().__init__(**kwargs)
+        self._source_sample_rate = source_sample_rate or OPENAI_SAMPLE_RATE
+
+    async def run_tts(self, text: str, context_id: str) -> AsyncGenerator[Frame, None]:
+        voice = self._settings.voice
+        if not voice:
+            yield ErrorFrame(error="TTS voice must be specified")
+            return
+
+        try:
+            create_params = {
+                "input": text,
+                "model": self._settings.model,
+                "voice": voice,
+                "response_format": "pcm",
+            }
+
+            if self._settings.instructions:
+                create_params["instructions"] = self._settings.instructions
+
+            if self._settings.speed:
+                create_params["speed"] = self._settings.speed
+
+            async with self._client.audio.speech.with_streaming_response.create(
+                **create_params
+            ) as response:
+                if response.status_code != 200:
+                    error = await response.text()
+                    yield ErrorFrame(
+                        error=f"TTS request failed (status: {response.status_code}, error: {error})"
+                    )
+                    return
+
+                await self.start_tts_usage_metrics(text)
+
+                async def audio_chunks():
+                    async for chunk in response.iter_bytes(self.chunk_size):
+                        if chunk:
+                            yield chunk
+
+                first_frame = True
+                async for frame in self._stream_audio_frames_from_iterator(
+                    audio_chunks(),
+                    in_sample_rate=self._source_sample_rate,
+                    context_id=context_id,
+                ):
+                    if first_frame:
+                        await self.stop_ttfb_metrics()
+                        first_frame = False
+                    yield frame
+        except BadRequestError as exc:
+            yield ErrorFrame(error=f"TTS request failed: {exc}")
+        except Exception as exc:
+            yield ErrorFrame(error=f"TTS request failed: {exc}")
+
+
+def _require_provider(actual: str, expected: str, service_name: str) -> None:
+    if actual != expected:
+        raise ValueError(f"Unsupported {service_name} provider {actual!r}; expected {expected!r}")
+
+
+def _language(value: str | None) -> Language | None:
+    if value is None:
+        return None
+    normalized = value.replace("-", "_").upper()
+    return getattr(Language, normalized, value)
--- a/src/voice/text_input.py
+++ b/src/voice/text_input.py
@@ -0,0 +1,38 @@
+from __future__ import annotations
+
+from loguru import logger
+
+from pipecat.frames.frames import Frame, InputTransportMessageFrame, LLMMessagesAppendFrame
+from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
+
+
+class ProductTextInputProcessor(FrameProcessor):
+    """Converts product text-input transport messages into LLM turns."""
+
+    async def process_frame(self, frame: Frame, direction: FrameDirection):
+        await super().process_frame(frame, direction)
+
+        if not isinstance(frame, InputTransportMessageFrame):
+            await self.push_frame(frame, direction)
+            return
+
+        message = frame.message
+        if not isinstance(message, dict) or message.get("type") != "input.text":
+            await self.push_frame(frame, direction)
+            return
+
+        text = str(message.get("text") or "").strip()
+        if not text:
+            return
+
+        if message.get("interrupt", True):
+            logger.info("Text input interrupting current response")
+            await self.broadcast_interruption()
+
+        await self.push_frame(
+            LLMMessagesAppendFrame(
+                messages=[{"role": "user", "content": text}],
+                run_llm=True,
+            ),
+            FrameDirection.DOWNSTREAM,
+        )
--- a/src/voice/text_stream.py
+++ b/src/voice/text_stream.py
@@ -0,0 +1,75 @@
+from __future__ import annotations
+
+from pipecat.frames.frames import (
+    Frame,
+    InterruptionFrame,
+    LLMFullResponseEndFrame,
+    LLMFullResponseStartFrame,
+    LLMTextFrame,
+    OutputTransportMessageUrgentFrame,
+    TTSSpeakFrame,
+)
+from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
+
+
+class ProductTextStreamProcessor(FrameProcessor):
+    """Mirrors LLM text frames as streaming protocol events."""
+
+    def __init__(self) -> None:
+        super().__init__()
+        self._aggregation: list[str] = []
+        self._turn_active = False
+
+    async def process_frame(self, frame: Frame, direction: FrameDirection) -> None:
+        await super().process_frame(frame, direction)
+
+        if isinstance(frame, LLMFullResponseStartFrame):
+            await self._start_turn()
+        elif isinstance(frame, LLMTextFrame):
+            if frame.text:
+                await self._delta(frame.text)
+        elif isinstance(frame, LLMFullResponseEndFrame):
+            await self._end_turn(interrupted=False)
+        elif isinstance(frame, InterruptionFrame):
+            await self._end_turn(interrupted=True)
+        elif isinstance(frame, TTSSpeakFrame):
+            text = frame.text or ""
+            await self._start_turn()
+            if text:
+                await self._delta(text)
+            await self._end_turn(interrupted=False)
+
+        await self.push_frame(frame, direction)
+
+    async def _start_turn(self) -> None:
+        if self._turn_active:
+            return
+        self._turn_active = True
+        self._aggregation = []
+        await self._emit("response.text.started")
+
+    async def _delta(self, text: str) -> None:
+        if not self._turn_active:
+            await self._start_turn()
+        self._aggregation.append(text)
+        await self._emit("response.text.delta", text=text)
+
+    async def _end_turn(self, *, interrupted: bool) -> None:
+        if not self._turn_active:
+            return
+        full_text = "".join(self._aggregation)
+        self._turn_active = False
+        self._aggregation = []
+        await self._emit(
+            "response.text.final",
+            text=full_text,
+            interrupted=interrupted,
+        )
+
+    async def _emit(self, event_type: str, **payload: object) -> None:
+        await self.push_frame(
+            OutputTransportMessageUrgentFrame(
+                message={"type": event_type, **payload},
+            ),
+            FrameDirection.DOWNSTREAM,
+        )
--- a/src/voice/transcript_stream.py
+++ b/src/voice/transcript_stream.py
@@ -0,0 +1,30 @@
+from __future__ import annotations
+
+from pipecat.frames.frames import (
+    Frame,
+    InterimTranscriptionFrame,
+    OutputTransportMessageUrgentFrame,
+)
+from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
+
+
+class ProductTranscriptStreamProcessor(FrameProcessor):
+    """Mirrors interim STT frames to the product websocket protocol."""
+
+    async def process_frame(self, frame: Frame, direction: FrameDirection) -> None:
+        await super().process_frame(frame, direction)
+
+        if isinstance(frame, InterimTranscriptionFrame):
+            await self.push_frame(
+                OutputTransportMessageUrgentFrame(
+                    message={
+                        "type": "input.transcript.interim",
+                        "text": frame.text,
+                        "user_id": frame.user_id,
+                        "timestamp": frame.timestamp,
+                    }
+                ),
+                FrameDirection.DOWNSTREAM,
+            )
+
+        await self.push_frame(frame, direction)
--- a/src/voice/turn_start.py
+++ b/src/voice/turn_start.py
@@ -0,0 +1,86 @@
+from __future__ import annotations
+
+import re
+
+from loguru import logger
+from pipecat.frames.frames import (
+    BotStartedSpeakingFrame,
+    BotStoppedSpeakingFrame,
+    Frame,
+    InterimTranscriptionFrame,
+    TranscriptionFrame,
+)
+from pipecat.turns.types import ProcessFrameResult
+from pipecat.turns.user_start.base_user_turn_start_strategy import BaseUserTurnStartStrategy
+
+
+_COUNTABLE_TEXT_RE = re.compile(r"[\w\u4e00-\u9fff]", re.UNICODE)
+
+
+class InterruptionGateUserTurnStartStrategy(BaseUserTurnStartStrategy):
+    """Starts user turns only after likely intentional speech."""
+
+    def __init__(
+        self,
+        *,
+        min_chars_when_bot_speaking: int,
+        allowed_short_replies: list[str],
+        use_interim: bool = True,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self._min_chars_when_bot_speaking = min_chars_when_bot_speaking
+        self._allowed_short_replies = {
+            self._normalize_text(reply) for reply in allowed_short_replies if reply.strip()
+        }
+        self._use_interim = use_interim
+        self._bot_speaking = False
+
+    async def reset(self):
+        await super().reset()
+
+    async def process_frame(self, frame: Frame) -> ProcessFrameResult:
+        if isinstance(frame, BotStartedSpeakingFrame):
+            self._bot_speaking = True
+            return ProcessFrameResult.CONTINUE
+        if isinstance(frame, BotStoppedSpeakingFrame):
+            self._bot_speaking = False
+            return ProcessFrameResult.CONTINUE
+        if isinstance(frame, InterimTranscriptionFrame) and self._use_interim:
+            return await self._handle_transcription(frame.text, interim=True)
+        if isinstance(frame, TranscriptionFrame):
+            return await self._handle_transcription(frame.text, interim=False)
+
+        return ProcessFrameResult.CONTINUE
+
+    async def _handle_transcription(self, text: str, *, interim: bool) -> ProcessFrameResult:
+        normalized = self._normalize_text(text)
+        if not normalized:
+            return ProcessFrameResult.CONTINUE
+
+        if not self._bot_speaking:
+            await self.trigger_user_turn_started()
+            return ProcessFrameResult.STOP
+
+        should_interrupt = self._should_interrupt(normalized)
+        logger.debug(
+            f"{self} interruption_gate text={text!r} normalized={normalized!r} "
+            f"should_interrupt={should_interrupt} interim={interim}"
+        )
+
+        if should_interrupt:
+            await self.trigger_user_turn_started()
+            return ProcessFrameResult.STOP
+
+        await self.trigger_reset_aggregation()
+        return ProcessFrameResult.CONTINUE
+
+    def _should_interrupt(self, normalized: str) -> bool:
+        return (
+            normalized in self._allowed_short_replies
+            or len(normalized) >= self._min_chars_when_bot_speaking
+        )
+
+    @staticmethod
+    def _normalize_text(text: str) -> str:
+        return "".join(_COUNTABLE_TEXT_RE.findall(text.lower()))
--- a/src/voice/xfyun_asr.py
+++ b/src/voice/xfyun_asr.py
@@ -0,0 +1,353 @@
+from __future__ import annotations
+
+import asyncio
+import base64
+import hashlib
+import hmac
+import json
+import os
+from collections.abc import AsyncGenerator
+from datetime import datetime, timezone
+from email.utils import format_datetime
+from typing import Any
+from urllib.parse import urlencode, urlparse
+
+from loguru import logger
+
+from pipecat.frames.frames import (
+    CancelFrame,
+    EndFrame,
+    Frame,
+    InterimTranscriptionFrame,
+    TranscriptionFrame,
+    UserStoppedSpeakingFrame,
+    VADUserStartedSpeakingFrame,
+)
+from pipecat.processors.frame_processor import FrameDirection
+from pipecat.services.settings import STTSettings
+from pipecat.services.stt_service import STTService
+from pipecat.transcriptions.language import Language
+from pipecat.utils.time import time_now_iso8601
+from websockets.asyncio.client import connect as websocket_connect
+from websockets.protocol import State
+
+
+DEFAULT_XFYUN_ASR_URL = "wss://iat-api.xfyun.cn/v2/iat"
+
+
+class XfyunASRService(STTService):
+    """iFlytek/Xfyun streaming voice dictation service for Pipecat."""
+
+    def __init__(
+        self,
+        *,
+        app_id: str,
+        api_key: str,
+        api_secret: str,
+        url: str | None = None,
+        language: str = "zh_cn",
+        domain: str = "iat",
+        accent: str = "mandarin",
+        sample_rate: int = 16000,
+        encoding: str = "raw",
+        frame_size: int = 1280,
+        open_timeout: float = 10.0,
+        dynamic_correction: bool = False,
+        **kwargs,
+    ) -> None:
+        super().__init__(
+            sample_rate=sample_rate,
+            settings=STTSettings(model=None, language=language),
+            **kwargs,
+        )
+        self._app_id = app_id or os.environ.get("XFYUN_APP_ID", "")
+        self._api_key = api_key or os.environ.get("XFYUN_API_KEY", "")
+        self._api_secret = api_secret or os.environ.get("XFYUN_API_SECRET", "")
+        self._url = url or DEFAULT_XFYUN_ASR_URL
+        self._language = language
+        self._domain = domain
+        self._accent = accent
+        self._encoding = encoding
+        self._frame_size = frame_size
+        self._open_timeout = open_timeout
+        self._dynamic_correction = dynamic_correction
+
+        self._websocket = None
+        self._receive_task = None
+        self._audio_buffer = bytearray()
+        self._sent_first_frame = False
+        self._sent_final_frame = False
+        self._finalizing_turn = False
+        self._partials: list[str] = []
+        self._last_text = ""
+
+    async def cleanup(self) -> None:
+        await self._close_utterance()
+        await super().cleanup()
+
+    async def stop(self, frame: EndFrame) -> None:
+        await self._close_utterance()
+        await super().stop(frame)
+
+    async def cancel(self, frame: CancelFrame) -> None:
+        await self._close_utterance()
+        await super().cancel(frame)
+
+    async def process_frame(self, frame: Frame, direction: FrameDirection) -> None:
+        await super().process_frame(frame, direction)
+
+        if isinstance(frame, UserStoppedSpeakingFrame):
+            # Aggregator-level turn end (broadcast once per logical user turn).
+            # This is the only boundary that finalizes/closes the xfyun
+            # websocket, so brief VAD pauses do not restart the ASR session.
+            await self._finish_utterance()
+        elif isinstance(frame, VADUserStartedSpeakingFrame):
+            await self._start_utterance()
+
+    async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame | None, None]:
+        if not audio:
+            yield None
+            return
+
+        if not self._websocket or self._websocket.state is not State.OPEN:
+            await self._start_utterance()
+
+        self._audio_buffer.extend(audio)
+        await self._flush_audio_buffer(final=False)
+        yield None
+
+    async def _start_utterance(self) -> None:
+        if self._websocket and self._websocket.state is State.OPEN:
+            return
+
+        if not self._app_id or not self._api_key or not self._api_secret:
+            await self.push_error("Xfyun ASR requires app_id, api_key, and api_secret")
+            return
+
+        if self.sample_rate not in (8000, 16000):
+            await self.push_error("Xfyun ASR sample rate must be 8000 or 16000")
+            return
+
+        self._audio_buffer.clear()
+        self._partials = []
+        self._last_text = ""
+        self._sent_first_frame = False
+        self._sent_final_frame = False
+
+        auth_url = _build_auth_url(self._url, self._api_key, self._api_secret)
+        try:
+            self._websocket = await websocket_connect(
+                auth_url,
+                max_size=None,
+                open_timeout=self._open_timeout,
+            )
+        except Exception as exc:
+            await self.push_error(f"Xfyun ASR connection failed: {exc}", exception=exc)
+            self._websocket = None
+            return
+
+        self._receive_task = self.create_task(
+            self._receive_messages(),
+            name="xfyun_asr_receive",
+        )
+
+    async def _finish_utterance(self) -> None:
+        if not self._websocket or self._websocket.state is not State.OPEN:
+            return
+
+        await self._flush_audio_buffer(final=True)
+        if not self._sent_first_frame:
+            await self._close_utterance()
+            return
+
+        if not self._sent_final_frame:
+            self._finalizing_turn = True
+            await self._send_payload({"data": {"status": 2}})
+            self.request_finalize()
+            self._sent_final_frame = True
+
+    async def _close_utterance(self) -> None:
+        current_task = asyncio.current_task()
+        if self._receive_task and self._receive_task is not current_task:
+            await self.cancel_task(self._receive_task)
+            self._receive_task = None
+
+        websocket = self._websocket
+        self._websocket = None
+        if websocket and websocket.state is State.OPEN:
+            try:
+                await websocket.close()
+            except Exception:
+                pass
+
+        self._audio_buffer.clear()
+        self._sent_first_frame = False
+        self._sent_final_frame = False
+        self._finalizing_turn = False
+
+    async def _flush_audio_buffer(self, *, final: bool) -> None:
+        while len(self._audio_buffer) >= self._frame_size:
+            chunk = bytes(self._audio_buffer[: self._frame_size])
+            del self._audio_buffer[: self._frame_size]
+            await self._send_audio_chunk(chunk, status=1)
+
+        if final and self._audio_buffer:
+            chunk = bytes(self._audio_buffer)
+            self._audio_buffer.clear()
+            await self._send_audio_chunk(chunk, status=1)
+
+    async def _send_audio_chunk(self, audio: bytes, *, status: int) -> None:
+        if not audio:
+            return
+
+        if not self._sent_first_frame:
+            business = {
+                "language": self._language,
+                "domain": self._domain,
+                "accent": self._accent,
+            }
+            if self._dynamic_correction:
+                business["dwa"] = "wpgs"
+
+            payload = {
+                "common": {"app_id": self._app_id},
+                "business": business,
+                "data": {
+                    "status": 0,
+                    "format": f"audio/L16;rate={self.sample_rate}",
+                    "encoding": self._encoding,
+                    "audio": base64.b64encode(audio).decode("utf-8"),
+                },
+            }
+            self._sent_first_frame = True
+        else:
+            payload = {
+                "data": {
+                    "status": status,
+                    "format": f"audio/L16;rate={self.sample_rate}",
+                    "encoding": self._encoding,
+                    "audio": base64.b64encode(audio).decode("utf-8"),
+                }
+            }
+
+        await self._send_payload(payload)
+
+    async def _send_payload(self, payload: dict[str, Any]) -> None:
+        if not self._websocket or self._websocket.state is not State.OPEN:
+            return
+        await self._websocket.send(json.dumps(payload, ensure_ascii=False))
+
+    async def _receive_messages(self) -> None:
+        websocket = self._websocket
+        if not websocket:
+            return
+
+        try:
+            async for message in websocket:
+                await self._process_response(json.loads(message))
+        except Exception as exc:
+            if self._websocket is websocket:
+                await self.push_error(f"Xfyun ASR receive failed: {exc}", exception=exc)
+        finally:
+            if self._websocket is websocket:
+                self._websocket = None
+            self._receive_task = None
+
+    async def _process_response(self, payload: dict[str, Any]) -> None:
+        code = payload.get("code", -1)
+        if code != 0:
+            message = payload.get("message", "unknown error")
+            sid = payload.get("sid")
+            await self.push_error(f"Xfyun ASR error code={code}, sid={sid}, message={message}")
+            return
+
+        data = payload.get("data")
+        if not isinstance(data, dict):
+            return
+
+        is_final_response = data.get("status") == 2
+        recognition = data.get("result")
+        if isinstance(recognition, dict):
+            text = self._apply_recognition_result(recognition)
+            if text and text != self._last_text:
+                self._last_text = text
+                if not self._finalizing_turn and not is_final_response:
+                    await self.push_frame(
+                        InterimTranscriptionFrame(
+                            text,
+                            self._user_id,
+                            time_now_iso8601(),
+                            _language_or_none(self._language),
+                            result=payload,
+                        )
+                    )
+
+        if is_final_response:
+            final_text = self._last_text
+            if final_text:
+                self.confirm_finalize()
+                await self.push_frame(
+                    TranscriptionFrame(
+                        final_text,
+                        self._user_id,
+                        time_now_iso8601(),
+                        _language_or_none(self._language),
+                        result=payload,
+                    )
+                )
+            await self._close_utterance()
+
+    def _apply_recognition_result(self, recognition: dict[str, Any]) -> str:
+        partial = _extract_text_from_result(recognition)
+        if not partial:
+            return self._last_text
+
+        if self._dynamic_correction and recognition.get("pgs") == "rpl" and recognition.get("rg"):
+            start, end = recognition["rg"]
+            if 1 <= start <= len(self._partials):
+                self._partials[start - 1 : end] = [partial]
+            else:
+                logger.debug(f"Ignoring out-of-range Xfyun replacement rg={recognition['rg']}")
+        else:
+            self._partials.append(partial)
+
+        return "".join(self._partials)
+
+
+def _extract_text_from_result(result: dict[str, Any]) -> str:
+    words: list[str] = []
+    for item in result.get("ws", []):
+        for candidate in item.get("cw", []):
+            word = candidate.get("w")
+            if word:
+                words.append(word)
+    return "".join(words)
+
+
+def _build_auth_url(url: str, api_key: str, api_secret: str) -> str:
+    parsed = urlparse(url)
+    host = parsed.netloc
+    path = parsed.path or "/v2/iat"
+    date = format_datetime(datetime.now(timezone.utc), usegmt=True)
+    request_line = f"GET {path} HTTP/1.1"
+    signature_origin = f"host: {host}\ndate: {date}\n{request_line}"
+    signature_sha = hmac.new(
+        api_secret.encode("utf-8"),
+        signature_origin.encode("utf-8"),
+        digestmod=hashlib.sha256,
+    ).digest()
+    signature = base64.b64encode(signature_sha).decode("utf-8")
+    authorization_origin = (
+        f'api_key="{api_key}", algorithm="hmac-sha256", '
+        f'headers="host date request-line", signature="{signature}"'
+    )
+    authorization = base64.b64encode(authorization_origin.encode("utf-8")).decode("utf-8")
+    query = urlencode({"authorization": authorization, "date": date, "host": host})
+    return f"{url}?{query}"
+
+
+def _language_or_none(value: str) -> Language | None:
+    try:
+        return Language(value)
+    except ValueError:
+        return None
--- a/src/voice/xfyun_tts.py
+++ b/src/voice/xfyun_tts.py
@@ -0,0 +1,257 @@
+from __future__ import annotations
+
+import base64
+import hashlib
+import hmac
+import json
+import os
+import re
+import unicodedata
+from collections.abc import AsyncGenerator, AsyncIterator
+from datetime import datetime, timezone
+from email.utils import format_datetime
+from typing import Any
+from urllib.parse import urlencode, urlparse
+
+from loguru import logger
+
+from pipecat.frames.frames import ErrorFrame, Frame
+from pipecat.services.settings import TTSSettings
+from pipecat.services.tts_service import TTSService
+from websockets.asyncio.client import connect
+
+
+DEFAULT_XFYUN_TTS_URL = "wss://tts-api.xfyun.cn/v2/tts"
+
+# Strip characters Xfyun's online TTS cannot synthesize. The engine silently
+# rejects (or returns empty audio for) text containing emoji and other
+# non-BMP symbols, which surfaces as "request finished without audio data".
+_EMOJI_AND_SYMBOL_RE = re.compile(
+    "["
+    "\U0001F300-\U0001FAFF"  # misc pictographs, emoji, symbols, transport, etc.
+    "\U00002600-\U000027BF"  # misc symbols and dingbats
+    "\U0001F1E6-\U0001F1FF"  # regional indicators (flags)
+    "\uFE00-\uFE0F"           # variation selectors
+    "\u200D"                  # zero-width joiner
+    "]",
+    flags=re.UNICODE,
+)
+
+
+class XfyunTTSService(TTSService):
+    """iFlytek/Xfyun online TTS service for Pipecat.
+
+    Xfyun's API is not OpenAI-compatible. It uses a signed WebSocket URL,
+    receives one JSON request per synthesis, and streams text WebSocket
+    messages containing base64-encoded audio chunks. This service requests
+    raw PCM so the chunks can become Pipecat audio frames without MP3 decode.
+    """
+
+    def __init__(
+        self,
+        *,
+        app_id: str,
+        api_key: str,
+        api_secret: str,
+        voice: str,
+        url: str | None = None,
+        sample_rate: int = 16000,
+        source_sample_rate: int = 16000,
+        encoding: str = "raw",
+        text_encoding: str = "UTF8",
+        speed: int = 50,
+        volume: int = 50,
+        pitch: int = 50,
+        timeout: float = 30.0,
+        **kwargs,
+    ) -> None:
+        super().__init__(
+            sample_rate=sample_rate,
+            settings=TTSSettings(model=None, voice=voice, language=None),
+            **kwargs,
+        )
+        self._app_id = app_id or os.environ.get("XFYUN_APP_ID", "")
+        self._api_key = api_key or os.environ.get("XFYUN_API_KEY", "")
+        self._api_secret = api_secret or os.environ.get("XFYUN_API_SECRET", "")
+        self._voice = voice
+        self._url = url or DEFAULT_XFYUN_TTS_URL
+        self._source_sample_rate = source_sample_rate
+        self._encoding = encoding
+        self._text_encoding = text_encoding
+        self._speed = speed
+        self._volume = volume
+        self._pitch = pitch
+        self._timeout = timeout
+        self._last_failure_detail: str | None = None
+
+    async def run_tts(self, text: str, context_id: str) -> AsyncGenerator[Frame, None]:
+        if not text:
+            return
+
+        if not self._app_id or not self._api_key or not self._api_secret:
+            yield ErrorFrame(error="Xfyun TTS requires app_id, api_key, and api_secret")
+            return
+
+        sanitized = _sanitize_text_for_tts(text)
+        if not sanitized:
+            logger.debug(
+                f"{self}: skipping Xfyun TTS, text became empty after sanitization "
+                f"(original={text!r})"
+            )
+            return
+
+        if sanitized != text:
+            logger.debug(
+                f"{self}: sanitized Xfyun TTS text "
+                f"(original={text!r}, sanitized={sanitized!r})"
+            )
+
+        if len(sanitized.encode("utf-8")) >= 8000:
+            yield ErrorFrame(error="Xfyun TTS text must be less than 8000 UTF-8 bytes")
+            return
+
+        if self._encoding != "raw":
+            yield ErrorFrame(error="Xfyun TTS is configured for PCM output; set aue/encoding to raw")
+            return
+
+        try:
+            await self.start_tts_usage_metrics(sanitized)
+
+            first_frame = True
+            async for frame in self._stream_audio_frames_from_iterator(
+                self._iter_audio_chunks(sanitized),
+                in_sample_rate=self._source_sample_rate,
+                context_id=context_id,
+            ):
+                if first_frame:
+                    await self.stop_ttfb_metrics()
+                    first_frame = False
+                yield frame
+
+            if first_frame:
+                detail = self._last_failure_detail or "no audio frames received"
+                yield ErrorFrame(
+                    error=(
+                        f"Xfyun TTS request finished without audio data ({detail}); "
+                        f"text={sanitized!r}"
+                    )
+                )
+        except Exception as exc:
+            yield ErrorFrame(error=f"Xfyun TTS request failed: {exc}")
+
+    async def _iter_audio_chunks(self, text: str) -> AsyncIterator[bytes]:
+        request = self._build_request_frame(text)
+        auth_url = _build_auth_url(self._url, self._api_key, self._api_secret)
+
+        self._last_failure_detail = None
+        frames_received = 0
+        audio_bytes_received = 0
+        last_status: int | None = None
+        last_sid: str | None = None
+        saw_status_2 = False
+
+        async with connect(auth_url, max_size=None, open_timeout=self._timeout) as websocket:
+            await websocket.send(json.dumps(request, ensure_ascii=False))
+
+            async for raw_message in websocket:
+                frames_received += 1
+                payload = json.loads(raw_message)
+                code = payload.get("code", -1)
+                sid = payload.get("sid")
+                if sid:
+                    last_sid = sid
+                if code != 0:
+                    err_msg = payload.get("message", "unknown error")
+                    raise RuntimeError(f"code={code}, sid={sid}, message={err_msg}")
+
+                data = payload.get("data")
+                if not isinstance(data, dict):
+                    continue
+
+                last_status = data.get("status", last_status)
+
+                audio_b64 = data.get("audio")
+                if audio_b64:
+                    audio_bytes = base64.b64decode(audio_b64)
+                    audio_bytes_received += len(audio_bytes)
+                    yield audio_bytes
+
+                if data.get("status") == 2:
+                    saw_status_2 = True
+                    break
+
+        if audio_bytes_received == 0:
+            self._last_failure_detail = (
+                f"frames={frames_received}, audio_bytes=0, "
+                f"last_status={last_status}, saw_status_2={saw_status_2}, sid={last_sid}"
+            )
+            logger.warning(
+                f"{self}: Xfyun TTS produced no audio ({self._last_failure_detail})"
+            )
+
+    def _build_request_frame(self, text: str) -> dict[str, Any]:
+        business: dict[str, Any] = {
+            "aue": self._encoding,
+            "auf": f"audio/L16;rate={self._source_sample_rate}",
+            "vcn": self._voice,
+            "speed": self._speed,
+            "volume": self._volume,
+            "pitch": self._pitch,
+            "tte": self._text_encoding,
+        }
+
+        return {
+            "common": {"app_id": self._app_id},
+            "business": business,
+            "data": {
+                "status": 2,
+                "text": base64.b64encode(text.encode("utf-8")).decode("utf-8"),
+            },
+        }
+
+
+def _sanitize_text_for_tts(text: str) -> str:
+    """Strip characters Xfyun's online TTS cannot synthesize.
+
+    The Xfyun ``/v2/tts`` engine silently drops or rejects emoji, pictographs,
+    dingbats, regional-indicator flags, variation selectors, and zero-width
+    joiners.  When such characters appear in the input the synthesis can
+    finish without any audio data ("Xfyun TTS request finished without audio
+    data").  We also drop control characters (other than common whitespace)
+    and "Symbol, Other" codepoints, then collapse runs of whitespace.
+    """
+    if not text:
+        return text
+
+    cleaned = _EMOJI_AND_SYMBOL_RE.sub("", text)
+    filtered: list[str] = []
+    for ch in cleaned:
+        category = unicodedata.category(ch)
+        if category == "So":
+            continue
+        if category.startswith("C") and ch not in ("\n", "\r", "\t"):
+            continue
+        filtered.append(ch)
+    return re.sub(r"\s+", " ", "".join(filtered)).strip()
+
+
+def _build_auth_url(url: str, api_key: str, api_secret: str) -> str:
+    parsed = urlparse(url)
+    host = parsed.netloc
+    path = parsed.path or "/v2/tts"
+    date = format_datetime(datetime.now(timezone.utc), usegmt=True)
+    request_line = f"GET {path} HTTP/1.1"
+    signature_origin = f"host: {host}\ndate: {date}\n{request_line}"
+    signature_sha = hmac.new(
+        api_secret.encode("utf-8"),
+        signature_origin.encode("utf-8"),
+        digestmod=hashlib.sha256,
+    ).digest()
+    signature = base64.b64encode(signature_sha).decode("utf-8")
+    authorization_origin = (
+        f'api_key="{api_key}", algorithm="hmac-sha256", '
+        f'headers="host date request-line", signature="{signature}"'
+    )
+    authorization = base64.b64encode(authorization_origin.encode("utf-8")).decode("utf-8")
+    query = urlencode({"authorization": authorization, "date": date, "host": host})
+    return f"{url}?{query}"
--- a/static/voice-demo/README.md
+++ b/static/voice-demo/README.md
@@ -0,0 +1,106 @@
+# Webpage Example — Realtime Voice Chat
+
+A self-contained browser client for the engine's product websocket
+(`/ws-product`, protocol `va.ws.v1`).
+
+## Features
+
+- **Connect / Disconnect** to any `ws://` or `wss://` URL.
+- **Microphone selector + mic on/off toggle** — available input devices
+  are listed with `enumerateDevices`, and getUserMedia is requested with
+  `echoCancellation`, `noiseSuppression`, and `autoGainControl` so the
+  browser handles AEC against the bot's voice.
+- **Text composer** — type a message and press <kbd>Enter</kbd> to send
+  an `input.text` event (Shift+Enter for newline). Sending interrupts
+  any in-flight bot audio so the next reply is heard cleanly.
+- **Chat history** rendered from `input.transcript.final` (you, when
+  spoken), streamed `response.text.delta` / `response.text.final`
+  (assistant — deltas arrive ahead of the synthesized audio), and locally
+  for text you submit (the engine doesn't echo text input back as a
+  transcript).
+- **WebSocket log** panel for connection state and compact send/receive
+  events. Audio chunks are summarized so the UI does not flood.
+- **Gapless TTS playback** by scheduling each `response.audio.delta`
+  chunk back-to-back on the AudioContext.
+- **Live VU meter** + mic and bot activity indicators.
+- **Clear** button to reset history.
+
+No build step, no dependencies — just three files plus an AudioWorklet.
+
+## Layout
+
+```text
+examples/webpage/
+├── index.html
+├── styles.css
+├── app.js
+└── pcm-recorder.worklet.js
+```
+
+## Run
+
+1. Start the engine (default port `8000`):
+
+   ```bash
+   cd AI-VideoAssistant-engine-v5-pipecat-minimal
+   source .venv/bin/activate
+   export OPENAI_API_KEY=...
+   uvicorn engine.main:app --host 127.0.0.1 --port 8000
+   ```
+
+2. Open the demo page served by the same process:
+
+   ```text
+   http://127.0.0.1:8000/demo/
+   ```
+
+   The default websocket URL is derived from the page host
+   (`ws://127.0.0.1:8000/ws-product`). Click **Connect**, pick a
+   microphone if needed, click **Enable mic**, and start speaking.
+
+   Mount path and on/off are controlled in `config.json`:
+
+   ```json
+   "server": {
+     "serve_webpage": true,
+     "webpage_mount": "/demo"
+   }
+   ```
+
+   Set `"serve_webpage": false` in production if you serve the UI elsewhere.
+
+### Standalone static server (optional)
+
+You can still serve the files from another port for UI-only iteration.
+Add that origin to `server.cors_origins` in `config.json` if needed:
+
+```bash
+cd AI-VideoAssistant-engine-v5-pipecat-minimal/examples/webpage
+python -m http.server 8080
+```
+
+Then open <http://localhost:8080> and point the URL field at
+`ws://127.0.0.1:8000/ws-product`.
+
+> The browser's mic API requires a secure context. `http://localhost`
+> qualifies; if you serve from another host, use HTTPS and a `wss://`
+> URL.
+
+## Audio details
+
+- Input: mono Float32 from `getUserMedia` is resampled in the
+  AudioWorklet to PCM16 mono @ 16 kHz, framed into 20 ms chunks, and
+  sent as **binary** websocket messages (the server accepts either
+  binary or the JSON+base64 form).
+- Output: each `response.audio.delta` carries base64-encoded PCM16 @
+  16 kHz; chunks are decoded and scheduled back-to-back through Web
+  Audio. The browser handles resampling to the device rate.
+
+## Notes
+
+- Use headphones if you still hear echo despite browser AEC; the bot's
+  voice leaking back into the open mic is the most common cause of
+  feedback loops.
+- The engine's session has an inactivity timeout
+  (`session.inactivity_timeout_sec` in `config.json`). If the bot
+  doesn't respond after a long silence, reconnect.
--- a/static/voice-demo/app.js
+++ b/static/voice-demo/app.js
@@ -0,0 +1,899 @@
+/**
+ * Minimal browser client for the AI VideoAssistant engine's product
+ * websocket (`/ws-product`, protocol `va.ws.v1`).
+ *
+ * Responsibilities:
+ *   - Open/close the websocket and run the session handshake.
+ *   - List/select microphones and capture mic audio with browser AEC enabled.
+ *   - Downsample to PCM16 mono @ 16 kHz in an AudioWorklet and stream frames
+ *     as binary websocket messages.
+ *   - Play `response.audio.delta` frames gaplessly through Web Audio.
+ *   - Render a chat-style history of user transcripts and bot text deltas.
+ */
+
+const SAMPLE_RATE = 16000;
+const CHANNELS = 1;
+const FRAME_MS = 20;
+const PROTOCOL = "va.ws.v1";
+const MAX_WS_LOG_LINES = 120;
+const AUDIO_DELTA_LOG_INTERVAL_MS = 1000;
+
+function defaultWsUrl() {
+  const scheme = location.protocol === "https:" ? "wss:" : "ws:";
+  return `${scheme}//${location.host}/ws-product`;
+}
+
+const els = {
+  url: document.getElementById("ws-url"),
+  connectBtn: document.getElementById("connect-btn"),
+  statusDot: document.getElementById("status-dot"),
+  statusText: document.getElementById("status-text"),
+  chatLog: document.getElementById("chat-log"),
+  micBtn: document.getElementById("mic-btn"),
+  micSelect: document.getElementById("mic-select"),
+  micLabel: document.querySelector(".mic-btn__label"),
+  micIndicator: document.getElementById("mic-indicator"),
+  botIndicator: document.getElementById("bot-indicator"),
+  clearBtn: document.getElementById("clear-btn"),
+  clearWsLogBtn: document.getElementById("clear-ws-log-btn"),
+  wsLog: document.getElementById("ws-log"),
+  meterFill: document.getElementById("meter-fill"),
+  composer: document.getElementById("composer"),
+  textInput: document.getElementById("text-input"),
+  sendBtn: document.getElementById("send-btn"),
+};
+
+const state = {
+  ws: null,
+  connected: false,
+  connecting: false,
+
+  audioContext: null,
+  micStream: null,
+  micSourceNode: null,
+  recorderNode: null,
+
+  micEnabled: false,
+  micDevices: [],
+  selectedMicDeviceId: "",
+
+  // Output scheduling.
+  nextPlaybackTime: 0,
+  playbackEndsAt: 0,
+  scheduledSources: [],
+  botActive: false,
+  botUiTimer: null,
+
+  // Chat state.
+  currentAssistantBubble: null,
+
+  // VU meter smoothing.
+  meterLevel: 0,
+
+  // Compact websocket logging.
+  audioDeltaLogCount: 0,
+  audioDeltaLogBytes: 0,
+  lastAudioDeltaLogAt: 0,
+  audioSendLogCount: 0,
+  audioSendLogBytes: 0,
+  lastAudioSendLogAt: 0,
+};
+
+/* ------------------------------------------------------------------ UI */
+
+function setStatus(kind, text) {
+  els.statusDot.className = `status__dot status__dot--${kind}`;
+  els.statusText.textContent = text;
+}
+
+function setConnectButton() {
+  if (state.connecting) {
+    els.connectBtn.textContent = "Connecting…";
+    els.connectBtn.disabled = true;
+    els.connectBtn.classList.remove("is-disconnect");
+  } else if (state.connected) {
+    els.connectBtn.textContent = "Disconnect";
+    els.connectBtn.disabled = false;
+    els.connectBtn.classList.add("is-disconnect");
+  } else {
+    els.connectBtn.textContent = "Connect";
+    els.connectBtn.disabled = false;
+    els.connectBtn.classList.remove("is-disconnect");
+  }
+}
+
+function setMicButton() {
+  els.micBtn.disabled = !state.connected;
+  els.micBtn.setAttribute("aria-pressed", state.micEnabled ? "true" : "false");
+  els.micBtn.title = state.micEnabled ? "Mute mic" : "Unmute mic";
+  els.micLabel.textContent = state.micEnabled ? "Mute mic" : "Enable mic";
+  els.micIndicator.classList.toggle("is-active", state.micEnabled);
+}
+
+function setMicSelectEnabled() {
+  els.micSelect.disabled = !state.connected || !navigator.mediaDevices;
+}
+
+function setComposerEnabled(enabled) {
+  els.textInput.disabled = !enabled;
+  els.sendBtn.disabled = !enabled || els.textInput.value.trim().length === 0;
+}
+
+function setBotIndicator(active) {
+  els.botIndicator.classList.toggle("is-active", active);
+}
+
+function addBubble(role, text) {
+  if (els.chatLog.querySelector(".chat__empty")) {
+    els.chatLog.innerHTML = "";
+  }
+  const bubble = document.createElement("div");
+  bubble.className = `bubble bubble--${role}`;
+  if (role !== "system") {
+    const tag = document.createElement("span");
+    tag.className = "bubble__role";
+    tag.textContent = role === "user" ? "You" : "Assistant";
+    bubble.appendChild(tag);
+  }
+  const body = document.createElement("span");
+  body.className = "bubble__text";
+  body.textContent = text;
+  bubble.appendChild(body);
+  els.chatLog.appendChild(bubble);
+  scrollChatToBottom();
+  return bubble;
+}
+
+function appendToBubble(bubble, text) {
+  const body = bubble.querySelector(".bubble__text");
+  body.textContent += text;
+  scrollChatToBottom();
+}
+
+function scrollChatToBottom() {
+  els.chatLog.scrollTop = els.chatLog.scrollHeight;
+}
+
+function clearChat() {
+  els.chatLog.innerHTML = "";
+  state.currentAssistantBubble = null;
+  const empty = document.createElement("div");
+  empty.className = "chat__empty";
+  empty.innerHTML = "<p>Chat cleared.</p>";
+  els.chatLog.appendChild(empty);
+}
+
+function truncateLogValue(value, maxLength = 160) {
+  const text = String(value);
+  if (text.length <= maxLength) return text;
+  return `${text.slice(0, maxLength - 1)}…`;
+}
+
+function compactWsPayload(payload) {
+  if (!payload || typeof payload !== "object") return String(payload);
+  const compact = { ...payload };
+
+  if (typeof compact.audio === "string") {
+    compact.audio = `<base64 ${compact.audio.length} chars>`;
+  }
+  if (typeof compact.data === "string" && compact.data.length > 160) {
+    compact.data = `<string ${compact.data.length} chars>`;
+  }
+  if (typeof compact.text === "string") {
+    compact.text = truncateLogValue(compact.text);
+  }
+
+  try {
+    return JSON.stringify(compact);
+  } catch (_) {
+    return payload.type || "unserializable websocket payload";
+  }
+}
+
+function addWsLog(direction, detail, kind = direction) {
+  if (els.wsLog.querySelector(".ws-log__empty")) {
+    els.wsLog.innerHTML = "";
+  }
+
+  const entry = document.createElement("div");
+  entry.className = `ws-log__entry ws-log__entry--${kind}`;
+
+  const time = document.createElement("span");
+  time.className = "ws-log__time";
+  time.textContent = new Date().toLocaleTimeString([], {
+    hour12: false,
+    hour: "2-digit",
+    minute: "2-digit",
+    second: "2-digit",
+  });
+
+  const dir = document.createElement("span");
+  dir.className = "ws-log__direction";
+  dir.textContent =
+    direction === "send"
+      ? "SEND"
+      : direction === "recv"
+        ? "RECV"
+        : direction.toUpperCase();
+
+  const body = document.createElement("span");
+  body.className = "ws-log__detail";
+  body.textContent = detail;
+
+  entry.append(time, dir, body);
+  els.wsLog.appendChild(entry);
+
+  while (els.wsLog.children.length > MAX_WS_LOG_LINES) {
+    els.wsLog.firstElementChild.remove();
+  }
+  els.wsLog.scrollTop = els.wsLog.scrollHeight;
+}
+
+function flushAudioDeltaLog() {
+  if (state.audioDeltaLogCount === 0) return;
+  addWsLog(
+    "recv",
+    `response.audio.delta x${state.audioDeltaLogCount} (${state.audioDeltaLogBytes} bytes)`,
+  );
+  state.audioDeltaLogCount = 0;
+  state.audioDeltaLogBytes = 0;
+  state.lastAudioDeltaLogAt = performance.now();
+}
+
+function flushAudioSendLog() {
+  if (state.audioSendLogCount === 0) return;
+  addWsLog(
+    "send",
+    `input.audio binary x${state.audioSendLogCount} (${state.audioSendLogBytes} bytes)`,
+  );
+  state.audioSendLogCount = 0;
+  state.audioSendLogBytes = 0;
+  state.lastAudioSendLogAt = performance.now();
+}
+
+function flushPendingWsLogs() {
+  flushAudioDeltaLog();
+  flushAudioSendLog();
+}
+
+function logWsPayload(direction, payload) {
+  if (direction === "send") {
+    flushAudioSendLog();
+  } else {
+    flushAudioDeltaLog();
+  }
+
+  if (direction === "recv" && payload?.type === "response.audio.delta") {
+    state.audioDeltaLogCount += 1;
+    state.audioDeltaLogBytes += payload.bytes || payload.audio?.length || 0;
+    const now = performance.now();
+    if (now - state.lastAudioDeltaLogAt >= AUDIO_DELTA_LOG_INTERVAL_MS) {
+      flushAudioDeltaLog();
+    }
+    return;
+  }
+
+  addWsLog(direction, compactWsPayload(payload));
+}
+
+function logBinarySend(byteLength) {
+  state.audioSendLogCount += 1;
+  state.audioSendLogBytes += byteLength;
+  const now = performance.now();
+  if (now - state.lastAudioSendLogAt >= AUDIO_DELTA_LOG_INTERVAL_MS) {
+    flushAudioSendLog();
+  }
+}
+
+function wsSend(data) {
+  if (!state.ws || state.ws.readyState !== WebSocket.OPEN) return false;
+
+  if (typeof data === "string") {
+    try {
+      logWsPayload("send", JSON.parse(data));
+    } catch (_) {
+      flushAudioSendLog();
+      flushAudioDeltaLog();
+      addWsLog("send", truncateLogValue(data));
+    }
+  } else {
+    const byteLength =
+      data instanceof ArrayBuffer
+        ? data.byteLength
+        : ArrayBuffer.isView(data)
+          ? data.byteLength
+          : 0;
+    if (byteLength > 0) {
+      logBinarySend(byteLength);
+    }
+  }
+
+  state.ws.send(data);
+  return true;
+}
+
+function clearWsLog() {
+  state.audioDeltaLogCount = 0;
+  state.audioDeltaLogBytes = 0;
+  state.audioSendLogCount = 0;
+  state.audioSendLogBytes = 0;
+  els.wsLog.innerHTML =
+    '<div class="ws-log__empty">No websocket events yet.</div>';
+}
+
+/* ---------------------------------------------------------------- Audio */
+
+async function ensureAudioContext() {
+  if (!state.audioContext) {
+    const Ctx = window.AudioContext || window.webkitAudioContext;
+    state.audioContext = new Ctx();
+    await state.audioContext.audioWorklet.addModule("./pcm-recorder.worklet.js");
+  }
+  if (state.audioContext.state === "suspended") {
+    await state.audioContext.resume();
+  }
+  return state.audioContext;
+}
+
+function renderMicDevices() {
+  const previousValue = state.selectedMicDeviceId || els.micSelect.value;
+  els.micSelect.innerHTML = "";
+
+  const defaultOption = document.createElement("option");
+  defaultOption.value = "";
+  defaultOption.textContent = "Default microphone";
+  els.micSelect.appendChild(defaultOption);
+
+  state.micDevices.forEach((device, index) => {
+    const option = document.createElement("option");
+    option.value = device.deviceId;
+    option.textContent = device.label || `Microphone ${index + 1}`;
+    els.micSelect.appendChild(option);
+  });
+
+  const hasPrevious = state.micDevices.some(
+    (device) => device.deviceId === previousValue,
+  );
+  state.selectedMicDeviceId = hasPrevious ? previousValue : "";
+  els.micSelect.value = state.selectedMicDeviceId;
+  setMicSelectEnabled();
+}
+
+async function refreshMicDevices() {
+  if (!navigator.mediaDevices?.enumerateDevices) {
+    setMicSelectEnabled();
+    return;
+  }
+
+  try {
+    const devices = await navigator.mediaDevices.enumerateDevices();
+    state.micDevices = devices.filter((device) => device.kind === "audioinput");
+    renderMicDevices();
+  } catch (err) {
+    console.warn("Could not enumerate microphones", err);
+    setMicSelectEnabled();
+  }
+}
+
+async function startMic() {
+  const ctx = await ensureAudioContext();
+  const audioConstraints = {
+    echoCancellation: true,
+    noiseSuppression: true,
+    autoGainControl: true,
+    channelCount: 1,
+  };
+  if (state.selectedMicDeviceId) {
+    audioConstraints.deviceId = { exact: state.selectedMicDeviceId };
+  }
+
+  state.micStream = await navigator.mediaDevices.getUserMedia({
+    audio: audioConstraints,
+    video: false,
+  });
+  await refreshMicDevices();
+
+  state.micSourceNode = ctx.createMediaStreamSource(state.micStream);
+  state.recorderNode = new AudioWorkletNode(ctx, "pcm-recorder", {
+    numberOfInputs: 1,
+    numberOfOutputs: 0,
+    channelCount: 1,
+    processorOptions: {
+      targetSampleRate: SAMPLE_RATE,
+      frameMs: FRAME_MS,
+    },
+  });
+  state.recorderNode.port.onmessage = (event) => {
+    const data = event.data;
+    if (!data || data.type !== "frame") return;
+    updateMeter(data.rms || 0);
+    if (state.connected) {
+      wsSend(data.buffer);
+    }
+  };
+
+  state.micSourceNode.connect(state.recorderNode);
+  state.micEnabled = true;
+  addWsLog("system", "mic capture started (binary input.audio frames)");
+  setMicButton();
+}
+
+function stopMic() {
+  const wasEnabled = state.micEnabled;
+  if (state.recorderNode) {
+    try {
+      state.recorderNode.port.onmessage = null;
+      state.recorderNode.disconnect();
+    } catch (_) {
+      /* ignore */
+    }
+    state.recorderNode = null;
+  }
+  if (state.micSourceNode) {
+    try {
+      state.micSourceNode.disconnect();
+    } catch (_) {
+      /* ignore */
+    }
+    state.micSourceNode = null;
+  }
+  if (state.micStream) {
+    for (const track of state.micStream.getTracks()) {
+      try {
+        track.stop();
+      } catch (_) {
+        /* ignore */
+      }
+    }
+    state.micStream = null;
+  }
+  state.micEnabled = false;
+  updateMeter(0);
+  if (wasEnabled) {
+    flushAudioSendLog();
+    addWsLog("system", "mic capture stopped");
+  }
+  setMicButton();
+}
+
+function updateMeter(rms) {
+  // Smooth and convert to a 0..100 width. RMS ~0.3+ is loud speech.
+  const target = Math.min(1, rms * 2.4);
+  state.meterLevel = state.meterLevel * 0.5 + target * 0.5;
+  els.meterFill.style.width = `${Math.round(state.meterLevel * 100)}%`;
+}
+
+/* ---------------------------------------------------- Bot audio playback */
+
+function schedulePlayback(int16) {
+  const ctx = state.audioContext;
+  if (!ctx) return;
+
+  const float32 = new Float32Array(int16.length);
+  for (let i = 0; i < int16.length; i++) {
+    float32[i] = int16[i] / (int16[i] < 0 ? 0x8000 : 0x7fff);
+  }
+  const buffer = ctx.createBuffer(CHANNELS, float32.length, SAMPLE_RATE);
+  buffer.copyToChannel(float32, 0);
+
+  const src = ctx.createBufferSource();
+  src.buffer = buffer;
+  src.connect(ctx.destination);
+
+  const now = ctx.currentTime;
+  // Schedule immediately after the previously scheduled chunk to keep
+  // playback contiguous, with a tiny safety margin if we fell behind.
+  const startAt = Math.max(now + 0.02, state.nextPlaybackTime);
+  src.start(startAt);
+  state.nextPlaybackTime = startAt + buffer.duration;
+  state.playbackEndsAt = state.nextPlaybackTime;
+
+  src.onended = () => {
+    const idx = state.scheduledSources.indexOf(src);
+    if (idx >= 0) state.scheduledSources.splice(idx, 1);
+  };
+  state.scheduledSources.push(src);
+
+  setBotIndicator(true);
+  if (state.botUiTimer) clearTimeout(state.botUiTimer);
+  const msUntilEnd = Math.max(0, (state.playbackEndsAt - now) * 1000) + 120;
+  state.botUiTimer = setTimeout(() => {
+    if (state.audioContext &&
+        state.audioContext.currentTime >= state.playbackEndsAt - 0.01) {
+      setBotIndicator(false);
+    }
+  }, msUntilEnd);
+}
+
+function stopPlaybackQueue() {
+  for (const src of state.scheduledSources) {
+    try {
+      src.onended = null;
+      src.stop();
+      src.disconnect();
+    } catch (_) {
+      /* already stopped */
+    }
+  }
+  state.scheduledSources = [];
+  resetPlaybackClock();
+  if (state.botUiTimer) {
+    clearTimeout(state.botUiTimer);
+    state.botUiTimer = null;
+  }
+  setBotIndicator(false);
+}
+
+function resetPlaybackClock() {
+  if (state.audioContext) {
+    state.nextPlaybackTime = state.audioContext.currentTime;
+    state.playbackEndsAt = state.audioContext.currentTime;
+  }
+}
+
+/* --------------------------------------------------------- Chat updates */
+
+function handleUserTranscript(text) {
+  if (!text) return;
+  state.currentAssistantBubble = null;
+  addBubble("user", text);
+}
+
+function sendText(text) {
+  const value = (text || "").trim();
+  if (!value) return false;
+  if (!state.ws || state.ws.readyState !== WebSocket.OPEN) return false;
+  const message = {
+    type: "input.text",
+    text: value,
+    interrupt: true,
+  };
+
+  // The engine does not echo text input back as a transcript event, so we
+  // render the user bubble locally. Also interrupt any in-flight bot audio
+  // so the next reply is heard cleanly. We deliberately do NOT clear
+  // `currentAssistantBubble` here — the engine will emit a
+  // `response.text.final(interrupted=true)` for the in-flight assistant
+  // turn, which finalizes that bubble in place. A brand-new bubble for the
+  // reply will be created when `response.text.started` arrives.
+  wsSend(JSON.stringify(message));
+  stopPlaybackQueue();
+  addBubble("user", value);
+  return true;
+}
+
+function handleAssistantDelta(text) {
+  if (!text) return;
+  if (!state.currentAssistantBubble) {
+    state.currentAssistantBubble = addBubble("assistant", "");
+  }
+  appendToBubble(state.currentAssistantBubble, text);
+}
+
+function handleAssistantStarted() {
+  state.currentAssistantBubble = null;
+}
+
+function handleAssistantFinal(text, interrupted) {
+  if (!text) {
+    state.currentAssistantBubble = null;
+    return;
+  }
+  if (state.currentAssistantBubble) {
+    const body = state.currentAssistantBubble.querySelector(".bubble__text");
+    body.textContent = text;
+  } else {
+    state.currentAssistantBubble = addBubble("assistant", text);
+  }
+  if (interrupted) {
+    state.currentAssistantBubble.classList.add("bubble--interrupted");
+  }
+  state.currentAssistantBubble = null;
+  scrollChatToBottom();
+}
+
+function finalizeAssistantBubble() {
+  state.currentAssistantBubble = null;
+}
+
+/* ---------------------------------------------------------- Websocket IO */
+
+function decodeBase64ToInt16(b64) {
+  const binary = atob(b64);
+  const len = binary.length;
+  const bytes = new Uint8Array(len);
+  for (let i = 0; i < len; i++) bytes[i] = binary.charCodeAt(i);
+  return new Int16Array(bytes.buffer, bytes.byteOffset, bytes.byteLength / 2);
+}
+
+function handleEvent(event) {
+  switch (event.type) {
+    case "response.audio.delta":
+      if (typeof event.audio === "string") {
+        schedulePlayback(decodeBase64ToInt16(event.audio));
+      }
+      break;
+    case "response.audio.started":
+      setBotIndicator(true);
+      break;
+    case "response.audio.stopped":
+      finalizeAssistantBubble();
+      // The indicator turns off automatically when the playback queue drains.
+      break;
+    case "response.text.delta":
+      handleAssistantDelta(event.text);
+      break;
+    case "response.text.started":
+      handleAssistantStarted();
+      break;
+    case "response.text.final":
+      handleAssistantFinal(event.text, event.interrupted);
+      break;
+    case "input.transcript.final":
+      handleUserTranscript(event.text);
+      break;
+    case "input.transcript.interim":
+      // Ignore partial ASR updates; chat history renders committed user turns.
+      break;
+    case "transport.message":
+      // Reserved for future structured messages; ignore silently.
+      break;
+    default:
+      // Unknown event type: log for debugging.
+      console.debug("ws event", event);
+  }
+}
+
+async function connect() {
+  if (state.connected || state.connecting) return;
+  const url = (els.url.value || "").trim();
+  if (!url) {
+    setStatus("error", "Missing URL");
+    return;
+  }
+
+  state.connecting = true;
+  setStatus("connecting", "Connecting…");
+  setConnectButton();
+  addWsLog("system", `connecting ${url}`);
+
+  try {
+    // Pre-warm audio context on user gesture so playback works on Safari.
+    await ensureAudioContext();
+  } catch (err) {
+    console.error("AudioContext failed", err);
+    state.connecting = false;
+    setStatus("error", "Audio init failed");
+    setConnectButton();
+    addWsLog("error", `audio init failed: ${err.message || err}`, "error");
+    return;
+  }
+
+  let ws;
+  try {
+    ws = new WebSocket(url);
+  } catch (err) {
+    console.error("WebSocket constructor failed", err);
+    state.connecting = false;
+    setStatus("error", "Bad URL");
+    setConnectButton();
+    addWsLog("error", `bad websocket URL: ${err.message || err}`, "error");
+    return;
+  }
+  ws.binaryType = "arraybuffer";
+  state.ws = ws;
+
+  ws.addEventListener("open", () => {
+    const startMessage = {
+      type: "session.start",
+      protocol: PROTOCOL,
+      audio: {
+        encoding: "pcm_s16le",
+        sample_rate: SAMPLE_RATE,
+        channels: CHANNELS,
+      },
+    };
+
+    state.connecting = false;
+    state.connected = true;
+    resetPlaybackClock();
+    addWsLog("system", "websocket open");
+    setStatus("connected", "Connected");
+    setConnectButton();
+    setMicButton();
+    setMicSelectEnabled();
+    refreshMicDevices();
+
+    wsSend(JSON.stringify(startMessage));
+    addBubble("system", "Session started.");
+    setComposerEnabled(true);
+    els.textInput.focus();
+  });
+
+  ws.addEventListener("message", (event) => {
+    const data = event.data;
+    if (typeof data === "string") {
+      let parsed;
+      try {
+        parsed = JSON.parse(data);
+      } catch (err) {
+        console.warn("Bad JSON from server", err, data);
+        addWsLog(
+          "error",
+          `invalid JSON from server: ${truncateLogValue(data)}`,
+          "error",
+        );
+        return;
+      }
+      logWsPayload("recv", parsed);
+      handleEvent(parsed);
+    } else if (data instanceof ArrayBuffer) {
+      // Server doesn't currently send binary, but handle it just in case.
+      addWsLog("recv", `binary audio ${data.byteLength} bytes`);
+      schedulePlayback(new Int16Array(data));
+    }
+  });
+
+  ws.addEventListener("error", (err) => {
+    console.error("WebSocket error", err);
+    setStatus("error", "Connection error");
+    addWsLog("error", "websocket error", "error");
+  });
+
+  ws.addEventListener("close", (event) => {
+    const wasConnected = state.connected;
+    state.ws = null;
+    state.connected = false;
+    state.connecting = false;
+    if (state.micEnabled) stopMic();
+    stopPlaybackQueue();
+    setConnectButton();
+    setMicButton();
+    setMicSelectEnabled();
+    setComposerEnabled(false);
+    setBotIndicator(false);
+    flushPendingWsLogs();
+    addWsLog(
+      "system",
+      `websocket close code=${event.code}${
+        event.reason ? ` reason=${event.reason}` : ""
+      }`,
+    );
+    if (wasConnected) {
+      addBubble(
+        "system",
+        `Session ended${event.reason ? ` — ${event.reason}` : ""}.`,
+      );
+      setStatus("idle", "Disconnected");
+    } else {
+      setStatus("error", "Connection closed");
+    }
+  });
+}
+
+function disconnect() {
+  if (!state.ws) return;
+  try {
+    if (state.ws.readyState === WebSocket.OPEN) {
+      const stopMessage = { type: "session.stop", reason: "client_disconnect" };
+      wsSend(JSON.stringify(stopMessage));
+    }
+  } catch (_) {
+    /* ignore */
+  }
+  try {
+    state.ws.close(1000, "client_disconnect");
+  } catch (_) {
+    /* ignore */
+  }
+}
+
+/* ---------------------------------------------------------------- Wiring */
+
+els.connectBtn.addEventListener("click", () => {
+  if (state.connected) disconnect();
+  else connect();
+});
+
+els.micBtn.addEventListener("click", async () => {
+  if (!state.connected) return;
+  els.micBtn.disabled = true;
+  try {
+    if (state.micEnabled) {
+      stopMic();
+    } else {
+      await startMic();
+    }
+  } catch (err) {
+    console.error("Mic error", err);
+    addBubble("system", `Mic error: ${err.message || err}`);
+  } finally {
+    els.micBtn.disabled = !state.connected;
+  }
+});
+
+els.micSelect.addEventListener("change", async () => {
+  state.selectedMicDeviceId = els.micSelect.value;
+  if (!state.micEnabled) return;
+
+  els.micSelect.disabled = true;
+  els.micBtn.disabled = true;
+  try {
+    stopMic();
+    await startMic();
+  } catch (err) {
+    console.error("Mic switch error", err);
+    addBubble("system", `Mic switch error: ${err.message || err}`);
+  } finally {
+    setMicButton();
+    setMicSelectEnabled();
+  }
+});
+
+if (navigator.mediaDevices?.addEventListener) {
+  navigator.mediaDevices.addEventListener("devicechange", refreshMicDevices);
+}
+
+els.clearBtn.addEventListener("click", () => {
+  clearChat();
+});
+
+els.clearWsLogBtn.addEventListener("click", () => {
+  clearWsLog();
+});
+
+function autosizeTextarea() {
+  const ta = els.textInput;
+  ta.style.height = "auto";
+  ta.style.height = `${Math.min(ta.scrollHeight, 180)}px`;
+}
+
+function submitText() {
+  const value = els.textInput.value;
+  if (!sendText(value)) return;
+  els.textInput.value = "";
+  autosizeTextarea();
+  setComposerEnabled(state.connected);
+}
+
+els.composer.addEventListener("submit", (event) => {
+  event.preventDefault();
+  submitText();
+});
+
+els.textInput.addEventListener("input", () => {
+  autosizeTextarea();
+  setComposerEnabled(state.connected);
+});
+
+els.textInput.addEventListener("keydown", (event) => {
+  if (event.key === "Enter" && !event.shiftKey && !event.isComposing) {
+    event.preventDefault();
+    submitText();
+  }
+});
+
+window.addEventListener("beforeunload", () => {
+  if (state.ws) {
+    try {
+      state.ws.close();
+    } catch (_) {
+      /* ignore */
+    }
+  }
+  if (state.audioContext) {
+    try {
+      state.audioContext.close();
+    } catch (_) {
+      /* ignore */
+    }
+  }
+});
+
+els.url.value = defaultWsUrl();
+
+setStatus("idle", "Disconnected");
+setConnectButton();
+setMicButton();
+setMicSelectEnabled();
+setComposerEnabled(false);
--- a/static/voice-demo/index.html
+++ b/static/voice-demo/index.html
@@ -0,0 +1,158 @@
+<!doctype html>
+<html lang="en">
+  <head>
+    <meta charset="utf-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1" />
+    <title>VA Voice Chat &mdash; /ws-product</title>
+    <link rel="stylesheet" href="./styles.css" />
+  </head>
+  <body>
+    <main class="app">
+      <header class="app__header">
+        <div class="brand">
+          <span class="brand__dot" aria-hidden="true"></span>
+          <h1>VA Voice Chat</h1>
+        </div>
+
+        <div class="connection">
+          <label class="connection__field">
+            <span>WebSocket URL</span>
+            <input
+              id="ws-url"
+              type="text"
+              placeholder="ws://host/ws-product"
+              spellcheck="false"
+              autocomplete="off"
+            />
+          </label>
+          <button id="connect-btn" class="btn btn--primary" type="button">
+            Connect
+          </button>
+        </div>
+
+        <div class="status">
+          <span id="status-dot" class="status__dot status__dot--idle"></span>
+          <span id="status-text" class="status__text">Disconnected</span>
+        </div>
+      </header>
+
+      <div class="app__body">
+        <div class="app__main">
+          <section class="chat" aria-label="Conversation history">
+            <div id="chat-log" class="chat__log" role="log" aria-live="polite">
+              <div class="chat__empty">
+                <p>Connect to the engine, enable your mic, and start talking.</p>
+                <p class="chat__hint">
+                  Audio is streamed as PCM16 mono @ 16&nbsp;kHz over
+                  <code>/ws-product</code>.
+                </p>
+              </div>
+            </div>
+          </section>
+
+          <footer class="controls" aria-label="Chat controls">
+            <div class="meter" aria-hidden="true">
+              <div id="meter-fill" class="meter__fill"></div>
+            </div>
+
+            <form id="composer" class="composer" autocomplete="off">
+              <textarea
+                id="text-input"
+                class="composer__input"
+                rows="1"
+                placeholder="Type a message, or use the mic…"
+                disabled
+              ></textarea>
+              <button
+                id="send-btn"
+                class="btn btn--primary composer__send"
+                type="submit"
+                disabled
+                title="Send message (Enter)"
+              >
+                Send
+              </button>
+            </form>
+
+            <div class="controls__row">
+              <label class="device-picker">
+                <span class="device-picker__label">Microphone</span>
+                <select id="mic-select" class="device-picker__select" disabled>
+                  <option value="">Default microphone</option>
+                </select>
+              </label>
+
+              <button
+                id="mic-btn"
+                class="mic-btn"
+                type="button"
+                disabled
+                aria-pressed="false"
+                title="Mic is off"
+              >
+                <svg
+                  class="mic-btn__icon"
+                  viewBox="0 0 24 24"
+                  width="24"
+                  height="24"
+                  aria-hidden="true"
+                >
+                  <path
+                    d="M12 14a3 3 0 0 0 3-3V6a3 3 0 1 0-6 0v5a3 3 0 0 0 3 3Z"
+                    fill="currentColor"
+                  />
+                  <path
+                    d="M19 11a1 1 0 1 0-2 0 5 5 0 0 1-10 0 1 1 0 1 0-2 0 7 7 0 0 0 6 6.92V21a1 1 0 1 0 2 0v-3.08A7 7 0 0 0 19 11Z"
+                    fill="currentColor"
+                  />
+                </svg>
+                <span class="mic-btn__label">Enable mic</span>
+              </button>
+
+              <div class="indicators">
+                <span id="mic-indicator" class="indicator">
+                  <span class="indicator__dot indicator__dot--mic"></span>
+                  <span class="indicator__label">Mic</span>
+                </span>
+                <span id="bot-indicator" class="indicator">
+                  <span class="indicator__dot indicator__dot--bot"></span>
+                  <span class="indicator__label">Bot</span>
+                </span>
+              </div>
+
+              <button id="clear-btn" class="btn btn--ghost" type="button">
+                Clear
+              </button>
+            </div>
+
+            <p class="hint">
+              Press <kbd>Enter</kbd> to send, <kbd>Shift</kbd>+<kbd>Enter</kbd>
+              for newline. Sending text will interrupt the bot if it's speaking.
+              Browser echo cancellation is on; use headphones if echo persists.
+            </p>
+          </footer>
+        </div>
+
+        <section class="ws-log" aria-label="WebSocket log">
+          <div class="ws-log__header">
+            <div class="ws-log__header-left">
+              <h2>WebSocket Log</h2>
+              <div class="ws-log__legend" aria-hidden="true">
+                <span class="ws-log__legend-item ws-log__legend-item--send">Send</span>
+                <span class="ws-log__legend-item ws-log__legend-item--recv">Recv</span>
+              </div>
+            </div>
+            <button id="clear-ws-log-btn" class="btn btn--ghost" type="button">
+              Clear log
+            </button>
+          </div>
+          <div id="ws-log" class="ws-log__body" role="log" aria-live="polite">
+            <div class="ws-log__empty">No websocket events yet.</div>
+          </div>
+        </section>
+      </div>
+    </main>
+
+    <script type="module" src="./app.js"></script>
+  </body>
+</html>
--- a/static/voice-demo/pcm-recorder.worklet.js
+++ b/static/voice-demo/pcm-recorder.worklet.js
@@ -0,0 +1,104 @@
+/**
+ * PCM Recorder AudioWorklet.
+ *
+ * Captures mono Float32 mic samples at the AudioContext's native rate,
+ * resamples them to a target sample rate (default 16 kHz) with linear
+ * interpolation, then ships PCM16 frames of a fixed duration (default 20 ms)
+ * to the main thread via `port.postMessage(ArrayBuffer)`.
+ *
+ * It also computes a simple RMS level per frame for the UI VU meter so the
+ * main thread doesn't have to re-process the audio.
+ */
+
+class PcmRecorderProcessor extends AudioWorkletProcessor {
+  constructor(options) {
+    super();
+
+    const opts = (options && options.processorOptions) || {};
+    this._targetSampleRate = opts.targetSampleRate || 16000;
+    this._frameMs = opts.frameMs || 20;
+    this._frameSamples = Math.round(
+      (this._targetSampleRate * this._frameMs) / 1000,
+    );
+
+    // Resampling state.
+    // `ratio` is input samples per output sample.
+    this._ratio = sampleRate / this._targetSampleRate;
+    this._inputBuffer = new Float32Array(0);
+    // Float position in `_inputBuffer` for the next output sample.
+    this._inputOffset = 0;
+
+    // Output framing state.
+    this._frameBuffer = new Int16Array(this._frameSamples);
+    this._frameIndex = 0;
+
+    // VU meter accumulator.
+    this._rmsSumSquares = 0;
+    this._rmsCount = 0;
+  }
+
+  process(inputs) {
+    const input = inputs[0];
+    if (!input || input.length === 0) return true;
+    const channel = input[0];
+    if (!channel || channel.length === 0) return true;
+
+    // Append new samples to the input buffer.
+    const merged = new Float32Array(this._inputBuffer.length + channel.length);
+    merged.set(this._inputBuffer, 0);
+    merged.set(channel, this._inputBuffer.length);
+    this._inputBuffer = merged;
+
+    const ratio = this._ratio;
+    const inLen = this._inputBuffer.length;
+    let pos = this._inputOffset;
+
+    while (pos + 1 < inLen) {
+      const lo = Math.floor(pos);
+      const hi = lo + 1;
+      const w = pos - lo;
+      const sample =
+        this._inputBuffer[lo] * (1 - w) + this._inputBuffer[hi] * w;
+
+      this._rmsSumSquares += sample * sample;
+      this._rmsCount += 1;
+
+      let s = sample;
+      if (s > 1) s = 1;
+      else if (s < -1) s = -1;
+      this._frameBuffer[this._frameIndex++] =
+        s < 0 ? Math.round(s * 0x8000) : Math.round(s * 0x7fff);
+
+      if (this._frameIndex === this._frameSamples) {
+        const frame = new Int16Array(this._frameSamples);
+        frame.set(this._frameBuffer);
+        const rms =
+          this._rmsCount > 0
+            ? Math.sqrt(this._rmsSumSquares / this._rmsCount)
+            : 0;
+        this.port.postMessage(
+          { type: "frame", buffer: frame.buffer, rms },
+          [frame.buffer],
+        );
+        this._frameIndex = 0;
+        this._rmsSumSquares = 0;
+        this._rmsCount = 0;
+      }
+
+      pos += ratio;
+    }
+
+    // Trim consumed samples from the input buffer; keep at least the last
+    // sample we still need to interpolate against on the next call.
+    const consumed = Math.floor(pos);
+    if (consumed > 0) {
+      this._inputBuffer = this._inputBuffer.slice(consumed);
+      pos -= consumed;
+    }
+    this._inputOffset = pos;
+
+    return true;
+  }
+}
+
+registerProcessor("pcm-recorder", PcmRecorderProcessor);
--- a/static/voice-demo/styles.css
+++ b/static/voice-demo/styles.css
@@ -0,0 +1,739 @@
+:root {
+  color-scheme: dark;
+  --bg: #0b0d12;
+  --bg-elevated: #131722;
+  --bg-soft: #1a2030;
+  --border: #232a3b;
+  --text: #e6e9f2;
+  --text-dim: #95a0bb;
+  --accent: #4f8cff;
+  --accent-strong: #6aa1ff;
+  --user: #2f7ff0;
+  --assistant: #2a3145;
+  --danger: #ff5577;
+  --success: #2dd28b;
+  --warning: #ffb84d;
+  --radius: 14px;
+  --shadow: 0 10px 30px rgba(0, 0, 0, 0.35);
+  font-family: ui-sans-serif, system-ui, -apple-system, "Segoe UI", Roboto,
+    "Helvetica Neue", Arial, sans-serif;
+}
+
+* {
+  box-sizing: border-box;
+}
+
+html,
+body {
+  margin: 0;
+  padding: 0;
+  height: 100%;
+  background: radial-gradient(
+      1200px 600px at 80% -10%,
+      rgba(79, 140, 255, 0.18),
+      transparent 60%
+    ),
+    radial-gradient(
+      900px 500px at -10% 110%,
+      rgba(45, 210, 139, 0.12),
+      transparent 60%
+    ),
+    var(--bg);
+  color: var(--text);
+}
+
+body {
+  display: flex;
+  justify-content: center;
+  padding: 12px;
+}
+
+.app {
+  display: grid;
+  grid-template-rows: auto minmax(0, 1fr);
+  gap: 12px;
+  width: min(1440px, 100%);
+  height: calc(100dvh - 24px);
+  max-height: calc(100vh - 24px);
+  background: var(--bg-elevated);
+  border: 1px solid var(--border);
+  border-radius: 20px;
+  box-shadow: var(--shadow);
+  padding: 14px;
+}
+
+.app__body {
+  display: grid;
+  grid-template-columns: minmax(0, 1fr) clamp(300px, 32vw, 420px);
+  gap: 12px;
+  min-height: 0;
+}
+
+.app__main {
+  display: grid;
+  grid-template-rows: minmax(0, 1fr) auto;
+  gap: 0;
+  min-height: 0;
+  overflow: hidden;
+  background: var(--bg);
+  border: 1px solid var(--border);
+  border-radius: var(--radius);
+}
+
+/* Header ---------------------------------------------------------------- */
+
+.app__header {
+  display: grid;
+  grid-template-columns: auto minmax(0, 1fr) auto;
+  align-items: center;
+  gap: 12px;
+  padding-bottom: 10px;
+  border-bottom: 1px solid var(--border);
+}
+
+.brand {
+  display: flex;
+  align-items: center;
+  gap: 10px;
+}
+
+.brand h1 {
+  font-size: 16px;
+  margin: 0;
+  letter-spacing: 0.2px;
+}
+
+.brand__dot {
+  width: 10px;
+  height: 10px;
+  border-radius: 50%;
+  background: var(--accent);
+  box-shadow: 0 0 0 4px rgba(79, 140, 255, 0.18);
+}
+
+.connection {
+  display: flex;
+  align-items: end;
+  gap: 8px;
+  min-width: 0;
+}
+
+.connection__field {
+  display: flex;
+  flex-direction: column;
+  gap: 4px;
+  min-width: 0;
+  flex: 1;
+}
+
+.connection__field span {
+  font-size: 11px;
+  color: var(--text-dim);
+  text-transform: uppercase;
+  letter-spacing: 0.8px;
+}
+
+.connection__field input {
+  background: var(--bg-soft);
+  color: var(--text);
+  border: 1px solid var(--border);
+  border-radius: 10px;
+  padding: 7px 10px;
+  font: inherit;
+  font-size: 12px;
+  outline: none;
+  width: 100%;
+  min-width: 0;
+}
+
+.connection__field input:focus {
+  border-color: var(--accent);
+  box-shadow: 0 0 0 3px rgba(79, 140, 255, 0.18);
+}
+
+.status {
+  display: flex;
+  align-items: center;
+  gap: 8px;
+  font-size: 13px;
+  color: var(--text-dim);
+  white-space: nowrap;
+}
+
+.status__dot {
+  width: 9px;
+  height: 9px;
+  border-radius: 50%;
+  background: var(--text-dim);
+}
+
+.status__dot--idle {
+  background: var(--text-dim);
+}
+
+.status__dot--connecting {
+  background: var(--warning);
+  animation: pulse 1.2s ease-in-out infinite;
+}
+
+.status__dot--connected {
+  background: var(--success);
+}
+
+.status__dot--error {
+  background: var(--danger);
+}
+
+/* Chat ------------------------------------------------------------------ */
+
+.chat {
+  overflow: hidden;
+  display: flex;
+  flex-direction: column;
+  min-height: 0;
+}
+
+.chat__log {
+  flex: 1;
+  overflow-y: auto;
+  padding: 18px;
+  display: flex;
+  flex-direction: column;
+  gap: 10px;
+  scroll-behavior: smooth;
+}
+
+.chat__empty {
+  margin: auto;
+  text-align: center;
+  color: var(--text-dim);
+}
+
+.chat__empty p {
+  margin: 4px 0;
+}
+
+.chat__hint code {
+  background: var(--bg-soft);
+  border: 1px solid var(--border);
+  border-radius: 6px;
+  padding: 1px 6px;
+  font-size: 12px;
+}
+
+.bubble {
+  max-width: 78%;
+  padding: 10px 14px;
+  border-radius: 14px;
+  line-height: 1.45;
+  font-size: 14px;
+  white-space: pre-wrap;
+  word-wrap: break-word;
+  animation: bubble-in 0.16s ease-out;
+}
+
+.bubble--user {
+  background: var(--user);
+  color: #fff;
+  align-self: flex-end;
+  border-bottom-right-radius: 4px;
+}
+
+.bubble--assistant {
+  background: var(--assistant);
+  color: var(--text);
+  align-self: flex-start;
+  border-bottom-left-radius: 4px;
+}
+
+.bubble--assistant.bubble--interrupted {
+  opacity: 0.75;
+  border-left: 2px solid var(--warning);
+}
+
+.bubble--system {
+  align-self: center;
+  background: transparent;
+  color: var(--text-dim);
+  font-size: 12px;
+  border: 1px dashed var(--border);
+  padding: 6px 10px;
+  border-radius: 999px;
+}
+
+.bubble__role {
+  display: block;
+  font-size: 10px;
+  text-transform: uppercase;
+  letter-spacing: 0.8px;
+  opacity: 0.7;
+  margin-bottom: 4px;
+}
+
+/* WebSocket log --------------------------------------------------------- */
+
+.ws-log {
+  display: flex;
+  flex-direction: column;
+  min-height: 0;
+  background: var(--bg);
+  border: 1px solid var(--border);
+  border-radius: var(--radius);
+  overflow: hidden;
+}
+
+.ws-log__header {
+  display: flex;
+  align-items: center;
+  justify-content: space-between;
+  gap: 8px;
+  padding: 8px 12px;
+  border-bottom: 1px solid var(--border);
+  flex-shrink: 0;
+  background: var(--bg-soft);
+}
+
+.ws-log__legend {
+  display: flex;
+  align-items: center;
+  gap: 8px;
+  font-size: 10px;
+  color: var(--text-dim);
+}
+
+.ws-log__legend-item {
+  display: inline-flex;
+  align-items: center;
+  gap: 4px;
+}
+
+.ws-log__legend-item::before {
+  content: "";
+  width: 8px;
+  height: 8px;
+  border-radius: 2px;
+}
+
+.ws-log__legend-item--send::before {
+  background: var(--success);
+}
+
+.ws-log__legend-item--recv::before {
+  background: var(--accent-strong);
+}
+
+.ws-log__header h2 {
+  margin: 0;
+  font-size: 12px;
+  color: var(--text-dim);
+  text-transform: uppercase;
+  letter-spacing: 0.8px;
+}
+
+.ws-log__header-left {
+  display: flex;
+  align-items: center;
+  gap: 12px;
+  min-width: 0;
+}
+
+.ws-log__body {
+  flex: 1;
+  min-height: 0;
+  overflow-y: auto;
+  overflow-x: hidden;
+  padding: 6px 8px;
+  font-family: ui-monospace, SFMono-Regular, Menlo, monospace;
+  font-size: 10.5px;
+  line-height: 1.4;
+  color: var(--text-dim);
+}
+
+.ws-log__entry {
+  display: grid;
+  grid-template-columns: 58px 42px minmax(0, 1fr);
+  gap: 6px;
+  align-items: start;
+  padding: 5px 4px;
+  border-radius: 6px;
+  white-space: pre-wrap;
+  word-break: break-word;
+}
+
+.ws-log__entry:hover {
+  background: rgba(255, 255, 255, 0.03);
+}
+
+.ws-log__time {
+  color: var(--text-dim);
+  opacity: 0.7;
+  font-size: 10px;
+}
+
+.ws-log__direction {
+  font-weight: 700;
+  font-size: 9px;
+  letter-spacing: 0.4px;
+  text-transform: uppercase;
+  padding: 2px 0;
+}
+
+.ws-log__detail {
+  min-width: 0;
+  overflow-wrap: anywhere;
+}
+
+.ws-log__entry--send .ws-log__direction {
+  color: var(--success);
+}
+
+.ws-log__entry--recv .ws-log__direction {
+  color: var(--accent-strong);
+}
+
+.ws-log__entry--system .ws-log__direction {
+  color: var(--text-dim);
+}
+
+.ws-log__entry--error .ws-log__direction {
+  color: var(--danger);
+}
+
+.ws-log__empty {
+  color: var(--text-dim);
+  opacity: 0.7;
+  padding: 8px 4px;
+}
+
+/* Controls -------------------------------------------------------------- */
+
+.controls {
+  display: grid;
+  gap: 8px;
+  padding: 10px 12px 8px;
+  border-top: 1px solid var(--border);
+  background: var(--bg-elevated);
+}
+
+.meter {
+  height: 6px;
+  background: var(--bg-soft);
+  border-radius: 999px;
+  overflow: hidden;
+}
+
+.meter__fill {
+  height: 100%;
+  width: 0%;
+  background: linear-gradient(90deg, var(--success), var(--accent));
+  transition: width 80ms linear;
+}
+
+.composer {
+  display: flex;
+  align-items: flex-end;
+  gap: 8px;
+}
+
+.composer__input {
+  flex: 1;
+  resize: none;
+  background: var(--bg-soft);
+  color: var(--text);
+  border: 1px solid var(--border);
+  border-radius: 12px;
+  padding: 10px 12px;
+  font: inherit;
+  font-size: 14px;
+  line-height: 1.4;
+  outline: none;
+  min-height: 42px;
+  max-height: 180px;
+  overflow-y: auto;
+}
+
+.composer__input:focus {
+  border-color: var(--accent);
+  box-shadow: 0 0 0 3px rgba(79, 140, 255, 0.18);
+}
+
+.composer__input:disabled {
+  opacity: 0.55;
+  cursor: not-allowed;
+}
+
+.composer__send {
+  padding: 10px 18px;
+  border-radius: 12px;
+  min-width: 84px;
+  align-self: flex-end;
+}
+
+.composer__send:disabled {
+  opacity: 0.5;
+  cursor: not-allowed;
+}
+
+.controls__row {
+  display: flex;
+  align-items: center;
+  gap: 10px;
+  flex-wrap: wrap;
+}
+
+.device-picker {
+  display: flex;
+  flex-direction: column;
+  gap: 3px;
+  min-width: 0;
+  flex: 1 1 160px;
+  max-width: 240px;
+}
+
+.device-picker__label {
+  font-size: 11px;
+  color: var(--text-dim);
+  text-transform: uppercase;
+  letter-spacing: 0.8px;
+}
+
+.device-picker__select {
+  appearance: none;
+  background: var(--bg-soft);
+  color: var(--text);
+  border: 1px solid var(--border);
+  border-radius: 10px;
+  padding: 7px 28px 7px 10px;
+  font: inherit;
+  font-size: 12px;
+  outline: none;
+  width: 100%;
+  cursor: pointer;
+}
+
+.device-picker__select:focus {
+  border-color: var(--accent);
+  box-shadow: 0 0 0 3px rgba(79, 140, 255, 0.18);
+}
+
+.device-picker__select:disabled {
+  opacity: 0.5;
+  cursor: not-allowed;
+}
+
+.mic-btn {
+  display: inline-flex;
+  align-items: center;
+  gap: 6px;
+  padding: 8px 12px;
+  border-radius: 999px;
+  background: var(--bg-soft);
+  color: var(--text);
+  border: 1px solid var(--border);
+  font: inherit;
+  font-size: 12px;
+  font-weight: 600;
+  cursor: pointer;
+  flex-shrink: 0;
+  transition: transform 0.08s ease, background 0.15s ease, color 0.15s ease,
+    border-color 0.15s ease;
+}
+
+.mic-btn:disabled {
+  opacity: 0.5;
+  cursor: not-allowed;
+}
+
+.mic-btn:not(:disabled):hover {
+  border-color: var(--accent);
+}
+
+.mic-btn:not(:disabled):active {
+  transform: scale(0.98);
+}
+
+.mic-btn[aria-pressed="true"] {
+  background: var(--danger);
+  border-color: var(--danger);
+  color: #fff;
+  box-shadow: 0 0 0 6px rgba(255, 85, 119, 0.18);
+}
+
+.mic-btn__icon {
+  display: block;
+}
+
+.indicators {
+  display: flex;
+  gap: 12px;
+  margin-left: auto;
+}
+
+.indicator {
+  display: inline-flex;
+  align-items: center;
+  gap: 6px;
+  font-size: 12px;
+  color: var(--text-dim);
+}
+
+.indicator__dot {
+  width: 8px;
+  height: 8px;
+  border-radius: 50%;
+  background: var(--bg-soft);
+  border: 1px solid var(--border);
+}
+
+.indicator.is-active .indicator__dot--mic {
+  background: var(--success);
+  border-color: var(--success);
+  box-shadow: 0 0 0 4px rgba(45, 210, 139, 0.2);
+}
+
+.indicator.is-active .indicator__dot--bot {
+  background: var(--accent);
+  border-color: var(--accent);
+  box-shadow: 0 0 0 4px rgba(79, 140, 255, 0.22);
+  animation: pulse 1s ease-in-out infinite;
+}
+
+.indicator.is-active .indicator__label {
+  color: var(--text);
+}
+
+.btn {
+  appearance: none;
+  border: 1px solid var(--border);
+  background: var(--bg-soft);
+  color: var(--text);
+  padding: 9px 14px;
+  border-radius: 10px;
+  font: inherit;
+  font-size: 13px;
+  cursor: pointer;
+  transition: background 0.15s ease, border-color 0.15s ease;
+}
+
+.btn:hover {
+  border-color: var(--accent);
+}
+
+.btn--primary {
+  background: var(--accent);
+  color: #fff;
+  border-color: var(--accent);
+}
+
+.btn--primary:hover {
+  background: var(--accent-strong);
+  border-color: var(--accent-strong);
+}
+
+.btn--primary.is-disconnect {
+  background: transparent;
+  color: var(--danger);
+  border-color: var(--danger);
+}
+
+.btn--ghost {
+  background: transparent;
+}
+
+.hint {
+  margin: 0;
+  font-size: 11px;
+  color: var(--text-dim);
+  opacity: 0.85;
+}
+
+.hint kbd {
+  background: var(--bg-soft);
+  border: 1px solid var(--border);
+  border-bottom-width: 2px;
+  border-radius: 4px;
+  padding: 0 5px;
+  font-family: ui-monospace, SFMono-Regular, Menlo, monospace;
+  font-size: 11px;
+  color: var(--text);
+}
+
+/* Animations ------------------------------------------------------------ */
+
+@keyframes pulse {
+  0%,
+  100% {
+    opacity: 1;
+  }
+  50% {
+    opacity: 0.55;
+  }
+}
+
+@keyframes bubble-in {
+  from {
+    opacity: 0;
+    transform: translateY(4px);
+  }
+  to {
+    opacity: 1;
+    transform: translateY(0);
+  }
+}
+
+/* Responsive ------------------------------------------------------------ */
+
+@media (max-width: 820px) {
+  .app__body {
+    grid-template-columns: 1fr;
+    grid-template-rows: minmax(0, 1fr) min(240px, 32vh);
+  }
+
+  .ws-log__legend {
+    display: none;
+  }
+
+  .hint {
+    display: none;
+  }
+}
+
+@media (max-width: 720px) {
+  body {
+    padding: 0;
+  }
+
+  .app {
+    height: 100dvh;
+    max-height: 100vh;
+    border-radius: 0;
+    border: none;
+    padding: 10px;
+  }
+
+  .app__header {
+    grid-template-columns: 1fr;
+  }
+
+  .connection {
+    flex-direction: column;
+    align-items: stretch;
+  }
+
+  .ws-log__entry {
+    grid-template-columns: 54px 38px minmax(0, 1fr);
+  }
+
+  .status {
+    justify-content: flex-end;
+  }
+
+  .device-picker {
+    flex: 1 1 100%;
+    max-width: none;
+  }
+
+  .indicators {
+    margin-left: 0;
+  }
+}
				`@@ -0,0 +1 @@`
				`"""Voice websocket demo (product-ws / va.ws.v1) powered by Pipecat."""`