Integrate product-ws voice demo on port 8000 alongside REST API.

Add src/voice Pipecat pipeline, browser demo at /voice-demo, and config/voice.json.

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
Xin Wang
2026-05-22 16:26:06 +08:00
parent 0b6b40aba4
commit bc2aa5b133
20 changed files with 3726 additions and 4 deletions

75
config/voice.json Normal file
View File

@@ -0,0 +1,75 @@
{
"server": {
"host": "0.0.0.0",
"port": 8000,
"cors_origins": ["http://localhost:3000", "http://localhost:8080"],
"serve_webpage": true,
"webpage_mount": "/voice-demo"
},
"audio": {
"sample_rate_hz": 16000,
"channels": 1,
"frame_ms": 20
},
"session": {
"inactivity_timeout_sec": 60
},
"turn": {
"vad": {
"confidence": 0.7,
"start_secs": 0.2,
"stop_secs": 0.4,
"min_volume": 0.6
},
"interruption_min_chars": 3,
"interruption_use_interim": true,
"interruption_short_replies": [
"是",
"是的",
"对",
"对的",
"嗯",
"好",
"好的",
"行",
"可以",
"没问题",
"不是",
"不",
"不行",
"不用",
"不要",
"没有",
"否"
],
"user_speech_timeout_sec": 0.8
},
"agent": {
"system_prompt": "You are a helpful, friendly voice assistant. Keep responses concise and natural for spoken conversation.",
"greeting": "Please introduce yourself briefly.",
"greeting_mode": "generated"
},
"services": {
"stt": {
"provider": "openai",
"api_key": "",
"base_url": null,
"model": "gpt-4o-mini-transcribe",
"language": "en"
},
"llm": {
"provider": "openai",
"api_key": "",
"base_url": null,
"model": "gpt-4o-mini",
"temperature": 0.7
},
"tts": {
"provider": "openai",
"api_key": "",
"base_url": null,
"model": "gpt-4o-mini-tts",
"voice": "alloy"
}
}
}

View File

@@ -1,5 +1,7 @@
fastapi>=0.104.0
uvicorn>=0.24.0
uvicorn[standard]>=0.24.0
pipecat-ai[websocket,openai,silero]
websockets>=13.1,<16.0
pydantic>=2.4.2
python-dotenv>=1.0.0
httpx>=0.25.0
@@ -12,7 +14,7 @@ pydantic-settings==2.1.0
python-multipart==0.0.6
python-jose[cryptography]==3.3.0
passlib[bcrypt]==1.7.4
openai==1.55.3
openai>=1.74.0,<3
loguru>=0.7.0
pandas
requests

View File

@@ -1,8 +1,8 @@
from fastapi import FastAPI
import sys
from .api.endpoints import router as api_router
from .core.fastgpt_client import lifespan
from .core.logging_config import setup_logging
from .voice.routes import register_voice
# Setup logging first
setup_logging()
@@ -18,4 +18,5 @@ app = FastAPI(
def read_root():
return {"message": "Server is running."}
app.include_router(api_router)
app.include_router(api_router)
register_voice(app)

1
src/voice/__init__.py Normal file
View File

@@ -0,0 +1 @@
"""Voice websocket demo (product-ws / va.ws.v1) powered by Pipecat."""

202
src/voice/config.py Normal file
View File

@@ -0,0 +1,202 @@
from __future__ import annotations
import json
from dataclasses import dataclass, field
from pathlib import Path
PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
DEFAULT_VOICE_CONFIG = PROJECT_ROOT / "config" / "voice.json"
@dataclass(frozen=True)
class ServerConfig:
host: str = "0.0.0.0"
port: int = 8000
cors_origins: list[str] = field(default_factory=list)
serve_webpage: bool = True
webpage_mount: str = "/voice-demo"
@dataclass(frozen=True)
class AudioConfig:
sample_rate_hz: int = 16000
channels: int = 1
frame_ms: int = 20
@property
def frame_bytes(self) -> int:
return int(self.sample_rate_hz * self.frame_ms / 1000) * self.channels * 2
@dataclass(frozen=True)
class SessionConfig:
inactivity_timeout_sec: int = 60
@dataclass(frozen=True)
class VADConfig:
confidence: float = 0.7
start_secs: float = 0.2
stop_secs: float = 0.6
min_volume: float = 0.6
@dataclass(frozen=True)
class TurnConfig:
vad: VADConfig = field(default_factory=VADConfig)
user_speech_timeout_sec: float = 1.0
interruption_min_chars: int = 3
interruption_use_interim: bool = True
interruption_short_replies: list[str] = field(
default_factory=lambda: [
"",
"是的",
"",
"对的",
"",
"",
"好的",
"",
"可以",
"没问题",
"不是",
"",
"不行",
"不用",
"不要",
"没有",
"",
"no",
"yes",
"ok",
"okay",
]
)
@dataclass(frozen=True)
class AgentConfig:
system_prompt: str = "You are a helpful, friendly voice assistant."
greeting: str | None = None
greeting_mode: str = "generated"
@dataclass(frozen=True)
class LLMConfig:
provider: str = "openai"
api_key: str = ""
base_url: str | None = None
model: str = "gpt-4o-mini"
temperature: float | None = 0.7
@dataclass(frozen=True)
class STTConfig:
provider: str = "openai"
app_id: str = ""
api_key: str = ""
api_secret: str = ""
base_url: str | None = None
model: str = "gpt-4o-mini-transcribe"
language: str | None = "en"
domain: str = "iat"
accent: str = "mandarin"
encoding: str = "raw"
frame_size: int = 1280
timeout_sec: float = 10.0
dynamic_correction: bool = False
@dataclass(frozen=True)
class TTSConfig:
provider: str = "openai"
app_id: str = ""
api_key: str = ""
api_secret: str = ""
base_url: str | None = None
model: str = "gpt-4o-mini-tts"
voice: str = "alloy"
aue: str = "raw"
tte: str = "UTF8"
speed: int = 50
volume: int = 50
pitch: int = 50
timeout_sec: float = 30.0
source_sample_rate_hz: int | None = None
@dataclass(frozen=True)
class ServicesConfig:
llm: LLMConfig = field(default_factory=LLMConfig)
stt: STTConfig = field(default_factory=STTConfig)
tts: TTSConfig = field(default_factory=TTSConfig)
@dataclass(frozen=True)
class EngineConfig:
server: ServerConfig = field(default_factory=ServerConfig)
audio: AudioConfig = field(default_factory=AudioConfig)
session: SessionConfig = field(default_factory=SessionConfig)
turn: TurnConfig = field(default_factory=TurnConfig)
agent: AgentConfig = field(default_factory=AgentConfig)
services: ServicesConfig = field(default_factory=ServicesConfig)
def load_config(path: str | Path | None = None) -> EngineConfig:
config_path = Path(path) if path is not None else DEFAULT_VOICE_CONFIG
if not config_path.is_absolute():
config_path = PROJECT_ROOT / config_path
data = json.loads(config_path.read_text(encoding="utf-8"))
if not isinstance(data, dict):
raise ValueError(f"Config file must contain a JSON object: {config_path}")
return config_from_dict(data)
def config_from_dict(data: dict) -> EngineConfig:
services = _dict(data.get("services"))
agent = _dict(data.get("agent"))
if agent.get("greeting") == "":
agent["greeting"] = None
if agent.get("greeting_mode") not in (None, "generated", "fixed", "off"):
raise ValueError("agent.greeting_mode must be one of: generated, fixed, off")
stt = _dict(services.get("stt") or services.get("asr"))
if stt.get("language") == "":
stt["language"] = None
turn = _dict(data.get("turn"))
vad = _dict(turn.get("vad"))
return EngineConfig(
server=ServerConfig(**_dict(data.get("server"))),
audio=AudioConfig(**_dict(data.get("audio"))),
session=SessionConfig(**_dict(data.get("session"))),
turn=TurnConfig(
vad=VADConfig(**vad),
user_speech_timeout_sec=float(
turn.get("user_speech_timeout_sec", TurnConfig().user_speech_timeout_sec)
),
interruption_min_chars=int(
turn.get("interruption_min_chars", TurnConfig().interruption_min_chars)
),
interruption_use_interim=bool(
turn.get("interruption_use_interim", TurnConfig().interruption_use_interim)
),
interruption_short_replies=list(
turn.get(
"interruption_short_replies",
TurnConfig().interruption_short_replies,
)
),
),
agent=AgentConfig(**agent),
services=ServicesConfig(
llm=LLMConfig(**_dict(services.get("llm"))),
stt=STTConfig(**stt),
tts=TTSConfig(**_dict(services.get("tts"))),
),
)
def _dict(value: object) -> dict:
return dict(value) if isinstance(value, dict) else {}

179
src/voice/pipeline.py Normal file
View File

@@ -0,0 +1,179 @@
from __future__ import annotations
from loguru import logger
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.audio.vad.vad_analyzer import VADParams
from pipecat.frames.frames import (
LLMRunFrame,
OutputTransportMessageUrgentFrame,
TTSSpeakFrame,
)
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.llm_context import LLMContext
from pipecat.processors.aggregators.llm_response_universal import (
AssistantTurnStoppedMessage,
LLMContextAggregatorPair,
LLMUserAggregatorParams,
UserTurnStoppedMessage,
)
from pipecat.serializers.base_serializer import FrameSerializer
from pipecat.transports.websocket.fastapi import (
FastAPIWebsocketParams,
FastAPIWebsocketTransport,
)
from pipecat.turns.user_stop.speech_timeout_user_turn_stop_strategy import (
SpeechTimeoutUserTurnStopStrategy,
)
from pipecat.turns.user_turn_strategies import UserTurnStrategies
from .config import EngineConfig
from .protocol import ProductWebsocketSerializer
from .services import create_llm_service, create_stt_service, create_tts_service
from .text_input import ProductTextInputProcessor
from .text_stream import ProductTextStreamProcessor
from .transcript_stream import ProductTranscriptStreamProcessor
from .turn_start import InterruptionGateUserTurnStartStrategy
async def run_product_voice_pipeline(websocket, config: EngineConfig) -> None:
await run_pipeline_with_serializer(
websocket,
config,
serializer=ProductWebsocketSerializer(
sample_rate=config.audio.sample_rate_hz,
channels=config.audio.channels,
),
client_label="Product JSON",
)
async def run_pipeline_with_serializer(
websocket,
config: EngineConfig,
*,
serializer: FrameSerializer,
client_label: str,
) -> None:
transport = FastAPIWebsocketTransport(
websocket=websocket,
params=FastAPIWebsocketParams(
audio_in_enabled=True,
audio_out_enabled=True,
audio_in_sample_rate=config.audio.sample_rate_hz,
audio_out_sample_rate=config.audio.sample_rate_hz,
audio_in_channels=config.audio.channels,
audio_out_channels=config.audio.channels,
serializer=serializer,
session_timeout=None,
),
)
stt = create_stt_service(config.services.stt, config.audio)
llm = create_llm_service(config.services.llm)
tts = create_tts_service(config.services.tts, config.audio)
messages = [{"role": "system", "content": config.agent.system_prompt}]
if config.agent.greeting and config.agent.greeting_mode == "generated":
messages.append({"role": "system", "content": config.agent.greeting})
context = LLMContext(messages)
vad_params = VADParams(
confidence=config.turn.vad.confidence,
start_secs=config.turn.vad.start_secs,
stop_secs=config.turn.vad.stop_secs,
min_volume=config.turn.vad.min_volume,
)
user_turn_strategies = UserTurnStrategies(
start=[
InterruptionGateUserTurnStartStrategy(
min_chars_when_bot_speaking=config.turn.interruption_min_chars,
allowed_short_replies=config.turn.interruption_short_replies,
use_interim=config.turn.interruption_use_interim,
),
],
stop=[
SpeechTimeoutUserTurnStopStrategy(
user_speech_timeout=config.turn.user_speech_timeout_sec,
),
],
)
user_aggregator, assistant_aggregator = LLMContextAggregatorPair(
context,
user_params=LLMUserAggregatorParams(
vad_analyzer=SileroVADAnalyzer(params=vad_params),
user_turn_strategies=user_turn_strategies,
),
)
pipeline = Pipeline(
[
transport.input(),
ProductTextInputProcessor(),
stt,
ProductTranscriptStreamProcessor(),
user_aggregator,
llm,
ProductTextStreamProcessor(),
tts,
transport.output(),
assistant_aggregator,
]
)
task = PipelineTask(
pipeline,
params=PipelineParams(
audio_in_sample_rate=config.audio.sample_rate_hz,
audio_out_sample_rate=config.audio.sample_rate_hz,
enable_metrics=True,
enable_usage_metrics=True,
enable_heartbeats=True,
),
idle_timeout_secs=config.session.inactivity_timeout_sec,
)
@transport.event_handler("on_client_connected")
async def on_client_connected(_transport, _client):
logger.info(f"{client_label} websocket client connected")
if config.agent.greeting_mode == "fixed" and config.agent.greeting:
await task.queue_frames([TTSSpeakFrame(config.agent.greeting)])
elif config.agent.greeting_mode == "generated":
await task.queue_frames([LLMRunFrame()])
@transport.event_handler("on_client_disconnected")
async def on_client_disconnected(_transport, _client):
logger.info(f"{client_label} websocket client disconnected")
await task.cancel()
@transport.event_handler("on_session_timeout")
async def on_session_timeout(_transport, _client):
logger.info(f"{client_label} websocket session timed out")
await task.cancel()
@user_aggregator.event_handler("on_user_turn_stopped")
async def on_user_turn_stopped(_aggregator, _strategy, message: UserTurnStoppedMessage):
logger.info(f"User: {message.content}")
text = (message.content or "").strip()
if not text:
return
await task.queue_frame(
OutputTransportMessageUrgentFrame(
message={
"type": "input.transcript.final",
"text": text,
"user_id": message.user_id,
"timestamp": message.timestamp,
}
)
)
@assistant_aggregator.event_handler("on_assistant_turn_stopped")
async def on_assistant_turn_stopped(_aggregator, message: AssistantTurnStoppedMessage):
logger.info(f"Assistant: {message.content}")
runner = PipelineRunner(handle_sigint=False)
await runner.run(task)

160
src/voice/protocol.py Normal file
View File

@@ -0,0 +1,160 @@
from __future__ import annotations
import base64
import json
from typing import Any
from loguru import logger
from pipecat.frames.frames import (
CancelFrame,
BotStartedSpeakingFrame,
BotStoppedSpeakingFrame,
EndFrame,
Frame,
InputAudioRawFrame,
InputTransportMessageFrame,
OutputAudioRawFrame,
OutputTransportMessageFrame,
OutputTransportMessageUrgentFrame,
TextFrame,
TranscriptionFrame,
)
from pipecat.serializers.base_serializer import FrameSerializer
class ProductWebsocketSerializer(FrameSerializer):
"""Stable app-facing JSON/base64 protocol adapter for Pipecat websocket transport."""
protocol = "va.ws.v1"
def __init__(self, *, sample_rate: int, channels: int):
super().__init__()
self._sample_rate = sample_rate
self._channels = channels
self._sequence = 0
async def serialize(self, frame: Frame) -> str | bytes | None:
if isinstance(frame, OutputAudioRawFrame):
return self._event(
"response.audio.delta",
audio=base64.b64encode(frame.audio).decode("ascii"),
bytes=len(frame.audio),
sample_rate=frame.sample_rate,
channels=frame.num_channels,
)
if isinstance(frame, BotStartedSpeakingFrame):
return self._event("response.audio.started")
if isinstance(frame, BotStoppedSpeakingFrame):
return self._event("response.audio.stopped")
if isinstance(frame, TranscriptionFrame):
return self._event(
"input.transcript.final",
text=frame.text,
user_id=frame.user_id,
timestamp=frame.timestamp,
)
if isinstance(frame, TextFrame):
return self._event("response.text.delta", text=frame.text)
if isinstance(frame, (OutputTransportMessageFrame, OutputTransportMessageUrgentFrame)):
if self.should_ignore_frame(frame):
return None
message = frame.message
if isinstance(message, dict) and isinstance(message.get("type"), str):
event_type = message["type"]
payload = {k: v for k, v in message.items() if k != "type"}
return self._event(event_type, **payload)
return self._event("transport.message", message=message)
return None
async def deserialize(self, data: str | bytes) -> Frame | None:
if isinstance(data, bytes):
return InputAudioRawFrame(
audio=data,
sample_rate=self._sample_rate,
num_channels=self._channels,
)
try:
message = json.loads(data)
except json.JSONDecodeError as exc:
logger.warning(f"Invalid product websocket JSON: {exc}")
return None
if not isinstance(message, dict):
logger.warning("Product websocket message must be a JSON object")
return None
message_type = message.get("type")
if message_type == "session.start":
return InputTransportMessageFrame(
message={
"type": "session.started",
"protocol": self.protocol,
"audio": {
"encoding": "pcm_s16le",
"sample_rate": self._sample_rate,
"channels": self._channels,
},
}
)
if message_type == "session.stop":
return EndFrame()
if message_type == "response.cancel":
return CancelFrame(reason="client_cancelled")
if message_type == "input.audio":
audio = message.get("audio") or message.get("data")
if not isinstance(audio, str):
logger.warning("input.audio requires base64 'audio' or 'data'")
return None
try:
pcm = base64.b64decode(audio)
except ValueError as exc:
logger.warning(f"Invalid input.audio base64: {exc}")
return None
return InputAudioRawFrame(
audio=pcm,
sample_rate=int(message.get("sample_rate") or self._sample_rate),
num_channels=int(message.get("channels") or self._channels),
)
if message_type == "input.text":
text = message.get("text")
if not isinstance(text, str) or not text.strip():
logger.warning("input.text requires non-empty 'text'")
return None
return InputTransportMessageFrame(
message={
"type": "input.text",
"text": text,
"interrupt": bool(message.get("interrupt", True)),
}
)
if message_type == "transport.message":
payload = message.get("message")
return InputTransportMessageFrame(message=payload if isinstance(payload, dict) else message)
logger.warning(f"Unsupported product websocket message type: {message_type!r}")
return None
def _event(self, event_type: str, **payload: Any) -> str:
self._sequence += 1
return json.dumps(
{
"type": event_type,
"protocol": self.protocol,
"seq": self._sequence,
**payload,
},
ensure_ascii=False,
)

92
src/voice/routes.py Normal file
View File

@@ -0,0 +1,92 @@
from __future__ import annotations
from functools import lru_cache
from pathlib import Path
from fastapi import APIRouter, FastAPI, WebSocket
from fastapi.middleware.cors import CORSMiddleware
from fastapi.staticfiles import StaticFiles
from loguru import logger
from .config import DEFAULT_VOICE_CONFIG, EngineConfig, load_config
from .pipeline import run_product_voice_pipeline
PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
VOICE_DEMO_DIR = PROJECT_ROOT / "static" / "voice-demo"
router = APIRouter(tags=["voice"])
@lru_cache(maxsize=1)
def get_voice_config() -> EngineConfig:
return load_config(DEFAULT_VOICE_CONFIG)
def _normalize_mount_path(path: str) -> str:
normalized = path.strip() or "/voice-demo"
if not normalized.startswith("/"):
normalized = f"/{normalized}"
return normalized.rstrip("/") or "/"
@router.get("/voice/health")
async def voice_health() -> dict[str, object]:
config = get_voice_config()
mount = (
_normalize_mount_path(config.server.webpage_mount)
if config.server.serve_webpage
else None
)
return {
"status": "healthy",
"protocols": {
"/ws-product": "va.ws.v1.json_base64",
},
"features": {
"product_text_input": True,
"product_text_interrupt": True,
},
"demo": mount,
"llm_provider": config.services.llm.provider,
"stt_provider": config.services.stt.provider,
"tts_provider": config.services.tts.provider,
}
@router.websocket("/ws-product")
async def product_websocket_endpoint(websocket: WebSocket) -> None:
await websocket.accept()
config = get_voice_config()
await run_product_voice_pipeline(websocket, config)
def register_voice(app: FastAPI) -> None:
"""Mount voice websocket routes and optional browser demo static files."""
if not DEFAULT_VOICE_CONFIG.exists():
logger.warning(f"Voice config not found at {DEFAULT_VOICE_CONFIG}; voice demo disabled")
return
config = get_voice_config()
app.include_router(router)
if config.server.cors_origins:
app.add_middleware(
CORSMiddleware,
allow_origins=config.server.cors_origins,
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
if config.server.serve_webpage and VOICE_DEMO_DIR.is_dir():
mount = _normalize_mount_path(config.server.webpage_mount)
app.mount(
mount,
StaticFiles(directory=str(VOICE_DEMO_DIR), html=True),
name="voice-demo",
)
logger.info(f"Voice demo mounted at {mount}")
else:
logger.info("Voice demo static page disabled or missing")
logger.info("Voice websocket registered at /ws-product")

165
src/voice/services.py Normal file
View File

@@ -0,0 +1,165 @@
from __future__ import annotations
from collections.abc import AsyncGenerator
from openai import BadRequestError
from openai import NOT_GIVEN
from pipecat.frames.frames import ErrorFrame, Frame, TTSAudioRawFrame
from pipecat.services.openai._constants import OPENAI_SAMPLE_RATE
from pipecat.services.openai.llm import OpenAILLMService
from pipecat.services.openai.stt import OpenAISTTService
from pipecat.services.openai.tts import VALID_VOICES, OpenAITTSService
from pipecat.transcriptions.language import Language
from .config import AudioConfig, LLMConfig, STTConfig, TTSConfig
from .xfyun_asr import DEFAULT_XFYUN_ASR_URL, XfyunASRService
from .xfyun_tts import DEFAULT_XFYUN_TTS_URL, XfyunTTSService
def create_stt_service(config: STTConfig, audio: AudioConfig | None = None):
if config.provider == "xfyun":
sample_rate = audio.sample_rate_hz if audio else 16000
return XfyunASRService(
app_id=config.app_id,
api_key=config.api_key or "",
api_secret=config.api_secret,
url=config.base_url or DEFAULT_XFYUN_ASR_URL,
language=config.language or "zh_cn",
domain=config.domain,
accent=config.accent,
sample_rate=sample_rate,
encoding=config.encoding,
frame_size=config.frame_size,
open_timeout=config.timeout_sec,
dynamic_correction=config.dynamic_correction,
)
_require_provider(config.provider, "openai", "stt")
return OpenAISTTService(
api_key=config.api_key or None,
base_url=config.base_url,
settings=OpenAISTTService.Settings(
model=config.model,
language=_language(config.language),
),
)
def create_llm_service(config: LLMConfig):
_require_provider(config.provider, "openai", "llm")
return OpenAILLMService(
api_key=config.api_key or None,
base_url=config.base_url,
settings=OpenAILLMService.Settings(
model=config.model,
temperature=config.temperature if config.temperature is not None else NOT_GIVEN,
),
)
def create_tts_service(config: TTSConfig, audio: AudioConfig):
if config.provider == "xfyun":
source_sample_rate = config.source_sample_rate_hz or audio.sample_rate_hz
if source_sample_rate not in (8000, 16000):
raise ValueError("Xfyun TTS source_sample_rate_hz must be 8000 or 16000")
return XfyunTTSService(
app_id=config.app_id,
api_key=config.api_key or "",
api_secret=config.api_secret,
voice=config.voice,
url=config.base_url or DEFAULT_XFYUN_TTS_URL,
sample_rate=audio.sample_rate_hz,
source_sample_rate=source_sample_rate,
encoding=config.aue,
text_encoding=config.tte,
speed=config.speed,
volume=config.volume,
pitch=config.pitch,
timeout=config.timeout_sec,
)
_require_provider(config.provider, "openai", "tts")
service_class = OpenAITTSService if config.voice in VALID_VOICES else OpenAICompatibleTTSService
return service_class(
api_key=config.api_key or None,
base_url=config.base_url,
sample_rate=audio.sample_rate_hz,
source_sample_rate=config.source_sample_rate_hz,
settings=OpenAITTSService.Settings(
model=config.model,
voice=config.voice,
),
)
class OpenAICompatibleTTSService(OpenAITTSService):
"""OpenAI-compatible TTS service that permits provider-specific voice ids."""
def __init__(self, *, source_sample_rate: int | None = None, **kwargs):
super().__init__(**kwargs)
self._source_sample_rate = source_sample_rate or OPENAI_SAMPLE_RATE
async def run_tts(self, text: str, context_id: str) -> AsyncGenerator[Frame, None]:
voice = self._settings.voice
if not voice:
yield ErrorFrame(error="TTS voice must be specified")
return
try:
create_params = {
"input": text,
"model": self._settings.model,
"voice": voice,
"response_format": "pcm",
}
if self._settings.instructions:
create_params["instructions"] = self._settings.instructions
if self._settings.speed:
create_params["speed"] = self._settings.speed
async with self._client.audio.speech.with_streaming_response.create(
**create_params
) as response:
if response.status_code != 200:
error = await response.text()
yield ErrorFrame(
error=f"TTS request failed (status: {response.status_code}, error: {error})"
)
return
await self.start_tts_usage_metrics(text)
async def audio_chunks():
async for chunk in response.iter_bytes(self.chunk_size):
if chunk:
yield chunk
first_frame = True
async for frame in self._stream_audio_frames_from_iterator(
audio_chunks(),
in_sample_rate=self._source_sample_rate,
context_id=context_id,
):
if first_frame:
await self.stop_ttfb_metrics()
first_frame = False
yield frame
except BadRequestError as exc:
yield ErrorFrame(error=f"TTS request failed: {exc}")
except Exception as exc:
yield ErrorFrame(error=f"TTS request failed: {exc}")
def _require_provider(actual: str, expected: str, service_name: str) -> None:
if actual != expected:
raise ValueError(f"Unsupported {service_name} provider {actual!r}; expected {expected!r}")
def _language(value: str | None) -> Language | None:
if value is None:
return None
normalized = value.replace("-", "_").upper()
return getattr(Language, normalized, value)

38
src/voice/text_input.py Normal file
View File

@@ -0,0 +1,38 @@
from __future__ import annotations
from loguru import logger
from pipecat.frames.frames import Frame, InputTransportMessageFrame, LLMMessagesAppendFrame
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
class ProductTextInputProcessor(FrameProcessor):
"""Converts product text-input transport messages into LLM turns."""
async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)
if not isinstance(frame, InputTransportMessageFrame):
await self.push_frame(frame, direction)
return
message = frame.message
if not isinstance(message, dict) or message.get("type") != "input.text":
await self.push_frame(frame, direction)
return
text = str(message.get("text") or "").strip()
if not text:
return
if message.get("interrupt", True):
logger.info("Text input interrupting current response")
await self.broadcast_interruption()
await self.push_frame(
LLMMessagesAppendFrame(
messages=[{"role": "user", "content": text}],
run_llm=True,
),
FrameDirection.DOWNSTREAM,
)

75
src/voice/text_stream.py Normal file
View File

@@ -0,0 +1,75 @@
from __future__ import annotations
from pipecat.frames.frames import (
Frame,
InterruptionFrame,
LLMFullResponseEndFrame,
LLMFullResponseStartFrame,
LLMTextFrame,
OutputTransportMessageUrgentFrame,
TTSSpeakFrame,
)
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
class ProductTextStreamProcessor(FrameProcessor):
"""Mirrors LLM text frames as streaming protocol events."""
def __init__(self) -> None:
super().__init__()
self._aggregation: list[str] = []
self._turn_active = False
async def process_frame(self, frame: Frame, direction: FrameDirection) -> None:
await super().process_frame(frame, direction)
if isinstance(frame, LLMFullResponseStartFrame):
await self._start_turn()
elif isinstance(frame, LLMTextFrame):
if frame.text:
await self._delta(frame.text)
elif isinstance(frame, LLMFullResponseEndFrame):
await self._end_turn(interrupted=False)
elif isinstance(frame, InterruptionFrame):
await self._end_turn(interrupted=True)
elif isinstance(frame, TTSSpeakFrame):
text = frame.text or ""
await self._start_turn()
if text:
await self._delta(text)
await self._end_turn(interrupted=False)
await self.push_frame(frame, direction)
async def _start_turn(self) -> None:
if self._turn_active:
return
self._turn_active = True
self._aggregation = []
await self._emit("response.text.started")
async def _delta(self, text: str) -> None:
if not self._turn_active:
await self._start_turn()
self._aggregation.append(text)
await self._emit("response.text.delta", text=text)
async def _end_turn(self, *, interrupted: bool) -> None:
if not self._turn_active:
return
full_text = "".join(self._aggregation)
self._turn_active = False
self._aggregation = []
await self._emit(
"response.text.final",
text=full_text,
interrupted=interrupted,
)
async def _emit(self, event_type: str, **payload: object) -> None:
await self.push_frame(
OutputTransportMessageUrgentFrame(
message={"type": event_type, **payload},
),
FrameDirection.DOWNSTREAM,
)

View File

@@ -0,0 +1,30 @@
from __future__ import annotations
from pipecat.frames.frames import (
Frame,
InterimTranscriptionFrame,
OutputTransportMessageUrgentFrame,
)
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
class ProductTranscriptStreamProcessor(FrameProcessor):
"""Mirrors interim STT frames to the product websocket protocol."""
async def process_frame(self, frame: Frame, direction: FrameDirection) -> None:
await super().process_frame(frame, direction)
if isinstance(frame, InterimTranscriptionFrame):
await self.push_frame(
OutputTransportMessageUrgentFrame(
message={
"type": "input.transcript.interim",
"text": frame.text,
"user_id": frame.user_id,
"timestamp": frame.timestamp,
}
),
FrameDirection.DOWNSTREAM,
)
await self.push_frame(frame, direction)

86
src/voice/turn_start.py Normal file
View File

@@ -0,0 +1,86 @@
from __future__ import annotations
import re
from loguru import logger
from pipecat.frames.frames import (
BotStartedSpeakingFrame,
BotStoppedSpeakingFrame,
Frame,
InterimTranscriptionFrame,
TranscriptionFrame,
)
from pipecat.turns.types import ProcessFrameResult
from pipecat.turns.user_start.base_user_turn_start_strategy import BaseUserTurnStartStrategy
_COUNTABLE_TEXT_RE = re.compile(r"[\w\u4e00-\u9fff]", re.UNICODE)
class InterruptionGateUserTurnStartStrategy(BaseUserTurnStartStrategy):
"""Starts user turns only after likely intentional speech."""
def __init__(
self,
*,
min_chars_when_bot_speaking: int,
allowed_short_replies: list[str],
use_interim: bool = True,
**kwargs,
):
super().__init__(**kwargs)
self._min_chars_when_bot_speaking = min_chars_when_bot_speaking
self._allowed_short_replies = {
self._normalize_text(reply) for reply in allowed_short_replies if reply.strip()
}
self._use_interim = use_interim
self._bot_speaking = False
async def reset(self):
await super().reset()
async def process_frame(self, frame: Frame) -> ProcessFrameResult:
if isinstance(frame, BotStartedSpeakingFrame):
self._bot_speaking = True
return ProcessFrameResult.CONTINUE
if isinstance(frame, BotStoppedSpeakingFrame):
self._bot_speaking = False
return ProcessFrameResult.CONTINUE
if isinstance(frame, InterimTranscriptionFrame) and self._use_interim:
return await self._handle_transcription(frame.text, interim=True)
if isinstance(frame, TranscriptionFrame):
return await self._handle_transcription(frame.text, interim=False)
return ProcessFrameResult.CONTINUE
async def _handle_transcription(self, text: str, *, interim: bool) -> ProcessFrameResult:
normalized = self._normalize_text(text)
if not normalized:
return ProcessFrameResult.CONTINUE
if not self._bot_speaking:
await self.trigger_user_turn_started()
return ProcessFrameResult.STOP
should_interrupt = self._should_interrupt(normalized)
logger.debug(
f"{self} interruption_gate text={text!r} normalized={normalized!r} "
f"should_interrupt={should_interrupt} interim={interim}"
)
if should_interrupt:
await self.trigger_user_turn_started()
return ProcessFrameResult.STOP
await self.trigger_reset_aggregation()
return ProcessFrameResult.CONTINUE
def _should_interrupt(self, normalized: str) -> bool:
return (
normalized in self._allowed_short_replies
or len(normalized) >= self._min_chars_when_bot_speaking
)
@staticmethod
def _normalize_text(text: str) -> str:
return "".join(_COUNTABLE_TEXT_RE.findall(text.lower()))

353
src/voice/xfyun_asr.py Normal file
View File

@@ -0,0 +1,353 @@
from __future__ import annotations
import asyncio
import base64
import hashlib
import hmac
import json
import os
from collections.abc import AsyncGenerator
from datetime import datetime, timezone
from email.utils import format_datetime
from typing import Any
from urllib.parse import urlencode, urlparse
from loguru import logger
from pipecat.frames.frames import (
CancelFrame,
EndFrame,
Frame,
InterimTranscriptionFrame,
TranscriptionFrame,
UserStoppedSpeakingFrame,
VADUserStartedSpeakingFrame,
)
from pipecat.processors.frame_processor import FrameDirection
from pipecat.services.settings import STTSettings
from pipecat.services.stt_service import STTService
from pipecat.transcriptions.language import Language
from pipecat.utils.time import time_now_iso8601
from websockets.asyncio.client import connect as websocket_connect
from websockets.protocol import State
DEFAULT_XFYUN_ASR_URL = "wss://iat-api.xfyun.cn/v2/iat"
class XfyunASRService(STTService):
"""iFlytek/Xfyun streaming voice dictation service for Pipecat."""
def __init__(
self,
*,
app_id: str,
api_key: str,
api_secret: str,
url: str | None = None,
language: str = "zh_cn",
domain: str = "iat",
accent: str = "mandarin",
sample_rate: int = 16000,
encoding: str = "raw",
frame_size: int = 1280,
open_timeout: float = 10.0,
dynamic_correction: bool = False,
**kwargs,
) -> None:
super().__init__(
sample_rate=sample_rate,
settings=STTSettings(model=None, language=language),
**kwargs,
)
self._app_id = app_id or os.environ.get("XFYUN_APP_ID", "")
self._api_key = api_key or os.environ.get("XFYUN_API_KEY", "")
self._api_secret = api_secret or os.environ.get("XFYUN_API_SECRET", "")
self._url = url or DEFAULT_XFYUN_ASR_URL
self._language = language
self._domain = domain
self._accent = accent
self._encoding = encoding
self._frame_size = frame_size
self._open_timeout = open_timeout
self._dynamic_correction = dynamic_correction
self._websocket = None
self._receive_task = None
self._audio_buffer = bytearray()
self._sent_first_frame = False
self._sent_final_frame = False
self._finalizing_turn = False
self._partials: list[str] = []
self._last_text = ""
async def cleanup(self) -> None:
await self._close_utterance()
await super().cleanup()
async def stop(self, frame: EndFrame) -> None:
await self._close_utterance()
await super().stop(frame)
async def cancel(self, frame: CancelFrame) -> None:
await self._close_utterance()
await super().cancel(frame)
async def process_frame(self, frame: Frame, direction: FrameDirection) -> None:
await super().process_frame(frame, direction)
if isinstance(frame, UserStoppedSpeakingFrame):
# Aggregator-level turn end (broadcast once per logical user turn).
# This is the only boundary that finalizes/closes the xfyun
# websocket, so brief VAD pauses do not restart the ASR session.
await self._finish_utterance()
elif isinstance(frame, VADUserStartedSpeakingFrame):
await self._start_utterance()
async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame | None, None]:
if not audio:
yield None
return
if not self._websocket or self._websocket.state is not State.OPEN:
await self._start_utterance()
self._audio_buffer.extend(audio)
await self._flush_audio_buffer(final=False)
yield None
async def _start_utterance(self) -> None:
if self._websocket and self._websocket.state is State.OPEN:
return
if not self._app_id or not self._api_key or not self._api_secret:
await self.push_error("Xfyun ASR requires app_id, api_key, and api_secret")
return
if self.sample_rate not in (8000, 16000):
await self.push_error("Xfyun ASR sample rate must be 8000 or 16000")
return
self._audio_buffer.clear()
self._partials = []
self._last_text = ""
self._sent_first_frame = False
self._sent_final_frame = False
auth_url = _build_auth_url(self._url, self._api_key, self._api_secret)
try:
self._websocket = await websocket_connect(
auth_url,
max_size=None,
open_timeout=self._open_timeout,
)
except Exception as exc:
await self.push_error(f"Xfyun ASR connection failed: {exc}", exception=exc)
self._websocket = None
return
self._receive_task = self.create_task(
self._receive_messages(),
name="xfyun_asr_receive",
)
async def _finish_utterance(self) -> None:
if not self._websocket or self._websocket.state is not State.OPEN:
return
await self._flush_audio_buffer(final=True)
if not self._sent_first_frame:
await self._close_utterance()
return
if not self._sent_final_frame:
self._finalizing_turn = True
await self._send_payload({"data": {"status": 2}})
self.request_finalize()
self._sent_final_frame = True
async def _close_utterance(self) -> None:
current_task = asyncio.current_task()
if self._receive_task and self._receive_task is not current_task:
await self.cancel_task(self._receive_task)
self._receive_task = None
websocket = self._websocket
self._websocket = None
if websocket and websocket.state is State.OPEN:
try:
await websocket.close()
except Exception:
pass
self._audio_buffer.clear()
self._sent_first_frame = False
self._sent_final_frame = False
self._finalizing_turn = False
async def _flush_audio_buffer(self, *, final: bool) -> None:
while len(self._audio_buffer) >= self._frame_size:
chunk = bytes(self._audio_buffer[: self._frame_size])
del self._audio_buffer[: self._frame_size]
await self._send_audio_chunk(chunk, status=1)
if final and self._audio_buffer:
chunk = bytes(self._audio_buffer)
self._audio_buffer.clear()
await self._send_audio_chunk(chunk, status=1)
async def _send_audio_chunk(self, audio: bytes, *, status: int) -> None:
if not audio:
return
if not self._sent_first_frame:
business = {
"language": self._language,
"domain": self._domain,
"accent": self._accent,
}
if self._dynamic_correction:
business["dwa"] = "wpgs"
payload = {
"common": {"app_id": self._app_id},
"business": business,
"data": {
"status": 0,
"format": f"audio/L16;rate={self.sample_rate}",
"encoding": self._encoding,
"audio": base64.b64encode(audio).decode("utf-8"),
},
}
self._sent_first_frame = True
else:
payload = {
"data": {
"status": status,
"format": f"audio/L16;rate={self.sample_rate}",
"encoding": self._encoding,
"audio": base64.b64encode(audio).decode("utf-8"),
}
}
await self._send_payload(payload)
async def _send_payload(self, payload: dict[str, Any]) -> None:
if not self._websocket or self._websocket.state is not State.OPEN:
return
await self._websocket.send(json.dumps(payload, ensure_ascii=False))
async def _receive_messages(self) -> None:
websocket = self._websocket
if not websocket:
return
try:
async for message in websocket:
await self._process_response(json.loads(message))
except Exception as exc:
if self._websocket is websocket:
await self.push_error(f"Xfyun ASR receive failed: {exc}", exception=exc)
finally:
if self._websocket is websocket:
self._websocket = None
self._receive_task = None
async def _process_response(self, payload: dict[str, Any]) -> None:
code = payload.get("code", -1)
if code != 0:
message = payload.get("message", "unknown error")
sid = payload.get("sid")
await self.push_error(f"Xfyun ASR error code={code}, sid={sid}, message={message}")
return
data = payload.get("data")
if not isinstance(data, dict):
return
is_final_response = data.get("status") == 2
recognition = data.get("result")
if isinstance(recognition, dict):
text = self._apply_recognition_result(recognition)
if text and text != self._last_text:
self._last_text = text
if not self._finalizing_turn and not is_final_response:
await self.push_frame(
InterimTranscriptionFrame(
text,
self._user_id,
time_now_iso8601(),
_language_or_none(self._language),
result=payload,
)
)
if is_final_response:
final_text = self._last_text
if final_text:
self.confirm_finalize()
await self.push_frame(
TranscriptionFrame(
final_text,
self._user_id,
time_now_iso8601(),
_language_or_none(self._language),
result=payload,
)
)
await self._close_utterance()
def _apply_recognition_result(self, recognition: dict[str, Any]) -> str:
partial = _extract_text_from_result(recognition)
if not partial:
return self._last_text
if self._dynamic_correction and recognition.get("pgs") == "rpl" and recognition.get("rg"):
start, end = recognition["rg"]
if 1 <= start <= len(self._partials):
self._partials[start - 1 : end] = [partial]
else:
logger.debug(f"Ignoring out-of-range Xfyun replacement rg={recognition['rg']}")
else:
self._partials.append(partial)
return "".join(self._partials)
def _extract_text_from_result(result: dict[str, Any]) -> str:
words: list[str] = []
for item in result.get("ws", []):
for candidate in item.get("cw", []):
word = candidate.get("w")
if word:
words.append(word)
return "".join(words)
def _build_auth_url(url: str, api_key: str, api_secret: str) -> str:
parsed = urlparse(url)
host = parsed.netloc
path = parsed.path or "/v2/iat"
date = format_datetime(datetime.now(timezone.utc), usegmt=True)
request_line = f"GET {path} HTTP/1.1"
signature_origin = f"host: {host}\ndate: {date}\n{request_line}"
signature_sha = hmac.new(
api_secret.encode("utf-8"),
signature_origin.encode("utf-8"),
digestmod=hashlib.sha256,
).digest()
signature = base64.b64encode(signature_sha).decode("utf-8")
authorization_origin = (
f'api_key="{api_key}", algorithm="hmac-sha256", '
f'headers="host date request-line", signature="{signature}"'
)
authorization = base64.b64encode(authorization_origin.encode("utf-8")).decode("utf-8")
query = urlencode({"authorization": authorization, "date": date, "host": host})
return f"{url}?{query}"
def _language_or_none(value: str) -> Language | None:
try:
return Language(value)
except ValueError:
return None

257
src/voice/xfyun_tts.py Normal file
View File

@@ -0,0 +1,257 @@
from __future__ import annotations
import base64
import hashlib
import hmac
import json
import os
import re
import unicodedata
from collections.abc import AsyncGenerator, AsyncIterator
from datetime import datetime, timezone
from email.utils import format_datetime
from typing import Any
from urllib.parse import urlencode, urlparse
from loguru import logger
from pipecat.frames.frames import ErrorFrame, Frame
from pipecat.services.settings import TTSSettings
from pipecat.services.tts_service import TTSService
from websockets.asyncio.client import connect
DEFAULT_XFYUN_TTS_URL = "wss://tts-api.xfyun.cn/v2/tts"
# Strip characters Xfyun's online TTS cannot synthesize. The engine silently
# rejects (or returns empty audio for) text containing emoji and other
# non-BMP symbols, which surfaces as "request finished without audio data".
_EMOJI_AND_SYMBOL_RE = re.compile(
"["
"\U0001F300-\U0001FAFF" # misc pictographs, emoji, symbols, transport, etc.
"\U00002600-\U000027BF" # misc symbols and dingbats
"\U0001F1E6-\U0001F1FF" # regional indicators (flags)
"\uFE00-\uFE0F" # variation selectors
"\u200D" # zero-width joiner
"]",
flags=re.UNICODE,
)
class XfyunTTSService(TTSService):
"""iFlytek/Xfyun online TTS service for Pipecat.
Xfyun's API is not OpenAI-compatible. It uses a signed WebSocket URL,
receives one JSON request per synthesis, and streams text WebSocket
messages containing base64-encoded audio chunks. This service requests
raw PCM so the chunks can become Pipecat audio frames without MP3 decode.
"""
def __init__(
self,
*,
app_id: str,
api_key: str,
api_secret: str,
voice: str,
url: str | None = None,
sample_rate: int = 16000,
source_sample_rate: int = 16000,
encoding: str = "raw",
text_encoding: str = "UTF8",
speed: int = 50,
volume: int = 50,
pitch: int = 50,
timeout: float = 30.0,
**kwargs,
) -> None:
super().__init__(
sample_rate=sample_rate,
settings=TTSSettings(model=None, voice=voice, language=None),
**kwargs,
)
self._app_id = app_id or os.environ.get("XFYUN_APP_ID", "")
self._api_key = api_key or os.environ.get("XFYUN_API_KEY", "")
self._api_secret = api_secret or os.environ.get("XFYUN_API_SECRET", "")
self._voice = voice
self._url = url or DEFAULT_XFYUN_TTS_URL
self._source_sample_rate = source_sample_rate
self._encoding = encoding
self._text_encoding = text_encoding
self._speed = speed
self._volume = volume
self._pitch = pitch
self._timeout = timeout
self._last_failure_detail: str | None = None
async def run_tts(self, text: str, context_id: str) -> AsyncGenerator[Frame, None]:
if not text:
return
if not self._app_id or not self._api_key or not self._api_secret:
yield ErrorFrame(error="Xfyun TTS requires app_id, api_key, and api_secret")
return
sanitized = _sanitize_text_for_tts(text)
if not sanitized:
logger.debug(
f"{self}: skipping Xfyun TTS, text became empty after sanitization "
f"(original={text!r})"
)
return
if sanitized != text:
logger.debug(
f"{self}: sanitized Xfyun TTS text "
f"(original={text!r}, sanitized={sanitized!r})"
)
if len(sanitized.encode("utf-8")) >= 8000:
yield ErrorFrame(error="Xfyun TTS text must be less than 8000 UTF-8 bytes")
return
if self._encoding != "raw":
yield ErrorFrame(error="Xfyun TTS is configured for PCM output; set aue/encoding to raw")
return
try:
await self.start_tts_usage_metrics(sanitized)
first_frame = True
async for frame in self._stream_audio_frames_from_iterator(
self._iter_audio_chunks(sanitized),
in_sample_rate=self._source_sample_rate,
context_id=context_id,
):
if first_frame:
await self.stop_ttfb_metrics()
first_frame = False
yield frame
if first_frame:
detail = self._last_failure_detail or "no audio frames received"
yield ErrorFrame(
error=(
f"Xfyun TTS request finished without audio data ({detail}); "
f"text={sanitized!r}"
)
)
except Exception as exc:
yield ErrorFrame(error=f"Xfyun TTS request failed: {exc}")
async def _iter_audio_chunks(self, text: str) -> AsyncIterator[bytes]:
request = self._build_request_frame(text)
auth_url = _build_auth_url(self._url, self._api_key, self._api_secret)
self._last_failure_detail = None
frames_received = 0
audio_bytes_received = 0
last_status: int | None = None
last_sid: str | None = None
saw_status_2 = False
async with connect(auth_url, max_size=None, open_timeout=self._timeout) as websocket:
await websocket.send(json.dumps(request, ensure_ascii=False))
async for raw_message in websocket:
frames_received += 1
payload = json.loads(raw_message)
code = payload.get("code", -1)
sid = payload.get("sid")
if sid:
last_sid = sid
if code != 0:
err_msg = payload.get("message", "unknown error")
raise RuntimeError(f"code={code}, sid={sid}, message={err_msg}")
data = payload.get("data")
if not isinstance(data, dict):
continue
last_status = data.get("status", last_status)
audio_b64 = data.get("audio")
if audio_b64:
audio_bytes = base64.b64decode(audio_b64)
audio_bytes_received += len(audio_bytes)
yield audio_bytes
if data.get("status") == 2:
saw_status_2 = True
break
if audio_bytes_received == 0:
self._last_failure_detail = (
f"frames={frames_received}, audio_bytes=0, "
f"last_status={last_status}, saw_status_2={saw_status_2}, sid={last_sid}"
)
logger.warning(
f"{self}: Xfyun TTS produced no audio ({self._last_failure_detail})"
)
def _build_request_frame(self, text: str) -> dict[str, Any]:
business: dict[str, Any] = {
"aue": self._encoding,
"auf": f"audio/L16;rate={self._source_sample_rate}",
"vcn": self._voice,
"speed": self._speed,
"volume": self._volume,
"pitch": self._pitch,
"tte": self._text_encoding,
}
return {
"common": {"app_id": self._app_id},
"business": business,
"data": {
"status": 2,
"text": base64.b64encode(text.encode("utf-8")).decode("utf-8"),
},
}
def _sanitize_text_for_tts(text: str) -> str:
"""Strip characters Xfyun's online TTS cannot synthesize.
The Xfyun ``/v2/tts`` engine silently drops or rejects emoji, pictographs,
dingbats, regional-indicator flags, variation selectors, and zero-width
joiners. When such characters appear in the input the synthesis can
finish without any audio data ("Xfyun TTS request finished without audio
data"). We also drop control characters (other than common whitespace)
and "Symbol, Other" codepoints, then collapse runs of whitespace.
"""
if not text:
return text
cleaned = _EMOJI_AND_SYMBOL_RE.sub("", text)
filtered: list[str] = []
for ch in cleaned:
category = unicodedata.category(ch)
if category == "So":
continue
if category.startswith("C") and ch not in ("\n", "\r", "\t"):
continue
filtered.append(ch)
return re.sub(r"\s+", " ", "".join(filtered)).strip()
def _build_auth_url(url: str, api_key: str, api_secret: str) -> str:
parsed = urlparse(url)
host = parsed.netloc
path = parsed.path or "/v2/tts"
date = format_datetime(datetime.now(timezone.utc), usegmt=True)
request_line = f"GET {path} HTTP/1.1"
signature_origin = f"host: {host}\ndate: {date}\n{request_line}"
signature_sha = hmac.new(
api_secret.encode("utf-8"),
signature_origin.encode("utf-8"),
digestmod=hashlib.sha256,
).digest()
signature = base64.b64encode(signature_sha).decode("utf-8")
authorization_origin = (
f'api_key="{api_key}", algorithm="hmac-sha256", '
f'headers="host date request-line", signature="{signature}"'
)
authorization = base64.b64encode(authorization_origin.encode("utf-8")).decode("utf-8")
query = urlencode({"authorization": authorization, "date": date, "host": host})
return f"{url}?{query}"

106
static/voice-demo/README.md Normal file
View File

@@ -0,0 +1,106 @@
# Webpage Example — Realtime Voice Chat
A self-contained browser client for the engine's product websocket
(`/ws-product`, protocol `va.ws.v1`).
## Features
- **Connect / Disconnect** to any `ws://` or `wss://` URL.
- **Microphone selector + mic on/off toggle** — available input devices
are listed with `enumerateDevices`, and getUserMedia is requested with
`echoCancellation`, `noiseSuppression`, and `autoGainControl` so the
browser handles AEC against the bot's voice.
- **Text composer** — type a message and press <kbd>Enter</kbd> to send
an `input.text` event (Shift+Enter for newline). Sending interrupts
any in-flight bot audio so the next reply is heard cleanly.
- **Chat history** rendered from `input.transcript.final` (you, when
spoken), streamed `response.text.delta` / `response.text.final`
(assistant — deltas arrive ahead of the synthesized audio), and locally
for text you submit (the engine doesn't echo text input back as a
transcript).
- **WebSocket log** panel for connection state and compact send/receive
events. Audio chunks are summarized so the UI does not flood.
- **Gapless TTS playback** by scheduling each `response.audio.delta`
chunk back-to-back on the AudioContext.
- **Live VU meter** + mic and bot activity indicators.
- **Clear** button to reset history.
No build step, no dependencies — just three files plus an AudioWorklet.
## Layout
```text
examples/webpage/
├── index.html
├── styles.css
├── app.js
└── pcm-recorder.worklet.js
```
## Run
1. Start the engine (default port `8000`):
```bash
cd AI-VideoAssistant-engine-v5-pipecat-minimal
source .venv/bin/activate
export OPENAI_API_KEY=...
uvicorn engine.main:app --host 127.0.0.1 --port 8000
```
2. Open the demo page served by the same process:
```text
http://127.0.0.1:8000/demo/
```
The default websocket URL is derived from the page host
(`ws://127.0.0.1:8000/ws-product`). Click **Connect**, pick a
microphone if needed, click **Enable mic**, and start speaking.
Mount path and on/off are controlled in `config.json`:
```json
"server": {
"serve_webpage": true,
"webpage_mount": "/demo"
}
```
Set `"serve_webpage": false` in production if you serve the UI elsewhere.
### Standalone static server (optional)
You can still serve the files from another port for UI-only iteration.
Add that origin to `server.cors_origins` in `config.json` if needed:
```bash
cd AI-VideoAssistant-engine-v5-pipecat-minimal/examples/webpage
python -m http.server 8080
```
Then open <http://localhost:8080> and point the URL field at
`ws://127.0.0.1:8000/ws-product`.
> The browser's mic API requires a secure context. `http://localhost`
> qualifies; if you serve from another host, use HTTPS and a `wss://`
> URL.
## Audio details
- Input: mono Float32 from `getUserMedia` is resampled in the
AudioWorklet to PCM16 mono @ 16 kHz, framed into 20 ms chunks, and
sent as **binary** websocket messages (the server accepts either
binary or the JSON+base64 form).
- Output: each `response.audio.delta` carries base64-encoded PCM16 @
16 kHz; chunks are decoded and scheduled back-to-back through Web
Audio. The browser handles resampling to the device rate.
## Notes
- Use headphones if you still hear echo despite browser AEC; the bot's
voice leaking back into the open mic is the most common cause of
feedback loops.
- The engine's session has an inactivity timeout
(`session.inactivity_timeout_sec` in `config.json`). If the bot
doesn't respond after a long silence, reconnect.

899
static/voice-demo/app.js Normal file
View File

@@ -0,0 +1,899 @@
/**
* Minimal browser client for the AI VideoAssistant engine's product
* websocket (`/ws-product`, protocol `va.ws.v1`).
*
* Responsibilities:
* - Open/close the websocket and run the session handshake.
* - List/select microphones and capture mic audio with browser AEC enabled.
* - Downsample to PCM16 mono @ 16 kHz in an AudioWorklet and stream frames
* as binary websocket messages.
* - Play `response.audio.delta` frames gaplessly through Web Audio.
* - Render a chat-style history of user transcripts and bot text deltas.
*/
const SAMPLE_RATE = 16000;
const CHANNELS = 1;
const FRAME_MS = 20;
const PROTOCOL = "va.ws.v1";
const MAX_WS_LOG_LINES = 120;
const AUDIO_DELTA_LOG_INTERVAL_MS = 1000;
function defaultWsUrl() {
const scheme = location.protocol === "https:" ? "wss:" : "ws:";
return `${scheme}//${location.host}/ws-product`;
}
const els = {
url: document.getElementById("ws-url"),
connectBtn: document.getElementById("connect-btn"),
statusDot: document.getElementById("status-dot"),
statusText: document.getElementById("status-text"),
chatLog: document.getElementById("chat-log"),
micBtn: document.getElementById("mic-btn"),
micSelect: document.getElementById("mic-select"),
micLabel: document.querySelector(".mic-btn__label"),
micIndicator: document.getElementById("mic-indicator"),
botIndicator: document.getElementById("bot-indicator"),
clearBtn: document.getElementById("clear-btn"),
clearWsLogBtn: document.getElementById("clear-ws-log-btn"),
wsLog: document.getElementById("ws-log"),
meterFill: document.getElementById("meter-fill"),
composer: document.getElementById("composer"),
textInput: document.getElementById("text-input"),
sendBtn: document.getElementById("send-btn"),
};
const state = {
ws: null,
connected: false,
connecting: false,
audioContext: null,
micStream: null,
micSourceNode: null,
recorderNode: null,
micEnabled: false,
micDevices: [],
selectedMicDeviceId: "",
// Output scheduling.
nextPlaybackTime: 0,
playbackEndsAt: 0,
scheduledSources: [],
botActive: false,
botUiTimer: null,
// Chat state.
currentAssistantBubble: null,
// VU meter smoothing.
meterLevel: 0,
// Compact websocket logging.
audioDeltaLogCount: 0,
audioDeltaLogBytes: 0,
lastAudioDeltaLogAt: 0,
audioSendLogCount: 0,
audioSendLogBytes: 0,
lastAudioSendLogAt: 0,
};
/* ------------------------------------------------------------------ UI */
function setStatus(kind, text) {
els.statusDot.className = `status__dot status__dot--${kind}`;
els.statusText.textContent = text;
}
function setConnectButton() {
if (state.connecting) {
els.connectBtn.textContent = "Connecting…";
els.connectBtn.disabled = true;
els.connectBtn.classList.remove("is-disconnect");
} else if (state.connected) {
els.connectBtn.textContent = "Disconnect";
els.connectBtn.disabled = false;
els.connectBtn.classList.add("is-disconnect");
} else {
els.connectBtn.textContent = "Connect";
els.connectBtn.disabled = false;
els.connectBtn.classList.remove("is-disconnect");
}
}
function setMicButton() {
els.micBtn.disabled = !state.connected;
els.micBtn.setAttribute("aria-pressed", state.micEnabled ? "true" : "false");
els.micBtn.title = state.micEnabled ? "Mute mic" : "Unmute mic";
els.micLabel.textContent = state.micEnabled ? "Mute mic" : "Enable mic";
els.micIndicator.classList.toggle("is-active", state.micEnabled);
}
function setMicSelectEnabled() {
els.micSelect.disabled = !state.connected || !navigator.mediaDevices;
}
function setComposerEnabled(enabled) {
els.textInput.disabled = !enabled;
els.sendBtn.disabled = !enabled || els.textInput.value.trim().length === 0;
}
function setBotIndicator(active) {
els.botIndicator.classList.toggle("is-active", active);
}
function addBubble(role, text) {
if (els.chatLog.querySelector(".chat__empty")) {
els.chatLog.innerHTML = "";
}
const bubble = document.createElement("div");
bubble.className = `bubble bubble--${role}`;
if (role !== "system") {
const tag = document.createElement("span");
tag.className = "bubble__role";
tag.textContent = role === "user" ? "You" : "Assistant";
bubble.appendChild(tag);
}
const body = document.createElement("span");
body.className = "bubble__text";
body.textContent = text;
bubble.appendChild(body);
els.chatLog.appendChild(bubble);
scrollChatToBottom();
return bubble;
}
function appendToBubble(bubble, text) {
const body = bubble.querySelector(".bubble__text");
body.textContent += text;
scrollChatToBottom();
}
function scrollChatToBottom() {
els.chatLog.scrollTop = els.chatLog.scrollHeight;
}
function clearChat() {
els.chatLog.innerHTML = "";
state.currentAssistantBubble = null;
const empty = document.createElement("div");
empty.className = "chat__empty";
empty.innerHTML = "<p>Chat cleared.</p>";
els.chatLog.appendChild(empty);
}
function truncateLogValue(value, maxLength = 160) {
const text = String(value);
if (text.length <= maxLength) return text;
return `${text.slice(0, maxLength - 1)}`;
}
function compactWsPayload(payload) {
if (!payload || typeof payload !== "object") return String(payload);
const compact = { ...payload };
if (typeof compact.audio === "string") {
compact.audio = `<base64 ${compact.audio.length} chars>`;
}
if (typeof compact.data === "string" && compact.data.length > 160) {
compact.data = `<string ${compact.data.length} chars>`;
}
if (typeof compact.text === "string") {
compact.text = truncateLogValue(compact.text);
}
try {
return JSON.stringify(compact);
} catch (_) {
return payload.type || "unserializable websocket payload";
}
}
function addWsLog(direction, detail, kind = direction) {
if (els.wsLog.querySelector(".ws-log__empty")) {
els.wsLog.innerHTML = "";
}
const entry = document.createElement("div");
entry.className = `ws-log__entry ws-log__entry--${kind}`;
const time = document.createElement("span");
time.className = "ws-log__time";
time.textContent = new Date().toLocaleTimeString([], {
hour12: false,
hour: "2-digit",
minute: "2-digit",
second: "2-digit",
});
const dir = document.createElement("span");
dir.className = "ws-log__direction";
dir.textContent =
direction === "send"
? "SEND"
: direction === "recv"
? "RECV"
: direction.toUpperCase();
const body = document.createElement("span");
body.className = "ws-log__detail";
body.textContent = detail;
entry.append(time, dir, body);
els.wsLog.appendChild(entry);
while (els.wsLog.children.length > MAX_WS_LOG_LINES) {
els.wsLog.firstElementChild.remove();
}
els.wsLog.scrollTop = els.wsLog.scrollHeight;
}
function flushAudioDeltaLog() {
if (state.audioDeltaLogCount === 0) return;
addWsLog(
"recv",
`response.audio.delta x${state.audioDeltaLogCount} (${state.audioDeltaLogBytes} bytes)`,
);
state.audioDeltaLogCount = 0;
state.audioDeltaLogBytes = 0;
state.lastAudioDeltaLogAt = performance.now();
}
function flushAudioSendLog() {
if (state.audioSendLogCount === 0) return;
addWsLog(
"send",
`input.audio binary x${state.audioSendLogCount} (${state.audioSendLogBytes} bytes)`,
);
state.audioSendLogCount = 0;
state.audioSendLogBytes = 0;
state.lastAudioSendLogAt = performance.now();
}
function flushPendingWsLogs() {
flushAudioDeltaLog();
flushAudioSendLog();
}
function logWsPayload(direction, payload) {
if (direction === "send") {
flushAudioSendLog();
} else {
flushAudioDeltaLog();
}
if (direction === "recv" && payload?.type === "response.audio.delta") {
state.audioDeltaLogCount += 1;
state.audioDeltaLogBytes += payload.bytes || payload.audio?.length || 0;
const now = performance.now();
if (now - state.lastAudioDeltaLogAt >= AUDIO_DELTA_LOG_INTERVAL_MS) {
flushAudioDeltaLog();
}
return;
}
addWsLog(direction, compactWsPayload(payload));
}
function logBinarySend(byteLength) {
state.audioSendLogCount += 1;
state.audioSendLogBytes += byteLength;
const now = performance.now();
if (now - state.lastAudioSendLogAt >= AUDIO_DELTA_LOG_INTERVAL_MS) {
flushAudioSendLog();
}
}
function wsSend(data) {
if (!state.ws || state.ws.readyState !== WebSocket.OPEN) return false;
if (typeof data === "string") {
try {
logWsPayload("send", JSON.parse(data));
} catch (_) {
flushAudioSendLog();
flushAudioDeltaLog();
addWsLog("send", truncateLogValue(data));
}
} else {
const byteLength =
data instanceof ArrayBuffer
? data.byteLength
: ArrayBuffer.isView(data)
? data.byteLength
: 0;
if (byteLength > 0) {
logBinarySend(byteLength);
}
}
state.ws.send(data);
return true;
}
function clearWsLog() {
state.audioDeltaLogCount = 0;
state.audioDeltaLogBytes = 0;
state.audioSendLogCount = 0;
state.audioSendLogBytes = 0;
els.wsLog.innerHTML =
'<div class="ws-log__empty">No websocket events yet.</div>';
}
/* ---------------------------------------------------------------- Audio */
async function ensureAudioContext() {
if (!state.audioContext) {
const Ctx = window.AudioContext || window.webkitAudioContext;
state.audioContext = new Ctx();
await state.audioContext.audioWorklet.addModule("./pcm-recorder.worklet.js");
}
if (state.audioContext.state === "suspended") {
await state.audioContext.resume();
}
return state.audioContext;
}
function renderMicDevices() {
const previousValue = state.selectedMicDeviceId || els.micSelect.value;
els.micSelect.innerHTML = "";
const defaultOption = document.createElement("option");
defaultOption.value = "";
defaultOption.textContent = "Default microphone";
els.micSelect.appendChild(defaultOption);
state.micDevices.forEach((device, index) => {
const option = document.createElement("option");
option.value = device.deviceId;
option.textContent = device.label || `Microphone ${index + 1}`;
els.micSelect.appendChild(option);
});
const hasPrevious = state.micDevices.some(
(device) => device.deviceId === previousValue,
);
state.selectedMicDeviceId = hasPrevious ? previousValue : "";
els.micSelect.value = state.selectedMicDeviceId;
setMicSelectEnabled();
}
async function refreshMicDevices() {
if (!navigator.mediaDevices?.enumerateDevices) {
setMicSelectEnabled();
return;
}
try {
const devices = await navigator.mediaDevices.enumerateDevices();
state.micDevices = devices.filter((device) => device.kind === "audioinput");
renderMicDevices();
} catch (err) {
console.warn("Could not enumerate microphones", err);
setMicSelectEnabled();
}
}
async function startMic() {
const ctx = await ensureAudioContext();
const audioConstraints = {
echoCancellation: true,
noiseSuppression: true,
autoGainControl: true,
channelCount: 1,
};
if (state.selectedMicDeviceId) {
audioConstraints.deviceId = { exact: state.selectedMicDeviceId };
}
state.micStream = await navigator.mediaDevices.getUserMedia({
audio: audioConstraints,
video: false,
});
await refreshMicDevices();
state.micSourceNode = ctx.createMediaStreamSource(state.micStream);
state.recorderNode = new AudioWorkletNode(ctx, "pcm-recorder", {
numberOfInputs: 1,
numberOfOutputs: 0,
channelCount: 1,
processorOptions: {
targetSampleRate: SAMPLE_RATE,
frameMs: FRAME_MS,
},
});
state.recorderNode.port.onmessage = (event) => {
const data = event.data;
if (!data || data.type !== "frame") return;
updateMeter(data.rms || 0);
if (state.connected) {
wsSend(data.buffer);
}
};
state.micSourceNode.connect(state.recorderNode);
state.micEnabled = true;
addWsLog("system", "mic capture started (binary input.audio frames)");
setMicButton();
}
function stopMic() {
const wasEnabled = state.micEnabled;
if (state.recorderNode) {
try {
state.recorderNode.port.onmessage = null;
state.recorderNode.disconnect();
} catch (_) {
/* ignore */
}
state.recorderNode = null;
}
if (state.micSourceNode) {
try {
state.micSourceNode.disconnect();
} catch (_) {
/* ignore */
}
state.micSourceNode = null;
}
if (state.micStream) {
for (const track of state.micStream.getTracks()) {
try {
track.stop();
} catch (_) {
/* ignore */
}
}
state.micStream = null;
}
state.micEnabled = false;
updateMeter(0);
if (wasEnabled) {
flushAudioSendLog();
addWsLog("system", "mic capture stopped");
}
setMicButton();
}
function updateMeter(rms) {
// Smooth and convert to a 0..100 width. RMS ~0.3+ is loud speech.
const target = Math.min(1, rms * 2.4);
state.meterLevel = state.meterLevel * 0.5 + target * 0.5;
els.meterFill.style.width = `${Math.round(state.meterLevel * 100)}%`;
}
/* ---------------------------------------------------- Bot audio playback */
function schedulePlayback(int16) {
const ctx = state.audioContext;
if (!ctx) return;
const float32 = new Float32Array(int16.length);
for (let i = 0; i < int16.length; i++) {
float32[i] = int16[i] / (int16[i] < 0 ? 0x8000 : 0x7fff);
}
const buffer = ctx.createBuffer(CHANNELS, float32.length, SAMPLE_RATE);
buffer.copyToChannel(float32, 0);
const src = ctx.createBufferSource();
src.buffer = buffer;
src.connect(ctx.destination);
const now = ctx.currentTime;
// Schedule immediately after the previously scheduled chunk to keep
// playback contiguous, with a tiny safety margin if we fell behind.
const startAt = Math.max(now + 0.02, state.nextPlaybackTime);
src.start(startAt);
state.nextPlaybackTime = startAt + buffer.duration;
state.playbackEndsAt = state.nextPlaybackTime;
src.onended = () => {
const idx = state.scheduledSources.indexOf(src);
if (idx >= 0) state.scheduledSources.splice(idx, 1);
};
state.scheduledSources.push(src);
setBotIndicator(true);
if (state.botUiTimer) clearTimeout(state.botUiTimer);
const msUntilEnd = Math.max(0, (state.playbackEndsAt - now) * 1000) + 120;
state.botUiTimer = setTimeout(() => {
if (state.audioContext &&
state.audioContext.currentTime >= state.playbackEndsAt - 0.01) {
setBotIndicator(false);
}
}, msUntilEnd);
}
function stopPlaybackQueue() {
for (const src of state.scheduledSources) {
try {
src.onended = null;
src.stop();
src.disconnect();
} catch (_) {
/* already stopped */
}
}
state.scheduledSources = [];
resetPlaybackClock();
if (state.botUiTimer) {
clearTimeout(state.botUiTimer);
state.botUiTimer = null;
}
setBotIndicator(false);
}
function resetPlaybackClock() {
if (state.audioContext) {
state.nextPlaybackTime = state.audioContext.currentTime;
state.playbackEndsAt = state.audioContext.currentTime;
}
}
/* --------------------------------------------------------- Chat updates */
function handleUserTranscript(text) {
if (!text) return;
state.currentAssistantBubble = null;
addBubble("user", text);
}
function sendText(text) {
const value = (text || "").trim();
if (!value) return false;
if (!state.ws || state.ws.readyState !== WebSocket.OPEN) return false;
const message = {
type: "input.text",
text: value,
interrupt: true,
};
// The engine does not echo text input back as a transcript event, so we
// render the user bubble locally. Also interrupt any in-flight bot audio
// so the next reply is heard cleanly. We deliberately do NOT clear
// `currentAssistantBubble` here — the engine will emit a
// `response.text.final(interrupted=true)` for the in-flight assistant
// turn, which finalizes that bubble in place. A brand-new bubble for the
// reply will be created when `response.text.started` arrives.
wsSend(JSON.stringify(message));
stopPlaybackQueue();
addBubble("user", value);
return true;
}
function handleAssistantDelta(text) {
if (!text) return;
if (!state.currentAssistantBubble) {
state.currentAssistantBubble = addBubble("assistant", "");
}
appendToBubble(state.currentAssistantBubble, text);
}
function handleAssistantStarted() {
state.currentAssistantBubble = null;
}
function handleAssistantFinal(text, interrupted) {
if (!text) {
state.currentAssistantBubble = null;
return;
}
if (state.currentAssistantBubble) {
const body = state.currentAssistantBubble.querySelector(".bubble__text");
body.textContent = text;
} else {
state.currentAssistantBubble = addBubble("assistant", text);
}
if (interrupted) {
state.currentAssistantBubble.classList.add("bubble--interrupted");
}
state.currentAssistantBubble = null;
scrollChatToBottom();
}
function finalizeAssistantBubble() {
state.currentAssistantBubble = null;
}
/* ---------------------------------------------------------- Websocket IO */
function decodeBase64ToInt16(b64) {
const binary = atob(b64);
const len = binary.length;
const bytes = new Uint8Array(len);
for (let i = 0; i < len; i++) bytes[i] = binary.charCodeAt(i);
return new Int16Array(bytes.buffer, bytes.byteOffset, bytes.byteLength / 2);
}
function handleEvent(event) {
switch (event.type) {
case "response.audio.delta":
if (typeof event.audio === "string") {
schedulePlayback(decodeBase64ToInt16(event.audio));
}
break;
case "response.audio.started":
setBotIndicator(true);
break;
case "response.audio.stopped":
finalizeAssistantBubble();
// The indicator turns off automatically when the playback queue drains.
break;
case "response.text.delta":
handleAssistantDelta(event.text);
break;
case "response.text.started":
handleAssistantStarted();
break;
case "response.text.final":
handleAssistantFinal(event.text, event.interrupted);
break;
case "input.transcript.final":
handleUserTranscript(event.text);
break;
case "input.transcript.interim":
// Ignore partial ASR updates; chat history renders committed user turns.
break;
case "transport.message":
// Reserved for future structured messages; ignore silently.
break;
default:
// Unknown event type: log for debugging.
console.debug("ws event", event);
}
}
async function connect() {
if (state.connected || state.connecting) return;
const url = (els.url.value || "").trim();
if (!url) {
setStatus("error", "Missing URL");
return;
}
state.connecting = true;
setStatus("connecting", "Connecting…");
setConnectButton();
addWsLog("system", `connecting ${url}`);
try {
// Pre-warm audio context on user gesture so playback works on Safari.
await ensureAudioContext();
} catch (err) {
console.error("AudioContext failed", err);
state.connecting = false;
setStatus("error", "Audio init failed");
setConnectButton();
addWsLog("error", `audio init failed: ${err.message || err}`, "error");
return;
}
let ws;
try {
ws = new WebSocket(url);
} catch (err) {
console.error("WebSocket constructor failed", err);
state.connecting = false;
setStatus("error", "Bad URL");
setConnectButton();
addWsLog("error", `bad websocket URL: ${err.message || err}`, "error");
return;
}
ws.binaryType = "arraybuffer";
state.ws = ws;
ws.addEventListener("open", () => {
const startMessage = {
type: "session.start",
protocol: PROTOCOL,
audio: {
encoding: "pcm_s16le",
sample_rate: SAMPLE_RATE,
channels: CHANNELS,
},
};
state.connecting = false;
state.connected = true;
resetPlaybackClock();
addWsLog("system", "websocket open");
setStatus("connected", "Connected");
setConnectButton();
setMicButton();
setMicSelectEnabled();
refreshMicDevices();
wsSend(JSON.stringify(startMessage));
addBubble("system", "Session started.");
setComposerEnabled(true);
els.textInput.focus();
});
ws.addEventListener("message", (event) => {
const data = event.data;
if (typeof data === "string") {
let parsed;
try {
parsed = JSON.parse(data);
} catch (err) {
console.warn("Bad JSON from server", err, data);
addWsLog(
"error",
`invalid JSON from server: ${truncateLogValue(data)}`,
"error",
);
return;
}
logWsPayload("recv", parsed);
handleEvent(parsed);
} else if (data instanceof ArrayBuffer) {
// Server doesn't currently send binary, but handle it just in case.
addWsLog("recv", `binary audio ${data.byteLength} bytes`);
schedulePlayback(new Int16Array(data));
}
});
ws.addEventListener("error", (err) => {
console.error("WebSocket error", err);
setStatus("error", "Connection error");
addWsLog("error", "websocket error", "error");
});
ws.addEventListener("close", (event) => {
const wasConnected = state.connected;
state.ws = null;
state.connected = false;
state.connecting = false;
if (state.micEnabled) stopMic();
stopPlaybackQueue();
setConnectButton();
setMicButton();
setMicSelectEnabled();
setComposerEnabled(false);
setBotIndicator(false);
flushPendingWsLogs();
addWsLog(
"system",
`websocket close code=${event.code}${
event.reason ? ` reason=${event.reason}` : ""
}`,
);
if (wasConnected) {
addBubble(
"system",
`Session ended${event.reason ? `${event.reason}` : ""}.`,
);
setStatus("idle", "Disconnected");
} else {
setStatus("error", "Connection closed");
}
});
}
function disconnect() {
if (!state.ws) return;
try {
if (state.ws.readyState === WebSocket.OPEN) {
const stopMessage = { type: "session.stop", reason: "client_disconnect" };
wsSend(JSON.stringify(stopMessage));
}
} catch (_) {
/* ignore */
}
try {
state.ws.close(1000, "client_disconnect");
} catch (_) {
/* ignore */
}
}
/* ---------------------------------------------------------------- Wiring */
els.connectBtn.addEventListener("click", () => {
if (state.connected) disconnect();
else connect();
});
els.micBtn.addEventListener("click", async () => {
if (!state.connected) return;
els.micBtn.disabled = true;
try {
if (state.micEnabled) {
stopMic();
} else {
await startMic();
}
} catch (err) {
console.error("Mic error", err);
addBubble("system", `Mic error: ${err.message || err}`);
} finally {
els.micBtn.disabled = !state.connected;
}
});
els.micSelect.addEventListener("change", async () => {
state.selectedMicDeviceId = els.micSelect.value;
if (!state.micEnabled) return;
els.micSelect.disabled = true;
els.micBtn.disabled = true;
try {
stopMic();
await startMic();
} catch (err) {
console.error("Mic switch error", err);
addBubble("system", `Mic switch error: ${err.message || err}`);
} finally {
setMicButton();
setMicSelectEnabled();
}
});
if (navigator.mediaDevices?.addEventListener) {
navigator.mediaDevices.addEventListener("devicechange", refreshMicDevices);
}
els.clearBtn.addEventListener("click", () => {
clearChat();
});
els.clearWsLogBtn.addEventListener("click", () => {
clearWsLog();
});
function autosizeTextarea() {
const ta = els.textInput;
ta.style.height = "auto";
ta.style.height = `${Math.min(ta.scrollHeight, 180)}px`;
}
function submitText() {
const value = els.textInput.value;
if (!sendText(value)) return;
els.textInput.value = "";
autosizeTextarea();
setComposerEnabled(state.connected);
}
els.composer.addEventListener("submit", (event) => {
event.preventDefault();
submitText();
});
els.textInput.addEventListener("input", () => {
autosizeTextarea();
setComposerEnabled(state.connected);
});
els.textInput.addEventListener("keydown", (event) => {
if (event.key === "Enter" && !event.shiftKey && !event.isComposing) {
event.preventDefault();
submitText();
}
});
window.addEventListener("beforeunload", () => {
if (state.ws) {
try {
state.ws.close();
} catch (_) {
/* ignore */
}
}
if (state.audioContext) {
try {
state.audioContext.close();
} catch (_) {
/* ignore */
}
}
});
els.url.value = defaultWsUrl();
setStatus("idle", "Disconnected");
setConnectButton();
setMicButton();
setMicSelectEnabled();
setComposerEnabled(false);

View File

@@ -0,0 +1,158 @@
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>VA Voice Chat &mdash; /ws-product</title>
<link rel="stylesheet" href="./styles.css" />
</head>
<body>
<main class="app">
<header class="app__header">
<div class="brand">
<span class="brand__dot" aria-hidden="true"></span>
<h1>VA Voice Chat</h1>
</div>
<div class="connection">
<label class="connection__field">
<span>WebSocket URL</span>
<input
id="ws-url"
type="text"
placeholder="ws://host/ws-product"
spellcheck="false"
autocomplete="off"
/>
</label>
<button id="connect-btn" class="btn btn--primary" type="button">
Connect
</button>
</div>
<div class="status">
<span id="status-dot" class="status__dot status__dot--idle"></span>
<span id="status-text" class="status__text">Disconnected</span>
</div>
</header>
<div class="app__body">
<div class="app__main">
<section class="chat" aria-label="Conversation history">
<div id="chat-log" class="chat__log" role="log" aria-live="polite">
<div class="chat__empty">
<p>Connect to the engine, enable your mic, and start talking.</p>
<p class="chat__hint">
Audio is streamed as PCM16 mono @ 16&nbsp;kHz over
<code>/ws-product</code>.
</p>
</div>
</div>
</section>
<footer class="controls" aria-label="Chat controls">
<div class="meter" aria-hidden="true">
<div id="meter-fill" class="meter__fill"></div>
</div>
<form id="composer" class="composer" autocomplete="off">
<textarea
id="text-input"
class="composer__input"
rows="1"
placeholder="Type a message, or use the mic…"
disabled
></textarea>
<button
id="send-btn"
class="btn btn--primary composer__send"
type="submit"
disabled
title="Send message (Enter)"
>
Send
</button>
</form>
<div class="controls__row">
<label class="device-picker">
<span class="device-picker__label">Microphone</span>
<select id="mic-select" class="device-picker__select" disabled>
<option value="">Default microphone</option>
</select>
</label>
<button
id="mic-btn"
class="mic-btn"
type="button"
disabled
aria-pressed="false"
title="Mic is off"
>
<svg
class="mic-btn__icon"
viewBox="0 0 24 24"
width="24"
height="24"
aria-hidden="true"
>
<path
d="M12 14a3 3 0 0 0 3-3V6a3 3 0 1 0-6 0v5a3 3 0 0 0 3 3Z"
fill="currentColor"
/>
<path
d="M19 11a1 1 0 1 0-2 0 5 5 0 0 1-10 0 1 1 0 1 0-2 0 7 7 0 0 0 6 6.92V21a1 1 0 1 0 2 0v-3.08A7 7 0 0 0 19 11Z"
fill="currentColor"
/>
</svg>
<span class="mic-btn__label">Enable mic</span>
</button>
<div class="indicators">
<span id="mic-indicator" class="indicator">
<span class="indicator__dot indicator__dot--mic"></span>
<span class="indicator__label">Mic</span>
</span>
<span id="bot-indicator" class="indicator">
<span class="indicator__dot indicator__dot--bot"></span>
<span class="indicator__label">Bot</span>
</span>
</div>
<button id="clear-btn" class="btn btn--ghost" type="button">
Clear
</button>
</div>
<p class="hint">
Press <kbd>Enter</kbd> to send, <kbd>Shift</kbd>+<kbd>Enter</kbd>
for newline. Sending text will interrupt the bot if it's speaking.
Browser echo cancellation is on; use headphones if echo persists.
</p>
</footer>
</div>
<section class="ws-log" aria-label="WebSocket log">
<div class="ws-log__header">
<div class="ws-log__header-left">
<h2>WebSocket Log</h2>
<div class="ws-log__legend" aria-hidden="true">
<span class="ws-log__legend-item ws-log__legend-item--send">Send</span>
<span class="ws-log__legend-item ws-log__legend-item--recv">Recv</span>
</div>
</div>
<button id="clear-ws-log-btn" class="btn btn--ghost" type="button">
Clear log
</button>
</div>
<div id="ws-log" class="ws-log__body" role="log" aria-live="polite">
<div class="ws-log__empty">No websocket events yet.</div>
</div>
</section>
</div>
</main>
<script type="module" src="./app.js"></script>
</body>
</html>

View File

@@ -0,0 +1,104 @@
/**
* PCM Recorder AudioWorklet.
*
* Captures mono Float32 mic samples at the AudioContext's native rate,
* resamples them to a target sample rate (default 16 kHz) with linear
* interpolation, then ships PCM16 frames of a fixed duration (default 20 ms)
* to the main thread via `port.postMessage(ArrayBuffer)`.
*
* It also computes a simple RMS level per frame for the UI VU meter so the
* main thread doesn't have to re-process the audio.
*/
class PcmRecorderProcessor extends AudioWorkletProcessor {
constructor(options) {
super();
const opts = (options && options.processorOptions) || {};
this._targetSampleRate = opts.targetSampleRate || 16000;
this._frameMs = opts.frameMs || 20;
this._frameSamples = Math.round(
(this._targetSampleRate * this._frameMs) / 1000,
);
// Resampling state.
// `ratio` is input samples per output sample.
this._ratio = sampleRate / this._targetSampleRate;
this._inputBuffer = new Float32Array(0);
// Float position in `_inputBuffer` for the next output sample.
this._inputOffset = 0;
// Output framing state.
this._frameBuffer = new Int16Array(this._frameSamples);
this._frameIndex = 0;
// VU meter accumulator.
this._rmsSumSquares = 0;
this._rmsCount = 0;
}
process(inputs) {
const input = inputs[0];
if (!input || input.length === 0) return true;
const channel = input[0];
if (!channel || channel.length === 0) return true;
// Append new samples to the input buffer.
const merged = new Float32Array(this._inputBuffer.length + channel.length);
merged.set(this._inputBuffer, 0);
merged.set(channel, this._inputBuffer.length);
this._inputBuffer = merged;
const ratio = this._ratio;
const inLen = this._inputBuffer.length;
let pos = this._inputOffset;
while (pos + 1 < inLen) {
const lo = Math.floor(pos);
const hi = lo + 1;
const w = pos - lo;
const sample =
this._inputBuffer[lo] * (1 - w) + this._inputBuffer[hi] * w;
this._rmsSumSquares += sample * sample;
this._rmsCount += 1;
let s = sample;
if (s > 1) s = 1;
else if (s < -1) s = -1;
this._frameBuffer[this._frameIndex++] =
s < 0 ? Math.round(s * 0x8000) : Math.round(s * 0x7fff);
if (this._frameIndex === this._frameSamples) {
const frame = new Int16Array(this._frameSamples);
frame.set(this._frameBuffer);
const rms =
this._rmsCount > 0
? Math.sqrt(this._rmsSumSquares / this._rmsCount)
: 0;
this.port.postMessage(
{ type: "frame", buffer: frame.buffer, rms },
[frame.buffer],
);
this._frameIndex = 0;
this._rmsSumSquares = 0;
this._rmsCount = 0;
}
pos += ratio;
}
// Trim consumed samples from the input buffer; keep at least the last
// sample we still need to interpolate against on the next call.
const consumed = Math.floor(pos);
if (consumed > 0) {
this._inputBuffer = this._inputBuffer.slice(consumed);
pos -= consumed;
}
this._inputOffset = pos;
return true;
}
}
registerProcessor("pcm-recorder", PcmRecorderProcessor);

View File

@@ -0,0 +1,739 @@
:root {
color-scheme: dark;
--bg: #0b0d12;
--bg-elevated: #131722;
--bg-soft: #1a2030;
--border: #232a3b;
--text: #e6e9f2;
--text-dim: #95a0bb;
--accent: #4f8cff;
--accent-strong: #6aa1ff;
--user: #2f7ff0;
--assistant: #2a3145;
--danger: #ff5577;
--success: #2dd28b;
--warning: #ffb84d;
--radius: 14px;
--shadow: 0 10px 30px rgba(0, 0, 0, 0.35);
font-family: ui-sans-serif, system-ui, -apple-system, "Segoe UI", Roboto,
"Helvetica Neue", Arial, sans-serif;
}
* {
box-sizing: border-box;
}
html,
body {
margin: 0;
padding: 0;
height: 100%;
background: radial-gradient(
1200px 600px at 80% -10%,
rgba(79, 140, 255, 0.18),
transparent 60%
),
radial-gradient(
900px 500px at -10% 110%,
rgba(45, 210, 139, 0.12),
transparent 60%
),
var(--bg);
color: var(--text);
}
body {
display: flex;
justify-content: center;
padding: 12px;
}
.app {
display: grid;
grid-template-rows: auto minmax(0, 1fr);
gap: 12px;
width: min(1440px, 100%);
height: calc(100dvh - 24px);
max-height: calc(100vh - 24px);
background: var(--bg-elevated);
border: 1px solid var(--border);
border-radius: 20px;
box-shadow: var(--shadow);
padding: 14px;
}
.app__body {
display: grid;
grid-template-columns: minmax(0, 1fr) clamp(300px, 32vw, 420px);
gap: 12px;
min-height: 0;
}
.app__main {
display: grid;
grid-template-rows: minmax(0, 1fr) auto;
gap: 0;
min-height: 0;
overflow: hidden;
background: var(--bg);
border: 1px solid var(--border);
border-radius: var(--radius);
}
/* Header ---------------------------------------------------------------- */
.app__header {
display: grid;
grid-template-columns: auto minmax(0, 1fr) auto;
align-items: center;
gap: 12px;
padding-bottom: 10px;
border-bottom: 1px solid var(--border);
}
.brand {
display: flex;
align-items: center;
gap: 10px;
}
.brand h1 {
font-size: 16px;
margin: 0;
letter-spacing: 0.2px;
}
.brand__dot {
width: 10px;
height: 10px;
border-radius: 50%;
background: var(--accent);
box-shadow: 0 0 0 4px rgba(79, 140, 255, 0.18);
}
.connection {
display: flex;
align-items: end;
gap: 8px;
min-width: 0;
}
.connection__field {
display: flex;
flex-direction: column;
gap: 4px;
min-width: 0;
flex: 1;
}
.connection__field span {
font-size: 11px;
color: var(--text-dim);
text-transform: uppercase;
letter-spacing: 0.8px;
}
.connection__field input {
background: var(--bg-soft);
color: var(--text);
border: 1px solid var(--border);
border-radius: 10px;
padding: 7px 10px;
font: inherit;
font-size: 12px;
outline: none;
width: 100%;
min-width: 0;
}
.connection__field input:focus {
border-color: var(--accent);
box-shadow: 0 0 0 3px rgba(79, 140, 255, 0.18);
}
.status {
display: flex;
align-items: center;
gap: 8px;
font-size: 13px;
color: var(--text-dim);
white-space: nowrap;
}
.status__dot {
width: 9px;
height: 9px;
border-radius: 50%;
background: var(--text-dim);
}
.status__dot--idle {
background: var(--text-dim);
}
.status__dot--connecting {
background: var(--warning);
animation: pulse 1.2s ease-in-out infinite;
}
.status__dot--connected {
background: var(--success);
}
.status__dot--error {
background: var(--danger);
}
/* Chat ------------------------------------------------------------------ */
.chat {
overflow: hidden;
display: flex;
flex-direction: column;
min-height: 0;
}
.chat__log {
flex: 1;
overflow-y: auto;
padding: 18px;
display: flex;
flex-direction: column;
gap: 10px;
scroll-behavior: smooth;
}
.chat__empty {
margin: auto;
text-align: center;
color: var(--text-dim);
}
.chat__empty p {
margin: 4px 0;
}
.chat__hint code {
background: var(--bg-soft);
border: 1px solid var(--border);
border-radius: 6px;
padding: 1px 6px;
font-size: 12px;
}
.bubble {
max-width: 78%;
padding: 10px 14px;
border-radius: 14px;
line-height: 1.45;
font-size: 14px;
white-space: pre-wrap;
word-wrap: break-word;
animation: bubble-in 0.16s ease-out;
}
.bubble--user {
background: var(--user);
color: #fff;
align-self: flex-end;
border-bottom-right-radius: 4px;
}
.bubble--assistant {
background: var(--assistant);
color: var(--text);
align-self: flex-start;
border-bottom-left-radius: 4px;
}
.bubble--assistant.bubble--interrupted {
opacity: 0.75;
border-left: 2px solid var(--warning);
}
.bubble--system {
align-self: center;
background: transparent;
color: var(--text-dim);
font-size: 12px;
border: 1px dashed var(--border);
padding: 6px 10px;
border-radius: 999px;
}
.bubble__role {
display: block;
font-size: 10px;
text-transform: uppercase;
letter-spacing: 0.8px;
opacity: 0.7;
margin-bottom: 4px;
}
/* WebSocket log --------------------------------------------------------- */
.ws-log {
display: flex;
flex-direction: column;
min-height: 0;
background: var(--bg);
border: 1px solid var(--border);
border-radius: var(--radius);
overflow: hidden;
}
.ws-log__header {
display: flex;
align-items: center;
justify-content: space-between;
gap: 8px;
padding: 8px 12px;
border-bottom: 1px solid var(--border);
flex-shrink: 0;
background: var(--bg-soft);
}
.ws-log__legend {
display: flex;
align-items: center;
gap: 8px;
font-size: 10px;
color: var(--text-dim);
}
.ws-log__legend-item {
display: inline-flex;
align-items: center;
gap: 4px;
}
.ws-log__legend-item::before {
content: "";
width: 8px;
height: 8px;
border-radius: 2px;
}
.ws-log__legend-item--send::before {
background: var(--success);
}
.ws-log__legend-item--recv::before {
background: var(--accent-strong);
}
.ws-log__header h2 {
margin: 0;
font-size: 12px;
color: var(--text-dim);
text-transform: uppercase;
letter-spacing: 0.8px;
}
.ws-log__header-left {
display: flex;
align-items: center;
gap: 12px;
min-width: 0;
}
.ws-log__body {
flex: 1;
min-height: 0;
overflow-y: auto;
overflow-x: hidden;
padding: 6px 8px;
font-family: ui-monospace, SFMono-Regular, Menlo, monospace;
font-size: 10.5px;
line-height: 1.4;
color: var(--text-dim);
}
.ws-log__entry {
display: grid;
grid-template-columns: 58px 42px minmax(0, 1fr);
gap: 6px;
align-items: start;
padding: 5px 4px;
border-radius: 6px;
white-space: pre-wrap;
word-break: break-word;
}
.ws-log__entry:hover {
background: rgba(255, 255, 255, 0.03);
}
.ws-log__time {
color: var(--text-dim);
opacity: 0.7;
font-size: 10px;
}
.ws-log__direction {
font-weight: 700;
font-size: 9px;
letter-spacing: 0.4px;
text-transform: uppercase;
padding: 2px 0;
}
.ws-log__detail {
min-width: 0;
overflow-wrap: anywhere;
}
.ws-log__entry--send .ws-log__direction {
color: var(--success);
}
.ws-log__entry--recv .ws-log__direction {
color: var(--accent-strong);
}
.ws-log__entry--system .ws-log__direction {
color: var(--text-dim);
}
.ws-log__entry--error .ws-log__direction {
color: var(--danger);
}
.ws-log__empty {
color: var(--text-dim);
opacity: 0.7;
padding: 8px 4px;
}
/* Controls -------------------------------------------------------------- */
.controls {
display: grid;
gap: 8px;
padding: 10px 12px 8px;
border-top: 1px solid var(--border);
background: var(--bg-elevated);
}
.meter {
height: 6px;
background: var(--bg-soft);
border-radius: 999px;
overflow: hidden;
}
.meter__fill {
height: 100%;
width: 0%;
background: linear-gradient(90deg, var(--success), var(--accent));
transition: width 80ms linear;
}
.composer {
display: flex;
align-items: flex-end;
gap: 8px;
}
.composer__input {
flex: 1;
resize: none;
background: var(--bg-soft);
color: var(--text);
border: 1px solid var(--border);
border-radius: 12px;
padding: 10px 12px;
font: inherit;
font-size: 14px;
line-height: 1.4;
outline: none;
min-height: 42px;
max-height: 180px;
overflow-y: auto;
}
.composer__input:focus {
border-color: var(--accent);
box-shadow: 0 0 0 3px rgba(79, 140, 255, 0.18);
}
.composer__input:disabled {
opacity: 0.55;
cursor: not-allowed;
}
.composer__send {
padding: 10px 18px;
border-radius: 12px;
min-width: 84px;
align-self: flex-end;
}
.composer__send:disabled {
opacity: 0.5;
cursor: not-allowed;
}
.controls__row {
display: flex;
align-items: center;
gap: 10px;
flex-wrap: wrap;
}
.device-picker {
display: flex;
flex-direction: column;
gap: 3px;
min-width: 0;
flex: 1 1 160px;
max-width: 240px;
}
.device-picker__label {
font-size: 11px;
color: var(--text-dim);
text-transform: uppercase;
letter-spacing: 0.8px;
}
.device-picker__select {
appearance: none;
background: var(--bg-soft);
color: var(--text);
border: 1px solid var(--border);
border-radius: 10px;
padding: 7px 28px 7px 10px;
font: inherit;
font-size: 12px;
outline: none;
width: 100%;
cursor: pointer;
}
.device-picker__select:focus {
border-color: var(--accent);
box-shadow: 0 0 0 3px rgba(79, 140, 255, 0.18);
}
.device-picker__select:disabled {
opacity: 0.5;
cursor: not-allowed;
}
.mic-btn {
display: inline-flex;
align-items: center;
gap: 6px;
padding: 8px 12px;
border-radius: 999px;
background: var(--bg-soft);
color: var(--text);
border: 1px solid var(--border);
font: inherit;
font-size: 12px;
font-weight: 600;
cursor: pointer;
flex-shrink: 0;
transition: transform 0.08s ease, background 0.15s ease, color 0.15s ease,
border-color 0.15s ease;
}
.mic-btn:disabled {
opacity: 0.5;
cursor: not-allowed;
}
.mic-btn:not(:disabled):hover {
border-color: var(--accent);
}
.mic-btn:not(:disabled):active {
transform: scale(0.98);
}
.mic-btn[aria-pressed="true"] {
background: var(--danger);
border-color: var(--danger);
color: #fff;
box-shadow: 0 0 0 6px rgba(255, 85, 119, 0.18);
}
.mic-btn__icon {
display: block;
}
.indicators {
display: flex;
gap: 12px;
margin-left: auto;
}
.indicator {
display: inline-flex;
align-items: center;
gap: 6px;
font-size: 12px;
color: var(--text-dim);
}
.indicator__dot {
width: 8px;
height: 8px;
border-radius: 50%;
background: var(--bg-soft);
border: 1px solid var(--border);
}
.indicator.is-active .indicator__dot--mic {
background: var(--success);
border-color: var(--success);
box-shadow: 0 0 0 4px rgba(45, 210, 139, 0.2);
}
.indicator.is-active .indicator__dot--bot {
background: var(--accent);
border-color: var(--accent);
box-shadow: 0 0 0 4px rgba(79, 140, 255, 0.22);
animation: pulse 1s ease-in-out infinite;
}
.indicator.is-active .indicator__label {
color: var(--text);
}
.btn {
appearance: none;
border: 1px solid var(--border);
background: var(--bg-soft);
color: var(--text);
padding: 9px 14px;
border-radius: 10px;
font: inherit;
font-size: 13px;
cursor: pointer;
transition: background 0.15s ease, border-color 0.15s ease;
}
.btn:hover {
border-color: var(--accent);
}
.btn--primary {
background: var(--accent);
color: #fff;
border-color: var(--accent);
}
.btn--primary:hover {
background: var(--accent-strong);
border-color: var(--accent-strong);
}
.btn--primary.is-disconnect {
background: transparent;
color: var(--danger);
border-color: var(--danger);
}
.btn--ghost {
background: transparent;
}
.hint {
margin: 0;
font-size: 11px;
color: var(--text-dim);
opacity: 0.85;
}
.hint kbd {
background: var(--bg-soft);
border: 1px solid var(--border);
border-bottom-width: 2px;
border-radius: 4px;
padding: 0 5px;
font-family: ui-monospace, SFMono-Regular, Menlo, monospace;
font-size: 11px;
color: var(--text);
}
/* Animations ------------------------------------------------------------ */
@keyframes pulse {
0%,
100% {
opacity: 1;
}
50% {
opacity: 0.55;
}
}
@keyframes bubble-in {
from {
opacity: 0;
transform: translateY(4px);
}
to {
opacity: 1;
transform: translateY(0);
}
}
/* Responsive ------------------------------------------------------------ */
@media (max-width: 820px) {
.app__body {
grid-template-columns: 1fr;
grid-template-rows: minmax(0, 1fr) min(240px, 32vh);
}
.ws-log__legend {
display: none;
}
.hint {
display: none;
}
}
@media (max-width: 720px) {
body {
padding: 0;
}
.app {
height: 100dvh;
max-height: 100vh;
border-radius: 0;
border: none;
padding: 10px;
}
.app__header {
grid-template-columns: 1fr;
}
.connection {
flex-direction: column;
align-items: stretch;
}
.ws-log__entry {
grid-template-columns: 54px 38px minmax(0, 1fr);
}
.status {
justify-content: flex-end;
}
.device-picker {
flex: 1 1 100%;
max-width: none;
}
.indicators {
margin-left: 0;
}
}