diff --git a/config/voice.json b/config/voice.json new file mode 100644 index 0000000..64f64af --- /dev/null +++ b/config/voice.json @@ -0,0 +1,75 @@ +{ + "server": { + "host": "0.0.0.0", + "port": 8000, + "cors_origins": ["http://localhost:3000", "http://localhost:8080"], + "serve_webpage": true, + "webpage_mount": "/voice-demo" + }, + "audio": { + "sample_rate_hz": 16000, + "channels": 1, + "frame_ms": 20 + }, + "session": { + "inactivity_timeout_sec": 60 + }, + "turn": { + "vad": { + "confidence": 0.7, + "start_secs": 0.2, + "stop_secs": 0.4, + "min_volume": 0.6 + }, + "interruption_min_chars": 3, + "interruption_use_interim": true, + "interruption_short_replies": [ + "是", + "是的", + "对", + "对的", + "嗯", + "好", + "好的", + "行", + "可以", + "没问题", + "不是", + "不", + "不行", + "不用", + "不要", + "没有", + "否" + ], + "user_speech_timeout_sec": 0.8 + }, + "agent": { + "system_prompt": "You are a helpful, friendly voice assistant. Keep responses concise and natural for spoken conversation.", + "greeting": "Please introduce yourself briefly.", + "greeting_mode": "generated" + }, + "services": { + "stt": { + "provider": "openai", + "api_key": "", + "base_url": null, + "model": "gpt-4o-mini-transcribe", + "language": "en" + }, + "llm": { + "provider": "openai", + "api_key": "", + "base_url": null, + "model": "gpt-4o-mini", + "temperature": 0.7 + }, + "tts": { + "provider": "openai", + "api_key": "", + "base_url": null, + "model": "gpt-4o-mini-tts", + "voice": "alloy" + } + } +} diff --git a/requirements.txt b/requirements.txt index 7ab0031..3baac68 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,7 @@ fastapi>=0.104.0 -uvicorn>=0.24.0 +uvicorn[standard]>=0.24.0 +pipecat-ai[websocket,openai,silero] +websockets>=13.1,<16.0 pydantic>=2.4.2 python-dotenv>=1.0.0 httpx>=0.25.0 @@ -12,7 +14,7 @@ pydantic-settings==2.1.0 python-multipart==0.0.6 python-jose[cryptography]==3.3.0 passlib[bcrypt]==1.7.4 -openai==1.55.3 +openai>=1.74.0,<3 loguru>=0.7.0 pandas requests diff --git a/src/main.py b/src/main.py index 25f7d45..6d399cb 100644 --- a/src/main.py +++ b/src/main.py @@ -1,8 +1,8 @@ from fastapi import FastAPI -import sys from .api.endpoints import router as api_router from .core.fastgpt_client import lifespan from .core.logging_config import setup_logging +from .voice.routes import register_voice # Setup logging first setup_logging() @@ -18,4 +18,5 @@ app = FastAPI( def read_root(): return {"message": "Server is running."} -app.include_router(api_router) \ No newline at end of file +app.include_router(api_router) +register_voice(app) \ No newline at end of file diff --git a/src/voice/__init__.py b/src/voice/__init__.py new file mode 100644 index 0000000..79f896a --- /dev/null +++ b/src/voice/__init__.py @@ -0,0 +1 @@ +"""Voice websocket demo (product-ws / va.ws.v1) powered by Pipecat.""" diff --git a/src/voice/config.py b/src/voice/config.py new file mode 100644 index 0000000..327e47b --- /dev/null +++ b/src/voice/config.py @@ -0,0 +1,202 @@ +from __future__ import annotations + +import json +from dataclasses import dataclass, field +from pathlib import Path + +PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent +DEFAULT_VOICE_CONFIG = PROJECT_ROOT / "config" / "voice.json" + + +@dataclass(frozen=True) +class ServerConfig: + host: str = "0.0.0.0" + port: int = 8000 + cors_origins: list[str] = field(default_factory=list) + serve_webpage: bool = True + webpage_mount: str = "/voice-demo" + + +@dataclass(frozen=True) +class AudioConfig: + sample_rate_hz: int = 16000 + channels: int = 1 + frame_ms: int = 20 + + @property + def frame_bytes(self) -> int: + return int(self.sample_rate_hz * self.frame_ms / 1000) * self.channels * 2 + + +@dataclass(frozen=True) +class SessionConfig: + inactivity_timeout_sec: int = 60 + + +@dataclass(frozen=True) +class VADConfig: + confidence: float = 0.7 + start_secs: float = 0.2 + stop_secs: float = 0.6 + min_volume: float = 0.6 + + +@dataclass(frozen=True) +class TurnConfig: + vad: VADConfig = field(default_factory=VADConfig) + user_speech_timeout_sec: float = 1.0 + interruption_min_chars: int = 3 + interruption_use_interim: bool = True + interruption_short_replies: list[str] = field( + default_factory=lambda: [ + "是", + "是的", + "对", + "对的", + "嗯", + "好", + "好的", + "行", + "可以", + "没问题", + "不是", + "不", + "不行", + "不用", + "不要", + "没有", + "否", + "no", + "yes", + "ok", + "okay", + ] + ) + + +@dataclass(frozen=True) +class AgentConfig: + system_prompt: str = "You are a helpful, friendly voice assistant." + greeting: str | None = None + greeting_mode: str = "generated" + + +@dataclass(frozen=True) +class LLMConfig: + provider: str = "openai" + api_key: str = "" + base_url: str | None = None + model: str = "gpt-4o-mini" + temperature: float | None = 0.7 + + +@dataclass(frozen=True) +class STTConfig: + provider: str = "openai" + app_id: str = "" + api_key: str = "" + api_secret: str = "" + base_url: str | None = None + model: str = "gpt-4o-mini-transcribe" + language: str | None = "en" + domain: str = "iat" + accent: str = "mandarin" + encoding: str = "raw" + frame_size: int = 1280 + timeout_sec: float = 10.0 + dynamic_correction: bool = False + + +@dataclass(frozen=True) +class TTSConfig: + provider: str = "openai" + app_id: str = "" + api_key: str = "" + api_secret: str = "" + base_url: str | None = None + model: str = "gpt-4o-mini-tts" + voice: str = "alloy" + aue: str = "raw" + tte: str = "UTF8" + speed: int = 50 + volume: int = 50 + pitch: int = 50 + timeout_sec: float = 30.0 + source_sample_rate_hz: int | None = None + + +@dataclass(frozen=True) +class ServicesConfig: + llm: LLMConfig = field(default_factory=LLMConfig) + stt: STTConfig = field(default_factory=STTConfig) + tts: TTSConfig = field(default_factory=TTSConfig) + + +@dataclass(frozen=True) +class EngineConfig: + server: ServerConfig = field(default_factory=ServerConfig) + audio: AudioConfig = field(default_factory=AudioConfig) + session: SessionConfig = field(default_factory=SessionConfig) + turn: TurnConfig = field(default_factory=TurnConfig) + agent: AgentConfig = field(default_factory=AgentConfig) + services: ServicesConfig = field(default_factory=ServicesConfig) + + +def load_config(path: str | Path | None = None) -> EngineConfig: + config_path = Path(path) if path is not None else DEFAULT_VOICE_CONFIG + if not config_path.is_absolute(): + config_path = PROJECT_ROOT / config_path + data = json.loads(config_path.read_text(encoding="utf-8")) + if not isinstance(data, dict): + raise ValueError(f"Config file must contain a JSON object: {config_path}") + return config_from_dict(data) + + +def config_from_dict(data: dict) -> EngineConfig: + services = _dict(data.get("services")) + agent = _dict(data.get("agent")) + if agent.get("greeting") == "": + agent["greeting"] = None + if agent.get("greeting_mode") not in (None, "generated", "fixed", "off"): + raise ValueError("agent.greeting_mode must be one of: generated, fixed, off") + + stt = _dict(services.get("stt") or services.get("asr")) + if stt.get("language") == "": + stt["language"] = None + + turn = _dict(data.get("turn")) + vad = _dict(turn.get("vad")) + + return EngineConfig( + server=ServerConfig(**_dict(data.get("server"))), + audio=AudioConfig(**_dict(data.get("audio"))), + session=SessionConfig(**_dict(data.get("session"))), + turn=TurnConfig( + vad=VADConfig(**vad), + user_speech_timeout_sec=float( + turn.get("user_speech_timeout_sec", TurnConfig().user_speech_timeout_sec) + ), + interruption_min_chars=int( + turn.get("interruption_min_chars", TurnConfig().interruption_min_chars) + ), + interruption_use_interim=bool( + turn.get("interruption_use_interim", TurnConfig().interruption_use_interim) + ), + interruption_short_replies=list( + turn.get( + "interruption_short_replies", + TurnConfig().interruption_short_replies, + ) + ), + ), + agent=AgentConfig(**agent), + services=ServicesConfig( + llm=LLMConfig(**_dict(services.get("llm"))), + stt=STTConfig(**stt), + tts=TTSConfig(**_dict(services.get("tts"))), + ), + ) + + +def _dict(value: object) -> dict: + return dict(value) if isinstance(value, dict) else {} diff --git a/src/voice/pipeline.py b/src/voice/pipeline.py new file mode 100644 index 0000000..417015a --- /dev/null +++ b/src/voice/pipeline.py @@ -0,0 +1,179 @@ +from __future__ import annotations + +from loguru import logger + +from pipecat.audio.vad.silero import SileroVADAnalyzer +from pipecat.audio.vad.vad_analyzer import VADParams +from pipecat.frames.frames import ( + LLMRunFrame, + OutputTransportMessageUrgentFrame, + TTSSpeakFrame, +) +from pipecat.pipeline.pipeline import Pipeline +from pipecat.pipeline.runner import PipelineRunner +from pipecat.pipeline.task import PipelineParams, PipelineTask +from pipecat.processors.aggregators.llm_context import LLMContext +from pipecat.processors.aggregators.llm_response_universal import ( + AssistantTurnStoppedMessage, + LLMContextAggregatorPair, + LLMUserAggregatorParams, + UserTurnStoppedMessage, +) +from pipecat.serializers.base_serializer import FrameSerializer +from pipecat.transports.websocket.fastapi import ( + FastAPIWebsocketParams, + FastAPIWebsocketTransport, +) +from pipecat.turns.user_stop.speech_timeout_user_turn_stop_strategy import ( + SpeechTimeoutUserTurnStopStrategy, +) +from pipecat.turns.user_turn_strategies import UserTurnStrategies + +from .config import EngineConfig +from .protocol import ProductWebsocketSerializer +from .services import create_llm_service, create_stt_service, create_tts_service +from .text_input import ProductTextInputProcessor +from .text_stream import ProductTextStreamProcessor +from .transcript_stream import ProductTranscriptStreamProcessor +from .turn_start import InterruptionGateUserTurnStartStrategy + + +async def run_product_voice_pipeline(websocket, config: EngineConfig) -> None: + await run_pipeline_with_serializer( + websocket, + config, + serializer=ProductWebsocketSerializer( + sample_rate=config.audio.sample_rate_hz, + channels=config.audio.channels, + ), + client_label="Product JSON", + ) + + +async def run_pipeline_with_serializer( + websocket, + config: EngineConfig, + *, + serializer: FrameSerializer, + client_label: str, +) -> None: + transport = FastAPIWebsocketTransport( + websocket=websocket, + params=FastAPIWebsocketParams( + audio_in_enabled=True, + audio_out_enabled=True, + audio_in_sample_rate=config.audio.sample_rate_hz, + audio_out_sample_rate=config.audio.sample_rate_hz, + audio_in_channels=config.audio.channels, + audio_out_channels=config.audio.channels, + serializer=serializer, + session_timeout=None, + ), + ) + + stt = create_stt_service(config.services.stt, config.audio) + llm = create_llm_service(config.services.llm) + tts = create_tts_service(config.services.tts, config.audio) + + messages = [{"role": "system", "content": config.agent.system_prompt}] + if config.agent.greeting and config.agent.greeting_mode == "generated": + messages.append({"role": "system", "content": config.agent.greeting}) + + context = LLMContext(messages) + + vad_params = VADParams( + confidence=config.turn.vad.confidence, + start_secs=config.turn.vad.start_secs, + stop_secs=config.turn.vad.stop_secs, + min_volume=config.turn.vad.min_volume, + ) + user_turn_strategies = UserTurnStrategies( + start=[ + InterruptionGateUserTurnStartStrategy( + min_chars_when_bot_speaking=config.turn.interruption_min_chars, + allowed_short_replies=config.turn.interruption_short_replies, + use_interim=config.turn.interruption_use_interim, + ), + ], + stop=[ + SpeechTimeoutUserTurnStopStrategy( + user_speech_timeout=config.turn.user_speech_timeout_sec, + ), + ], + ) + user_aggregator, assistant_aggregator = LLMContextAggregatorPair( + context, + user_params=LLMUserAggregatorParams( + vad_analyzer=SileroVADAnalyzer(params=vad_params), + user_turn_strategies=user_turn_strategies, + ), + ) + + pipeline = Pipeline( + [ + transport.input(), + ProductTextInputProcessor(), + stt, + ProductTranscriptStreamProcessor(), + user_aggregator, + llm, + ProductTextStreamProcessor(), + tts, + transport.output(), + assistant_aggregator, + ] + ) + + task = PipelineTask( + pipeline, + params=PipelineParams( + audio_in_sample_rate=config.audio.sample_rate_hz, + audio_out_sample_rate=config.audio.sample_rate_hz, + enable_metrics=True, + enable_usage_metrics=True, + enable_heartbeats=True, + ), + idle_timeout_secs=config.session.inactivity_timeout_sec, + ) + + @transport.event_handler("on_client_connected") + async def on_client_connected(_transport, _client): + logger.info(f"{client_label} websocket client connected") + if config.agent.greeting_mode == "fixed" and config.agent.greeting: + await task.queue_frames([TTSSpeakFrame(config.agent.greeting)]) + elif config.agent.greeting_mode == "generated": + await task.queue_frames([LLMRunFrame()]) + + @transport.event_handler("on_client_disconnected") + async def on_client_disconnected(_transport, _client): + logger.info(f"{client_label} websocket client disconnected") + await task.cancel() + + @transport.event_handler("on_session_timeout") + async def on_session_timeout(_transport, _client): + logger.info(f"{client_label} websocket session timed out") + await task.cancel() + + @user_aggregator.event_handler("on_user_turn_stopped") + async def on_user_turn_stopped(_aggregator, _strategy, message: UserTurnStoppedMessage): + logger.info(f"User: {message.content}") + text = (message.content or "").strip() + if not text: + return + await task.queue_frame( + OutputTransportMessageUrgentFrame( + message={ + "type": "input.transcript.final", + "text": text, + "user_id": message.user_id, + "timestamp": message.timestamp, + } + ) + ) + + @assistant_aggregator.event_handler("on_assistant_turn_stopped") + async def on_assistant_turn_stopped(_aggregator, message: AssistantTurnStoppedMessage): + logger.info(f"Assistant: {message.content}") + + runner = PipelineRunner(handle_sigint=False) + await runner.run(task) diff --git a/src/voice/protocol.py b/src/voice/protocol.py new file mode 100644 index 0000000..79d4473 --- /dev/null +++ b/src/voice/protocol.py @@ -0,0 +1,160 @@ +from __future__ import annotations + +import base64 +import json +from typing import Any + +from loguru import logger + +from pipecat.frames.frames import ( + CancelFrame, + BotStartedSpeakingFrame, + BotStoppedSpeakingFrame, + EndFrame, + Frame, + InputAudioRawFrame, + InputTransportMessageFrame, + OutputAudioRawFrame, + OutputTransportMessageFrame, + OutputTransportMessageUrgentFrame, + TextFrame, + TranscriptionFrame, +) +from pipecat.serializers.base_serializer import FrameSerializer + + +class ProductWebsocketSerializer(FrameSerializer): + """Stable app-facing JSON/base64 protocol adapter for Pipecat websocket transport.""" + + protocol = "va.ws.v1" + + def __init__(self, *, sample_rate: int, channels: int): + super().__init__() + self._sample_rate = sample_rate + self._channels = channels + self._sequence = 0 + + async def serialize(self, frame: Frame) -> str | bytes | None: + if isinstance(frame, OutputAudioRawFrame): + return self._event( + "response.audio.delta", + audio=base64.b64encode(frame.audio).decode("ascii"), + bytes=len(frame.audio), + sample_rate=frame.sample_rate, + channels=frame.num_channels, + ) + + if isinstance(frame, BotStartedSpeakingFrame): + return self._event("response.audio.started") + + if isinstance(frame, BotStoppedSpeakingFrame): + return self._event("response.audio.stopped") + + if isinstance(frame, TranscriptionFrame): + return self._event( + "input.transcript.final", + text=frame.text, + user_id=frame.user_id, + timestamp=frame.timestamp, + ) + + if isinstance(frame, TextFrame): + return self._event("response.text.delta", text=frame.text) + + if isinstance(frame, (OutputTransportMessageFrame, OutputTransportMessageUrgentFrame)): + if self.should_ignore_frame(frame): + return None + message = frame.message + if isinstance(message, dict) and isinstance(message.get("type"), str): + event_type = message["type"] + payload = {k: v for k, v in message.items() if k != "type"} + return self._event(event_type, **payload) + return self._event("transport.message", message=message) + + return None + + async def deserialize(self, data: str | bytes) -> Frame | None: + if isinstance(data, bytes): + return InputAudioRawFrame( + audio=data, + sample_rate=self._sample_rate, + num_channels=self._channels, + ) + + try: + message = json.loads(data) + except json.JSONDecodeError as exc: + logger.warning(f"Invalid product websocket JSON: {exc}") + return None + + if not isinstance(message, dict): + logger.warning("Product websocket message must be a JSON object") + return None + + message_type = message.get("type") + if message_type == "session.start": + return InputTransportMessageFrame( + message={ + "type": "session.started", + "protocol": self.protocol, + "audio": { + "encoding": "pcm_s16le", + "sample_rate": self._sample_rate, + "channels": self._channels, + }, + } + ) + + if message_type == "session.stop": + return EndFrame() + + if message_type == "response.cancel": + return CancelFrame(reason="client_cancelled") + + if message_type == "input.audio": + audio = message.get("audio") or message.get("data") + if not isinstance(audio, str): + logger.warning("input.audio requires base64 'audio' or 'data'") + return None + try: + pcm = base64.b64decode(audio) + except ValueError as exc: + logger.warning(f"Invalid input.audio base64: {exc}") + return None + return InputAudioRawFrame( + audio=pcm, + sample_rate=int(message.get("sample_rate") or self._sample_rate), + num_channels=int(message.get("channels") or self._channels), + ) + + if message_type == "input.text": + text = message.get("text") + if not isinstance(text, str) or not text.strip(): + logger.warning("input.text requires non-empty 'text'") + return None + return InputTransportMessageFrame( + message={ + "type": "input.text", + "text": text, + "interrupt": bool(message.get("interrupt", True)), + } + ) + + if message_type == "transport.message": + payload = message.get("message") + return InputTransportMessageFrame(message=payload if isinstance(payload, dict) else message) + + logger.warning(f"Unsupported product websocket message type: {message_type!r}") + return None + + def _event(self, event_type: str, **payload: Any) -> str: + self._sequence += 1 + return json.dumps( + { + "type": event_type, + "protocol": self.protocol, + "seq": self._sequence, + **payload, + }, + ensure_ascii=False, + ) diff --git a/src/voice/routes.py b/src/voice/routes.py new file mode 100644 index 0000000..7b21621 --- /dev/null +++ b/src/voice/routes.py @@ -0,0 +1,92 @@ +from __future__ import annotations + +from functools import lru_cache +from pathlib import Path + +from fastapi import APIRouter, FastAPI, WebSocket +from fastapi.middleware.cors import CORSMiddleware +from fastapi.staticfiles import StaticFiles +from loguru import logger + +from .config import DEFAULT_VOICE_CONFIG, EngineConfig, load_config +from .pipeline import run_product_voice_pipeline + +PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent +VOICE_DEMO_DIR = PROJECT_ROOT / "static" / "voice-demo" + +router = APIRouter(tags=["voice"]) + + +@lru_cache(maxsize=1) +def get_voice_config() -> EngineConfig: + return load_config(DEFAULT_VOICE_CONFIG) + + +def _normalize_mount_path(path: str) -> str: + normalized = path.strip() or "/voice-demo" + if not normalized.startswith("/"): + normalized = f"/{normalized}" + return normalized.rstrip("/") or "/" + + +@router.get("/voice/health") +async def voice_health() -> dict[str, object]: + config = get_voice_config() + mount = ( + _normalize_mount_path(config.server.webpage_mount) + if config.server.serve_webpage + else None + ) + return { + "status": "healthy", + "protocols": { + "/ws-product": "va.ws.v1.json_base64", + }, + "features": { + "product_text_input": True, + "product_text_interrupt": True, + }, + "demo": mount, + "llm_provider": config.services.llm.provider, + "stt_provider": config.services.stt.provider, + "tts_provider": config.services.tts.provider, + } + + +@router.websocket("/ws-product") +async def product_websocket_endpoint(websocket: WebSocket) -> None: + await websocket.accept() + config = get_voice_config() + await run_product_voice_pipeline(websocket, config) + + +def register_voice(app: FastAPI) -> None: + """Mount voice websocket routes and optional browser demo static files.""" + if not DEFAULT_VOICE_CONFIG.exists(): + logger.warning(f"Voice config not found at {DEFAULT_VOICE_CONFIG}; voice demo disabled") + return + + config = get_voice_config() + app.include_router(router) + + if config.server.cors_origins: + app.add_middleware( + CORSMiddleware, + allow_origins=config.server.cors_origins, + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], + ) + + if config.server.serve_webpage and VOICE_DEMO_DIR.is_dir(): + mount = _normalize_mount_path(config.server.webpage_mount) + app.mount( + mount, + StaticFiles(directory=str(VOICE_DEMO_DIR), html=True), + name="voice-demo", + ) + logger.info(f"Voice demo mounted at {mount}") + else: + logger.info("Voice demo static page disabled or missing") + + logger.info("Voice websocket registered at /ws-product") diff --git a/src/voice/services.py b/src/voice/services.py new file mode 100644 index 0000000..b322be6 --- /dev/null +++ b/src/voice/services.py @@ -0,0 +1,165 @@ +from __future__ import annotations + +from collections.abc import AsyncGenerator + +from openai import BadRequestError +from openai import NOT_GIVEN + +from pipecat.frames.frames import ErrorFrame, Frame, TTSAudioRawFrame +from pipecat.services.openai._constants import OPENAI_SAMPLE_RATE +from pipecat.services.openai.llm import OpenAILLMService +from pipecat.services.openai.stt import OpenAISTTService +from pipecat.services.openai.tts import VALID_VOICES, OpenAITTSService +from pipecat.transcriptions.language import Language + +from .config import AudioConfig, LLMConfig, STTConfig, TTSConfig +from .xfyun_asr import DEFAULT_XFYUN_ASR_URL, XfyunASRService +from .xfyun_tts import DEFAULT_XFYUN_TTS_URL, XfyunTTSService + + +def create_stt_service(config: STTConfig, audio: AudioConfig | None = None): + if config.provider == "xfyun": + sample_rate = audio.sample_rate_hz if audio else 16000 + return XfyunASRService( + app_id=config.app_id, + api_key=config.api_key or "", + api_secret=config.api_secret, + url=config.base_url or DEFAULT_XFYUN_ASR_URL, + language=config.language or "zh_cn", + domain=config.domain, + accent=config.accent, + sample_rate=sample_rate, + encoding=config.encoding, + frame_size=config.frame_size, + open_timeout=config.timeout_sec, + dynamic_correction=config.dynamic_correction, + ) + + _require_provider(config.provider, "openai", "stt") + return OpenAISTTService( + api_key=config.api_key or None, + base_url=config.base_url, + settings=OpenAISTTService.Settings( + model=config.model, + language=_language(config.language), + ), + ) + + +def create_llm_service(config: LLMConfig): + _require_provider(config.provider, "openai", "llm") + return OpenAILLMService( + api_key=config.api_key or None, + base_url=config.base_url, + settings=OpenAILLMService.Settings( + model=config.model, + temperature=config.temperature if config.temperature is not None else NOT_GIVEN, + ), + ) + + +def create_tts_service(config: TTSConfig, audio: AudioConfig): + if config.provider == "xfyun": + source_sample_rate = config.source_sample_rate_hz or audio.sample_rate_hz + if source_sample_rate not in (8000, 16000): + raise ValueError("Xfyun TTS source_sample_rate_hz must be 8000 or 16000") + return XfyunTTSService( + app_id=config.app_id, + api_key=config.api_key or "", + api_secret=config.api_secret, + voice=config.voice, + url=config.base_url or DEFAULT_XFYUN_TTS_URL, + sample_rate=audio.sample_rate_hz, + source_sample_rate=source_sample_rate, + encoding=config.aue, + text_encoding=config.tte, + speed=config.speed, + volume=config.volume, + pitch=config.pitch, + timeout=config.timeout_sec, + ) + + _require_provider(config.provider, "openai", "tts") + service_class = OpenAITTSService if config.voice in VALID_VOICES else OpenAICompatibleTTSService + return service_class( + api_key=config.api_key or None, + base_url=config.base_url, + sample_rate=audio.sample_rate_hz, + source_sample_rate=config.source_sample_rate_hz, + settings=OpenAITTSService.Settings( + model=config.model, + voice=config.voice, + ), + ) + + +class OpenAICompatibleTTSService(OpenAITTSService): + """OpenAI-compatible TTS service that permits provider-specific voice ids.""" + + def __init__(self, *, source_sample_rate: int | None = None, **kwargs): + super().__init__(**kwargs) + self._source_sample_rate = source_sample_rate or OPENAI_SAMPLE_RATE + + async def run_tts(self, text: str, context_id: str) -> AsyncGenerator[Frame, None]: + voice = self._settings.voice + if not voice: + yield ErrorFrame(error="TTS voice must be specified") + return + + try: + create_params = { + "input": text, + "model": self._settings.model, + "voice": voice, + "response_format": "pcm", + } + + if self._settings.instructions: + create_params["instructions"] = self._settings.instructions + + if self._settings.speed: + create_params["speed"] = self._settings.speed + + async with self._client.audio.speech.with_streaming_response.create( + **create_params + ) as response: + if response.status_code != 200: + error = await response.text() + yield ErrorFrame( + error=f"TTS request failed (status: {response.status_code}, error: {error})" + ) + return + + await self.start_tts_usage_metrics(text) + + async def audio_chunks(): + async for chunk in response.iter_bytes(self.chunk_size): + if chunk: + yield chunk + + first_frame = True + async for frame in self._stream_audio_frames_from_iterator( + audio_chunks(), + in_sample_rate=self._source_sample_rate, + context_id=context_id, + ): + if first_frame: + await self.stop_ttfb_metrics() + first_frame = False + yield frame + except BadRequestError as exc: + yield ErrorFrame(error=f"TTS request failed: {exc}") + except Exception as exc: + yield ErrorFrame(error=f"TTS request failed: {exc}") + + +def _require_provider(actual: str, expected: str, service_name: str) -> None: + if actual != expected: + raise ValueError(f"Unsupported {service_name} provider {actual!r}; expected {expected!r}") + + +def _language(value: str | None) -> Language | None: + if value is None: + return None + normalized = value.replace("-", "_").upper() + return getattr(Language, normalized, value) diff --git a/src/voice/text_input.py b/src/voice/text_input.py new file mode 100644 index 0000000..a864ceb --- /dev/null +++ b/src/voice/text_input.py @@ -0,0 +1,38 @@ +from __future__ import annotations + +from loguru import logger + +from pipecat.frames.frames import Frame, InputTransportMessageFrame, LLMMessagesAppendFrame +from pipecat.processors.frame_processor import FrameDirection, FrameProcessor + + +class ProductTextInputProcessor(FrameProcessor): + """Converts product text-input transport messages into LLM turns.""" + + async def process_frame(self, frame: Frame, direction: FrameDirection): + await super().process_frame(frame, direction) + + if not isinstance(frame, InputTransportMessageFrame): + await self.push_frame(frame, direction) + return + + message = frame.message + if not isinstance(message, dict) or message.get("type") != "input.text": + await self.push_frame(frame, direction) + return + + text = str(message.get("text") or "").strip() + if not text: + return + + if message.get("interrupt", True): + logger.info("Text input interrupting current response") + await self.broadcast_interruption() + + await self.push_frame( + LLMMessagesAppendFrame( + messages=[{"role": "user", "content": text}], + run_llm=True, + ), + FrameDirection.DOWNSTREAM, + ) diff --git a/src/voice/text_stream.py b/src/voice/text_stream.py new file mode 100644 index 0000000..253a9fd --- /dev/null +++ b/src/voice/text_stream.py @@ -0,0 +1,75 @@ +from __future__ import annotations + +from pipecat.frames.frames import ( + Frame, + InterruptionFrame, + LLMFullResponseEndFrame, + LLMFullResponseStartFrame, + LLMTextFrame, + OutputTransportMessageUrgentFrame, + TTSSpeakFrame, +) +from pipecat.processors.frame_processor import FrameDirection, FrameProcessor + + +class ProductTextStreamProcessor(FrameProcessor): + """Mirrors LLM text frames as streaming protocol events.""" + + def __init__(self) -> None: + super().__init__() + self._aggregation: list[str] = [] + self._turn_active = False + + async def process_frame(self, frame: Frame, direction: FrameDirection) -> None: + await super().process_frame(frame, direction) + + if isinstance(frame, LLMFullResponseStartFrame): + await self._start_turn() + elif isinstance(frame, LLMTextFrame): + if frame.text: + await self._delta(frame.text) + elif isinstance(frame, LLMFullResponseEndFrame): + await self._end_turn(interrupted=False) + elif isinstance(frame, InterruptionFrame): + await self._end_turn(interrupted=True) + elif isinstance(frame, TTSSpeakFrame): + text = frame.text or "" + await self._start_turn() + if text: + await self._delta(text) + await self._end_turn(interrupted=False) + + await self.push_frame(frame, direction) + + async def _start_turn(self) -> None: + if self._turn_active: + return + self._turn_active = True + self._aggregation = [] + await self._emit("response.text.started") + + async def _delta(self, text: str) -> None: + if not self._turn_active: + await self._start_turn() + self._aggregation.append(text) + await self._emit("response.text.delta", text=text) + + async def _end_turn(self, *, interrupted: bool) -> None: + if not self._turn_active: + return + full_text = "".join(self._aggregation) + self._turn_active = False + self._aggregation = [] + await self._emit( + "response.text.final", + text=full_text, + interrupted=interrupted, + ) + + async def _emit(self, event_type: str, **payload: object) -> None: + await self.push_frame( + OutputTransportMessageUrgentFrame( + message={"type": event_type, **payload}, + ), + FrameDirection.DOWNSTREAM, + ) diff --git a/src/voice/transcript_stream.py b/src/voice/transcript_stream.py new file mode 100644 index 0000000..e44bb0a --- /dev/null +++ b/src/voice/transcript_stream.py @@ -0,0 +1,30 @@ +from __future__ import annotations + +from pipecat.frames.frames import ( + Frame, + InterimTranscriptionFrame, + OutputTransportMessageUrgentFrame, +) +from pipecat.processors.frame_processor import FrameDirection, FrameProcessor + + +class ProductTranscriptStreamProcessor(FrameProcessor): + """Mirrors interim STT frames to the product websocket protocol.""" + + async def process_frame(self, frame: Frame, direction: FrameDirection) -> None: + await super().process_frame(frame, direction) + + if isinstance(frame, InterimTranscriptionFrame): + await self.push_frame( + OutputTransportMessageUrgentFrame( + message={ + "type": "input.transcript.interim", + "text": frame.text, + "user_id": frame.user_id, + "timestamp": frame.timestamp, + } + ), + FrameDirection.DOWNSTREAM, + ) + + await self.push_frame(frame, direction) diff --git a/src/voice/turn_start.py b/src/voice/turn_start.py new file mode 100644 index 0000000..cdc5643 --- /dev/null +++ b/src/voice/turn_start.py @@ -0,0 +1,86 @@ +from __future__ import annotations + +import re + +from loguru import logger +from pipecat.frames.frames import ( + BotStartedSpeakingFrame, + BotStoppedSpeakingFrame, + Frame, + InterimTranscriptionFrame, + TranscriptionFrame, +) +from pipecat.turns.types import ProcessFrameResult +from pipecat.turns.user_start.base_user_turn_start_strategy import BaseUserTurnStartStrategy + + +_COUNTABLE_TEXT_RE = re.compile(r"[\w\u4e00-\u9fff]", re.UNICODE) + + +class InterruptionGateUserTurnStartStrategy(BaseUserTurnStartStrategy): + """Starts user turns only after likely intentional speech.""" + + def __init__( + self, + *, + min_chars_when_bot_speaking: int, + allowed_short_replies: list[str], + use_interim: bool = True, + **kwargs, + ): + super().__init__(**kwargs) + self._min_chars_when_bot_speaking = min_chars_when_bot_speaking + self._allowed_short_replies = { + self._normalize_text(reply) for reply in allowed_short_replies if reply.strip() + } + self._use_interim = use_interim + self._bot_speaking = False + + async def reset(self): + await super().reset() + + async def process_frame(self, frame: Frame) -> ProcessFrameResult: + if isinstance(frame, BotStartedSpeakingFrame): + self._bot_speaking = True + return ProcessFrameResult.CONTINUE + if isinstance(frame, BotStoppedSpeakingFrame): + self._bot_speaking = False + return ProcessFrameResult.CONTINUE + if isinstance(frame, InterimTranscriptionFrame) and self._use_interim: + return await self._handle_transcription(frame.text, interim=True) + if isinstance(frame, TranscriptionFrame): + return await self._handle_transcription(frame.text, interim=False) + + return ProcessFrameResult.CONTINUE + + async def _handle_transcription(self, text: str, *, interim: bool) -> ProcessFrameResult: + normalized = self._normalize_text(text) + if not normalized: + return ProcessFrameResult.CONTINUE + + if not self._bot_speaking: + await self.trigger_user_turn_started() + return ProcessFrameResult.STOP + + should_interrupt = self._should_interrupt(normalized) + logger.debug( + f"{self} interruption_gate text={text!r} normalized={normalized!r} " + f"should_interrupt={should_interrupt} interim={interim}" + ) + + if should_interrupt: + await self.trigger_user_turn_started() + return ProcessFrameResult.STOP + + await self.trigger_reset_aggregation() + return ProcessFrameResult.CONTINUE + + def _should_interrupt(self, normalized: str) -> bool: + return ( + normalized in self._allowed_short_replies + or len(normalized) >= self._min_chars_when_bot_speaking + ) + + @staticmethod + def _normalize_text(text: str) -> str: + return "".join(_COUNTABLE_TEXT_RE.findall(text.lower())) diff --git a/src/voice/xfyun_asr.py b/src/voice/xfyun_asr.py new file mode 100644 index 0000000..939e5c7 --- /dev/null +++ b/src/voice/xfyun_asr.py @@ -0,0 +1,353 @@ +from __future__ import annotations + +import asyncio +import base64 +import hashlib +import hmac +import json +import os +from collections.abc import AsyncGenerator +from datetime import datetime, timezone +from email.utils import format_datetime +from typing import Any +from urllib.parse import urlencode, urlparse + +from loguru import logger + +from pipecat.frames.frames import ( + CancelFrame, + EndFrame, + Frame, + InterimTranscriptionFrame, + TranscriptionFrame, + UserStoppedSpeakingFrame, + VADUserStartedSpeakingFrame, +) +from pipecat.processors.frame_processor import FrameDirection +from pipecat.services.settings import STTSettings +from pipecat.services.stt_service import STTService +from pipecat.transcriptions.language import Language +from pipecat.utils.time import time_now_iso8601 +from websockets.asyncio.client import connect as websocket_connect +from websockets.protocol import State + + +DEFAULT_XFYUN_ASR_URL = "wss://iat-api.xfyun.cn/v2/iat" + + +class XfyunASRService(STTService): + """iFlytek/Xfyun streaming voice dictation service for Pipecat.""" + + def __init__( + self, + *, + app_id: str, + api_key: str, + api_secret: str, + url: str | None = None, + language: str = "zh_cn", + domain: str = "iat", + accent: str = "mandarin", + sample_rate: int = 16000, + encoding: str = "raw", + frame_size: int = 1280, + open_timeout: float = 10.0, + dynamic_correction: bool = False, + **kwargs, + ) -> None: + super().__init__( + sample_rate=sample_rate, + settings=STTSettings(model=None, language=language), + **kwargs, + ) + self._app_id = app_id or os.environ.get("XFYUN_APP_ID", "") + self._api_key = api_key or os.environ.get("XFYUN_API_KEY", "") + self._api_secret = api_secret or os.environ.get("XFYUN_API_SECRET", "") + self._url = url or DEFAULT_XFYUN_ASR_URL + self._language = language + self._domain = domain + self._accent = accent + self._encoding = encoding + self._frame_size = frame_size + self._open_timeout = open_timeout + self._dynamic_correction = dynamic_correction + + self._websocket = None + self._receive_task = None + self._audio_buffer = bytearray() + self._sent_first_frame = False + self._sent_final_frame = False + self._finalizing_turn = False + self._partials: list[str] = [] + self._last_text = "" + + async def cleanup(self) -> None: + await self._close_utterance() + await super().cleanup() + + async def stop(self, frame: EndFrame) -> None: + await self._close_utterance() + await super().stop(frame) + + async def cancel(self, frame: CancelFrame) -> None: + await self._close_utterance() + await super().cancel(frame) + + async def process_frame(self, frame: Frame, direction: FrameDirection) -> None: + await super().process_frame(frame, direction) + + if isinstance(frame, UserStoppedSpeakingFrame): + # Aggregator-level turn end (broadcast once per logical user turn). + # This is the only boundary that finalizes/closes the xfyun + # websocket, so brief VAD pauses do not restart the ASR session. + await self._finish_utterance() + elif isinstance(frame, VADUserStartedSpeakingFrame): + await self._start_utterance() + + async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame | None, None]: + if not audio: + yield None + return + + if not self._websocket or self._websocket.state is not State.OPEN: + await self._start_utterance() + + self._audio_buffer.extend(audio) + await self._flush_audio_buffer(final=False) + yield None + + async def _start_utterance(self) -> None: + if self._websocket and self._websocket.state is State.OPEN: + return + + if not self._app_id or not self._api_key or not self._api_secret: + await self.push_error("Xfyun ASR requires app_id, api_key, and api_secret") + return + + if self.sample_rate not in (8000, 16000): + await self.push_error("Xfyun ASR sample rate must be 8000 or 16000") + return + + self._audio_buffer.clear() + self._partials = [] + self._last_text = "" + self._sent_first_frame = False + self._sent_final_frame = False + + auth_url = _build_auth_url(self._url, self._api_key, self._api_secret) + try: + self._websocket = await websocket_connect( + auth_url, + max_size=None, + open_timeout=self._open_timeout, + ) + except Exception as exc: + await self.push_error(f"Xfyun ASR connection failed: {exc}", exception=exc) + self._websocket = None + return + + self._receive_task = self.create_task( + self._receive_messages(), + name="xfyun_asr_receive", + ) + + async def _finish_utterance(self) -> None: + if not self._websocket or self._websocket.state is not State.OPEN: + return + + await self._flush_audio_buffer(final=True) + if not self._sent_first_frame: + await self._close_utterance() + return + + if not self._sent_final_frame: + self._finalizing_turn = True + await self._send_payload({"data": {"status": 2}}) + self.request_finalize() + self._sent_final_frame = True + + async def _close_utterance(self) -> None: + current_task = asyncio.current_task() + if self._receive_task and self._receive_task is not current_task: + await self.cancel_task(self._receive_task) + self._receive_task = None + + websocket = self._websocket + self._websocket = None + if websocket and websocket.state is State.OPEN: + try: + await websocket.close() + except Exception: + pass + + self._audio_buffer.clear() + self._sent_first_frame = False + self._sent_final_frame = False + self._finalizing_turn = False + + async def _flush_audio_buffer(self, *, final: bool) -> None: + while len(self._audio_buffer) >= self._frame_size: + chunk = bytes(self._audio_buffer[: self._frame_size]) + del self._audio_buffer[: self._frame_size] + await self._send_audio_chunk(chunk, status=1) + + if final and self._audio_buffer: + chunk = bytes(self._audio_buffer) + self._audio_buffer.clear() + await self._send_audio_chunk(chunk, status=1) + + async def _send_audio_chunk(self, audio: bytes, *, status: int) -> None: + if not audio: + return + + if not self._sent_first_frame: + business = { + "language": self._language, + "domain": self._domain, + "accent": self._accent, + } + if self._dynamic_correction: + business["dwa"] = "wpgs" + + payload = { + "common": {"app_id": self._app_id}, + "business": business, + "data": { + "status": 0, + "format": f"audio/L16;rate={self.sample_rate}", + "encoding": self._encoding, + "audio": base64.b64encode(audio).decode("utf-8"), + }, + } + self._sent_first_frame = True + else: + payload = { + "data": { + "status": status, + "format": f"audio/L16;rate={self.sample_rate}", + "encoding": self._encoding, + "audio": base64.b64encode(audio).decode("utf-8"), + } + } + + await self._send_payload(payload) + + async def _send_payload(self, payload: dict[str, Any]) -> None: + if not self._websocket or self._websocket.state is not State.OPEN: + return + await self._websocket.send(json.dumps(payload, ensure_ascii=False)) + + async def _receive_messages(self) -> None: + websocket = self._websocket + if not websocket: + return + + try: + async for message in websocket: + await self._process_response(json.loads(message)) + except Exception as exc: + if self._websocket is websocket: + await self.push_error(f"Xfyun ASR receive failed: {exc}", exception=exc) + finally: + if self._websocket is websocket: + self._websocket = None + self._receive_task = None + + async def _process_response(self, payload: dict[str, Any]) -> None: + code = payload.get("code", -1) + if code != 0: + message = payload.get("message", "unknown error") + sid = payload.get("sid") + await self.push_error(f"Xfyun ASR error code={code}, sid={sid}, message={message}") + return + + data = payload.get("data") + if not isinstance(data, dict): + return + + is_final_response = data.get("status") == 2 + recognition = data.get("result") + if isinstance(recognition, dict): + text = self._apply_recognition_result(recognition) + if text and text != self._last_text: + self._last_text = text + if not self._finalizing_turn and not is_final_response: + await self.push_frame( + InterimTranscriptionFrame( + text, + self._user_id, + time_now_iso8601(), + _language_or_none(self._language), + result=payload, + ) + ) + + if is_final_response: + final_text = self._last_text + if final_text: + self.confirm_finalize() + await self.push_frame( + TranscriptionFrame( + final_text, + self._user_id, + time_now_iso8601(), + _language_or_none(self._language), + result=payload, + ) + ) + await self._close_utterance() + + def _apply_recognition_result(self, recognition: dict[str, Any]) -> str: + partial = _extract_text_from_result(recognition) + if not partial: + return self._last_text + + if self._dynamic_correction and recognition.get("pgs") == "rpl" and recognition.get("rg"): + start, end = recognition["rg"] + if 1 <= start <= len(self._partials): + self._partials[start - 1 : end] = [partial] + else: + logger.debug(f"Ignoring out-of-range Xfyun replacement rg={recognition['rg']}") + else: + self._partials.append(partial) + + return "".join(self._partials) + + +def _extract_text_from_result(result: dict[str, Any]) -> str: + words: list[str] = [] + for item in result.get("ws", []): + for candidate in item.get("cw", []): + word = candidate.get("w") + if word: + words.append(word) + return "".join(words) + + +def _build_auth_url(url: str, api_key: str, api_secret: str) -> str: + parsed = urlparse(url) + host = parsed.netloc + path = parsed.path or "/v2/iat" + date = format_datetime(datetime.now(timezone.utc), usegmt=True) + request_line = f"GET {path} HTTP/1.1" + signature_origin = f"host: {host}\ndate: {date}\n{request_line}" + signature_sha = hmac.new( + api_secret.encode("utf-8"), + signature_origin.encode("utf-8"), + digestmod=hashlib.sha256, + ).digest() + signature = base64.b64encode(signature_sha).decode("utf-8") + authorization_origin = ( + f'api_key="{api_key}", algorithm="hmac-sha256", ' + f'headers="host date request-line", signature="{signature}"' + ) + authorization = base64.b64encode(authorization_origin.encode("utf-8")).decode("utf-8") + query = urlencode({"authorization": authorization, "date": date, "host": host}) + return f"{url}?{query}" + + +def _language_or_none(value: str) -> Language | None: + try: + return Language(value) + except ValueError: + return None diff --git a/src/voice/xfyun_tts.py b/src/voice/xfyun_tts.py new file mode 100644 index 0000000..bbaca36 --- /dev/null +++ b/src/voice/xfyun_tts.py @@ -0,0 +1,257 @@ +from __future__ import annotations + +import base64 +import hashlib +import hmac +import json +import os +import re +import unicodedata +from collections.abc import AsyncGenerator, AsyncIterator +from datetime import datetime, timezone +from email.utils import format_datetime +from typing import Any +from urllib.parse import urlencode, urlparse + +from loguru import logger + +from pipecat.frames.frames import ErrorFrame, Frame +from pipecat.services.settings import TTSSettings +from pipecat.services.tts_service import TTSService +from websockets.asyncio.client import connect + + +DEFAULT_XFYUN_TTS_URL = "wss://tts-api.xfyun.cn/v2/tts" + +# Strip characters Xfyun's online TTS cannot synthesize. The engine silently +# rejects (or returns empty audio for) text containing emoji and other +# non-BMP symbols, which surfaces as "request finished without audio data". +_EMOJI_AND_SYMBOL_RE = re.compile( + "[" + "\U0001F300-\U0001FAFF" # misc pictographs, emoji, symbols, transport, etc. + "\U00002600-\U000027BF" # misc symbols and dingbats + "\U0001F1E6-\U0001F1FF" # regional indicators (flags) + "\uFE00-\uFE0F" # variation selectors + "\u200D" # zero-width joiner + "]", + flags=re.UNICODE, +) + + +class XfyunTTSService(TTSService): + """iFlytek/Xfyun online TTS service for Pipecat. + + Xfyun's API is not OpenAI-compatible. It uses a signed WebSocket URL, + receives one JSON request per synthesis, and streams text WebSocket + messages containing base64-encoded audio chunks. This service requests + raw PCM so the chunks can become Pipecat audio frames without MP3 decode. + """ + + def __init__( + self, + *, + app_id: str, + api_key: str, + api_secret: str, + voice: str, + url: str | None = None, + sample_rate: int = 16000, + source_sample_rate: int = 16000, + encoding: str = "raw", + text_encoding: str = "UTF8", + speed: int = 50, + volume: int = 50, + pitch: int = 50, + timeout: float = 30.0, + **kwargs, + ) -> None: + super().__init__( + sample_rate=sample_rate, + settings=TTSSettings(model=None, voice=voice, language=None), + **kwargs, + ) + self._app_id = app_id or os.environ.get("XFYUN_APP_ID", "") + self._api_key = api_key or os.environ.get("XFYUN_API_KEY", "") + self._api_secret = api_secret or os.environ.get("XFYUN_API_SECRET", "") + self._voice = voice + self._url = url or DEFAULT_XFYUN_TTS_URL + self._source_sample_rate = source_sample_rate + self._encoding = encoding + self._text_encoding = text_encoding + self._speed = speed + self._volume = volume + self._pitch = pitch + self._timeout = timeout + self._last_failure_detail: str | None = None + + async def run_tts(self, text: str, context_id: str) -> AsyncGenerator[Frame, None]: + if not text: + return + + if not self._app_id or not self._api_key or not self._api_secret: + yield ErrorFrame(error="Xfyun TTS requires app_id, api_key, and api_secret") + return + + sanitized = _sanitize_text_for_tts(text) + if not sanitized: + logger.debug( + f"{self}: skipping Xfyun TTS, text became empty after sanitization " + f"(original={text!r})" + ) + return + + if sanitized != text: + logger.debug( + f"{self}: sanitized Xfyun TTS text " + f"(original={text!r}, sanitized={sanitized!r})" + ) + + if len(sanitized.encode("utf-8")) >= 8000: + yield ErrorFrame(error="Xfyun TTS text must be less than 8000 UTF-8 bytes") + return + + if self._encoding != "raw": + yield ErrorFrame(error="Xfyun TTS is configured for PCM output; set aue/encoding to raw") + return + + try: + await self.start_tts_usage_metrics(sanitized) + + first_frame = True + async for frame in self._stream_audio_frames_from_iterator( + self._iter_audio_chunks(sanitized), + in_sample_rate=self._source_sample_rate, + context_id=context_id, + ): + if first_frame: + await self.stop_ttfb_metrics() + first_frame = False + yield frame + + if first_frame: + detail = self._last_failure_detail or "no audio frames received" + yield ErrorFrame( + error=( + f"Xfyun TTS request finished without audio data ({detail}); " + f"text={sanitized!r}" + ) + ) + except Exception as exc: + yield ErrorFrame(error=f"Xfyun TTS request failed: {exc}") + + async def _iter_audio_chunks(self, text: str) -> AsyncIterator[bytes]: + request = self._build_request_frame(text) + auth_url = _build_auth_url(self._url, self._api_key, self._api_secret) + + self._last_failure_detail = None + frames_received = 0 + audio_bytes_received = 0 + last_status: int | None = None + last_sid: str | None = None + saw_status_2 = False + + async with connect(auth_url, max_size=None, open_timeout=self._timeout) as websocket: + await websocket.send(json.dumps(request, ensure_ascii=False)) + + async for raw_message in websocket: + frames_received += 1 + payload = json.loads(raw_message) + code = payload.get("code", -1) + sid = payload.get("sid") + if sid: + last_sid = sid + if code != 0: + err_msg = payload.get("message", "unknown error") + raise RuntimeError(f"code={code}, sid={sid}, message={err_msg}") + + data = payload.get("data") + if not isinstance(data, dict): + continue + + last_status = data.get("status", last_status) + + audio_b64 = data.get("audio") + if audio_b64: + audio_bytes = base64.b64decode(audio_b64) + audio_bytes_received += len(audio_bytes) + yield audio_bytes + + if data.get("status") == 2: + saw_status_2 = True + break + + if audio_bytes_received == 0: + self._last_failure_detail = ( + f"frames={frames_received}, audio_bytes=0, " + f"last_status={last_status}, saw_status_2={saw_status_2}, sid={last_sid}" + ) + logger.warning( + f"{self}: Xfyun TTS produced no audio ({self._last_failure_detail})" + ) + + def _build_request_frame(self, text: str) -> dict[str, Any]: + business: dict[str, Any] = { + "aue": self._encoding, + "auf": f"audio/L16;rate={self._source_sample_rate}", + "vcn": self._voice, + "speed": self._speed, + "volume": self._volume, + "pitch": self._pitch, + "tte": self._text_encoding, + } + + return { + "common": {"app_id": self._app_id}, + "business": business, + "data": { + "status": 2, + "text": base64.b64encode(text.encode("utf-8")).decode("utf-8"), + }, + } + + +def _sanitize_text_for_tts(text: str) -> str: + """Strip characters Xfyun's online TTS cannot synthesize. + + The Xfyun ``/v2/tts`` engine silently drops or rejects emoji, pictographs, + dingbats, regional-indicator flags, variation selectors, and zero-width + joiners. When such characters appear in the input the synthesis can + finish without any audio data ("Xfyun TTS request finished without audio + data"). We also drop control characters (other than common whitespace) + and "Symbol, Other" codepoints, then collapse runs of whitespace. + """ + if not text: + return text + + cleaned = _EMOJI_AND_SYMBOL_RE.sub("", text) + filtered: list[str] = [] + for ch in cleaned: + category = unicodedata.category(ch) + if category == "So": + continue + if category.startswith("C") and ch not in ("\n", "\r", "\t"): + continue + filtered.append(ch) + return re.sub(r"\s+", " ", "".join(filtered)).strip() + + +def _build_auth_url(url: str, api_key: str, api_secret: str) -> str: + parsed = urlparse(url) + host = parsed.netloc + path = parsed.path or "/v2/tts" + date = format_datetime(datetime.now(timezone.utc), usegmt=True) + request_line = f"GET {path} HTTP/1.1" + signature_origin = f"host: {host}\ndate: {date}\n{request_line}" + signature_sha = hmac.new( + api_secret.encode("utf-8"), + signature_origin.encode("utf-8"), + digestmod=hashlib.sha256, + ).digest() + signature = base64.b64encode(signature_sha).decode("utf-8") + authorization_origin = ( + f'api_key="{api_key}", algorithm="hmac-sha256", ' + f'headers="host date request-line", signature="{signature}"' + ) + authorization = base64.b64encode(authorization_origin.encode("utf-8")).decode("utf-8") + query = urlencode({"authorization": authorization, "date": date, "host": host}) + return f"{url}?{query}" diff --git a/static/voice-demo/README.md b/static/voice-demo/README.md new file mode 100644 index 0000000..8832eb8 --- /dev/null +++ b/static/voice-demo/README.md @@ -0,0 +1,106 @@ +# Webpage Example — Realtime Voice Chat + +A self-contained browser client for the engine's product websocket +(`/ws-product`, protocol `va.ws.v1`). + +## Features + +- **Connect / Disconnect** to any `ws://` or `wss://` URL. +- **Microphone selector + mic on/off toggle** — available input devices + are listed with `enumerateDevices`, and getUserMedia is requested with + `echoCancellation`, `noiseSuppression`, and `autoGainControl` so the + browser handles AEC against the bot's voice. +- **Text composer** — type a message and press Enter to send + an `input.text` event (Shift+Enter for newline). Sending interrupts + any in-flight bot audio so the next reply is heard cleanly. +- **Chat history** rendered from `input.transcript.final` (you, when + spoken), streamed `response.text.delta` / `response.text.final` + (assistant — deltas arrive ahead of the synthesized audio), and locally + for text you submit (the engine doesn't echo text input back as a + transcript). +- **WebSocket log** panel for connection state and compact send/receive + events. Audio chunks are summarized so the UI does not flood. +- **Gapless TTS playback** by scheduling each `response.audio.delta` + chunk back-to-back on the AudioContext. +- **Live VU meter** + mic and bot activity indicators. +- **Clear** button to reset history. + +No build step, no dependencies — just three files plus an AudioWorklet. + +## Layout + +```text +examples/webpage/ +├── index.html +├── styles.css +├── app.js +└── pcm-recorder.worklet.js +``` + +## Run + +1. Start the engine (default port `8000`): + + ```bash + cd AI-VideoAssistant-engine-v5-pipecat-minimal + source .venv/bin/activate + export OPENAI_API_KEY=... + uvicorn engine.main:app --host 127.0.0.1 --port 8000 + ``` + +2. Open the demo page served by the same process: + + ```text + http://127.0.0.1:8000/demo/ + ``` + + The default websocket URL is derived from the page host + (`ws://127.0.0.1:8000/ws-product`). Click **Connect**, pick a + microphone if needed, click **Enable mic**, and start speaking. + + Mount path and on/off are controlled in `config.json`: + + ```json + "server": { + "serve_webpage": true, + "webpage_mount": "/demo" + } + ``` + + Set `"serve_webpage": false` in production if you serve the UI elsewhere. + +### Standalone static server (optional) + +You can still serve the files from another port for UI-only iteration. +Add that origin to `server.cors_origins` in `config.json` if needed: + +```bash +cd AI-VideoAssistant-engine-v5-pipecat-minimal/examples/webpage +python -m http.server 8080 +``` + +Then open and point the URL field at +`ws://127.0.0.1:8000/ws-product`. + +> The browser's mic API requires a secure context. `http://localhost` +> qualifies; if you serve from another host, use HTTPS and a `wss://` +> URL. + +## Audio details + +- Input: mono Float32 from `getUserMedia` is resampled in the + AudioWorklet to PCM16 mono @ 16 kHz, framed into 20 ms chunks, and + sent as **binary** websocket messages (the server accepts either + binary or the JSON+base64 form). +- Output: each `response.audio.delta` carries base64-encoded PCM16 @ + 16 kHz; chunks are decoded and scheduled back-to-back through Web + Audio. The browser handles resampling to the device rate. + +## Notes + +- Use headphones if you still hear echo despite browser AEC; the bot's + voice leaking back into the open mic is the most common cause of + feedback loops. +- The engine's session has an inactivity timeout + (`session.inactivity_timeout_sec` in `config.json`). If the bot + doesn't respond after a long silence, reconnect. diff --git a/static/voice-demo/app.js b/static/voice-demo/app.js new file mode 100644 index 0000000..a41e216 --- /dev/null +++ b/static/voice-demo/app.js @@ -0,0 +1,899 @@ +/** + * Minimal browser client for the AI VideoAssistant engine's product + * websocket (`/ws-product`, protocol `va.ws.v1`). + * + * Responsibilities: + * - Open/close the websocket and run the session handshake. + * - List/select microphones and capture mic audio with browser AEC enabled. + * - Downsample to PCM16 mono @ 16 kHz in an AudioWorklet and stream frames + * as binary websocket messages. + * - Play `response.audio.delta` frames gaplessly through Web Audio. + * - Render a chat-style history of user transcripts and bot text deltas. + */ + +const SAMPLE_RATE = 16000; +const CHANNELS = 1; +const FRAME_MS = 20; +const PROTOCOL = "va.ws.v1"; +const MAX_WS_LOG_LINES = 120; +const AUDIO_DELTA_LOG_INTERVAL_MS = 1000; + +function defaultWsUrl() { + const scheme = location.protocol === "https:" ? "wss:" : "ws:"; + return `${scheme}//${location.host}/ws-product`; +} + +const els = { + url: document.getElementById("ws-url"), + connectBtn: document.getElementById("connect-btn"), + statusDot: document.getElementById("status-dot"), + statusText: document.getElementById("status-text"), + chatLog: document.getElementById("chat-log"), + micBtn: document.getElementById("mic-btn"), + micSelect: document.getElementById("mic-select"), + micLabel: document.querySelector(".mic-btn__label"), + micIndicator: document.getElementById("mic-indicator"), + botIndicator: document.getElementById("bot-indicator"), + clearBtn: document.getElementById("clear-btn"), + clearWsLogBtn: document.getElementById("clear-ws-log-btn"), + wsLog: document.getElementById("ws-log"), + meterFill: document.getElementById("meter-fill"), + composer: document.getElementById("composer"), + textInput: document.getElementById("text-input"), + sendBtn: document.getElementById("send-btn"), +}; + +const state = { + ws: null, + connected: false, + connecting: false, + + audioContext: null, + micStream: null, + micSourceNode: null, + recorderNode: null, + + micEnabled: false, + micDevices: [], + selectedMicDeviceId: "", + + // Output scheduling. + nextPlaybackTime: 0, + playbackEndsAt: 0, + scheduledSources: [], + botActive: false, + botUiTimer: null, + + // Chat state. + currentAssistantBubble: null, + + // VU meter smoothing. + meterLevel: 0, + + // Compact websocket logging. + audioDeltaLogCount: 0, + audioDeltaLogBytes: 0, + lastAudioDeltaLogAt: 0, + audioSendLogCount: 0, + audioSendLogBytes: 0, + lastAudioSendLogAt: 0, +}; + +/* ------------------------------------------------------------------ UI */ + +function setStatus(kind, text) { + els.statusDot.className = `status__dot status__dot--${kind}`; + els.statusText.textContent = text; +} + +function setConnectButton() { + if (state.connecting) { + els.connectBtn.textContent = "Connecting…"; + els.connectBtn.disabled = true; + els.connectBtn.classList.remove("is-disconnect"); + } else if (state.connected) { + els.connectBtn.textContent = "Disconnect"; + els.connectBtn.disabled = false; + els.connectBtn.classList.add("is-disconnect"); + } else { + els.connectBtn.textContent = "Connect"; + els.connectBtn.disabled = false; + els.connectBtn.classList.remove("is-disconnect"); + } +} + +function setMicButton() { + els.micBtn.disabled = !state.connected; + els.micBtn.setAttribute("aria-pressed", state.micEnabled ? "true" : "false"); + els.micBtn.title = state.micEnabled ? "Mute mic" : "Unmute mic"; + els.micLabel.textContent = state.micEnabled ? "Mute mic" : "Enable mic"; + els.micIndicator.classList.toggle("is-active", state.micEnabled); +} + +function setMicSelectEnabled() { + els.micSelect.disabled = !state.connected || !navigator.mediaDevices; +} + +function setComposerEnabled(enabled) { + els.textInput.disabled = !enabled; + els.sendBtn.disabled = !enabled || els.textInput.value.trim().length === 0; +} + +function setBotIndicator(active) { + els.botIndicator.classList.toggle("is-active", active); +} + +function addBubble(role, text) { + if (els.chatLog.querySelector(".chat__empty")) { + els.chatLog.innerHTML = ""; + } + const bubble = document.createElement("div"); + bubble.className = `bubble bubble--${role}`; + if (role !== "system") { + const tag = document.createElement("span"); + tag.className = "bubble__role"; + tag.textContent = role === "user" ? "You" : "Assistant"; + bubble.appendChild(tag); + } + const body = document.createElement("span"); + body.className = "bubble__text"; + body.textContent = text; + bubble.appendChild(body); + els.chatLog.appendChild(bubble); + scrollChatToBottom(); + return bubble; +} + +function appendToBubble(bubble, text) { + const body = bubble.querySelector(".bubble__text"); + body.textContent += text; + scrollChatToBottom(); +} + +function scrollChatToBottom() { + els.chatLog.scrollTop = els.chatLog.scrollHeight; +} + +function clearChat() { + els.chatLog.innerHTML = ""; + state.currentAssistantBubble = null; + const empty = document.createElement("div"); + empty.className = "chat__empty"; + empty.innerHTML = "

Chat cleared.

"; + els.chatLog.appendChild(empty); +} + +function truncateLogValue(value, maxLength = 160) { + const text = String(value); + if (text.length <= maxLength) return text; + return `${text.slice(0, maxLength - 1)}…`; +} + +function compactWsPayload(payload) { + if (!payload || typeof payload !== "object") return String(payload); + const compact = { ...payload }; + + if (typeof compact.audio === "string") { + compact.audio = ``; + } + if (typeof compact.data === "string" && compact.data.length > 160) { + compact.data = ``; + } + if (typeof compact.text === "string") { + compact.text = truncateLogValue(compact.text); + } + + try { + return JSON.stringify(compact); + } catch (_) { + return payload.type || "unserializable websocket payload"; + } +} + +function addWsLog(direction, detail, kind = direction) { + if (els.wsLog.querySelector(".ws-log__empty")) { + els.wsLog.innerHTML = ""; + } + + const entry = document.createElement("div"); + entry.className = `ws-log__entry ws-log__entry--${kind}`; + + const time = document.createElement("span"); + time.className = "ws-log__time"; + time.textContent = new Date().toLocaleTimeString([], { + hour12: false, + hour: "2-digit", + minute: "2-digit", + second: "2-digit", + }); + + const dir = document.createElement("span"); + dir.className = "ws-log__direction"; + dir.textContent = + direction === "send" + ? "SEND" + : direction === "recv" + ? "RECV" + : direction.toUpperCase(); + + const body = document.createElement("span"); + body.className = "ws-log__detail"; + body.textContent = detail; + + entry.append(time, dir, body); + els.wsLog.appendChild(entry); + + while (els.wsLog.children.length > MAX_WS_LOG_LINES) { + els.wsLog.firstElementChild.remove(); + } + els.wsLog.scrollTop = els.wsLog.scrollHeight; +} + +function flushAudioDeltaLog() { + if (state.audioDeltaLogCount === 0) return; + addWsLog( + "recv", + `response.audio.delta x${state.audioDeltaLogCount} (${state.audioDeltaLogBytes} bytes)`, + ); + state.audioDeltaLogCount = 0; + state.audioDeltaLogBytes = 0; + state.lastAudioDeltaLogAt = performance.now(); +} + +function flushAudioSendLog() { + if (state.audioSendLogCount === 0) return; + addWsLog( + "send", + `input.audio binary x${state.audioSendLogCount} (${state.audioSendLogBytes} bytes)`, + ); + state.audioSendLogCount = 0; + state.audioSendLogBytes = 0; + state.lastAudioSendLogAt = performance.now(); +} + +function flushPendingWsLogs() { + flushAudioDeltaLog(); + flushAudioSendLog(); +} + +function logWsPayload(direction, payload) { + if (direction === "send") { + flushAudioSendLog(); + } else { + flushAudioDeltaLog(); + } + + if (direction === "recv" && payload?.type === "response.audio.delta") { + state.audioDeltaLogCount += 1; + state.audioDeltaLogBytes += payload.bytes || payload.audio?.length || 0; + const now = performance.now(); + if (now - state.lastAudioDeltaLogAt >= AUDIO_DELTA_LOG_INTERVAL_MS) { + flushAudioDeltaLog(); + } + return; + } + + addWsLog(direction, compactWsPayload(payload)); +} + +function logBinarySend(byteLength) { + state.audioSendLogCount += 1; + state.audioSendLogBytes += byteLength; + const now = performance.now(); + if (now - state.lastAudioSendLogAt >= AUDIO_DELTA_LOG_INTERVAL_MS) { + flushAudioSendLog(); + } +} + +function wsSend(data) { + if (!state.ws || state.ws.readyState !== WebSocket.OPEN) return false; + + if (typeof data === "string") { + try { + logWsPayload("send", JSON.parse(data)); + } catch (_) { + flushAudioSendLog(); + flushAudioDeltaLog(); + addWsLog("send", truncateLogValue(data)); + } + } else { + const byteLength = + data instanceof ArrayBuffer + ? data.byteLength + : ArrayBuffer.isView(data) + ? data.byteLength + : 0; + if (byteLength > 0) { + logBinarySend(byteLength); + } + } + + state.ws.send(data); + return true; +} + +function clearWsLog() { + state.audioDeltaLogCount = 0; + state.audioDeltaLogBytes = 0; + state.audioSendLogCount = 0; + state.audioSendLogBytes = 0; + els.wsLog.innerHTML = + '
No websocket events yet.
'; +} + +/* ---------------------------------------------------------------- Audio */ + +async function ensureAudioContext() { + if (!state.audioContext) { + const Ctx = window.AudioContext || window.webkitAudioContext; + state.audioContext = new Ctx(); + await state.audioContext.audioWorklet.addModule("./pcm-recorder.worklet.js"); + } + if (state.audioContext.state === "suspended") { + await state.audioContext.resume(); + } + return state.audioContext; +} + +function renderMicDevices() { + const previousValue = state.selectedMicDeviceId || els.micSelect.value; + els.micSelect.innerHTML = ""; + + const defaultOption = document.createElement("option"); + defaultOption.value = ""; + defaultOption.textContent = "Default microphone"; + els.micSelect.appendChild(defaultOption); + + state.micDevices.forEach((device, index) => { + const option = document.createElement("option"); + option.value = device.deviceId; + option.textContent = device.label || `Microphone ${index + 1}`; + els.micSelect.appendChild(option); + }); + + const hasPrevious = state.micDevices.some( + (device) => device.deviceId === previousValue, + ); + state.selectedMicDeviceId = hasPrevious ? previousValue : ""; + els.micSelect.value = state.selectedMicDeviceId; + setMicSelectEnabled(); +} + +async function refreshMicDevices() { + if (!navigator.mediaDevices?.enumerateDevices) { + setMicSelectEnabled(); + return; + } + + try { + const devices = await navigator.mediaDevices.enumerateDevices(); + state.micDevices = devices.filter((device) => device.kind === "audioinput"); + renderMicDevices(); + } catch (err) { + console.warn("Could not enumerate microphones", err); + setMicSelectEnabled(); + } +} + +async function startMic() { + const ctx = await ensureAudioContext(); + const audioConstraints = { + echoCancellation: true, + noiseSuppression: true, + autoGainControl: true, + channelCount: 1, + }; + if (state.selectedMicDeviceId) { + audioConstraints.deviceId = { exact: state.selectedMicDeviceId }; + } + + state.micStream = await navigator.mediaDevices.getUserMedia({ + audio: audioConstraints, + video: false, + }); + await refreshMicDevices(); + + state.micSourceNode = ctx.createMediaStreamSource(state.micStream); + state.recorderNode = new AudioWorkletNode(ctx, "pcm-recorder", { + numberOfInputs: 1, + numberOfOutputs: 0, + channelCount: 1, + processorOptions: { + targetSampleRate: SAMPLE_RATE, + frameMs: FRAME_MS, + }, + }); + state.recorderNode.port.onmessage = (event) => { + const data = event.data; + if (!data || data.type !== "frame") return; + updateMeter(data.rms || 0); + if (state.connected) { + wsSend(data.buffer); + } + }; + + state.micSourceNode.connect(state.recorderNode); + state.micEnabled = true; + addWsLog("system", "mic capture started (binary input.audio frames)"); + setMicButton(); +} + +function stopMic() { + const wasEnabled = state.micEnabled; + if (state.recorderNode) { + try { + state.recorderNode.port.onmessage = null; + state.recorderNode.disconnect(); + } catch (_) { + /* ignore */ + } + state.recorderNode = null; + } + if (state.micSourceNode) { + try { + state.micSourceNode.disconnect(); + } catch (_) { + /* ignore */ + } + state.micSourceNode = null; + } + if (state.micStream) { + for (const track of state.micStream.getTracks()) { + try { + track.stop(); + } catch (_) { + /* ignore */ + } + } + state.micStream = null; + } + state.micEnabled = false; + updateMeter(0); + if (wasEnabled) { + flushAudioSendLog(); + addWsLog("system", "mic capture stopped"); + } + setMicButton(); +} + +function updateMeter(rms) { + // Smooth and convert to a 0..100 width. RMS ~0.3+ is loud speech. + const target = Math.min(1, rms * 2.4); + state.meterLevel = state.meterLevel * 0.5 + target * 0.5; + els.meterFill.style.width = `${Math.round(state.meterLevel * 100)}%`; +} + +/* ---------------------------------------------------- Bot audio playback */ + +function schedulePlayback(int16) { + const ctx = state.audioContext; + if (!ctx) return; + + const float32 = new Float32Array(int16.length); + for (let i = 0; i < int16.length; i++) { + float32[i] = int16[i] / (int16[i] < 0 ? 0x8000 : 0x7fff); + } + const buffer = ctx.createBuffer(CHANNELS, float32.length, SAMPLE_RATE); + buffer.copyToChannel(float32, 0); + + const src = ctx.createBufferSource(); + src.buffer = buffer; + src.connect(ctx.destination); + + const now = ctx.currentTime; + // Schedule immediately after the previously scheduled chunk to keep + // playback contiguous, with a tiny safety margin if we fell behind. + const startAt = Math.max(now + 0.02, state.nextPlaybackTime); + src.start(startAt); + state.nextPlaybackTime = startAt + buffer.duration; + state.playbackEndsAt = state.nextPlaybackTime; + + src.onended = () => { + const idx = state.scheduledSources.indexOf(src); + if (idx >= 0) state.scheduledSources.splice(idx, 1); + }; + state.scheduledSources.push(src); + + setBotIndicator(true); + if (state.botUiTimer) clearTimeout(state.botUiTimer); + const msUntilEnd = Math.max(0, (state.playbackEndsAt - now) * 1000) + 120; + state.botUiTimer = setTimeout(() => { + if (state.audioContext && + state.audioContext.currentTime >= state.playbackEndsAt - 0.01) { + setBotIndicator(false); + } + }, msUntilEnd); +} + +function stopPlaybackQueue() { + for (const src of state.scheduledSources) { + try { + src.onended = null; + src.stop(); + src.disconnect(); + } catch (_) { + /* already stopped */ + } + } + state.scheduledSources = []; + resetPlaybackClock(); + if (state.botUiTimer) { + clearTimeout(state.botUiTimer); + state.botUiTimer = null; + } + setBotIndicator(false); +} + +function resetPlaybackClock() { + if (state.audioContext) { + state.nextPlaybackTime = state.audioContext.currentTime; + state.playbackEndsAt = state.audioContext.currentTime; + } +} + +/* --------------------------------------------------------- Chat updates */ + +function handleUserTranscript(text) { + if (!text) return; + state.currentAssistantBubble = null; + addBubble("user", text); +} + +function sendText(text) { + const value = (text || "").trim(); + if (!value) return false; + if (!state.ws || state.ws.readyState !== WebSocket.OPEN) return false; + const message = { + type: "input.text", + text: value, + interrupt: true, + }; + + // The engine does not echo text input back as a transcript event, so we + // render the user bubble locally. Also interrupt any in-flight bot audio + // so the next reply is heard cleanly. We deliberately do NOT clear + // `currentAssistantBubble` here — the engine will emit a + // `response.text.final(interrupted=true)` for the in-flight assistant + // turn, which finalizes that bubble in place. A brand-new bubble for the + // reply will be created when `response.text.started` arrives. + wsSend(JSON.stringify(message)); + stopPlaybackQueue(); + addBubble("user", value); + return true; +} + +function handleAssistantDelta(text) { + if (!text) return; + if (!state.currentAssistantBubble) { + state.currentAssistantBubble = addBubble("assistant", ""); + } + appendToBubble(state.currentAssistantBubble, text); +} + +function handleAssistantStarted() { + state.currentAssistantBubble = null; +} + +function handleAssistantFinal(text, interrupted) { + if (!text) { + state.currentAssistantBubble = null; + return; + } + if (state.currentAssistantBubble) { + const body = state.currentAssistantBubble.querySelector(".bubble__text"); + body.textContent = text; + } else { + state.currentAssistantBubble = addBubble("assistant", text); + } + if (interrupted) { + state.currentAssistantBubble.classList.add("bubble--interrupted"); + } + state.currentAssistantBubble = null; + scrollChatToBottom(); +} + +function finalizeAssistantBubble() { + state.currentAssistantBubble = null; +} + +/* ---------------------------------------------------------- Websocket IO */ + +function decodeBase64ToInt16(b64) { + const binary = atob(b64); + const len = binary.length; + const bytes = new Uint8Array(len); + for (let i = 0; i < len; i++) bytes[i] = binary.charCodeAt(i); + return new Int16Array(bytes.buffer, bytes.byteOffset, bytes.byteLength / 2); +} + +function handleEvent(event) { + switch (event.type) { + case "response.audio.delta": + if (typeof event.audio === "string") { + schedulePlayback(decodeBase64ToInt16(event.audio)); + } + break; + case "response.audio.started": + setBotIndicator(true); + break; + case "response.audio.stopped": + finalizeAssistantBubble(); + // The indicator turns off automatically when the playback queue drains. + break; + case "response.text.delta": + handleAssistantDelta(event.text); + break; + case "response.text.started": + handleAssistantStarted(); + break; + case "response.text.final": + handleAssistantFinal(event.text, event.interrupted); + break; + case "input.transcript.final": + handleUserTranscript(event.text); + break; + case "input.transcript.interim": + // Ignore partial ASR updates; chat history renders committed user turns. + break; + case "transport.message": + // Reserved for future structured messages; ignore silently. + break; + default: + // Unknown event type: log for debugging. + console.debug("ws event", event); + } +} + +async function connect() { + if (state.connected || state.connecting) return; + const url = (els.url.value || "").trim(); + if (!url) { + setStatus("error", "Missing URL"); + return; + } + + state.connecting = true; + setStatus("connecting", "Connecting…"); + setConnectButton(); + addWsLog("system", `connecting ${url}`); + + try { + // Pre-warm audio context on user gesture so playback works on Safari. + await ensureAudioContext(); + } catch (err) { + console.error("AudioContext failed", err); + state.connecting = false; + setStatus("error", "Audio init failed"); + setConnectButton(); + addWsLog("error", `audio init failed: ${err.message || err}`, "error"); + return; + } + + let ws; + try { + ws = new WebSocket(url); + } catch (err) { + console.error("WebSocket constructor failed", err); + state.connecting = false; + setStatus("error", "Bad URL"); + setConnectButton(); + addWsLog("error", `bad websocket URL: ${err.message || err}`, "error"); + return; + } + ws.binaryType = "arraybuffer"; + state.ws = ws; + + ws.addEventListener("open", () => { + const startMessage = { + type: "session.start", + protocol: PROTOCOL, + audio: { + encoding: "pcm_s16le", + sample_rate: SAMPLE_RATE, + channels: CHANNELS, + }, + }; + + state.connecting = false; + state.connected = true; + resetPlaybackClock(); + addWsLog("system", "websocket open"); + setStatus("connected", "Connected"); + setConnectButton(); + setMicButton(); + setMicSelectEnabled(); + refreshMicDevices(); + + wsSend(JSON.stringify(startMessage)); + addBubble("system", "Session started."); + setComposerEnabled(true); + els.textInput.focus(); + }); + + ws.addEventListener("message", (event) => { + const data = event.data; + if (typeof data === "string") { + let parsed; + try { + parsed = JSON.parse(data); + } catch (err) { + console.warn("Bad JSON from server", err, data); + addWsLog( + "error", + `invalid JSON from server: ${truncateLogValue(data)}`, + "error", + ); + return; + } + logWsPayload("recv", parsed); + handleEvent(parsed); + } else if (data instanceof ArrayBuffer) { + // Server doesn't currently send binary, but handle it just in case. + addWsLog("recv", `binary audio ${data.byteLength} bytes`); + schedulePlayback(new Int16Array(data)); + } + }); + + ws.addEventListener("error", (err) => { + console.error("WebSocket error", err); + setStatus("error", "Connection error"); + addWsLog("error", "websocket error", "error"); + }); + + ws.addEventListener("close", (event) => { + const wasConnected = state.connected; + state.ws = null; + state.connected = false; + state.connecting = false; + if (state.micEnabled) stopMic(); + stopPlaybackQueue(); + setConnectButton(); + setMicButton(); + setMicSelectEnabled(); + setComposerEnabled(false); + setBotIndicator(false); + flushPendingWsLogs(); + addWsLog( + "system", + `websocket close code=${event.code}${ + event.reason ? ` reason=${event.reason}` : "" + }`, + ); + if (wasConnected) { + addBubble( + "system", + `Session ended${event.reason ? ` — ${event.reason}` : ""}.`, + ); + setStatus("idle", "Disconnected"); + } else { + setStatus("error", "Connection closed"); + } + }); +} + +function disconnect() { + if (!state.ws) return; + try { + if (state.ws.readyState === WebSocket.OPEN) { + const stopMessage = { type: "session.stop", reason: "client_disconnect" }; + wsSend(JSON.stringify(stopMessage)); + } + } catch (_) { + /* ignore */ + } + try { + state.ws.close(1000, "client_disconnect"); + } catch (_) { + /* ignore */ + } +} + +/* ---------------------------------------------------------------- Wiring */ + +els.connectBtn.addEventListener("click", () => { + if (state.connected) disconnect(); + else connect(); +}); + +els.micBtn.addEventListener("click", async () => { + if (!state.connected) return; + els.micBtn.disabled = true; + try { + if (state.micEnabled) { + stopMic(); + } else { + await startMic(); + } + } catch (err) { + console.error("Mic error", err); + addBubble("system", `Mic error: ${err.message || err}`); + } finally { + els.micBtn.disabled = !state.connected; + } +}); + +els.micSelect.addEventListener("change", async () => { + state.selectedMicDeviceId = els.micSelect.value; + if (!state.micEnabled) return; + + els.micSelect.disabled = true; + els.micBtn.disabled = true; + try { + stopMic(); + await startMic(); + } catch (err) { + console.error("Mic switch error", err); + addBubble("system", `Mic switch error: ${err.message || err}`); + } finally { + setMicButton(); + setMicSelectEnabled(); + } +}); + +if (navigator.mediaDevices?.addEventListener) { + navigator.mediaDevices.addEventListener("devicechange", refreshMicDevices); +} + +els.clearBtn.addEventListener("click", () => { + clearChat(); +}); + +els.clearWsLogBtn.addEventListener("click", () => { + clearWsLog(); +}); + +function autosizeTextarea() { + const ta = els.textInput; + ta.style.height = "auto"; + ta.style.height = `${Math.min(ta.scrollHeight, 180)}px`; +} + +function submitText() { + const value = els.textInput.value; + if (!sendText(value)) return; + els.textInput.value = ""; + autosizeTextarea(); + setComposerEnabled(state.connected); +} + +els.composer.addEventListener("submit", (event) => { + event.preventDefault(); + submitText(); +}); + +els.textInput.addEventListener("input", () => { + autosizeTextarea(); + setComposerEnabled(state.connected); +}); + +els.textInput.addEventListener("keydown", (event) => { + if (event.key === "Enter" && !event.shiftKey && !event.isComposing) { + event.preventDefault(); + submitText(); + } +}); + +window.addEventListener("beforeunload", () => { + if (state.ws) { + try { + state.ws.close(); + } catch (_) { + /* ignore */ + } + } + if (state.audioContext) { + try { + state.audioContext.close(); + } catch (_) { + /* ignore */ + } + } +}); + +els.url.value = defaultWsUrl(); + +setStatus("idle", "Disconnected"); +setConnectButton(); +setMicButton(); +setMicSelectEnabled(); +setComposerEnabled(false); diff --git a/static/voice-demo/index.html b/static/voice-demo/index.html new file mode 100644 index 0000000..deef69e --- /dev/null +++ b/static/voice-demo/index.html @@ -0,0 +1,158 @@ + + + + + + VA Voice Chat — /ws-product + + + +
+
+
+ +

VA Voice Chat

+
+ +
+ + +
+ +
+ + Disconnected +
+
+ +
+
+
+
+
+

Connect to the engine, enable your mic, and start talking.

+

+ Audio is streamed as PCM16 mono @ 16 kHz over + /ws-product. +

+
+
+
+ +
+ + +
+ + +
+ +
+ + + + +
+ + + Mic + + + + Bot + +
+ + +
+ +

+ Press Enter to send, Shift+Enter + for newline. Sending text will interrupt the bot if it's speaking. + Browser echo cancellation is on; use headphones if echo persists. +

+
+
+ +
+
+
+

WebSocket Log

+ +
+ +
+
+
No websocket events yet.
+
+
+
+
+ + + + diff --git a/static/voice-demo/pcm-recorder.worklet.js b/static/voice-demo/pcm-recorder.worklet.js new file mode 100644 index 0000000..db501ae --- /dev/null +++ b/static/voice-demo/pcm-recorder.worklet.js @@ -0,0 +1,104 @@ +/** + * PCM Recorder AudioWorklet. + * + * Captures mono Float32 mic samples at the AudioContext's native rate, + * resamples them to a target sample rate (default 16 kHz) with linear + * interpolation, then ships PCM16 frames of a fixed duration (default 20 ms) + * to the main thread via `port.postMessage(ArrayBuffer)`. + * + * It also computes a simple RMS level per frame for the UI VU meter so the + * main thread doesn't have to re-process the audio. + */ + +class PcmRecorderProcessor extends AudioWorkletProcessor { + constructor(options) { + super(); + + const opts = (options && options.processorOptions) || {}; + this._targetSampleRate = opts.targetSampleRate || 16000; + this._frameMs = opts.frameMs || 20; + this._frameSamples = Math.round( + (this._targetSampleRate * this._frameMs) / 1000, + ); + + // Resampling state. + // `ratio` is input samples per output sample. + this._ratio = sampleRate / this._targetSampleRate; + this._inputBuffer = new Float32Array(0); + // Float position in `_inputBuffer` for the next output sample. + this._inputOffset = 0; + + // Output framing state. + this._frameBuffer = new Int16Array(this._frameSamples); + this._frameIndex = 0; + + // VU meter accumulator. + this._rmsSumSquares = 0; + this._rmsCount = 0; + } + + process(inputs) { + const input = inputs[0]; + if (!input || input.length === 0) return true; + const channel = input[0]; + if (!channel || channel.length === 0) return true; + + // Append new samples to the input buffer. + const merged = new Float32Array(this._inputBuffer.length + channel.length); + merged.set(this._inputBuffer, 0); + merged.set(channel, this._inputBuffer.length); + this._inputBuffer = merged; + + const ratio = this._ratio; + const inLen = this._inputBuffer.length; + let pos = this._inputOffset; + + while (pos + 1 < inLen) { + const lo = Math.floor(pos); + const hi = lo + 1; + const w = pos - lo; + const sample = + this._inputBuffer[lo] * (1 - w) + this._inputBuffer[hi] * w; + + this._rmsSumSquares += sample * sample; + this._rmsCount += 1; + + let s = sample; + if (s > 1) s = 1; + else if (s < -1) s = -1; + this._frameBuffer[this._frameIndex++] = + s < 0 ? Math.round(s * 0x8000) : Math.round(s * 0x7fff); + + if (this._frameIndex === this._frameSamples) { + const frame = new Int16Array(this._frameSamples); + frame.set(this._frameBuffer); + const rms = + this._rmsCount > 0 + ? Math.sqrt(this._rmsSumSquares / this._rmsCount) + : 0; + this.port.postMessage( + { type: "frame", buffer: frame.buffer, rms }, + [frame.buffer], + ); + this._frameIndex = 0; + this._rmsSumSquares = 0; + this._rmsCount = 0; + } + + pos += ratio; + } + + // Trim consumed samples from the input buffer; keep at least the last + // sample we still need to interpolate against on the next call. + const consumed = Math.floor(pos); + if (consumed > 0) { + this._inputBuffer = this._inputBuffer.slice(consumed); + pos -= consumed; + } + this._inputOffset = pos; + + return true; + } +} + +registerProcessor("pcm-recorder", PcmRecorderProcessor); diff --git a/static/voice-demo/styles.css b/static/voice-demo/styles.css new file mode 100644 index 0000000..28ede32 --- /dev/null +++ b/static/voice-demo/styles.css @@ -0,0 +1,739 @@ +:root { + color-scheme: dark; + --bg: #0b0d12; + --bg-elevated: #131722; + --bg-soft: #1a2030; + --border: #232a3b; + --text: #e6e9f2; + --text-dim: #95a0bb; + --accent: #4f8cff; + --accent-strong: #6aa1ff; + --user: #2f7ff0; + --assistant: #2a3145; + --danger: #ff5577; + --success: #2dd28b; + --warning: #ffb84d; + --radius: 14px; + --shadow: 0 10px 30px rgba(0, 0, 0, 0.35); + font-family: ui-sans-serif, system-ui, -apple-system, "Segoe UI", Roboto, + "Helvetica Neue", Arial, sans-serif; +} + +* { + box-sizing: border-box; +} + +html, +body { + margin: 0; + padding: 0; + height: 100%; + background: radial-gradient( + 1200px 600px at 80% -10%, + rgba(79, 140, 255, 0.18), + transparent 60% + ), + radial-gradient( + 900px 500px at -10% 110%, + rgba(45, 210, 139, 0.12), + transparent 60% + ), + var(--bg); + color: var(--text); +} + +body { + display: flex; + justify-content: center; + padding: 12px; +} + +.app { + display: grid; + grid-template-rows: auto minmax(0, 1fr); + gap: 12px; + width: min(1440px, 100%); + height: calc(100dvh - 24px); + max-height: calc(100vh - 24px); + background: var(--bg-elevated); + border: 1px solid var(--border); + border-radius: 20px; + box-shadow: var(--shadow); + padding: 14px; +} + +.app__body { + display: grid; + grid-template-columns: minmax(0, 1fr) clamp(300px, 32vw, 420px); + gap: 12px; + min-height: 0; +} + +.app__main { + display: grid; + grid-template-rows: minmax(0, 1fr) auto; + gap: 0; + min-height: 0; + overflow: hidden; + background: var(--bg); + border: 1px solid var(--border); + border-radius: var(--radius); +} + +/* Header ---------------------------------------------------------------- */ + +.app__header { + display: grid; + grid-template-columns: auto minmax(0, 1fr) auto; + align-items: center; + gap: 12px; + padding-bottom: 10px; + border-bottom: 1px solid var(--border); +} + +.brand { + display: flex; + align-items: center; + gap: 10px; +} + +.brand h1 { + font-size: 16px; + margin: 0; + letter-spacing: 0.2px; +} + +.brand__dot { + width: 10px; + height: 10px; + border-radius: 50%; + background: var(--accent); + box-shadow: 0 0 0 4px rgba(79, 140, 255, 0.18); +} + +.connection { + display: flex; + align-items: end; + gap: 8px; + min-width: 0; +} + +.connection__field { + display: flex; + flex-direction: column; + gap: 4px; + min-width: 0; + flex: 1; +} + +.connection__field span { + font-size: 11px; + color: var(--text-dim); + text-transform: uppercase; + letter-spacing: 0.8px; +} + +.connection__field input { + background: var(--bg-soft); + color: var(--text); + border: 1px solid var(--border); + border-radius: 10px; + padding: 7px 10px; + font: inherit; + font-size: 12px; + outline: none; + width: 100%; + min-width: 0; +} + +.connection__field input:focus { + border-color: var(--accent); + box-shadow: 0 0 0 3px rgba(79, 140, 255, 0.18); +} + +.status { + display: flex; + align-items: center; + gap: 8px; + font-size: 13px; + color: var(--text-dim); + white-space: nowrap; +} + +.status__dot { + width: 9px; + height: 9px; + border-radius: 50%; + background: var(--text-dim); +} + +.status__dot--idle { + background: var(--text-dim); +} + +.status__dot--connecting { + background: var(--warning); + animation: pulse 1.2s ease-in-out infinite; +} + +.status__dot--connected { + background: var(--success); +} + +.status__dot--error { + background: var(--danger); +} + +/* Chat ------------------------------------------------------------------ */ + +.chat { + overflow: hidden; + display: flex; + flex-direction: column; + min-height: 0; +} + +.chat__log { + flex: 1; + overflow-y: auto; + padding: 18px; + display: flex; + flex-direction: column; + gap: 10px; + scroll-behavior: smooth; +} + +.chat__empty { + margin: auto; + text-align: center; + color: var(--text-dim); +} + +.chat__empty p { + margin: 4px 0; +} + +.chat__hint code { + background: var(--bg-soft); + border: 1px solid var(--border); + border-radius: 6px; + padding: 1px 6px; + font-size: 12px; +} + +.bubble { + max-width: 78%; + padding: 10px 14px; + border-radius: 14px; + line-height: 1.45; + font-size: 14px; + white-space: pre-wrap; + word-wrap: break-word; + animation: bubble-in 0.16s ease-out; +} + +.bubble--user { + background: var(--user); + color: #fff; + align-self: flex-end; + border-bottom-right-radius: 4px; +} + +.bubble--assistant { + background: var(--assistant); + color: var(--text); + align-self: flex-start; + border-bottom-left-radius: 4px; +} + +.bubble--assistant.bubble--interrupted { + opacity: 0.75; + border-left: 2px solid var(--warning); +} + +.bubble--system { + align-self: center; + background: transparent; + color: var(--text-dim); + font-size: 12px; + border: 1px dashed var(--border); + padding: 6px 10px; + border-radius: 999px; +} + +.bubble__role { + display: block; + font-size: 10px; + text-transform: uppercase; + letter-spacing: 0.8px; + opacity: 0.7; + margin-bottom: 4px; +} + +/* WebSocket log --------------------------------------------------------- */ + +.ws-log { + display: flex; + flex-direction: column; + min-height: 0; + background: var(--bg); + border: 1px solid var(--border); + border-radius: var(--radius); + overflow: hidden; +} + +.ws-log__header { + display: flex; + align-items: center; + justify-content: space-between; + gap: 8px; + padding: 8px 12px; + border-bottom: 1px solid var(--border); + flex-shrink: 0; + background: var(--bg-soft); +} + +.ws-log__legend { + display: flex; + align-items: center; + gap: 8px; + font-size: 10px; + color: var(--text-dim); +} + +.ws-log__legend-item { + display: inline-flex; + align-items: center; + gap: 4px; +} + +.ws-log__legend-item::before { + content: ""; + width: 8px; + height: 8px; + border-radius: 2px; +} + +.ws-log__legend-item--send::before { + background: var(--success); +} + +.ws-log__legend-item--recv::before { + background: var(--accent-strong); +} + +.ws-log__header h2 { + margin: 0; + font-size: 12px; + color: var(--text-dim); + text-transform: uppercase; + letter-spacing: 0.8px; +} + +.ws-log__header-left { + display: flex; + align-items: center; + gap: 12px; + min-width: 0; +} + +.ws-log__body { + flex: 1; + min-height: 0; + overflow-y: auto; + overflow-x: hidden; + padding: 6px 8px; + font-family: ui-monospace, SFMono-Regular, Menlo, monospace; + font-size: 10.5px; + line-height: 1.4; + color: var(--text-dim); +} + +.ws-log__entry { + display: grid; + grid-template-columns: 58px 42px minmax(0, 1fr); + gap: 6px; + align-items: start; + padding: 5px 4px; + border-radius: 6px; + white-space: pre-wrap; + word-break: break-word; +} + +.ws-log__entry:hover { + background: rgba(255, 255, 255, 0.03); +} + +.ws-log__time { + color: var(--text-dim); + opacity: 0.7; + font-size: 10px; +} + +.ws-log__direction { + font-weight: 700; + font-size: 9px; + letter-spacing: 0.4px; + text-transform: uppercase; + padding: 2px 0; +} + +.ws-log__detail { + min-width: 0; + overflow-wrap: anywhere; +} + +.ws-log__entry--send .ws-log__direction { + color: var(--success); +} + +.ws-log__entry--recv .ws-log__direction { + color: var(--accent-strong); +} + +.ws-log__entry--system .ws-log__direction { + color: var(--text-dim); +} + +.ws-log__entry--error .ws-log__direction { + color: var(--danger); +} + +.ws-log__empty { + color: var(--text-dim); + opacity: 0.7; + padding: 8px 4px; +} + +/* Controls -------------------------------------------------------------- */ + +.controls { + display: grid; + gap: 8px; + padding: 10px 12px 8px; + border-top: 1px solid var(--border); + background: var(--bg-elevated); +} + +.meter { + height: 6px; + background: var(--bg-soft); + border-radius: 999px; + overflow: hidden; +} + +.meter__fill { + height: 100%; + width: 0%; + background: linear-gradient(90deg, var(--success), var(--accent)); + transition: width 80ms linear; +} + +.composer { + display: flex; + align-items: flex-end; + gap: 8px; +} + +.composer__input { + flex: 1; + resize: none; + background: var(--bg-soft); + color: var(--text); + border: 1px solid var(--border); + border-radius: 12px; + padding: 10px 12px; + font: inherit; + font-size: 14px; + line-height: 1.4; + outline: none; + min-height: 42px; + max-height: 180px; + overflow-y: auto; +} + +.composer__input:focus { + border-color: var(--accent); + box-shadow: 0 0 0 3px rgba(79, 140, 255, 0.18); +} + +.composer__input:disabled { + opacity: 0.55; + cursor: not-allowed; +} + +.composer__send { + padding: 10px 18px; + border-radius: 12px; + min-width: 84px; + align-self: flex-end; +} + +.composer__send:disabled { + opacity: 0.5; + cursor: not-allowed; +} + +.controls__row { + display: flex; + align-items: center; + gap: 10px; + flex-wrap: wrap; +} + +.device-picker { + display: flex; + flex-direction: column; + gap: 3px; + min-width: 0; + flex: 1 1 160px; + max-width: 240px; +} + +.device-picker__label { + font-size: 11px; + color: var(--text-dim); + text-transform: uppercase; + letter-spacing: 0.8px; +} + +.device-picker__select { + appearance: none; + background: var(--bg-soft); + color: var(--text); + border: 1px solid var(--border); + border-radius: 10px; + padding: 7px 28px 7px 10px; + font: inherit; + font-size: 12px; + outline: none; + width: 100%; + cursor: pointer; +} + +.device-picker__select:focus { + border-color: var(--accent); + box-shadow: 0 0 0 3px rgba(79, 140, 255, 0.18); +} + +.device-picker__select:disabled { + opacity: 0.5; + cursor: not-allowed; +} + +.mic-btn { + display: inline-flex; + align-items: center; + gap: 6px; + padding: 8px 12px; + border-radius: 999px; + background: var(--bg-soft); + color: var(--text); + border: 1px solid var(--border); + font: inherit; + font-size: 12px; + font-weight: 600; + cursor: pointer; + flex-shrink: 0; + transition: transform 0.08s ease, background 0.15s ease, color 0.15s ease, + border-color 0.15s ease; +} + +.mic-btn:disabled { + opacity: 0.5; + cursor: not-allowed; +} + +.mic-btn:not(:disabled):hover { + border-color: var(--accent); +} + +.mic-btn:not(:disabled):active { + transform: scale(0.98); +} + +.mic-btn[aria-pressed="true"] { + background: var(--danger); + border-color: var(--danger); + color: #fff; + box-shadow: 0 0 0 6px rgba(255, 85, 119, 0.18); +} + +.mic-btn__icon { + display: block; +} + +.indicators { + display: flex; + gap: 12px; + margin-left: auto; +} + +.indicator { + display: inline-flex; + align-items: center; + gap: 6px; + font-size: 12px; + color: var(--text-dim); +} + +.indicator__dot { + width: 8px; + height: 8px; + border-radius: 50%; + background: var(--bg-soft); + border: 1px solid var(--border); +} + +.indicator.is-active .indicator__dot--mic { + background: var(--success); + border-color: var(--success); + box-shadow: 0 0 0 4px rgba(45, 210, 139, 0.2); +} + +.indicator.is-active .indicator__dot--bot { + background: var(--accent); + border-color: var(--accent); + box-shadow: 0 0 0 4px rgba(79, 140, 255, 0.22); + animation: pulse 1s ease-in-out infinite; +} + +.indicator.is-active .indicator__label { + color: var(--text); +} + +.btn { + appearance: none; + border: 1px solid var(--border); + background: var(--bg-soft); + color: var(--text); + padding: 9px 14px; + border-radius: 10px; + font: inherit; + font-size: 13px; + cursor: pointer; + transition: background 0.15s ease, border-color 0.15s ease; +} + +.btn:hover { + border-color: var(--accent); +} + +.btn--primary { + background: var(--accent); + color: #fff; + border-color: var(--accent); +} + +.btn--primary:hover { + background: var(--accent-strong); + border-color: var(--accent-strong); +} + +.btn--primary.is-disconnect { + background: transparent; + color: var(--danger); + border-color: var(--danger); +} + +.btn--ghost { + background: transparent; +} + +.hint { + margin: 0; + font-size: 11px; + color: var(--text-dim); + opacity: 0.85; +} + +.hint kbd { + background: var(--bg-soft); + border: 1px solid var(--border); + border-bottom-width: 2px; + border-radius: 4px; + padding: 0 5px; + font-family: ui-monospace, SFMono-Regular, Menlo, monospace; + font-size: 11px; + color: var(--text); +} + +/* Animations ------------------------------------------------------------ */ + +@keyframes pulse { + 0%, + 100% { + opacity: 1; + } + 50% { + opacity: 0.55; + } +} + +@keyframes bubble-in { + from { + opacity: 0; + transform: translateY(4px); + } + to { + opacity: 1; + transform: translateY(0); + } +} + +/* Responsive ------------------------------------------------------------ */ + +@media (max-width: 820px) { + .app__body { + grid-template-columns: 1fr; + grid-template-rows: minmax(0, 1fr) min(240px, 32vh); + } + + .ws-log__legend { + display: none; + } + + .hint { + display: none; + } +} + +@media (max-width: 720px) { + body { + padding: 0; + } + + .app { + height: 100dvh; + max-height: 100vh; + border-radius: 0; + border: none; + padding: 10px; + } + + .app__header { + grid-template-columns: 1fr; + } + + .connection { + flex-direction: column; + align-items: stretch; + } + + .ws-log__entry { + grid-template-columns: 54px 38px minmax(0, 1fr); + } + + .status { + justify-content: flex-end; + } + + .device-picker { + flex: 1 1 100%; + max-width: none; + } + + .indicators { + margin-left: 0; + } +}