Integrate product-ws voice demo on port 8000 alongside REST API.
Add src/voice Pipecat pipeline, browser demo at /voice-demo, and config/voice.json. Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
75
config/voice.json
Normal file
75
config/voice.json
Normal file
@@ -0,0 +1,75 @@
|
||||
{
|
||||
"server": {
|
||||
"host": "0.0.0.0",
|
||||
"port": 8000,
|
||||
"cors_origins": ["http://localhost:3000", "http://localhost:8080"],
|
||||
"serve_webpage": true,
|
||||
"webpage_mount": "/voice-demo"
|
||||
},
|
||||
"audio": {
|
||||
"sample_rate_hz": 16000,
|
||||
"channels": 1,
|
||||
"frame_ms": 20
|
||||
},
|
||||
"session": {
|
||||
"inactivity_timeout_sec": 60
|
||||
},
|
||||
"turn": {
|
||||
"vad": {
|
||||
"confidence": 0.7,
|
||||
"start_secs": 0.2,
|
||||
"stop_secs": 0.4,
|
||||
"min_volume": 0.6
|
||||
},
|
||||
"interruption_min_chars": 3,
|
||||
"interruption_use_interim": true,
|
||||
"interruption_short_replies": [
|
||||
"是",
|
||||
"是的",
|
||||
"对",
|
||||
"对的",
|
||||
"嗯",
|
||||
"好",
|
||||
"好的",
|
||||
"行",
|
||||
"可以",
|
||||
"没问题",
|
||||
"不是",
|
||||
"不",
|
||||
"不行",
|
||||
"不用",
|
||||
"不要",
|
||||
"没有",
|
||||
"否"
|
||||
],
|
||||
"user_speech_timeout_sec": 0.8
|
||||
},
|
||||
"agent": {
|
||||
"system_prompt": "You are a helpful, friendly voice assistant. Keep responses concise and natural for spoken conversation.",
|
||||
"greeting": "Please introduce yourself briefly.",
|
||||
"greeting_mode": "generated"
|
||||
},
|
||||
"services": {
|
||||
"stt": {
|
||||
"provider": "openai",
|
||||
"api_key": "",
|
||||
"base_url": null,
|
||||
"model": "gpt-4o-mini-transcribe",
|
||||
"language": "en"
|
||||
},
|
||||
"llm": {
|
||||
"provider": "openai",
|
||||
"api_key": "",
|
||||
"base_url": null,
|
||||
"model": "gpt-4o-mini",
|
||||
"temperature": 0.7
|
||||
},
|
||||
"tts": {
|
||||
"provider": "openai",
|
||||
"api_key": "",
|
||||
"base_url": null,
|
||||
"model": "gpt-4o-mini-tts",
|
||||
"voice": "alloy"
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,5 +1,7 @@
|
||||
fastapi>=0.104.0
|
||||
uvicorn>=0.24.0
|
||||
uvicorn[standard]>=0.24.0
|
||||
pipecat-ai[websocket,openai,silero]
|
||||
websockets>=13.1,<16.0
|
||||
pydantic>=2.4.2
|
||||
python-dotenv>=1.0.0
|
||||
httpx>=0.25.0
|
||||
@@ -12,7 +14,7 @@ pydantic-settings==2.1.0
|
||||
python-multipart==0.0.6
|
||||
python-jose[cryptography]==3.3.0
|
||||
passlib[bcrypt]==1.7.4
|
||||
openai==1.55.3
|
||||
openai>=1.74.0,<3
|
||||
loguru>=0.7.0
|
||||
pandas
|
||||
requests
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
from fastapi import FastAPI
|
||||
import sys
|
||||
from .api.endpoints import router as api_router
|
||||
from .core.fastgpt_client import lifespan
|
||||
from .core.logging_config import setup_logging
|
||||
from .voice.routes import register_voice
|
||||
|
||||
# Setup logging first
|
||||
setup_logging()
|
||||
@@ -18,4 +18,5 @@ app = FastAPI(
|
||||
def read_root():
|
||||
return {"message": "Server is running."}
|
||||
|
||||
app.include_router(api_router)
|
||||
app.include_router(api_router)
|
||||
register_voice(app)
|
||||
1
src/voice/__init__.py
Normal file
1
src/voice/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
"""Voice websocket demo (product-ws / va.ws.v1) powered by Pipecat."""
|
||||
202
src/voice/config.py
Normal file
202
src/voice/config.py
Normal file
@@ -0,0 +1,202 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
|
||||
PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
|
||||
DEFAULT_VOICE_CONFIG = PROJECT_ROOT / "config" / "voice.json"
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class ServerConfig:
|
||||
host: str = "0.0.0.0"
|
||||
port: int = 8000
|
||||
cors_origins: list[str] = field(default_factory=list)
|
||||
serve_webpage: bool = True
|
||||
webpage_mount: str = "/voice-demo"
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class AudioConfig:
|
||||
sample_rate_hz: int = 16000
|
||||
channels: int = 1
|
||||
frame_ms: int = 20
|
||||
|
||||
@property
|
||||
def frame_bytes(self) -> int:
|
||||
return int(self.sample_rate_hz * self.frame_ms / 1000) * self.channels * 2
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class SessionConfig:
|
||||
inactivity_timeout_sec: int = 60
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class VADConfig:
|
||||
confidence: float = 0.7
|
||||
start_secs: float = 0.2
|
||||
stop_secs: float = 0.6
|
||||
min_volume: float = 0.6
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class TurnConfig:
|
||||
vad: VADConfig = field(default_factory=VADConfig)
|
||||
user_speech_timeout_sec: float = 1.0
|
||||
interruption_min_chars: int = 3
|
||||
interruption_use_interim: bool = True
|
||||
interruption_short_replies: list[str] = field(
|
||||
default_factory=lambda: [
|
||||
"是",
|
||||
"是的",
|
||||
"对",
|
||||
"对的",
|
||||
"嗯",
|
||||
"好",
|
||||
"好的",
|
||||
"行",
|
||||
"可以",
|
||||
"没问题",
|
||||
"不是",
|
||||
"不",
|
||||
"不行",
|
||||
"不用",
|
||||
"不要",
|
||||
"没有",
|
||||
"否",
|
||||
"no",
|
||||
"yes",
|
||||
"ok",
|
||||
"okay",
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class AgentConfig:
|
||||
system_prompt: str = "You are a helpful, friendly voice assistant."
|
||||
greeting: str | None = None
|
||||
greeting_mode: str = "generated"
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class LLMConfig:
|
||||
provider: str = "openai"
|
||||
api_key: str = ""
|
||||
base_url: str | None = None
|
||||
model: str = "gpt-4o-mini"
|
||||
temperature: float | None = 0.7
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class STTConfig:
|
||||
provider: str = "openai"
|
||||
app_id: str = ""
|
||||
api_key: str = ""
|
||||
api_secret: str = ""
|
||||
base_url: str | None = None
|
||||
model: str = "gpt-4o-mini-transcribe"
|
||||
language: str | None = "en"
|
||||
domain: str = "iat"
|
||||
accent: str = "mandarin"
|
||||
encoding: str = "raw"
|
||||
frame_size: int = 1280
|
||||
timeout_sec: float = 10.0
|
||||
dynamic_correction: bool = False
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class TTSConfig:
|
||||
provider: str = "openai"
|
||||
app_id: str = ""
|
||||
api_key: str = ""
|
||||
api_secret: str = ""
|
||||
base_url: str | None = None
|
||||
model: str = "gpt-4o-mini-tts"
|
||||
voice: str = "alloy"
|
||||
aue: str = "raw"
|
||||
tte: str = "UTF8"
|
||||
speed: int = 50
|
||||
volume: int = 50
|
||||
pitch: int = 50
|
||||
timeout_sec: float = 30.0
|
||||
source_sample_rate_hz: int | None = None
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class ServicesConfig:
|
||||
llm: LLMConfig = field(default_factory=LLMConfig)
|
||||
stt: STTConfig = field(default_factory=STTConfig)
|
||||
tts: TTSConfig = field(default_factory=TTSConfig)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class EngineConfig:
|
||||
server: ServerConfig = field(default_factory=ServerConfig)
|
||||
audio: AudioConfig = field(default_factory=AudioConfig)
|
||||
session: SessionConfig = field(default_factory=SessionConfig)
|
||||
turn: TurnConfig = field(default_factory=TurnConfig)
|
||||
agent: AgentConfig = field(default_factory=AgentConfig)
|
||||
services: ServicesConfig = field(default_factory=ServicesConfig)
|
||||
|
||||
|
||||
def load_config(path: str | Path | None = None) -> EngineConfig:
|
||||
config_path = Path(path) if path is not None else DEFAULT_VOICE_CONFIG
|
||||
if not config_path.is_absolute():
|
||||
config_path = PROJECT_ROOT / config_path
|
||||
data = json.loads(config_path.read_text(encoding="utf-8"))
|
||||
if not isinstance(data, dict):
|
||||
raise ValueError(f"Config file must contain a JSON object: {config_path}")
|
||||
return config_from_dict(data)
|
||||
|
||||
|
||||
def config_from_dict(data: dict) -> EngineConfig:
|
||||
services = _dict(data.get("services"))
|
||||
agent = _dict(data.get("agent"))
|
||||
if agent.get("greeting") == "":
|
||||
agent["greeting"] = None
|
||||
if agent.get("greeting_mode") not in (None, "generated", "fixed", "off"):
|
||||
raise ValueError("agent.greeting_mode must be one of: generated, fixed, off")
|
||||
|
||||
stt = _dict(services.get("stt") or services.get("asr"))
|
||||
if stt.get("language") == "":
|
||||
stt["language"] = None
|
||||
|
||||
turn = _dict(data.get("turn"))
|
||||
vad = _dict(turn.get("vad"))
|
||||
|
||||
return EngineConfig(
|
||||
server=ServerConfig(**_dict(data.get("server"))),
|
||||
audio=AudioConfig(**_dict(data.get("audio"))),
|
||||
session=SessionConfig(**_dict(data.get("session"))),
|
||||
turn=TurnConfig(
|
||||
vad=VADConfig(**vad),
|
||||
user_speech_timeout_sec=float(
|
||||
turn.get("user_speech_timeout_sec", TurnConfig().user_speech_timeout_sec)
|
||||
),
|
||||
interruption_min_chars=int(
|
||||
turn.get("interruption_min_chars", TurnConfig().interruption_min_chars)
|
||||
),
|
||||
interruption_use_interim=bool(
|
||||
turn.get("interruption_use_interim", TurnConfig().interruption_use_interim)
|
||||
),
|
||||
interruption_short_replies=list(
|
||||
turn.get(
|
||||
"interruption_short_replies",
|
||||
TurnConfig().interruption_short_replies,
|
||||
)
|
||||
),
|
||||
),
|
||||
agent=AgentConfig(**agent),
|
||||
services=ServicesConfig(
|
||||
llm=LLMConfig(**_dict(services.get("llm"))),
|
||||
stt=STTConfig(**stt),
|
||||
tts=TTSConfig(**_dict(services.get("tts"))),
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
def _dict(value: object) -> dict:
|
||||
return dict(value) if isinstance(value, dict) else {}
|
||||
179
src/voice/pipeline.py
Normal file
179
src/voice/pipeline.py
Normal file
@@ -0,0 +1,179 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.audio.vad.vad_analyzer import VADParams
|
||||
from pipecat.frames.frames import (
|
||||
LLMRunFrame,
|
||||
OutputTransportMessageUrgentFrame,
|
||||
TTSSpeakFrame,
|
||||
)
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||
from pipecat.processors.aggregators.llm_response_universal import (
|
||||
AssistantTurnStoppedMessage,
|
||||
LLMContextAggregatorPair,
|
||||
LLMUserAggregatorParams,
|
||||
UserTurnStoppedMessage,
|
||||
)
|
||||
from pipecat.serializers.base_serializer import FrameSerializer
|
||||
from pipecat.transports.websocket.fastapi import (
|
||||
FastAPIWebsocketParams,
|
||||
FastAPIWebsocketTransport,
|
||||
)
|
||||
from pipecat.turns.user_stop.speech_timeout_user_turn_stop_strategy import (
|
||||
SpeechTimeoutUserTurnStopStrategy,
|
||||
)
|
||||
from pipecat.turns.user_turn_strategies import UserTurnStrategies
|
||||
|
||||
from .config import EngineConfig
|
||||
from .protocol import ProductWebsocketSerializer
|
||||
from .services import create_llm_service, create_stt_service, create_tts_service
|
||||
from .text_input import ProductTextInputProcessor
|
||||
from .text_stream import ProductTextStreamProcessor
|
||||
from .transcript_stream import ProductTranscriptStreamProcessor
|
||||
from .turn_start import InterruptionGateUserTurnStartStrategy
|
||||
|
||||
|
||||
async def run_product_voice_pipeline(websocket, config: EngineConfig) -> None:
|
||||
await run_pipeline_with_serializer(
|
||||
websocket,
|
||||
config,
|
||||
serializer=ProductWebsocketSerializer(
|
||||
sample_rate=config.audio.sample_rate_hz,
|
||||
channels=config.audio.channels,
|
||||
),
|
||||
client_label="Product JSON",
|
||||
)
|
||||
|
||||
|
||||
async def run_pipeline_with_serializer(
|
||||
websocket,
|
||||
config: EngineConfig,
|
||||
*,
|
||||
serializer: FrameSerializer,
|
||||
client_label: str,
|
||||
) -> None:
|
||||
transport = FastAPIWebsocketTransport(
|
||||
websocket=websocket,
|
||||
params=FastAPIWebsocketParams(
|
||||
audio_in_enabled=True,
|
||||
audio_out_enabled=True,
|
||||
audio_in_sample_rate=config.audio.sample_rate_hz,
|
||||
audio_out_sample_rate=config.audio.sample_rate_hz,
|
||||
audio_in_channels=config.audio.channels,
|
||||
audio_out_channels=config.audio.channels,
|
||||
serializer=serializer,
|
||||
session_timeout=None,
|
||||
),
|
||||
)
|
||||
|
||||
stt = create_stt_service(config.services.stt, config.audio)
|
||||
llm = create_llm_service(config.services.llm)
|
||||
tts = create_tts_service(config.services.tts, config.audio)
|
||||
|
||||
messages = [{"role": "system", "content": config.agent.system_prompt}]
|
||||
if config.agent.greeting and config.agent.greeting_mode == "generated":
|
||||
messages.append({"role": "system", "content": config.agent.greeting})
|
||||
|
||||
context = LLMContext(messages)
|
||||
|
||||
vad_params = VADParams(
|
||||
confidence=config.turn.vad.confidence,
|
||||
start_secs=config.turn.vad.start_secs,
|
||||
stop_secs=config.turn.vad.stop_secs,
|
||||
min_volume=config.turn.vad.min_volume,
|
||||
)
|
||||
user_turn_strategies = UserTurnStrategies(
|
||||
start=[
|
||||
InterruptionGateUserTurnStartStrategy(
|
||||
min_chars_when_bot_speaking=config.turn.interruption_min_chars,
|
||||
allowed_short_replies=config.turn.interruption_short_replies,
|
||||
use_interim=config.turn.interruption_use_interim,
|
||||
),
|
||||
],
|
||||
stop=[
|
||||
SpeechTimeoutUserTurnStopStrategy(
|
||||
user_speech_timeout=config.turn.user_speech_timeout_sec,
|
||||
),
|
||||
],
|
||||
)
|
||||
user_aggregator, assistant_aggregator = LLMContextAggregatorPair(
|
||||
context,
|
||||
user_params=LLMUserAggregatorParams(
|
||||
vad_analyzer=SileroVADAnalyzer(params=vad_params),
|
||||
user_turn_strategies=user_turn_strategies,
|
||||
),
|
||||
)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(),
|
||||
ProductTextInputProcessor(),
|
||||
stt,
|
||||
ProductTranscriptStreamProcessor(),
|
||||
user_aggregator,
|
||||
llm,
|
||||
ProductTextStreamProcessor(),
|
||||
tts,
|
||||
transport.output(),
|
||||
assistant_aggregator,
|
||||
]
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
params=PipelineParams(
|
||||
audio_in_sample_rate=config.audio.sample_rate_hz,
|
||||
audio_out_sample_rate=config.audio.sample_rate_hz,
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
enable_heartbeats=True,
|
||||
),
|
||||
idle_timeout_secs=config.session.inactivity_timeout_sec,
|
||||
)
|
||||
|
||||
@transport.event_handler("on_client_connected")
|
||||
async def on_client_connected(_transport, _client):
|
||||
logger.info(f"{client_label} websocket client connected")
|
||||
if config.agent.greeting_mode == "fixed" and config.agent.greeting:
|
||||
await task.queue_frames([TTSSpeakFrame(config.agent.greeting)])
|
||||
elif config.agent.greeting_mode == "generated":
|
||||
await task.queue_frames([LLMRunFrame()])
|
||||
|
||||
@transport.event_handler("on_client_disconnected")
|
||||
async def on_client_disconnected(_transport, _client):
|
||||
logger.info(f"{client_label} websocket client disconnected")
|
||||
await task.cancel()
|
||||
|
||||
@transport.event_handler("on_session_timeout")
|
||||
async def on_session_timeout(_transport, _client):
|
||||
logger.info(f"{client_label} websocket session timed out")
|
||||
await task.cancel()
|
||||
|
||||
@user_aggregator.event_handler("on_user_turn_stopped")
|
||||
async def on_user_turn_stopped(_aggregator, _strategy, message: UserTurnStoppedMessage):
|
||||
logger.info(f"User: {message.content}")
|
||||
text = (message.content or "").strip()
|
||||
if not text:
|
||||
return
|
||||
await task.queue_frame(
|
||||
OutputTransportMessageUrgentFrame(
|
||||
message={
|
||||
"type": "input.transcript.final",
|
||||
"text": text,
|
||||
"user_id": message.user_id,
|
||||
"timestamp": message.timestamp,
|
||||
}
|
||||
)
|
||||
)
|
||||
|
||||
@assistant_aggregator.event_handler("on_assistant_turn_stopped")
|
||||
async def on_assistant_turn_stopped(_aggregator, message: AssistantTurnStoppedMessage):
|
||||
logger.info(f"Assistant: {message.content}")
|
||||
|
||||
runner = PipelineRunner(handle_sigint=False)
|
||||
await runner.run(task)
|
||||
160
src/voice/protocol.py
Normal file
160
src/voice/protocol.py
Normal file
@@ -0,0 +1,160 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
import json
|
||||
from typing import Any
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
CancelFrame,
|
||||
BotStartedSpeakingFrame,
|
||||
BotStoppedSpeakingFrame,
|
||||
EndFrame,
|
||||
Frame,
|
||||
InputAudioRawFrame,
|
||||
InputTransportMessageFrame,
|
||||
OutputAudioRawFrame,
|
||||
OutputTransportMessageFrame,
|
||||
OutputTransportMessageUrgentFrame,
|
||||
TextFrame,
|
||||
TranscriptionFrame,
|
||||
)
|
||||
from pipecat.serializers.base_serializer import FrameSerializer
|
||||
|
||||
|
||||
class ProductWebsocketSerializer(FrameSerializer):
|
||||
"""Stable app-facing JSON/base64 protocol adapter for Pipecat websocket transport."""
|
||||
|
||||
protocol = "va.ws.v1"
|
||||
|
||||
def __init__(self, *, sample_rate: int, channels: int):
|
||||
super().__init__()
|
||||
self._sample_rate = sample_rate
|
||||
self._channels = channels
|
||||
self._sequence = 0
|
||||
|
||||
async def serialize(self, frame: Frame) -> str | bytes | None:
|
||||
if isinstance(frame, OutputAudioRawFrame):
|
||||
return self._event(
|
||||
"response.audio.delta",
|
||||
audio=base64.b64encode(frame.audio).decode("ascii"),
|
||||
bytes=len(frame.audio),
|
||||
sample_rate=frame.sample_rate,
|
||||
channels=frame.num_channels,
|
||||
)
|
||||
|
||||
if isinstance(frame, BotStartedSpeakingFrame):
|
||||
return self._event("response.audio.started")
|
||||
|
||||
if isinstance(frame, BotStoppedSpeakingFrame):
|
||||
return self._event("response.audio.stopped")
|
||||
|
||||
if isinstance(frame, TranscriptionFrame):
|
||||
return self._event(
|
||||
"input.transcript.final",
|
||||
text=frame.text,
|
||||
user_id=frame.user_id,
|
||||
timestamp=frame.timestamp,
|
||||
)
|
||||
|
||||
if isinstance(frame, TextFrame):
|
||||
return self._event("response.text.delta", text=frame.text)
|
||||
|
||||
if isinstance(frame, (OutputTransportMessageFrame, OutputTransportMessageUrgentFrame)):
|
||||
if self.should_ignore_frame(frame):
|
||||
return None
|
||||
message = frame.message
|
||||
if isinstance(message, dict) and isinstance(message.get("type"), str):
|
||||
event_type = message["type"]
|
||||
payload = {k: v for k, v in message.items() if k != "type"}
|
||||
return self._event(event_type, **payload)
|
||||
return self._event("transport.message", message=message)
|
||||
|
||||
return None
|
||||
|
||||
async def deserialize(self, data: str | bytes) -> Frame | None:
|
||||
if isinstance(data, bytes):
|
||||
return InputAudioRawFrame(
|
||||
audio=data,
|
||||
sample_rate=self._sample_rate,
|
||||
num_channels=self._channels,
|
||||
)
|
||||
|
||||
try:
|
||||
message = json.loads(data)
|
||||
except json.JSONDecodeError as exc:
|
||||
logger.warning(f"Invalid product websocket JSON: {exc}")
|
||||
return None
|
||||
|
||||
if not isinstance(message, dict):
|
||||
logger.warning("Product websocket message must be a JSON object")
|
||||
return None
|
||||
|
||||
message_type = message.get("type")
|
||||
if message_type == "session.start":
|
||||
return InputTransportMessageFrame(
|
||||
message={
|
||||
"type": "session.started",
|
||||
"protocol": self.protocol,
|
||||
"audio": {
|
||||
"encoding": "pcm_s16le",
|
||||
"sample_rate": self._sample_rate,
|
||||
"channels": self._channels,
|
||||
},
|
||||
}
|
||||
)
|
||||
|
||||
if message_type == "session.stop":
|
||||
return EndFrame()
|
||||
|
||||
if message_type == "response.cancel":
|
||||
return CancelFrame(reason="client_cancelled")
|
||||
|
||||
if message_type == "input.audio":
|
||||
audio = message.get("audio") or message.get("data")
|
||||
if not isinstance(audio, str):
|
||||
logger.warning("input.audio requires base64 'audio' or 'data'")
|
||||
return None
|
||||
try:
|
||||
pcm = base64.b64decode(audio)
|
||||
except ValueError as exc:
|
||||
logger.warning(f"Invalid input.audio base64: {exc}")
|
||||
return None
|
||||
return InputAudioRawFrame(
|
||||
audio=pcm,
|
||||
sample_rate=int(message.get("sample_rate") or self._sample_rate),
|
||||
num_channels=int(message.get("channels") or self._channels),
|
||||
)
|
||||
|
||||
if message_type == "input.text":
|
||||
text = message.get("text")
|
||||
if not isinstance(text, str) or not text.strip():
|
||||
logger.warning("input.text requires non-empty 'text'")
|
||||
return None
|
||||
return InputTransportMessageFrame(
|
||||
message={
|
||||
"type": "input.text",
|
||||
"text": text,
|
||||
"interrupt": bool(message.get("interrupt", True)),
|
||||
}
|
||||
)
|
||||
|
||||
if message_type == "transport.message":
|
||||
payload = message.get("message")
|
||||
return InputTransportMessageFrame(message=payload if isinstance(payload, dict) else message)
|
||||
|
||||
logger.warning(f"Unsupported product websocket message type: {message_type!r}")
|
||||
return None
|
||||
|
||||
def _event(self, event_type: str, **payload: Any) -> str:
|
||||
self._sequence += 1
|
||||
return json.dumps(
|
||||
{
|
||||
"type": event_type,
|
||||
"protocol": self.protocol,
|
||||
"seq": self._sequence,
|
||||
**payload,
|
||||
},
|
||||
ensure_ascii=False,
|
||||
)
|
||||
92
src/voice/routes.py
Normal file
92
src/voice/routes.py
Normal file
@@ -0,0 +1,92 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from functools import lru_cache
|
||||
from pathlib import Path
|
||||
|
||||
from fastapi import APIRouter, FastAPI, WebSocket
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
from loguru import logger
|
||||
|
||||
from .config import DEFAULT_VOICE_CONFIG, EngineConfig, load_config
|
||||
from .pipeline import run_product_voice_pipeline
|
||||
|
||||
PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
|
||||
VOICE_DEMO_DIR = PROJECT_ROOT / "static" / "voice-demo"
|
||||
|
||||
router = APIRouter(tags=["voice"])
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
|
||||
def get_voice_config() -> EngineConfig:
|
||||
return load_config(DEFAULT_VOICE_CONFIG)
|
||||
|
||||
|
||||
def _normalize_mount_path(path: str) -> str:
|
||||
normalized = path.strip() or "/voice-demo"
|
||||
if not normalized.startswith("/"):
|
||||
normalized = f"/{normalized}"
|
||||
return normalized.rstrip("/") or "/"
|
||||
|
||||
|
||||
@router.get("/voice/health")
|
||||
async def voice_health() -> dict[str, object]:
|
||||
config = get_voice_config()
|
||||
mount = (
|
||||
_normalize_mount_path(config.server.webpage_mount)
|
||||
if config.server.serve_webpage
|
||||
else None
|
||||
)
|
||||
return {
|
||||
"status": "healthy",
|
||||
"protocols": {
|
||||
"/ws-product": "va.ws.v1.json_base64",
|
||||
},
|
||||
"features": {
|
||||
"product_text_input": True,
|
||||
"product_text_interrupt": True,
|
||||
},
|
||||
"demo": mount,
|
||||
"llm_provider": config.services.llm.provider,
|
||||
"stt_provider": config.services.stt.provider,
|
||||
"tts_provider": config.services.tts.provider,
|
||||
}
|
||||
|
||||
|
||||
@router.websocket("/ws-product")
|
||||
async def product_websocket_endpoint(websocket: WebSocket) -> None:
|
||||
await websocket.accept()
|
||||
config = get_voice_config()
|
||||
await run_product_voice_pipeline(websocket, config)
|
||||
|
||||
|
||||
def register_voice(app: FastAPI) -> None:
|
||||
"""Mount voice websocket routes and optional browser demo static files."""
|
||||
if not DEFAULT_VOICE_CONFIG.exists():
|
||||
logger.warning(f"Voice config not found at {DEFAULT_VOICE_CONFIG}; voice demo disabled")
|
||||
return
|
||||
|
||||
config = get_voice_config()
|
||||
app.include_router(router)
|
||||
|
||||
if config.server.cors_origins:
|
||||
app.add_middleware(
|
||||
CORSMiddleware,
|
||||
allow_origins=config.server.cors_origins,
|
||||
allow_credentials=True,
|
||||
allow_methods=["*"],
|
||||
allow_headers=["*"],
|
||||
)
|
||||
|
||||
if config.server.serve_webpage and VOICE_DEMO_DIR.is_dir():
|
||||
mount = _normalize_mount_path(config.server.webpage_mount)
|
||||
app.mount(
|
||||
mount,
|
||||
StaticFiles(directory=str(VOICE_DEMO_DIR), html=True),
|
||||
name="voice-demo",
|
||||
)
|
||||
logger.info(f"Voice demo mounted at {mount}")
|
||||
else:
|
||||
logger.info("Voice demo static page disabled or missing")
|
||||
|
||||
logger.info("Voice websocket registered at /ws-product")
|
||||
165
src/voice/services.py
Normal file
165
src/voice/services.py
Normal file
@@ -0,0 +1,165 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import AsyncGenerator
|
||||
|
||||
from openai import BadRequestError
|
||||
from openai import NOT_GIVEN
|
||||
|
||||
from pipecat.frames.frames import ErrorFrame, Frame, TTSAudioRawFrame
|
||||
from pipecat.services.openai._constants import OPENAI_SAMPLE_RATE
|
||||
from pipecat.services.openai.llm import OpenAILLMService
|
||||
from pipecat.services.openai.stt import OpenAISTTService
|
||||
from pipecat.services.openai.tts import VALID_VOICES, OpenAITTSService
|
||||
from pipecat.transcriptions.language import Language
|
||||
|
||||
from .config import AudioConfig, LLMConfig, STTConfig, TTSConfig
|
||||
from .xfyun_asr import DEFAULT_XFYUN_ASR_URL, XfyunASRService
|
||||
from .xfyun_tts import DEFAULT_XFYUN_TTS_URL, XfyunTTSService
|
||||
|
||||
|
||||
def create_stt_service(config: STTConfig, audio: AudioConfig | None = None):
|
||||
if config.provider == "xfyun":
|
||||
sample_rate = audio.sample_rate_hz if audio else 16000
|
||||
return XfyunASRService(
|
||||
app_id=config.app_id,
|
||||
api_key=config.api_key or "",
|
||||
api_secret=config.api_secret,
|
||||
url=config.base_url or DEFAULT_XFYUN_ASR_URL,
|
||||
language=config.language or "zh_cn",
|
||||
domain=config.domain,
|
||||
accent=config.accent,
|
||||
sample_rate=sample_rate,
|
||||
encoding=config.encoding,
|
||||
frame_size=config.frame_size,
|
||||
open_timeout=config.timeout_sec,
|
||||
dynamic_correction=config.dynamic_correction,
|
||||
)
|
||||
|
||||
_require_provider(config.provider, "openai", "stt")
|
||||
return OpenAISTTService(
|
||||
api_key=config.api_key or None,
|
||||
base_url=config.base_url,
|
||||
settings=OpenAISTTService.Settings(
|
||||
model=config.model,
|
||||
language=_language(config.language),
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
def create_llm_service(config: LLMConfig):
|
||||
_require_provider(config.provider, "openai", "llm")
|
||||
return OpenAILLMService(
|
||||
api_key=config.api_key or None,
|
||||
base_url=config.base_url,
|
||||
settings=OpenAILLMService.Settings(
|
||||
model=config.model,
|
||||
temperature=config.temperature if config.temperature is not None else NOT_GIVEN,
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
def create_tts_service(config: TTSConfig, audio: AudioConfig):
|
||||
if config.provider == "xfyun":
|
||||
source_sample_rate = config.source_sample_rate_hz or audio.sample_rate_hz
|
||||
if source_sample_rate not in (8000, 16000):
|
||||
raise ValueError("Xfyun TTS source_sample_rate_hz must be 8000 or 16000")
|
||||
return XfyunTTSService(
|
||||
app_id=config.app_id,
|
||||
api_key=config.api_key or "",
|
||||
api_secret=config.api_secret,
|
||||
voice=config.voice,
|
||||
url=config.base_url or DEFAULT_XFYUN_TTS_URL,
|
||||
sample_rate=audio.sample_rate_hz,
|
||||
source_sample_rate=source_sample_rate,
|
||||
encoding=config.aue,
|
||||
text_encoding=config.tte,
|
||||
speed=config.speed,
|
||||
volume=config.volume,
|
||||
pitch=config.pitch,
|
||||
timeout=config.timeout_sec,
|
||||
)
|
||||
|
||||
_require_provider(config.provider, "openai", "tts")
|
||||
service_class = OpenAITTSService if config.voice in VALID_VOICES else OpenAICompatibleTTSService
|
||||
return service_class(
|
||||
api_key=config.api_key or None,
|
||||
base_url=config.base_url,
|
||||
sample_rate=audio.sample_rate_hz,
|
||||
source_sample_rate=config.source_sample_rate_hz,
|
||||
settings=OpenAITTSService.Settings(
|
||||
model=config.model,
|
||||
voice=config.voice,
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
class OpenAICompatibleTTSService(OpenAITTSService):
|
||||
"""OpenAI-compatible TTS service that permits provider-specific voice ids."""
|
||||
|
||||
def __init__(self, *, source_sample_rate: int | None = None, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
self._source_sample_rate = source_sample_rate or OPENAI_SAMPLE_RATE
|
||||
|
||||
async def run_tts(self, text: str, context_id: str) -> AsyncGenerator[Frame, None]:
|
||||
voice = self._settings.voice
|
||||
if not voice:
|
||||
yield ErrorFrame(error="TTS voice must be specified")
|
||||
return
|
||||
|
||||
try:
|
||||
create_params = {
|
||||
"input": text,
|
||||
"model": self._settings.model,
|
||||
"voice": voice,
|
||||
"response_format": "pcm",
|
||||
}
|
||||
|
||||
if self._settings.instructions:
|
||||
create_params["instructions"] = self._settings.instructions
|
||||
|
||||
if self._settings.speed:
|
||||
create_params["speed"] = self._settings.speed
|
||||
|
||||
async with self._client.audio.speech.with_streaming_response.create(
|
||||
**create_params
|
||||
) as response:
|
||||
if response.status_code != 200:
|
||||
error = await response.text()
|
||||
yield ErrorFrame(
|
||||
error=f"TTS request failed (status: {response.status_code}, error: {error})"
|
||||
)
|
||||
return
|
||||
|
||||
await self.start_tts_usage_metrics(text)
|
||||
|
||||
async def audio_chunks():
|
||||
async for chunk in response.iter_bytes(self.chunk_size):
|
||||
if chunk:
|
||||
yield chunk
|
||||
|
||||
first_frame = True
|
||||
async for frame in self._stream_audio_frames_from_iterator(
|
||||
audio_chunks(),
|
||||
in_sample_rate=self._source_sample_rate,
|
||||
context_id=context_id,
|
||||
):
|
||||
if first_frame:
|
||||
await self.stop_ttfb_metrics()
|
||||
first_frame = False
|
||||
yield frame
|
||||
except BadRequestError as exc:
|
||||
yield ErrorFrame(error=f"TTS request failed: {exc}")
|
||||
except Exception as exc:
|
||||
yield ErrorFrame(error=f"TTS request failed: {exc}")
|
||||
|
||||
|
||||
def _require_provider(actual: str, expected: str, service_name: str) -> None:
|
||||
if actual != expected:
|
||||
raise ValueError(f"Unsupported {service_name} provider {actual!r}; expected {expected!r}")
|
||||
|
||||
|
||||
def _language(value: str | None) -> Language | None:
|
||||
if value is None:
|
||||
return None
|
||||
normalized = value.replace("-", "_").upper()
|
||||
return getattr(Language, normalized, value)
|
||||
38
src/voice/text_input.py
Normal file
38
src/voice/text_input.py
Normal file
@@ -0,0 +1,38 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.frames.frames import Frame, InputTransportMessageFrame, LLMMessagesAppendFrame
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
|
||||
|
||||
class ProductTextInputProcessor(FrameProcessor):
|
||||
"""Converts product text-input transport messages into LLM turns."""
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if not isinstance(frame, InputTransportMessageFrame):
|
||||
await self.push_frame(frame, direction)
|
||||
return
|
||||
|
||||
message = frame.message
|
||||
if not isinstance(message, dict) or message.get("type") != "input.text":
|
||||
await self.push_frame(frame, direction)
|
||||
return
|
||||
|
||||
text = str(message.get("text") or "").strip()
|
||||
if not text:
|
||||
return
|
||||
|
||||
if message.get("interrupt", True):
|
||||
logger.info("Text input interrupting current response")
|
||||
await self.broadcast_interruption()
|
||||
|
||||
await self.push_frame(
|
||||
LLMMessagesAppendFrame(
|
||||
messages=[{"role": "user", "content": text}],
|
||||
run_llm=True,
|
||||
),
|
||||
FrameDirection.DOWNSTREAM,
|
||||
)
|
||||
75
src/voice/text_stream.py
Normal file
75
src/voice/text_stream.py
Normal file
@@ -0,0 +1,75 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
Frame,
|
||||
InterruptionFrame,
|
||||
LLMFullResponseEndFrame,
|
||||
LLMFullResponseStartFrame,
|
||||
LLMTextFrame,
|
||||
OutputTransportMessageUrgentFrame,
|
||||
TTSSpeakFrame,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
|
||||
|
||||
class ProductTextStreamProcessor(FrameProcessor):
|
||||
"""Mirrors LLM text frames as streaming protocol events."""
|
||||
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
self._aggregation: list[str] = []
|
||||
self._turn_active = False
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection) -> None:
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if isinstance(frame, LLMFullResponseStartFrame):
|
||||
await self._start_turn()
|
||||
elif isinstance(frame, LLMTextFrame):
|
||||
if frame.text:
|
||||
await self._delta(frame.text)
|
||||
elif isinstance(frame, LLMFullResponseEndFrame):
|
||||
await self._end_turn(interrupted=False)
|
||||
elif isinstance(frame, InterruptionFrame):
|
||||
await self._end_turn(interrupted=True)
|
||||
elif isinstance(frame, TTSSpeakFrame):
|
||||
text = frame.text or ""
|
||||
await self._start_turn()
|
||||
if text:
|
||||
await self._delta(text)
|
||||
await self._end_turn(interrupted=False)
|
||||
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
async def _start_turn(self) -> None:
|
||||
if self._turn_active:
|
||||
return
|
||||
self._turn_active = True
|
||||
self._aggregation = []
|
||||
await self._emit("response.text.started")
|
||||
|
||||
async def _delta(self, text: str) -> None:
|
||||
if not self._turn_active:
|
||||
await self._start_turn()
|
||||
self._aggregation.append(text)
|
||||
await self._emit("response.text.delta", text=text)
|
||||
|
||||
async def _end_turn(self, *, interrupted: bool) -> None:
|
||||
if not self._turn_active:
|
||||
return
|
||||
full_text = "".join(self._aggregation)
|
||||
self._turn_active = False
|
||||
self._aggregation = []
|
||||
await self._emit(
|
||||
"response.text.final",
|
||||
text=full_text,
|
||||
interrupted=interrupted,
|
||||
)
|
||||
|
||||
async def _emit(self, event_type: str, **payload: object) -> None:
|
||||
await self.push_frame(
|
||||
OutputTransportMessageUrgentFrame(
|
||||
message={"type": event_type, **payload},
|
||||
),
|
||||
FrameDirection.DOWNSTREAM,
|
||||
)
|
||||
30
src/voice/transcript_stream.py
Normal file
30
src/voice/transcript_stream.py
Normal file
@@ -0,0 +1,30 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
Frame,
|
||||
InterimTranscriptionFrame,
|
||||
OutputTransportMessageUrgentFrame,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
||||
|
||||
|
||||
class ProductTranscriptStreamProcessor(FrameProcessor):
|
||||
"""Mirrors interim STT frames to the product websocket protocol."""
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection) -> None:
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if isinstance(frame, InterimTranscriptionFrame):
|
||||
await self.push_frame(
|
||||
OutputTransportMessageUrgentFrame(
|
||||
message={
|
||||
"type": "input.transcript.interim",
|
||||
"text": frame.text,
|
||||
"user_id": frame.user_id,
|
||||
"timestamp": frame.timestamp,
|
||||
}
|
||||
),
|
||||
FrameDirection.DOWNSTREAM,
|
||||
)
|
||||
|
||||
await self.push_frame(frame, direction)
|
||||
86
src/voice/turn_start.py
Normal file
86
src/voice/turn_start.py
Normal file
@@ -0,0 +1,86 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
|
||||
from loguru import logger
|
||||
from pipecat.frames.frames import (
|
||||
BotStartedSpeakingFrame,
|
||||
BotStoppedSpeakingFrame,
|
||||
Frame,
|
||||
InterimTranscriptionFrame,
|
||||
TranscriptionFrame,
|
||||
)
|
||||
from pipecat.turns.types import ProcessFrameResult
|
||||
from pipecat.turns.user_start.base_user_turn_start_strategy import BaseUserTurnStartStrategy
|
||||
|
||||
|
||||
_COUNTABLE_TEXT_RE = re.compile(r"[\w\u4e00-\u9fff]", re.UNICODE)
|
||||
|
||||
|
||||
class InterruptionGateUserTurnStartStrategy(BaseUserTurnStartStrategy):
|
||||
"""Starts user turns only after likely intentional speech."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
min_chars_when_bot_speaking: int,
|
||||
allowed_short_replies: list[str],
|
||||
use_interim: bool = True,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
self._min_chars_when_bot_speaking = min_chars_when_bot_speaking
|
||||
self._allowed_short_replies = {
|
||||
self._normalize_text(reply) for reply in allowed_short_replies if reply.strip()
|
||||
}
|
||||
self._use_interim = use_interim
|
||||
self._bot_speaking = False
|
||||
|
||||
async def reset(self):
|
||||
await super().reset()
|
||||
|
||||
async def process_frame(self, frame: Frame) -> ProcessFrameResult:
|
||||
if isinstance(frame, BotStartedSpeakingFrame):
|
||||
self._bot_speaking = True
|
||||
return ProcessFrameResult.CONTINUE
|
||||
if isinstance(frame, BotStoppedSpeakingFrame):
|
||||
self._bot_speaking = False
|
||||
return ProcessFrameResult.CONTINUE
|
||||
if isinstance(frame, InterimTranscriptionFrame) and self._use_interim:
|
||||
return await self._handle_transcription(frame.text, interim=True)
|
||||
if isinstance(frame, TranscriptionFrame):
|
||||
return await self._handle_transcription(frame.text, interim=False)
|
||||
|
||||
return ProcessFrameResult.CONTINUE
|
||||
|
||||
async def _handle_transcription(self, text: str, *, interim: bool) -> ProcessFrameResult:
|
||||
normalized = self._normalize_text(text)
|
||||
if not normalized:
|
||||
return ProcessFrameResult.CONTINUE
|
||||
|
||||
if not self._bot_speaking:
|
||||
await self.trigger_user_turn_started()
|
||||
return ProcessFrameResult.STOP
|
||||
|
||||
should_interrupt = self._should_interrupt(normalized)
|
||||
logger.debug(
|
||||
f"{self} interruption_gate text={text!r} normalized={normalized!r} "
|
||||
f"should_interrupt={should_interrupt} interim={interim}"
|
||||
)
|
||||
|
||||
if should_interrupt:
|
||||
await self.trigger_user_turn_started()
|
||||
return ProcessFrameResult.STOP
|
||||
|
||||
await self.trigger_reset_aggregation()
|
||||
return ProcessFrameResult.CONTINUE
|
||||
|
||||
def _should_interrupt(self, normalized: str) -> bool:
|
||||
return (
|
||||
normalized in self._allowed_short_replies
|
||||
or len(normalized) >= self._min_chars_when_bot_speaking
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _normalize_text(text: str) -> str:
|
||||
return "".join(_COUNTABLE_TEXT_RE.findall(text.lower()))
|
||||
353
src/voice/xfyun_asr.py
Normal file
353
src/voice/xfyun_asr.py
Normal file
@@ -0,0 +1,353 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import base64
|
||||
import hashlib
|
||||
import hmac
|
||||
import json
|
||||
import os
|
||||
from collections.abc import AsyncGenerator
|
||||
from datetime import datetime, timezone
|
||||
from email.utils import format_datetime
|
||||
from typing import Any
|
||||
from urllib.parse import urlencode, urlparse
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.frames.frames import (
|
||||
CancelFrame,
|
||||
EndFrame,
|
||||
Frame,
|
||||
InterimTranscriptionFrame,
|
||||
TranscriptionFrame,
|
||||
UserStoppedSpeakingFrame,
|
||||
VADUserStartedSpeakingFrame,
|
||||
)
|
||||
from pipecat.processors.frame_processor import FrameDirection
|
||||
from pipecat.services.settings import STTSettings
|
||||
from pipecat.services.stt_service import STTService
|
||||
from pipecat.transcriptions.language import Language
|
||||
from pipecat.utils.time import time_now_iso8601
|
||||
from websockets.asyncio.client import connect as websocket_connect
|
||||
from websockets.protocol import State
|
||||
|
||||
|
||||
DEFAULT_XFYUN_ASR_URL = "wss://iat-api.xfyun.cn/v2/iat"
|
||||
|
||||
|
||||
class XfyunASRService(STTService):
|
||||
"""iFlytek/Xfyun streaming voice dictation service for Pipecat."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
app_id: str,
|
||||
api_key: str,
|
||||
api_secret: str,
|
||||
url: str | None = None,
|
||||
language: str = "zh_cn",
|
||||
domain: str = "iat",
|
||||
accent: str = "mandarin",
|
||||
sample_rate: int = 16000,
|
||||
encoding: str = "raw",
|
||||
frame_size: int = 1280,
|
||||
open_timeout: float = 10.0,
|
||||
dynamic_correction: bool = False,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
super().__init__(
|
||||
sample_rate=sample_rate,
|
||||
settings=STTSettings(model=None, language=language),
|
||||
**kwargs,
|
||||
)
|
||||
self._app_id = app_id or os.environ.get("XFYUN_APP_ID", "")
|
||||
self._api_key = api_key or os.environ.get("XFYUN_API_KEY", "")
|
||||
self._api_secret = api_secret or os.environ.get("XFYUN_API_SECRET", "")
|
||||
self._url = url or DEFAULT_XFYUN_ASR_URL
|
||||
self._language = language
|
||||
self._domain = domain
|
||||
self._accent = accent
|
||||
self._encoding = encoding
|
||||
self._frame_size = frame_size
|
||||
self._open_timeout = open_timeout
|
||||
self._dynamic_correction = dynamic_correction
|
||||
|
||||
self._websocket = None
|
||||
self._receive_task = None
|
||||
self._audio_buffer = bytearray()
|
||||
self._sent_first_frame = False
|
||||
self._sent_final_frame = False
|
||||
self._finalizing_turn = False
|
||||
self._partials: list[str] = []
|
||||
self._last_text = ""
|
||||
|
||||
async def cleanup(self) -> None:
|
||||
await self._close_utterance()
|
||||
await super().cleanup()
|
||||
|
||||
async def stop(self, frame: EndFrame) -> None:
|
||||
await self._close_utterance()
|
||||
await super().stop(frame)
|
||||
|
||||
async def cancel(self, frame: CancelFrame) -> None:
|
||||
await self._close_utterance()
|
||||
await super().cancel(frame)
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection) -> None:
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if isinstance(frame, UserStoppedSpeakingFrame):
|
||||
# Aggregator-level turn end (broadcast once per logical user turn).
|
||||
# This is the only boundary that finalizes/closes the xfyun
|
||||
# websocket, so brief VAD pauses do not restart the ASR session.
|
||||
await self._finish_utterance()
|
||||
elif isinstance(frame, VADUserStartedSpeakingFrame):
|
||||
await self._start_utterance()
|
||||
|
||||
async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame | None, None]:
|
||||
if not audio:
|
||||
yield None
|
||||
return
|
||||
|
||||
if not self._websocket or self._websocket.state is not State.OPEN:
|
||||
await self._start_utterance()
|
||||
|
||||
self._audio_buffer.extend(audio)
|
||||
await self._flush_audio_buffer(final=False)
|
||||
yield None
|
||||
|
||||
async def _start_utterance(self) -> None:
|
||||
if self._websocket and self._websocket.state is State.OPEN:
|
||||
return
|
||||
|
||||
if not self._app_id or not self._api_key or not self._api_secret:
|
||||
await self.push_error("Xfyun ASR requires app_id, api_key, and api_secret")
|
||||
return
|
||||
|
||||
if self.sample_rate not in (8000, 16000):
|
||||
await self.push_error("Xfyun ASR sample rate must be 8000 or 16000")
|
||||
return
|
||||
|
||||
self._audio_buffer.clear()
|
||||
self._partials = []
|
||||
self._last_text = ""
|
||||
self._sent_first_frame = False
|
||||
self._sent_final_frame = False
|
||||
|
||||
auth_url = _build_auth_url(self._url, self._api_key, self._api_secret)
|
||||
try:
|
||||
self._websocket = await websocket_connect(
|
||||
auth_url,
|
||||
max_size=None,
|
||||
open_timeout=self._open_timeout,
|
||||
)
|
||||
except Exception as exc:
|
||||
await self.push_error(f"Xfyun ASR connection failed: {exc}", exception=exc)
|
||||
self._websocket = None
|
||||
return
|
||||
|
||||
self._receive_task = self.create_task(
|
||||
self._receive_messages(),
|
||||
name="xfyun_asr_receive",
|
||||
)
|
||||
|
||||
async def _finish_utterance(self) -> None:
|
||||
if not self._websocket or self._websocket.state is not State.OPEN:
|
||||
return
|
||||
|
||||
await self._flush_audio_buffer(final=True)
|
||||
if not self._sent_first_frame:
|
||||
await self._close_utterance()
|
||||
return
|
||||
|
||||
if not self._sent_final_frame:
|
||||
self._finalizing_turn = True
|
||||
await self._send_payload({"data": {"status": 2}})
|
||||
self.request_finalize()
|
||||
self._sent_final_frame = True
|
||||
|
||||
async def _close_utterance(self) -> None:
|
||||
current_task = asyncio.current_task()
|
||||
if self._receive_task and self._receive_task is not current_task:
|
||||
await self.cancel_task(self._receive_task)
|
||||
self._receive_task = None
|
||||
|
||||
websocket = self._websocket
|
||||
self._websocket = None
|
||||
if websocket and websocket.state is State.OPEN:
|
||||
try:
|
||||
await websocket.close()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
self._audio_buffer.clear()
|
||||
self._sent_first_frame = False
|
||||
self._sent_final_frame = False
|
||||
self._finalizing_turn = False
|
||||
|
||||
async def _flush_audio_buffer(self, *, final: bool) -> None:
|
||||
while len(self._audio_buffer) >= self._frame_size:
|
||||
chunk = bytes(self._audio_buffer[: self._frame_size])
|
||||
del self._audio_buffer[: self._frame_size]
|
||||
await self._send_audio_chunk(chunk, status=1)
|
||||
|
||||
if final and self._audio_buffer:
|
||||
chunk = bytes(self._audio_buffer)
|
||||
self._audio_buffer.clear()
|
||||
await self._send_audio_chunk(chunk, status=1)
|
||||
|
||||
async def _send_audio_chunk(self, audio: bytes, *, status: int) -> None:
|
||||
if not audio:
|
||||
return
|
||||
|
||||
if not self._sent_first_frame:
|
||||
business = {
|
||||
"language": self._language,
|
||||
"domain": self._domain,
|
||||
"accent": self._accent,
|
||||
}
|
||||
if self._dynamic_correction:
|
||||
business["dwa"] = "wpgs"
|
||||
|
||||
payload = {
|
||||
"common": {"app_id": self._app_id},
|
||||
"business": business,
|
||||
"data": {
|
||||
"status": 0,
|
||||
"format": f"audio/L16;rate={self.sample_rate}",
|
||||
"encoding": self._encoding,
|
||||
"audio": base64.b64encode(audio).decode("utf-8"),
|
||||
},
|
||||
}
|
||||
self._sent_first_frame = True
|
||||
else:
|
||||
payload = {
|
||||
"data": {
|
||||
"status": status,
|
||||
"format": f"audio/L16;rate={self.sample_rate}",
|
||||
"encoding": self._encoding,
|
||||
"audio": base64.b64encode(audio).decode("utf-8"),
|
||||
}
|
||||
}
|
||||
|
||||
await self._send_payload(payload)
|
||||
|
||||
async def _send_payload(self, payload: dict[str, Any]) -> None:
|
||||
if not self._websocket or self._websocket.state is not State.OPEN:
|
||||
return
|
||||
await self._websocket.send(json.dumps(payload, ensure_ascii=False))
|
||||
|
||||
async def _receive_messages(self) -> None:
|
||||
websocket = self._websocket
|
||||
if not websocket:
|
||||
return
|
||||
|
||||
try:
|
||||
async for message in websocket:
|
||||
await self._process_response(json.loads(message))
|
||||
except Exception as exc:
|
||||
if self._websocket is websocket:
|
||||
await self.push_error(f"Xfyun ASR receive failed: {exc}", exception=exc)
|
||||
finally:
|
||||
if self._websocket is websocket:
|
||||
self._websocket = None
|
||||
self._receive_task = None
|
||||
|
||||
async def _process_response(self, payload: dict[str, Any]) -> None:
|
||||
code = payload.get("code", -1)
|
||||
if code != 0:
|
||||
message = payload.get("message", "unknown error")
|
||||
sid = payload.get("sid")
|
||||
await self.push_error(f"Xfyun ASR error code={code}, sid={sid}, message={message}")
|
||||
return
|
||||
|
||||
data = payload.get("data")
|
||||
if not isinstance(data, dict):
|
||||
return
|
||||
|
||||
is_final_response = data.get("status") == 2
|
||||
recognition = data.get("result")
|
||||
if isinstance(recognition, dict):
|
||||
text = self._apply_recognition_result(recognition)
|
||||
if text and text != self._last_text:
|
||||
self._last_text = text
|
||||
if not self._finalizing_turn and not is_final_response:
|
||||
await self.push_frame(
|
||||
InterimTranscriptionFrame(
|
||||
text,
|
||||
self._user_id,
|
||||
time_now_iso8601(),
|
||||
_language_or_none(self._language),
|
||||
result=payload,
|
||||
)
|
||||
)
|
||||
|
||||
if is_final_response:
|
||||
final_text = self._last_text
|
||||
if final_text:
|
||||
self.confirm_finalize()
|
||||
await self.push_frame(
|
||||
TranscriptionFrame(
|
||||
final_text,
|
||||
self._user_id,
|
||||
time_now_iso8601(),
|
||||
_language_or_none(self._language),
|
||||
result=payload,
|
||||
)
|
||||
)
|
||||
await self._close_utterance()
|
||||
|
||||
def _apply_recognition_result(self, recognition: dict[str, Any]) -> str:
|
||||
partial = _extract_text_from_result(recognition)
|
||||
if not partial:
|
||||
return self._last_text
|
||||
|
||||
if self._dynamic_correction and recognition.get("pgs") == "rpl" and recognition.get("rg"):
|
||||
start, end = recognition["rg"]
|
||||
if 1 <= start <= len(self._partials):
|
||||
self._partials[start - 1 : end] = [partial]
|
||||
else:
|
||||
logger.debug(f"Ignoring out-of-range Xfyun replacement rg={recognition['rg']}")
|
||||
else:
|
||||
self._partials.append(partial)
|
||||
|
||||
return "".join(self._partials)
|
||||
|
||||
|
||||
def _extract_text_from_result(result: dict[str, Any]) -> str:
|
||||
words: list[str] = []
|
||||
for item in result.get("ws", []):
|
||||
for candidate in item.get("cw", []):
|
||||
word = candidate.get("w")
|
||||
if word:
|
||||
words.append(word)
|
||||
return "".join(words)
|
||||
|
||||
|
||||
def _build_auth_url(url: str, api_key: str, api_secret: str) -> str:
|
||||
parsed = urlparse(url)
|
||||
host = parsed.netloc
|
||||
path = parsed.path or "/v2/iat"
|
||||
date = format_datetime(datetime.now(timezone.utc), usegmt=True)
|
||||
request_line = f"GET {path} HTTP/1.1"
|
||||
signature_origin = f"host: {host}\ndate: {date}\n{request_line}"
|
||||
signature_sha = hmac.new(
|
||||
api_secret.encode("utf-8"),
|
||||
signature_origin.encode("utf-8"),
|
||||
digestmod=hashlib.sha256,
|
||||
).digest()
|
||||
signature = base64.b64encode(signature_sha).decode("utf-8")
|
||||
authorization_origin = (
|
||||
f'api_key="{api_key}", algorithm="hmac-sha256", '
|
||||
f'headers="host date request-line", signature="{signature}"'
|
||||
)
|
||||
authorization = base64.b64encode(authorization_origin.encode("utf-8")).decode("utf-8")
|
||||
query = urlencode({"authorization": authorization, "date": date, "host": host})
|
||||
return f"{url}?{query}"
|
||||
|
||||
|
||||
def _language_or_none(value: str) -> Language | None:
|
||||
try:
|
||||
return Language(value)
|
||||
except ValueError:
|
||||
return None
|
||||
257
src/voice/xfyun_tts.py
Normal file
257
src/voice/xfyun_tts.py
Normal file
@@ -0,0 +1,257 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
import hashlib
|
||||
import hmac
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import unicodedata
|
||||
from collections.abc import AsyncGenerator, AsyncIterator
|
||||
from datetime import datetime, timezone
|
||||
from email.utils import format_datetime
|
||||
from typing import Any
|
||||
from urllib.parse import urlencode, urlparse
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from pipecat.frames.frames import ErrorFrame, Frame
|
||||
from pipecat.services.settings import TTSSettings
|
||||
from pipecat.services.tts_service import TTSService
|
||||
from websockets.asyncio.client import connect
|
||||
|
||||
|
||||
DEFAULT_XFYUN_TTS_URL = "wss://tts-api.xfyun.cn/v2/tts"
|
||||
|
||||
# Strip characters Xfyun's online TTS cannot synthesize. The engine silently
|
||||
# rejects (or returns empty audio for) text containing emoji and other
|
||||
# non-BMP symbols, which surfaces as "request finished without audio data".
|
||||
_EMOJI_AND_SYMBOL_RE = re.compile(
|
||||
"["
|
||||
"\U0001F300-\U0001FAFF" # misc pictographs, emoji, symbols, transport, etc.
|
||||
"\U00002600-\U000027BF" # misc symbols and dingbats
|
||||
"\U0001F1E6-\U0001F1FF" # regional indicators (flags)
|
||||
"\uFE00-\uFE0F" # variation selectors
|
||||
"\u200D" # zero-width joiner
|
||||
"]",
|
||||
flags=re.UNICODE,
|
||||
)
|
||||
|
||||
|
||||
class XfyunTTSService(TTSService):
|
||||
"""iFlytek/Xfyun online TTS service for Pipecat.
|
||||
|
||||
Xfyun's API is not OpenAI-compatible. It uses a signed WebSocket URL,
|
||||
receives one JSON request per synthesis, and streams text WebSocket
|
||||
messages containing base64-encoded audio chunks. This service requests
|
||||
raw PCM so the chunks can become Pipecat audio frames without MP3 decode.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
app_id: str,
|
||||
api_key: str,
|
||||
api_secret: str,
|
||||
voice: str,
|
||||
url: str | None = None,
|
||||
sample_rate: int = 16000,
|
||||
source_sample_rate: int = 16000,
|
||||
encoding: str = "raw",
|
||||
text_encoding: str = "UTF8",
|
||||
speed: int = 50,
|
||||
volume: int = 50,
|
||||
pitch: int = 50,
|
||||
timeout: float = 30.0,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
super().__init__(
|
||||
sample_rate=sample_rate,
|
||||
settings=TTSSettings(model=None, voice=voice, language=None),
|
||||
**kwargs,
|
||||
)
|
||||
self._app_id = app_id or os.environ.get("XFYUN_APP_ID", "")
|
||||
self._api_key = api_key or os.environ.get("XFYUN_API_KEY", "")
|
||||
self._api_secret = api_secret or os.environ.get("XFYUN_API_SECRET", "")
|
||||
self._voice = voice
|
||||
self._url = url or DEFAULT_XFYUN_TTS_URL
|
||||
self._source_sample_rate = source_sample_rate
|
||||
self._encoding = encoding
|
||||
self._text_encoding = text_encoding
|
||||
self._speed = speed
|
||||
self._volume = volume
|
||||
self._pitch = pitch
|
||||
self._timeout = timeout
|
||||
self._last_failure_detail: str | None = None
|
||||
|
||||
async def run_tts(self, text: str, context_id: str) -> AsyncGenerator[Frame, None]:
|
||||
if not text:
|
||||
return
|
||||
|
||||
if not self._app_id or not self._api_key or not self._api_secret:
|
||||
yield ErrorFrame(error="Xfyun TTS requires app_id, api_key, and api_secret")
|
||||
return
|
||||
|
||||
sanitized = _sanitize_text_for_tts(text)
|
||||
if not sanitized:
|
||||
logger.debug(
|
||||
f"{self}: skipping Xfyun TTS, text became empty after sanitization "
|
||||
f"(original={text!r})"
|
||||
)
|
||||
return
|
||||
|
||||
if sanitized != text:
|
||||
logger.debug(
|
||||
f"{self}: sanitized Xfyun TTS text "
|
||||
f"(original={text!r}, sanitized={sanitized!r})"
|
||||
)
|
||||
|
||||
if len(sanitized.encode("utf-8")) >= 8000:
|
||||
yield ErrorFrame(error="Xfyun TTS text must be less than 8000 UTF-8 bytes")
|
||||
return
|
||||
|
||||
if self._encoding != "raw":
|
||||
yield ErrorFrame(error="Xfyun TTS is configured for PCM output; set aue/encoding to raw")
|
||||
return
|
||||
|
||||
try:
|
||||
await self.start_tts_usage_metrics(sanitized)
|
||||
|
||||
first_frame = True
|
||||
async for frame in self._stream_audio_frames_from_iterator(
|
||||
self._iter_audio_chunks(sanitized),
|
||||
in_sample_rate=self._source_sample_rate,
|
||||
context_id=context_id,
|
||||
):
|
||||
if first_frame:
|
||||
await self.stop_ttfb_metrics()
|
||||
first_frame = False
|
||||
yield frame
|
||||
|
||||
if first_frame:
|
||||
detail = self._last_failure_detail or "no audio frames received"
|
||||
yield ErrorFrame(
|
||||
error=(
|
||||
f"Xfyun TTS request finished without audio data ({detail}); "
|
||||
f"text={sanitized!r}"
|
||||
)
|
||||
)
|
||||
except Exception as exc:
|
||||
yield ErrorFrame(error=f"Xfyun TTS request failed: {exc}")
|
||||
|
||||
async def _iter_audio_chunks(self, text: str) -> AsyncIterator[bytes]:
|
||||
request = self._build_request_frame(text)
|
||||
auth_url = _build_auth_url(self._url, self._api_key, self._api_secret)
|
||||
|
||||
self._last_failure_detail = None
|
||||
frames_received = 0
|
||||
audio_bytes_received = 0
|
||||
last_status: int | None = None
|
||||
last_sid: str | None = None
|
||||
saw_status_2 = False
|
||||
|
||||
async with connect(auth_url, max_size=None, open_timeout=self._timeout) as websocket:
|
||||
await websocket.send(json.dumps(request, ensure_ascii=False))
|
||||
|
||||
async for raw_message in websocket:
|
||||
frames_received += 1
|
||||
payload = json.loads(raw_message)
|
||||
code = payload.get("code", -1)
|
||||
sid = payload.get("sid")
|
||||
if sid:
|
||||
last_sid = sid
|
||||
if code != 0:
|
||||
err_msg = payload.get("message", "unknown error")
|
||||
raise RuntimeError(f"code={code}, sid={sid}, message={err_msg}")
|
||||
|
||||
data = payload.get("data")
|
||||
if not isinstance(data, dict):
|
||||
continue
|
||||
|
||||
last_status = data.get("status", last_status)
|
||||
|
||||
audio_b64 = data.get("audio")
|
||||
if audio_b64:
|
||||
audio_bytes = base64.b64decode(audio_b64)
|
||||
audio_bytes_received += len(audio_bytes)
|
||||
yield audio_bytes
|
||||
|
||||
if data.get("status") == 2:
|
||||
saw_status_2 = True
|
||||
break
|
||||
|
||||
if audio_bytes_received == 0:
|
||||
self._last_failure_detail = (
|
||||
f"frames={frames_received}, audio_bytes=0, "
|
||||
f"last_status={last_status}, saw_status_2={saw_status_2}, sid={last_sid}"
|
||||
)
|
||||
logger.warning(
|
||||
f"{self}: Xfyun TTS produced no audio ({self._last_failure_detail})"
|
||||
)
|
||||
|
||||
def _build_request_frame(self, text: str) -> dict[str, Any]:
|
||||
business: dict[str, Any] = {
|
||||
"aue": self._encoding,
|
||||
"auf": f"audio/L16;rate={self._source_sample_rate}",
|
||||
"vcn": self._voice,
|
||||
"speed": self._speed,
|
||||
"volume": self._volume,
|
||||
"pitch": self._pitch,
|
||||
"tte": self._text_encoding,
|
||||
}
|
||||
|
||||
return {
|
||||
"common": {"app_id": self._app_id},
|
||||
"business": business,
|
||||
"data": {
|
||||
"status": 2,
|
||||
"text": base64.b64encode(text.encode("utf-8")).decode("utf-8"),
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def _sanitize_text_for_tts(text: str) -> str:
|
||||
"""Strip characters Xfyun's online TTS cannot synthesize.
|
||||
|
||||
The Xfyun ``/v2/tts`` engine silently drops or rejects emoji, pictographs,
|
||||
dingbats, regional-indicator flags, variation selectors, and zero-width
|
||||
joiners. When such characters appear in the input the synthesis can
|
||||
finish without any audio data ("Xfyun TTS request finished without audio
|
||||
data"). We also drop control characters (other than common whitespace)
|
||||
and "Symbol, Other" codepoints, then collapse runs of whitespace.
|
||||
"""
|
||||
if not text:
|
||||
return text
|
||||
|
||||
cleaned = _EMOJI_AND_SYMBOL_RE.sub("", text)
|
||||
filtered: list[str] = []
|
||||
for ch in cleaned:
|
||||
category = unicodedata.category(ch)
|
||||
if category == "So":
|
||||
continue
|
||||
if category.startswith("C") and ch not in ("\n", "\r", "\t"):
|
||||
continue
|
||||
filtered.append(ch)
|
||||
return re.sub(r"\s+", " ", "".join(filtered)).strip()
|
||||
|
||||
|
||||
def _build_auth_url(url: str, api_key: str, api_secret: str) -> str:
|
||||
parsed = urlparse(url)
|
||||
host = parsed.netloc
|
||||
path = parsed.path or "/v2/tts"
|
||||
date = format_datetime(datetime.now(timezone.utc), usegmt=True)
|
||||
request_line = f"GET {path} HTTP/1.1"
|
||||
signature_origin = f"host: {host}\ndate: {date}\n{request_line}"
|
||||
signature_sha = hmac.new(
|
||||
api_secret.encode("utf-8"),
|
||||
signature_origin.encode("utf-8"),
|
||||
digestmod=hashlib.sha256,
|
||||
).digest()
|
||||
signature = base64.b64encode(signature_sha).decode("utf-8")
|
||||
authorization_origin = (
|
||||
f'api_key="{api_key}", algorithm="hmac-sha256", '
|
||||
f'headers="host date request-line", signature="{signature}"'
|
||||
)
|
||||
authorization = base64.b64encode(authorization_origin.encode("utf-8")).decode("utf-8")
|
||||
query = urlencode({"authorization": authorization, "date": date, "host": host})
|
||||
return f"{url}?{query}"
|
||||
106
static/voice-demo/README.md
Normal file
106
static/voice-demo/README.md
Normal file
@@ -0,0 +1,106 @@
|
||||
# Webpage Example — Realtime Voice Chat
|
||||
|
||||
A self-contained browser client for the engine's product websocket
|
||||
(`/ws-product`, protocol `va.ws.v1`).
|
||||
|
||||
## Features
|
||||
|
||||
- **Connect / Disconnect** to any `ws://` or `wss://` URL.
|
||||
- **Microphone selector + mic on/off toggle** — available input devices
|
||||
are listed with `enumerateDevices`, and getUserMedia is requested with
|
||||
`echoCancellation`, `noiseSuppression`, and `autoGainControl` so the
|
||||
browser handles AEC against the bot's voice.
|
||||
- **Text composer** — type a message and press <kbd>Enter</kbd> to send
|
||||
an `input.text` event (Shift+Enter for newline). Sending interrupts
|
||||
any in-flight bot audio so the next reply is heard cleanly.
|
||||
- **Chat history** rendered from `input.transcript.final` (you, when
|
||||
spoken), streamed `response.text.delta` / `response.text.final`
|
||||
(assistant — deltas arrive ahead of the synthesized audio), and locally
|
||||
for text you submit (the engine doesn't echo text input back as a
|
||||
transcript).
|
||||
- **WebSocket log** panel for connection state and compact send/receive
|
||||
events. Audio chunks are summarized so the UI does not flood.
|
||||
- **Gapless TTS playback** by scheduling each `response.audio.delta`
|
||||
chunk back-to-back on the AudioContext.
|
||||
- **Live VU meter** + mic and bot activity indicators.
|
||||
- **Clear** button to reset history.
|
||||
|
||||
No build step, no dependencies — just three files plus an AudioWorklet.
|
||||
|
||||
## Layout
|
||||
|
||||
```text
|
||||
examples/webpage/
|
||||
├── index.html
|
||||
├── styles.css
|
||||
├── app.js
|
||||
└── pcm-recorder.worklet.js
|
||||
```
|
||||
|
||||
## Run
|
||||
|
||||
1. Start the engine (default port `8000`):
|
||||
|
||||
```bash
|
||||
cd AI-VideoAssistant-engine-v5-pipecat-minimal
|
||||
source .venv/bin/activate
|
||||
export OPENAI_API_KEY=...
|
||||
uvicorn engine.main:app --host 127.0.0.1 --port 8000
|
||||
```
|
||||
|
||||
2. Open the demo page served by the same process:
|
||||
|
||||
```text
|
||||
http://127.0.0.1:8000/demo/
|
||||
```
|
||||
|
||||
The default websocket URL is derived from the page host
|
||||
(`ws://127.0.0.1:8000/ws-product`). Click **Connect**, pick a
|
||||
microphone if needed, click **Enable mic**, and start speaking.
|
||||
|
||||
Mount path and on/off are controlled in `config.json`:
|
||||
|
||||
```json
|
||||
"server": {
|
||||
"serve_webpage": true,
|
||||
"webpage_mount": "/demo"
|
||||
}
|
||||
```
|
||||
|
||||
Set `"serve_webpage": false` in production if you serve the UI elsewhere.
|
||||
|
||||
### Standalone static server (optional)
|
||||
|
||||
You can still serve the files from another port for UI-only iteration.
|
||||
Add that origin to `server.cors_origins` in `config.json` if needed:
|
||||
|
||||
```bash
|
||||
cd AI-VideoAssistant-engine-v5-pipecat-minimal/examples/webpage
|
||||
python -m http.server 8080
|
||||
```
|
||||
|
||||
Then open <http://localhost:8080> and point the URL field at
|
||||
`ws://127.0.0.1:8000/ws-product`.
|
||||
|
||||
> The browser's mic API requires a secure context. `http://localhost`
|
||||
> qualifies; if you serve from another host, use HTTPS and a `wss://`
|
||||
> URL.
|
||||
|
||||
## Audio details
|
||||
|
||||
- Input: mono Float32 from `getUserMedia` is resampled in the
|
||||
AudioWorklet to PCM16 mono @ 16 kHz, framed into 20 ms chunks, and
|
||||
sent as **binary** websocket messages (the server accepts either
|
||||
binary or the JSON+base64 form).
|
||||
- Output: each `response.audio.delta` carries base64-encoded PCM16 @
|
||||
16 kHz; chunks are decoded and scheduled back-to-back through Web
|
||||
Audio. The browser handles resampling to the device rate.
|
||||
|
||||
## Notes
|
||||
|
||||
- Use headphones if you still hear echo despite browser AEC; the bot's
|
||||
voice leaking back into the open mic is the most common cause of
|
||||
feedback loops.
|
||||
- The engine's session has an inactivity timeout
|
||||
(`session.inactivity_timeout_sec` in `config.json`). If the bot
|
||||
doesn't respond after a long silence, reconnect.
|
||||
899
static/voice-demo/app.js
Normal file
899
static/voice-demo/app.js
Normal file
@@ -0,0 +1,899 @@
|
||||
/**
|
||||
* Minimal browser client for the AI VideoAssistant engine's product
|
||||
* websocket (`/ws-product`, protocol `va.ws.v1`).
|
||||
*
|
||||
* Responsibilities:
|
||||
* - Open/close the websocket and run the session handshake.
|
||||
* - List/select microphones and capture mic audio with browser AEC enabled.
|
||||
* - Downsample to PCM16 mono @ 16 kHz in an AudioWorklet and stream frames
|
||||
* as binary websocket messages.
|
||||
* - Play `response.audio.delta` frames gaplessly through Web Audio.
|
||||
* - Render a chat-style history of user transcripts and bot text deltas.
|
||||
*/
|
||||
|
||||
const SAMPLE_RATE = 16000;
|
||||
const CHANNELS = 1;
|
||||
const FRAME_MS = 20;
|
||||
const PROTOCOL = "va.ws.v1";
|
||||
const MAX_WS_LOG_LINES = 120;
|
||||
const AUDIO_DELTA_LOG_INTERVAL_MS = 1000;
|
||||
|
||||
function defaultWsUrl() {
|
||||
const scheme = location.protocol === "https:" ? "wss:" : "ws:";
|
||||
return `${scheme}//${location.host}/ws-product`;
|
||||
}
|
||||
|
||||
const els = {
|
||||
url: document.getElementById("ws-url"),
|
||||
connectBtn: document.getElementById("connect-btn"),
|
||||
statusDot: document.getElementById("status-dot"),
|
||||
statusText: document.getElementById("status-text"),
|
||||
chatLog: document.getElementById("chat-log"),
|
||||
micBtn: document.getElementById("mic-btn"),
|
||||
micSelect: document.getElementById("mic-select"),
|
||||
micLabel: document.querySelector(".mic-btn__label"),
|
||||
micIndicator: document.getElementById("mic-indicator"),
|
||||
botIndicator: document.getElementById("bot-indicator"),
|
||||
clearBtn: document.getElementById("clear-btn"),
|
||||
clearWsLogBtn: document.getElementById("clear-ws-log-btn"),
|
||||
wsLog: document.getElementById("ws-log"),
|
||||
meterFill: document.getElementById("meter-fill"),
|
||||
composer: document.getElementById("composer"),
|
||||
textInput: document.getElementById("text-input"),
|
||||
sendBtn: document.getElementById("send-btn"),
|
||||
};
|
||||
|
||||
const state = {
|
||||
ws: null,
|
||||
connected: false,
|
||||
connecting: false,
|
||||
|
||||
audioContext: null,
|
||||
micStream: null,
|
||||
micSourceNode: null,
|
||||
recorderNode: null,
|
||||
|
||||
micEnabled: false,
|
||||
micDevices: [],
|
||||
selectedMicDeviceId: "",
|
||||
|
||||
// Output scheduling.
|
||||
nextPlaybackTime: 0,
|
||||
playbackEndsAt: 0,
|
||||
scheduledSources: [],
|
||||
botActive: false,
|
||||
botUiTimer: null,
|
||||
|
||||
// Chat state.
|
||||
currentAssistantBubble: null,
|
||||
|
||||
// VU meter smoothing.
|
||||
meterLevel: 0,
|
||||
|
||||
// Compact websocket logging.
|
||||
audioDeltaLogCount: 0,
|
||||
audioDeltaLogBytes: 0,
|
||||
lastAudioDeltaLogAt: 0,
|
||||
audioSendLogCount: 0,
|
||||
audioSendLogBytes: 0,
|
||||
lastAudioSendLogAt: 0,
|
||||
};
|
||||
|
||||
/* ------------------------------------------------------------------ UI */
|
||||
|
||||
function setStatus(kind, text) {
|
||||
els.statusDot.className = `status__dot status__dot--${kind}`;
|
||||
els.statusText.textContent = text;
|
||||
}
|
||||
|
||||
function setConnectButton() {
|
||||
if (state.connecting) {
|
||||
els.connectBtn.textContent = "Connecting…";
|
||||
els.connectBtn.disabled = true;
|
||||
els.connectBtn.classList.remove("is-disconnect");
|
||||
} else if (state.connected) {
|
||||
els.connectBtn.textContent = "Disconnect";
|
||||
els.connectBtn.disabled = false;
|
||||
els.connectBtn.classList.add("is-disconnect");
|
||||
} else {
|
||||
els.connectBtn.textContent = "Connect";
|
||||
els.connectBtn.disabled = false;
|
||||
els.connectBtn.classList.remove("is-disconnect");
|
||||
}
|
||||
}
|
||||
|
||||
function setMicButton() {
|
||||
els.micBtn.disabled = !state.connected;
|
||||
els.micBtn.setAttribute("aria-pressed", state.micEnabled ? "true" : "false");
|
||||
els.micBtn.title = state.micEnabled ? "Mute mic" : "Unmute mic";
|
||||
els.micLabel.textContent = state.micEnabled ? "Mute mic" : "Enable mic";
|
||||
els.micIndicator.classList.toggle("is-active", state.micEnabled);
|
||||
}
|
||||
|
||||
function setMicSelectEnabled() {
|
||||
els.micSelect.disabled = !state.connected || !navigator.mediaDevices;
|
||||
}
|
||||
|
||||
function setComposerEnabled(enabled) {
|
||||
els.textInput.disabled = !enabled;
|
||||
els.sendBtn.disabled = !enabled || els.textInput.value.trim().length === 0;
|
||||
}
|
||||
|
||||
function setBotIndicator(active) {
|
||||
els.botIndicator.classList.toggle("is-active", active);
|
||||
}
|
||||
|
||||
function addBubble(role, text) {
|
||||
if (els.chatLog.querySelector(".chat__empty")) {
|
||||
els.chatLog.innerHTML = "";
|
||||
}
|
||||
const bubble = document.createElement("div");
|
||||
bubble.className = `bubble bubble--${role}`;
|
||||
if (role !== "system") {
|
||||
const tag = document.createElement("span");
|
||||
tag.className = "bubble__role";
|
||||
tag.textContent = role === "user" ? "You" : "Assistant";
|
||||
bubble.appendChild(tag);
|
||||
}
|
||||
const body = document.createElement("span");
|
||||
body.className = "bubble__text";
|
||||
body.textContent = text;
|
||||
bubble.appendChild(body);
|
||||
els.chatLog.appendChild(bubble);
|
||||
scrollChatToBottom();
|
||||
return bubble;
|
||||
}
|
||||
|
||||
function appendToBubble(bubble, text) {
|
||||
const body = bubble.querySelector(".bubble__text");
|
||||
body.textContent += text;
|
||||
scrollChatToBottom();
|
||||
}
|
||||
|
||||
function scrollChatToBottom() {
|
||||
els.chatLog.scrollTop = els.chatLog.scrollHeight;
|
||||
}
|
||||
|
||||
function clearChat() {
|
||||
els.chatLog.innerHTML = "";
|
||||
state.currentAssistantBubble = null;
|
||||
const empty = document.createElement("div");
|
||||
empty.className = "chat__empty";
|
||||
empty.innerHTML = "<p>Chat cleared.</p>";
|
||||
els.chatLog.appendChild(empty);
|
||||
}
|
||||
|
||||
function truncateLogValue(value, maxLength = 160) {
|
||||
const text = String(value);
|
||||
if (text.length <= maxLength) return text;
|
||||
return `${text.slice(0, maxLength - 1)}…`;
|
||||
}
|
||||
|
||||
function compactWsPayload(payload) {
|
||||
if (!payload || typeof payload !== "object") return String(payload);
|
||||
const compact = { ...payload };
|
||||
|
||||
if (typeof compact.audio === "string") {
|
||||
compact.audio = `<base64 ${compact.audio.length} chars>`;
|
||||
}
|
||||
if (typeof compact.data === "string" && compact.data.length > 160) {
|
||||
compact.data = `<string ${compact.data.length} chars>`;
|
||||
}
|
||||
if (typeof compact.text === "string") {
|
||||
compact.text = truncateLogValue(compact.text);
|
||||
}
|
||||
|
||||
try {
|
||||
return JSON.stringify(compact);
|
||||
} catch (_) {
|
||||
return payload.type || "unserializable websocket payload";
|
||||
}
|
||||
}
|
||||
|
||||
function addWsLog(direction, detail, kind = direction) {
|
||||
if (els.wsLog.querySelector(".ws-log__empty")) {
|
||||
els.wsLog.innerHTML = "";
|
||||
}
|
||||
|
||||
const entry = document.createElement("div");
|
||||
entry.className = `ws-log__entry ws-log__entry--${kind}`;
|
||||
|
||||
const time = document.createElement("span");
|
||||
time.className = "ws-log__time";
|
||||
time.textContent = new Date().toLocaleTimeString([], {
|
||||
hour12: false,
|
||||
hour: "2-digit",
|
||||
minute: "2-digit",
|
||||
second: "2-digit",
|
||||
});
|
||||
|
||||
const dir = document.createElement("span");
|
||||
dir.className = "ws-log__direction";
|
||||
dir.textContent =
|
||||
direction === "send"
|
||||
? "SEND"
|
||||
: direction === "recv"
|
||||
? "RECV"
|
||||
: direction.toUpperCase();
|
||||
|
||||
const body = document.createElement("span");
|
||||
body.className = "ws-log__detail";
|
||||
body.textContent = detail;
|
||||
|
||||
entry.append(time, dir, body);
|
||||
els.wsLog.appendChild(entry);
|
||||
|
||||
while (els.wsLog.children.length > MAX_WS_LOG_LINES) {
|
||||
els.wsLog.firstElementChild.remove();
|
||||
}
|
||||
els.wsLog.scrollTop = els.wsLog.scrollHeight;
|
||||
}
|
||||
|
||||
function flushAudioDeltaLog() {
|
||||
if (state.audioDeltaLogCount === 0) return;
|
||||
addWsLog(
|
||||
"recv",
|
||||
`response.audio.delta x${state.audioDeltaLogCount} (${state.audioDeltaLogBytes} bytes)`,
|
||||
);
|
||||
state.audioDeltaLogCount = 0;
|
||||
state.audioDeltaLogBytes = 0;
|
||||
state.lastAudioDeltaLogAt = performance.now();
|
||||
}
|
||||
|
||||
function flushAudioSendLog() {
|
||||
if (state.audioSendLogCount === 0) return;
|
||||
addWsLog(
|
||||
"send",
|
||||
`input.audio binary x${state.audioSendLogCount} (${state.audioSendLogBytes} bytes)`,
|
||||
);
|
||||
state.audioSendLogCount = 0;
|
||||
state.audioSendLogBytes = 0;
|
||||
state.lastAudioSendLogAt = performance.now();
|
||||
}
|
||||
|
||||
function flushPendingWsLogs() {
|
||||
flushAudioDeltaLog();
|
||||
flushAudioSendLog();
|
||||
}
|
||||
|
||||
function logWsPayload(direction, payload) {
|
||||
if (direction === "send") {
|
||||
flushAudioSendLog();
|
||||
} else {
|
||||
flushAudioDeltaLog();
|
||||
}
|
||||
|
||||
if (direction === "recv" && payload?.type === "response.audio.delta") {
|
||||
state.audioDeltaLogCount += 1;
|
||||
state.audioDeltaLogBytes += payload.bytes || payload.audio?.length || 0;
|
||||
const now = performance.now();
|
||||
if (now - state.lastAudioDeltaLogAt >= AUDIO_DELTA_LOG_INTERVAL_MS) {
|
||||
flushAudioDeltaLog();
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
addWsLog(direction, compactWsPayload(payload));
|
||||
}
|
||||
|
||||
function logBinarySend(byteLength) {
|
||||
state.audioSendLogCount += 1;
|
||||
state.audioSendLogBytes += byteLength;
|
||||
const now = performance.now();
|
||||
if (now - state.lastAudioSendLogAt >= AUDIO_DELTA_LOG_INTERVAL_MS) {
|
||||
flushAudioSendLog();
|
||||
}
|
||||
}
|
||||
|
||||
function wsSend(data) {
|
||||
if (!state.ws || state.ws.readyState !== WebSocket.OPEN) return false;
|
||||
|
||||
if (typeof data === "string") {
|
||||
try {
|
||||
logWsPayload("send", JSON.parse(data));
|
||||
} catch (_) {
|
||||
flushAudioSendLog();
|
||||
flushAudioDeltaLog();
|
||||
addWsLog("send", truncateLogValue(data));
|
||||
}
|
||||
} else {
|
||||
const byteLength =
|
||||
data instanceof ArrayBuffer
|
||||
? data.byteLength
|
||||
: ArrayBuffer.isView(data)
|
||||
? data.byteLength
|
||||
: 0;
|
||||
if (byteLength > 0) {
|
||||
logBinarySend(byteLength);
|
||||
}
|
||||
}
|
||||
|
||||
state.ws.send(data);
|
||||
return true;
|
||||
}
|
||||
|
||||
function clearWsLog() {
|
||||
state.audioDeltaLogCount = 0;
|
||||
state.audioDeltaLogBytes = 0;
|
||||
state.audioSendLogCount = 0;
|
||||
state.audioSendLogBytes = 0;
|
||||
els.wsLog.innerHTML =
|
||||
'<div class="ws-log__empty">No websocket events yet.</div>';
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------- Audio */
|
||||
|
||||
async function ensureAudioContext() {
|
||||
if (!state.audioContext) {
|
||||
const Ctx = window.AudioContext || window.webkitAudioContext;
|
||||
state.audioContext = new Ctx();
|
||||
await state.audioContext.audioWorklet.addModule("./pcm-recorder.worklet.js");
|
||||
}
|
||||
if (state.audioContext.state === "suspended") {
|
||||
await state.audioContext.resume();
|
||||
}
|
||||
return state.audioContext;
|
||||
}
|
||||
|
||||
function renderMicDevices() {
|
||||
const previousValue = state.selectedMicDeviceId || els.micSelect.value;
|
||||
els.micSelect.innerHTML = "";
|
||||
|
||||
const defaultOption = document.createElement("option");
|
||||
defaultOption.value = "";
|
||||
defaultOption.textContent = "Default microphone";
|
||||
els.micSelect.appendChild(defaultOption);
|
||||
|
||||
state.micDevices.forEach((device, index) => {
|
||||
const option = document.createElement("option");
|
||||
option.value = device.deviceId;
|
||||
option.textContent = device.label || `Microphone ${index + 1}`;
|
||||
els.micSelect.appendChild(option);
|
||||
});
|
||||
|
||||
const hasPrevious = state.micDevices.some(
|
||||
(device) => device.deviceId === previousValue,
|
||||
);
|
||||
state.selectedMicDeviceId = hasPrevious ? previousValue : "";
|
||||
els.micSelect.value = state.selectedMicDeviceId;
|
||||
setMicSelectEnabled();
|
||||
}
|
||||
|
||||
async function refreshMicDevices() {
|
||||
if (!navigator.mediaDevices?.enumerateDevices) {
|
||||
setMicSelectEnabled();
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
const devices = await navigator.mediaDevices.enumerateDevices();
|
||||
state.micDevices = devices.filter((device) => device.kind === "audioinput");
|
||||
renderMicDevices();
|
||||
} catch (err) {
|
||||
console.warn("Could not enumerate microphones", err);
|
||||
setMicSelectEnabled();
|
||||
}
|
||||
}
|
||||
|
||||
async function startMic() {
|
||||
const ctx = await ensureAudioContext();
|
||||
const audioConstraints = {
|
||||
echoCancellation: true,
|
||||
noiseSuppression: true,
|
||||
autoGainControl: true,
|
||||
channelCount: 1,
|
||||
};
|
||||
if (state.selectedMicDeviceId) {
|
||||
audioConstraints.deviceId = { exact: state.selectedMicDeviceId };
|
||||
}
|
||||
|
||||
state.micStream = await navigator.mediaDevices.getUserMedia({
|
||||
audio: audioConstraints,
|
||||
video: false,
|
||||
});
|
||||
await refreshMicDevices();
|
||||
|
||||
state.micSourceNode = ctx.createMediaStreamSource(state.micStream);
|
||||
state.recorderNode = new AudioWorkletNode(ctx, "pcm-recorder", {
|
||||
numberOfInputs: 1,
|
||||
numberOfOutputs: 0,
|
||||
channelCount: 1,
|
||||
processorOptions: {
|
||||
targetSampleRate: SAMPLE_RATE,
|
||||
frameMs: FRAME_MS,
|
||||
},
|
||||
});
|
||||
state.recorderNode.port.onmessage = (event) => {
|
||||
const data = event.data;
|
||||
if (!data || data.type !== "frame") return;
|
||||
updateMeter(data.rms || 0);
|
||||
if (state.connected) {
|
||||
wsSend(data.buffer);
|
||||
}
|
||||
};
|
||||
|
||||
state.micSourceNode.connect(state.recorderNode);
|
||||
state.micEnabled = true;
|
||||
addWsLog("system", "mic capture started (binary input.audio frames)");
|
||||
setMicButton();
|
||||
}
|
||||
|
||||
function stopMic() {
|
||||
const wasEnabled = state.micEnabled;
|
||||
if (state.recorderNode) {
|
||||
try {
|
||||
state.recorderNode.port.onmessage = null;
|
||||
state.recorderNode.disconnect();
|
||||
} catch (_) {
|
||||
/* ignore */
|
||||
}
|
||||
state.recorderNode = null;
|
||||
}
|
||||
if (state.micSourceNode) {
|
||||
try {
|
||||
state.micSourceNode.disconnect();
|
||||
} catch (_) {
|
||||
/* ignore */
|
||||
}
|
||||
state.micSourceNode = null;
|
||||
}
|
||||
if (state.micStream) {
|
||||
for (const track of state.micStream.getTracks()) {
|
||||
try {
|
||||
track.stop();
|
||||
} catch (_) {
|
||||
/* ignore */
|
||||
}
|
||||
}
|
||||
state.micStream = null;
|
||||
}
|
||||
state.micEnabled = false;
|
||||
updateMeter(0);
|
||||
if (wasEnabled) {
|
||||
flushAudioSendLog();
|
||||
addWsLog("system", "mic capture stopped");
|
||||
}
|
||||
setMicButton();
|
||||
}
|
||||
|
||||
function updateMeter(rms) {
|
||||
// Smooth and convert to a 0..100 width. RMS ~0.3+ is loud speech.
|
||||
const target = Math.min(1, rms * 2.4);
|
||||
state.meterLevel = state.meterLevel * 0.5 + target * 0.5;
|
||||
els.meterFill.style.width = `${Math.round(state.meterLevel * 100)}%`;
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------- Bot audio playback */
|
||||
|
||||
function schedulePlayback(int16) {
|
||||
const ctx = state.audioContext;
|
||||
if (!ctx) return;
|
||||
|
||||
const float32 = new Float32Array(int16.length);
|
||||
for (let i = 0; i < int16.length; i++) {
|
||||
float32[i] = int16[i] / (int16[i] < 0 ? 0x8000 : 0x7fff);
|
||||
}
|
||||
const buffer = ctx.createBuffer(CHANNELS, float32.length, SAMPLE_RATE);
|
||||
buffer.copyToChannel(float32, 0);
|
||||
|
||||
const src = ctx.createBufferSource();
|
||||
src.buffer = buffer;
|
||||
src.connect(ctx.destination);
|
||||
|
||||
const now = ctx.currentTime;
|
||||
// Schedule immediately after the previously scheduled chunk to keep
|
||||
// playback contiguous, with a tiny safety margin if we fell behind.
|
||||
const startAt = Math.max(now + 0.02, state.nextPlaybackTime);
|
||||
src.start(startAt);
|
||||
state.nextPlaybackTime = startAt + buffer.duration;
|
||||
state.playbackEndsAt = state.nextPlaybackTime;
|
||||
|
||||
src.onended = () => {
|
||||
const idx = state.scheduledSources.indexOf(src);
|
||||
if (idx >= 0) state.scheduledSources.splice(idx, 1);
|
||||
};
|
||||
state.scheduledSources.push(src);
|
||||
|
||||
setBotIndicator(true);
|
||||
if (state.botUiTimer) clearTimeout(state.botUiTimer);
|
||||
const msUntilEnd = Math.max(0, (state.playbackEndsAt - now) * 1000) + 120;
|
||||
state.botUiTimer = setTimeout(() => {
|
||||
if (state.audioContext &&
|
||||
state.audioContext.currentTime >= state.playbackEndsAt - 0.01) {
|
||||
setBotIndicator(false);
|
||||
}
|
||||
}, msUntilEnd);
|
||||
}
|
||||
|
||||
function stopPlaybackQueue() {
|
||||
for (const src of state.scheduledSources) {
|
||||
try {
|
||||
src.onended = null;
|
||||
src.stop();
|
||||
src.disconnect();
|
||||
} catch (_) {
|
||||
/* already stopped */
|
||||
}
|
||||
}
|
||||
state.scheduledSources = [];
|
||||
resetPlaybackClock();
|
||||
if (state.botUiTimer) {
|
||||
clearTimeout(state.botUiTimer);
|
||||
state.botUiTimer = null;
|
||||
}
|
||||
setBotIndicator(false);
|
||||
}
|
||||
|
||||
function resetPlaybackClock() {
|
||||
if (state.audioContext) {
|
||||
state.nextPlaybackTime = state.audioContext.currentTime;
|
||||
state.playbackEndsAt = state.audioContext.currentTime;
|
||||
}
|
||||
}
|
||||
|
||||
/* --------------------------------------------------------- Chat updates */
|
||||
|
||||
function handleUserTranscript(text) {
|
||||
if (!text) return;
|
||||
state.currentAssistantBubble = null;
|
||||
addBubble("user", text);
|
||||
}
|
||||
|
||||
function sendText(text) {
|
||||
const value = (text || "").trim();
|
||||
if (!value) return false;
|
||||
if (!state.ws || state.ws.readyState !== WebSocket.OPEN) return false;
|
||||
const message = {
|
||||
type: "input.text",
|
||||
text: value,
|
||||
interrupt: true,
|
||||
};
|
||||
|
||||
// The engine does not echo text input back as a transcript event, so we
|
||||
// render the user bubble locally. Also interrupt any in-flight bot audio
|
||||
// so the next reply is heard cleanly. We deliberately do NOT clear
|
||||
// `currentAssistantBubble` here — the engine will emit a
|
||||
// `response.text.final(interrupted=true)` for the in-flight assistant
|
||||
// turn, which finalizes that bubble in place. A brand-new bubble for the
|
||||
// reply will be created when `response.text.started` arrives.
|
||||
wsSend(JSON.stringify(message));
|
||||
stopPlaybackQueue();
|
||||
addBubble("user", value);
|
||||
return true;
|
||||
}
|
||||
|
||||
function handleAssistantDelta(text) {
|
||||
if (!text) return;
|
||||
if (!state.currentAssistantBubble) {
|
||||
state.currentAssistantBubble = addBubble("assistant", "");
|
||||
}
|
||||
appendToBubble(state.currentAssistantBubble, text);
|
||||
}
|
||||
|
||||
function handleAssistantStarted() {
|
||||
state.currentAssistantBubble = null;
|
||||
}
|
||||
|
||||
function handleAssistantFinal(text, interrupted) {
|
||||
if (!text) {
|
||||
state.currentAssistantBubble = null;
|
||||
return;
|
||||
}
|
||||
if (state.currentAssistantBubble) {
|
||||
const body = state.currentAssistantBubble.querySelector(".bubble__text");
|
||||
body.textContent = text;
|
||||
} else {
|
||||
state.currentAssistantBubble = addBubble("assistant", text);
|
||||
}
|
||||
if (interrupted) {
|
||||
state.currentAssistantBubble.classList.add("bubble--interrupted");
|
||||
}
|
||||
state.currentAssistantBubble = null;
|
||||
scrollChatToBottom();
|
||||
}
|
||||
|
||||
function finalizeAssistantBubble() {
|
||||
state.currentAssistantBubble = null;
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------- Websocket IO */
|
||||
|
||||
function decodeBase64ToInt16(b64) {
|
||||
const binary = atob(b64);
|
||||
const len = binary.length;
|
||||
const bytes = new Uint8Array(len);
|
||||
for (let i = 0; i < len; i++) bytes[i] = binary.charCodeAt(i);
|
||||
return new Int16Array(bytes.buffer, bytes.byteOffset, bytes.byteLength / 2);
|
||||
}
|
||||
|
||||
function handleEvent(event) {
|
||||
switch (event.type) {
|
||||
case "response.audio.delta":
|
||||
if (typeof event.audio === "string") {
|
||||
schedulePlayback(decodeBase64ToInt16(event.audio));
|
||||
}
|
||||
break;
|
||||
case "response.audio.started":
|
||||
setBotIndicator(true);
|
||||
break;
|
||||
case "response.audio.stopped":
|
||||
finalizeAssistantBubble();
|
||||
// The indicator turns off automatically when the playback queue drains.
|
||||
break;
|
||||
case "response.text.delta":
|
||||
handleAssistantDelta(event.text);
|
||||
break;
|
||||
case "response.text.started":
|
||||
handleAssistantStarted();
|
||||
break;
|
||||
case "response.text.final":
|
||||
handleAssistantFinal(event.text, event.interrupted);
|
||||
break;
|
||||
case "input.transcript.final":
|
||||
handleUserTranscript(event.text);
|
||||
break;
|
||||
case "input.transcript.interim":
|
||||
// Ignore partial ASR updates; chat history renders committed user turns.
|
||||
break;
|
||||
case "transport.message":
|
||||
// Reserved for future structured messages; ignore silently.
|
||||
break;
|
||||
default:
|
||||
// Unknown event type: log for debugging.
|
||||
console.debug("ws event", event);
|
||||
}
|
||||
}
|
||||
|
||||
async function connect() {
|
||||
if (state.connected || state.connecting) return;
|
||||
const url = (els.url.value || "").trim();
|
||||
if (!url) {
|
||||
setStatus("error", "Missing URL");
|
||||
return;
|
||||
}
|
||||
|
||||
state.connecting = true;
|
||||
setStatus("connecting", "Connecting…");
|
||||
setConnectButton();
|
||||
addWsLog("system", `connecting ${url}`);
|
||||
|
||||
try {
|
||||
// Pre-warm audio context on user gesture so playback works on Safari.
|
||||
await ensureAudioContext();
|
||||
} catch (err) {
|
||||
console.error("AudioContext failed", err);
|
||||
state.connecting = false;
|
||||
setStatus("error", "Audio init failed");
|
||||
setConnectButton();
|
||||
addWsLog("error", `audio init failed: ${err.message || err}`, "error");
|
||||
return;
|
||||
}
|
||||
|
||||
let ws;
|
||||
try {
|
||||
ws = new WebSocket(url);
|
||||
} catch (err) {
|
||||
console.error("WebSocket constructor failed", err);
|
||||
state.connecting = false;
|
||||
setStatus("error", "Bad URL");
|
||||
setConnectButton();
|
||||
addWsLog("error", `bad websocket URL: ${err.message || err}`, "error");
|
||||
return;
|
||||
}
|
||||
ws.binaryType = "arraybuffer";
|
||||
state.ws = ws;
|
||||
|
||||
ws.addEventListener("open", () => {
|
||||
const startMessage = {
|
||||
type: "session.start",
|
||||
protocol: PROTOCOL,
|
||||
audio: {
|
||||
encoding: "pcm_s16le",
|
||||
sample_rate: SAMPLE_RATE,
|
||||
channels: CHANNELS,
|
||||
},
|
||||
};
|
||||
|
||||
state.connecting = false;
|
||||
state.connected = true;
|
||||
resetPlaybackClock();
|
||||
addWsLog("system", "websocket open");
|
||||
setStatus("connected", "Connected");
|
||||
setConnectButton();
|
||||
setMicButton();
|
||||
setMicSelectEnabled();
|
||||
refreshMicDevices();
|
||||
|
||||
wsSend(JSON.stringify(startMessage));
|
||||
addBubble("system", "Session started.");
|
||||
setComposerEnabled(true);
|
||||
els.textInput.focus();
|
||||
});
|
||||
|
||||
ws.addEventListener("message", (event) => {
|
||||
const data = event.data;
|
||||
if (typeof data === "string") {
|
||||
let parsed;
|
||||
try {
|
||||
parsed = JSON.parse(data);
|
||||
} catch (err) {
|
||||
console.warn("Bad JSON from server", err, data);
|
||||
addWsLog(
|
||||
"error",
|
||||
`invalid JSON from server: ${truncateLogValue(data)}`,
|
||||
"error",
|
||||
);
|
||||
return;
|
||||
}
|
||||
logWsPayload("recv", parsed);
|
||||
handleEvent(parsed);
|
||||
} else if (data instanceof ArrayBuffer) {
|
||||
// Server doesn't currently send binary, but handle it just in case.
|
||||
addWsLog("recv", `binary audio ${data.byteLength} bytes`);
|
||||
schedulePlayback(new Int16Array(data));
|
||||
}
|
||||
});
|
||||
|
||||
ws.addEventListener("error", (err) => {
|
||||
console.error("WebSocket error", err);
|
||||
setStatus("error", "Connection error");
|
||||
addWsLog("error", "websocket error", "error");
|
||||
});
|
||||
|
||||
ws.addEventListener("close", (event) => {
|
||||
const wasConnected = state.connected;
|
||||
state.ws = null;
|
||||
state.connected = false;
|
||||
state.connecting = false;
|
||||
if (state.micEnabled) stopMic();
|
||||
stopPlaybackQueue();
|
||||
setConnectButton();
|
||||
setMicButton();
|
||||
setMicSelectEnabled();
|
||||
setComposerEnabled(false);
|
||||
setBotIndicator(false);
|
||||
flushPendingWsLogs();
|
||||
addWsLog(
|
||||
"system",
|
||||
`websocket close code=${event.code}${
|
||||
event.reason ? ` reason=${event.reason}` : ""
|
||||
}`,
|
||||
);
|
||||
if (wasConnected) {
|
||||
addBubble(
|
||||
"system",
|
||||
`Session ended${event.reason ? ` — ${event.reason}` : ""}.`,
|
||||
);
|
||||
setStatus("idle", "Disconnected");
|
||||
} else {
|
||||
setStatus("error", "Connection closed");
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
function disconnect() {
|
||||
if (!state.ws) return;
|
||||
try {
|
||||
if (state.ws.readyState === WebSocket.OPEN) {
|
||||
const stopMessage = { type: "session.stop", reason: "client_disconnect" };
|
||||
wsSend(JSON.stringify(stopMessage));
|
||||
}
|
||||
} catch (_) {
|
||||
/* ignore */
|
||||
}
|
||||
try {
|
||||
state.ws.close(1000, "client_disconnect");
|
||||
} catch (_) {
|
||||
/* ignore */
|
||||
}
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------- Wiring */
|
||||
|
||||
els.connectBtn.addEventListener("click", () => {
|
||||
if (state.connected) disconnect();
|
||||
else connect();
|
||||
});
|
||||
|
||||
els.micBtn.addEventListener("click", async () => {
|
||||
if (!state.connected) return;
|
||||
els.micBtn.disabled = true;
|
||||
try {
|
||||
if (state.micEnabled) {
|
||||
stopMic();
|
||||
} else {
|
||||
await startMic();
|
||||
}
|
||||
} catch (err) {
|
||||
console.error("Mic error", err);
|
||||
addBubble("system", `Mic error: ${err.message || err}`);
|
||||
} finally {
|
||||
els.micBtn.disabled = !state.connected;
|
||||
}
|
||||
});
|
||||
|
||||
els.micSelect.addEventListener("change", async () => {
|
||||
state.selectedMicDeviceId = els.micSelect.value;
|
||||
if (!state.micEnabled) return;
|
||||
|
||||
els.micSelect.disabled = true;
|
||||
els.micBtn.disabled = true;
|
||||
try {
|
||||
stopMic();
|
||||
await startMic();
|
||||
} catch (err) {
|
||||
console.error("Mic switch error", err);
|
||||
addBubble("system", `Mic switch error: ${err.message || err}`);
|
||||
} finally {
|
||||
setMicButton();
|
||||
setMicSelectEnabled();
|
||||
}
|
||||
});
|
||||
|
||||
if (navigator.mediaDevices?.addEventListener) {
|
||||
navigator.mediaDevices.addEventListener("devicechange", refreshMicDevices);
|
||||
}
|
||||
|
||||
els.clearBtn.addEventListener("click", () => {
|
||||
clearChat();
|
||||
});
|
||||
|
||||
els.clearWsLogBtn.addEventListener("click", () => {
|
||||
clearWsLog();
|
||||
});
|
||||
|
||||
function autosizeTextarea() {
|
||||
const ta = els.textInput;
|
||||
ta.style.height = "auto";
|
||||
ta.style.height = `${Math.min(ta.scrollHeight, 180)}px`;
|
||||
}
|
||||
|
||||
function submitText() {
|
||||
const value = els.textInput.value;
|
||||
if (!sendText(value)) return;
|
||||
els.textInput.value = "";
|
||||
autosizeTextarea();
|
||||
setComposerEnabled(state.connected);
|
||||
}
|
||||
|
||||
els.composer.addEventListener("submit", (event) => {
|
||||
event.preventDefault();
|
||||
submitText();
|
||||
});
|
||||
|
||||
els.textInput.addEventListener("input", () => {
|
||||
autosizeTextarea();
|
||||
setComposerEnabled(state.connected);
|
||||
});
|
||||
|
||||
els.textInput.addEventListener("keydown", (event) => {
|
||||
if (event.key === "Enter" && !event.shiftKey && !event.isComposing) {
|
||||
event.preventDefault();
|
||||
submitText();
|
||||
}
|
||||
});
|
||||
|
||||
window.addEventListener("beforeunload", () => {
|
||||
if (state.ws) {
|
||||
try {
|
||||
state.ws.close();
|
||||
} catch (_) {
|
||||
/* ignore */
|
||||
}
|
||||
}
|
||||
if (state.audioContext) {
|
||||
try {
|
||||
state.audioContext.close();
|
||||
} catch (_) {
|
||||
/* ignore */
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
els.url.value = defaultWsUrl();
|
||||
|
||||
setStatus("idle", "Disconnected");
|
||||
setConnectButton();
|
||||
setMicButton();
|
||||
setMicSelectEnabled();
|
||||
setComposerEnabled(false);
|
||||
158
static/voice-demo/index.html
Normal file
158
static/voice-demo/index.html
Normal file
@@ -0,0 +1,158 @@
|
||||
<!doctype html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="utf-8" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
||||
<title>VA Voice Chat — /ws-product</title>
|
||||
<link rel="stylesheet" href="./styles.css" />
|
||||
</head>
|
||||
<body>
|
||||
<main class="app">
|
||||
<header class="app__header">
|
||||
<div class="brand">
|
||||
<span class="brand__dot" aria-hidden="true"></span>
|
||||
<h1>VA Voice Chat</h1>
|
||||
</div>
|
||||
|
||||
<div class="connection">
|
||||
<label class="connection__field">
|
||||
<span>WebSocket URL</span>
|
||||
<input
|
||||
id="ws-url"
|
||||
type="text"
|
||||
placeholder="ws://host/ws-product"
|
||||
spellcheck="false"
|
||||
autocomplete="off"
|
||||
/>
|
||||
</label>
|
||||
<button id="connect-btn" class="btn btn--primary" type="button">
|
||||
Connect
|
||||
</button>
|
||||
</div>
|
||||
|
||||
<div class="status">
|
||||
<span id="status-dot" class="status__dot status__dot--idle"></span>
|
||||
<span id="status-text" class="status__text">Disconnected</span>
|
||||
</div>
|
||||
</header>
|
||||
|
||||
<div class="app__body">
|
||||
<div class="app__main">
|
||||
<section class="chat" aria-label="Conversation history">
|
||||
<div id="chat-log" class="chat__log" role="log" aria-live="polite">
|
||||
<div class="chat__empty">
|
||||
<p>Connect to the engine, enable your mic, and start talking.</p>
|
||||
<p class="chat__hint">
|
||||
Audio is streamed as PCM16 mono @ 16 kHz over
|
||||
<code>/ws-product</code>.
|
||||
</p>
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<footer class="controls" aria-label="Chat controls">
|
||||
<div class="meter" aria-hidden="true">
|
||||
<div id="meter-fill" class="meter__fill"></div>
|
||||
</div>
|
||||
|
||||
<form id="composer" class="composer" autocomplete="off">
|
||||
<textarea
|
||||
id="text-input"
|
||||
class="composer__input"
|
||||
rows="1"
|
||||
placeholder="Type a message, or use the mic…"
|
||||
disabled
|
||||
></textarea>
|
||||
<button
|
||||
id="send-btn"
|
||||
class="btn btn--primary composer__send"
|
||||
type="submit"
|
||||
disabled
|
||||
title="Send message (Enter)"
|
||||
>
|
||||
Send
|
||||
</button>
|
||||
</form>
|
||||
|
||||
<div class="controls__row">
|
||||
<label class="device-picker">
|
||||
<span class="device-picker__label">Microphone</span>
|
||||
<select id="mic-select" class="device-picker__select" disabled>
|
||||
<option value="">Default microphone</option>
|
||||
</select>
|
||||
</label>
|
||||
|
||||
<button
|
||||
id="mic-btn"
|
||||
class="mic-btn"
|
||||
type="button"
|
||||
disabled
|
||||
aria-pressed="false"
|
||||
title="Mic is off"
|
||||
>
|
||||
<svg
|
||||
class="mic-btn__icon"
|
||||
viewBox="0 0 24 24"
|
||||
width="24"
|
||||
height="24"
|
||||
aria-hidden="true"
|
||||
>
|
||||
<path
|
||||
d="M12 14a3 3 0 0 0 3-3V6a3 3 0 1 0-6 0v5a3 3 0 0 0 3 3Z"
|
||||
fill="currentColor"
|
||||
/>
|
||||
<path
|
||||
d="M19 11a1 1 0 1 0-2 0 5 5 0 0 1-10 0 1 1 0 1 0-2 0 7 7 0 0 0 6 6.92V21a1 1 0 1 0 2 0v-3.08A7 7 0 0 0 19 11Z"
|
||||
fill="currentColor"
|
||||
/>
|
||||
</svg>
|
||||
<span class="mic-btn__label">Enable mic</span>
|
||||
</button>
|
||||
|
||||
<div class="indicators">
|
||||
<span id="mic-indicator" class="indicator">
|
||||
<span class="indicator__dot indicator__dot--mic"></span>
|
||||
<span class="indicator__label">Mic</span>
|
||||
</span>
|
||||
<span id="bot-indicator" class="indicator">
|
||||
<span class="indicator__dot indicator__dot--bot"></span>
|
||||
<span class="indicator__label">Bot</span>
|
||||
</span>
|
||||
</div>
|
||||
|
||||
<button id="clear-btn" class="btn btn--ghost" type="button">
|
||||
Clear
|
||||
</button>
|
||||
</div>
|
||||
|
||||
<p class="hint">
|
||||
Press <kbd>Enter</kbd> to send, <kbd>Shift</kbd>+<kbd>Enter</kbd>
|
||||
for newline. Sending text will interrupt the bot if it's speaking.
|
||||
Browser echo cancellation is on; use headphones if echo persists.
|
||||
</p>
|
||||
</footer>
|
||||
</div>
|
||||
|
||||
<section class="ws-log" aria-label="WebSocket log">
|
||||
<div class="ws-log__header">
|
||||
<div class="ws-log__header-left">
|
||||
<h2>WebSocket Log</h2>
|
||||
<div class="ws-log__legend" aria-hidden="true">
|
||||
<span class="ws-log__legend-item ws-log__legend-item--send">Send</span>
|
||||
<span class="ws-log__legend-item ws-log__legend-item--recv">Recv</span>
|
||||
</div>
|
||||
</div>
|
||||
<button id="clear-ws-log-btn" class="btn btn--ghost" type="button">
|
||||
Clear log
|
||||
</button>
|
||||
</div>
|
||||
<div id="ws-log" class="ws-log__body" role="log" aria-live="polite">
|
||||
<div class="ws-log__empty">No websocket events yet.</div>
|
||||
</div>
|
||||
</section>
|
||||
</div>
|
||||
</main>
|
||||
|
||||
<script type="module" src="./app.js"></script>
|
||||
</body>
|
||||
</html>
|
||||
104
static/voice-demo/pcm-recorder.worklet.js
Normal file
104
static/voice-demo/pcm-recorder.worklet.js
Normal file
@@ -0,0 +1,104 @@
|
||||
/**
|
||||
* PCM Recorder AudioWorklet.
|
||||
*
|
||||
* Captures mono Float32 mic samples at the AudioContext's native rate,
|
||||
* resamples them to a target sample rate (default 16 kHz) with linear
|
||||
* interpolation, then ships PCM16 frames of a fixed duration (default 20 ms)
|
||||
* to the main thread via `port.postMessage(ArrayBuffer)`.
|
||||
*
|
||||
* It also computes a simple RMS level per frame for the UI VU meter so the
|
||||
* main thread doesn't have to re-process the audio.
|
||||
*/
|
||||
|
||||
class PcmRecorderProcessor extends AudioWorkletProcessor {
|
||||
constructor(options) {
|
||||
super();
|
||||
|
||||
const opts = (options && options.processorOptions) || {};
|
||||
this._targetSampleRate = opts.targetSampleRate || 16000;
|
||||
this._frameMs = opts.frameMs || 20;
|
||||
this._frameSamples = Math.round(
|
||||
(this._targetSampleRate * this._frameMs) / 1000,
|
||||
);
|
||||
|
||||
// Resampling state.
|
||||
// `ratio` is input samples per output sample.
|
||||
this._ratio = sampleRate / this._targetSampleRate;
|
||||
this._inputBuffer = new Float32Array(0);
|
||||
// Float position in `_inputBuffer` for the next output sample.
|
||||
this._inputOffset = 0;
|
||||
|
||||
// Output framing state.
|
||||
this._frameBuffer = new Int16Array(this._frameSamples);
|
||||
this._frameIndex = 0;
|
||||
|
||||
// VU meter accumulator.
|
||||
this._rmsSumSquares = 0;
|
||||
this._rmsCount = 0;
|
||||
}
|
||||
|
||||
process(inputs) {
|
||||
const input = inputs[0];
|
||||
if (!input || input.length === 0) return true;
|
||||
const channel = input[0];
|
||||
if (!channel || channel.length === 0) return true;
|
||||
|
||||
// Append new samples to the input buffer.
|
||||
const merged = new Float32Array(this._inputBuffer.length + channel.length);
|
||||
merged.set(this._inputBuffer, 0);
|
||||
merged.set(channel, this._inputBuffer.length);
|
||||
this._inputBuffer = merged;
|
||||
|
||||
const ratio = this._ratio;
|
||||
const inLen = this._inputBuffer.length;
|
||||
let pos = this._inputOffset;
|
||||
|
||||
while (pos + 1 < inLen) {
|
||||
const lo = Math.floor(pos);
|
||||
const hi = lo + 1;
|
||||
const w = pos - lo;
|
||||
const sample =
|
||||
this._inputBuffer[lo] * (1 - w) + this._inputBuffer[hi] * w;
|
||||
|
||||
this._rmsSumSquares += sample * sample;
|
||||
this._rmsCount += 1;
|
||||
|
||||
let s = sample;
|
||||
if (s > 1) s = 1;
|
||||
else if (s < -1) s = -1;
|
||||
this._frameBuffer[this._frameIndex++] =
|
||||
s < 0 ? Math.round(s * 0x8000) : Math.round(s * 0x7fff);
|
||||
|
||||
if (this._frameIndex === this._frameSamples) {
|
||||
const frame = new Int16Array(this._frameSamples);
|
||||
frame.set(this._frameBuffer);
|
||||
const rms =
|
||||
this._rmsCount > 0
|
||||
? Math.sqrt(this._rmsSumSquares / this._rmsCount)
|
||||
: 0;
|
||||
this.port.postMessage(
|
||||
{ type: "frame", buffer: frame.buffer, rms },
|
||||
[frame.buffer],
|
||||
);
|
||||
this._frameIndex = 0;
|
||||
this._rmsSumSquares = 0;
|
||||
this._rmsCount = 0;
|
||||
}
|
||||
|
||||
pos += ratio;
|
||||
}
|
||||
|
||||
// Trim consumed samples from the input buffer; keep at least the last
|
||||
// sample we still need to interpolate against on the next call.
|
||||
const consumed = Math.floor(pos);
|
||||
if (consumed > 0) {
|
||||
this._inputBuffer = this._inputBuffer.slice(consumed);
|
||||
pos -= consumed;
|
||||
}
|
||||
this._inputOffset = pos;
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
registerProcessor("pcm-recorder", PcmRecorderProcessor);
|
||||
739
static/voice-demo/styles.css
Normal file
739
static/voice-demo/styles.css
Normal file
@@ -0,0 +1,739 @@
|
||||
:root {
|
||||
color-scheme: dark;
|
||||
--bg: #0b0d12;
|
||||
--bg-elevated: #131722;
|
||||
--bg-soft: #1a2030;
|
||||
--border: #232a3b;
|
||||
--text: #e6e9f2;
|
||||
--text-dim: #95a0bb;
|
||||
--accent: #4f8cff;
|
||||
--accent-strong: #6aa1ff;
|
||||
--user: #2f7ff0;
|
||||
--assistant: #2a3145;
|
||||
--danger: #ff5577;
|
||||
--success: #2dd28b;
|
||||
--warning: #ffb84d;
|
||||
--radius: 14px;
|
||||
--shadow: 0 10px 30px rgba(0, 0, 0, 0.35);
|
||||
font-family: ui-sans-serif, system-ui, -apple-system, "Segoe UI", Roboto,
|
||||
"Helvetica Neue", Arial, sans-serif;
|
||||
}
|
||||
|
||||
* {
|
||||
box-sizing: border-box;
|
||||
}
|
||||
|
||||
html,
|
||||
body {
|
||||
margin: 0;
|
||||
padding: 0;
|
||||
height: 100%;
|
||||
background: radial-gradient(
|
||||
1200px 600px at 80% -10%,
|
||||
rgba(79, 140, 255, 0.18),
|
||||
transparent 60%
|
||||
),
|
||||
radial-gradient(
|
||||
900px 500px at -10% 110%,
|
||||
rgba(45, 210, 139, 0.12),
|
||||
transparent 60%
|
||||
),
|
||||
var(--bg);
|
||||
color: var(--text);
|
||||
}
|
||||
|
||||
body {
|
||||
display: flex;
|
||||
justify-content: center;
|
||||
padding: 12px;
|
||||
}
|
||||
|
||||
.app {
|
||||
display: grid;
|
||||
grid-template-rows: auto minmax(0, 1fr);
|
||||
gap: 12px;
|
||||
width: min(1440px, 100%);
|
||||
height: calc(100dvh - 24px);
|
||||
max-height: calc(100vh - 24px);
|
||||
background: var(--bg-elevated);
|
||||
border: 1px solid var(--border);
|
||||
border-radius: 20px;
|
||||
box-shadow: var(--shadow);
|
||||
padding: 14px;
|
||||
}
|
||||
|
||||
.app__body {
|
||||
display: grid;
|
||||
grid-template-columns: minmax(0, 1fr) clamp(300px, 32vw, 420px);
|
||||
gap: 12px;
|
||||
min-height: 0;
|
||||
}
|
||||
|
||||
.app__main {
|
||||
display: grid;
|
||||
grid-template-rows: minmax(0, 1fr) auto;
|
||||
gap: 0;
|
||||
min-height: 0;
|
||||
overflow: hidden;
|
||||
background: var(--bg);
|
||||
border: 1px solid var(--border);
|
||||
border-radius: var(--radius);
|
||||
}
|
||||
|
||||
/* Header ---------------------------------------------------------------- */
|
||||
|
||||
.app__header {
|
||||
display: grid;
|
||||
grid-template-columns: auto minmax(0, 1fr) auto;
|
||||
align-items: center;
|
||||
gap: 12px;
|
||||
padding-bottom: 10px;
|
||||
border-bottom: 1px solid var(--border);
|
||||
}
|
||||
|
||||
.brand {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 10px;
|
||||
}
|
||||
|
||||
.brand h1 {
|
||||
font-size: 16px;
|
||||
margin: 0;
|
||||
letter-spacing: 0.2px;
|
||||
}
|
||||
|
||||
.brand__dot {
|
||||
width: 10px;
|
||||
height: 10px;
|
||||
border-radius: 50%;
|
||||
background: var(--accent);
|
||||
box-shadow: 0 0 0 4px rgba(79, 140, 255, 0.18);
|
||||
}
|
||||
|
||||
.connection {
|
||||
display: flex;
|
||||
align-items: end;
|
||||
gap: 8px;
|
||||
min-width: 0;
|
||||
}
|
||||
|
||||
.connection__field {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 4px;
|
||||
min-width: 0;
|
||||
flex: 1;
|
||||
}
|
||||
|
||||
.connection__field span {
|
||||
font-size: 11px;
|
||||
color: var(--text-dim);
|
||||
text-transform: uppercase;
|
||||
letter-spacing: 0.8px;
|
||||
}
|
||||
|
||||
.connection__field input {
|
||||
background: var(--bg-soft);
|
||||
color: var(--text);
|
||||
border: 1px solid var(--border);
|
||||
border-radius: 10px;
|
||||
padding: 7px 10px;
|
||||
font: inherit;
|
||||
font-size: 12px;
|
||||
outline: none;
|
||||
width: 100%;
|
||||
min-width: 0;
|
||||
}
|
||||
|
||||
.connection__field input:focus {
|
||||
border-color: var(--accent);
|
||||
box-shadow: 0 0 0 3px rgba(79, 140, 255, 0.18);
|
||||
}
|
||||
|
||||
.status {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 8px;
|
||||
font-size: 13px;
|
||||
color: var(--text-dim);
|
||||
white-space: nowrap;
|
||||
}
|
||||
|
||||
.status__dot {
|
||||
width: 9px;
|
||||
height: 9px;
|
||||
border-radius: 50%;
|
||||
background: var(--text-dim);
|
||||
}
|
||||
|
||||
.status__dot--idle {
|
||||
background: var(--text-dim);
|
||||
}
|
||||
|
||||
.status__dot--connecting {
|
||||
background: var(--warning);
|
||||
animation: pulse 1.2s ease-in-out infinite;
|
||||
}
|
||||
|
||||
.status__dot--connected {
|
||||
background: var(--success);
|
||||
}
|
||||
|
||||
.status__dot--error {
|
||||
background: var(--danger);
|
||||
}
|
||||
|
||||
/* Chat ------------------------------------------------------------------ */
|
||||
|
||||
.chat {
|
||||
overflow: hidden;
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
min-height: 0;
|
||||
}
|
||||
|
||||
.chat__log {
|
||||
flex: 1;
|
||||
overflow-y: auto;
|
||||
padding: 18px;
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 10px;
|
||||
scroll-behavior: smooth;
|
||||
}
|
||||
|
||||
.chat__empty {
|
||||
margin: auto;
|
||||
text-align: center;
|
||||
color: var(--text-dim);
|
||||
}
|
||||
|
||||
.chat__empty p {
|
||||
margin: 4px 0;
|
||||
}
|
||||
|
||||
.chat__hint code {
|
||||
background: var(--bg-soft);
|
||||
border: 1px solid var(--border);
|
||||
border-radius: 6px;
|
||||
padding: 1px 6px;
|
||||
font-size: 12px;
|
||||
}
|
||||
|
||||
.bubble {
|
||||
max-width: 78%;
|
||||
padding: 10px 14px;
|
||||
border-radius: 14px;
|
||||
line-height: 1.45;
|
||||
font-size: 14px;
|
||||
white-space: pre-wrap;
|
||||
word-wrap: break-word;
|
||||
animation: bubble-in 0.16s ease-out;
|
||||
}
|
||||
|
||||
.bubble--user {
|
||||
background: var(--user);
|
||||
color: #fff;
|
||||
align-self: flex-end;
|
||||
border-bottom-right-radius: 4px;
|
||||
}
|
||||
|
||||
.bubble--assistant {
|
||||
background: var(--assistant);
|
||||
color: var(--text);
|
||||
align-self: flex-start;
|
||||
border-bottom-left-radius: 4px;
|
||||
}
|
||||
|
||||
.bubble--assistant.bubble--interrupted {
|
||||
opacity: 0.75;
|
||||
border-left: 2px solid var(--warning);
|
||||
}
|
||||
|
||||
.bubble--system {
|
||||
align-self: center;
|
||||
background: transparent;
|
||||
color: var(--text-dim);
|
||||
font-size: 12px;
|
||||
border: 1px dashed var(--border);
|
||||
padding: 6px 10px;
|
||||
border-radius: 999px;
|
||||
}
|
||||
|
||||
.bubble__role {
|
||||
display: block;
|
||||
font-size: 10px;
|
||||
text-transform: uppercase;
|
||||
letter-spacing: 0.8px;
|
||||
opacity: 0.7;
|
||||
margin-bottom: 4px;
|
||||
}
|
||||
|
||||
/* WebSocket log --------------------------------------------------------- */
|
||||
|
||||
.ws-log {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
min-height: 0;
|
||||
background: var(--bg);
|
||||
border: 1px solid var(--border);
|
||||
border-radius: var(--radius);
|
||||
overflow: hidden;
|
||||
}
|
||||
|
||||
.ws-log__header {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: space-between;
|
||||
gap: 8px;
|
||||
padding: 8px 12px;
|
||||
border-bottom: 1px solid var(--border);
|
||||
flex-shrink: 0;
|
||||
background: var(--bg-soft);
|
||||
}
|
||||
|
||||
.ws-log__legend {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 8px;
|
||||
font-size: 10px;
|
||||
color: var(--text-dim);
|
||||
}
|
||||
|
||||
.ws-log__legend-item {
|
||||
display: inline-flex;
|
||||
align-items: center;
|
||||
gap: 4px;
|
||||
}
|
||||
|
||||
.ws-log__legend-item::before {
|
||||
content: "";
|
||||
width: 8px;
|
||||
height: 8px;
|
||||
border-radius: 2px;
|
||||
}
|
||||
|
||||
.ws-log__legend-item--send::before {
|
||||
background: var(--success);
|
||||
}
|
||||
|
||||
.ws-log__legend-item--recv::before {
|
||||
background: var(--accent-strong);
|
||||
}
|
||||
|
||||
.ws-log__header h2 {
|
||||
margin: 0;
|
||||
font-size: 12px;
|
||||
color: var(--text-dim);
|
||||
text-transform: uppercase;
|
||||
letter-spacing: 0.8px;
|
||||
}
|
||||
|
||||
.ws-log__header-left {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 12px;
|
||||
min-width: 0;
|
||||
}
|
||||
|
||||
.ws-log__body {
|
||||
flex: 1;
|
||||
min-height: 0;
|
||||
overflow-y: auto;
|
||||
overflow-x: hidden;
|
||||
padding: 6px 8px;
|
||||
font-family: ui-monospace, SFMono-Regular, Menlo, monospace;
|
||||
font-size: 10.5px;
|
||||
line-height: 1.4;
|
||||
color: var(--text-dim);
|
||||
}
|
||||
|
||||
.ws-log__entry {
|
||||
display: grid;
|
||||
grid-template-columns: 58px 42px minmax(0, 1fr);
|
||||
gap: 6px;
|
||||
align-items: start;
|
||||
padding: 5px 4px;
|
||||
border-radius: 6px;
|
||||
white-space: pre-wrap;
|
||||
word-break: break-word;
|
||||
}
|
||||
|
||||
.ws-log__entry:hover {
|
||||
background: rgba(255, 255, 255, 0.03);
|
||||
}
|
||||
|
||||
.ws-log__time {
|
||||
color: var(--text-dim);
|
||||
opacity: 0.7;
|
||||
font-size: 10px;
|
||||
}
|
||||
|
||||
.ws-log__direction {
|
||||
font-weight: 700;
|
||||
font-size: 9px;
|
||||
letter-spacing: 0.4px;
|
||||
text-transform: uppercase;
|
||||
padding: 2px 0;
|
||||
}
|
||||
|
||||
.ws-log__detail {
|
||||
min-width: 0;
|
||||
overflow-wrap: anywhere;
|
||||
}
|
||||
|
||||
.ws-log__entry--send .ws-log__direction {
|
||||
color: var(--success);
|
||||
}
|
||||
|
||||
.ws-log__entry--recv .ws-log__direction {
|
||||
color: var(--accent-strong);
|
||||
}
|
||||
|
||||
.ws-log__entry--system .ws-log__direction {
|
||||
color: var(--text-dim);
|
||||
}
|
||||
|
||||
.ws-log__entry--error .ws-log__direction {
|
||||
color: var(--danger);
|
||||
}
|
||||
|
||||
.ws-log__empty {
|
||||
color: var(--text-dim);
|
||||
opacity: 0.7;
|
||||
padding: 8px 4px;
|
||||
}
|
||||
|
||||
/* Controls -------------------------------------------------------------- */
|
||||
|
||||
.controls {
|
||||
display: grid;
|
||||
gap: 8px;
|
||||
padding: 10px 12px 8px;
|
||||
border-top: 1px solid var(--border);
|
||||
background: var(--bg-elevated);
|
||||
}
|
||||
|
||||
.meter {
|
||||
height: 6px;
|
||||
background: var(--bg-soft);
|
||||
border-radius: 999px;
|
||||
overflow: hidden;
|
||||
}
|
||||
|
||||
.meter__fill {
|
||||
height: 100%;
|
||||
width: 0%;
|
||||
background: linear-gradient(90deg, var(--success), var(--accent));
|
||||
transition: width 80ms linear;
|
||||
}
|
||||
|
||||
.composer {
|
||||
display: flex;
|
||||
align-items: flex-end;
|
||||
gap: 8px;
|
||||
}
|
||||
|
||||
.composer__input {
|
||||
flex: 1;
|
||||
resize: none;
|
||||
background: var(--bg-soft);
|
||||
color: var(--text);
|
||||
border: 1px solid var(--border);
|
||||
border-radius: 12px;
|
||||
padding: 10px 12px;
|
||||
font: inherit;
|
||||
font-size: 14px;
|
||||
line-height: 1.4;
|
||||
outline: none;
|
||||
min-height: 42px;
|
||||
max-height: 180px;
|
||||
overflow-y: auto;
|
||||
}
|
||||
|
||||
.composer__input:focus {
|
||||
border-color: var(--accent);
|
||||
box-shadow: 0 0 0 3px rgba(79, 140, 255, 0.18);
|
||||
}
|
||||
|
||||
.composer__input:disabled {
|
||||
opacity: 0.55;
|
||||
cursor: not-allowed;
|
||||
}
|
||||
|
||||
.composer__send {
|
||||
padding: 10px 18px;
|
||||
border-radius: 12px;
|
||||
min-width: 84px;
|
||||
align-self: flex-end;
|
||||
}
|
||||
|
||||
.composer__send:disabled {
|
||||
opacity: 0.5;
|
||||
cursor: not-allowed;
|
||||
}
|
||||
|
||||
.controls__row {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 10px;
|
||||
flex-wrap: wrap;
|
||||
}
|
||||
|
||||
.device-picker {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 3px;
|
||||
min-width: 0;
|
||||
flex: 1 1 160px;
|
||||
max-width: 240px;
|
||||
}
|
||||
|
||||
.device-picker__label {
|
||||
font-size: 11px;
|
||||
color: var(--text-dim);
|
||||
text-transform: uppercase;
|
||||
letter-spacing: 0.8px;
|
||||
}
|
||||
|
||||
.device-picker__select {
|
||||
appearance: none;
|
||||
background: var(--bg-soft);
|
||||
color: var(--text);
|
||||
border: 1px solid var(--border);
|
||||
border-radius: 10px;
|
||||
padding: 7px 28px 7px 10px;
|
||||
font: inherit;
|
||||
font-size: 12px;
|
||||
outline: none;
|
||||
width: 100%;
|
||||
cursor: pointer;
|
||||
}
|
||||
|
||||
.device-picker__select:focus {
|
||||
border-color: var(--accent);
|
||||
box-shadow: 0 0 0 3px rgba(79, 140, 255, 0.18);
|
||||
}
|
||||
|
||||
.device-picker__select:disabled {
|
||||
opacity: 0.5;
|
||||
cursor: not-allowed;
|
||||
}
|
||||
|
||||
.mic-btn {
|
||||
display: inline-flex;
|
||||
align-items: center;
|
||||
gap: 6px;
|
||||
padding: 8px 12px;
|
||||
border-radius: 999px;
|
||||
background: var(--bg-soft);
|
||||
color: var(--text);
|
||||
border: 1px solid var(--border);
|
||||
font: inherit;
|
||||
font-size: 12px;
|
||||
font-weight: 600;
|
||||
cursor: pointer;
|
||||
flex-shrink: 0;
|
||||
transition: transform 0.08s ease, background 0.15s ease, color 0.15s ease,
|
||||
border-color 0.15s ease;
|
||||
}
|
||||
|
||||
.mic-btn:disabled {
|
||||
opacity: 0.5;
|
||||
cursor: not-allowed;
|
||||
}
|
||||
|
||||
.mic-btn:not(:disabled):hover {
|
||||
border-color: var(--accent);
|
||||
}
|
||||
|
||||
.mic-btn:not(:disabled):active {
|
||||
transform: scale(0.98);
|
||||
}
|
||||
|
||||
.mic-btn[aria-pressed="true"] {
|
||||
background: var(--danger);
|
||||
border-color: var(--danger);
|
||||
color: #fff;
|
||||
box-shadow: 0 0 0 6px rgba(255, 85, 119, 0.18);
|
||||
}
|
||||
|
||||
.mic-btn__icon {
|
||||
display: block;
|
||||
}
|
||||
|
||||
.indicators {
|
||||
display: flex;
|
||||
gap: 12px;
|
||||
margin-left: auto;
|
||||
}
|
||||
|
||||
.indicator {
|
||||
display: inline-flex;
|
||||
align-items: center;
|
||||
gap: 6px;
|
||||
font-size: 12px;
|
||||
color: var(--text-dim);
|
||||
}
|
||||
|
||||
.indicator__dot {
|
||||
width: 8px;
|
||||
height: 8px;
|
||||
border-radius: 50%;
|
||||
background: var(--bg-soft);
|
||||
border: 1px solid var(--border);
|
||||
}
|
||||
|
||||
.indicator.is-active .indicator__dot--mic {
|
||||
background: var(--success);
|
||||
border-color: var(--success);
|
||||
box-shadow: 0 0 0 4px rgba(45, 210, 139, 0.2);
|
||||
}
|
||||
|
||||
.indicator.is-active .indicator__dot--bot {
|
||||
background: var(--accent);
|
||||
border-color: var(--accent);
|
||||
box-shadow: 0 0 0 4px rgba(79, 140, 255, 0.22);
|
||||
animation: pulse 1s ease-in-out infinite;
|
||||
}
|
||||
|
||||
.indicator.is-active .indicator__label {
|
||||
color: var(--text);
|
||||
}
|
||||
|
||||
.btn {
|
||||
appearance: none;
|
||||
border: 1px solid var(--border);
|
||||
background: var(--bg-soft);
|
||||
color: var(--text);
|
||||
padding: 9px 14px;
|
||||
border-radius: 10px;
|
||||
font: inherit;
|
||||
font-size: 13px;
|
||||
cursor: pointer;
|
||||
transition: background 0.15s ease, border-color 0.15s ease;
|
||||
}
|
||||
|
||||
.btn:hover {
|
||||
border-color: var(--accent);
|
||||
}
|
||||
|
||||
.btn--primary {
|
||||
background: var(--accent);
|
||||
color: #fff;
|
||||
border-color: var(--accent);
|
||||
}
|
||||
|
||||
.btn--primary:hover {
|
||||
background: var(--accent-strong);
|
||||
border-color: var(--accent-strong);
|
||||
}
|
||||
|
||||
.btn--primary.is-disconnect {
|
||||
background: transparent;
|
||||
color: var(--danger);
|
||||
border-color: var(--danger);
|
||||
}
|
||||
|
||||
.btn--ghost {
|
||||
background: transparent;
|
||||
}
|
||||
|
||||
.hint {
|
||||
margin: 0;
|
||||
font-size: 11px;
|
||||
color: var(--text-dim);
|
||||
opacity: 0.85;
|
||||
}
|
||||
|
||||
.hint kbd {
|
||||
background: var(--bg-soft);
|
||||
border: 1px solid var(--border);
|
||||
border-bottom-width: 2px;
|
||||
border-radius: 4px;
|
||||
padding: 0 5px;
|
||||
font-family: ui-monospace, SFMono-Regular, Menlo, monospace;
|
||||
font-size: 11px;
|
||||
color: var(--text);
|
||||
}
|
||||
|
||||
/* Animations ------------------------------------------------------------ */
|
||||
|
||||
@keyframes pulse {
|
||||
0%,
|
||||
100% {
|
||||
opacity: 1;
|
||||
}
|
||||
50% {
|
||||
opacity: 0.55;
|
||||
}
|
||||
}
|
||||
|
||||
@keyframes bubble-in {
|
||||
from {
|
||||
opacity: 0;
|
||||
transform: translateY(4px);
|
||||
}
|
||||
to {
|
||||
opacity: 1;
|
||||
transform: translateY(0);
|
||||
}
|
||||
}
|
||||
|
||||
/* Responsive ------------------------------------------------------------ */
|
||||
|
||||
@media (max-width: 820px) {
|
||||
.app__body {
|
||||
grid-template-columns: 1fr;
|
||||
grid-template-rows: minmax(0, 1fr) min(240px, 32vh);
|
||||
}
|
||||
|
||||
.ws-log__legend {
|
||||
display: none;
|
||||
}
|
||||
|
||||
.hint {
|
||||
display: none;
|
||||
}
|
||||
}
|
||||
|
||||
@media (max-width: 720px) {
|
||||
body {
|
||||
padding: 0;
|
||||
}
|
||||
|
||||
.app {
|
||||
height: 100dvh;
|
||||
max-height: 100vh;
|
||||
border-radius: 0;
|
||||
border: none;
|
||||
padding: 10px;
|
||||
}
|
||||
|
||||
.app__header {
|
||||
grid-template-columns: 1fr;
|
||||
}
|
||||
|
||||
.connection {
|
||||
flex-direction: column;
|
||||
align-items: stretch;
|
||||
}
|
||||
|
||||
.ws-log__entry {
|
||||
grid-template-columns: 54px 38px minmax(0, 1fr);
|
||||
}
|
||||
|
||||
.status {
|
||||
justify-content: flex-end;
|
||||
}
|
||||
|
||||
.device-picker {
|
||||
flex: 1 1 100%;
|
||||
max-width: none;
|
||||
}
|
||||
|
||||
.indicators {
|
||||
margin-left: 0;
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user