Sync with engine v5

This commit is contained in:
Xin Wang
2026-06-03 12:36:18 +08:00
parent 056a8a4ad8
commit 705a63dd25
17 changed files with 854 additions and 111 deletions

View File

@@ -131,6 +131,7 @@ class LLMConfig:
variables: dict[str, str] = field(default_factory=dict) variables: dict[str, str] = field(default_factory=dict)
detail: bool = False detail: bool = False
timeout_sec: float = 60.0 timeout_sec: float = 60.0
image_input_mode: str = "base64"
@property @property
def is_fastgpt(self) -> bool: def is_fastgpt(self) -> bool:
@@ -236,6 +237,15 @@ def config_from_dict(data: dict) -> EngineConfig:
if llm.get("chat_id") == "": if llm.get("chat_id") == "":
llm["chat_id"] = None llm["chat_id"] = None
llm.pop("send_system_prompt", None) llm.pop("send_system_prompt", None)
image_input_mode = str(
llm.get("image_input_mode", LLMConfig().image_input_mode)
).strip().lower()
if image_input_mode not in {"base64", "upload"}:
raise ValueError(
"services.llm.image_input_mode must be 'base64' or 'upload', "
f"got {llm.get('image_input_mode')!r}"
)
llm["image_input_mode"] = image_input_mode
if llm.get("app_id") == "": if llm.get("app_id") == "":
llm["app_id"] = None llm["app_id"] = None
if not isinstance(llm.get("variables"), dict): if not isinstance(llm.get("variables"), dict):

View File

@@ -1,5 +1,10 @@
from __future__ import annotations from __future__ import annotations
import asyncio
import base64
import binascii
import os
import tempfile
import uuid import uuid
from dataclasses import dataclass, field from dataclasses import dataclass, field
from typing import Any from typing import Any
@@ -19,6 +24,7 @@ from pipecat.frames.frames import (
LLMFullResponseStartFrame, LLMFullResponseStartFrame,
LLMTextFrame, LLMTextFrame,
OutputTransportMessageFrame, OutputTransportMessageFrame,
OutputTransportMessageUrgentFrame,
) )
from pipecat.processors.aggregators.llm_context import LLMContext from pipecat.processors.aggregators.llm_context import LLMContext
from pipecat.processors.frame_processor import FrameDirection from pipecat.processors.frame_processor import FrameDirection
@@ -129,6 +135,50 @@ def _interactive_spoken_prompt(event: FastGPTInteractiveEvent) -> str:
return "请继续。" return "请继续。"
# Ways an image part can be handed to FastGPT: "base64" keeps the inline data
# URL as-is, "upload" swaps it for the URL returned by an image upload.
IMAGE_INPUT_MODE_BASE64 = "base64"
IMAGE_INPUT_MODE_UPLOAD = "upload"
# Closed set used to validate the ``image_input_mode`` constructor argument.
SUPPORTED_IMAGE_INPUT_MODES = frozenset({IMAGE_INPUT_MODE_BASE64, IMAGE_INPUT_MODE_UPLOAD})
# Temp-file suffix chosen from the data URL's MIME type when uploading; MIME
# types missing from this table fall back to ".jpg" at the lookup site.
_MIME_TO_EXT = {
"image/jpeg": ".jpg",
"image/png": ".png",
"image/webp": ".webp",
}
def _message_has_image(message: dict[str, Any]) -> bool:
content = message.get("content")
if not isinstance(content, list):
return False
return any(
isinstance(part, dict) and part.get("type") == "image_url"
for part in content
)
def _redact_messages_for_log(messages: list[dict[str, Any]]) -> list[dict[str, Any]]:
"""Replace base64 image data URLs with a short placeholder for logging."""
redacted: list[dict[str, Any]] = []
for message in messages:
content = message.get("content")
if not isinstance(content, list):
redacted.append(message)
continue
parts: list[Any] = []
for part in content:
if (
isinstance(part, dict)
and part.get("type") == "image_url"
and isinstance(part.get("image_url"), dict)
):
url = str(part["image_url"].get("url") or "")
parts.append({"type": "image_url", "image_url": {"url": f"<{len(url)} chars>"}})
else:
parts.append(part)
redacted.append({**message, "content": parts})
return redacted
@dataclass @dataclass
class FastGPTLLMSettings(LLMSettings): class FastGPTLLMSettings(LLMSettings):
variables: dict[str, Any] = field(default_factory=dict) variables: dict[str, Any] = field(default_factory=dict)
@@ -167,6 +217,7 @@ class FastGPTLLMService(LLMService):
app_id: str | None = None, app_id: str | None = None,
greeting_prompt: str | None = None, greeting_prompt: str | None = None,
timeout: float = 60.0, timeout: float = 60.0,
image_input_mode: str = IMAGE_INPUT_MODE_BASE64,
settings: FastGPTLLMSettings | None = None, settings: FastGPTLLMSettings | None = None,
**kwargs, **kwargs,
) -> None: ) -> None:
@@ -185,6 +236,20 @@ class FastGPTLLMService(LLMService):
) )
self._active_response = None self._active_response = None
mode = (image_input_mode or IMAGE_INPUT_MODE_BASE64).strip().lower()
if mode not in SUPPORTED_IMAGE_INPUT_MODES:
raise ValueError(
f"Unsupported image_input_mode {image_input_mode!r}; "
f"expected one of {sorted(SUPPORTED_IMAGE_INPUT_MODES)}"
)
if mode == IMAGE_INPUT_MODE_UPLOAD and not self._app_id:
logger.warning(
"FastGPT image_input_mode='upload' requires app_id; "
"falling back to inline base64"
)
mode = IMAGE_INPUT_MODE_BASE64
self._image_input_mode = mode
@property @property
def app_id(self) -> str: def app_id(self) -> str:
return self._app_id return self._app_id
@@ -305,26 +370,114 @@ class FastGPTLLMService(LLMService):
if response is not None: if response is not None:
await response.aclose() await response.aclose()
def _build_fastgpt_messages(self, context: LLMContext) -> list[dict[str, str]]: def _build_fastgpt_messages(self, context: LLMContext) -> list[dict[str, Any]]:
raw_messages = context.get_messages() raw_messages = context.get_messages()
for message in reversed(raw_messages): for message in reversed(raw_messages):
if not isinstance(message, dict) or message.get("role") != "user": if not isinstance(message, dict) or message.get("role") != "user":
continue continue
if _message_has_image(message):
# Multimodal turn: forward the OpenAI-style content list as-is
# (text parts + image_url with a base64 data URL). FastGPT's
# /chat/completions accepts this directly.
return [{"role": "user", "content": message["content"]}]
text = _message_text(message) text = _message_text(message)
if text: if text:
return [{"role": "user", "content": text}] return [{"role": "user", "content": text}]
return [{"role": "user", "content": self._greeting_prompt}] return [{"role": "user", "content": self._greeting_prompt}]
async def _resolve_image_inputs(
    self, messages: list[dict[str, Any]]
) -> list[dict[str, Any]]:
    """In ``upload`` mode, replace inline base64 image data URLs with uploaded URLs.

    In ``base64`` mode the messages are returned untouched (inline data URLs).
    New message/content objects are built so the shared ``LLMContext`` messages
    are never mutated.

    Only ``image_url`` parts whose ``image_url`` is a dict with a
    ``data:image/`` URL are uploaded; malformed parts are forwarded unchanged
    instead of raising. Sibling keys of the part and of its ``image_url`` dict
    (for example ``detail``) are preserved on the rewritten part.
    """
    if self._image_input_mode != IMAGE_INPUT_MODE_UPLOAD:
        return messages

    resolved: list[dict[str, Any]] = []
    for message in messages:
        content = message.get("content")
        if not isinstance(content, list):
            resolved.append(message)
            continue
        new_content: list[Any] = []
        for part in content:
            image_url = (
                part.get("image_url")
                if isinstance(part, dict) and part.get("type") == "image_url"
                else None
            )
            # ``image_url`` may be present but not a dict (None, a bare string);
            # treat that as "nothing to upload" rather than crashing the turn.
            url = image_url.get("url") if isinstance(image_url, dict) else None
            if isinstance(url, str) and url.startswith("data:image/"):
                uploaded = await self._upload_data_url(url)
                new_content.append(
                    {**part, "image_url": {**image_url, "url": uploaded}}
                )
            else:
                new_content.append(part)
        resolved.append({**message, "content": new_content})
    return resolved
async def _upload_data_url(self, data_url: str) -> str:
    """Upload a ``data:image/...;base64,...`` URL via FastGPT and return its URL.

    Best effort: the original data URL is returned unchanged when the URL is
    not marked as base64, the payload cannot be decoded, the upload raises, or
    the upload result carries no usable ``url``. The turn then proceeds with
    inline base64.
    """
    header, _, payload = data_url.partition(",")
    # Media type and the ``base64`` marker are case-insensitive; normalise them
    # so e.g. ``image/PNG`` still maps to the right temp-file suffix.
    params = [item.strip() for item in header[len("data:"):].lower().split(";")]
    mime_type = params[0] or "image/jpeg"
    if "base64" not in params[1:]:
        logger.warning("FastGPT image upload skipped; data URL is not base64-encoded")
        return data_url
    try:
        raw = base64.b64decode(payload, validate=True)
    except (binascii.Error, ValueError) as exc:
        logger.warning(f"FastGPT image upload skipped; invalid base64: {exc}")
        return data_url

    suffix = _MIME_TO_EXT.get(mime_type, ".jpg")
    tmp_path: str | None = None
    try:
        # The upload call takes a file path, so the decoded bytes are staged in
        # a temp file that is removed in ``finally`` whatever the outcome.
        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
            tmp.write(raw)
            tmp_path = tmp.name
        result = await self._client.upload_chat_image(
            appId=self._app_id,
            chatId=self._chat_id,
            file_path=tmp_path,
        )
        url = result.get("url") if isinstance(result, dict) else None
        if isinstance(url, str) and url:
            logger.info(
                f"FastGPT image uploaded chatId={self._chat_id} "
                f"bytes={len(raw)} url={url}"
            )
            return url
        logger.warning("FastGPT image upload returned no url; using inline base64")
        return data_url
    except Exception as exc:
        # Deliberately broad: any upload failure degrades to inline base64.
        logger.warning(f"FastGPT image upload failed; using inline base64: {exc}")
        return data_url
    finally:
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                pass
async def _process_context(self, context: LLMContext) -> None: async def _process_context(self, context: LLMContext) -> None:
messages = self._build_fastgpt_messages(context) messages = self._build_fastgpt_messages(context)
messages = await self._resolve_image_inputs(messages)
variables = self._settings.variables or None variables = self._settings.variables or None
logger.info( logger.info(
"FastGPT chat completion " "FastGPT chat completion "
f"chatId={self._chat_id} appId={self._app_id or '-'} " f"chatId={self._chat_id} appId={self._app_id or '-'} "
f"variables={sorted((variables or {}).keys())} messages={messages!r}" f"variables={sorted((variables or {}).keys())} "
f"messages={_redact_messages_for_log(messages)!r}"
) )
await self.start_ttfb_metrics() await self.start_ttfb_metrics()

View File

@@ -23,6 +23,7 @@ from pipecat.processors.aggregators.llm_response_universal import (
UserTurnStoppedMessage, UserTurnStoppedMessage,
) )
from pipecat.serializers.base_serializer import FrameSerializer from pipecat.serializers.base_serializer import FrameSerializer
from pipecat.serializers.protobuf import ProtobufFrameSerializer
from pipecat.transports.websocket.fastapi import ( from pipecat.transports.websocket.fastapi import (
FastAPIWebsocketParams, FastAPIWebsocketParams,
FastAPIWebsocketTransport, FastAPIWebsocketTransport,
@@ -68,6 +69,15 @@ async def run_product_voice_pipeline(websocket, config: EngineConfig) -> None:
) )
async def run_voice_pipeline(websocket, config: EngineConfig) -> None:
    """Run the voice pipeline over ``websocket`` using pipecat's protobuf serializer."""
    protobuf_serializer = ProtobufFrameSerializer()
    await run_pipeline_with_serializer(
        websocket,
        config,
        serializer=protobuf_serializer,
        client_label="Pipecat protobuf",
    )
async def run_pipeline_with_serializer( async def run_pipeline_with_serializer(
websocket, websocket,
config: EngineConfig, config: EngineConfig,
@@ -120,8 +130,13 @@ async def run_pipeline_with_serializer(
stop_secs=config.turn.vad.stop_secs, stop_secs=config.turn.vad.stop_secs,
min_volume=config.turn.vad.min_volume, min_volume=config.turn.vad.min_volume,
) )
# Use a simple silence-timeout strategy for streaming ASR so short Chinese # Replace pipecat's default stop strategy (Smart Turn v3) with a simple
# pauses do not split one logical utterance into multiple LLM calls. # silence-timeout strategy. Smart Turn v3 was finalizing every short
# Chinese phrase as a complete turn, which caused one logical utterance
# to become several LLM calls and several user bubbles in the UI. The
# timeout strategy waits for `user_speech_timeout_sec` of silence
# (re-armed every time the user resumes speaking) before declaring the
# turn finished — which is what we actually want for streaming ASRs.
user_turn_strategies = UserTurnStrategies( user_turn_strategies = UserTurnStrategies(
start=[ start=[
InterruptionGateUserTurnStartStrategy( InterruptionGateUserTurnStartStrategy(
@@ -225,22 +240,6 @@ async def run_pipeline_with_serializer(
nonlocal idle_prompt_count nonlocal idle_prompt_count
idle_prompt_count = 0 idle_prompt_count = 0
@user_aggregator.event_handler("on_user_turn_idle")
async def on_user_turn_idle(aggregator):
nonlocal idle_prompt_count
text = config.turn.idle_prompt_text.strip()
if not text or config.turn.idle_prompt_max_count <= 0:
return
if idle_prompt_count >= config.turn.idle_prompt_max_count:
return
idle_prompt_count += 1
logger.info(
"User idle prompt triggered "
f"count={idle_prompt_count}/{config.turn.idle_prompt_max_count}"
)
await aggregator.push_frame(TTSSpeakFrame(text))
@user_aggregator.event_handler("on_user_turn_stopped") @user_aggregator.event_handler("on_user_turn_stopped")
async def on_user_turn_stopped(_aggregator, _strategy, message: UserTurnStoppedMessage): async def on_user_turn_stopped(_aggregator, _strategy, message: UserTurnStoppedMessage):
logger.info(f"User: {message.content}") logger.info(f"User: {message.content}")
@@ -268,5 +267,25 @@ async def run_pipeline_with_serializer(
) )
text_stream.take_interrupted_stream_text() text_stream.take_interrupted_stream_text()
@user_aggregator.event_handler("on_user_turn_idle")
async def on_user_turn_idle(aggregator):
nonlocal idle_prompt_count
text = config.turn.idle_prompt_text.strip()
if not text or config.turn.idle_prompt_max_count <= 0:
return
if idle_prompt_count >= config.turn.idle_prompt_max_count:
return
idle_prompt_count += 1
logger.info(
"User idle prompt triggered "
f"count={idle_prompt_count}/{config.turn.idle_prompt_max_count}"
)
await aggregator.push_frame(TTSSpeakFrame(text))
# NOTE: assistant turn started/final events are emitted by
# ProductTextStreamProcessor, upstream of TTS, so text streams to the
# client ahead of audio. This logger is kept for server-side visibility.
runner = PipelineRunner(handle_sigint=False) runner = PipelineRunner(handle_sigint=False)
await runner.run(task) await runner.run(task)

View File

@@ -65,6 +65,7 @@ def create_llm_service(
app_id=config.app_id, app_id=config.app_id,
greeting_prompt=greeting_prompt, greeting_prompt=greeting_prompt,
timeout=config.timeout_sec, timeout=config.timeout_sec,
image_input_mode=config.image_input_mode,
settings=FastGPTLLMSettings( settings=FastGPTLLMSettings(
model=config.model or "fastgpt", model=config.model or "fastgpt",
variables=variables, variables=variables,

View File

@@ -6,6 +6,7 @@ from pipecat.frames.frames import (
Frame, Frame,
InputTransportMessageFrame, InputTransportMessageFrame,
LLMMessagesAppendFrame, LLMMessagesAppendFrame,
UserImageRawFrame,
UserStartedSpeakingFrame, UserStartedSpeakingFrame,
UserStoppedSpeakingFrame, UserStoppedSpeakingFrame,
) )
@@ -13,11 +14,17 @@ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
class ProductTextInputProcessor(FrameProcessor): class ProductTextInputProcessor(FrameProcessor):
"""Converts product text-input transport messages into LLM turns.""" """Converts product text-input transport messages and marks image input as user activity."""
async def process_frame(self, frame: Frame, direction: FrameDirection): async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction) await super().process_frame(frame, direction)
if isinstance(frame, UserImageRawFrame):
await self.broadcast_frame(UserStartedSpeakingFrame)
await self.push_frame(frame, direction)
await self.broadcast_frame(UserStoppedSpeakingFrame)
return
if not isinstance(frame, InputTransportMessageFrame): if not isinstance(frame, InputTransportMessageFrame):
await self.push_frame(frame, direction) await self.push_frame(frame, direction)
return return

View File

@@ -154,6 +154,8 @@ class ProductTextStreamProcessor(FrameProcessor):
await self.push_frame(frame, direction) await self.push_frame(frame, direction)
await self._handle_interrupt() await self._handle_interrupt()
elif isinstance(frame, TTSSpeakFrame): elif isinstance(frame, TTSSpeakFrame):
# Fixed-text / direct-speech path: there's no LLM cycle, so
# synthesize one started/delta/final sequence for the spoken text.
text = frame.text or "" text = frame.text or ""
await self.push_frame(frame, direction) await self.push_frame(frame, direction)
await self._start_turn() await self._start_turn()
@@ -172,6 +174,8 @@ class ProductTextStreamProcessor(FrameProcessor):
async def _delta(self, text: str) -> None: async def _delta(self, text: str) -> None:
if not self._turn_active: if not self._turn_active:
# A text frame outside a turn shouldn't happen, but if it does,
# synthesize a started boundary so the client renders sensibly.
await self._start_turn() await self._start_turn()
self._aggregation.append(text) self._aggregation.append(text)
await self._emit("response.text.delta", text=text) await self._emit("response.text.delta", text=text)

View File

@@ -18,7 +18,12 @@ _COUNTABLE_TEXT_RE = re.compile(r"[\w\u4e00-\u9fff]", re.UNICODE)
class InterruptionGateUserTurnStartStrategy(BaseUserTurnStartStrategy): class InterruptionGateUserTurnStartStrategy(BaseUserTurnStartStrategy):
"""Starts user turns only after likely intentional speech.""" """Starts user turns only after likely intentional speech.
When the assistant is speaking, short background speech should not barge in
unless it is a common answer to a yes/no style question. When the assistant
is not speaking, any non-empty transcript can start a normal user turn.
"""
def __init__( def __init__(
self, self,

View File

@@ -24,6 +24,19 @@ const WS_LOG_GROUP_KEYS = {
AUDIO_SEND: "send:input.audio", AUDIO_SEND: "send:input.audio",
}; };
const CAMERA_DONE_TEXT = "【拍摄完成】"; const CAMERA_DONE_TEXT = "【拍摄完成】";
// Sample images shown as thumbnails under the camera preview. Same-origin files
// so they can be drawn to a canvas (for base64 + dimensions) without tainting.
const SAMPLE_IMAGES = [
{ src: "./samples/damage1.png", label: "车辆前部" },
{ src: "./samples/damage2.png", label: "车辆后部" },
{ src: "./samples/plate1.jpg", label: "车牌 1" },
{ src: "./samples/plate2.jpg", label: "车牌 2" },
{ src: "./samples/user1.jpg", label: "人物 1" },
{ src: "./samples/user2.jpg", label: "人物 2" },
];
// Cap the longer edge before JPEG-encoding so payloads stay small.
const IMAGE_MAX_DIM = 1280;
const IMAGE_JPEG_QUALITY = 0.85;
const CAMERA_STATE_PROMPTS = { const CAMERA_STATE_PROMPTS = {
2000: "请对准车辆碰撞部位拍摄照片。", 2000: "请对准车辆碰撞部位拍摄照片。",
2001: "请对准车辆碰撞部位拍摄照片。", 2001: "请对准车辆碰撞部位拍摄照片。",
@@ -62,6 +75,15 @@ const els = {
cameraState: document.getElementById("camera-state"), cameraState: document.getElementById("camera-state"),
cameraQuestion: document.getElementById("camera-question"), cameraQuestion: document.getElementById("camera-question"),
cameraDoneBtn: document.getElementById("camera-done-btn"), cameraDoneBtn: document.getElementById("camera-done-btn"),
cameraPreview: document.getElementById("camera-preview"),
cameraVideo: document.getElementById("camera-video"),
cameraPhoto: document.getElementById("camera-photo"),
cameraCanvas: document.getElementById("camera-canvas"),
cameraStartBtn: document.getElementById("camera-start-btn"),
cameraDeviceRow: document.getElementById("camera-device-row"),
cameraDeviceSelect: document.getElementById("camera-device-select"),
cameraUpload: document.getElementById("camera-upload"),
cameraSamples: document.getElementById("camera-samples"),
clearBtn: document.getElementById("clear-btn"), clearBtn: document.getElementById("clear-btn"),
clearWsLogBtn: document.getElementById("clear-ws-log-btn"), clearWsLogBtn: document.getElementById("clear-ws-log-btn"),
wsLog: document.getElementById("ws-log"), wsLog: document.getElementById("ws-log"),
@@ -125,6 +147,14 @@ const state = {
assistantState: "", assistantState: "",
cameraState: "", cameraState: "",
// Camera / image input.
cameraStream: null,
cameraActive: false,
cameraFacing: "environment",
videoDevices: [],
pendingImage: null,
samplesRendered: false,
// VU meter smoothing. // VU meter smoothing.
meterLevel: 0, meterLevel: 0,
@@ -143,15 +173,15 @@ function setConnectButton() {
els.chatId.disabled = state.connected || state.connecting; els.chatId.disabled = state.connected || state.connecting;
els.copyChatIdBtn.disabled = !state.connected || !state.chatId; els.copyChatIdBtn.disabled = !state.connected || !state.chatId;
if (state.connecting) { if (state.connecting) {
els.connectBtn.textContent = "Connecting…"; els.connectBtn.textContent = "连接中…";
els.connectBtn.disabled = true; els.connectBtn.disabled = true;
els.connectBtn.classList.remove("is-disconnect"); els.connectBtn.classList.remove("is-disconnect");
} else if (state.connected) { } else if (state.connected) {
els.connectBtn.textContent = "Disconnect"; els.connectBtn.textContent = "断开连接";
els.connectBtn.disabled = false; els.connectBtn.disabled = false;
els.connectBtn.classList.add("is-disconnect"); els.connectBtn.classList.add("is-disconnect");
} else { } else {
els.connectBtn.textContent = "Connect"; els.connectBtn.textContent = "连接";
els.connectBtn.disabled = false; els.connectBtn.disabled = false;
els.connectBtn.classList.remove("is-disconnect"); els.connectBtn.classList.remove("is-disconnect");
} }
@@ -180,8 +210,8 @@ async function copyChatId() {
function setMicButton() { function setMicButton() {
els.micBtn.disabled = !state.connected; els.micBtn.disabled = !state.connected;
els.micBtn.setAttribute("aria-pressed", state.micEnabled ? "true" : "false"); els.micBtn.setAttribute("aria-pressed", state.micEnabled ? "true" : "false");
els.micBtn.title = state.micEnabled ? "Mute mic" : "Unmute mic"; els.micBtn.title = state.micEnabled ? "关闭麦克风" : "开启麦克风";
els.micLabel.textContent = state.micEnabled ? "Mute mic" : "Enable mic"; els.micLabel.textContent = state.micEnabled ? "关闭麦克风" : "开启麦克风";
els.micIndicator.classList.toggle("is-active", state.micEnabled); els.micIndicator.classList.toggle("is-active", state.micEnabled);
} }
@@ -204,41 +234,40 @@ function setAssistantState(value) {
const label = text.length > 32 ? `${text.slice(0, 31)}` : text; const label = text.length > 32 ? `${text.slice(0, 31)}` : text;
state.assistantState = text; state.assistantState = text;
els.stateIndicator.classList.toggle("is-active", Boolean(text)); els.stateIndicator.classList.toggle("is-active", Boolean(text));
els.stateLabel.textContent = label ? `State ${label}` : "State -"; els.stateLabel.textContent = label ? `状态 ${label}` : "状态 -";
els.stateIndicator.title = label ? `Assistant state: ${text}` : "Assistant state"; els.stateIndicator.title = label ? `助手状态:${text}` : "助手状态";
syncCameraDrawer(text); syncCameraDrawer(text);
} }
function setCameraButtonEnabled() { function setCameraButtonEnabled() {
if (!els.cameraDoneBtn) return; if (!els.cameraDoneBtn) return;
els.cameraDoneBtn.disabled = const wsReady =
!state.connected || !state.cameraState || state.connected && state.ws && state.ws.readyState === WebSocket.OPEN;
!state.ws || state.ws.readyState !== WebSocket.OPEN; const hasImageSource = state.cameraActive || Boolean(state.pendingImage);
els.cameraDoneBtn.disabled = !wsReady || !state.cameraState || !hasImageSource;
} }
function syncCameraDrawer(value) { function syncCameraDrawer(value) {
const prompt = CAMERA_STATE_PROMPTS[value]; const prompt = CAMERA_STATE_PROMPTS[value];
const open = Boolean(prompt); const open = Boolean(prompt);
const wasOpen = Boolean(state.cameraState);
state.cameraState = open ? value : ""; state.cameraState = open ? value : "";
els.cameraDrawer.classList.toggle("is-open", open); els.cameraDrawer.classList.toggle("is-open", open);
els.conversation.classList.toggle("has-camera", open); els.conversation.classList.toggle("has-camera", open);
els.cameraDrawer.setAttribute("aria-hidden", open ? "false" : "true"); els.cameraDrawer.setAttribute("aria-hidden", open ? "false" : "true");
if (open) { if (open) {
els.cameraState.textContent = `State ${value}`; els.cameraState.textContent = `状态 ${value}`;
els.cameraQuestion.textContent = prompt; els.cameraQuestion.textContent = prompt;
renderSampleThumbnails();
selectDefaultImage();
} else { } else {
els.cameraState.textContent = "State -"; els.cameraState.textContent = "状态 -";
els.cameraQuestion.textContent = ""; els.cameraQuestion.textContent = "";
if (wasOpen) resetCameraInput();
} }
setCameraButtonEnabled(); setCameraButtonEnabled();
} }
function updateCameraQuestion(text) {
const value = typeof text === "string" ? text.trim() : "";
if (!state.cameraState || !value) return;
els.cameraQuestion.textContent = value;
}
function addBubble(role, text) { function addBubble(role, text) {
if (els.chatLog.querySelector(".chat__empty")) { if (els.chatLog.querySelector(".chat__empty")) {
els.chatLog.innerHTML = ""; els.chatLog.innerHTML = "";
@@ -248,7 +277,7 @@ function addBubble(role, text) {
if (role !== "system") { if (role !== "system") {
const tag = document.createElement("span"); const tag = document.createElement("span");
tag.className = "bubble__role"; tag.className = "bubble__role";
tag.textContent = role === "user" ? "You" : "Assistant"; tag.textContent = role === "user" ? "" : "助手";
bubble.appendChild(tag); bubble.appendChild(tag);
} }
const body = document.createElement("span"); const body = document.createElement("span");
@@ -260,6 +289,35 @@ function addBubble(role, text) {
return bubble; return bubble;
} }
// Append one chat bubble that shows an image plus an optional caption, and
// return the bubble element. The first bubble clears the "empty chat" notice.
function addImageBubble(role, imageUrl, text) {
  const emptyNotice = els.chatLog.querySelector(".chat__empty");
  if (emptyNotice) {
    els.chatLog.innerHTML = "";
  }

  const bubble = document.createElement("div");
  bubble.className = `bubble bubble--${role}`;

  // System bubbles carry no speaker tag.
  if (role !== "system") {
    const roleTag = document.createElement("span");
    roleTag.className = "bubble__role";
    roleTag.textContent = role === "user" ? "你" : "助手";
    bubble.appendChild(roleTag);
  }

  const picture = document.createElement("img");
  picture.className = "bubble__image";
  picture.src = imageUrl;
  picture.alt = text || "image";

  const caption = document.createElement("span");
  caption.className = "bubble__text";
  caption.textContent = text || "";

  bubble.appendChild(picture);
  bubble.appendChild(caption);
  els.chatLog.appendChild(bubble);
  scrollChatToBottom();
  return bubble;
}
function appendToBubble(bubble, text) { function appendToBubble(bubble, text) {
const body = bubble.querySelector(".bubble__text"); const body = bubble.querySelector(".bubble__text");
body.textContent += text; body.textContent += text;
@@ -276,7 +334,7 @@ function clearChat() {
setAssistantState(""); setAssistantState("");
const empty = document.createElement("div"); const empty = document.createElement("div");
empty.className = "chat__empty"; empty.className = "chat__empty";
empty.innerHTML = "<p>Chat cleared.</p>"; empty.innerHTML = "<p>对话已清空。</p>";
els.chatLog.appendChild(empty); els.chatLog.appendChild(empty);
} }
@@ -499,6 +557,9 @@ function compactWsPayload(payload) {
if (typeof compact.audio === "string") { if (typeof compact.audio === "string") {
compact.audio = `<base64 ${compact.audio.length} chars>`; compact.audio = `<base64 ${compact.audio.length} chars>`;
} }
if (typeof compact.image === "string") {
compact.image = `<base64 ${compact.image.length} chars>`;
}
if (typeof compact.data === "string" && compact.data.length > 160) { if (typeof compact.data === "string" && compact.data.length > 160) {
compact.data = `<string ${compact.data.length} chars>`; compact.data = `<string ${compact.data.length} chars>`;
} }
@@ -595,7 +656,7 @@ function wsSend(data) {
function clearWsLog() { function clearWsLog() {
state.wsLogGroup = null; state.wsLogGroup = null;
els.wsLog.innerHTML = els.wsLog.innerHTML =
'<div class="ws-log__empty">No websocket events yet.</div>'; '<div class="ws-log__empty">暂无 WebSocket 事件。</div>';
} }
/* ---------------------------------------------------------------- Audio */ /* ---------------------------------------------------------------- Audio */
@@ -618,13 +679,13 @@ function renderMicDevices() {
const defaultOption = document.createElement("option"); const defaultOption = document.createElement("option");
defaultOption.value = ""; defaultOption.value = "";
defaultOption.textContent = "Default microphone"; defaultOption.textContent = "默认麦克风";
els.micSelect.appendChild(defaultOption); els.micSelect.appendChild(defaultOption);
state.micDevices.forEach((device, index) => { state.micDevices.forEach((device, index) => {
const option = document.createElement("option"); const option = document.createElement("option");
option.value = device.deviceId; option.value = device.deviceId;
option.textContent = device.label || `Microphone ${index + 1}`; option.textContent = device.label || `麦克风 ${index + 1}`;
els.micSelect.appendChild(option); els.micSelect.appendChild(option);
}); });
@@ -691,7 +752,7 @@ async function startMic() {
state.micSourceNode.connect(state.recorderNode); state.micSourceNode.connect(state.recorderNode);
state.micEnabled = true; state.micEnabled = true;
addWsLog("system", "mic capture started (binary input.audio frames)"); addWsLog("system", "麦克风已开启PCM 音频流)");
setMicButton(); setMicButton();
} }
@@ -727,7 +788,7 @@ function stopMic() {
state.micEnabled = false; state.micEnabled = false;
updateMeter(0); updateMeter(0);
if (wasEnabled) { if (wasEnabled) {
addWsLog("system", "mic capture stopped"); addWsLog("system", "麦克风已关闭");
} }
setMicButton(); setMicButton();
} }
@@ -807,6 +868,272 @@ function resetPlaybackClock() {
} }
} }
/* ------------------------------------------------------ Camera / image */
// Switch the preview box between live camera, still photo, or neither.
// mode: "camera" | "photo" | "idle" (any other value clears both classes).
function setPreviewMode(mode) {
  const { classList } = els.cameraPreview;
  const showCamera = mode === "camera";
  const showPhoto = mode === "photo";
  classList.toggle("is-camera", showCamera);
  classList.toggle("is-photo", showPhoto);
}
// Draw an <img>/<video> source to the shared canvas and return a normalized
// payload ({ dataUrl, mime, width, height }) suitable for an `input.image`
// message. The longer edge is capped at IMAGE_MAX_DIM and the result is
// JPEG-encoded. Returns null when the source has no dimensions yet, when no
// 2D context is available, or when drawing/encoding fails.
function mediaToPayload(source) {
  const srcW = source.videoWidth || source.naturalWidth || source.width;
  const srcH = source.videoHeight || source.naturalHeight || source.height;
  if (!srcW || !srcH) return null;

  let w = srcW;
  let h = srcH;
  const longest = Math.max(w, h);
  if (longest > IMAGE_MAX_DIM) {
    const scale = IMAGE_MAX_DIM / longest;
    // Clamp to 1px so an extreme aspect ratio cannot round the short edge to
    // 0, which would make the canvas empty and the data URL useless.
    w = Math.max(1, Math.round(w * scale));
    h = Math.max(1, Math.round(h * scale));
  }

  const canvas = els.cameraCanvas;
  canvas.width = w;
  canvas.height = h;
  const ctx = canvas.getContext("2d");
  if (!ctx) {
    // getContext() returns null when the context cannot be created.
    addWsLog("system", "图片编码失败：无法获取 2D 画布上下文");
    return null;
  }

  let dataUrl;
  try {
    // drawImage can throw (e.g. a broken image), so it shares the handler
    // with the encoding step.
    ctx.drawImage(source, 0, 0, w, h);
    dataUrl = canvas.toDataURL("image/jpeg", IMAGE_JPEG_QUALITY);
  } catch (err) {
    addWsLog("system", `图片编码失败：${err.message || err}`);
    return null;
  }
  return { dataUrl, mime: "image/jpeg", width: w, height: h };
}
// Remember the image that will be sent next. A non-empty payload is also
// shown in the photo preview; the send button state is refreshed either way.
function setPendingImage(payload) {
  state.pendingImage = payload;
  const hasImage = Boolean(payload);
  if (hasImage) {
    els.cameraPhoto.src = payload.dataUrl;
    setPreviewMode("photo");
  }
  setCameraButtonEnabled();
}
// Re-read the list of video input devices into state.videoDevices. Any
// failure while enumerating leaves the list empty.
async function refreshVideoDevices() {
  let cameras;
  try {
    const allDevices = await navigator.mediaDevices.enumerateDevices();
    cameras = allDevices.filter(({ kind }) => kind === "videoinput");
  } catch (_) {
    cameras = [];
  }
  state.videoDevices = cameras;
}
// Rebuild the camera dropdown from state.videoDevices. Device labels are only
// exposed once camera permission has been granted, so until then entries get
// generic names ("摄像头 1", …); with no devices at all a single disabled
// default option is shown. `activeDeviceId`, when given, becomes the selection.
function populateDeviceSelect(activeDeviceId) {
  const select = els.cameraDeviceSelect;
  const addOption = (value, label) => {
    const option = document.createElement("option");
    option.value = value;
    option.textContent = label;
    select.appendChild(option);
  };

  select.innerHTML = "";
  const cameras = state.videoDevices;
  if (cameras.length === 0) {
    addOption("", "默认摄像头");
    select.disabled = true;
    return;
  }

  cameras.forEach((camera, position) => {
    addOption(camera.deviceId, camera.label || `摄像头 ${position + 1}`);
  });
  select.disabled = false;
  if (activeDeviceId) select.value = activeDeviceId;
}
// Start (or restart) the live camera preview. `deviceId` selects a specific
// camera; without it the facing mode in state.cameraFacing is requested.
// Errors are reported to the websocket log and leave the camera stopped.
//
// The function awaits several times, so another startCamera() or a
// stopCameraStream() may run in between. The stream obtained here is tracked
// in a local variable and the function bails out as soon as it is no longer
// the one held in state, instead of touching a replaced or cleared stream.
async function startCamera(deviceId) {
  if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) {
    addWsLog("system", "该浏览器不支持摄像头访问");
    return;
  }
  stopCameraStream();
  const video = deviceId
    ? { deviceId: { exact: deviceId } }
    : { facingMode: state.cameraFacing };
  let stream;
  try {
    stream = await navigator.mediaDevices.getUserMedia({
      video,
      audio: false,
    });
  } catch (err) {
    addWsLog("system", `摄像头错误：${err.message || err}`);
    return;
  }
  // A concurrent startCamera() may have installed its own stream while this
  // call was waiting; stop it so only one camera stays open.
  stopCameraStream();
  state.cameraStream = stream;
  els.cameraVideo.srcObject = stream;
  try {
    await els.cameraVideo.play();
  } catch (_) {
    /* autoplay may resolve later */
  }
  // Superseded or stopped while play() was pending: whoever replaced the
  // stream has already stopped its tracks and owns the UI state now.
  if (state.cameraStream !== stream) return;

  state.cameraActive = true;
  state.pendingImage = null;
  setPreviewMode("camera");
  els.cameraStartBtn.classList.add("is-active");
  clearSampleSelection();
  // Device labels become available only after permission is granted; refresh
  // the dropdown now and select whichever camera is actually streaming.
  await refreshVideoDevices();
  if (state.cameraStream !== stream) return;
  const activeId =
    stream.getVideoTracks?.()[0]?.getSettings?.().deviceId || deviceId;
  populateDeviceSelect(activeId);
  // Reveal the camera device dropdown only while the camera is in use.
  els.cameraDeviceRow.hidden = false;
  setCameraButtonEnabled();
}
// Stop every track of the current camera stream (if any) and put the camera UI
// back into its "off" state: video element detached, active flag cleared,
// start button un-highlighted and the device dropdown row hidden.
function stopCameraStream() {
  const activeStream = state.cameraStream;
  if (activeStream) {
    for (const track of activeStream.getTracks()) {
      track.stop();
    }
    state.cameraStream = null;
  }

  state.cameraActive = false;
  els.cameraVideo.srcObject = null;
  els.cameraDeviceRow.hidden = true;
  els.cameraStartBtn.classList.remove("is-active");
}
// Convert the frame currently shown in the camera <video> into an image payload.
// On success the camera is stopped, the payload becomes the pending image and
// is returned; when no payload can be produced nothing changes and null is
// returned.
function captureFromCamera() {
  const frame = mediaToPayload(els.cameraVideo);
  if (frame) {
    stopCameraStream();
    setPendingImage(frame);
    return frame;
  }
  return null;
}
// Load `src` (a same-origin or object URL) into a fresh Image element.
// Returns a promise that resolves with the element once its load event fires
// and rejects with an Error naming the URL if loading fails.
function loadImage(src) {
  const executor = (resolve, reject) => {
    const image = new Image();
    image.onerror = () => {
      reject(new Error(`failed to load image: ${src}`));
    };
    image.onload = () => {
      resolve(image);
    };
    // Handlers are attached before the load is kicked off.
    image.src = src;
  };
  return new Promise(executor);
}
// Use a user-supplied file as the pending image.
// The file is exposed through a temporary object URL, loaded, and converted to
// a payload; on success the camera is stopped and any sample highlight is
// cleared. Load failures are logged, and the object URL is always revoked.
// A falsy `file` is ignored.
async function selectFileImage(file) {
  if (!file) return;

  const blobUrl = URL.createObjectURL(file);
  try {
    const payload = mediaToPayload(await loadImage(blobUrl));
    if (payload) {
      stopCameraStream();
      clearSampleSelection();
      setPendingImage(payload);
    }
  } catch (err) {
    addWsLog("system", `上传错误:${err.message || err}`);
  } finally {
    URL.revokeObjectURL(blobUrl);
  }
}
// Use one of the bundled sample images as the pending image.
// `src` is the sample's URL and `buttonEl` (optional) is the thumbnail button to
// highlight. On success the camera is stopped and the previous highlight is
// replaced; load failures are logged and leave the current selection untouched.
async function selectSampleImage(src, buttonEl) {
  try {
    const payload = mediaToPayload(await loadImage(src));
    if (payload) {
      stopCameraStream();
      clearSampleSelection();
      if (buttonEl) {
        buttonEl.classList.add("is-selected");
      }
      setPendingImage(payload);
    }
  } catch (err) {
    addWsLog("system", `示例图加载错误:${err.message || err}`);
  }
}
// Remove the "is-selected" highlight from every sample thumbnail.
function clearSampleSelection() {
  const highlighted = els.cameraSamples.querySelectorAll(
    ".camera-drawer__sample.is-selected",
  );
  for (const thumb of highlighted) {
    thumb.classList.remove("is-selected");
  }
}
// Build the sample thumbnail buttons from SAMPLE_IMAGES, once.
// The state.samplesRendered flag makes later calls a no-op. Each button shows
// the sample image and selects it when clicked.
function renderSampleThumbnails() {
  if (state.samplesRendered) return;
  state.samplesRendered = true;

  const container = els.cameraSamples;
  container.innerHTML = "";

  for (const sample of SAMPLE_IMAGES) {
    const thumb = document.createElement("img");
    thumb.src = sample.src;
    thumb.alt = sample.label;

    const button = document.createElement("button");
    button.type = "button";
    button.className = "camera-drawer__sample";
    button.title = sample.label;
    button.appendChild(thumb);
    button.addEventListener("click", () => {
      selectSampleImage(sample.src, button);
    });

    container.appendChild(button);
  }
}
// Return the camera drawer to its idle state: stop any live stream, drop the
// pending image, clear the sample highlight, blank the photo preview and switch
// the preview back to "idle".
function resetCameraInput() {
stopCameraStream();
state.pendingImage = null;
clearSampleSelection();
// Removing the attribute (rather than setting it to "") leaves the <img>
// without any source.
els.cameraPhoto.removeAttribute("src");
setPreviewMode("idle");
// Called last, after the state above has been reset.
// NOTE(review): presumably it derives the button state from that state — confirm.
setCameraButtonEnabled();
}
// Pre-select the first sample image so that "拍摄完成" can be pressed as soon as
// the drawer opens, without the user having to capture or pick anything first.
// Does nothing when an image is already pending, the camera is live, or no
// sample thumbnail / sample entry exists.
function selectDefaultImage() {
  if (state.pendingImage) return;
  if (state.cameraActive) return;

  const firstButton = els.cameraSamples.querySelector(".camera-drawer__sample");
  if (!firstButton) return;

  const firstSample = SAMPLE_IMAGES[0];
  if (!firstSample) return;

  selectSampleImage(firstSample.src, firstButton);
}
// Send an image payload to the engine as an "input.image" message.
// `text` is the accompanying caption and falls back to CAMERA_DONE_TEXT.
// Returns false (and sends nothing) when there is no payload or the WebSocket
// is not open; returns true once the message has been handed to wsSend.
function sendImage(payload, text) {
  const socket = state.ws;
  const canSend =
    Boolean(payload) &&
    Boolean(socket) &&
    socket.readyState === WebSocket.OPEN;
  if (!canSend) return false;

  const caption = text || CAMERA_DONE_TEXT;
  const { dataUrl, mime, width, height } = payload;
  wsSend(
    JSON.stringify({
      type: "input.image",
      image: dataUrl,
      mime_type: mime,
      width,
      height,
      text: caption,
      interrupt: true,
    }),
  );

  // Same as the text-input path: cut off any bot audio still playing and show
  // the image plus caption as a single local user bubble, because the engine
  // does not echo image input back as a transcript event.
  stopPlaybackQueue();
  state.currentAssistantBubble = null;
  addImageBubble("user", dataUrl, caption);
  return true;
}
// Submit the current camera-step image to the engine.
// While the live camera is on, the current frame is captured first; if that
// yields nothing, or the camera is off, the already selected image (uploaded,
// sample or previously captured) is used. Nothing is sent when no image is
// available, and the drawer is reset only after a successful send.
function submitCameraImage() {
  const previouslySelected = state.pendingImage;
  const image =
    (state.cameraActive && captureFromCamera()) || previouslySelected;
  if (!image) return;

  // The caption stays the "【拍摄完成】" marker that advances the FastGPT camera
  // step, preserving the existing workflow contract; the image itself is the
  // new multimodal attachment.
  const sent = sendImage(image, CAMERA_DONE_TEXT);
  if (sent) {
    resetCameraInput();
  }
}
/* --------------------------------------------------------- Chat updates */ /* --------------------------------------------------------- Chat updates */
function handleUserTranscript(text) { function handleUserTranscript(text) {
@@ -864,7 +1191,6 @@ function handleAssistantFinal(text, interrupted) {
if (interrupted) { if (interrupted) {
state.currentAssistantBubble.classList.add("bubble--interrupted"); state.currentAssistantBubble.classList.add("bubble--interrupted");
} }
updateCameraQuestion(text);
state.currentAssistantBubble = null; state.currentAssistantBubble = null;
scrollChatToBottom(); scrollChatToBottom();
} }
@@ -930,16 +1256,16 @@ async function connect() {
const chatId = inputChatId || generateChatId(); const chatId = inputChatId || generateChatId();
const url = wsUrlWithChatId(chatId); const url = wsUrlWithChatId(chatId);
if (!url) { if (!url) {
setStatus("error", "Missing URL"); setStatus("error", "缺少服务器地址");
return; return;
} }
state.connecting = true; state.connecting = true;
state.chatId = chatId; state.chatId = chatId;
els.chatId.value = chatId; els.chatId.value = chatId;
setStatus("connecting", "Connecting…"); setStatus("connecting", "连接中…");
setConnectButton(); setConnectButton();
addWsLog("system", `connecting ${url}`); addWsLog("system", `正在连接 ${url}`);
try { try {
// Pre-warm audio context on user gesture so playback works on Safari. // Pre-warm audio context on user gesture so playback works on Safari.
@@ -949,9 +1275,9 @@ async function connect() {
state.connecting = false; state.connecting = false;
state.chatId = ""; state.chatId = "";
if (!inputChatId) els.chatId.value = ""; if (!inputChatId) els.chatId.value = "";
setStatus("error", "Audio init failed"); setStatus("error", "音频初始化失败");
setConnectButton(); setConnectButton();
addWsLog("error", `audio init failed: ${err.message || err}`, "error"); addWsLog("error", `音频初始化失败:${err.message || err}`, "error");
return; return;
} }
@@ -963,9 +1289,9 @@ async function connect() {
state.connecting = false; state.connecting = false;
state.chatId = ""; state.chatId = "";
if (!inputChatId) els.chatId.value = ""; if (!inputChatId) els.chatId.value = "";
setStatus("error", "Bad URL"); setStatus("error", "服务器地址无效");
setConnectButton(); setConnectButton();
addWsLog("error", `bad websocket URL: ${err.message || err}`, "error"); addWsLog("error", `WebSocket 地址无效:${err.message || err}`, "error");
return; return;
} }
ws.binaryType = "arraybuffer"; ws.binaryType = "arraybuffer";
@@ -986,15 +1312,15 @@ async function connect() {
state.connecting = false; state.connecting = false;
state.connected = true; state.connected = true;
resetPlaybackClock(); resetPlaybackClock();
addWsLog("system", "websocket open"); addWsLog("system", "连接已建立");
setStatus("connected", "Connected"); setStatus("connected", "已连接");
setConnectButton(); setConnectButton();
setMicButton(); setMicButton();
setMicSelectEnabled(); setMicSelectEnabled();
refreshMicDevices(); refreshMicDevices();
wsSend(JSON.stringify(startMessage)); wsSend(JSON.stringify(startMessage));
addBubble("system", "Session started."); addBubble("system", "会话已开始。");
setComposerEnabled(true); setComposerEnabled(true);
setCameraButtonEnabled(); setCameraButtonEnabled();
els.textInput.focus(); els.textInput.focus();
@@ -1026,7 +1352,7 @@ async function connect() {
ws.addEventListener("error", (err) => { ws.addEventListener("error", (err) => {
console.error("WebSocket error", err); console.error("WebSocket error", err);
setStatus("error", "Connection error"); setStatus("error", "连接错误");
addWsLog("error", "websocket error", "error"); addWsLog("error", "websocket error", "error");
}); });
@@ -1055,11 +1381,11 @@ async function connect() {
if (wasConnected) { if (wasConnected) {
addBubble( addBubble(
"system", "system",
`Session ended${event.reason ? `${event.reason}` : ""}.`, `会话已结束${event.reason ? `${event.reason}` : ""}`,
); );
setStatus("idle", "Disconnected"); setStatus("idle", "未连接");
} else { } else {
setStatus("error", "Connection closed"); setStatus("error", "连接已断开");
} }
}); });
} }
@@ -1101,7 +1427,7 @@ els.micBtn.addEventListener("click", async () => {
} }
} catch (err) { } catch (err) {
console.error("Mic error", err); console.error("Mic error", err);
addBubble("system", `Mic error: ${err.message || err}`); addBubble("system", `麦克风错误:${err.message || err}`);
} finally { } finally {
els.micBtn.disabled = !state.connected; els.micBtn.disabled = !state.connected;
} }
@@ -1118,7 +1444,7 @@ els.micSelect.addEventListener("change", async () => {
await startMic(); await startMic();
} catch (err) { } catch (err) {
console.error("Mic switch error", err); console.error("Mic switch error", err);
addBubble("system", `Mic switch error: ${err.message || err}`); addBubble("system", `麦克风切换错误:${err.message || err}`);
} finally { } finally {
setMicButton(); setMicButton();
setMicSelectEnabled(); setMicSelectEnabled();
@@ -1139,7 +1465,25 @@ els.clearWsLogBtn.addEventListener("click", () => {
els.cameraDoneBtn.addEventListener("click", () => { els.cameraDoneBtn.addEventListener("click", () => {
if (!state.cameraState) return; if (!state.cameraState) return;
sendText(CAMERA_DONE_TEXT); submitCameraImage();
});
els.cameraStartBtn.addEventListener("click", () => {
startCamera(els.cameraDeviceSelect.value || undefined);
});
els.cameraDeviceSelect.addEventListener("change", () => {
// Switching device only restarts the stream when the camera is already live;
// otherwise the choice is applied when "使用摄像头" is pressed.
if (state.cameraActive) {
startCamera(els.cameraDeviceSelect.value || undefined);
}
});
els.cameraUpload.addEventListener("change", (event) => {
const file = event.target.files && event.target.files[0];
selectFileImage(file);
event.target.value = "";
}); });
function autosizeTextarea() { function autosizeTextarea() {
@@ -1174,6 +1518,7 @@ els.textInput.addEventListener("keydown", (event) => {
}); });
window.addEventListener("beforeunload", () => { window.addEventListener("beforeunload", () => {
stopCameraStream();
if (state.ws) { if (state.ws) {
try { try {
state.ws.close(); state.ws.close();
@@ -1192,7 +1537,7 @@ window.addEventListener("beforeunload", () => {
els.url.value = defaultWsUrl(); els.url.value = defaultWsUrl();
setStatus("idle", "Disconnected"); setStatus("idle", "未连接");
setConnectButton(); setConnectButton();
setMicButton(); setMicButton();
setMicSelectEnabled(); setMicSelectEnabled();

View File

@@ -1,5 +1,5 @@
<!doctype html> <!doctype html>
<html lang="en"> <html lang="zh-CN">
<head> <head>
<meta charset="utf-8" /> <meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" /> <meta name="viewport" content="width=device-width, initial-scale=1" />
@@ -16,7 +16,7 @@
<div class="connection"> <div class="connection">
<label class="connection__field"> <label class="connection__field">
<span>WebSocket URL</span> <span>服务器地址</span>
<input <input
id="ws-url" id="ws-url"
type="text" type="text"
@@ -26,12 +26,12 @@
/> />
</label> </label>
<label class="connection__field connection__field--chat"> <label class="connection__field connection__field--chat">
<span>Chat ID</span> <span>会话 ID</span>
<div class="chat-id-control"> <div class="chat-id-control">
<input <input
id="chat-id" id="chat-id"
type="text" type="text"
placeholder="optional chatId" placeholder="可选"
spellcheck="false" spellcheck="false"
autocomplete="off" autocomplete="off"
/> />
@@ -40,8 +40,8 @@
class="chat-id-control__copy" class="chat-id-control__copy"
type="button" type="button"
disabled disabled
title="Copy Chat ID" title="复制会话 ID"
aria-label="Copy Chat ID" aria-label="复制会话 ID"
> >
<svg class="copy-icon copy-icon--default" viewBox="0 0 16 16" width="14" height="14" fill="none" aria-hidden="true"> <svg class="copy-icon copy-icon--default" viewBox="0 0 16 16" width="14" height="14" fill="none" aria-hidden="true">
<rect x="5" y="5" width="8" height="9" rx="1.5" stroke="currentColor" stroke-width="1.4"/> <rect x="5" y="5" width="8" height="9" rx="1.5" stroke="currentColor" stroke-width="1.4"/>
@@ -54,13 +54,13 @@
</div> </div>
</label> </label>
<button id="connect-btn" class="btn btn--primary" type="button"> <button id="connect-btn" class="btn btn--primary" type="button">
Connect 连接
</button> </button>
</div> </div>
<div class="status"> <div class="status">
<span id="status-dot" class="status__dot status__dot--idle"></span> <span id="status-dot" class="status__dot status__dot--idle"></span>
<span id="status-text" class="status__text">Disconnected</span> <span id="status-text" class="status__text">未连接</span>
</div> </div>
</header> </header>
@@ -70,28 +70,87 @@
<aside <aside
id="camera-drawer" id="camera-drawer"
class="camera-drawer" class="camera-drawer"
aria-label="Camera capture step" aria-label="拍照步骤"
aria-hidden="true" aria-hidden="true"
> >
<div class="camera-drawer__panel"> <div class="camera-drawer__panel">
<div class="camera-drawer__header"> <div class="camera-drawer__header">
<div> <div>
<p class="camera-drawer__eyebrow">Camera</p> <p class="camera-drawer__eyebrow">拍照</p>
<h2>拍照步骤</h2> <h2>拍照步骤</h2>
</div> </div>
<span id="camera-state" class="camera-drawer__state">State -</span> <span id="camera-state" class="camera-drawer__state">状态 -</span>
</div> </div>
<div class="camera-drawer__preview" aria-hidden="true"> <div id="camera-preview" class="camera-drawer__preview">
<video
id="camera-video"
class="camera-drawer__video"
playsinline
muted
autoplay
></video>
<img
id="camera-photo"
class="camera-drawer__photo"
alt="已选择图片预览"
/>
<span class="camera-drawer__corner camera-drawer__corner--tl"></span> <span class="camera-drawer__corner camera-drawer__corner--tl"></span>
<span class="camera-drawer__corner camera-drawer__corner--tr"></span> <span class="camera-drawer__corner camera-drawer__corner--tr"></span>
<span class="camera-drawer__corner camera-drawer__corner--bl"></span> <span class="camera-drawer__corner camera-drawer__corner--bl"></span>
<span class="camera-drawer__corner camera-drawer__corner--br"></span> <span class="camera-drawer__corner camera-drawer__corner--br"></span>
<span class="camera-drawer__lens"></span> <span class="camera-drawer__lens"></span>
<span class="camera-drawer__scan"></span> <span class="camera-drawer__scan"></span>
<span id="camera-placeholder" class="camera-drawer__placeholder">
打开摄像头实时拍摄,或从下方选择 / 上传图片
</span>
</div> </div>
<p id="camera-question" class="camera-drawer__question"></p> <p id="camera-question" class="camera-drawer__question"></p>
<div
id="camera-samples"
class="camera-drawer__samples"
aria-label="示例图片,点击选择"
></div>
<div class="camera-drawer__sources">
<label
class="btn btn--ghost camera-drawer__source"
>
上传图片
<input
id="camera-upload"
type="file"
accept="image/*"
hidden
/>
</label>
<button
id="camera-start-btn"
class="btn btn--ghost camera-drawer__source"
type="button"
title="打开摄像头"
>
使用摄像头
</button>
</div>
<label
id="camera-device-row"
class="device-picker camera-drawer__device-row"
hidden
>
<span class="device-picker__label">选择摄像头</span>
<select
id="camera-device-select"
class="device-picker__select"
disabled
>
<option value="">默认摄像头</option>
</select>
</label>
<button <button
id="camera-done-btn" id="camera-done-btn"
class="btn btn--primary camera-drawer__button" class="btn btn--primary camera-drawer__button"
@@ -100,23 +159,24 @@
> >
拍摄完成 拍摄完成
</button> </button>
<canvas id="camera-canvas" hidden></canvas>
</div> </div>
</aside> </aside>
<section class="chat" aria-label="Conversation history"> <section class="chat" aria-label="对话记录">
<div id="chat-log" class="chat__log" role="log" aria-live="polite"> <div id="chat-log" class="chat__log" role="log" aria-live="polite">
<div class="chat__empty"> <div class="chat__empty">
<p>Connect to the engine, enable your mic, and start talking.</p> <p>连接服务、开启麦克风后即可开始对话。</p>
<p class="chat__hint"> <p class="chat__hint">
Audio is streamed as PCM16 mono @ 16&nbsp;kHz over 音频通过 <code>/ws-product</code> PCM16 单声道 16&nbsp;kHz
<code>/ws-product</code>. 传输。
</p> </p>
</div> </div>
</div> </div>
</section> </section>
</div> </div>
<footer class="controls" aria-label="Chat controls"> <footer class="controls" aria-label="操作栏">
<div class="meter" aria-hidden="true"> <div class="meter" aria-hidden="true">
<div id="meter-fill" class="meter__fill"></div> <div id="meter-fill" class="meter__fill"></div>
</div> </div>
@@ -126,7 +186,7 @@
id="text-input" id="text-input"
class="composer__input" class="composer__input"
rows="1" rows="1"
placeholder="Type a message, or use the mic…" placeholder="输入消息,或使用麦克风…"
disabled disabled
></textarea> ></textarea>
<button <button
@@ -134,17 +194,17 @@
class="btn btn--primary composer__send" class="btn btn--primary composer__send"
type="submit" type="submit"
disabled disabled
title="Send message (Enter)" title="发送消息 (Enter)"
> >
Send 发送
</button> </button>
</form> </form>
<div class="controls__row"> <div class="controls__row">
<label class="device-picker"> <label class="device-picker">
<span class="device-picker__label">Microphone</span> <span class="device-picker__label">麦克风</span>
<select id="mic-select" class="device-picker__select" disabled> <select id="mic-select" class="device-picker__select" disabled>
<option value="">Default microphone</option> <option value="">默认麦克风</option>
</select> </select>
</label> </label>
@@ -154,7 +214,7 @@
type="button" type="button"
disabled disabled
aria-pressed="false" aria-pressed="false"
title="Mic is off" title="麦克风已关闭"
> >
<svg <svg
class="mic-btn__icon" class="mic-btn__icon"
@@ -172,52 +232,52 @@
fill="currentColor" fill="currentColor"
/> />
</svg> </svg>
<span class="mic-btn__label">Enable mic</span> <span class="mic-btn__label">开启麦克风</span>
</button> </button>
<div class="indicators"> <div class="indicators">
<span id="mic-indicator" class="indicator"> <span id="mic-indicator" class="indicator">
<span class="indicator__dot indicator__dot--mic"></span> <span class="indicator__dot indicator__dot--mic"></span>
<span class="indicator__label">Mic</span> <span class="indicator__label">麦克风</span>
</span> </span>
<span id="bot-indicator" class="indicator"> <span id="bot-indicator" class="indicator">
<span class="indicator__dot indicator__dot--bot"></span> <span class="indicator__dot indicator__dot--bot"></span>
<span class="indicator__label">Bot</span> <span class="indicator__label">助手</span>
</span> </span>
<span id="state-indicator" class="indicator indicator--state"> <span id="state-indicator" class="indicator indicator--state">
<span class="indicator__dot indicator__dot--state"></span> <span class="indicator__dot indicator__dot--state"></span>
<span id="state-label" class="indicator__label">State -</span> <span id="state-label" class="indicator__label">状态 -</span>
</span> </span>
</div> </div>
<button id="clear-btn" class="btn btn--ghost" type="button"> <button id="clear-btn" class="btn btn--ghost" type="button">
Clear 清空
</button> </button>
</div> </div>
<p class="hint"> <p class="hint">
Press <kbd>Enter</kbd> to send, <kbd>Shift</kbd>+<kbd>Enter</kbd> <kbd>Enter</kbd> 发送,<kbd>Shift</kbd>+<kbd>Enter</kbd>
for newline. Sending text will interrupt the bot if it's speaking. 换行。发送文字会打断正在说话的助手。
Browser echo cancellation is on; use headphones if echo persists. 浏览器回声消除已开启,如有回音请使用耳机。
</p> </p>
</footer> </footer>
</div> </div>
<section class="ws-log" aria-label="WebSocket log"> <section class="ws-log" aria-label="WebSocket 日志">
<div class="ws-log__header"> <div class="ws-log__header">
<div class="ws-log__header-left"> <div class="ws-log__header-left">
<h2>WebSocket Log</h2> <h2>WebSocket 日志</h2>
<div class="ws-log__legend" aria-hidden="true"> <div class="ws-log__legend" aria-hidden="true">
<span class="ws-log__legend-item ws-log__legend-item--send">Send</span> <span class="ws-log__legend-item ws-log__legend-item--send">发送</span>
<span class="ws-log__legend-item ws-log__legend-item--recv">Recv</span> <span class="ws-log__legend-item ws-log__legend-item--recv">接收</span>
</div> </div>
</div> </div>
<button id="clear-ws-log-btn" class="btn btn--ghost" type="button"> <button id="clear-ws-log-btn" class="btn btn--ghost" type="button">
Clear log 清空日志
</button> </button>
</div> </div>
<div id="ws-log" class="ws-log__body" role="log" aria-live="polite"> <div id="ws-log" class="ws-log__body" role="log" aria-live="polite">
<div class="ws-log__empty">No websocket events yet.</div> <div class="ws-log__empty">暂无 WebSocket 事件。</div>
</div> </div>
</section> </section>
</div> </div>

BIN
static/voice-demo/samples/.DS_Store vendored Normal file

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 273 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 323 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 7.5 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 229 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 72 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 105 KiB

View File

@@ -136,7 +136,8 @@ body {
.camera-drawer__preview { .camera-drawer__preview {
position: relative; position: relative;
min-height: 210px; aspect-ratio: 4 / 3;
min-height: 200px;
overflow: hidden; overflow: hidden;
border: 1px solid rgba(149, 160, 187, 0.28); border: 1px solid rgba(149, 160, 187, 0.28);
border-radius: 14px; border-radius: 14px;
@@ -148,6 +149,49 @@ body {
background-size: 34px 34px, 34px 34px, auto, auto; background-size: 34px 34px, 34px 34px, auto, auto;
} }
.camera-drawer__video,
.camera-drawer__photo {
position: absolute;
inset: 0;
width: 100%;
height: 100%;
object-fit: cover;
display: none;
z-index: 1;
}
.camera-drawer__photo {
object-fit: contain;
}
.camera-drawer__preview.is-camera .camera-drawer__video {
display: block;
}
.camera-drawer__preview.is-photo .camera-drawer__photo {
display: block;
}
/* Hide the decorative lens/scan/placeholder once real media is showing. */
.camera-drawer__preview.is-camera .camera-drawer__lens,
.camera-drawer__preview.is-photo .camera-drawer__lens,
.camera-drawer__preview.is-camera .camera-drawer__scan,
.camera-drawer__preview.is-photo .camera-drawer__scan,
.camera-drawer__preview.is-camera .camera-drawer__placeholder,
.camera-drawer__preview.is-photo .camera-drawer__placeholder {
display: none;
}
.camera-drawer__placeholder {
position: absolute;
inset: auto 18px 16px;
z-index: 2;
color: rgba(214, 220, 235, 0.78);
font-size: 12px;
line-height: 1.5;
text-align: center;
}
.camera-drawer__lens { .camera-drawer__lens {
position: absolute; position: absolute;
top: 50%; top: 50%;
@@ -174,6 +218,7 @@ body {
.camera-drawer__corner { .camera-drawer__corner {
position: absolute; position: absolute;
z-index: 2;
width: 28px; width: 28px;
height: 28px; height: 28px;
border-color: rgba(255, 255, 255, 0.7); border-color: rgba(255, 255, 255, 0.7);
@@ -229,6 +274,87 @@ body {
cursor: not-allowed; cursor: not-allowed;
} }
/* 上传图片 + 使用摄像头 share one row. */
.camera-drawer__sources {
display: flex;
gap: 8px;
}
/* The camera device dropdown only appears after "使用摄像头" is selected. */
.camera-drawer__device-row {
max-width: none;
}
.camera-drawer__device-row[hidden] {
display: none;
}
/* Shared layout for the two source controls (upload label + camera button). */
.camera-drawer__source {
  flex: 1 1 0;
  display: inline-flex;
  align-items: center;
  justify-content: center;
  text-align: center;
  min-height: 38px;
  font-size: 13px;
  font-weight: 600;
  cursor: pointer;
}

/* Active state for the "使用摄像头" button once the camera is live.
   Declared once; an identical earlier copy of this rule was redundant. */
.camera-drawer__source.is-active {
  border-color: var(--success);
  color: var(--success);
}

.camera-drawer__source:disabled {
  opacity: 0.5;
  cursor: not-allowed;
}
.camera-drawer__samples {
display: grid;
grid-template-columns: repeat(4, 1fr);
gap: 8px;
}
.camera-drawer__samples:empty {
display: none;
}
.camera-drawer__sample {
position: relative;
aspect-ratio: 4 / 3;
padding: 0;
border: 2px solid transparent;
border-radius: 10px;
overflow: hidden;
cursor: pointer;
background: #0f141f;
}
.camera-drawer__sample img {
width: 100%;
height: 100%;
object-fit: contain;
display: block;
}
.camera-drawer__sample:hover {
border-color: rgba(149, 160, 187, 0.6);
}
.camera-drawer__sample.is-selected {
border-color: var(--success);
box-shadow: 0 0 0 1px var(--success);
}
.app__body { .app__body {
display: grid; display: grid;
grid-template-columns: minmax(0, 1fr) clamp(300px, 32vw, 420px); grid-template-columns: minmax(0, 1fr) clamp(300px, 32vw, 420px);
@@ -511,6 +637,18 @@ body {
margin-bottom: 4px; margin-bottom: 4px;
} }
.bubble__image {
display: block;
max-width: 240px;
width: 100%;
border-radius: 10px;
margin-bottom: 6px;
}
.bubble__image + .bubble__text:empty {
display: none;
}
/* WebSocket log --------------------------------------------------------- */ /* WebSocket log --------------------------------------------------------- */
.ws-log { .ws-log {
@@ -567,8 +705,8 @@ body {
margin: 0; margin: 0;
font-size: 12px; font-size: 12px;
color: var(--text-dim); color: var(--text-dim);
text-transform: uppercase; letter-spacing: 0.5px;
letter-spacing: 0.8px; white-space: nowrap;
} }
.ws-log__header-left { .ws-log__header-left {
@@ -823,11 +961,7 @@ body {
outline: none; outline: none;
width: 100%; width: 100%;
cursor: pointer; cursor: pointer;
} text-overflow: ellipsis;
.device-picker__select:focus {
border-color: var(--accent);
box-shadow: 0 0 0 3px rgba(79, 140, 255, 0.18);
} }
.device-picker__select:disabled { .device-picker__select:disabled {
@@ -835,6 +969,11 @@ body {
cursor: not-allowed; cursor: not-allowed;
} }
.device-picker__select:focus {
border-color: var(--accent);
box-shadow: 0 0 0 3px rgba(79, 140, 255, 0.18);
}
.mic-btn { .mic-btn {
display: inline-flex; display: inline-flex;
align-items: center; align-items: center;