Sync with engine v5
This commit is contained in:
@@ -131,6 +131,7 @@ class LLMConfig:
|
|||||||
variables: dict[str, str] = field(default_factory=dict)
|
variables: dict[str, str] = field(default_factory=dict)
|
||||||
detail: bool = False
|
detail: bool = False
|
||||||
timeout_sec: float = 60.0
|
timeout_sec: float = 60.0
|
||||||
|
image_input_mode: str = "base64"
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def is_fastgpt(self) -> bool:
|
def is_fastgpt(self) -> bool:
|
||||||
@@ -236,6 +237,15 @@ def config_from_dict(data: dict) -> EngineConfig:
|
|||||||
if llm.get("chat_id") == "":
|
if llm.get("chat_id") == "":
|
||||||
llm["chat_id"] = None
|
llm["chat_id"] = None
|
||||||
llm.pop("send_system_prompt", None)
|
llm.pop("send_system_prompt", None)
|
||||||
|
image_input_mode = str(
|
||||||
|
llm.get("image_input_mode", LLMConfig().image_input_mode)
|
||||||
|
).strip().lower()
|
||||||
|
if image_input_mode not in {"base64", "upload"}:
|
||||||
|
raise ValueError(
|
||||||
|
"services.llm.image_input_mode must be 'base64' or 'upload', "
|
||||||
|
f"got {llm.get('image_input_mode')!r}"
|
||||||
|
)
|
||||||
|
llm["image_input_mode"] = image_input_mode
|
||||||
if llm.get("app_id") == "":
|
if llm.get("app_id") == "":
|
||||||
llm["app_id"] = None
|
llm["app_id"] = None
|
||||||
if not isinstance(llm.get("variables"), dict):
|
if not isinstance(llm.get("variables"), dict):
|
||||||
|
|||||||
@@ -1,5 +1,10 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import base64
|
||||||
|
import binascii
|
||||||
|
import os
|
||||||
|
import tempfile
|
||||||
import uuid
|
import uuid
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from typing import Any
|
from typing import Any
|
||||||
@@ -19,6 +24,7 @@ from pipecat.frames.frames import (
|
|||||||
LLMFullResponseStartFrame,
|
LLMFullResponseStartFrame,
|
||||||
LLMTextFrame,
|
LLMTextFrame,
|
||||||
OutputTransportMessageFrame,
|
OutputTransportMessageFrame,
|
||||||
|
OutputTransportMessageUrgentFrame,
|
||||||
)
|
)
|
||||||
from pipecat.processors.aggregators.llm_context import LLMContext
|
from pipecat.processors.aggregators.llm_context import LLMContext
|
||||||
from pipecat.processors.frame_processor import FrameDirection
|
from pipecat.processors.frame_processor import FrameDirection
|
||||||
@@ -129,6 +135,50 @@ def _interactive_spoken_prompt(event: FastGPTInteractiveEvent) -> str:
|
|||||||
return "请继续。"
|
return "请继续。"
|
||||||
|
|
||||||
|
|
||||||
|
# Accepted values for the service's ``image_input_mode`` option.
IMAGE_INPUT_MODE_BASE64 = "base64"
IMAGE_INPUT_MODE_UPLOAD = "upload"
SUPPORTED_IMAGE_INPUT_MODES = frozenset(
    (IMAGE_INPUT_MODE_BASE64, IMAGE_INPUT_MODE_UPLOAD)
)

# File suffix to use for each recognised image MIME type.
_MIME_TO_EXT = {
    "image/jpeg": ".jpg",
    "image/png": ".png",
    "image/webp": ".webp",
}
|
||||||
|
|
||||||
|
|
||||||
|
def _message_has_image(message: dict[str, Any]) -> bool:
|
||||||
|
content = message.get("content")
|
||||||
|
if not isinstance(content, list):
|
||||||
|
return False
|
||||||
|
return any(
|
||||||
|
isinstance(part, dict) and part.get("type") == "image_url"
|
||||||
|
for part in content
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _redact_messages_for_log(messages: list[dict[str, Any]]) -> list[dict[str, Any]]:
|
||||||
|
"""Replace base64 image data URLs with a short placeholder for logging."""
|
||||||
|
redacted: list[dict[str, Any]] = []
|
||||||
|
for message in messages:
|
||||||
|
content = message.get("content")
|
||||||
|
if not isinstance(content, list):
|
||||||
|
redacted.append(message)
|
||||||
|
continue
|
||||||
|
parts: list[Any] = []
|
||||||
|
for part in content:
|
||||||
|
if (
|
||||||
|
isinstance(part, dict)
|
||||||
|
and part.get("type") == "image_url"
|
||||||
|
and isinstance(part.get("image_url"), dict)
|
||||||
|
):
|
||||||
|
url = str(part["image_url"].get("url") or "")
|
||||||
|
parts.append({"type": "image_url", "image_url": {"url": f"<{len(url)} chars>"}})
|
||||||
|
else:
|
||||||
|
parts.append(part)
|
||||||
|
redacted.append({**message, "content": parts})
|
||||||
|
return redacted
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class FastGPTLLMSettings(LLMSettings):
|
class FastGPTLLMSettings(LLMSettings):
|
||||||
variables: dict[str, Any] = field(default_factory=dict)
|
variables: dict[str, Any] = field(default_factory=dict)
|
||||||
@@ -167,6 +217,7 @@ class FastGPTLLMService(LLMService):
|
|||||||
app_id: str | None = None,
|
app_id: str | None = None,
|
||||||
greeting_prompt: str | None = None,
|
greeting_prompt: str | None = None,
|
||||||
timeout: float = 60.0,
|
timeout: float = 60.0,
|
||||||
|
image_input_mode: str = IMAGE_INPUT_MODE_BASE64,
|
||||||
settings: FastGPTLLMSettings | None = None,
|
settings: FastGPTLLMSettings | None = None,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
) -> None:
|
) -> None:
|
||||||
@@ -185,6 +236,20 @@ class FastGPTLLMService(LLMService):
|
|||||||
)
|
)
|
||||||
self._active_response = None
|
self._active_response = None
|
||||||
|
|
||||||
|
mode = (image_input_mode or IMAGE_INPUT_MODE_BASE64).strip().lower()
|
||||||
|
if mode not in SUPPORTED_IMAGE_INPUT_MODES:
|
||||||
|
raise ValueError(
|
||||||
|
f"Unsupported image_input_mode {image_input_mode!r}; "
|
||||||
|
f"expected one of {sorted(SUPPORTED_IMAGE_INPUT_MODES)}"
|
||||||
|
)
|
||||||
|
if mode == IMAGE_INPUT_MODE_UPLOAD and not self._app_id:
|
||||||
|
logger.warning(
|
||||||
|
"FastGPT image_input_mode='upload' requires app_id; "
|
||||||
|
"falling back to inline base64"
|
||||||
|
)
|
||||||
|
mode = IMAGE_INPUT_MODE_BASE64
|
||||||
|
self._image_input_mode = mode
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def app_id(self) -> str:
|
def app_id(self) -> str:
|
||||||
return self._app_id
|
return self._app_id
|
||||||
@@ -305,26 +370,114 @@ class FastGPTLLMService(LLMService):
|
|||||||
if response is not None:
|
if response is not None:
|
||||||
await response.aclose()
|
await response.aclose()
|
||||||
|
|
||||||
def _build_fastgpt_messages(self, context: LLMContext) -> list[dict[str, str]]:
|
def _build_fastgpt_messages(self, context: LLMContext) -> list[dict[str, Any]]:
|
||||||
raw_messages = context.get_messages()
|
raw_messages = context.get_messages()
|
||||||
|
|
||||||
for message in reversed(raw_messages):
|
for message in reversed(raw_messages):
|
||||||
if not isinstance(message, dict) or message.get("role") != "user":
|
if not isinstance(message, dict) or message.get("role") != "user":
|
||||||
continue
|
continue
|
||||||
|
if _message_has_image(message):
|
||||||
|
# Multimodal turn: forward the OpenAI-style content list as-is
|
||||||
|
# (text parts + image_url with a base64 data URL). FastGPT's
|
||||||
|
# /chat/completions accepts this directly.
|
||||||
|
return [{"role": "user", "content": message["content"]}]
|
||||||
text = _message_text(message)
|
text = _message_text(message)
|
||||||
if text:
|
if text:
|
||||||
return [{"role": "user", "content": text}]
|
return [{"role": "user", "content": text}]
|
||||||
|
|
||||||
return [{"role": "user", "content": self._greeting_prompt}]
|
return [{"role": "user", "content": self._greeting_prompt}]
|
||||||
|
|
||||||
|
async def _resolve_image_inputs(
    self, messages: list[dict[str, Any]]
) -> list[dict[str, Any]]:
    """In ``upload`` mode, replace inline base64 image data URLs with uploaded URLs.

    In any other mode the ``messages`` list is returned untouched (inline data
    URLs stay as they are). In ``upload`` mode, new message and content objects
    are built for every message with list content, so the caller's message
    dicts are never mutated.

    Args:
        messages: Chat messages whose ``content`` is either a plain value or a
            list of OpenAI-style content parts.

    Returns:
        The messages to send. Each ``image_url`` part whose URL starts with
        ``data:image/`` is replaced by a part carrying whatever
        ``_upload_data_url`` returns for it; every other part is kept as-is.
    """
    if self._image_input_mode != IMAGE_INPUT_MODE_UPLOAD:
        return messages

    resolved: list[dict[str, Any]] = []
    for message in messages:
        content = message.get("content")
        if not isinstance(content, list):
            resolved.append(message)
            continue

        new_content: list[Any] = []
        for part in content:
            url = None
            if isinstance(part, dict) and part.get("type") == "image_url":
                image_url = part.get("image_url")
                # A malformed part (``image_url`` of None or a bare string) must
                # pass through unchanged rather than raise AttributeError.
                if isinstance(image_url, dict):
                    url = image_url.get("url")

            if isinstance(url, str) and url.startswith("data:image/"):
                uploaded = await self._upload_data_url(url)
                new_content.append(
                    {"type": "image_url", "image_url": {"url": uploaded}}
                )
            else:
                new_content.append(part)
        resolved.append({**message, "content": new_content})

    return resolved
|
||||||
|
|
||||||
|
async def _upload_data_url(self, data_url: str) -> str:
    """Upload a ``data:image/...;base64,...`` URL via FastGPT and return its URL.

    The payload is decoded, written to a temporary file, and handed to
    ``self._client.upload_chat_image``. The temporary file is always removed
    afterwards.

    Args:
        data_url: Data URL whose part after the first comma is base64 text.

    Returns:
        The ``url`` reported by the upload, or the original ``data_url`` when
        the payload is not valid base64, the upload raises, or the result has
        no non-empty string ``url`` (so the turn proceeds with inline base64).
    """
    header, _, payload = data_url.partition(",")
    # Media type is the text between "data:" and the first ";" of the header.
    mime_type = header[len("data:"):].split(";", 1)[0].strip() or "image/jpeg"
    try:
        raw = base64.b64decode(payload, validate=True)
    except (binascii.Error, ValueError) as exc:
        logger.warning(f"FastGPT image upload skipped; invalid base64: {exc}")
        return data_url

    suffix = _MIME_TO_EXT.get(mime_type, ".jpg")
    tmp_path: str | None = None
    try:
        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
            # Record the path before writing: the file is created with
            # delete=False, so if the write fails the ``finally`` block below
            # is the only thing that can remove it.
            tmp_path = tmp.name
            tmp.write(raw)
        result = await self._client.upload_chat_image(
            appId=self._app_id,
            chatId=self._chat_id,
            file_path=tmp_path,
        )
        url = result.get("url") if isinstance(result, dict) else None
        if isinstance(url, str) and url:
            logger.info(
                f"FastGPT image uploaded chatId={self._chat_id} "
                f"bytes={len(raw)} url={url}"
            )
            return url
        logger.warning("FastGPT image upload returned no url; using inline base64")
        return data_url
    except Exception as exc:
        # Deliberate best-effort: any upload failure degrades to inline base64.
        logger.warning(f"FastGPT image upload failed; using inline base64: {exc}")
        return data_url
    finally:
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                pass
|
||||||
|
|
||||||
async def _process_context(self, context: LLMContext) -> None:
|
async def _process_context(self, context: LLMContext) -> None:
|
||||||
messages = self._build_fastgpt_messages(context)
|
messages = self._build_fastgpt_messages(context)
|
||||||
|
messages = await self._resolve_image_inputs(messages)
|
||||||
variables = self._settings.variables or None
|
variables = self._settings.variables or None
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
"FastGPT chat completion "
|
"FastGPT chat completion "
|
||||||
f"chatId={self._chat_id} appId={self._app_id or '-'} "
|
f"chatId={self._chat_id} appId={self._app_id or '-'} "
|
||||||
f"variables={sorted((variables or {}).keys())} messages={messages!r}"
|
f"variables={sorted((variables or {}).keys())} "
|
||||||
|
f"messages={_redact_messages_for_log(messages)!r}"
|
||||||
)
|
)
|
||||||
|
|
||||||
await self.start_ttfb_metrics()
|
await self.start_ttfb_metrics()
|
||||||
|
|||||||
@@ -23,6 +23,7 @@ from pipecat.processors.aggregators.llm_response_universal import (
|
|||||||
UserTurnStoppedMessage,
|
UserTurnStoppedMessage,
|
||||||
)
|
)
|
||||||
from pipecat.serializers.base_serializer import FrameSerializer
|
from pipecat.serializers.base_serializer import FrameSerializer
|
||||||
|
from pipecat.serializers.protobuf import ProtobufFrameSerializer
|
||||||
from pipecat.transports.websocket.fastapi import (
|
from pipecat.transports.websocket.fastapi import (
|
||||||
FastAPIWebsocketParams,
|
FastAPIWebsocketParams,
|
||||||
FastAPIWebsocketTransport,
|
FastAPIWebsocketTransport,
|
||||||
@@ -68,6 +69,15 @@ async def run_product_voice_pipeline(websocket, config: EngineConfig) -> None:
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
async def run_voice_pipeline(websocket, config: EngineConfig) -> None:
    """Run the voice pipeline on ``websocket`` with protobuf frame serialization.

    Delegates to ``run_pipeline_with_serializer`` with a fresh
    ``ProtobufFrameSerializer`` and the client label ``"Pipecat protobuf"``.
    """
    protobuf_options = {
        "serializer": ProtobufFrameSerializer(),
        "client_label": "Pipecat protobuf",
    }
    await run_pipeline_with_serializer(websocket, config, **protobuf_options)
|
||||||
|
|
||||||
|
|
||||||
async def run_pipeline_with_serializer(
|
async def run_pipeline_with_serializer(
|
||||||
websocket,
|
websocket,
|
||||||
config: EngineConfig,
|
config: EngineConfig,
|
||||||
@@ -120,8 +130,13 @@ async def run_pipeline_with_serializer(
|
|||||||
stop_secs=config.turn.vad.stop_secs,
|
stop_secs=config.turn.vad.stop_secs,
|
||||||
min_volume=config.turn.vad.min_volume,
|
min_volume=config.turn.vad.min_volume,
|
||||||
)
|
)
|
||||||
# Use a simple silence-timeout strategy for streaming ASR so short Chinese
|
# Replace pipecat's default stop strategy (Smart Turn v3) with a simple
|
||||||
# pauses do not split one logical utterance into multiple LLM calls.
|
# silence-timeout strategy. Smart Turn v3 was finalizing every short
|
||||||
|
# Chinese phrase as a complete turn, which caused one logical utterance
|
||||||
|
# to become several LLM calls and several user bubbles in the UI. The
|
||||||
|
# timeout strategy waits for `user_speech_timeout_sec` of silence
|
||||||
|
# (re-armed every time the user resumes speaking) before declaring the
|
||||||
|
# turn finished — which is what we actually want for streaming ASRs.
|
||||||
user_turn_strategies = UserTurnStrategies(
|
user_turn_strategies = UserTurnStrategies(
|
||||||
start=[
|
start=[
|
||||||
InterruptionGateUserTurnStartStrategy(
|
InterruptionGateUserTurnStartStrategy(
|
||||||
@@ -225,22 +240,6 @@ async def run_pipeline_with_serializer(
|
|||||||
nonlocal idle_prompt_count
|
nonlocal idle_prompt_count
|
||||||
idle_prompt_count = 0
|
idle_prompt_count = 0
|
||||||
|
|
||||||
@user_aggregator.event_handler("on_user_turn_idle")
|
|
||||||
async def on_user_turn_idle(aggregator):
|
|
||||||
nonlocal idle_prompt_count
|
|
||||||
text = config.turn.idle_prompt_text.strip()
|
|
||||||
if not text or config.turn.idle_prompt_max_count <= 0:
|
|
||||||
return
|
|
||||||
if idle_prompt_count >= config.turn.idle_prompt_max_count:
|
|
||||||
return
|
|
||||||
|
|
||||||
idle_prompt_count += 1
|
|
||||||
logger.info(
|
|
||||||
"User idle prompt triggered "
|
|
||||||
f"count={idle_prompt_count}/{config.turn.idle_prompt_max_count}"
|
|
||||||
)
|
|
||||||
await aggregator.push_frame(TTSSpeakFrame(text))
|
|
||||||
|
|
||||||
@user_aggregator.event_handler("on_user_turn_stopped")
|
@user_aggregator.event_handler("on_user_turn_stopped")
|
||||||
async def on_user_turn_stopped(_aggregator, _strategy, message: UserTurnStoppedMessage):
|
async def on_user_turn_stopped(_aggregator, _strategy, message: UserTurnStoppedMessage):
|
||||||
logger.info(f"User: {message.content}")
|
logger.info(f"User: {message.content}")
|
||||||
@@ -268,5 +267,25 @@ async def run_pipeline_with_serializer(
|
|||||||
)
|
)
|
||||||
text_stream.take_interrupted_stream_text()
|
text_stream.take_interrupted_stream_text()
|
||||||
|
|
||||||
|
@user_aggregator.event_handler("on_user_turn_idle")
|
||||||
|
async def on_user_turn_idle(aggregator):
|
||||||
|
nonlocal idle_prompt_count
|
||||||
|
text = config.turn.idle_prompt_text.strip()
|
||||||
|
if not text or config.turn.idle_prompt_max_count <= 0:
|
||||||
|
return
|
||||||
|
if idle_prompt_count >= config.turn.idle_prompt_max_count:
|
||||||
|
return
|
||||||
|
|
||||||
|
idle_prompt_count += 1
|
||||||
|
logger.info(
|
||||||
|
"User idle prompt triggered "
|
||||||
|
f"count={idle_prompt_count}/{config.turn.idle_prompt_max_count}"
|
||||||
|
)
|
||||||
|
await aggregator.push_frame(TTSSpeakFrame(text))
|
||||||
|
|
||||||
|
# NOTE: assistant turn started/final events are emitted by
|
||||||
|
# ProductTextStreamProcessor, upstream of TTS, so text streams to the
|
||||||
|
# client ahead of audio. This logger is kept for server-side visibility.
|
||||||
|
|
||||||
runner = PipelineRunner(handle_sigint=False)
|
runner = PipelineRunner(handle_sigint=False)
|
||||||
await runner.run(task)
|
await runner.run(task)
|
||||||
|
|||||||
@@ -65,6 +65,7 @@ def create_llm_service(
|
|||||||
app_id=config.app_id,
|
app_id=config.app_id,
|
||||||
greeting_prompt=greeting_prompt,
|
greeting_prompt=greeting_prompt,
|
||||||
timeout=config.timeout_sec,
|
timeout=config.timeout_sec,
|
||||||
|
image_input_mode=config.image_input_mode,
|
||||||
settings=FastGPTLLMSettings(
|
settings=FastGPTLLMSettings(
|
||||||
model=config.model or "fastgpt",
|
model=config.model or "fastgpt",
|
||||||
variables=variables,
|
variables=variables,
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ from pipecat.frames.frames import (
|
|||||||
Frame,
|
Frame,
|
||||||
InputTransportMessageFrame,
|
InputTransportMessageFrame,
|
||||||
LLMMessagesAppendFrame,
|
LLMMessagesAppendFrame,
|
||||||
|
UserImageRawFrame,
|
||||||
UserStartedSpeakingFrame,
|
UserStartedSpeakingFrame,
|
||||||
UserStoppedSpeakingFrame,
|
UserStoppedSpeakingFrame,
|
||||||
)
|
)
|
||||||
@@ -13,11 +14,17 @@ from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
|
|||||||
|
|
||||||
|
|
||||||
class ProductTextInputProcessor(FrameProcessor):
|
class ProductTextInputProcessor(FrameProcessor):
|
||||||
"""Converts product text-input transport messages into LLM turns."""
|
"""Converts product text-input transport messages and marks image input as user activity."""
|
||||||
|
|
||||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||||
await super().process_frame(frame, direction)
|
await super().process_frame(frame, direction)
|
||||||
|
|
||||||
|
if isinstance(frame, UserImageRawFrame):
|
||||||
|
await self.broadcast_frame(UserStartedSpeakingFrame)
|
||||||
|
await self.push_frame(frame, direction)
|
||||||
|
await self.broadcast_frame(UserStoppedSpeakingFrame)
|
||||||
|
return
|
||||||
|
|
||||||
if not isinstance(frame, InputTransportMessageFrame):
|
if not isinstance(frame, InputTransportMessageFrame):
|
||||||
await self.push_frame(frame, direction)
|
await self.push_frame(frame, direction)
|
||||||
return
|
return
|
||||||
|
|||||||
@@ -154,6 +154,8 @@ class ProductTextStreamProcessor(FrameProcessor):
|
|||||||
await self.push_frame(frame, direction)
|
await self.push_frame(frame, direction)
|
||||||
await self._handle_interrupt()
|
await self._handle_interrupt()
|
||||||
elif isinstance(frame, TTSSpeakFrame):
|
elif isinstance(frame, TTSSpeakFrame):
|
||||||
|
# Fixed-text / direct-speech path: there's no LLM cycle, so
|
||||||
|
# synthesize one started/delta/final sequence for the spoken text.
|
||||||
text = frame.text or ""
|
text = frame.text or ""
|
||||||
await self.push_frame(frame, direction)
|
await self.push_frame(frame, direction)
|
||||||
await self._start_turn()
|
await self._start_turn()
|
||||||
@@ -172,6 +174,8 @@ class ProductTextStreamProcessor(FrameProcessor):
|
|||||||
|
|
||||||
async def _delta(self, text: str) -> None:
|
async def _delta(self, text: str) -> None:
|
||||||
if not self._turn_active:
|
if not self._turn_active:
|
||||||
|
# A text frame outside a turn shouldn't happen, but if it does,
|
||||||
|
# synthesize a started boundary so the client renders sensibly.
|
||||||
await self._start_turn()
|
await self._start_turn()
|
||||||
self._aggregation.append(text)
|
self._aggregation.append(text)
|
||||||
await self._emit("response.text.delta", text=text)
|
await self._emit("response.text.delta", text=text)
|
||||||
|
|||||||
@@ -18,7 +18,12 @@ _COUNTABLE_TEXT_RE = re.compile(r"[\w\u4e00-\u9fff]", re.UNICODE)
|
|||||||
|
|
||||||
|
|
||||||
class InterruptionGateUserTurnStartStrategy(BaseUserTurnStartStrategy):
|
class InterruptionGateUserTurnStartStrategy(BaseUserTurnStartStrategy):
|
||||||
"""Starts user turns only after likely intentional speech."""
|
"""Starts user turns only after likely intentional speech.
|
||||||
|
|
||||||
|
When the assistant is speaking, short background speech should not barge in
|
||||||
|
unless it is a common answer to a yes/no style question. When the assistant
|
||||||
|
is not speaking, any non-empty transcript can start a normal user turn.
|
||||||
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@@ -24,6 +24,19 @@ const WS_LOG_GROUP_KEYS = {
|
|||||||
AUDIO_SEND: "send:input.audio",
|
AUDIO_SEND: "send:input.audio",
|
||||||
};
|
};
|
||||||
const CAMERA_DONE_TEXT = "【拍摄完成】";
|
const CAMERA_DONE_TEXT = "【拍摄完成】";
|
||||||
|
// Sample images shown as thumbnails under the camera preview. Same-origin files
|
||||||
|
// so they can be drawn to a canvas (for base64 + dimensions) without tainting.
|
||||||
|
const SAMPLE_IMAGES = [
|
||||||
|
{ src: "./samples/damage1.png", label: "车辆前部" },
|
||||||
|
{ src: "./samples/damage2.png", label: "车辆后部" },
|
||||||
|
{ src: "./samples/plate1.jpg", label: "车牌 1" },
|
||||||
|
{ src: "./samples/plate2.jpg", label: "车牌 2" },
|
||||||
|
{ src: "./samples/user1.jpg", label: "人物 1" },
|
||||||
|
{ src: "./samples/user2.jpg", label: "人物 2" },
|
||||||
|
];
|
||||||
|
// Cap the longer edge before JPEG-encoding so payloads stay small.
|
||||||
|
const IMAGE_MAX_DIM = 1280;
|
||||||
|
const IMAGE_JPEG_QUALITY = 0.85;
|
||||||
const CAMERA_STATE_PROMPTS = {
|
const CAMERA_STATE_PROMPTS = {
|
||||||
2000: "请对准车辆碰撞部位拍摄照片。",
|
2000: "请对准车辆碰撞部位拍摄照片。",
|
||||||
2001: "请对准车辆碰撞部位拍摄照片。",
|
2001: "请对准车辆碰撞部位拍摄照片。",
|
||||||
@@ -62,6 +75,15 @@ const els = {
|
|||||||
cameraState: document.getElementById("camera-state"),
|
cameraState: document.getElementById("camera-state"),
|
||||||
cameraQuestion: document.getElementById("camera-question"),
|
cameraQuestion: document.getElementById("camera-question"),
|
||||||
cameraDoneBtn: document.getElementById("camera-done-btn"),
|
cameraDoneBtn: document.getElementById("camera-done-btn"),
|
||||||
|
cameraPreview: document.getElementById("camera-preview"),
|
||||||
|
cameraVideo: document.getElementById("camera-video"),
|
||||||
|
cameraPhoto: document.getElementById("camera-photo"),
|
||||||
|
cameraCanvas: document.getElementById("camera-canvas"),
|
||||||
|
cameraStartBtn: document.getElementById("camera-start-btn"),
|
||||||
|
cameraDeviceRow: document.getElementById("camera-device-row"),
|
||||||
|
cameraDeviceSelect: document.getElementById("camera-device-select"),
|
||||||
|
cameraUpload: document.getElementById("camera-upload"),
|
||||||
|
cameraSamples: document.getElementById("camera-samples"),
|
||||||
clearBtn: document.getElementById("clear-btn"),
|
clearBtn: document.getElementById("clear-btn"),
|
||||||
clearWsLogBtn: document.getElementById("clear-ws-log-btn"),
|
clearWsLogBtn: document.getElementById("clear-ws-log-btn"),
|
||||||
wsLog: document.getElementById("ws-log"),
|
wsLog: document.getElementById("ws-log"),
|
||||||
@@ -125,6 +147,14 @@ const state = {
|
|||||||
assistantState: "",
|
assistantState: "",
|
||||||
cameraState: "",
|
cameraState: "",
|
||||||
|
|
||||||
|
// Camera / image input.
|
||||||
|
cameraStream: null,
|
||||||
|
cameraActive: false,
|
||||||
|
cameraFacing: "environment",
|
||||||
|
videoDevices: [],
|
||||||
|
pendingImage: null,
|
||||||
|
samplesRendered: false,
|
||||||
|
|
||||||
// VU meter smoothing.
|
// VU meter smoothing.
|
||||||
meterLevel: 0,
|
meterLevel: 0,
|
||||||
|
|
||||||
@@ -143,15 +173,15 @@ function setConnectButton() {
|
|||||||
els.chatId.disabled = state.connected || state.connecting;
|
els.chatId.disabled = state.connected || state.connecting;
|
||||||
els.copyChatIdBtn.disabled = !state.connected || !state.chatId;
|
els.copyChatIdBtn.disabled = !state.connected || !state.chatId;
|
||||||
if (state.connecting) {
|
if (state.connecting) {
|
||||||
els.connectBtn.textContent = "Connecting…";
|
els.connectBtn.textContent = "连接中…";
|
||||||
els.connectBtn.disabled = true;
|
els.connectBtn.disabled = true;
|
||||||
els.connectBtn.classList.remove("is-disconnect");
|
els.connectBtn.classList.remove("is-disconnect");
|
||||||
} else if (state.connected) {
|
} else if (state.connected) {
|
||||||
els.connectBtn.textContent = "Disconnect";
|
els.connectBtn.textContent = "断开连接";
|
||||||
els.connectBtn.disabled = false;
|
els.connectBtn.disabled = false;
|
||||||
els.connectBtn.classList.add("is-disconnect");
|
els.connectBtn.classList.add("is-disconnect");
|
||||||
} else {
|
} else {
|
||||||
els.connectBtn.textContent = "Connect";
|
els.connectBtn.textContent = "连接";
|
||||||
els.connectBtn.disabled = false;
|
els.connectBtn.disabled = false;
|
||||||
els.connectBtn.classList.remove("is-disconnect");
|
els.connectBtn.classList.remove("is-disconnect");
|
||||||
}
|
}
|
||||||
@@ -180,8 +210,8 @@ async function copyChatId() {
|
|||||||
function setMicButton() {
|
function setMicButton() {
|
||||||
els.micBtn.disabled = !state.connected;
|
els.micBtn.disabled = !state.connected;
|
||||||
els.micBtn.setAttribute("aria-pressed", state.micEnabled ? "true" : "false");
|
els.micBtn.setAttribute("aria-pressed", state.micEnabled ? "true" : "false");
|
||||||
els.micBtn.title = state.micEnabled ? "Mute mic" : "Unmute mic";
|
els.micBtn.title = state.micEnabled ? "关闭麦克风" : "开启麦克风";
|
||||||
els.micLabel.textContent = state.micEnabled ? "Mute mic" : "Enable mic";
|
els.micLabel.textContent = state.micEnabled ? "关闭麦克风" : "开启麦克风";
|
||||||
els.micIndicator.classList.toggle("is-active", state.micEnabled);
|
els.micIndicator.classList.toggle("is-active", state.micEnabled);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -204,41 +234,40 @@ function setAssistantState(value) {
|
|||||||
const label = text.length > 32 ? `${text.slice(0, 31)}…` : text;
|
const label = text.length > 32 ? `${text.slice(0, 31)}…` : text;
|
||||||
state.assistantState = text;
|
state.assistantState = text;
|
||||||
els.stateIndicator.classList.toggle("is-active", Boolean(text));
|
els.stateIndicator.classList.toggle("is-active", Boolean(text));
|
||||||
els.stateLabel.textContent = label ? `State ${label}` : "State -";
|
els.stateLabel.textContent = label ? `状态 ${label}` : "状态 -";
|
||||||
els.stateIndicator.title = label ? `Assistant state: ${text}` : "Assistant state";
|
els.stateIndicator.title = label ? `助手状态:${text}` : "助手状态";
|
||||||
syncCameraDrawer(text);
|
syncCameraDrawer(text);
|
||||||
}
|
}
|
||||||
|
|
||||||
function setCameraButtonEnabled() {
|
function setCameraButtonEnabled() {
|
||||||
if (!els.cameraDoneBtn) return;
|
if (!els.cameraDoneBtn) return;
|
||||||
els.cameraDoneBtn.disabled =
|
const wsReady =
|
||||||
!state.connected || !state.cameraState ||
|
state.connected && state.ws && state.ws.readyState === WebSocket.OPEN;
|
||||||
!state.ws || state.ws.readyState !== WebSocket.OPEN;
|
const hasImageSource = state.cameraActive || Boolean(state.pendingImage);
|
||||||
|
els.cameraDoneBtn.disabled = !wsReady || !state.cameraState || !hasImageSource;
|
||||||
}
|
}
|
||||||
|
|
||||||
function syncCameraDrawer(value) {
|
function syncCameraDrawer(value) {
|
||||||
const prompt = CAMERA_STATE_PROMPTS[value];
|
const prompt = CAMERA_STATE_PROMPTS[value];
|
||||||
const open = Boolean(prompt);
|
const open = Boolean(prompt);
|
||||||
|
const wasOpen = Boolean(state.cameraState);
|
||||||
state.cameraState = open ? value : "";
|
state.cameraState = open ? value : "";
|
||||||
els.cameraDrawer.classList.toggle("is-open", open);
|
els.cameraDrawer.classList.toggle("is-open", open);
|
||||||
els.conversation.classList.toggle("has-camera", open);
|
els.conversation.classList.toggle("has-camera", open);
|
||||||
els.cameraDrawer.setAttribute("aria-hidden", open ? "false" : "true");
|
els.cameraDrawer.setAttribute("aria-hidden", open ? "false" : "true");
|
||||||
if (open) {
|
if (open) {
|
||||||
els.cameraState.textContent = `State ${value}`;
|
els.cameraState.textContent = `状态 ${value}`;
|
||||||
els.cameraQuestion.textContent = prompt;
|
els.cameraQuestion.textContent = prompt;
|
||||||
|
renderSampleThumbnails();
|
||||||
|
selectDefaultImage();
|
||||||
} else {
|
} else {
|
||||||
els.cameraState.textContent = "State -";
|
els.cameraState.textContent = "状态 -";
|
||||||
els.cameraQuestion.textContent = "";
|
els.cameraQuestion.textContent = "";
|
||||||
|
if (wasOpen) resetCameraInput();
|
||||||
}
|
}
|
||||||
setCameraButtonEnabled();
|
setCameraButtonEnabled();
|
||||||
}
|
}
|
||||||
|
|
||||||
function updateCameraQuestion(text) {
|
|
||||||
const value = typeof text === "string" ? text.trim() : "";
|
|
||||||
if (!state.cameraState || !value) return;
|
|
||||||
els.cameraQuestion.textContent = value;
|
|
||||||
}
|
|
||||||
|
|
||||||
function addBubble(role, text) {
|
function addBubble(role, text) {
|
||||||
if (els.chatLog.querySelector(".chat__empty")) {
|
if (els.chatLog.querySelector(".chat__empty")) {
|
||||||
els.chatLog.innerHTML = "";
|
els.chatLog.innerHTML = "";
|
||||||
@@ -248,7 +277,7 @@ function addBubble(role, text) {
|
|||||||
if (role !== "system") {
|
if (role !== "system") {
|
||||||
const tag = document.createElement("span");
|
const tag = document.createElement("span");
|
||||||
tag.className = "bubble__role";
|
tag.className = "bubble__role";
|
||||||
tag.textContent = role === "user" ? "You" : "Assistant";
|
tag.textContent = role === "user" ? "你" : "助手";
|
||||||
bubble.appendChild(tag);
|
bubble.appendChild(tag);
|
||||||
}
|
}
|
||||||
const body = document.createElement("span");
|
const body = document.createElement("span");
|
||||||
@@ -260,6 +289,35 @@ function addBubble(role, text) {
|
|||||||
return bubble;
|
return bubble;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Append one chat bubble that shows an image followed by its caption text.
 *
 * @param {string} role - Used in the `bubble--<role>` class. Every role except
 *   "system" also gets a role tag ("你" for "user", "助手" otherwise).
 * @param {string} imageUrl - Assigned to the image element's `src`.
 * @param {string} [text] - Caption text; also the image alt text, which falls
 *   back to "image" when the caption is empty.
 * @returns {HTMLDivElement} The bubble element that was appended to the chat log.
 */
function addImageBubble(role, imageUrl, text) {
  const log = els.chatLog;
  const make = (tagName, className) => {
    const node = document.createElement(tagName);
    node.className = className;
    return node;
  };

  // The first real bubble replaces the empty-state placeholder.
  if (log.querySelector(".chat__empty")) {
    log.innerHTML = "";
  }

  const wrapper = make("div", `bubble bubble--${role}`);

  if (role !== "system") {
    const roleTag = make("span", "bubble__role");
    roleTag.textContent = role === "user" ? "你" : "助手";
    wrapper.appendChild(roleTag);
  }

  const picture = make("img", "bubble__image");
  picture.src = imageUrl;
  picture.alt = text || "image";
  wrapper.appendChild(picture);

  // The caption span is always present so appendToBubble can find `.bubble__text`.
  const caption = make("span", "bubble__text");
  caption.textContent = text || "";
  wrapper.appendChild(caption);

  log.appendChild(wrapper);
  scrollChatToBottom();
  return wrapper;
}
|
||||||
|
|
||||||
function appendToBubble(bubble, text) {
|
function appendToBubble(bubble, text) {
|
||||||
const body = bubble.querySelector(".bubble__text");
|
const body = bubble.querySelector(".bubble__text");
|
||||||
body.textContent += text;
|
body.textContent += text;
|
||||||
@@ -276,7 +334,7 @@ function clearChat() {
|
|||||||
setAssistantState("");
|
setAssistantState("");
|
||||||
const empty = document.createElement("div");
|
const empty = document.createElement("div");
|
||||||
empty.className = "chat__empty";
|
empty.className = "chat__empty";
|
||||||
empty.innerHTML = "<p>Chat cleared.</p>";
|
empty.innerHTML = "<p>对话已清空。</p>";
|
||||||
els.chatLog.appendChild(empty);
|
els.chatLog.appendChild(empty);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -499,6 +557,9 @@ function compactWsPayload(payload) {
|
|||||||
if (typeof compact.audio === "string") {
|
if (typeof compact.audio === "string") {
|
||||||
compact.audio = `<base64 ${compact.audio.length} chars>`;
|
compact.audio = `<base64 ${compact.audio.length} chars>`;
|
||||||
}
|
}
|
||||||
|
if (typeof compact.image === "string") {
|
||||||
|
compact.image = `<base64 ${compact.image.length} chars>`;
|
||||||
|
}
|
||||||
if (typeof compact.data === "string" && compact.data.length > 160) {
|
if (typeof compact.data === "string" && compact.data.length > 160) {
|
||||||
compact.data = `<string ${compact.data.length} chars>`;
|
compact.data = `<string ${compact.data.length} chars>`;
|
||||||
}
|
}
|
||||||
@@ -595,7 +656,7 @@ function wsSend(data) {
|
|||||||
function clearWsLog() {
|
function clearWsLog() {
|
||||||
state.wsLogGroup = null;
|
state.wsLogGroup = null;
|
||||||
els.wsLog.innerHTML =
|
els.wsLog.innerHTML =
|
||||||
'<div class="ws-log__empty">No websocket events yet.</div>';
|
'<div class="ws-log__empty">暂无 WebSocket 事件。</div>';
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ---------------------------------------------------------------- Audio */
|
/* ---------------------------------------------------------------- Audio */
|
||||||
@@ -618,13 +679,13 @@ function renderMicDevices() {
|
|||||||
|
|
||||||
const defaultOption = document.createElement("option");
|
const defaultOption = document.createElement("option");
|
||||||
defaultOption.value = "";
|
defaultOption.value = "";
|
||||||
defaultOption.textContent = "Default microphone";
|
defaultOption.textContent = "默认麦克风";
|
||||||
els.micSelect.appendChild(defaultOption);
|
els.micSelect.appendChild(defaultOption);
|
||||||
|
|
||||||
state.micDevices.forEach((device, index) => {
|
state.micDevices.forEach((device, index) => {
|
||||||
const option = document.createElement("option");
|
const option = document.createElement("option");
|
||||||
option.value = device.deviceId;
|
option.value = device.deviceId;
|
||||||
option.textContent = device.label || `Microphone ${index + 1}`;
|
option.textContent = device.label || `麦克风 ${index + 1}`;
|
||||||
els.micSelect.appendChild(option);
|
els.micSelect.appendChild(option);
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -691,7 +752,7 @@ async function startMic() {
|
|||||||
|
|
||||||
state.micSourceNode.connect(state.recorderNode);
|
state.micSourceNode.connect(state.recorderNode);
|
||||||
state.micEnabled = true;
|
state.micEnabled = true;
|
||||||
addWsLog("system", "mic capture started (binary input.audio frames)");
|
addWsLog("system", "麦克风已开启(PCM 音频流)");
|
||||||
setMicButton();
|
setMicButton();
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -727,7 +788,7 @@ function stopMic() {
|
|||||||
state.micEnabled = false;
|
state.micEnabled = false;
|
||||||
updateMeter(0);
|
updateMeter(0);
|
||||||
if (wasEnabled) {
|
if (wasEnabled) {
|
||||||
addWsLog("system", "mic capture stopped");
|
addWsLog("system", "麦克风已关闭");
|
||||||
}
|
}
|
||||||
setMicButton();
|
setMicButton();
|
||||||
}
|
}
|
||||||
@@ -807,6 +868,272 @@ function resetPlaybackClock() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* ------------------------------------------------------ Camera / image */
|
||||||
|
|
||||||
|
// Switch the preview container between its visual modes by toggling the
// "is-camera" / "is-photo" classes. Any mode other than "camera" or "photo"
// (e.g. "idle") clears both.
function setPreviewMode(mode) {
  const { classList } = els.cameraPreview;
  classList.toggle("is-camera", mode === "camera");
  classList.toggle("is-photo", mode === "photo");
}
|
||||||
|
|
||||||
|
// Draw an <img>/<video> source to the shared canvas and return a normalized
// payload (JPEG data URL + dimensions) suitable for an `input.image` message.
//
// Returns null when the source has no usable dimensions yet, or when drawing
// or encoding fails (the failure is reported through addWsLog).
function mediaToPayload(source) {
  const srcW = source.videoWidth || source.naturalWidth || source.width;
  const srcH = source.videoHeight || source.naturalHeight || source.height;
  if (!srcW || !srcH) return null;

  let w = srcW;
  let h = srcH;
  const longest = Math.max(w, h);
  if (longest > IMAGE_MAX_DIM) {
    const scale = IMAGE_MAX_DIM / longest;
    // Clamp to 1px: for extreme aspect ratios the short side can round down
    // to 0, which would yield an empty canvas and an unusable "data:," URL.
    w = Math.max(1, Math.round(w * scale));
    h = Math.max(1, Math.round(h * scale));
  }

  const canvas = els.cameraCanvas;
  canvas.width = w;
  canvas.height = h;

  let dataUrl;
  try {
    // getContext() may return null and drawImage() may throw for a source in
    // a broken state; keep both inside the try so every failure is reported
    // the same way instead of escaping as an uncaught exception.
    const ctx = canvas.getContext("2d");
    ctx.drawImage(source, 0, 0, w, h);
    dataUrl = canvas.toDataURL("image/jpeg", IMAGE_JPEG_QUALITY);
  } catch (err) {
    addWsLog("system", `图片编码失败:${err.message || err}`);
    return null;
  }
  return { dataUrl, mime: "image/jpeg", width: w, height: h };
}
|
||||||
|
|
||||||
|
// Remember `payload` as the image waiting to be submitted. A non-empty payload
// is also shown in the photo preview; the submit button state is refreshed in
// either case.
function setPendingImage(payload) {
  state.pendingImage = payload;
  if (!payload) {
    setCameraButtonEnabled();
    return;
  }
  els.cameraPhoto.src = payload.dataUrl;
  setPreviewMode("photo");
  setCameraButtonEnabled();
}
|
||||||
|
|
||||||
|
// Re-enumerate media devices and store the video inputs in
// state.videoDevices. Any failure leaves the list empty.
async function refreshVideoDevices() {
  let cameras = [];
  try {
    const all = await navigator.mediaDevices.enumerateDevices();
    cameras = all.filter((device) => device.kind === "videoinput");
  } catch (_) {
    // Best effort: fall through with the empty list.
    cameras = [];
  }
  state.videoDevices = cameras;
}
|
||||||
|
|
||||||
|
// Rebuild the camera dropdown from state.videoDevices. Device labels are only
// exposed once camera permission has been granted, so until then entries fall
// back to generic names ("摄像头 1", …); with no devices at all a single
// disabled default option is shown. When `activeDeviceId` is given, that entry
// is selected.
function populateDeviceSelect(activeDeviceId) {
  const select = els.cameraDeviceSelect;
  const cameras = state.videoDevices;

  const addOption = (value, label) => {
    const option = document.createElement("option");
    option.value = value;
    option.textContent = label;
    select.appendChild(option);
  };

  select.innerHTML = "";

  if (cameras.length === 0) {
    addOption("", "默认摄像头");
    select.disabled = true;
    return;
  }

  cameras.forEach((camera, position) => {
    addOption(camera.deviceId, camera.label || `摄像头 ${position + 1}`);
  });
  select.disabled = false;
  if (activeDeviceId) {
    select.value = activeDeviceId;
  }
}
|
||||||
|
|
||||||
|
// Open the camera and show its live feed in the preview.
//
// `deviceId` selects a specific camera; when omitted, the camera is chosen by
// state.cameraFacing. Errors are reported through addWsLog and leave the
// camera off.
//
// The function awaits several times, and stopCameraStream() or another
// startCamera() call can run in between. Each resume point therefore checks
// that the stream it opened is still the current one before touching state.
async function startCamera(deviceId) {
  if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) {
    addWsLog("system", "该浏览器不支持摄像头访问");
    return;
  }
  stopCameraStream();

  const video = deviceId
    ? { deviceId: { exact: deviceId } }
    : { facingMode: state.cameraFacing };

  let stream;
  try {
    stream = await navigator.mediaDevices.getUserMedia({
      video,
      audio: false,
    });
  } catch (err) {
    addWsLog("system", `摄像头错误:${err.message || err}`);
    return;
  }

  // An overlapping startCamera() call may have installed its own stream while
  // this one was waiting for permission; stop it so its tracks are not leaked
  // when state.cameraStream is overwritten below.
  stopCameraStream();
  state.cameraStream = stream;
  els.cameraVideo.srcObject = stream;
  try {
    await els.cameraVideo.play();
  } catch (_) {
    /* autoplay may resolve later */
  }
  // The stream was stopped or replaced while play() was pending: whoever did
  // that already stopped its tracks, so just bail out instead of flagging the
  // camera as active with no stream behind it.
  if (state.cameraStream !== stream) return;

  state.cameraActive = true;
  state.pendingImage = null;
  setPreviewMode("camera");
  els.cameraStartBtn.classList.add("is-active");
  clearSampleSelection();

  // Device labels become available only after permission is granted; refresh
  // the dropdown now and select whichever camera is actually streaming.
  await refreshVideoDevices();
  if (state.cameraStream !== stream) return;

  const activeId =
    stream.getVideoTracks?.()[0]?.getSettings?.().deviceId || deviceId;
  populateDeviceSelect(activeId);
  // Reveal the camera device dropdown only while the camera is in use.
  els.cameraDeviceRow.hidden = false;
  setCameraButtonEnabled();
}
|
||||||
|
|
||||||
|
// Stop every track of the current camera stream (if any) and return the
// camera UI to its "off" state: video detached, start button no longer
// highlighted, device dropdown row hidden.
function stopCameraStream() {
  const stream = state.cameraStream;
  if (stream) {
    for (const track of stream.getTracks()) {
      track.stop();
    }
    state.cameraStream = null;
  }
  els.cameraVideo.srcObject = null;
  state.cameraActive = false;
  els.cameraStartBtn.classList.remove("is-active");
  els.cameraDeviceRow.hidden = true;
}
|
||||||
|
|
||||||
|
// Grab the current video frame as an image payload. On success the camera is
// stopped and the frame becomes the pending image; returns the payload, or
// null when no frame could be produced (the camera is then left running).
function captureFromCamera() {
  const frame = mediaToPayload(els.cameraVideo);
  if (frame) {
    stopCameraStream();
    setPendingImage(frame);
    return frame;
  }
  return null;
}
|
||||||
|
|
||||||
|
// Load a same-origin/object URL into a detached <img>. The returned promise
// resolves with the element once it has loaded and rejects with an Error
// naming the URL when loading fails.
function loadImage(src) {
  const element = new Image();
  const loaded = new Promise((resolve, reject) => {
    element.onload = () => resolve(element);
    element.onerror = () => reject(new Error(`failed to load image: ${src}`));
  });
  // Handlers are attached above, before the load is kicked off.
  element.src = src;
  return loaded;
}
|
||||||
|
|
||||||
|
// Turn a user-picked file into the pending image. Does nothing for a missing
// file; load/encode errors are reported through addWsLog. The temporary object
// URL is always revoked.
async function selectFileImage(file) {
  if (!file) return;

  const objectUrl = URL.createObjectURL(file);
  try {
    const decoded = await loadImage(objectUrl);
    const payload = mediaToPayload(decoded);
    if (payload) {
      // An uploaded picture replaces both the live camera and any sample pick.
      stopCameraStream();
      clearSampleSelection();
      setPendingImage(payload);
    }
  } catch (err) {
    addWsLog("system", `上传错误:${err.message || err}`);
  } finally {
    URL.revokeObjectURL(objectUrl);
  }
}
|
||||||
|
|
||||||
|
// Load the sample image at `src` and make it the pending image, marking
// `buttonEl` (when provided) as the selected thumbnail. Load errors are
// reported through addWsLog.
async function selectSampleImage(src, buttonEl) {
  try {
    const decoded = await loadImage(src);
    const payload = mediaToPayload(decoded);
    if (payload) {
      stopCameraStream();
      // Clear the previous highlight before marking the new thumbnail.
      clearSampleSelection();
      if (buttonEl) {
        buttonEl.classList.add("is-selected");
      }
      setPendingImage(payload);
    }
  } catch (err) {
    addWsLog("system", `示例图加载错误:${err.message || err}`);
  }
}
|
||||||
|
|
||||||
|
// Remove the "is-selected" highlight from every sample thumbnail.
function clearSampleSelection() {
  const highlighted = els.cameraSamples.querySelectorAll(
    ".camera-drawer__sample.is-selected",
  );
  for (const thumbnail of highlighted) {
    thumbnail.classList.remove("is-selected");
  }
}
|
||||||
|
|
||||||
|
// Build one clickable thumbnail button per entry of SAMPLE_IMAGES. Runs at
// most once: later calls return immediately thanks to state.samplesRendered.
function renderSampleThumbnails() {
  if (state.samplesRendered) return;
  state.samplesRendered = true;

  const container = els.cameraSamples;
  container.innerHTML = "";

  for (const sample of SAMPLE_IMAGES) {
    const thumbnail = document.createElement("img");
    thumbnail.src = sample.src;
    thumbnail.alt = sample.label;

    const button = document.createElement("button");
    button.type = "button";
    button.className = "camera-drawer__sample";
    button.title = sample.label;
    button.appendChild(thumbnail);
    button.addEventListener("click", () => {
      selectSampleImage(sample.src, button);
    });

    container.appendChild(button);
  }
}
|
||||||
|
|
||||||
|
// Return the camera drawer to its idle state: no pending image, camera off,
// no highlighted sample, empty photo preview.
function resetCameraInput() {
  state.pendingImage = null;
  stopCameraStream();
  clearSampleSelection();

  // Drop the previous photo so the idle preview does not show a stale image.
  els.cameraPhoto.removeAttribute("src");
  setPreviewMode("idle");

  // Refreshed last, once the state above is final.
  setCameraButtonEnabled();
}
|
||||||
|
|
||||||
|
// Pre-select the first sample image so "拍摄完成" is immediately pressable when
// the drawer opens, without requiring the user to capture or pick first.
// Skipped when an image is already pending or the camera is live.
function selectDefaultImage() {
  if (state.pendingImage || state.cameraActive) return;

  const firstButton = els.cameraSamples.querySelector(".camera-drawer__sample");
  const firstSample = SAMPLE_IMAGES[0];
  if (!firstButton || !firstSample) return;

  selectSampleImage(firstSample.src, firstButton);
}
|
||||||
|
|
||||||
|
// Send `payload` as an `input.image` message over the open websocket and show
// it locally as a user bubble. `text` defaults to CAMERA_DONE_TEXT. Returns
// true when the message was sent, false when there is no payload or the
// socket is not open.
function sendImage(payload, text) {
  const socket = state.ws;
  if (!payload || !socket || socket.readyState !== WebSocket.OPEN) {
    return false;
  }

  const caption = text || CAMERA_DONE_TEXT;
  wsSend(
    JSON.stringify({
      type: "input.image",
      image: payload.dataUrl,
      mime_type: payload.mime,
      width: payload.width,
      height: payload.height,
      text: caption,
      interrupt: true,
    }),
  );

  // Mirror the text-input path: interrupt in-flight bot audio and render the
  // user's image + text together as one local bubble (the engine does not echo
  // image input back as a transcript event).
  stopPlaybackQueue();
  state.currentAssistantBubble = null;
  addImageBubble("user", payload.dataUrl, caption);
  return true;
}
|
||||||
|
|
||||||
|
// Submit the camera step: send the chosen image together with the
// CAMERA_DONE_TEXT marker, then reset the drawer once the send succeeded.
function submitCameraImage() {
  // With the live camera on, the current frame wins; otherwise (or when the
  // grab fails) fall back to the already-selected uploaded / sample / captured
  // image.
  const pending = state.pendingImage;
  const payload = state.cameraActive
    ? captureFromCamera() || pending
    : pending;
  if (!payload) return;

  // Keep the existing workflow contract: the accompanying text stays the
  // "【拍摄完成】" marker that advances the FastGPT camera step; the image is
  // the new multimodal attachment.
  const sent = sendImage(payload, CAMERA_DONE_TEXT);
  if (sent) {
    resetCameraInput();
  }
}
|
||||||
|
|
||||||
/* --------------------------------------------------------- Chat updates */
|
/* --------------------------------------------------------- Chat updates */
|
||||||
|
|
||||||
function handleUserTranscript(text) {
|
function handleUserTranscript(text) {
|
||||||
@@ -864,7 +1191,6 @@ function handleAssistantFinal(text, interrupted) {
|
|||||||
if (interrupted) {
|
if (interrupted) {
|
||||||
state.currentAssistantBubble.classList.add("bubble--interrupted");
|
state.currentAssistantBubble.classList.add("bubble--interrupted");
|
||||||
}
|
}
|
||||||
updateCameraQuestion(text);
|
|
||||||
state.currentAssistantBubble = null;
|
state.currentAssistantBubble = null;
|
||||||
scrollChatToBottom();
|
scrollChatToBottom();
|
||||||
}
|
}
|
||||||
@@ -930,16 +1256,16 @@ async function connect() {
|
|||||||
const chatId = inputChatId || generateChatId();
|
const chatId = inputChatId || generateChatId();
|
||||||
const url = wsUrlWithChatId(chatId);
|
const url = wsUrlWithChatId(chatId);
|
||||||
if (!url) {
|
if (!url) {
|
||||||
setStatus("error", "Missing URL");
|
setStatus("error", "缺少服务器地址");
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
state.connecting = true;
|
state.connecting = true;
|
||||||
state.chatId = chatId;
|
state.chatId = chatId;
|
||||||
els.chatId.value = chatId;
|
els.chatId.value = chatId;
|
||||||
setStatus("connecting", "Connecting…");
|
setStatus("connecting", "连接中…");
|
||||||
setConnectButton();
|
setConnectButton();
|
||||||
addWsLog("system", `connecting ${url}`);
|
addWsLog("system", `正在连接 ${url}`);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
// Pre-warm audio context on user gesture so playback works on Safari.
|
// Pre-warm audio context on user gesture so playback works on Safari.
|
||||||
@@ -949,9 +1275,9 @@ async function connect() {
|
|||||||
state.connecting = false;
|
state.connecting = false;
|
||||||
state.chatId = "";
|
state.chatId = "";
|
||||||
if (!inputChatId) els.chatId.value = "";
|
if (!inputChatId) els.chatId.value = "";
|
||||||
setStatus("error", "Audio init failed");
|
setStatus("error", "音频初始化失败");
|
||||||
setConnectButton();
|
setConnectButton();
|
||||||
addWsLog("error", `audio init failed: ${err.message || err}`, "error");
|
addWsLog("error", `音频初始化失败:${err.message || err}`, "error");
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -963,9 +1289,9 @@ async function connect() {
|
|||||||
state.connecting = false;
|
state.connecting = false;
|
||||||
state.chatId = "";
|
state.chatId = "";
|
||||||
if (!inputChatId) els.chatId.value = "";
|
if (!inputChatId) els.chatId.value = "";
|
||||||
setStatus("error", "Bad URL");
|
setStatus("error", "服务器地址无效");
|
||||||
setConnectButton();
|
setConnectButton();
|
||||||
addWsLog("error", `bad websocket URL: ${err.message || err}`, "error");
|
addWsLog("error", `WebSocket 地址无效:${err.message || err}`, "error");
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
ws.binaryType = "arraybuffer";
|
ws.binaryType = "arraybuffer";
|
||||||
@@ -986,15 +1312,15 @@ async function connect() {
|
|||||||
state.connecting = false;
|
state.connecting = false;
|
||||||
state.connected = true;
|
state.connected = true;
|
||||||
resetPlaybackClock();
|
resetPlaybackClock();
|
||||||
addWsLog("system", "websocket open");
|
addWsLog("system", "连接已建立");
|
||||||
setStatus("connected", "Connected");
|
setStatus("connected", "已连接");
|
||||||
setConnectButton();
|
setConnectButton();
|
||||||
setMicButton();
|
setMicButton();
|
||||||
setMicSelectEnabled();
|
setMicSelectEnabled();
|
||||||
refreshMicDevices();
|
refreshMicDevices();
|
||||||
|
|
||||||
wsSend(JSON.stringify(startMessage));
|
wsSend(JSON.stringify(startMessage));
|
||||||
addBubble("system", "Session started.");
|
addBubble("system", "会话已开始。");
|
||||||
setComposerEnabled(true);
|
setComposerEnabled(true);
|
||||||
setCameraButtonEnabled();
|
setCameraButtonEnabled();
|
||||||
els.textInput.focus();
|
els.textInput.focus();
|
||||||
@@ -1026,7 +1352,7 @@ async function connect() {
|
|||||||
|
|
||||||
ws.addEventListener("error", (err) => {
|
ws.addEventListener("error", (err) => {
|
||||||
console.error("WebSocket error", err);
|
console.error("WebSocket error", err);
|
||||||
setStatus("error", "Connection error");
|
setStatus("error", "连接错误");
|
||||||
addWsLog("error", "websocket error", "error");
|
addWsLog("error", "websocket error", "error");
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -1055,11 +1381,11 @@ async function connect() {
|
|||||||
if (wasConnected) {
|
if (wasConnected) {
|
||||||
addBubble(
|
addBubble(
|
||||||
"system",
|
"system",
|
||||||
`Session ended${event.reason ? ` — ${event.reason}` : ""}.`,
|
`会话已结束${event.reason ? `:${event.reason}` : ""}。`,
|
||||||
);
|
);
|
||||||
setStatus("idle", "Disconnected");
|
setStatus("idle", "未连接");
|
||||||
} else {
|
} else {
|
||||||
setStatus("error", "Connection closed");
|
setStatus("error", "连接已断开");
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
@@ -1101,7 +1427,7 @@ els.micBtn.addEventListener("click", async () => {
|
|||||||
}
|
}
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
console.error("Mic error", err);
|
console.error("Mic error", err);
|
||||||
addBubble("system", `Mic error: ${err.message || err}`);
|
addBubble("system", `麦克风错误:${err.message || err}`);
|
||||||
} finally {
|
} finally {
|
||||||
els.micBtn.disabled = !state.connected;
|
els.micBtn.disabled = !state.connected;
|
||||||
}
|
}
|
||||||
@@ -1118,7 +1444,7 @@ els.micSelect.addEventListener("change", async () => {
|
|||||||
await startMic();
|
await startMic();
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
console.error("Mic switch error", err);
|
console.error("Mic switch error", err);
|
||||||
addBubble("system", `Mic switch error: ${err.message || err}`);
|
addBubble("system", `麦克风切换错误:${err.message || err}`);
|
||||||
} finally {
|
} finally {
|
||||||
setMicButton();
|
setMicButton();
|
||||||
setMicSelectEnabled();
|
setMicSelectEnabled();
|
||||||
@@ -1139,7 +1465,25 @@ els.clearWsLogBtn.addEventListener("click", () => {
|
|||||||
|
|
||||||
els.cameraDoneBtn.addEventListener("click", () => {
|
els.cameraDoneBtn.addEventListener("click", () => {
|
||||||
if (!state.cameraState) return;
|
if (!state.cameraState) return;
|
||||||
sendText(CAMERA_DONE_TEXT);
|
submitCameraImage();
|
||||||
|
});
|
||||||
|
|
||||||
|
// Start the camera with the device currently chosen in the dropdown; an empty
// selection is passed as undefined so startCamera picks the camera itself.
els.cameraStartBtn.addEventListener("click", () => {
  const chosenDevice = els.cameraDeviceSelect.value || undefined;
  startCamera(chosenDevice);
});
|
||||||
|
|
||||||
|
// Switching device only restarts the stream when the camera is already live;
// otherwise the choice is applied when "使用摄像头" is pressed.
els.cameraDeviceSelect.addEventListener("change", () => {
  if (!state.cameraActive) return;
  const chosenDevice = els.cameraDeviceSelect.value || undefined;
  startCamera(chosenDevice);
});
|
||||||
|
|
||||||
|
els.cameraUpload.addEventListener("change", (event) => {
|
||||||
|
const file = event.target.files && event.target.files[0];
|
||||||
|
selectFileImage(file);
|
||||||
|
event.target.value = "";
|
||||||
});
|
});
|
||||||
|
|
||||||
function autosizeTextarea() {
|
function autosizeTextarea() {
|
||||||
@@ -1174,6 +1518,7 @@ els.textInput.addEventListener("keydown", (event) => {
|
|||||||
});
|
});
|
||||||
|
|
||||||
window.addEventListener("beforeunload", () => {
|
window.addEventListener("beforeunload", () => {
|
||||||
|
stopCameraStream();
|
||||||
if (state.ws) {
|
if (state.ws) {
|
||||||
try {
|
try {
|
||||||
state.ws.close();
|
state.ws.close();
|
||||||
@@ -1192,7 +1537,7 @@ window.addEventListener("beforeunload", () => {
|
|||||||
|
|
||||||
els.url.value = defaultWsUrl();
|
els.url.value = defaultWsUrl();
|
||||||
|
|
||||||
setStatus("idle", "Disconnected");
|
setStatus("idle", "未连接");
|
||||||
setConnectButton();
|
setConnectButton();
|
||||||
setMicButton();
|
setMicButton();
|
||||||
setMicSelectEnabled();
|
setMicSelectEnabled();
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
<!doctype html>
|
<!doctype html>
|
||||||
<html lang="en">
|
<html lang="zh-CN">
|
||||||
<head>
|
<head>
|
||||||
<meta charset="utf-8" />
|
<meta charset="utf-8" />
|
||||||
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
||||||
@@ -16,7 +16,7 @@
|
|||||||
|
|
||||||
<div class="connection">
|
<div class="connection">
|
||||||
<label class="connection__field">
|
<label class="connection__field">
|
||||||
<span>WebSocket URL</span>
|
<span>服务器地址</span>
|
||||||
<input
|
<input
|
||||||
id="ws-url"
|
id="ws-url"
|
||||||
type="text"
|
type="text"
|
||||||
@@ -26,12 +26,12 @@
|
|||||||
/>
|
/>
|
||||||
</label>
|
</label>
|
||||||
<label class="connection__field connection__field--chat">
|
<label class="connection__field connection__field--chat">
|
||||||
<span>Chat ID</span>
|
<span>会话 ID</span>
|
||||||
<div class="chat-id-control">
|
<div class="chat-id-control">
|
||||||
<input
|
<input
|
||||||
id="chat-id"
|
id="chat-id"
|
||||||
type="text"
|
type="text"
|
||||||
placeholder="optional chatId"
|
placeholder="可选"
|
||||||
spellcheck="false"
|
spellcheck="false"
|
||||||
autocomplete="off"
|
autocomplete="off"
|
||||||
/>
|
/>
|
||||||
@@ -40,8 +40,8 @@
|
|||||||
class="chat-id-control__copy"
|
class="chat-id-control__copy"
|
||||||
type="button"
|
type="button"
|
||||||
disabled
|
disabled
|
||||||
title="Copy Chat ID"
|
title="复制会话 ID"
|
||||||
aria-label="Copy Chat ID"
|
aria-label="复制会话 ID"
|
||||||
>
|
>
|
||||||
<svg class="copy-icon copy-icon--default" viewBox="0 0 16 16" width="14" height="14" fill="none" aria-hidden="true">
|
<svg class="copy-icon copy-icon--default" viewBox="0 0 16 16" width="14" height="14" fill="none" aria-hidden="true">
|
||||||
<rect x="5" y="5" width="8" height="9" rx="1.5" stroke="currentColor" stroke-width="1.4"/>
|
<rect x="5" y="5" width="8" height="9" rx="1.5" stroke="currentColor" stroke-width="1.4"/>
|
||||||
@@ -54,13 +54,13 @@
|
|||||||
</div>
|
</div>
|
||||||
</label>
|
</label>
|
||||||
<button id="connect-btn" class="btn btn--primary" type="button">
|
<button id="connect-btn" class="btn btn--primary" type="button">
|
||||||
Connect
|
连接
|
||||||
</button>
|
</button>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div class="status">
|
<div class="status">
|
||||||
<span id="status-dot" class="status__dot status__dot--idle"></span>
|
<span id="status-dot" class="status__dot status__dot--idle"></span>
|
||||||
<span id="status-text" class="status__text">Disconnected</span>
|
<span id="status-text" class="status__text">未连接</span>
|
||||||
</div>
|
</div>
|
||||||
</header>
|
</header>
|
||||||
|
|
||||||
@@ -70,28 +70,87 @@
|
|||||||
<aside
|
<aside
|
||||||
id="camera-drawer"
|
id="camera-drawer"
|
||||||
class="camera-drawer"
|
class="camera-drawer"
|
||||||
aria-label="Camera capture step"
|
aria-label="拍照步骤"
|
||||||
aria-hidden="true"
|
aria-hidden="true"
|
||||||
>
|
>
|
||||||
<div class="camera-drawer__panel">
|
<div class="camera-drawer__panel">
|
||||||
<div class="camera-drawer__header">
|
<div class="camera-drawer__header">
|
||||||
<div>
|
<div>
|
||||||
<p class="camera-drawer__eyebrow">Camera</p>
|
<p class="camera-drawer__eyebrow">拍照</p>
|
||||||
<h2>拍照步骤</h2>
|
<h2>拍照步骤</h2>
|
||||||
</div>
|
</div>
|
||||||
<span id="camera-state" class="camera-drawer__state">State -</span>
|
<span id="camera-state" class="camera-drawer__state">状态 -</span>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div class="camera-drawer__preview" aria-hidden="true">
|
<div id="camera-preview" class="camera-drawer__preview">
|
||||||
|
<video
|
||||||
|
id="camera-video"
|
||||||
|
class="camera-drawer__video"
|
||||||
|
playsinline
|
||||||
|
muted
|
||||||
|
autoplay
|
||||||
|
></video>
|
||||||
|
<img
|
||||||
|
id="camera-photo"
|
||||||
|
class="camera-drawer__photo"
|
||||||
|
alt="已选择图片预览"
|
||||||
|
/>
|
||||||
<span class="camera-drawer__corner camera-drawer__corner--tl"></span>
|
<span class="camera-drawer__corner camera-drawer__corner--tl"></span>
|
||||||
<span class="camera-drawer__corner camera-drawer__corner--tr"></span>
|
<span class="camera-drawer__corner camera-drawer__corner--tr"></span>
|
||||||
<span class="camera-drawer__corner camera-drawer__corner--bl"></span>
|
<span class="camera-drawer__corner camera-drawer__corner--bl"></span>
|
||||||
<span class="camera-drawer__corner camera-drawer__corner--br"></span>
|
<span class="camera-drawer__corner camera-drawer__corner--br"></span>
|
||||||
<span class="camera-drawer__lens"></span>
|
<span class="camera-drawer__lens"></span>
|
||||||
<span class="camera-drawer__scan"></span>
|
<span class="camera-drawer__scan"></span>
|
||||||
|
<span id="camera-placeholder" class="camera-drawer__placeholder">
|
||||||
|
打开摄像头实时拍摄,或从下方选择 / 上传图片
|
||||||
|
</span>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<p id="camera-question" class="camera-drawer__question"></p>
|
<p id="camera-question" class="camera-drawer__question"></p>
|
||||||
|
|
||||||
|
<div
|
||||||
|
id="camera-samples"
|
||||||
|
class="camera-drawer__samples"
|
||||||
|
aria-label="示例图片,点击选择"
|
||||||
|
></div>
|
||||||
|
|
||||||
|
<div class="camera-drawer__sources">
|
||||||
|
<label
|
||||||
|
class="btn btn--ghost camera-drawer__source"
|
||||||
|
>
|
||||||
|
上传图片
|
||||||
|
<input
|
||||||
|
id="camera-upload"
|
||||||
|
type="file"
|
||||||
|
accept="image/*"
|
||||||
|
hidden
|
||||||
|
/>
|
||||||
|
</label>
|
||||||
|
<button
|
||||||
|
id="camera-start-btn"
|
||||||
|
class="btn btn--ghost camera-drawer__source"
|
||||||
|
type="button"
|
||||||
|
title="打开摄像头"
|
||||||
|
>
|
||||||
|
使用摄像头
|
||||||
|
</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<label
|
||||||
|
id="camera-device-row"
|
||||||
|
class="device-picker camera-drawer__device-row"
|
||||||
|
hidden
|
||||||
|
>
|
||||||
|
<span class="device-picker__label">选择摄像头</span>
|
||||||
|
<select
|
||||||
|
id="camera-device-select"
|
||||||
|
class="device-picker__select"
|
||||||
|
disabled
|
||||||
|
>
|
||||||
|
<option value="">默认摄像头</option>
|
||||||
|
</select>
|
||||||
|
</label>
|
||||||
|
|
||||||
<button
|
<button
|
||||||
id="camera-done-btn"
|
id="camera-done-btn"
|
||||||
class="btn btn--primary camera-drawer__button"
|
class="btn btn--primary camera-drawer__button"
|
||||||
@@ -100,23 +159,24 @@
|
|||||||
>
|
>
|
||||||
拍摄完成
|
拍摄完成
|
||||||
</button>
|
</button>
|
||||||
|
<canvas id="camera-canvas" hidden></canvas>
|
||||||
</div>
|
</div>
|
||||||
</aside>
|
</aside>
|
||||||
|
|
||||||
<section class="chat" aria-label="Conversation history">
|
<section class="chat" aria-label="对话记录">
|
||||||
<div id="chat-log" class="chat__log" role="log" aria-live="polite">
|
<div id="chat-log" class="chat__log" role="log" aria-live="polite">
|
||||||
<div class="chat__empty">
|
<div class="chat__empty">
|
||||||
<p>Connect to the engine, enable your mic, and start talking.</p>
|
<p>连接服务、开启麦克风后即可开始对话。</p>
|
||||||
<p class="chat__hint">
|
<p class="chat__hint">
|
||||||
Audio is streamed as PCM16 mono @ 16 kHz over
|
音频通过 <code>/ws-product</code> 以 PCM16 单声道 16 kHz
|
||||||
<code>/ws-product</code>.
|
传输。
|
||||||
</p>
|
</p>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</section>
|
</section>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<footer class="controls" aria-label="Chat controls">
|
<footer class="controls" aria-label="操作栏">
|
||||||
<div class="meter" aria-hidden="true">
|
<div class="meter" aria-hidden="true">
|
||||||
<div id="meter-fill" class="meter__fill"></div>
|
<div id="meter-fill" class="meter__fill"></div>
|
||||||
</div>
|
</div>
|
||||||
@@ -126,7 +186,7 @@
|
|||||||
id="text-input"
|
id="text-input"
|
||||||
class="composer__input"
|
class="composer__input"
|
||||||
rows="1"
|
rows="1"
|
||||||
placeholder="Type a message, or use the mic…"
|
placeholder="输入消息,或使用麦克风…"
|
||||||
disabled
|
disabled
|
||||||
></textarea>
|
></textarea>
|
||||||
<button
|
<button
|
||||||
@@ -134,17 +194,17 @@
|
|||||||
class="btn btn--primary composer__send"
|
class="btn btn--primary composer__send"
|
||||||
type="submit"
|
type="submit"
|
||||||
disabled
|
disabled
|
||||||
title="Send message (Enter)"
|
title="发送消息 (Enter)"
|
||||||
>
|
>
|
||||||
Send
|
发送
|
||||||
</button>
|
</button>
|
||||||
</form>
|
</form>
|
||||||
|
|
||||||
<div class="controls__row">
|
<div class="controls__row">
|
||||||
<label class="device-picker">
|
<label class="device-picker">
|
||||||
<span class="device-picker__label">Microphone</span>
|
<span class="device-picker__label">麦克风</span>
|
||||||
<select id="mic-select" class="device-picker__select" disabled>
|
<select id="mic-select" class="device-picker__select" disabled>
|
||||||
<option value="">Default microphone</option>
|
<option value="">默认麦克风</option>
|
||||||
</select>
|
</select>
|
||||||
</label>
|
</label>
|
||||||
|
|
||||||
@@ -154,7 +214,7 @@
|
|||||||
type="button"
|
type="button"
|
||||||
disabled
|
disabled
|
||||||
aria-pressed="false"
|
aria-pressed="false"
|
||||||
title="Mic is off"
|
title="麦克风已关闭"
|
||||||
>
|
>
|
||||||
<svg
|
<svg
|
||||||
class="mic-btn__icon"
|
class="mic-btn__icon"
|
||||||
@@ -172,52 +232,52 @@
|
|||||||
fill="currentColor"
|
fill="currentColor"
|
||||||
/>
|
/>
|
||||||
</svg>
|
</svg>
|
||||||
<span class="mic-btn__label">Enable mic</span>
|
<span class="mic-btn__label">开启麦克风</span>
|
||||||
</button>
|
</button>
|
||||||
|
|
||||||
<div class="indicators">
|
<div class="indicators">
|
||||||
<span id="mic-indicator" class="indicator">
|
<span id="mic-indicator" class="indicator">
|
||||||
<span class="indicator__dot indicator__dot--mic"></span>
|
<span class="indicator__dot indicator__dot--mic"></span>
|
||||||
<span class="indicator__label">Mic</span>
|
<span class="indicator__label">麦克风</span>
|
||||||
</span>
|
</span>
|
||||||
<span id="bot-indicator" class="indicator">
|
<span id="bot-indicator" class="indicator">
|
||||||
<span class="indicator__dot indicator__dot--bot"></span>
|
<span class="indicator__dot indicator__dot--bot"></span>
|
||||||
<span class="indicator__label">Bot</span>
|
<span class="indicator__label">助手</span>
|
||||||
</span>
|
</span>
|
||||||
<span id="state-indicator" class="indicator indicator--state">
|
<span id="state-indicator" class="indicator indicator--state">
|
||||||
<span class="indicator__dot indicator__dot--state"></span>
|
<span class="indicator__dot indicator__dot--state"></span>
|
||||||
<span id="state-label" class="indicator__label">State -</span>
|
<span id="state-label" class="indicator__label">状态 -</span>
|
||||||
</span>
|
</span>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<button id="clear-btn" class="btn btn--ghost" type="button">
|
<button id="clear-btn" class="btn btn--ghost" type="button">
|
||||||
Clear
|
清空
|
||||||
</button>
|
</button>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<p class="hint">
|
<p class="hint">
|
||||||
Press <kbd>Enter</kbd> to send, <kbd>Shift</kbd>+<kbd>Enter</kbd>
|
按 <kbd>Enter</kbd> 发送,<kbd>Shift</kbd>+<kbd>Enter</kbd>
|
||||||
for newline. Sending text will interrupt the bot if it's speaking.
|
换行。发送文字会打断正在说话的助手。
|
||||||
Browser echo cancellation is on; use headphones if echo persists.
|
浏览器回声消除已开启,如有回音请使用耳机。
|
||||||
</p>
|
</p>
|
||||||
</footer>
|
</footer>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<section class="ws-log" aria-label="WebSocket log">
|
<section class="ws-log" aria-label="WebSocket 日志">
|
||||||
<div class="ws-log__header">
|
<div class="ws-log__header">
|
||||||
<div class="ws-log__header-left">
|
<div class="ws-log__header-left">
|
||||||
<h2>WebSocket Log</h2>
|
<h2>WebSocket 日志</h2>
|
||||||
<div class="ws-log__legend" aria-hidden="true">
|
<div class="ws-log__legend" aria-hidden="true">
|
||||||
<span class="ws-log__legend-item ws-log__legend-item--send">Send</span>
|
<span class="ws-log__legend-item ws-log__legend-item--send">发送</span>
|
||||||
<span class="ws-log__legend-item ws-log__legend-item--recv">Recv</span>
|
<span class="ws-log__legend-item ws-log__legend-item--recv">接收</span>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
<button id="clear-ws-log-btn" class="btn btn--ghost" type="button">
|
<button id="clear-ws-log-btn" class="btn btn--ghost" type="button">
|
||||||
Clear log
|
清空日志
|
||||||
</button>
|
</button>
|
||||||
</div>
|
</div>
|
||||||
<div id="ws-log" class="ws-log__body" role="log" aria-live="polite">
|
<div id="ws-log" class="ws-log__body" role="log" aria-live="polite">
|
||||||
<div class="ws-log__empty">No websocket events yet.</div>
|
<div class="ws-log__empty">暂无 WebSocket 事件。</div>
|
||||||
</div>
|
</div>
|
||||||
</section>
|
</section>
|
||||||
</div>
|
</div>
|
||||||
|
|||||||
BIN
static/voice-demo/samples/.DS_Store
vendored
Normal file
BIN
static/voice-demo/samples/.DS_Store
vendored
Normal file
Binary file not shown.
BIN
static/voice-demo/samples/damage1.png
Normal file
BIN
static/voice-demo/samples/damage1.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 273 KiB |
BIN
static/voice-demo/samples/damage2.png
Normal file
BIN
static/voice-demo/samples/damage2.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 323 KiB |
BIN
static/voice-demo/samples/plate1.jpg
Normal file
BIN
static/voice-demo/samples/plate1.jpg
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 7.5 KiB |
BIN
static/voice-demo/samples/plate2.jpg
Normal file
BIN
static/voice-demo/samples/plate2.jpg
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 229 KiB |
BIN
static/voice-demo/samples/user1.jpg
Normal file
BIN
static/voice-demo/samples/user1.jpg
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 72 KiB |
BIN
static/voice-demo/samples/user2.jpg
Normal file
BIN
static/voice-demo/samples/user2.jpg
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 105 KiB |
@@ -136,7 +136,8 @@ body {
|
|||||||
|
|
||||||
.camera-drawer__preview {
|
.camera-drawer__preview {
|
||||||
position: relative;
|
position: relative;
|
||||||
min-height: 210px;
|
aspect-ratio: 4 / 3;
|
||||||
|
min-height: 200px;
|
||||||
overflow: hidden;
|
overflow: hidden;
|
||||||
border: 1px solid rgba(149, 160, 187, 0.28);
|
border: 1px solid rgba(149, 160, 187, 0.28);
|
||||||
border-radius: 14px;
|
border-radius: 14px;
|
||||||
@@ -148,6 +149,49 @@ body {
|
|||||||
background-size: 34px 34px, 34px 34px, auto, auto;
|
background-size: 34px 34px, 34px 34px, auto, auto;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
.camera-drawer__video,
|
||||||
|
.camera-drawer__photo {
|
||||||
|
position: absolute;
|
||||||
|
inset: 0;
|
||||||
|
width: 100%;
|
||||||
|
height: 100%;
|
||||||
|
object-fit: cover;
|
||||||
|
display: none;
|
||||||
|
z-index: 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
.camera-drawer__photo {
|
||||||
|
object-fit: contain;
|
||||||
|
}
|
||||||
|
|
||||||
|
.camera-drawer__preview.is-camera .camera-drawer__video {
|
||||||
|
display: block;
|
||||||
|
}
|
||||||
|
|
||||||
|
.camera-drawer__preview.is-photo .camera-drawer__photo {
|
||||||
|
display: block;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Hide the decorative lens/scan/placeholder once real media is showing. */
|
||||||
|
.camera-drawer__preview.is-camera .camera-drawer__lens,
|
||||||
|
.camera-drawer__preview.is-photo .camera-drawer__lens,
|
||||||
|
.camera-drawer__preview.is-camera .camera-drawer__scan,
|
||||||
|
.camera-drawer__preview.is-photo .camera-drawer__scan,
|
||||||
|
.camera-drawer__preview.is-camera .camera-drawer__placeholder,
|
||||||
|
.camera-drawer__preview.is-photo .camera-drawer__placeholder {
|
||||||
|
display: none;
|
||||||
|
}
|
||||||
|
|
||||||
|
.camera-drawer__placeholder {
|
||||||
|
position: absolute;
|
||||||
|
inset: auto 18px 16px;
|
||||||
|
z-index: 2;
|
||||||
|
color: rgba(214, 220, 235, 0.78);
|
||||||
|
font-size: 12px;
|
||||||
|
line-height: 1.5;
|
||||||
|
text-align: center;
|
||||||
|
}
|
||||||
|
|
||||||
.camera-drawer__lens {
|
.camera-drawer__lens {
|
||||||
position: absolute;
|
position: absolute;
|
||||||
top: 50%;
|
top: 50%;
|
||||||
@@ -174,6 +218,7 @@ body {
|
|||||||
|
|
||||||
.camera-drawer__corner {
|
.camera-drawer__corner {
|
||||||
position: absolute;
|
position: absolute;
|
||||||
|
z-index: 2;
|
||||||
width: 28px;
|
width: 28px;
|
||||||
height: 28px;
|
height: 28px;
|
||||||
border-color: rgba(255, 255, 255, 0.7);
|
border-color: rgba(255, 255, 255, 0.7);
|
||||||
@@ -229,6 +274,87 @@ body {
|
|||||||
cursor: not-allowed;
|
cursor: not-allowed;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* 上传图片 + 使用摄像头 share one row. */
|
||||||
|
.camera-drawer__sources {
|
||||||
|
display: flex;
|
||||||
|
gap: 8px;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* The camera device dropdown only appears after "使用摄像头" is selected. */
|
||||||
|
.camera-drawer__device-row {
|
||||||
|
max-width: none;
|
||||||
|
}
|
||||||
|
|
||||||
|
.camera-drawer__device-row[hidden] {
|
||||||
|
display: none;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Active state for the "使用摄像头" button once the camera is live. */
|
||||||
|
.camera-drawer__source.is-active {
|
||||||
|
border-color: var(--success);
|
||||||
|
color: var(--success);
|
||||||
|
}
|
||||||
|
|
||||||
|
.camera-drawer__source {
|
||||||
|
flex: 1 1 0;
|
||||||
|
display: inline-flex;
|
||||||
|
align-items: center;
|
||||||
|
justify-content: center;
|
||||||
|
text-align: center;
|
||||||
|
min-height: 38px;
|
||||||
|
font-size: 13px;
|
||||||
|
font-weight: 600;
|
||||||
|
cursor: pointer;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
.camera-drawer__source.is-active {
|
||||||
|
border-color: var(--success);
|
||||||
|
color: var(--success);
|
||||||
|
}
|
||||||
|
|
||||||
|
.camera-drawer__source:disabled {
|
||||||
|
opacity: 0.5;
|
||||||
|
cursor: not-allowed;
|
||||||
|
}
|
||||||
|
|
||||||
|
.camera-drawer__samples {
|
||||||
|
display: grid;
|
||||||
|
grid-template-columns: repeat(4, 1fr);
|
||||||
|
gap: 8px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.camera-drawer__samples:empty {
|
||||||
|
display: none;
|
||||||
|
}
|
||||||
|
|
||||||
|
.camera-drawer__sample {
|
||||||
|
position: relative;
|
||||||
|
aspect-ratio: 4 / 3;
|
||||||
|
padding: 0;
|
||||||
|
border: 2px solid transparent;
|
||||||
|
border-radius: 10px;
|
||||||
|
overflow: hidden;
|
||||||
|
cursor: pointer;
|
||||||
|
background: #0f141f;
|
||||||
|
}
|
||||||
|
|
||||||
|
.camera-drawer__sample img {
|
||||||
|
width: 100%;
|
||||||
|
height: 100%;
|
||||||
|
object-fit: contain;
|
||||||
|
display: block;
|
||||||
|
}
|
||||||
|
|
||||||
|
.camera-drawer__sample:hover {
|
||||||
|
border-color: rgba(149, 160, 187, 0.6);
|
||||||
|
}
|
||||||
|
|
||||||
|
.camera-drawer__sample.is-selected {
|
||||||
|
border-color: var(--success);
|
||||||
|
box-shadow: 0 0 0 1px var(--success);
|
||||||
|
}
|
||||||
|
|
||||||
.app__body {
|
.app__body {
|
||||||
display: grid;
|
display: grid;
|
||||||
grid-template-columns: minmax(0, 1fr) clamp(300px, 32vw, 420px);
|
grid-template-columns: minmax(0, 1fr) clamp(300px, 32vw, 420px);
|
||||||
@@ -511,6 +637,18 @@ body {
|
|||||||
margin-bottom: 4px;
|
margin-bottom: 4px;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
.bubble__image {
|
||||||
|
display: block;
|
||||||
|
max-width: 240px;
|
||||||
|
width: 100%;
|
||||||
|
border-radius: 10px;
|
||||||
|
margin-bottom: 6px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.bubble__image + .bubble__text:empty {
|
||||||
|
display: none;
|
||||||
|
}
|
||||||
|
|
||||||
/* WebSocket log --------------------------------------------------------- */
|
/* WebSocket log --------------------------------------------------------- */
|
||||||
|
|
||||||
.ws-log {
|
.ws-log {
|
||||||
@@ -567,8 +705,8 @@ body {
|
|||||||
margin: 0;
|
margin: 0;
|
||||||
font-size: 12px;
|
font-size: 12px;
|
||||||
color: var(--text-dim);
|
color: var(--text-dim);
|
||||||
text-transform: uppercase;
|
letter-spacing: 0.5px;
|
||||||
letter-spacing: 0.8px;
|
white-space: nowrap;
|
||||||
}
|
}
|
||||||
|
|
||||||
.ws-log__header-left {
|
.ws-log__header-left {
|
||||||
@@ -823,11 +961,7 @@ body {
|
|||||||
outline: none;
|
outline: none;
|
||||||
width: 100%;
|
width: 100%;
|
||||||
cursor: pointer;
|
cursor: pointer;
|
||||||
}
|
text-overflow: ellipsis;
|
||||||
|
|
||||||
.device-picker__select:focus {
|
|
||||||
border-color: var(--accent);
|
|
||||||
box-shadow: 0 0 0 3px rgba(79, 140, 255, 0.18);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
.device-picker__select:disabled {
|
.device-picker__select:disabled {
|
||||||
@@ -835,6 +969,11 @@ body {
|
|||||||
cursor: not-allowed;
|
cursor: not-allowed;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
.device-picker__select:focus {
|
||||||
|
border-color: var(--accent);
|
||||||
|
box-shadow: 0 0 0 3px rgba(79, 140, 255, 0.18);
|
||||||
|
}
|
||||||
|
|
||||||
.mic-btn {
|
.mic-btn {
|
||||||
display: inline-flex;
|
display: inline-flex;
|
||||||
align-items: center;
|
align-items: center;
|
||||||
|
|||||||
Reference in New Issue
Block a user