From 21f6c17388b72444aed81c0b9c81757538e8395c Mon Sep 17 00:00:00 2001 From: Xin Wang Date: Tue, 2 Jun 2026 08:24:53 +0800 Subject: [PATCH] Add image upload in conversation --- config/fastgpt.example.json | 3 +- engine/config.py | 11 + engine/fastgpt_llm.py | 155 ++++++++++- engine/services.py | 1 + examples/webpage/app.js | 300 +++++++++++++++++++++- examples/webpage/index.html | 52 +++- examples/webpage/samples/front-damage.jpg | Bin 0 -> 18440 bytes examples/webpage/samples/license.jpg | Bin 0 -> 17843 bytes examples/webpage/samples/plate.jpg | Bin 0 -> 17715 bytes examples/webpage/samples/scene.jpg | Bin 0 -> 17278 bytes examples/webpage/styles.css | 108 ++++++++ 11 files changed, 622 insertions(+), 8 deletions(-) create mode 100644 examples/webpage/samples/front-damage.jpg create mode 100644 examples/webpage/samples/license.jpg create mode 100644 examples/webpage/samples/plate.jpg create mode 100644 examples/webpage/samples/scene.jpg diff --git a/config/fastgpt.example.json b/config/fastgpt.example.json index 8ba2153..36cb819 100644 --- a/config/fastgpt.example.json +++ b/config/fastgpt.example.json @@ -70,7 +70,8 @@ "app_id": "6a153aed53e3f8d9f2744905", "variables": {}, "detail": false, - "timeout_sec": 60.0 + "timeout_sec": 60.0, + "image_input_mode": "base64" }, "tts": { "provider": "xfyun", diff --git a/engine/config.py b/engine/config.py index bb060fe..e458e99 100644 --- a/engine/config.py +++ b/engine/config.py @@ -148,6 +148,8 @@ class LLMConfig: variables: dict[str, str] = field(default_factory=dict) detail: bool = False timeout_sec: float = 60.0 + # FastGPT image input mode: "base64" (inline data URL) or "upload" (presigned upload). + image_input_mode: str = "base64" @property def is_fastgpt(self) -> bool: @@ -257,6 +259,15 @@ def config_from_dict(data: dict) -> EngineConfig: llm["app_id"] = None if not isinstance(llm.get("variables"), dict): llm["variables"] = {} + image_input_mode = str( + llm.get("image_input_mode", LLMConfig().image_input_mode) + ).strip().lower() + if image_input_mode not in {"base64", "upload"}: + raise ValueError( + "services.llm.image_input_mode must be 'base64' or 'upload', " + f"got {llm.get('image_input_mode')!r}" + ) + llm["image_input_mode"] = image_input_mode if agent.get("greeting_mode") == "fastgpt_opener" and llm["provider"] != "fastgpt": raise ValueError( "agent.greeting_mode='fastgpt_opener' requires services.llm.provider='fastgpt'" diff --git a/engine/fastgpt_llm.py b/engine/fastgpt_llm.py index 935d0a7..916cb1a 100644 --- a/engine/fastgpt_llm.py +++ b/engine/fastgpt_llm.py @@ -1,7 +1,11 @@ from __future__ import annotations import asyncio +import base64 +import binascii import json +import os +import tempfile import uuid from dataclasses import dataclass, field from typing import Any @@ -73,6 +77,50 @@ def _message_text(message: dict[str, Any]) -> str: return "" +IMAGE_INPUT_MODE_BASE64 = "base64" +IMAGE_INPUT_MODE_UPLOAD = "upload" +SUPPORTED_IMAGE_INPUT_MODES = frozenset({IMAGE_INPUT_MODE_BASE64, IMAGE_INPUT_MODE_UPLOAD}) + +_MIME_TO_EXT = { + "image/jpeg": ".jpg", + "image/png": ".png", + "image/webp": ".webp", +} + + +def _message_has_image(message: dict[str, Any]) -> bool: + content = message.get("content") + if not isinstance(content, list): + return False + return any( + isinstance(part, dict) and part.get("type") == "image_url" + for part in content + ) + + +def _redact_messages_for_log(messages: list[dict[str, Any]]) -> list[dict[str, Any]]: + """Replace base64 image data URLs with a short placeholder for logging.""" + redacted: list[dict[str, Any]] = [] + for message in messages: + content = message.get("content") + if not isinstance(content, list): + redacted.append(message) + continue + parts: list[Any] = [] + for part in content: + if ( + isinstance(part, dict) + and part.get("type") == "image_url" + and isinstance(part.get("image_url"), dict) + ): + url = str(part["image_url"].get("url") or "") + parts.append({"type": "image_url", "image_url": {"url": f"<{len(url)} chars>"}}) + else: + parts.append(part) + redacted.append({**message, "content": parts}) + return redacted + + def _first_nonempty_text(*values: Any) -> str: for value in values: if isinstance(value, str): @@ -172,6 +220,7 @@ class FastGPTLLMService(LLMService): app_id: str | None = None, greeting_prompt: str | None = None, timeout: float = 60.0, + image_input_mode: str = IMAGE_INPUT_MODE_BASE64, settings: FastGPTLLMSettings | None = None, **kwargs, ) -> None: @@ -183,6 +232,20 @@ class FastGPTLLMService(LLMService): self._chat_id = chat_id or f"voice_{uuid.uuid4().hex[:16]}" self._app_id = (app_id or "").strip() self._greeting_prompt = (greeting_prompt or "你好").strip() or "你好" + + mode = (image_input_mode or IMAGE_INPUT_MODE_BASE64).strip().lower() + if mode not in SUPPORTED_IMAGE_INPUT_MODES: + raise ValueError( + f"Unsupported image_input_mode {image_input_mode!r}; " + f"expected one of {sorted(SUPPORTED_IMAGE_INPUT_MODES)}" + ) + if mode == IMAGE_INPUT_MODE_UPLOAD and not self._app_id: + logger.warning( + "FastGPT image_input_mode='upload' requires app_id; " + "falling back to inline base64" + ) + mode = IMAGE_INPUT_MODE_BASE64 + self._image_input_mode = mode self._client = AsyncChatClient( api_key=api_key, base_url=base_url, @@ -310,26 +373,114 @@ class FastGPTLLMService(LLMService): if response is not None: await response.aclose() - def _build_fastgpt_messages(self, context: LLMContext) -> list[dict[str, str]]: + def _build_fastgpt_messages(self, context: LLMContext) -> list[dict[str, Any]]: raw_messages = context.get_messages() for message in reversed(raw_messages): if not isinstance(message, dict) or message.get("role") != "user": continue + if _message_has_image(message): + # Multimodal turn: forward the OpenAI-style content list as-is + # (text parts + image_url with a base64 data URL). FastGPT's + # /chat/completions accepts this directly. + return [{"role": "user", "content": message["content"]}] text = _message_text(message) if text: return [{"role": "user", "content": text}] return [{"role": "user", "content": self._greeting_prompt}] + async def _resolve_image_inputs( + self, messages: list[dict[str, Any]] + ) -> list[dict[str, Any]]: + """In ``upload`` mode, replace inline base64 image data URLs with uploaded URLs. + + In ``base64`` mode the messages are returned untouched (inline data URLs). + New message/content objects are built so the shared ``LLMContext`` messages + are never mutated. + """ + if self._image_input_mode != IMAGE_INPUT_MODE_UPLOAD: + return messages + + resolved: list[dict[str, Any]] = [] + for message in messages: + content = message.get("content") + if not isinstance(content, list): + resolved.append(message) + continue + + new_content: list[Any] = [] + for part in content: + url = ( + part.get("image_url", {}).get("url") + if isinstance(part, dict) and part.get("type") == "image_url" + else None + ) + if isinstance(url, str) and url.startswith("data:image/"): + uploaded = await self._upload_data_url(url) + new_content.append( + {"type": "image_url", "image_url": {"url": uploaded}} + ) + else: + new_content.append(part) + resolved.append({**message, "content": new_content}) + + return resolved + + async def _upload_data_url(self, data_url: str) -> str: + """Upload a ``data:image/...;base64,...`` URL via FastGPT and return its URL. + + Falls back to the original data URL if parsing or upload fails so the turn + still proceeds with inline base64. + """ + header, _, payload = data_url.partition(",") + mime_type = header[len("data:") :].split(";", 1)[0].strip() or "image/jpeg" + try: + raw = base64.b64decode(payload, validate=True) + except (binascii.Error, ValueError) as exc: + logger.warning(f"FastGPT image upload skipped; invalid base64: {exc}") + return data_url + + suffix = _MIME_TO_EXT.get(mime_type, ".jpg") + tmp_path: str | None = None + try: + with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp: + tmp.write(raw) + tmp_path = tmp.name + result = await self._client.upload_chat_image( + appId=self._app_id, + chatId=self._chat_id, + file_path=tmp_path, + ) + url = result.get("url") if isinstance(result, dict) else None + if isinstance(url, str) and url: + logger.info( + f"FastGPT image uploaded chatId={self._chat_id} " + f"bytes={len(raw)} url={url}" + ) + return url + logger.warning("FastGPT image upload returned no url; using inline base64") + return data_url + except Exception as exc: + logger.warning(f"FastGPT image upload failed; using inline base64: {exc}") + return data_url + finally: + if tmp_path is not None: + try: + os.unlink(tmp_path) + except OSError: + pass + async def _process_context(self, context: LLMContext) -> None: messages = self._build_fastgpt_messages(context) + messages = await self._resolve_image_inputs(messages) variables = self._settings.variables or None logger.info( "FastGPT chat completion " f"chatId={self._chat_id} appId={self._app_id or '-'} " - f"variables={sorted((variables or {}).keys())} messages={messages!r}" + f"variables={sorted((variables or {}).keys())} " + f"messages={_redact_messages_for_log(messages)!r}" ) await self.start_ttfb_metrics() diff --git a/engine/services.py b/engine/services.py index 8fbd916..142cc25 100644 --- a/engine/services.py +++ b/engine/services.py @@ -65,6 +65,7 @@ def create_llm_service( app_id=config.app_id, greeting_prompt=greeting_prompt, timeout=config.timeout_sec, + image_input_mode=config.image_input_mode, settings=FastGPTLLMSettings( model=config.model or "fastgpt", variables=variables, diff --git a/examples/webpage/app.js b/examples/webpage/app.js index defd4bd..2e64506 100644 --- a/examples/webpage/app.js +++ b/examples/webpage/app.js @@ -24,6 +24,17 @@ const WS_LOG_GROUP_KEYS = { AUDIO_SEND: "send:input.audio", }; const CAMERA_DONE_TEXT = "【拍摄完成】"; +// Sample images shown as thumbnails under the camera preview. Same-origin files +// so they can be drawn to a canvas (for base64 + dimensions) without tainting. +const SAMPLE_IMAGES = [ + { src: "./samples/front-damage.jpg", label: "车辆前部" }, + { src: "./samples/plate.jpg", label: "车牌" }, + { src: "./samples/license.jpg", label: "驾驶证" }, + { src: "./samples/scene.jpg", label: "事故现场" }, +]; +// Cap the longer edge before JPEG-encoding so payloads stay small. +const IMAGE_MAX_DIM = 1280; +const IMAGE_JPEG_QUALITY = 0.85; const CAMERA_STATE_PROMPTS = { 2000: "请对准车辆碰撞部位拍摄照片。", 2001: "请对准车辆碰撞部位拍摄照片。", @@ -62,6 +73,14 @@ const els = { cameraState: document.getElementById("camera-state"), cameraQuestion: document.getElementById("camera-question"), cameraDoneBtn: document.getElementById("camera-done-btn"), + cameraPreview: document.getElementById("camera-preview"), + cameraVideo: document.getElementById("camera-video"), + cameraPhoto: document.getElementById("camera-photo"), + cameraCanvas: document.getElementById("camera-canvas"), + cameraStartBtn: document.getElementById("camera-start-btn"), + cameraFlipBtn: document.getElementById("camera-flip-btn"), + cameraUpload: document.getElementById("camera-upload"), + cameraSamples: document.getElementById("camera-samples"), clearBtn: document.getElementById("clear-btn"), clearWsLogBtn: document.getElementById("clear-ws-log-btn"), wsLog: document.getElementById("ws-log"), @@ -125,6 +144,13 @@ const state = { assistantState: "", cameraState: "", + // Camera / image input. + cameraStream: null, + cameraActive: false, + cameraFacing: "environment", + pendingImage: null, + samplesRendered: false, + // VU meter smoothing. meterLevel: 0, @@ -211,14 +237,16 @@ function setAssistantState(value) { function setCameraButtonEnabled() { if (!els.cameraDoneBtn) return; - els.cameraDoneBtn.disabled = - !state.connected || !state.cameraState || - !state.ws || state.ws.readyState !== WebSocket.OPEN; + const wsReady = + state.connected && state.ws && state.ws.readyState === WebSocket.OPEN; + const hasImageSource = state.cameraActive || Boolean(state.pendingImage); + els.cameraDoneBtn.disabled = !wsReady || !state.cameraState || !hasImageSource; } function syncCameraDrawer(value) { const prompt = CAMERA_STATE_PROMPTS[value]; const open = Boolean(prompt); + const wasOpen = Boolean(state.cameraState); state.cameraState = open ? value : ""; els.cameraDrawer.classList.toggle("is-open", open); els.conversation.classList.toggle("has-camera", open); @@ -226,9 +254,11 @@ function syncCameraDrawer(value) { if (open) { els.cameraState.textContent = `State ${value}`; els.cameraQuestion.textContent = prompt; + renderSampleThumbnails(); } else { els.cameraState.textContent = "State -"; els.cameraQuestion.textContent = ""; + if (wasOpen) resetCameraInput(); } setCameraButtonEnabled(); } @@ -260,6 +290,35 @@ function addBubble(role, text) { return bubble; } +// Render a single chat bubble holding an image and (optionally) text together. +function addImageBubble(role, imageUrl, text) { + if (els.chatLog.querySelector(".chat__empty")) { + els.chatLog.innerHTML = ""; + } + const bubble = document.createElement("div"); + bubble.className = `bubble bubble--${role}`; + if (role !== "system") { + const tag = document.createElement("span"); + tag.className = "bubble__role"; + tag.textContent = role === "user" ? "You" : "Assistant"; + bubble.appendChild(tag); + } + const img = document.createElement("img"); + img.className = "bubble__image"; + img.src = imageUrl; + img.alt = text || "image"; + bubble.appendChild(img); + + const body = document.createElement("span"); + body.className = "bubble__text"; + body.textContent = text || ""; + bubble.appendChild(body); + + els.chatLog.appendChild(bubble); + scrollChatToBottom(); + return bubble; +} + function appendToBubble(bubble, text) { const body = bubble.querySelector(".bubble__text"); body.textContent += text; @@ -499,6 +558,9 @@ function compactWsPayload(payload) { if (typeof compact.audio === "string") { compact.audio = ``; } + if (typeof compact.image === "string") { + compact.image = ``; + } if (typeof compact.data === "string" && compact.data.length > 160) { compact.data = ``; } @@ -807,6 +869,219 @@ function resetPlaybackClock() { } } +/* ------------------------------------------------------ Camera / image */ + +function setPreviewMode(mode) { + // mode: "camera" | "photo" | "idle" + els.cameraPreview.classList.toggle("is-camera", mode === "camera"); + els.cameraPreview.classList.toggle("is-photo", mode === "photo"); +} + +// Draw an /