diff --git a/agents/.env.example b/agents/.env.example new file mode 100644 index 0000000..9869c6a --- /dev/null +++ b/agents/.env.example @@ -0,0 +1,22 @@ +LIVEKIT_API_SECRET="secret" +LIVEKIT_API_KEY="devkey" +LIVEKIT_URL="ws://127.0.0.1:7880" + +MINIMAX_API_KEY="aa" + +DEEPSEEK_API_KEY="bb" + +AZURE_SPEECH_KEY="aa" +AZURE_SPEECH_REGION="eastasia" + +CARTESIA_API_KEY="bb" +CARTESIA_LANGUAGE="zh" + +SILICONFLOW_API_KEY="aa" + +DASHSCOPE_API_KEY="aa" + +VOLCENGINE_TTS_ACCESS_TOKEN="aa" +VOLCENGINE_STT_ACCESS_TOKEN="aa" +VOLCENGINE_LLM_API_KEY="aa" +VOLCENGINE_REALTIME_ACCESS_TOKEN="aa" \ No newline at end of file diff --git a/agents/README.md b/agents/README.md new file mode 100644 index 0000000..e69de29 diff --git a/agents/my_basic_agent_1_2_9.py b/agents/my_basic_agent_1_2_9.py new file mode 100644 index 0000000..f390a2f --- /dev/null +++ b/agents/my_basic_agent_1_2_9.py @@ -0,0 +1,1012 @@ +import argparse +import asyncio +import base64 +import json +import logging +import os +import sys +import re +from dataclasses import asdict, dataclass +from functools import partial + +import aiohttp +import httpx +from dotenv import load_dotenv + +from livekit import api, rtc +from livekit.agents import ( + Agent, + AgentSession, + AudioConfig, + BackgroundAudioPlayer, + BuiltinAudioClip, + JobContext, + JobProcess, + MetricsCollectedEvent, + RoomInputOptions, + RoomOutputOptions, + RunContext, + WorkerOptions, + cli, + get_job_context, + metrics, +) +from livekit.agents.llm import ImageContent, ToolError, function_tool +from typing import Any, List, Optional +from livekit.agents.voice.avatar import DataStreamAudioOutput +from livekit.agents.voice.io import PlaybackFinishedEvent +from livekit.agents.voice.room_io import ATTRIBUTE_PUBLISH_ON_BEHALF +from livekit.plugins import silero +from livekit.plugins import openai, azure, minimax, aliyun, volcengine + +from datetime import datetime + +logger = logging.getLogger("basic-agent") + +load_dotenv() + +AVATAR_IDENTITY = "avatar_worker" + +DEFAULT_INSTRUCTIONS = """#角色 +你是一个高度集成、安全第一的交通事故处理AI智能体。 + +你像真人一样和用户对话,而不是机器人。你收到的文字是经过ASR识别的语音输入。 + +当前时间:{datetime} + +#对话要求 +你主动对话并推进事故信息采集的完成。语言简洁,一次询问一个问题。 +不要在你的回复中使用 emojis, asterisks, markdown, 或其他特殊字符。 +你可以对一个用户回复多次调用工具,比如askImageCapture没有得到合适的照片的时候,可以继续调用askImageCapture工具让用户重新拍摄,直到得到合适的照片为止。 +只有在我要求使用askImportantQuestion的时候才去调用,否则禁止使用askImportantQuestion工具。 + +你已经说过下面的开场白所以不需要重复说: +“您好,这里是无锡交警,我将为您远程处理交通事故。请将人员撤离至路侧安全区域,开启危险报警双闪灯、放置三角警告牌、做好安全防护,谨防二次事故伤害。若您已经准备好了,请点击继续办理,如需人工服务,请说转人工。” + +#任务 +你的职责是全流程引导用户完成:事故情况初审 -> 现场证据拍照 -> 驾驶员信息核实。 + +##事故初审阶段 +在事故情况初审阶段:询问是否有人受伤,请求用户简单描述事故情况,询问事故发生时间并通过复述标准化时间向用户确认,询问事故车辆数量,询问事故发生的原因(例如追尾、刮擦、碰撞等)。采集完成后进入现场证据拍照阶段。 +如果用户回答已包含需要问题的答案,改为与用户确认答案是否正确。 +禁止在事故初审阶段使用askImageCapture工具。只有在完成事故信息采集后,才能进入现场证据拍照阶段并使用askImageCapture。 + +##现场证据拍照阶段 +使用askImageCapture工具引导用户依次拍摄事故现场照片,驾驶员正脸照片,车牌号 +每次拍摄完成后立即判断是否符合要求:符合则直接进入下一步;不符合则立即再次调用askImageCapture让用户重新拍摄,并明确指出问题与改进要求 +ask_image_capture的分析结果中只要target_found为false或者quality_ok为false,就必须向用户解释问题(结合quality_notes或缺失的目标),并立刻再次调用askImageCapture给出更具体的改进提示;在获得有效照片之前禁止进入下一步 +拍完需要的照片后,复述车牌号并让用户确认或修正;确认后进入驾驶员信息核实。 + +##驾驶员信息核实阶段 +询问司机的姓名。 +之后根据车牌号查询驾驶员手机号,如果查询到则用手机号后四位向用户确认,如果未查询到或者用户告知手机号后四位错误,则询问用户手机号。 +接着根据车牌号查询驾驶员身份证号,如果查询到则用身份证号后四位向用户确认,如果未查询到或者用户告知身份证号后四位错误,则询问用户身份证号。 +之后告诉用户采集完成,显示重要消息显示已经采集的信息,提示用户点击转人工继续处理。 + + +## 黄金对话路径示例 (GOLDEN_CONVERSATION_PATH) + +``` +# 事故初审阶段 +AI: 您好,请问现场有人受伤吗? +用户: 没有人员受伤。 +AI: 请您简单描述一下事故的经过。 +用户: 我的车等红灯时被后面的车追尾了。 +AI: 请问事故大概发生在什么时间? +用户: 今天早上八点左右发生的。 +AI: 一共涉及几辆车? +用户: 两辆。 +AI: 事故主要原因是追尾,对吗? 
+用户: 对,是被追尾。 +AI: 好的,接下来我们将进行现场证据采集。 + +# 现场证据拍照阶段 +AI: 现在需要采集事故现场照片,请先拍摄事故现场的整体照片并上传。 +(调用 ask_image_capture,prompt="请拍摄事故现场的整体照片", extract_target="事故现场", requirements="清晰、完整,无模糊和遮挡") +工具返回: {"success": true, "analysis": {"target_found": false, "quality_ok": false, "quality_notes": ["画面模糊"]}, "raw": "..."} +用户上传模糊照片 +AI:您上传的照片不够清晰或存在遮挡,请确保画面清晰、完整地拍摄事故现场,请重新拍摄并上传。 +(调用 ask_image_capture,prompt="请重新清晰完整地拍摄事故现场照片", extract_target="事故现场", requirements="清晰、完整,无模糊和遮挡") +工具返回: {"success": true, "analysis": {"target_found": true, "quality_ok": true, "summary": "清晰的事故现场概览"}, "raw": "..."} +用户上传清晰照片 +AI:很好,事故现场照片已收到。接下来请拍摄驾驶员正脸照片。 +(调用 ask_image_capture,prompt="请拍摄驾驶员正脸照片", extract_target="驾驶员正脸", requirements="面部清晰、正对镜头") +工具返回: {"success": true, "analysis": {"target_found": false, "quality_ok": false, "quality_notes": ["未正对镜头"]}, "raw": "..."} +用户上传侧脸照片 +AI:检测到照片未正对镜头或面部不清楚,请正对镜头并确保面部清晰,再次拍摄并上传正脸照片。 +(调用 ask_image_capture,prompt="请正对镜头并拍摄清晰的驾驶员正脸照片", extract_target="驾驶员正脸", requirements="面部清晰、正对镜头") +工具返回: {"success": true, "analysis": {"target_found": true, "quality_ok": true, "summary": "驾驶员正脸清晰"}, "raw": "..."} +用户上传正脸照片 +AI:驾驶员正脸照片已收到。最后,请拍摄并上传车辆号牌的清晰照片。 +(调用 ask_image_capture,prompt="请拍摄车辆号牌照片", extract_target="车辆号牌", requirements="完整、清晰、无遮挡") +工具返回: {"success": true, "analysis": {"target_found": true, "quality_ok": true, "summary": "车牌完整可读", "target_details": "苏A12345"}, "raw": "..."} +用户上传合格车牌照片 +AI:我识别到车牌号是“苏A12345”,请确认是否正确,如需修改请告知。 + +# 驾驶员信息核实阶段 +AI: 请提供您的姓名,用于信息核实。 +用户: 张三 +AI:(调用 get_mobile_by_plate,plate_number="苏A12345") +工具返回: {"success": true, "plate": "苏A12345", "mobile": "13800001234"} +AI:系统查询到登记手机号尾号1234,请确认是否正确;如不正确,请告知真实手机号。 +用户: 正确 +AI:(调用 get_id_card_by_plate,plate_number="苏A12345") +工具返回: {"success": true, "plate": "苏A12345", "id_card": "320101198001011234"} +AI:系统查询到登记身份证尾号1234,请确认是否正确;如不正确,请告知完整身份证号。 +用户: 正确 +AI: 信息采集已完成,请点击确认进入后续办理。(调用 askImportantQuestion,message="请确认已采集信息:\n姓名:张三\n手机号:13800001234(尾号1234)\n身份证号:320101198001011234(尾号1234)\n车牌号:苏A12345", options=["确认无误", "需要修改"]) +工具返回: {"selection": "确认无误"} +AI: 已确认,后续流程请点击转人工继续处理。(调用 enter_hand_off_to_human_mode) +``` + + +""" + +@dataclass +class AvatarConnectionInfo: + room_name: str + url: str + """LiveKit server URL""" + token: str + """Token for avatar worker to join""" + + +class MyAgent(Agent): + def __init__(self, instructions: str, vision_backend: str = "aliyun") -> None: + self.vision_backend = vision_backend + self._tasks = [] + self._captured_images = [] + self._image_event = asyncio.Event() + super().__init__( + instructions=instructions, + ) + + async def on_enter(self): + # Register byte stream handler for image uploads from frontend + def _image_received_handler(reader, participant_identity): + task = asyncio.create_task( + self._image_received(reader, participant_identity) + ) + self._tasks.append(task) + task.add_done_callback(lambda t: self._tasks.remove(t)) + + # Add the handler when the agent joins + get_job_context().room.register_byte_stream_handler("image", _image_received_handler) + + # Proactively ask the user the initial important question via askImportantQuestion + # using the greeting message from instructions (lines 59-60) + initial_question = ( + "您好,这里是无锡交警,我将为您远程处理交通事故。请将人员撤离至路侧安全区域," + "开启危险报警双闪灯、放置三角警告牌、做好安全防护,谨防二次事故伤害。若您已经准备好了," + "请点击继续办理,如需人工服务,请说转人工。" + ) + options = ["继续办理", "转人工"] + + async def _ask_initial(): + try: + room = get_job_context().room + if not room.remote_participants: + logger.warning("No remote participants yet; skipping initial askImportantQuestion.") + return 
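+                # Treat the first remote participant as the caller: speak the greeting, then block on the askImportantQuestion RPC until they pick an option.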
+ participant_identity = next(iter(room.remote_participants)) + + # Speak the initial question so the user hears it + try: + self.session.say(initial_question, allow_interruptions=False) + except Exception as e: + logger.error(f"Failed to speak initial question: {e}") + + payload_data = { + "message": initial_question, + "options": options, + } + # Log tool call in chat + await self._send_chat_message( + f"🔨 Call: ask_important_question\n • message: \"{initial_question}\"\n • options: {options}" + ) + response = await room.local_participant.perform_rpc( + destination_identity=participant_identity, + method="askImportantQuestion", + payload=json.dumps(payload_data), + response_timeout=60.0, + ) + logger.info(f"Initial important question response: {response}") + try: + response_data = json.loads(response) + selection = response_data.get("selection", "") + await self._send_chat_message( + f"✅ Result: ask_important_question\n • selection: \"{selection}\"" + ) + except json.JSONDecodeError: + logger.error(f"Failed to parse initial response: {response}") + except Exception as e: + logger.error(f"Failed to ask initial important question: {e}") + + # when the agent is added to the session, it'll generate a reply + # according to its instructions + self.session.generate_reply() + + asyncio.create_task(_ask_initial()) + + async def _send_chat_message(self, message: str): + """Helper to send a chat message to the room.""" + try: + room = get_job_context().room + import time + # Construct payload matching what useChat likely expects or just a simple JSON + # Standard LiveKit chat format often involves a simple JSON wrapper + payload = json.dumps({ + "message": message, + "timestamp": int(time.time() * 1000) + }).encode('utf-8') + + await room.local_participant.publish_data( + payload=payload, + topic="lk-chat-topic" + ) + except Exception as e: + logger.error(f"Failed to send chat message: {e}") + + async def _image_received(self, reader, participant_identity): + """Handle image uploads from the frontend via byte stream.""" + logger.info(f"Received image upload from participant: {participant_identity}") + image_bytes = bytes() + async for chunk in reader: + image_bytes += chunk + + logger.info(f"Image received: {len(image_bytes)} bytes from {participant_identity}") + + # Store the received image bytes in memory + self._captured_images.append(image_bytes) + self._image_event.set() + + # Notify user that image was received + # chat_ctx = self.chat_ctx.copy() + # chat_ctx.add_message( + # role="user", + # content=["I have uploaded an image. You can analyze it when I ask you to."] + # ) + # await self.update_chat_ctx(chat_ctx) + + # Trigger a reply acknowledging receipt + # self.session.generate_reply() + + def _normalize_vision_response(self, raw_content: Any) -> str: + """Convert various content formats to a clean JSON string (no code fences).""" + # Handle multimodal content list responses + if isinstance(raw_content, list): + text_parts: List[str] = [] + for part in raw_content: + if isinstance(part, dict): + if part.get("type") == "text" and "text" in part: + text_parts.append(str(part["text"])) + else: + text_parts.append(str(part)) + raw_content = "".join(text_parts) + + content = str(raw_content).strip() + if content.startswith("```"): + # Strip fences like ```json ... 
``` + content = content.lstrip("`") + if content.lower().startswith("json"): + content = content[4:].lstrip() + if "```" in content: + content = content.split("```", 1)[0].strip() + return content + + async def describe_image_with_api( + self, + base64_image: str, + backend: str = "siliconflow", + prompt: str = "", + extract_target: str = "", + requirements: str = "", + ) -> str: + """ + Send base64 image to Vision API (SiliconFlow or Aliyun) and request a + JSON-formatted analysis aligned with ask_image_capture signature. + """ + if backend == "aliyun": + return await self._describe_image_aliyun( + base64_image, + prompt=prompt, + extract_target=extract_target, + requirements=requirements, + ) + else: + return await self._describe_image_siliconflow( + base64_image, + prompt=prompt, + extract_target=extract_target, + requirements=requirements, + ) + + async def _describe_image_aliyun( + self, + base64_image: str, + prompt: str, + extract_target: str, + requirements: str, + ) -> str: + logger.info("Sending image to Aliyun Vision API (Qwen-VL-Max)...") + api_key = os.getenv("DASHSCOPE_API_KEY") + if not api_key: + logger.error("DASHSCOPE_API_KEY not found") + return "Error: DASHSCOPE_API_KEY not configured." + + url = "https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions" + + analysis_prompt = ( + "请根据以下任务要求对图片进行结构化分析,并且只返回紧凑的JSON:" + f'{{"prompt": "{prompt}", ' + f'"extract_target": "{extract_target}", ' + f'"requirements": "{requirements}", ' + '"summary": "<用不超过50字中文概述图片>", ' + '"target_found": , ' + '"target_details": "<若存在目标,描述关键特征;否则写未发现>", ' + '"quality_ok": , ' + '"quality_notes": ["<不满足要求的原因,若满足可为空数组>"]}' + "。不要添加额外说明、前缀或代码块。" + ) + + payload = { + "model": "qwen-vl-plus", + "messages": [ + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{base64_image}" + } + }, + { + "type": "text", + "text": analysis_prompt + } + ] + } + ] + } + + headers = { + "Authorization": f"Bearer {api_key}", + "Content-Type": "application/json" + } + + try: + async with aiohttp.ClientSession() as session: + async with session.post(url, json=payload, headers=headers) as response: + if response.status == 200: + data = await response.json() + raw_content = data['choices'][0]['message']['content'] + normalized = self._normalize_vision_response(raw_content) + try: + parsed = json.loads(normalized) + description = json.dumps(parsed, ensure_ascii=False) + except Exception: + description = normalized + logger.info(f"Got image description from Aliyun: {description}") + return description + else: + error_text = await response.text() + logger.error(f"Aliyun API error: {response.status} - {error_text}") + return "Failed to analyze image due to Aliyun API error." + except Exception as e: + logger.error(f"Exception calling Aliyun Vision API: {e}") + return f"Error analyzing image: {str(e)}" + + async def _describe_image_siliconflow( + self, + base64_image: str, + prompt: str, + extract_target: str, + requirements: str, + ) -> str: + """ + Send base64 image to SiliconFlow Vision API. 
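+        Returns a compact JSON string with summary, target_found, quality_ok and quality_notes fields, or an error message string if the request fails.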
+ """ + logger.info("Sending image to SiliconFlow Vision API...") + + # NOTE: You need to set your SiliconFlow API key here or in your .env file + api_key = os.getenv("SILICONFLOW_API_KEY", "your-api-key") + + url = "https://api.siliconflow.cn/v1/chat/completions" + + analysis_prompt = ( + "请根据以下任务要求输出JSON结果,不要附加任何解释或代码块:" + f'{{"prompt": "{prompt}", ' + f'"extract_target": "{extract_target}", ' + f'"requirements": "{requirements}", ' + '"summary": "<50字内中文概述图片>", ' + '"target_found": , ' + '"target_details": "<若找到目标描述关键特征,否则写未发现>", ' + '"quality_ok": , ' + '"quality_notes": ["<若不满足拍摄要求,列出问题;满足则为空数组>"]}' + ) + + payload = { + "model": "Qwen/Qwen3-VL-8B-Instruct", + "messages": [ + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{base64_image}" + } + }, + { + "type": "text", + "text": analysis_prompt + } + ] + } + ], + "max_tokens": 512 + } + + headers = { + "Authorization": f"Bearer {api_key}", + "Content-Type": "application/json" + } + + try: + async with aiohttp.ClientSession() as session: + async with session.post(url, json=payload, headers=headers) as response: + if response.status == 200: + data = await response.json() + raw_content = data['choices'][0]['message']['content'] + normalized = self._normalize_vision_response(raw_content) + try: + parsed = json.loads(normalized) + description = json.dumps(parsed, ensure_ascii=False) + except Exception: + description = normalized + logger.info(f"Got image description: {description}") + return description + else: + error_text = await response.text() + logger.error(f"SiliconFlow API error: {response.status} - {error_text}") + return "Failed to analyze image due to API error." + except Exception as e: + logger.error(f"Exception calling Vision API: {e}") + return f"Error analyzing image: {str(e)}" + + @function_tool() + async def ask_image_capture( + self, + context: RunContext, + prompt: str = "请拍摄照片", + extract_target: str = "车辆号牌", + requirements: str = "清晰完整、无模糊", + ): + """ + 请求用户根据提供的提示语拍摄或上传照片,并检查照片中是否包含指定的提取目标且满足相关要求。 + + 参数: + context (RunContext): 代理运行上下文。 + prompt (str, 可选): 提示用户拍摄什么内容(例如:“请拍摄事故现场照片”,“请拍摄车牌号”)。默认值为“请拍摄照片”。 + extract_target (str, 可选): 期望照片中包含的主要目标(例如:“车辆号牌”)。默认值为“车辆号牌”。 + requirements (str, 可选): 拍摄的具体要求说明(例如:清晰、构图完整)。默认值为“清晰完整、无模糊”。 + + 返回: + dict: 简洁可序列化的工具结果。 + - success: bool + - analysis: 解析后的结果(解析失败则为字符串) + - raw: 原始文本结果(字符串) + - error/details: 仅失败时返回 + """ + await self._send_chat_message( + "┌─🔨 Call: ask_image_capture\n" + f"│ prompt: \"{prompt}\"\n" + "└───────────────" + ) + try: + room = get_job_context().room + participant_identity = next(iter(room.remote_participants)) + + # Clear previous image event + self._image_event.clear() + + # Speak the capture prompt so the user hears what to do + self.session.say(prompt, allow_interruptions=True) + + # Ask for image capture and wait for user to capture/upload + response = await room.local_participant.perform_rpc( + destination_identity=participant_identity, + method="askImageCapture", + payload=json.dumps({ + "prompt": prompt, + }), + response_timeout=120.0, # Longer timeout for user to capture image + ) + logger.info(f"Received image capture response: {response}") + + # Parse the response to check for success/error (image data comes via byte stream) + try: + response_data = json.loads(response) + + if "error" in response_data: + return { + "success": False, + "error": "图片采集失败", + "details": response_data.get("error"), + } + + # Wait for the image byte stream to complete (signaled by 
_image_received) + if not self._image_event.is_set(): + try: + # Give a reasonable buffer for file transfer to finish after RPC returns + await asyncio.wait_for(self._image_event.wait(), timeout=10.0) + except asyncio.TimeoutError: + if not self._captured_images: + return { + "success": False, + "error": "未收到图片数据", + "details": "Timeout waiting for file stream", + } + + # Use the LAST stored image + last_image_bytes = self._captured_images[-1] + logger.info(f"Analyzing last stored image, size: {len(last_image_bytes)} bytes") + + base64_image = base64.b64encode(last_image_bytes).decode('utf-8') + + # Analyze the image using the configured vision backend + description = await self.describe_image_with_api( + base64_image, + self.vision_backend, + prompt=prompt, + extract_target=extract_target, + requirements=requirements, + ) + logger.info(f"Image analysis result: {description}") + + # Try to parse analysis as JSON; fall back to raw string + raw_text = description if isinstance(description, str) else json.dumps(description, ensure_ascii=False) + parsed_analysis = raw_text + try: + parsed_analysis = json.loads(raw_text) + except Exception: + parsed_analysis = raw_text + + # Derive validity signals for the LLM to decide on retries + valid = False + problems = [] + target_details = None + if isinstance(parsed_analysis, dict): + target_found = bool(parsed_analysis.get("target_found")) + quality_ok = bool(parsed_analysis.get("quality_ok")) + valid = target_found and quality_ok + target_details = parsed_analysis.get("target_details") + if not target_found: + problems.append(f"未发现{extract_target}") + if not quality_ok: + notes = parsed_analysis.get("quality_notes") or [] + problems.extend(notes) + + await self._send_chat_message( + "┌─✅ Result: ask_image_capture\n" + f"│ valid: {valid}, problems: {problems}\n" + f"│ analysis: \"{str(raw_text)[:50]}...\"\n" + "└───────────────" + ) + + return { + "success": True, + "analysis": parsed_analysis, + "raw": raw_text, + "valid": valid, + "problems": problems, + "target_details": target_details, + } + + except json.JSONDecodeError: + logger.error(f"Failed to parse response: {response}") + return { + "success": False, + "error": "图片数据解析失败", + "details": response, + } + + except Exception as e: + logger.error(f"Failed to capture and analyze image: {e}") + raise ToolError(f"无法完成图片采集和分析: {str(e)}") + + @function_tool() + async def get_mobile_by_plate( + self, + context: RunContext, + plate_number: str, + ): + """ + 根据车牌号查询登记的手机号。 + + 返回: + dict: { success: bool, plate: str, mobile: str | None, error?: str } + """ + normalized_plate = plate_number.strip().upper() + await self._send_chat_message( + "┌─🔨 Call: get_mobile_by_plate\n" + f"│ plate: \"{normalized_plate}\"\n" + "└───────────────" + ) + # Dummy fixed response (placeholder backend) + return { + "success": True, + "plate": normalized_plate, + "mobile": "13800001234", + } + + @function_tool() + async def get_id_card_by_plate( + self, + context: RunContext, + plate_number: str, + ): + """ + 根据车牌号查询登记的身份证号。 + + 返回: + dict: { success: bool, plate: str, id_card: str | None, error?: str } + """ + normalized_plate = plate_number.strip().upper() + await self._send_chat_message( + "┌─🔨 Call: get_id_card_by_plate\n" + f"│ plate: \"{normalized_plate}\"\n" + "└───────────────" + ) + # Dummy fixed response (placeholder backend) + return { + "success": True, + "plate": normalized_plate, + "id_card": "320101198001011234", + } + + @function_tool() + async def validate_mobile_number( + self, + context: RunContext, + 
mobile: str, + ): + """ + 检查手机号格式是否正确(大陆 11 位手机号,1[3-9] 开头)。 + + 返回: + dict: { success: bool, valid: bool, mobile: str, error?: str } + """ + normalized = mobile.strip().replace(" ", "") + await self._send_chat_message( + "┌─🔨 Call: validate_mobile_number\n" + f"│ mobile: \"{normalized}\"\n" + "└───────────────" + ) + is_valid = bool(re.fullmatch(r"1[3-9]\d{9}", normalized)) + if is_valid: + return { + "success": True, + "valid": True, + "mobile": normalized, + } + return { + "success": True, + "valid": False, + "mobile": normalized, + "error": "手机号格式不正确,应为1[3-9]开头的11位数字", + } + + @function_tool() + async def validate_id_card_number( + self, + context: RunContext, + id_card: str, + ): + """ + 检查身份证号格式是否正确(18位含校验位X或15位纯数字)。 + + 返回: + dict: { success: bool, valid: bool, id_card: str, error?: str } + """ + normalized = id_card.strip().replace(" ", "").upper() + await self._send_chat_message( + "┌─🔨 Call: validate_id_card_number\n" + f"│ id_card: \"{normalized}\"\n" + "└───────────────" + ) + is_valid = bool(re.fullmatch(r"(\d{17}[\dX]|\d{15})", normalized)) + if is_valid: + return { + "success": True, + "valid": True, + "id_card": normalized, + } + return { + "success": True, + "valid": False, + "id_card": normalized, + "error": "身份证格式不正确,应为18位(末位可为X)或15位数字", + } + + @function_tool() + async def enter_hand_off_to_human_mode( + self, + context: RunContext, + ): + """切换到“转人工”模式(前端电话界面进入人工处理)。返回成功/失败。""" + await self._send_chat_message("🔨 Call: enter_hand_off_to_human_mode") + try: + room = get_job_context().room + participant_identity = next(iter(room.remote_participants)) + response = await room.local_participant.perform_rpc( + destination_identity=participant_identity, + method="enterHandOffToHumanMode", + payload=json.dumps({}), + response_timeout=5.0, + ) + logger.info(f"Entered hand off to human mode: {response}") + await self._send_chat_message(f"✅ Result: enter_hand_off_to_human_mode\n • status: success") + return response + except Exception as e: + logger.error(f"Failed to enter hand off to human mode: {e}") + raise ToolError(f"Unable to enter hand off to human mode: {str(e)}") + + @function_tool() + async def hang_up_call( + self, + context: RunContext, + ): + """挂断当前通话(结束会话),返回成功/失败。""" + await self._send_chat_message("🔨 Call: hang_up_call") + try: + room = get_job_context().room + participant_identity = next(iter(room.remote_participants)) + response = await room.local_participant.perform_rpc( + destination_identity=participant_identity, + method="hangUpCall", + payload=json.dumps({}), + response_timeout=5.0, + ) + logger.info(f"Hung up call: {response}") + await self._send_chat_message(f"✅ Result: hang_up_call\n • status: disconnected") + return response + except Exception as e: + logger.error(f"Failed to hang up call: {e}") + raise ToolError(f"Unable to hang up call: {str(e)}") + + @function_tool() + async def ask_important_question(self, context: RunContext, message: str, options: Optional[List[str]] = None): + """询问关键问题并等待用户选择选项,返回用户的选择结果。 + + 参数: + message: 要朗读/展示的问题。 + options: 可选项列表(字符串数组),如 ["继续办理", "转人工"]。 + + 返回: + str: 用户选择的文本内容。 + """ + await self._send_chat_message(f"🔨 Call: ask_important_question\n • message: \"{message}\"\n • options: {options}") + try: + room = get_job_context().room + participant_identity = next(iter(room.remote_participants)) + + payload_data = { + "message": message, + } + + if options: + # Support both list input and JSON-stringified list input + if isinstance(options, str): + try: + options = json.loads(options) + except json.JSONDecodeError:
+ logger.error(f"Failed to parse options string: {options}") + options = None + + # Ensure options is a list of strings + if isinstance(options, list): + payload_data["options"] = options + + # Speak the message + speech_handle = self.session.say(message, allow_interruptions=True) + + # Wait for user selection with longer timeout since user needs time to respond + response = await room.local_participant.perform_rpc( + destination_identity=participant_identity, + method="askImportantQuestion", + payload=json.dumps(payload_data), + response_timeout=60.0, # Increased timeout for user interaction + ) + + # Interrupt speech if user makes a selection while agent is speaking + if speech_handle and hasattr(speech_handle, "interrupt"): + speech_handle.interrupt() + logger.info("Interrupted speech due to user selection") + + logger.info(f"User made selection: {response}") + + # Parse the response to get the user's selection + try: + response_data = json.loads(response) + user_selection = response_data.get("selection", "确认") + logger.info(f"User selected: {user_selection}") + + await self._send_chat_message(f"✅ Result: ask_important_question\n • selection: \"{user_selection}\"") + return f"用户选择了: {user_selection}" + except json.JSONDecodeError: + logger.error(f"Failed to parse response: {response}") + return f"用户选择了: {response}" + + except Exception as e: + logger.error(f"Failed to ask important question: {e}") + raise ToolError(f"Unable to ask important question: {str(e)}") + + +def prewarm(proc: JobProcess): + proc.userdata["vad"] = silero.VAD.load() + + +async def launch_avatar(ctx: JobContext, avatar_dispatcher_url: str, avatar_identity: str) -> None: + """ + Send a request to the avatar service for it to join the room + + This function should be wrapped in a avatar plugin. + """ + + # create a token for the avatar to join the room + token = ( + api.AccessToken() + .with_identity(avatar_identity) + .with_name("Avatar Runner") + .with_grants(api.VideoGrants(room_join=True, room=ctx.room.name)) + .with_kind("agent") + .with_attributes({ATTRIBUTE_PUBLISH_ON_BEHALF: ctx.local_participant_identity}) + .to_jwt() + ) + + logger.info(f"Sending connection info to avatar dispatcher {avatar_dispatcher_url}") + connection_info = AvatarConnectionInfo(room_name=ctx.room.name, url=ctx._info.url, token=token) + async with httpx.AsyncClient() as client: + response = await client.post(avatar_dispatcher_url, json=asdict(connection_info)) + response.raise_for_status() + logger.info("Avatar handshake completed") + + +async def entrypoint(ctx: JobContext, avatar_dispatcher_url: str = None, vision_backend: str = "siliconflow", llm_backend: str = "deepseek"): + # each log entry will include these fields + ctx.log_context_fields = { + "room": ctx.room.name, + } + + # IMPORTANT: Connect to room first + logger.info("connecting to room") + await ctx.connect() + + logger.info("waiting for participant") + participant = await ctx.wait_for_participant() + logger.info(f"starting agent for participant {participant.identity}") + + current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + logger.info(f"Current time: {current_time}") + + initial_voice_id = "BV001_streaming" # Female voice + if participant.attributes.get("voice"): + initial_voice_id = participant.attributes.get("voice") + logger.info(f"User selected voice: {initial_voice_id}") + + # initial_instructions = """Your name is Kelly. You would interact with users via voice. + # with that in mind keep your responses concise and to the point. 
+ # do not use emojis, asterisks, markdown, or other special characters in your responses. + # You are curious and friendly, and have a sense of humor. + # you will speak chinese to the user" + # """ + initial_instructions = DEFAULT_INSTRUCTIONS + if participant.attributes.get("instructions"): + initial_instructions = participant.attributes.get("instructions") + logger.info(f"User selected instructions: {initial_instructions}") + + # Only replace the datetime placeholder to avoid KeyError from other braces in the prompt + initial_instructions = initial_instructions.replace("{datetime}", current_time) + logger.info(f"Initial instructions: {initial_instructions}") + + if llm_backend == "dashscope": + logger.info("Using DashScope DeepSeek backend") + llm = openai.LLM( + base_url="https://dashscope.aliyuncs.com/compatible-mode/v1", + api_key=os.getenv("DASHSCOPE_API_KEY"), + model="deepseek-v3.2" + ) + else: + logger.info("Using default DeepSeek backend") + llm = openai.LLM.with_deepseek( + model='deepseek-chat' + ) + + session = AgentSession( + vad=ctx.proc.userdata["vad"], + # any combination of STT, LLM, TTS, or realtime API can be used + # stt = aliyun.STT(model="paraformer-realtime-v2"), + stt = volcengine.BigModelSTT( + app_id="8607675070" + ), + # stt = volcengine.STT( + # app_id="2931820332", + # cluster="volcengine_streaming_common" + # ), + llm=llm, + # tts=aliyun.TTS(model="cosyvoice-v2", voice="longcheng_v2"), + tts=volcengine.TTS( + app_id="2931820332", + cluster="volcano_tts", + voice=initial_voice_id, + sample_rate=8000, + speed=1.1, + ), + # allow the LLM to generate a response while waiting for the end of turn + preemptive_generation=True, + # sometimes background noise could interrupt the agent session, these are considered false positive interruptions + # when it's detected, you may resume the agent's speech + resume_false_interruption=True, + false_interruption_timeout=1.0, + ) + + # log metrics as they are emitted, and total usage after session is over + usage_collector = metrics.UsageCollector() + + @session.on("metrics_collected") + def _on_metrics_collected(ev: MetricsCollectedEvent): + metrics.log_metrics(ev.metrics) + usage_collector.collect(ev.metrics) + + async def log_usage(): + summary = usage_collector.get_summary() + logger.info(f"Usage: {summary}") + + # shutdown callbacks are triggered when the session is over + ctx.add_shutdown_callback(log_usage) + + # Launch avatar if avatar_dispatcher_url is provided + if avatar_dispatcher_url: + await launch_avatar(ctx, avatar_dispatcher_url, AVATAR_IDENTITY) + session.output.audio = DataStreamAudioOutput( + ctx.room, + destination_identity=AVATAR_IDENTITY, + # (optional) wait for the avatar to publish video track before generating a reply + wait_remote_track=rtc.TrackKind.KIND_VIDEO, + ) + + @session.output.audio.on("playback_finished") + def on_playback_finished(ev: PlaybackFinishedEvent) -> None: + # the avatar should notify when the audio playback is finished + logger.info( + "playback_finished", + extra={ + "playback_position": ev.playback_position, + "interrupted": ev.interrupted, + }, + ) + + await session.start( + agent=MyAgent(initial_instructions, vision_backend=vision_backend), + room=ctx.room, + room_input_options=RoomInputOptions( + # uncomment to enable Krisp BVC noise cancellation + # noise_cancellation=noise_cancellation.BVC(), + ), + room_output_options=RoomOutputOptions(transcription_enabled=True), + ) + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + 
parser.add_argument("--avatar-url", type=str, default=None, help="Avatar dispatcher URL") + parser.add_argument("--vision-backend", type=str, default="aliyun", choices=["siliconflow", "aliyun"], help="Vision API backend") + parser.add_argument("--llm-backend", type=str, default="dashscope", choices=["deepseek", "dashscope"], help="LLM backend") + args, remaining_args = parser.parse_known_args() + sys.argv = sys.argv[:1] + remaining_args + + if args.avatar_url: + cli.run_app(WorkerOptions(entrypoint_fnc=partial(entrypoint, avatar_dispatcher_url=args.avatar_url, vision_backend=args.vision_backend, llm_backend=args.llm_backend), prewarm_fnc=prewarm)) + else: + cli.run_app(WorkerOptions(entrypoint_fnc=partial(entrypoint, vision_backend=args.vision_backend, llm_backend=args.llm_backend), prewarm_fnc=prewarm)) diff --git a/src/components/chat/ChatMessageInput.tsx b/src/components/chat/ChatMessageInput.tsx index 37205e5..9b17787 100644 --- a/src/components/chat/ChatMessageInput.tsx +++ b/src/components/chat/ChatMessageInput.tsx @@ -6,6 +6,7 @@ type ChatMessageInput = { accentColor: string; height: number; onSend?: (message: string) => void; + disabled?: boolean; }; export const ChatMessageInput = ({ @@ -13,6 +14,7 @@ export const ChatMessageInput = ({ accentColor, height, onSend, + disabled = false, }: ChatMessageInput) => { const [message, setMessage] = useState(""); const [inputTextWidth, setInputTextWidth] = useState(0); @@ -79,7 +81,8 @@ export const ChatMessageInput = ({ > 0 ? "12px" : "24px", caretShape: "block", @@ -108,11 +111,11 @@ export const ChatMessageInput = ({ {message.replaceAll(" ", "\u00a0")} diff --git a/src/components/chat/ChatTile.tsx b/src/components/chat/ChatTile.tsx index 0ed606e..e0c1415 100644 --- a/src/components/chat/ChatTile.tsx +++ b/src/components/chat/ChatTile.tsx @@ -16,9 +16,10 @@ type ChatTileProps = { messages: ChatMessageType[]; accentColor: string; onSend?: (message: string) => Promise; + inputDisabled?: boolean; }; -export const ChatTile = ({ messages, accentColor, onSend }: ChatTileProps) => { +export const ChatTile = ({ messages, accentColor, onSend, inputDisabled }: ChatTileProps) => { const containerRef = useRef(null); useEffect(() => { if (containerRef.current) { @@ -30,7 +31,7 @@ export const ChatTile = ({ messages, accentColor, onSend }: ChatTileProps) => {
{ placeholder="Type a message" accentColor={accentColor} onSend={onSend} + disabled={inputDisabled} />
); diff --git a/src/components/playground/PhoneSimulator.tsx b/src/components/playground/PhoneSimulator.tsx index 3969f7c..224c935 100644 --- a/src/components/playground/PhoneSimulator.tsx +++ b/src/components/playground/PhoneSimulator.tsx @@ -18,17 +18,29 @@ import { useToast } from "@/components/toast/ToasterProvider"; export interface PhoneSimulatorProps { onConnect: () => void; onDisconnect: () => void; - phoneMode?: "normal" | "capture"; + phoneMode?: "normal" | "capture" | "important_message" | "hand_off"; onCapture?: (image: File) => void; capturePrompt?: string; + importantMessage?: string; + importantMessageOptions?: string[]; + onImportantMessageAction?: (text: string) => void; } -export function PhoneSimulator({ onConnect, onDisconnect, phoneMode = "normal", onCapture, capturePrompt }: PhoneSimulatorProps) { +export function PhoneSimulator({ + onConnect, + onDisconnect, + phoneMode = "normal", + onCapture, + capturePrompt, + importantMessage, + importantMessageOptions, + onImportantMessageAction +}: PhoneSimulatorProps) { const { config, setUserSettings } = useConfig(); const { setToastMessage } = useToast(); const room = useRoomContext(); const roomState = useConnectionState(); - const { localParticipant } = useLocalParticipant(); + const { localParticipant, isMicrophoneEnabled: isMicEnabled } = useLocalParticipant(); const tracks = useTracks(); const voiceAssistant = useVoiceAssistant(); const fileInputRef = useRef(null); @@ -43,6 +55,10 @@ export function PhoneSimulator({ onConnect, onDisconnect, phoneMode = "normal", const [processingSource, setProcessingSource] = useState< "camera" | "upload" | null >(null); + const MAX_UPLOAD_MB = 10; + const isAgentSpeaking = voiceAssistant.state === "speaking"; + const wasMicEnabledRef = useRef(false); + const lastPhoneMode = useRef(phoneMode); useEffect(() => { const voiceAttr = config.settings.attributes?.find(a => a.key === "voice"); @@ -136,6 +152,40 @@ export function PhoneSimulator({ onConnect, onDisconnect, phoneMode = "normal", } }, [voiceAssistant.state]); + useEffect(() => { + const enteringMode = (mode: typeof phoneMode) => + phoneMode === mode && lastPhoneMode.current !== mode; + + // Entering important message / capture / hand_off: remember mic state and mute if needed + if (enteringMode("important_message") || enteringMode("capture") || enteringMode("hand_off")) { + wasMicEnabledRef.current = isMicEnabled; + if (isMicEnabled) { + localParticipant.setMicrophoneEnabled(false); + } + } + // Exiting important message mode or hand off mode or capture mode + else if ( + (phoneMode !== "important_message" && lastPhoneMode.current === "important_message") || + (phoneMode !== "hand_off" && lastPhoneMode.current === "hand_off") || + (phoneMode !== "capture" && lastPhoneMode.current === "capture") + ) { + // Restore mic to previous state + localParticipant.setMicrophoneEnabled(wasMicEnabledRef.current); + + // If exiting capture mode, clear processing image + if (lastPhoneMode.current === "capture") { + setProcessingImage(null); + setProcessingSource(null); + } + } + // Enforce mic off in important message mode, hand off mode, or capture mode + else if ((phoneMode === "important_message" || phoneMode === "hand_off" || phoneMode === "capture") && isMicEnabled) { + localParticipant.setMicrophoneEnabled(false); + } + + lastPhoneMode.current = phoneMode; + }, [phoneMode, isMicEnabled, localParticipant]); + useEffect(() => { const updateTime = () => { const now = new Date(); @@ -159,8 +209,6 @@ export function PhoneSimulator({ onConnect, 
onDisconnect, phoneMode = "normal", ({ source }) => source === Track.Source.Camera ); - const isMicEnabled = localParticipant.isMicrophoneEnabled; - const handleMicToggle = async () => { if (isMicEnabled) { await localParticipant.setMicrophoneEnabled(false); @@ -173,6 +221,20 @@ export function PhoneSimulator({ onConnect, onDisconnect, phoneMode = "normal", onDisconnect(); }; + const validateImageFile = (file: File) => { + const isImage = file.type.startsWith("image/"); + const isSizeOk = file.size > 0 && file.size <= MAX_UPLOAD_MB * 1024 * 1024; + return { + isValid: isImage && isSizeOk, + isImage, + isSizeOk, + }; + }; + + const showErrorToast = (message: string) => { + setToastMessage({ message, type: "error" }); + }; + const handleCapture = async () => { if (!localCameraTrack || !onCapture || isCapturing) return; setIsCapturing(true); @@ -278,6 +340,14 @@ export function PhoneSimulator({ onConnect, onDisconnect, phoneMode = "normal", const file = new File([blob], "camera-capture.jpg", { type: "image/jpeg", }); + const { isValid } = validateImageFile(file); + if (!isValid) { + setProcessingImage(null); + setProcessingSource(null); + showErrorToast(`请拍摄${MAX_UPLOAD_MB}MB以内的图片文件。`); + setIsCapturing(false); + return; + } onCapture(file); } setIsCapturing(false); @@ -340,9 +410,17 @@ export function PhoneSimulator({ onConnect, onDisconnect, phoneMode = "normal", const handleFileChange = (event: React.ChangeEvent) => { const file = event.target.files?.[0]; if (file && onCapture) { - onCapture(file); - setProcessingImage(URL.createObjectURL(file)); - setProcessingSource("upload"); + const { isValid, isImage, isSizeOk } = validateImageFile(file); + if (!isValid) { + const msg = !isImage + ? "请选择图片文件。" + : `请上传${MAX_UPLOAD_MB}MB以内的图片文件。`; + showErrorToast(msg); + } else { + onCapture(file); + setProcessingImage(URL.createObjectURL(file)); + setProcessingSource("upload"); + } } // Reset input so the same file can be selected again if (event.target) { @@ -461,9 +539,26 @@ export function PhoneSimulator({ onConnect, onDisconnect, phoneMode = "normal", .scan-animation { animation: scan 3s linear infinite; } + @keyframes handoffPulse { + 0%, 100% { + opacity: 1; + transform: scale(1); + } + 50% { + opacity: 0.8; + transform: scale(1.05); + } + } + @keyframes handoffFade { + 0%, 100% { opacity: 1; } + 50% { opacity: 0.6; } + } + .handoff-animation { + animation: handoffPulse 2s ease-in-out infinite, handoffFade 2s ease-in-out infinite; + } `} {/* Status Bar */} -
+
{currentTime}
@@ -471,12 +566,17 @@ export function PhoneSimulator({ onConnect, onDisconnect, phoneMode = "normal",
- {/* Dynamic Island / Notch Placeholder */} -
- {/* Main Content */} -
+
+
{videoContent} +
{processingImage && (
{/* eslint-disable-next-line @next/next/no-img-element */} @@ -492,8 +592,11 @@ export function PhoneSimulator({ onConnect, onDisconnect, phoneMode = "normal",
-
- 正在分析... +
+ 正在分析照片,请稍候...
)} @@ -530,11 +633,68 @@ export function PhoneSimulator({ onConnect, onDisconnect, phoneMode = "normal",
)} - {/* Agent Audio Visualizer (Draggable) */} - {roomState === ConnectionState.Connected && voiceAssistant.audioTrack && ( +
+ + {/* Important Message Overlay */} + {roomState === ConnectionState.Connected && phoneMode === "important_message" && ( +
+
+
+

+ {importantMessage} +

+
+
+ {importantMessageOptions && importantMessageOptions.length > 0 ? ( + importantMessageOptions.map((option, index) => ( + + )) + ) : ( + + )} +
+
+
+ )} + + {/* Hand Off Mode Overlay */} + {roomState === ConnectionState.Connected && phoneMode === "hand_off" && ( +
+
+ 转人工... +
+
+ )} + + {/* Agent Audio Visualizer (Draggable) */} + {roomState === ConnectionState.Connected && voiceAssistant.audioTrack && phoneMode !== "hand_off" && (
)} -
{/* Call Controls Overlay */} {roomState === ConnectionState.Connected && ( -
- {phoneMode === "capture" ? ( -
+ phoneMode === "capture" ? ( +
{/* Camera Controls Row */}
{/* Left: Upload */} {/* Center: Capture */} @@ -579,8 +747,13 @@ export function PhoneSimulator({ onConnect, onDisconnect, phoneMode = "normal", {/* Right: Switch Camera */}
@@ -607,24 +780,8 @@ export function PhoneSimulator({ onConnect, onDisconnect, phoneMode = "normal",
- {/* Call Controls Row */} + {/* Call Controls Row (mic hidden in capture mode) */}
- {/* Mic Toggle */} - - {/* End Call */}
) : ( -
+
- + {phoneMode !== "important_message" && phoneMode !== "hand_off" && ( + + )}
- )} -
+ ) )}
); diff --git a/src/components/playground/Playground.tsx b/src/components/playground/Playground.tsx index 0cfa4b4..4a14ee4 100644 --- a/src/components/playground/Playground.tsx +++ b/src/components/playground/Playground.tsx @@ -30,7 +30,7 @@ import { } from "@livekit/components-react"; import { ConnectionState, LocalParticipant, Track, RpcError, RpcInvocationData } from "livekit-client"; import { QRCodeSVG } from "qrcode.react"; -import { ReactNode, useCallback, useEffect, useMemo, useState } from "react"; +import { ReactNode, useCallback, useEffect, useMemo, useRef, useState } from "react"; import tailwindTheme from "../../lib/tailwindTheme.preval"; import { EditableNameValueRow } from "@/components/config/NameValueRow"; import { AttributesInspector } from "@/components/config/AttributesInspector"; @@ -58,15 +58,19 @@ export default function Playground({ const { name } = useRoomInfo(); const [transcripts, setTranscripts] = useState([]); const { localParticipant } = useLocalParticipant(); - const { send: sendChat } = useChat(); + const { send: sendChat, chatMessages } = useChat(); const voiceAssistant = useVoiceAssistant(); const roomState = useConnectionState(); const tracks = useTracks(); const room = useRoomContext(); - const [phoneMode, setPhoneMode] = useState<"normal" | "capture">("normal"); + const [phoneMode, setPhoneMode] = useState<"normal" | "capture" | "important_message" | "hand_off">("normal"); const [capturePrompt, setCapturePrompt] = useState(""); + const [importantMessage, setImportantMessage] = useState(""); + const [importantMessageOptions, setImportantMessageOptions] = useState([]); + const importantMessageResolverRef = useRef<((value: string) => void) | null>(null); + const imageCaptureResolverRef = useRef<((value: string) => void) | null>(null); const [rpcMethod, setRpcMethod] = useState(""); const [rpcPayload, setRpcPayload] = useState(""); @@ -106,39 +110,115 @@ export default function Playground({ ); localParticipant.registerRpcMethod( - 'enterImageCaptureMode', + 'askImageCapture', async (data: RpcInvocationData) => { - if (data.payload) { - try { - const payload = JSON.parse(data.payload); - if (payload.prompt) { - setCapturePrompt(payload.prompt); - } - } catch (e) { - console.error("Failed to parse enterImageCaptureMode payload", e); - } - } - setPhoneMode("capture"); + // Return a promise that will be resolved when user captures/uploads an image + return new Promise((resolve) => { + if (data.payload) { + try { + const payload = JSON.parse(data.payload); + if (payload.prompt) { + setCapturePrompt(payload.prompt); + } + } catch (e) { + console.error("Failed to parse askImageCapture payload", e); + } + } + setPhoneMode("capture"); + // Store the resolver to be called when user captures/uploads an image + imageCaptureResolverRef.current = (imageData: string) => { + resolve(imageData); + }; + }); + } + ); + + localParticipant.registerRpcMethod( + 'enterHandOffToHumanMode', + async () => { + setPhoneMode("hand_off"); return JSON.stringify({ success: true }); } ); localParticipant.registerRpcMethod( - 'exitImageCaptureMode', + 'hangUpCall', async () => { - setPhoneMode("normal"); - setCapturePrompt(""); + // Disconnect the call + onConnect(false); return JSON.stringify({ success: true }); } ); - }, [localParticipant, roomState]); + + localParticipant.registerRpcMethod( + 'askImportantQuestion', + async (data: RpcInvocationData) => { + // Return a promise that will be resolved when user makes a selection + return new Promise((resolve) => { + if (data.payload) { + try 
{ + const payload = JSON.parse(data.payload); + if (payload.message) { + setImportantMessage(payload.message); + } + if (payload.options && Array.isArray(payload.options)) { + setImportantMessageOptions(payload.options); + } else { + // Default fallback if no options provided + setImportantMessageOptions(["确认"]); + } + } catch (e) { + console.error("Failed to parse askImportantQuestion payload", e); + } + } + setPhoneMode("important_message"); + // Store the resolver to be called when user makes a selection + importantMessageResolverRef.current = (selection: string) => { + resolve(JSON.stringify({ selection })); + }; + }); + } + ); + }, [localParticipant, roomState, onConnect]); useEffect(() => { if (roomState === ConnectionState.Connected) { setPhoneMode("normal"); + } else if (roomState === ConnectionState.Disconnected) { + setPhoneMode("normal"); + // Clean up any pending important message RPC + if (importantMessageResolverRef.current) { + importantMessageResolverRef.current("disconnected"); + importantMessageResolverRef.current = null; + } + // Clean up any pending image capture RPC + if (imageCaptureResolverRef.current) { + imageCaptureResolverRef.current(JSON.stringify({ error: "disconnected" })); + imageCaptureResolverRef.current = null; + } } }, [roomState]); + useEffect(() => { + // When agent starts speaking, exit capture mode (and PhoneSimulator will clear processing image) + // Only exit if we are NOT waiting for capture (resolver is null), meaning this speech is likely the analysis result + if (voiceAssistant.state === "speaking" && phoneMode === "capture" && !imageCaptureResolverRef.current) { + setPhoneMode("normal"); + setCapturePrompt(""); + } + }, [voiceAssistant.state, phoneMode]); + + useEffect(() => { + // Also exit capture mode if we receive a completion message in chat (in case agent doesn't speak immediately) + if (chatMessages.length > 0) { + const lastMsg = chatMessages[chatMessages.length - 1]; + if (lastMsg.message && lastMsg.message.includes("✅ Result: ask_image_capture") && phoneMode === "capture") { + setPhoneMode("normal"); + setCapturePrompt(""); + } + } + }, [chatMessages, phoneMode]); + useEffect(() => { if (!localParticipant || roomState !== ConnectionState.Connected) { return; @@ -191,6 +271,18 @@ export default function Playground({ ({ source }) => source === Track.Source.Microphone, ); + const handleImportantMessageAction = useCallback(async (text: string) => { + setPhoneMode("normal"); + setImportantMessage(""); + setImportantMessageOptions([]); + + // Resolve the RPC with the user's selection + if (importantMessageResolverRef.current) { + importantMessageResolverRef.current(text || "确认"); + importantMessageResolverRef.current = null; + } + }, []); + const onDataReceived = useCallback( (msg: any) => { if (msg.topic === "transcription") { @@ -317,6 +409,7 @@ export default function Playground({ ); } @@ -325,6 +418,7 @@ export default function Playground({ config.settings.theme_color, voiceAssistant.audioTrack, voiceAssistant.agent, + phoneMode, ]); const instructionsContent = ( @@ -630,10 +724,23 @@ export default function Playground({ onDisconnect={() => onConnect(false)} phoneMode={phoneMode} capturePrompt={capturePrompt} + importantMessage={importantMessage} + importantMessageOptions={importantMessageOptions} + onImportantMessageAction={handleImportantMessageAction} onCapture={async (content: File) => { if (localParticipant) { + // Send file via LiveKit byte stream await localParticipant.sendFile(content, { topic: "image" }); - await 
sendChat("用户上传了照片" ); + + // Resolve RPC to signal completion (without image data) + if (imageCaptureResolverRef.current) { + // Do NOT exit capture mode immediately - wait for agent response (voiceAssistant.state === "speaking") + // setPhoneMode("normal"); + // setCapturePrompt(""); + imageCaptureResolverRef.current(JSON.stringify({ success: true })); + imageCaptureResolverRef.current = null; + } + } }} /> @@ -717,10 +824,23 @@ export default function Playground({ onDisconnect={() => onConnect(false)} phoneMode={phoneMode} capturePrompt={capturePrompt} + importantMessage={importantMessage} + importantMessageOptions={importantMessageOptions} + onImportantMessageAction={handleImportantMessageAction} onCapture={async (content: File) => { if (localParticipant) { + // Send file via LiveKit byte stream await localParticipant.sendFile(content, { topic: "image" }); - await sendChat("用户上传了一张照片"); + + // Resolve RPC to signal completion (without image data) + if (imageCaptureResolverRef.current) { + // Do NOT exit capture mode immediately - wait for agent response (voiceAssistant.state === "speaking") + // setPhoneMode("normal"); + // setCapturePrompt(""); + imageCaptureResolverRef.current(JSON.stringify({ success: true })); + imageCaptureResolverRef.current = null; + } + } }} /> diff --git a/src/transcriptions/TranscriptionTile.tsx b/src/transcriptions/TranscriptionTile.tsx index 73dbb6e..f2129d3 100644 --- a/src/transcriptions/TranscriptionTile.tsx +++ b/src/transcriptions/TranscriptionTile.tsx @@ -16,9 +16,11 @@ import { useEffect, useState } from "react"; export function TranscriptionTile({ agentAudioTrack, accentColor, + inputDisabled, }: { agentAudioTrack?: TrackReferenceOrPlaceholder; accentColor: string; + inputDisabled?: boolean; }) { const agentMessages = useTrackTranscription(agentAudioTrack || undefined); const localParticipant = useLocalParticipant(); @@ -97,7 +99,7 @@ export function TranscriptionTile({ ]); return ( - + ); }