diff --git a/agents/.env.example b/agents/.env.example new file mode 100644 index 0000000..9869c6a --- /dev/null +++ b/agents/.env.example @@ -0,0 +1,22 @@ +LIVEKIT_API_SECRET="secret" +LIVEKIT_API_KEY="devkey" +LIVEKIT_URL="ws://127.0.0.1:7880" + +MINIMAX_API_KEY="aa" + +DEEPSEEK_API_KEY="bb" + +AZURE_SPEECH_KEY="aa" +AZURE_SPEECH_REGION="eastasia" + +CARTESIA_API_KEY="bb" +CARTESIA_LANGUAGE="zh" + +SILICONFLOW_API_KEY="aa" + +DASHSCOPE_API_KEY="aa" + +VOLCENGINE_TTS_ACCESS_TOKEN="aa" +VOLCENGINE_STT_ACCESS_TOKEN="aa" +VOLCENGINE_LLM_API_KEY="aa" +VOLCENGINE_REALTIME_ACCESS_TOKEN="aa" \ No newline at end of file diff --git a/agents/README.md b/agents/README.md new file mode 100644 index 0000000..e69de29 diff --git a/agents/my_basic_agent_1_2_9.py b/agents/my_basic_agent_1_2_9.py new file mode 100644 index 0000000..f390a2f --- /dev/null +++ b/agents/my_basic_agent_1_2_9.py @@ -0,0 +1,1012 @@ +import argparse +import asyncio +import base64 +import json +import logging +import os +import sys +import re +from dataclasses import asdict, dataclass +from functools import partial + +import aiohttp +import httpx +from dotenv import load_dotenv + +from livekit import api, rtc +from livekit.agents import ( + Agent, + AgentSession, + AudioConfig, + BackgroundAudioPlayer, + BuiltinAudioClip, + JobContext, + JobProcess, + MetricsCollectedEvent, + RoomInputOptions, + RoomOutputOptions, + RunContext, + WorkerOptions, + cli, + get_job_context, + metrics, +) +from livekit.agents.llm import ImageContent, ToolError, function_tool +from typing import Any, List, Optional +from livekit.agents.voice.avatar import DataStreamAudioOutput +from livekit.agents.voice.io import PlaybackFinishedEvent +from livekit.agents.voice.room_io import ATTRIBUTE_PUBLISH_ON_BEHALF +from livekit.plugins import silero +from livekit.plugins import openai, azure, minimax, aliyun, volcengine + +from datetime import datetime + +logger = logging.getLogger("basic-agent") + +load_dotenv() + +AVATAR_IDENTITY = "avatar_worker" + +DEFAULT_INSTRUCTIONS = """#角色 +你是一个高度集成、安全第一的交通事故处理AI智能体。 + +你像真人一样和用户对话,而不是机器人。你收到的文字是经过ASR识别的语音输入。 + +当前时间:{datetime} + +#对话要求 +你主动对话并推进事故信息采集的完成。语言简洁,一次询问一个问题。 +不要在你的回复中使用 emojis, asterisks, markdown, 或其他特殊字符。 +你可以对一个用户回复多次调用工具,比如askImageCapture没有得到合适的照片的时候,可以继续调用askImageCapture工具让用户重新拍摄,直到得到合适的照片为止。 +只有在我要求使用askImportantQuestion的时候才去调用,否则禁止使用askImportantQuestion工具。 + +你已经说过下面的开场白所以不需要重复说: +“您好,这里是无锡交警,我将为您远程处理交通事故。请将人员撤离至路侧安全区域,开启危险报警双闪灯、放置三角警告牌、做好安全防护,谨防二次事故伤害。若您已经准备好了,请点击继续办理,如需人工服务,请说转人工。” + +#任务 +你的职责是全流程引导用户完成:事故情况初审 -> 现场证据拍照 -> 驾驶员信息核实。 + +##事故初审阶段 +在事故情况初审阶段:询问是否有人受伤,请求用户简单描述事故情况,询问事故发生时间并通过复述标准化时间向用户确认,询问事故车辆数量,询问事故发生的原因(例如追尾、刮擦、碰撞等)。采集完成后进入现场证据拍照阶段。 +如果用户回答已包含需要问题的答案,改为与用户确认答案是否正确。 +禁止在事故初审阶段使用askImageCapture工具。只有在完成事故信息采集后,才能进入现场证据拍照阶段并使用askImageCapture。 + +##现场证据拍照阶段 +使用askImageCapture工具引导用户依次拍摄事故现场照片,驾驶员正脸照片,车牌号 +每次拍摄完成后立即判断是否符合要求:符合则直接进入下一步;不符合则立即再次调用askImageCapture让用户重新拍摄,并明确指出问题与改进要求 +ask_image_capture的分析结果中只要target_found为false或者quality_ok为false,就必须向用户解释问题(结合quality_notes或缺失的目标),并立刻再次调用askImageCapture给出更具体的改进提示;在获得有效照片之前禁止进入下一步 +拍完需要的照片后,复述车牌号并让用户确认或修正;确认后进入驾驶员信息核实。 + +##驾驶员信息核实阶段 +询问司机的姓名。 +之后根据车牌号查询驾驶员手机号,如果查询到则用手机号后四位向用户确认,如果未查询到或者用户告知手机号后四位错误,则询问用户手机号。 +接着根据车牌号查询驾驶员身份证号,如果查询到则用身份证号后四位向用户确认,如果未查询到或者用户告知身份证号后四位错误,则询问用户身份证号。 +之后告诉用户采集完成,显示重要消息显示已经采集的信息,提示用户点击转人工继续处理。 + + +## 黄金对话路径示例 (GOLDEN_CONVERSATION_PATH) + +``` +# 事故初审阶段 +AI: 您好,请问现场有人受伤吗? +用户: 没有人员受伤。 +AI: 请您简单描述一下事故的经过。 +用户: 我的车等红灯时被后面的车追尾了。 +AI: 请问事故大概发生在什么时间? +用户: 今天早上八点左右发生的。 +AI: 一共涉及几辆车? +用户: 两辆。 +AI: 事故主要原因是追尾,对吗? 
+用户: 对,是被追尾。 +AI: 好的,接下来我们将进行现场证据采集。 + +# 现场证据拍照阶段 +AI: 现在需要采集事故现场照片,请先拍摄事故现场的整体照片并上传。 +(调用 ask_image_capture,prompt="请拍摄事故现场的整体照片", extract_target="事故现场", requirements="清晰、完整,无模糊和遮挡") +工具返回: {"success": true, "analysis": {"target_found": false, "quality_ok": false, "quality_notes": ["画面模糊"]}, "raw": "..."} +用户上传模糊照片 +AI:您上传的照片不够清晰或存在遮挡,请确保画面清晰、完整地拍摄事故现场,请重新拍摄并上传。 +(调用 ask_image_capture,prompt="请重新清晰完整地拍摄事故现场照片", extract_target="事故现场", requirements="清晰、完整,无模糊和遮挡") +工具返回: {"success": true, "analysis": {"target_found": true, "quality_ok": true, "summary": "清晰的事故现场概览"}, "raw": "..."} +用户上传清晰照片 +AI:很好,事故现场照片已收到。接下来请拍摄驾驶员正脸照片。 +(调用 ask_image_capture,prompt="请拍摄驾驶员正脸照片", extract_target="驾驶员正脸", requirements="面部清晰、正对镜头") +工具返回: {"success": true, "analysis": {"target_found": false, "quality_ok": false, "quality_notes": ["未正对镜头"]}, "raw": "..."} +用户上传侧脸照片 +AI:检测到照片未正对镜头或面部不清楚,请正对镜头并确保面部清晰,再次拍摄并上传正脸照片。 +(调用 ask_image_capture,prompt="请正对镜头并拍摄清晰的驾驶员正脸照片", extract_target="驾驶员正脸", requirements="面部清晰、正对镜头") +工具返回: {"success": true, "analysis": {"target_found": true, "quality_ok": true, "summary": "驾驶员正脸清晰"}, "raw": "..."} +用户上传正脸照片 +AI:驾驶员正脸照片已收到。最后,请拍摄并上传车辆号牌的清晰照片。 +(调用 ask_image_capture,prompt="请拍摄车辆号牌照片", extract_target="车辆号牌", requirements="完整、清晰、无遮挡") +工具返回: {"success": true, "analysis": {"target_found": true, "quality_ok": true, "summary": "车牌完整可读", "target_details": "苏A12345"}, "raw": "..."} +用户上传合格车牌照片 +AI:我识别到车牌号是“苏A12345”,请确认是否正确,如需修改请告知。 + +# 驾驶员信息核实阶段 +AI: 请提供您的姓名,用于信息核实。 +用户: 张三 +AI:(调用 get_mobile_by_plate,plate_number="苏A12345") +工具返回: {"success": true, "plate": "苏A12345", "mobile": "13800001234"} +AI:系统查询到登记手机号尾号1234,请确认是否正确;如不正确,请告知真实手机号。 +用户: 正确 +AI:(调用 get_id_card_by_plate,plate_number="苏A12345") +工具返回: {"success": true, "plate": "苏A12345", "id_card": "320101198001011234"} +AI:系统查询到登记身份证尾号1234,请确认是否正确;如不正确,请告知完整身份证号。 +用户: 正确 +AI: 信息采集已完成,请点击确认进入后续办理。(调用 askImportantQuestion,message="请确认已采集信息:\n姓名:张三\n手机号:13800001234(尾号1234)\n身份证号:320101198001011234(尾号1234)\n车牌号:苏A12345", options=["确认无误", "需要修改"]) +工具返回: {"selection": "确认无误"} +AI: 已确认,后续流程请点击转人工继续处理。(调用 enter_hand_off_to_human_mode) +``` + + +""" + +@dataclass +class AvatarConnectionInfo: + room_name: str + url: str + """LiveKit server URL""" + token: str + """Token for avatar worker to join""" + + +class MyAgent(Agent): + def __init__(self, instructions: str, vision_backend: str = "aliyun") -> None: + self.vision_backend = vision_backend + self._tasks = [] + self._captured_images = [] + self._image_event = asyncio.Event() + super().__init__( + instructions=instructions, + ) + + async def on_enter(self): + # Register byte stream handler for image uploads from frontend + def _image_received_handler(reader, participant_identity): + task = asyncio.create_task( + self._image_received(reader, participant_identity) + ) + self._tasks.append(task) + task.add_done_callback(lambda t: self._tasks.remove(t)) + + # Add the handler when the agent joins + get_job_context().room.register_byte_stream_handler("image", _image_received_handler) + + # Proactively ask the user the initial important question via askImportantQuestion + # using the greeting message from instructions (lines 59-60) + initial_question = ( + "您好,这里是无锡交警,我将为您远程处理交通事故。请将人员撤离至路侧安全区域," + "开启危险报警双闪灯、放置三角警告牌、做好安全防护,谨防二次事故伤害。若您已经准备好了," + "请点击继续办理,如需人工服务,请说转人工。" + ) + options = ["继续办理", "转人工"] + + async def _ask_initial(): + try: + room = get_job_context().room + if not room.remote_participants: + logger.warning("No remote participants yet; skipping initial askImportantQuestion.") + return 
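+                # Treat the first remote participant as the caller: speak the greeting, then block on the askImportantQuestion RPC until they pick an option.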
+ participant_identity = next(iter(room.remote_participants)) + + # Speak the initial question so the user hears it + try: + self.session.say(initial_question, allow_interruptions=False) + except Exception as e: + logger.error(f"Failed to speak initial question: {e}") + + payload_data = { + "message": initial_question, + "options": options, + } + # Log tool call in chat + await self._send_chat_message( + f"🔨 Call: ask_important_question\n • message: \"{initial_question}\"\n • options: {options}" + ) + response = await room.local_participant.perform_rpc( + destination_identity=participant_identity, + method="askImportantQuestion", + payload=json.dumps(payload_data), + response_timeout=60.0, + ) + logger.info(f"Initial important question response: {response}") + try: + response_data = json.loads(response) + selection = response_data.get("selection", "") + await self._send_chat_message( + f"✅ Result: ask_important_question\n • selection: \"{selection}\"" + ) + except json.JSONDecodeError: + logger.error(f"Failed to parse initial response: {response}") + except Exception as e: + logger.error(f"Failed to ask initial important question: {e}") + + # when the agent is added to the session, it'll generate a reply + # according to its instructions + self.session.generate_reply() + + asyncio.create_task(_ask_initial()) + + async def _send_chat_message(self, message: str): + """Helper to send a chat message to the room.""" + try: + room = get_job_context().room + import time + # Construct payload matching what useChat likely expects or just a simple JSON + # Standard LiveKit chat format often involves a simple JSON wrapper + payload = json.dumps({ + "message": message, + "timestamp": int(time.time() * 1000) + }).encode('utf-8') + + await room.local_participant.publish_data( + payload=payload, + topic="lk-chat-topic" + ) + except Exception as e: + logger.error(f"Failed to send chat message: {e}") + + async def _image_received(self, reader, participant_identity): + """Handle image uploads from the frontend via byte stream.""" + logger.info(f"Received image upload from participant: {participant_identity}") + image_bytes = bytes() + async for chunk in reader: + image_bytes += chunk + + logger.info(f"Image received: {len(image_bytes)} bytes from {participant_identity}") + + # Store the received image bytes in memory + self._captured_images.append(image_bytes) + self._image_event.set() + + # Notify user that image was received + # chat_ctx = self.chat_ctx.copy() + # chat_ctx.add_message( + # role="user", + # content=["I have uploaded an image. You can analyze it when I ask you to."] + # ) + # await self.update_chat_ctx(chat_ctx) + + # Trigger a reply acknowledging receipt + # self.session.generate_reply() + + def _normalize_vision_response(self, raw_content: Any) -> str: + """Convert various content formats to a clean JSON string (no code fences).""" + # Handle multimodal content list responses + if isinstance(raw_content, list): + text_parts: List[str] = [] + for part in raw_content: + if isinstance(part, dict): + if part.get("type") == "text" and "text" in part: + text_parts.append(str(part["text"])) + else: + text_parts.append(str(part)) + raw_content = "".join(text_parts) + + content = str(raw_content).strip() + if content.startswith("```"): + # Strip fences like ```json ... 
``` + content = content.lstrip("`") + if content.lower().startswith("json"): + content = content[4:].lstrip() + if "```" in content: + content = content.split("```", 1)[0].strip() + return content + + async def describe_image_with_api( + self, + base64_image: str, + backend: str = "siliconflow", + prompt: str = "", + extract_target: str = "", + requirements: str = "", + ) -> str: + """ + Send base64 image to Vision API (SiliconFlow or Aliyun) and request a + JSON-formatted analysis aligned with ask_image_capture signature. + """ + if backend == "aliyun": + return await self._describe_image_aliyun( + base64_image, + prompt=prompt, + extract_target=extract_target, + requirements=requirements, + ) + else: + return await self._describe_image_siliconflow( + base64_image, + prompt=prompt, + extract_target=extract_target, + requirements=requirements, + ) + + async def _describe_image_aliyun( + self, + base64_image: str, + prompt: str, + extract_target: str, + requirements: str, + ) -> str: + logger.info("Sending image to Aliyun Vision API (Qwen-VL-Max)...") + api_key = os.getenv("DASHSCOPE_API_KEY") + if not api_key: + logger.error("DASHSCOPE_API_KEY not found") + return "Error: DASHSCOPE_API_KEY not configured." + + url = "https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions" + + analysis_prompt = ( + "请根据以下任务要求对图片进行结构化分析,并且只返回紧凑的JSON:" + f'{{"prompt": "{prompt}", ' + f'"extract_target": "{extract_target}", ' + f'"requirements": "{requirements}", ' + '"summary": "<用不超过50字中文概述图片>", ' + '"target_found": , ' + '"target_details": "<若存在目标,描述关键特征;否则写未发现>", ' + '"quality_ok": , ' + '"quality_notes": ["<不满足要求的原因,若满足可为空数组>"]}' + "。不要添加额外说明、前缀或代码块。" + ) + + payload = { + "model": "qwen-vl-plus", + "messages": [ + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{base64_image}" + } + }, + { + "type": "text", + "text": analysis_prompt + } + ] + } + ] + } + + headers = { + "Authorization": f"Bearer {api_key}", + "Content-Type": "application/json" + } + + try: + async with aiohttp.ClientSession() as session: + async with session.post(url, json=payload, headers=headers) as response: + if response.status == 200: + data = await response.json() + raw_content = data['choices'][0]['message']['content'] + normalized = self._normalize_vision_response(raw_content) + try: + parsed = json.loads(normalized) + description = json.dumps(parsed, ensure_ascii=False) + except Exception: + description = normalized + logger.info(f"Got image description from Aliyun: {description}") + return description + else: + error_text = await response.text() + logger.error(f"Aliyun API error: {response.status} - {error_text}") + return "Failed to analyze image due to Aliyun API error." + except Exception as e: + logger.error(f"Exception calling Aliyun Vision API: {e}") + return f"Error analyzing image: {str(e)}" + + async def _describe_image_siliconflow( + self, + base64_image: str, + prompt: str, + extract_target: str, + requirements: str, + ) -> str: + """ + Send base64 image to SiliconFlow Vision API. 
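+        Returns a compact JSON string with summary, target_found, quality_ok and quality_notes fields, or an error message string if the request fails.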
+ """ + logger.info("Sending image to SiliconFlow Vision API...") + + # NOTE: You need to set your SiliconFlow API key here or in your .env file + api_key = os.getenv("SILICONFLOW_API_KEY", "your-api-key") + + url = "https://api.siliconflow.cn/v1/chat/completions" + + analysis_prompt = ( + "请根据以下任务要求输出JSON结果,不要附加任何解释或代码块:" + f'{{"prompt": "{prompt}", ' + f'"extract_target": "{extract_target}", ' + f'"requirements": "{requirements}", ' + '"summary": "<50字内中文概述图片>", ' + '"target_found": , ' + '"target_details": "<若找到目标描述关键特征,否则写未发现>", ' + '"quality_ok": , ' + '"quality_notes": ["<若不满足拍摄要求,列出问题;满足则为空数组>"]}' + ) + + payload = { + "model": "Qwen/Qwen3-VL-8B-Instruct", + "messages": [ + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{base64_image}" + } + }, + { + "type": "text", + "text": analysis_prompt + } + ] + } + ], + "max_tokens": 512 + } + + headers = { + "Authorization": f"Bearer {api_key}", + "Content-Type": "application/json" + } + + try: + async with aiohttp.ClientSession() as session: + async with session.post(url, json=payload, headers=headers) as response: + if response.status == 200: + data = await response.json() + raw_content = data['choices'][0]['message']['content'] + normalized = self._normalize_vision_response(raw_content) + try: + parsed = json.loads(normalized) + description = json.dumps(parsed, ensure_ascii=False) + except Exception: + description = normalized + logger.info(f"Got image description: {description}") + return description + else: + error_text = await response.text() + logger.error(f"SiliconFlow API error: {response.status} - {error_text}") + return "Failed to analyze image due to API error." + except Exception as e: + logger.error(f"Exception calling Vision API: {e}") + return f"Error analyzing image: {str(e)}" + + @function_tool() + async def ask_image_capture( + self, + context: RunContext, + prompt: str = "请拍摄照片", + extract_target: str = "车辆号牌", + requirements: str = "清晰完整、无模糊", + ): + """ + 请求用户根据提供的提示语拍摄或上传照片,并检查照片中是否包含指定的提取目标且满足相关要求。 + + 参数: + context (RunContext): 代理运行上下文。 + prompt (str, 可选): 提示用户拍摄什么内容(例如:“请拍摄事故现场照片”,“请拍摄车牌号”)。默认值为“请拍摄照片”。 + extract_target (str, 可选): 期望照片中包含的主要目标(例如:“车辆号牌”)。默认值为“车辆号牌”。 + requirements (str, 可选): 拍摄的具体要求说明(例如:清晰、构图完整)。默认值为“清晰完整、无模糊”。 + + 返回: + dict: 简洁可序列化的工具结果。 + - success: bool + - analysis: 解析后的结果(解析失败则为字符串) + - raw: 原始文本结果(字符串) + - error/details: 仅失败时返回 + """ + await self._send_chat_message( + "┌─🔨 Call: ask_image_capture\n" + f"│ prompt: \"{prompt}\"\n" + "└───────────────" + ) + try: + room = get_job_context().room + participant_identity = next(iter(room.remote_participants)) + + # Clear previous image event + self._image_event.clear() + + # Speak the capture prompt so the user hears what to do + self.session.say(prompt, allow_interruptions=True) + + # Ask for image capture and wait for user to capture/upload + response = await room.local_participant.perform_rpc( + destination_identity=participant_identity, + method="askImageCapture", + payload=json.dumps({ + "prompt": prompt, + }), + response_timeout=120.0, # Longer timeout for user to capture image + ) + logger.info(f"Received image capture response: {response}") + + # Parse the response to check for success/error (image data comes via byte stream) + try: + response_data = json.loads(response) + + if "error" in response_data: + return { + "success": False, + "error": "图片采集失败", + "details": response_data.get("error"), + } + + # Wait for the image byte stream to complete (signaled by 
_image_received) + if not self._image_event.is_set(): + try: + # Give a reasonable buffer for file transfer to finish after RPC returns + await asyncio.wait_for(self._image_event.wait(), timeout=10.0) + except asyncio.TimeoutError: + if not self._captured_images: + return { + "success": False, + "error": "未收到图片数据", + "details": "Timeout waiting for file stream", + } + + # Use the LAST stored image + last_image_bytes = self._captured_images[-1] + logger.info(f"Analyzing last stored image, size: {len(last_image_bytes)} bytes") + + base64_image = base64.b64encode(last_image_bytes).decode('utf-8') + + # Analyze the image using the configured vision backend + description = await self.describe_image_with_api( + base64_image, + self.vision_backend, + prompt=prompt, + extract_target=extract_target, + requirements=requirements, + ) + logger.info(f"Image analysis result: {description}") + + # Try to parse analysis as JSON; fall back to raw string + raw_text = description if isinstance(description, str) else json.dumps(description, ensure_ascii=False) + parsed_analysis = raw_text + try: + parsed_analysis = json.loads(raw_text) + except Exception: + parsed_analysis = raw_text + + # Derive validity signals for the LLM to decide on retries + valid = False + problems = [] + target_details = None + if isinstance(parsed_analysis, dict): + target_found = bool(parsed_analysis.get("target_found")) + quality_ok = bool(parsed_analysis.get("quality_ok")) + valid = target_found and quality_ok + target_details = parsed_analysis.get("target_details") + if not target_found: + problems.append(f"未发现{extract_target}") + if not quality_ok: + notes = parsed_analysis.get("quality_notes") or [] + problems.extend(notes) + + await self._send_chat_message( + "┌─✅ Result: ask_image_capture\n" + f"│ valid: {valid}, problems: {problems}\n" + f"│ analysis: \"{str(raw_text)[:50]}...\"\n" + "└───────────────" + ) + + return { + "success": True, + "analysis": parsed_analysis, + "raw": raw_text, + "valid": valid, + "problems": problems, + "target_details": target_details, + } + + except json.JSONDecodeError: + logger.error(f"Failed to parse response: {response}") + return { + "success": False, + "error": "图片数据解析失败", + "details": response, + } + + except Exception as e: + logger.error(f"Failed to capture and analyze image: {e}") + raise ToolError(f"无法完成图片采集和分析: {str(e)}") + + @function_tool() + async def get_mobile_by_plate( + self, + context: RunContext, + plate_number: str, + ): + """ + 根据车牌号查询登记的手机号。 + + 返回: + dict: { success: bool, plate: str, mobile: str | None, error?: str } + """ + normalized_plate = plate_number.strip().upper() + await self._send_chat_message( + "┌─🔨 Call: get_mobile_by_plate\n" + f"│ plate: \"{normalized_plate}\"\n" + "└───────────────" + ) + # Dummy fixed response (placeholder backend) + return { + "success": True, + "plate": normalized_plate, + "mobile": "13800001234", + } + + @function_tool() + async def get_id_card_by_plate( + self, + context: RunContext, + plate_number: str, + ): + """ + 根据车牌号查询登记的身份证号。 + + 返回: + dict: { success: bool, plate: str, id_card: str | None, error?: str } + """ + normalized_plate = plate_number.strip().upper() + await self._send_chat_message( + "┌─🔨 Call: get_id_card_by_plate\n" + f"│ plate: \"{normalized_plate}\"\n" + "└───────────────" + ) + # Dummy fixed response (placeholder backend) + return { + "success": True, + "plate": normalized_plate, + "id_card": "320101198001011234", + } + + @function_tool() + async def validate_mobile_number( + self, + context: RunContext, + 
mobile: str, + ): + """ + 检查手机号格式是否正确(大陆 11 位手机号,1[3-9] 开头)。 + + 返回: + dict: { success: bool, valid: bool, mobile: str, error?: str } + """ + normalized = mobile.strip().replace(" ", "") + await self._send_chat_message( + "┌─🔨 Call: validate_mobile_number\n" + f"│ mobile: \"{normalized}\"\n" + "└───────────────" + ) + is_valid = bool(re.fullmatch(r"1[3-9]\d{9}", normalized)) + if is_valid: + return { + "success": True, + "valid": True, + "mobile": normalized, + } + return { + "success": True, + "valid": False, + "mobile": normalized, + "error": "手机号格式不正确,应为1[3-9]开头的11位数字", + } + + @function_tool() + async def validate_id_card_number( + self, + context: RunContext, + id_card: str, + ): + """ + 检查身份证号格式是否正确(18位含校验位X或15位纯数字)。 + + 返回: + dict: { success: bool, valid: bool, id_card: str, error?: str } + """ + normalized = id_card.strip().replace(" ", "").upper() + await self._send_chat_message( + "┌─🔨 Call: validate_id_card_number\n" + f"│ id_card: \"{normalized}\"\n" + "└───────────────" + ) + is_valid = bool(re.fullmatch(r"(\d{17}[\dX]|\d{15})", normalized)) + if is_valid: + return { + "success": True, + "valid": True, + "id_card": normalized, + } + return { + "success": True, + "valid": False, + "id_card": normalized, + "error": "身份证格式不正确,应为18位(末位可为X)或15位数字", + } + + @function_tool() + async def enter_hand_off_to_human_mode( + self, + context: RunContext, + ): + """切换到“转人工”模式(前端电话界面进入人工处理)。返回成功/失败。""" + await self._send_chat_message("🔨 Call: enter_hand_off_to_human_mode") + try: + room = get_job_context().room + participant_identity = next(iter(room.remote_participants)) + response = await room.local_participant.perform_rpc( + destination_identity=participant_identity, + method="enterHandOffToHumanMode", + payload=json.dumps({}), + response_timeout=5.0, + ) + logger.info(f"Entered hand off to human mode: {response}") + await self._send_chat_message(f"✅ Result: enter_hand_off_to_human_mode\n • status: success") + return response + except Exception as e: + logger.error(f"Failed to enter hand off to human mode: {e}") + raise ToolError(f"Unable to enter hand off to human mode: {str(e)}") + + @function_tool() + async def hang_up_call( + self, + context: RunContext, + ): + """挂断当前通话(结束会话),返回成功/失败。""" + await self._send_chat_message("🔨 Call: hang_up_call") + try: + room = get_job_context().room + participant_identity = next(iter(room.remote_participants)) + response = await room.local_participant.perform_rpc( + destination_identity=participant_identity, + method="hangUpCall", + payload=json.dumps({}), + response_timeout=5.0, + ) + logger.info(f"Hung up call: {response}") + await self._send_chat_message(f"✅ Result: hang_up_call\n • status: disconnected") + return response + except Exception as e: + logger.error(f"Failed to hang up call: {e}") + raise ToolError(f"Unable to hang up call: {str(e)}") + + @function_tool() + async def ask_important_question(self, context: RunContext, message: str, options: Optional[List[str]] = None): + """询问关键问题并等待用户选择选项,返回用户的选择结果。 + + 参数: + message: 要朗读/展示的问题。 + options: 可选项列表(字符串数组),如 ["继续办理", "转人工"]。 + + 返回: + str: 用户选择的文本内容。 + """ + await self._send_chat_message(f"🔨 Call: ask_important_question\n • message: \"{message}\"\n • options: {options}") + try: + room = get_job_context().room + participant_identity = next(iter(room.remote_participants)) + + payload_data = { + "message": message, + } + + if options: + # Support both list input and JSON-stringified list input + if isinstance(options, str): + try: + options = json.loads(options) + except json.JSONDecodeError:
+ logger.error(f"Failed to parse options string: {options}") + options = None + + # Ensure options is a list of strings + if isinstance(options, list): + payload_data["options"] = options + + # Speak the message + speech_handle = self.session.say(message, allow_interruptions=True) + + # Wait for user selection with longer timeout since user needs time to respond + response = await room.local_participant.perform_rpc( + destination_identity=participant_identity, + method="askImportantQuestion", + payload=json.dumps(payload_data), + response_timeout=60.0, # Increased timeout for user interaction + ) + + # Interrupt speech if user makes a selection while agent is speaking + if speech_handle and hasattr(speech_handle, "interrupt"): + speech_handle.interrupt() + logger.info("Interrupted speech due to user selection") + + logger.info(f"User made selection: {response}") + + # Parse the response to get the user's selection + try: + response_data = json.loads(response) + user_selection = response_data.get("selection", "确认") + logger.info(f"User selected: {user_selection}") + + await self._send_chat_message(f"✅ Result: ask_important_question\n • selection: \"{user_selection}\"") + return f"用户选择了: {user_selection}" + except json.JSONDecodeError: + logger.error(f"Failed to parse response: {response}") + return f"用户选择了: {response}" + + except Exception as e: + logger.error(f"Failed to ask important question: {e}") + raise ToolError(f"Unable to ask important question: {str(e)}") + + +def prewarm(proc: JobProcess): + proc.userdata["vad"] = silero.VAD.load() + + +async def launch_avatar(ctx: JobContext, avatar_dispatcher_url: str, avatar_identity: str) -> None: + """ + Send a request to the avatar service for it to join the room + + This function should be wrapped in a avatar plugin. + """ + + # create a token for the avatar to join the room + token = ( + api.AccessToken() + .with_identity(avatar_identity) + .with_name("Avatar Runner") + .with_grants(api.VideoGrants(room_join=True, room=ctx.room.name)) + .with_kind("agent") + .with_attributes({ATTRIBUTE_PUBLISH_ON_BEHALF: ctx.local_participant_identity}) + .to_jwt() + ) + + logger.info(f"Sending connection info to avatar dispatcher {avatar_dispatcher_url}") + connection_info = AvatarConnectionInfo(room_name=ctx.room.name, url=ctx._info.url, token=token) + async with httpx.AsyncClient() as client: + response = await client.post(avatar_dispatcher_url, json=asdict(connection_info)) + response.raise_for_status() + logger.info("Avatar handshake completed") + + +async def entrypoint(ctx: JobContext, avatar_dispatcher_url: str = None, vision_backend: str = "siliconflow", llm_backend: str = "deepseek"): + # each log entry will include these fields + ctx.log_context_fields = { + "room": ctx.room.name, + } + + # IMPORTANT: Connect to room first + logger.info("connecting to room") + await ctx.connect() + + logger.info("waiting for participant") + participant = await ctx.wait_for_participant() + logger.info(f"starting agent for participant {participant.identity}") + + current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + logger.info(f"Current time: {current_time}") + + initial_voice_id = "BV001_streaming" # Female voice + if participant.attributes.get("voice"): + initial_voice_id = participant.attributes.get("voice") + logger.info(f"User selected voice: {initial_voice_id}") + + # initial_instructions = """Your name is Kelly. You would interact with users via voice. + # with that in mind keep your responses concise and to the point. 
+ # do not use emojis, asterisks, markdown, or other special characters in your responses. + # You are curious and friendly, and have a sense of humor. + # you will speak chinese to the user" + # """ + initial_instructions = DEFAULT_INSTRUCTIONS + if participant.attributes.get("instructions"): + initial_instructions = participant.attributes.get("instructions") + logger.info(f"User selected instructions: {initial_instructions}") + + # Only replace the datetime placeholder to avoid KeyError from other braces in the prompt + initial_instructions = initial_instructions.replace("{datetime}", current_time) + logger.info(f"Initial instructions: {initial_instructions}") + + if llm_backend == "dashscope": + logger.info("Using DashScope DeepSeek backend") + llm = openai.LLM( + base_url="https://dashscope.aliyuncs.com/compatible-mode/v1", + api_key=os.getenv("DASHSCOPE_API_KEY"), + model="deepseek-v3.2" + ) + else: + logger.info("Using default DeepSeek backend") + llm = openai.LLM.with_deepseek( + model='deepseek-chat' + ) + + session = AgentSession( + vad=ctx.proc.userdata["vad"], + # any combination of STT, LLM, TTS, or realtime API can be used + # stt = aliyun.STT(model="paraformer-realtime-v2"), + stt = volcengine.BigModelSTT( + app_id="8607675070" + ), + # stt = volcengine.STT( + # app_id="2931820332", + # cluster="volcengine_streaming_common" + # ), + llm=llm, + # tts=aliyun.TTS(model="cosyvoice-v2", voice="longcheng_v2"), + tts=volcengine.TTS( + app_id="2931820332", + cluster="volcano_tts", + voice=initial_voice_id, + sample_rate=8000, + speed=1.1, + ), + # allow the LLM to generate a response while waiting for the end of turn + preemptive_generation=True, + # sometimes background noise could interrupt the agent session, these are considered false positive interruptions + # when it's detected, you may resume the agent's speech + resume_false_interruption=True, + false_interruption_timeout=1.0, + ) + + # log metrics as they are emitted, and total usage after session is over + usage_collector = metrics.UsageCollector() + + @session.on("metrics_collected") + def _on_metrics_collected(ev: MetricsCollectedEvent): + metrics.log_metrics(ev.metrics) + usage_collector.collect(ev.metrics) + + async def log_usage(): + summary = usage_collector.get_summary() + logger.info(f"Usage: {summary}") + + # shutdown callbacks are triggered when the session is over + ctx.add_shutdown_callback(log_usage) + + # Launch avatar if avatar_dispatcher_url is provided + if avatar_dispatcher_url: + await launch_avatar(ctx, avatar_dispatcher_url, AVATAR_IDENTITY) + session.output.audio = DataStreamAudioOutput( + ctx.room, + destination_identity=AVATAR_IDENTITY, + # (optional) wait for the avatar to publish video track before generating a reply + wait_remote_track=rtc.TrackKind.KIND_VIDEO, + ) + + @session.output.audio.on("playback_finished") + def on_playback_finished(ev: PlaybackFinishedEvent) -> None: + # the avatar should notify when the audio playback is finished + logger.info( + "playback_finished", + extra={ + "playback_position": ev.playback_position, + "interrupted": ev.interrupted, + }, + ) + + await session.start( + agent=MyAgent(initial_instructions, vision_backend=vision_backend), + room=ctx.room, + room_input_options=RoomInputOptions( + # uncomment to enable Krisp BVC noise cancellation + # noise_cancellation=noise_cancellation.BVC(), + ), + room_output_options=RoomOutputOptions(transcription_enabled=True), + ) + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + 
parser.add_argument("--avatar-url", type=str, default=None, help="Avatar dispatcher URL") + parser.add_argument("--vision-backend", type=str, default="aliyun", choices=["siliconflow", "aliyun"], help="Vision API backend") + parser.add_argument("--llm-backend", type=str, default="dashscope", choices=["deepseek", "dashscope"], help="LLM backend") + args, remaining_args = parser.parse_known_args() + sys.argv = sys.argv[:1] + remaining_args + + if args.avatar_url: + cli.run_app(WorkerOptions(entrypoint_fnc=partial(entrypoint, avatar_dispatcher_url=args.avatar_url, vision_backend=args.vision_backend, llm_backend=args.llm_backend), prewarm_fnc=prewarm)) + else: + cli.run_app(WorkerOptions(entrypoint_fnc=partial(entrypoint, vision_backend=args.vision_backend, llm_backend=args.llm_backend), prewarm_fnc=prewarm)) diff --git a/src/components/chat/ChatMessageInput.tsx b/src/components/chat/ChatMessageInput.tsx index 37205e5..9b17787 100644 --- a/src/components/chat/ChatMessageInput.tsx +++ b/src/components/chat/ChatMessageInput.tsx @@ -6,6 +6,7 @@ type ChatMessageInput = { accentColor: string; height: number; onSend?: (message: string) => void; + disabled?: boolean; }; export const ChatMessageInput = ({ @@ -13,6 +14,7 @@ export const ChatMessageInput = ({ accentColor, height, onSend, + disabled = false, }: ChatMessageInput) => { const [message, setMessage] = useState(""); const [inputTextWidth, setInputTextWidth] = useState(0); @@ -79,7 +81,8 @@ export const ChatMessageInput = ({ > 0 ? "12px" : "24px", caretShape: "block", @@ -108,11 +111,11 @@ export const ChatMessageInput = ({ {message.replaceAll(" ", "\u00a0")} diff --git a/src/components/chat/ChatTile.tsx b/src/components/chat/ChatTile.tsx index 0ed606e..e0c1415 100644 --- a/src/components/chat/ChatTile.tsx +++ b/src/components/chat/ChatTile.tsx @@ -16,9 +16,10 @@ type ChatTileProps = { messages: ChatMessageType[]; accentColor: string; onSend?: (message: string) => Promise; + inputDisabled?: boolean; }; -export const ChatTile = ({ messages, accentColor, onSend }: ChatTileProps) => { +export const ChatTile = ({ messages, accentColor, onSend, inputDisabled }: ChatTileProps) => { const containerRef = useRef(null); useEffect(() => { if (containerRef.current) { @@ -30,7 +31,7 @@ export const ChatTile = ({ messages, accentColor, onSend }: ChatTileProps) => {
{ placeholder="Type a message" accentColor={accentColor} onSend={onSend} + disabled={inputDisabled} />
); diff --git a/src/components/playground/PhoneSimulator.tsx b/src/components/playground/PhoneSimulator.tsx index 3969f7c..224c935 100644 --- a/src/components/playground/PhoneSimulator.tsx +++ b/src/components/playground/PhoneSimulator.tsx @@ -18,17 +18,29 @@ import { useToast } from "@/components/toast/ToasterProvider"; export interface PhoneSimulatorProps { onConnect: () => void; onDisconnect: () => void; - phoneMode?: "normal" | "capture"; + phoneMode?: "normal" | "capture" | "important_message" | "hand_off"; onCapture?: (image: File) => void; capturePrompt?: string; + importantMessage?: string; + importantMessageOptions?: string[]; + onImportantMessageAction?: (text: string) => void; } -export function PhoneSimulator({ onConnect, onDisconnect, phoneMode = "normal", onCapture, capturePrompt }: PhoneSimulatorProps) { +export function PhoneSimulator({ + onConnect, + onDisconnect, + phoneMode = "normal", + onCapture, + capturePrompt, + importantMessage, + importantMessageOptions, + onImportantMessageAction +}: PhoneSimulatorProps) { const { config, setUserSettings } = useConfig(); const { setToastMessage } = useToast(); const room = useRoomContext(); const roomState = useConnectionState(); - const { localParticipant } = useLocalParticipant(); + const { localParticipant, isMicrophoneEnabled: isMicEnabled } = useLocalParticipant(); const tracks = useTracks(); const voiceAssistant = useVoiceAssistant(); const fileInputRef = useRef(null); @@ -43,6 +55,10 @@ export function PhoneSimulator({ onConnect, onDisconnect, phoneMode = "normal", const [processingSource, setProcessingSource] = useState< "camera" | "upload" | null >(null); + const MAX_UPLOAD_MB = 10; + const isAgentSpeaking = voiceAssistant.state === "speaking"; + const wasMicEnabledRef = useRef(false); + const lastPhoneMode = useRef(phoneMode); useEffect(() => { const voiceAttr = config.settings.attributes?.find(a => a.key === "voice"); @@ -136,6 +152,40 @@ export function PhoneSimulator({ onConnect, onDisconnect, phoneMode = "normal", } }, [voiceAssistant.state]); + useEffect(() => { + const enteringMode = (mode: typeof phoneMode) => + phoneMode === mode && lastPhoneMode.current !== mode; + + // Entering important message / capture / hand_off: remember mic state and mute if needed + if (enteringMode("important_message") || enteringMode("capture") || enteringMode("hand_off")) { + wasMicEnabledRef.current = isMicEnabled; + if (isMicEnabled) { + localParticipant.setMicrophoneEnabled(false); + } + } + // Exiting important message mode or hand off mode or capture mode + else if ( + (phoneMode !== "important_message" && lastPhoneMode.current === "important_message") || + (phoneMode !== "hand_off" && lastPhoneMode.current === "hand_off") || + (phoneMode !== "capture" && lastPhoneMode.current === "capture") + ) { + // Restore mic to previous state + localParticipant.setMicrophoneEnabled(wasMicEnabledRef.current); + + // If exiting capture mode, clear processing image + if (lastPhoneMode.current === "capture") { + setProcessingImage(null); + setProcessingSource(null); + } + } + // Enforce mic off in important message mode, hand off mode, or capture mode + else if ((phoneMode === "important_message" || phoneMode === "hand_off" || phoneMode === "capture") && isMicEnabled) { + localParticipant.setMicrophoneEnabled(false); + } + + lastPhoneMode.current = phoneMode; + }, [phoneMode, isMicEnabled, localParticipant]); + useEffect(() => { const updateTime = () => { const now = new Date(); @@ -159,8 +209,6 @@ export function PhoneSimulator({ onConnect, 
onDisconnect, phoneMode = "normal", ({ source }) => source === Track.Source.Camera ); - const isMicEnabled = localParticipant.isMicrophoneEnabled; - const handleMicToggle = async () => { if (isMicEnabled) { await localParticipant.setMicrophoneEnabled(false); @@ -173,6 +221,20 @@ export function PhoneSimulator({ onConnect, onDisconnect, phoneMode = "normal", onDisconnect(); }; + const validateImageFile = (file: File) => { + const isImage = file.type.startsWith("image/"); + const isSizeOk = file.size > 0 && file.size <= MAX_UPLOAD_MB * 1024 * 1024; + return { + isValid: isImage && isSizeOk, + isImage, + isSizeOk, + }; + }; + + const showErrorToast = (message: string) => { + setToastMessage({ message, type: "error" }); + }; + const handleCapture = async () => { if (!localCameraTrack || !onCapture || isCapturing) return; setIsCapturing(true); @@ -278,6 +340,14 @@ export function PhoneSimulator({ onConnect, onDisconnect, phoneMode = "normal", const file = new File([blob], "camera-capture.jpg", { type: "image/jpeg", }); + const { isValid } = validateImageFile(file); + if (!isValid) { + setProcessingImage(null); + setProcessingSource(null); + showErrorToast(`请拍摄${MAX_UPLOAD_MB}MB以内的图片文件。`); + setIsCapturing(false); + return; + } onCapture(file); } setIsCapturing(false); @@ -340,9 +410,17 @@ export function PhoneSimulator({ onConnect, onDisconnect, phoneMode = "normal", const handleFileChange = (event: React.ChangeEvent) => { const file = event.target.files?.[0]; if (file && onCapture) { - onCapture(file); - setProcessingImage(URL.createObjectURL(file)); - setProcessingSource("upload"); + const { isValid, isImage, isSizeOk } = validateImageFile(file); + if (!isValid) { + const msg = !isImage + ? "请选择图片文件。" + : `请上传${MAX_UPLOAD_MB}MB以内的图片文件。`; + showErrorToast(msg); + } else { + onCapture(file); + setProcessingImage(URL.createObjectURL(file)); + setProcessingSource("upload"); + } } // Reset input so the same file can be selected again if (event.target) { @@ -461,9 +539,26 @@ export function PhoneSimulator({ onConnect, onDisconnect, phoneMode = "normal", .scan-animation { animation: scan 3s linear infinite; } + @keyframes handoffPulse { + 0%, 100% { + opacity: 1; + transform: scale(1); + } + 50% { + opacity: 0.8; + transform: scale(1.05); + } + } + @keyframes handoffFade { + 0%, 100% { opacity: 1; } + 50% { opacity: 0.6; } + } + .handoff-animation { + animation: handoffPulse 2s ease-in-out infinite, handoffFade 2s ease-in-out infinite; + } `} {/* Status Bar */} -
+
{currentTime}
@@ -471,12 +566,17 @@ export function PhoneSimulator({ onConnect, onDisconnect, phoneMode = "normal",
- {/* Dynamic Island / Notch Placeholder */} -
- {/* Main Content */} -
+
+
{videoContent} +
{processingImage && (
{/* eslint-disable-next-line @next/next/no-img-element */} @@ -492,8 +592,11 @@ export function PhoneSimulator({ onConnect, onDisconnect, phoneMode = "normal",
-
- 正在分析... +
+ 正在分析照片,请稍候...
)} @@ -530,11 +633,68 @@ export function PhoneSimulator({ onConnect, onDisconnect, phoneMode = "normal",
)} - {/* Agent Audio Visualizer (Draggable) */} - {roomState === ConnectionState.Connected && voiceAssistant.audioTrack && ( +
+ + {/* Important Message Overlay */} + {roomState === ConnectionState.Connected && phoneMode === "important_message" && ( +
+
+
+

+ {importantMessage} +

+
+
+ {importantMessageOptions && importantMessageOptions.length > 0 ? ( + importantMessageOptions.map((option, index) => ( + + )) + ) : ( + + )} +
+
+
+ )} + + {/* Hand Off Mode Overlay */} + {roomState === ConnectionState.Connected && phoneMode === "hand_off" && ( +
+
+ 转人工... +
+
+ )} + + {/* Agent Audio Visualizer (Draggable) */} + {roomState === ConnectionState.Connected && voiceAssistant.audioTrack && phoneMode !== "hand_off" && (
)} -
{/* Call Controls Overlay */} {roomState === ConnectionState.Connected && ( -
- {phoneMode === "capture" ? ( -
+ phoneMode === "capture" ? ( +
{/* Camera Controls Row */}
{/* Left: Upload */} {/* Center: Capture */} @@ -579,8 +747,13 @@ export function PhoneSimulator({ onConnect, onDisconnect, phoneMode = "normal", {/* Right: Switch Camera */}
@@ -607,24 +780,8 @@ export function PhoneSimulator({ onConnect, onDisconnect, phoneMode = "normal",
- {/* Call Controls Row */} + {/* Call Controls Row (mic hidden in capture mode) */}
- {/* Mic Toggle */} - - {/* End Call */}
) : ( -
+
- + {phoneMode !== "important_message" && phoneMode !== "hand_off" && ( + + )}
- )} -
+ ) )}
); diff --git a/src/components/playground/Playground.tsx b/src/components/playground/Playground.tsx index 0cfa4b4..4a14ee4 100644 --- a/src/components/playground/Playground.tsx +++ b/src/components/playground/Playground.tsx @@ -30,7 +30,7 @@ import { } from "@livekit/components-react"; import { ConnectionState, LocalParticipant, Track, RpcError, RpcInvocationData } from "livekit-client"; import { QRCodeSVG } from "qrcode.react"; -import { ReactNode, useCallback, useEffect, useMemo, useState } from "react"; +import { ReactNode, useCallback, useEffect, useMemo, useRef, useState } from "react"; import tailwindTheme from "../../lib/tailwindTheme.preval"; import { EditableNameValueRow } from "@/components/config/NameValueRow"; import { AttributesInspector } from "@/components/config/AttributesInspector"; @@ -58,15 +58,19 @@ export default function Playground({ const { name } = useRoomInfo(); const [transcripts, setTranscripts] = useState([]); const { localParticipant } = useLocalParticipant(); - const { send: sendChat } = useChat(); + const { send: sendChat, chatMessages } = useChat(); const voiceAssistant = useVoiceAssistant(); const roomState = useConnectionState(); const tracks = useTracks(); const room = useRoomContext(); - const [phoneMode, setPhoneMode] = useState<"normal" | "capture">("normal"); + const [phoneMode, setPhoneMode] = useState<"normal" | "capture" | "important_message" | "hand_off">("normal"); const [capturePrompt, setCapturePrompt] = useState(""); + const [importantMessage, setImportantMessage] = useState(""); + const [importantMessageOptions, setImportantMessageOptions] = useState([]); + const importantMessageResolverRef = useRef<((value: string) => void) | null>(null); + const imageCaptureResolverRef = useRef<((value: string) => void) | null>(null); const [rpcMethod, setRpcMethod] = useState(""); const [rpcPayload, setRpcPayload] = useState(""); @@ -106,39 +110,115 @@ export default function Playground({ ); localParticipant.registerRpcMethod( - 'enterImageCaptureMode', + 'askImageCapture', async (data: RpcInvocationData) => { - if (data.payload) { - try { - const payload = JSON.parse(data.payload); - if (payload.prompt) { - setCapturePrompt(payload.prompt); - } - } catch (e) { - console.error("Failed to parse enterImageCaptureMode payload", e); - } - } - setPhoneMode("capture"); + // Return a promise that will be resolved when user captures/uploads an image + return new Promise((resolve) => { + if (data.payload) { + try { + const payload = JSON.parse(data.payload); + if (payload.prompt) { + setCapturePrompt(payload.prompt); + } + } catch (e) { + console.error("Failed to parse askImageCapture payload", e); + } + } + setPhoneMode("capture"); + // Store the resolver to be called when user captures/uploads an image + imageCaptureResolverRef.current = (imageData: string) => { + resolve(imageData); + }; + }); + } + ); + + localParticipant.registerRpcMethod( + 'enterHandOffToHumanMode', + async () => { + setPhoneMode("hand_off"); return JSON.stringify({ success: true }); } ); localParticipant.registerRpcMethod( - 'exitImageCaptureMode', + 'hangUpCall', async () => { - setPhoneMode("normal"); - setCapturePrompt(""); + // Disconnect the call + onConnect(false); return JSON.stringify({ success: true }); } ); - }, [localParticipant, roomState]); + + localParticipant.registerRpcMethod( + 'askImportantQuestion', + async (data: RpcInvocationData) => { + // Return a promise that will be resolved when user makes a selection + return new Promise((resolve) => { + if (data.payload) { + try 
{ + const payload = JSON.parse(data.payload); + if (payload.message) { + setImportantMessage(payload.message); + } + if (payload.options && Array.isArray(payload.options)) { + setImportantMessageOptions(payload.options); + } else { + // Default fallback if no options provided + setImportantMessageOptions(["确认"]); + } + } catch (e) { + console.error("Failed to parse askImportantQuestion payload", e); + } + } + setPhoneMode("important_message"); + // Store the resolver to be called when user makes a selection + importantMessageResolverRef.current = (selection: string) => { + resolve(JSON.stringify({ selection })); + }; + }); + } + ); + }, [localParticipant, roomState, onConnect]); useEffect(() => { if (roomState === ConnectionState.Connected) { setPhoneMode("normal"); + } else if (roomState === ConnectionState.Disconnected) { + setPhoneMode("normal"); + // Clean up any pending important message RPC + if (importantMessageResolverRef.current) { + importantMessageResolverRef.current("disconnected"); + importantMessageResolverRef.current = null; + } + // Clean up any pending image capture RPC + if (imageCaptureResolverRef.current) { + imageCaptureResolverRef.current(JSON.stringify({ error: "disconnected" })); + imageCaptureResolverRef.current = null; + } } }, [roomState]); + useEffect(() => { + // When agent starts speaking, exit capture mode (and PhoneSimulator will clear processing image) + // Only exit if we are NOT waiting for capture (resolver is null), meaning this speech is likely the analysis result + if (voiceAssistant.state === "speaking" && phoneMode === "capture" && !imageCaptureResolverRef.current) { + setPhoneMode("normal"); + setCapturePrompt(""); + } + }, [voiceAssistant.state, phoneMode]); + + useEffect(() => { + // Also exit capture mode if we receive a completion message in chat (in case agent doesn't speak immediately) + if (chatMessages.length > 0) { + const lastMsg = chatMessages[chatMessages.length - 1]; + if (lastMsg.message && lastMsg.message.includes("✅ Result: ask_image_capture") && phoneMode === "capture") { + setPhoneMode("normal"); + setCapturePrompt(""); + } + } + }, [chatMessages, phoneMode]); + useEffect(() => { if (!localParticipant || roomState !== ConnectionState.Connected) { return; @@ -191,6 +271,18 @@ export default function Playground({ ({ source }) => source === Track.Source.Microphone, ); + const handleImportantMessageAction = useCallback(async (text: string) => { + setPhoneMode("normal"); + setImportantMessage(""); + setImportantMessageOptions([]); + + // Resolve the RPC with the user's selection + if (importantMessageResolverRef.current) { + importantMessageResolverRef.current(text || "确认"); + importantMessageResolverRef.current = null; + } + }, []); + const onDataReceived = useCallback( (msg: any) => { if (msg.topic === "transcription") { @@ -317,6 +409,7 @@ export default function Playground({ ); } @@ -325,6 +418,7 @@ export default function Playground({ config.settings.theme_color, voiceAssistant.audioTrack, voiceAssistant.agent, + phoneMode, ]); const instructionsContent = ( @@ -630,10 +724,23 @@ export default function Playground({ onDisconnect={() => onConnect(false)} phoneMode={phoneMode} capturePrompt={capturePrompt} + importantMessage={importantMessage} + importantMessageOptions={importantMessageOptions} + onImportantMessageAction={handleImportantMessageAction} onCapture={async (content: File) => { if (localParticipant) { + // Send file via LiveKit byte stream await localParticipant.sendFile(content, { topic: "image" }); - await 
sendChat("用户上传了照片" ); + + // Resolve RPC to signal completion (without image data) + if (imageCaptureResolverRef.current) { + // Do NOT exit capture mode immediately - wait for agent response (voiceAssistant.state === "speaking") + // setPhoneMode("normal"); + // setCapturePrompt(""); + imageCaptureResolverRef.current(JSON.stringify({ success: true })); + imageCaptureResolverRef.current = null; + } + } }} /> @@ -717,10 +824,23 @@ export default function Playground({ onDisconnect={() => onConnect(false)} phoneMode={phoneMode} capturePrompt={capturePrompt} + importantMessage={importantMessage} + importantMessageOptions={importantMessageOptions} + onImportantMessageAction={handleImportantMessageAction} onCapture={async (content: File) => { if (localParticipant) { + // Send file via LiveKit byte stream await localParticipant.sendFile(content, { topic: "image" }); - await sendChat("用户上传了一张照片"); + + // Resolve RPC to signal completion (without image data) + if (imageCaptureResolverRef.current) { + // Do NOT exit capture mode immediately - wait for agent response (voiceAssistant.state === "speaking") + // setPhoneMode("normal"); + // setCapturePrompt(""); + imageCaptureResolverRef.current(JSON.stringify({ success: true })); + imageCaptureResolverRef.current = null; + } + } }} /> diff --git a/src/transcriptions/TranscriptionTile.tsx b/src/transcriptions/TranscriptionTile.tsx index 73dbb6e..f2129d3 100644 --- a/src/transcriptions/TranscriptionTile.tsx +++ b/src/transcriptions/TranscriptionTile.tsx @@ -16,9 +16,11 @@ import { useEffect, useState } from "react"; export function TranscriptionTile({ agentAudioTrack, accentColor, + inputDisabled, }: { agentAudioTrack?: TrackReferenceOrPlaceholder; accentColor: string; + inputDisabled?: boolean; }) { const agentMessages = useTrackTranscription(agentAudioTrack || undefined); const localParticipant = useLocalParticipant(); @@ -97,7 +99,7 @@ export function TranscriptionTile({ ]); return ( - + ); }