修改初始步骤，避免了原来stt失效的问题

2025-12-15 16:34:27 +08:00 · 2025-12-15 16:34:27 +08:00 · f74604ef21
commit f74604ef21
parent ff24ccf5f0
1 changed file with 8 additions and 84 deletions
--- a/agents/my_basic_agent_1_2_9.py
+++ b/agents/my_basic_agent_1_2_9.py
@@ -38,6 +38,7 @@ from livekit.agents.voice.io import PlaybackFinishedEvent
 from livekit.agents.voice.room_io import ATTRIBUTE_PUBLISH_ON_BEHALF
 from livekit.plugins import silero
 from livekit.plugins import openai, azure, minimax, aliyun, volcengine
 # from livekit.plugins.turn_detector.multilingual  import MultilingualModel
 from datetime import datetime
@@ -86,10 +87,12 @@ DEFAULT_INSTRUCTIONS = """# 角色
 - 完成之后进入后续办理提示阶段
 ## 后续办理提示阶段
- 在后续办理提示阶段：使用ask_important_question显示已经提取的交通事故信息，提示用户点击转人工继续处理。
+- 在后续办理提示阶段：使用ask_important_question显示已经提取的交通事故信息，提示用户点击转人工继续处理，用户点击之后调用enter_hand_off_to_human_mode工具转人工。
 # 要求
 - 在通话开始
 - 你会在ask_image_capture的prompt参数中告诉用户拍摄的目标，所以避免在对话中重复描述需要用户拍摄什么
 - 使用get_mobile_by_plate和get_id_card_by_plate的时候不要告诉用户正在查询，执行工具查看结果即可
 # 回复风格
 - 使用第一人称，语言简洁
@@ -99,34 +102,6 @@ DEFAULT_INSTRUCTIONS = """# 角色
 - 你已经说过下面的开场白所以不需要重复说：“您好，这里是无锡交警，我将为您远程处理交通事故。请将人员撤离至路侧安全区域，开启危险报警双闪灯、放置三角警告牌、做好安全防护，谨防二次事故伤害。若您已经准备好了，请点击继续办理，如需人工服务，请说转人工。”
 """
 backup = """
 #回复要求
 你主动对话并推进事故信息采集的完成。语言简洁，一次询问一个问题。
 不要在你的回复中使用 emojis, asterisks, markdown, 或其他特殊字符。
 你可以对一个用户回复多次调用工具，比如askImageCapture没有得到合适的照片的时候，可以继续调用askImageCapture工具让用户重新拍摄，直到得到合适的照片为止。
 只有在我要求使用askImportantQuestion的时候才去调用，否则禁止使用askImportantQuestion工具。
 你已经说过下面的开场白所以不需要重复说：
 “您好，这里是无锡交警，我将为您远程处理交通事故。请将人员撤离至路侧安全区域，开启危险报警双闪灯、放置三角警告牌、做好安全防护，谨防二次事故伤害。若您已经准备好了，请点击继续办理，如需人工服务，请说转人工。”
 #任务
 ##事故初审阶段
 禁止在事故初审阶段使用askImageCapture工具。只有在完成事故信息采集后，才能进入现场证据拍照阶段并使用askImageCapture。
 ##现场证据拍照阶段
 使用askImageCapture工具引导用户依次拍摄事故现场照片，驾驶员正脸照片，车牌号
 每次拍摄完成后立即判断是否符合要求：符合则直接进入下一步；不符合则立即再次调用askImageCapture让用户重新拍摄，并明确指出问题与改进要求
 ask_image_capture的分析结果中只要target_found为false或者quality_ok为false，就必须向用户解释问题（结合quality_notes或缺失的目标），并立刻再次调用askImageCapture给出更具体的改进提示；在获得有效照片之前禁止进入下一步
 拍完需要的照片后，复述车牌号并让用户确认或修正；确认后进入驾驶员信息核实。
 ##驾驶员信息核实阶段
 询问司机的姓名。
 之后根据车牌号查询驾驶员手机号，如果查询到则用手机号后四位向用户确认，如果未查询到或者用户告知手机号后四位错误，则询问用户手机号。
 接着根据车牌号查询驾驶员身份证号，如果查询到则用身份证号后四位向用户确认，如果未查询到或者用户告知身份证号后四位错误，则询问用户身份证号。
 之后告诉用户采集完成，显示重要消息显示已经采集的信息，提示用户点击转人工继续处理。
 """
 # ## 黄金对话路径示例 （GOLDEN_CONVERSATION_PATH）
 # ```
@@ -203,6 +178,9 @@ class MyAgent(Agent):
        )
    async def on_enter(self):
        self.session.generate_reply(
            instructions="调用ask_important_question，message=\"您好，这里是无锡交警，我将为您远程处理交通事故。请将人员撤离至路侧安全区域，开启危险报警双闪灯、放置三角警告牌、做好安全防护，谨防二次事故伤害。若您已经准备好了，请点击继续办理。\"，options=[\"继续办理\"]", allow_interruptions=False)
        # Register byte stream handler for image uploads from frontend
        def _image_received_handler(reader, participant_identity):
            task = asyncio.create_task(
@@ -214,61 +192,6 @@ class MyAgent(Agent):
        # Add the handler when the agent joins
        get_job_context().room.register_byte_stream_handler("image", _image_received_handler)
        # Proactively ask the user the initial important question via askImportantQuestion
        # using the greeting message from instructions (lines 59-60)
        initial_question = (
            "您好，这里是无锡交警，我将为您远程处理交通事故。请将人员撤离至路侧安全区域，"
            "开启危险报警双闪灯、放置三角警告牌、做好安全防护，谨防二次事故伤害。若您已经准备好了，"
            "请点击继续办理。"
        )
        options = ["继续办理"]
        async def _ask_initial():
            try:
                room = get_job_context().room
                if not room.remote_participants:
                    logger.warning("No remote participants yet; skipping initial askImportantQuestion.")
                    return
                participant_identity = next(iter(room.remote_participants))
                # Speak the initial question so the user hears it
                try:
                    self.session.say(initial_question, allow_interruptions=False)
                except Exception as e:
                    logger.error(f"Failed to speak initial question: {e}")
                payload_data = {
                    "message": initial_question,
                    "options": options,
                }
                # Log tool call in chat
                await self._send_chat_message(
                    f"🔨 Call: ask_important_question\n  • message: \"{initial_question}\"\n  • options: {options}"
                )
                response = await room.local_participant.perform_rpc(
                    destination_identity=participant_identity,
                    method="askImportantQuestion",
                    payload=json.dumps(payload_data),
                    response_timeout=60.0,
                )
                logger.info(f"Initial important question response: {response}")
                try:
                    response_data = json.loads(response)
                    selection = response_data.get("selection", "")
                    await self._send_chat_message(
                        f"✅ Result: ask_important_question\n  • selection: \"{selection}\""
                    )
                except json.JSONDecodeError:
                    logger.error(f"Failed to parse initial response: {response}")
            except Exception as e:
                logger.error(f"Failed to ask initial important question: {e}")
            # when the agent is added to the session, it'll generate a reply
            # according to its instructions
            self.session.generate_reply()
        asyncio.create_task(_ask_initial())
    async def _send_chat_message(self, message: str):
        """Helper to send a chat message to the room."""
        try:
@@ -987,6 +910,7 @@ async def entrypoint(ctx: JobContext, avatar_dispatcher_url: str = None, vision_
    session = AgentSession(
        vad=ctx.proc.userdata["vad"],
        # turn_detection=MultilingualModel(),
        # any combination of STT, LLM, TTS, or realtime API can be used
        # stt = aliyun.STT(model="paraformer-realtime-v2"),
        stt = volcengine.BigModelSTT(