commit backend agent
agents/.env.example (Normal file, +22 lines)
@@ -0,0 +1,22 @@
LIVEKIT_API_SECRET="secret"
LIVEKIT_API_KEY="devkey"
LIVEKIT_URL="ws://127.0.0.1:7880"

MINIMAX_API_KEY="eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.eyJHcm91cE5hbWUiOiJYaW4gV2FuZyIsIlVzZXJOYW1lIjoiWGluIFdhbmciLCJBY2NvdW50IjoiIiwiU3ViamVjdElEIjoiMTg1NzMzNTM0ODcwNzcyNTM2MyIsIlBob25lIjoiIiwiR3JvdXBJRCI6IjE4NTczMzUzNDg2OTkzMzY3NTUiLCJQYWdlTmFtZSI6IiIsIk1haWwiOiJ3YW5neGluMzE0MTU5MjZAZ21haWwuY29tIiwiQ3JlYXRlVGltZSI6IjIwMjUtMTItMDMgMTQ6MjE6MDkiLCJUb2tlblR5cGUiOjEsImlzcyI6Im1pbmltYXgifQ.MVL2R-3gxcItQHl7cgfQuqcdYTUI80_QtfXeZtNxRNUPFSpPp_a2Om6K0BrwRE7PvnSyr2Kv-w2yeQoXc_SpgoecVgct-Qfl4-w9pgRySMBfIdMWekxuvI6KDIHjnaFyvls11yiuWAeKWK6PHuil_JqttgSAOz-HitbAmuuGyhpnHrE9u-QhoXcXy9s9lAyhuXMq_OyB06Ps58B5u8ziIqQZQG6SNU6_uMiql3Nl2QU4ukGAoRqUQn6lqQExGdFv43FDwlG6NJ_avWnsTnkjcnPqqay6Sf50zyaRl1sm_c1EPkEWcrLOz1aWHSUrA7rhCJMaU5VCvPtTaGJUZk9_yA"

DEEPSEEK_API_KEY="sk-230701ff1b6143ecbf322b3170606016"

AZURE_SPEECH_KEY="48KfrDwcw6hM0g0WtmF0ZDigaMW8YdUwjrlYDYIL6Rftp5U0V1yfJQQJ99BAAC3pKaRXJ3w3AAAYACOGCI1o"
AZURE_SPEECH_REGION="eastasia"

CARTESIA_API_KEY="sk_car_Yu-_vYZsCq8ZOe-WQn43t"
CARTESIA_LANGUAGE="zh"

SILICONFLOW_API_KEY="sk-thmzysdpqqmhqxxshyqoxvjeiflexjdgaftyufrsgrhpjnyx"

DASHSCOPE_API_KEY="sk-391f5126d18345d497c6e8717c8c9ad7"

VOLCENGINE_TTS_ACCESS_TOKEN="4ustCTIpdCq8dE_msFrZvFn4nDpioIVo"
VOLCENGINE_STT_ACCESS_TOKEN="QiO0AptfmU0GLTSitwn7t5-zeo4gJ6K1"
VOLCENGINE_LLM_API_KEY="1224b4c6-ada7-4c43-be2b-8d48c686e3ff"
VOLCENGINE_REALTIME_ACCESS_TOKEN="1224b4c6-ada7-4c43-be2b-8d48c686e3ff"
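The worker in the next file loads these values with python-dotenv, and the LiveKit SDK and plugins (DeepSeek via DEEPSEEK_API_KEY, the aliyun plugin via DASHSCOPE_API_KEY, and so on) then pick them up from the process environment. A minimal sketch of that pattern, assuming this template has been copied to a real .env next to the worker:

    import os

    from dotenv import load_dotenv

    # reads .env from the working directory (or the nearest one found walking up)
    # into os.environ, without overwriting variables that are already set
    load_dotenv()

    # plugins typically look up their own keys; explicit access works the same way
    livekit_url = os.environ["LIVEKIT_URL"]
    deepseek_key = os.environ["DEEPSEEK_API_KEY"]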
agents/my_basic_agent_debate.py (Executable file, +245 lines)
@@ -0,0 +1,245 @@
import argparse
import asyncio
import base64
import json
import logging
import os
import sys
from dataclasses import asdict, dataclass
from functools import partial

import aiohttp
import httpx
from dotenv import load_dotenv

from livekit import api, rtc
from livekit.agents import (
    Agent,
    AgentSession,
    AudioConfig,
    BackgroundAudioPlayer,
    BuiltinAudioClip,
    JobContext,
    JobProcess,
    MetricsCollectedEvent,
    RoomInputOptions,
    RoomOutputOptions,
    RunContext,
    WorkerOptions,
    cli,
    get_job_context,
    metrics,
)
from livekit.agents.llm import ImageContent, ToolError, function_tool
from livekit.agents.voice.avatar import DataStreamAudioOutput
from livekit.agents.voice.io import PlaybackFinishedEvent
from livekit.agents.voice.room_io import ATTRIBUTE_PUBLISH_ON_BEHALF
from livekit.plugins import silero
# from livekit.plugins.turn_detector.multilingual import MultilingualModel
from livekit.plugins import openai, azure, minimax, cartesia, deepgram, aliyun

# uncomment to enable Krisp background voice/noise cancellation
# from livekit.plugins import noise_cancellation

logger = logging.getLogger("basic-agent")

load_dotenv()

AVATAR_IDENTITY = "avatar_worker"

@dataclass
class AvatarConnectionInfo:
    room_name: str
    url: str
    """LiveKit server URL"""
    token: str
    """Token for the avatar worker to join"""

class MyAgent(Agent):
    def __init__(self) -> None:
        self._tasks = []  # prevent garbage collection of running tasks
        super().__init__(
            instructions="Your name is Kelly. You will interact with users via voice. "
            "With that in mind, keep your responses concise and to the point. "
            "Do not use emojis, asterisks, markdown, or other special characters in your responses. "
            "You are curious and friendly, and have a sense of humor. "
            "You are debating the user on whether AI can replace human workers; your position is that it can. "
            "You will speak Chinese to the user.",
        )

    async def on_enter(self):
        # when the agent is added to the session, it'll generate a reply
        # according to its instructions
        self.session.generate_reply()

def prewarm(proc: JobProcess):
    proc.userdata["vad"] = silero.VAD.load()

async def launch_avatar(ctx: JobContext, avatar_dispatcher_url: str, avatar_identity: str) -> None:
    """
    Send a request to the avatar service asking it to join the room.

    This function should be wrapped in an avatar plugin.
    """
    # create a token for the avatar to join the room
    token = (
        api.AccessToken()
        .with_identity(avatar_identity)
        .with_name("Avatar Runner")
        .with_grants(api.VideoGrants(room_join=True, room=ctx.room.name))
        .with_kind("agent")
        .with_attributes({ATTRIBUTE_PUBLISH_ON_BEHALF: ctx.local_participant_identity})
        .to_jwt()
    )

    logger.info(f"Sending connection info to avatar dispatcher {avatar_dispatcher_url}")
    connection_info = AvatarConnectionInfo(room_name=ctx.room.name, url=ctx._info.url, token=token)
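    # the dispatcher receives this dataclass serialized as JSON:
    # {"room_name": "...", "url": "...", "token": "..."}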
    async with httpx.AsyncClient() as client:
        response = await client.post(avatar_dispatcher_url, json=asdict(connection_info))
        response.raise_for_status()
    logger.info("Avatar handshake completed")

async def entrypoint(ctx: JobContext, avatar_dispatcher_url: str | None = None):
    # each log entry will include these fields
    ctx.log_context_fields = {
        "room": ctx.room.name,
    }

    logger.info("connecting to room")
    await ctx.connect()

    logger.info("waiting for participant")
    participant = await ctx.wait_for_participant()
    logger.info(f"starting agent for participant {participant.identity}")

    initial_voice_id = "Chinese (Mandarin)_Male_Announcer"
    if participant.attributes.get("voice"):
        initial_voice_id = participant.attributes.get("voice")
        logger.info(f"User selected voice: {initial_voice_id}")

    session = AgentSession(
        # Speech-to-text (STT) is your agent's ears, turning the user's speech into text that the LLM can understand
        # See all available models at https://docs.livekit.io/agents/models/stt/
        # stt="deepgram/nova-3",
        # stt=azure.STT(
        #     speech_key="48KfrDwcw6hM0g0WtmF0ZDigaMW8YdUwjrlYDYIL6Rftp5U0V1yfJQQJ99BAAC3pKaRXJ3w3AAAYACOGCI1o",
        #     speech_region="eastasia",
        #     language="zh-CN"
        # ),
        # stt=deepgram.STT(
        #     api_key="61dbb8aa4badb820c24029052e106b00f7498598",
        #     language="zh-CN",
        #     model="nova-2-general"
        # ),
        stt=aliyun.STT(model="paraformer-realtime-v2"),
        # A Large Language Model (LLM) is your agent's brain, processing user input and generating a response
        # See all available models at https://docs.livekit.io/agents/models/llm/
        # llm="openai/gpt-4.1-mini",
        llm=openai.LLM.with_deepseek(
            model="deepseek-chat"
        ),
        # Text-to-speech (TTS) is your agent's voice, turning the LLM's text into speech that the user can hear
        # See all available models as well as voice selections at https://docs.livekit.io/agents/models/tts/
        # tts="cartesia/sonic-2:9626c31c-bec5-4cca-baa8-f8ba9e84c8bc",
        # tts=minimax.TTS(
        #     model="speech-2.6-turbo",
        #     voice=initial_voice_id,
        #     # voice="Friendly_Person",
        #     # voice="Chinese (Mandarin)_Male_Announcer"
        # ),
        tts=aliyun.TTS(model="cosyvoice-v2", voice="longcheng_v2"),
        # tts=azure.TTS(
        #     speech_key="48KfrDwcw6hM0g0WtmF0ZDigaMW8YdUwjrlYDYIL6Rftp5U0V1yfJQQJ99BAAC3pKaRXJ3w3AAAYACOGCI1o",
        #     speech_region="eastasia",
        #     language="zh-CN"
        # ),
        # tts=openai.TTS(
        #     model="kokoro",
        #     voice="zf_xiaoyi",
        #     base_url="http://127.0.0.1:8880/v1",
        #     api_key="not-needed",
        # ),
        # tts=cartesia.TTS(),
        # VAD and turn detection are used to determine when the user is speaking and when the agent should respond
        # See more at https://docs.livekit.io/agents/build/turns
        # turn_detection=MultilingualModel(),
        vad=ctx.proc.userdata["vad"],
        # allow the LLM to generate a response while waiting for the end of turn
        # See more at https://docs.livekit.io/agents/build/audio/#preemptive-generation
        preemptive_generation=True,
        # background noise can sometimes interrupt the agent session; these are considered
        # false-positive interruptions, and when one is detected the agent's speech may be resumed
        resume_false_interruption=True,
        false_interruption_timeout=1.0,
    )

    # log metrics as they are emitted, and total usage after the session is over
    usage_collector = metrics.UsageCollector()

    @session.on("metrics_collected")
    def _on_metrics_collected(ev: MetricsCollectedEvent):
        metrics.log_metrics(ev.metrics)
        usage_collector.collect(ev.metrics)

    async def log_usage():
        summary = usage_collector.get_summary()
        logger.info(f"Usage: {summary}")

    # shutdown callbacks are triggered when the session is over
    ctx.add_shutdown_callback(log_usage)

    # launch the avatar if avatar_dispatcher_url is provided
    if avatar_dispatcher_url:
        await launch_avatar(ctx, avatar_dispatcher_url, AVATAR_IDENTITY)
        session.output.audio = DataStreamAudioOutput(
            ctx.room,
            destination_identity=AVATAR_IDENTITY,
            # (optional) wait for the avatar to publish video track before generating a reply
            wait_remote_track=rtc.TrackKind.KIND_VIDEO,
        )

        @session.output.audio.on("playback_finished")
        def on_playback_finished(ev: PlaybackFinishedEvent) -> None:
            # the avatar should notify when the audio playback is finished
            logger.info(
                "playback_finished",
                extra={
                    "playback_position": ev.playback_position,
                    "interrupted": ev.interrupted,
                },
            )

    await session.start(
        agent=MyAgent(),
        room=ctx.room,
        room_input_options=RoomInputOptions(
            # uncomment to enable Krisp BVC noise cancellation
            # noise_cancellation=noise_cancellation.BVC(),
        ),
        room_output_options=RoomOutputOptions(transcription_enabled=True),
    )

    # --- 3. Core: listen for room metadata changes ---
    @ctx.room.on("room_metadata_changed")
    def on_metadata_changed(old_metadata: str, new_metadata: str):
        logger.info(f"Received new debate state: {new_metadata} (old state: {old_metadata})")

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--avatar-url", type=str, default=None, help="Avatar dispatcher URL (e.g., http://localhost:8089/launch)")
    args, remaining_args = parser.parse_known_args()
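    # strip the custom flag so the LiveKit agents CLI below only sees its own arguments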
    sys.argv = sys.argv[:1] + remaining_args

    if args.avatar_url:
        cli.run_app(WorkerOptions(entrypoint_fnc=partial(entrypoint, avatar_dispatcher_url=args.avatar_url), prewarm_fnc=prewarm))
    else:
        cli.run_app(WorkerOptions(entrypoint_fnc=entrypoint, prewarm_fnc=prewarm))
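A typical local invocation, assuming the standard livekit-agents CLI subcommands (--avatar-url is consumed by the argparse shim above and everything else is passed through to the CLI): python agents/my_basic_agent_debate.py dev --avatar-url http://localhost:8089/launch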
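The room_metadata_changed handler above only logs the incoming state; the sender side is not part of this commit. A minimal sketch of how a backend could push debate state to the room, assuming the livekit-api server SDK and the LIVEKIT_* values from agents/.env.example (the helper name push_debate_state and the state payload are hypothetical):

    import asyncio
    import json
    import os

    from livekit import api


    async def push_debate_state(room_name: str, state: dict) -> None:
        # hypothetical helper: publish new debate state as room metadata,
        # which fires "room_metadata_changed" for the agent above
        # the server API is plain HTTP; the dev server above listens on 127.0.0.1:7880
        lkapi = api.LiveKitAPI(
            url="http://127.0.0.1:7880",
            api_key=os.environ["LIVEKIT_API_KEY"],
            api_secret=os.environ["LIVEKIT_API_SECRET"],
        )
        try:
            await lkapi.room.update_room_metadata(
                api.UpdateRoomMetadataRequest(room=room_name, metadata=json.dumps(state))
            )
        finally:
            await lkapi.aclose()


    if __name__ == "__main__":
        asyncio.run(push_debate_state("my-room", {"round": 1, "speaker": "user"}))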