From 1548fd554a9a9cd701783b8dd5edc92a1a234f0e Mon Sep 17 00:00:00 2001
From: Xin Wang
Date: Thu, 15 Jan 2026 14:42:52 +0800
Subject: [PATCH] Add backend debate agent and example env config

---
 agents/.env.example             |  22 +++
 agents/my_basic_agent_debate.py | 245 ++++++++++++++++++++++++++++++++
 2 files changed, 267 insertions(+)
 create mode 100644 agents/.env.example
 create mode 100755 agents/my_basic_agent_debate.py

diff --git a/agents/.env.example b/agents/.env.example
new file mode 100644
index 0000000..7c296ab
--- /dev/null
+++ b/agents/.env.example
@@ -0,0 +1,22 @@
+LIVEKIT_API_SECRET="secret"
+LIVEKIT_API_KEY="devkey"
+LIVEKIT_URL="ws://127.0.0.1:7880"
+
+MINIMAX_API_KEY="<your-minimax-api-key>"
+
+DEEPSEEK_API_KEY="<your-deepseek-api-key>"
+
+AZURE_SPEECH_KEY="<your-azure-speech-key>"
+AZURE_SPEECH_REGION="eastasia"
+
+CARTESIA_API_KEY="<your-cartesia-api-key>"
+CARTESIA_LANGUAGE="zh"
+
+SILICONFLOW_API_KEY="<your-siliconflow-api-key>"
+
+DASHSCOPE_API_KEY="<your-dashscope-api-key>"
+
+VOLCENGINE_TTS_ACCESS_TOKEN="<your-volcengine-tts-access-token>"
+VOLCENGINE_STT_ACCESS_TOKEN="<your-volcengine-stt-access-token>"
+VOLCENGINE_LLM_API_KEY="<your-volcengine-llm-api-key>"
+VOLCENGINE_REALTIME_ACCESS_TOKEN="<your-volcengine-realtime-access-token>"
\ No newline at end of file
diff --git a/agents/my_basic_agent_debate.py b/agents/my_basic_agent_debate.py
new file mode 100755
index 0000000..3457b31
--- /dev/null
+++ b/agents/my_basic_agent_debate.py
@@ -0,0 +1,245 @@
+import argparse
+import logging
+import os
+import sys
+from dataclasses import asdict, dataclass
+from functools import partial
+
+import httpx
+from dotenv import load_dotenv
+
+from livekit import api, rtc
+from livekit.agents import (
+    Agent,
+    AgentSession,
+    JobContext,
+    JobProcess,
+    MetricsCollectedEvent,
+    RoomInputOptions,
+    RoomOutputOptions,
+    WorkerOptions,
+    cli,
+    metrics,
+)
+from livekit.agents.voice.avatar import DataStreamAudioOutput
+from livekit.agents.voice.io import PlaybackFinishedEvent
+from livekit.agents.voice.room_io import ATTRIBUTE_PUBLISH_ON_BEHALF
+from livekit.plugins import silero
+# from livekit.plugins.turn_detector.multilingual import MultilingualModel
+# azure, cartesia, deepgram, and minimax back the commented-out alternatives below
+from livekit.plugins import aliyun, azure, cartesia, deepgram, minimax, openai
+
+# uncomment to enable Krisp background voice/noise cancellation
+# from livekit.plugins import noise_cancellation
+
+logger = logging.getLogger("basic-agent")
+
+load_dotenv()
+
+AVATAR_IDENTITY = "avatar_worker"
+
+
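+# launch_avatar() below POSTs this dataclass as JSON to the avatar dispatcher.
+# A sketch of the payload (values are illustrative, not from a real session):
+#
+#   {
+#       "room_name": "debate-room",
+#       "url": "ws://127.0.0.1:7880",
+#       "token": "<jwt scoped to that room>"
+#   }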
+@dataclass
+class AvatarConnectionInfo:
+    room_name: str
+    url: str
+    """LiveKit server URL"""
+    token: str
+    """Token for avatar worker to join"""
+
+
+class MyAgent(Agent):
+    def __init__(self) -> None:
+        self._tasks = []  # prevent garbage collection of running tasks
+        super().__init__(
+            instructions="Your name is Kelly. You will interact with users via voice, "
+            "so keep your responses concise and to the point. "
+            "Do not use emojis, asterisks, markdown, or other special characters in your responses. "
+            "You are curious and friendly, and have a sense of humor. "
+            "You are debating the user on whether AI can replace human workers; "
+            "your position is that AI can replace them. "
+            "You will speak Chinese to the user.",
+        )
+
+    async def on_enter(self):
+        # when the agent is added to the session, it'll generate a reply
+        # according to its instructions
+        self.session.generate_reply()
+
+
+def prewarm(proc: JobProcess):
+    proc.userdata["vad"] = silero.VAD.load()
+
+
+async def launch_avatar(ctx: JobContext, avatar_dispatcher_url: str, avatar_identity: str) -> None:
+    """
+    Send a request to the avatar service asking it to join the room.
+
+    This function should be wrapped in an avatar plugin.
+    """
+
+    # create a token for the avatar to join the room
+    token = (
+        api.AccessToken()
+        .with_identity(avatar_identity)
+        .with_name("Avatar Runner")
+        .with_grants(api.VideoGrants(room_join=True, room=ctx.room.name))
+        .with_kind("agent")
+        .with_attributes({ATTRIBUTE_PUBLISH_ON_BEHALF: ctx.local_participant_identity})
+        .to_jwt()
+    )
+
+    logger.info(f"Sending connection info to avatar dispatcher {avatar_dispatcher_url}")
+    connection_info = AvatarConnectionInfo(room_name=ctx.room.name, url=ctx._info.url, token=token)
+    async with httpx.AsyncClient() as client:
+        response = await client.post(avatar_dispatcher_url, json=asdict(connection_info))
+        response.raise_for_status()
+    logger.info("Avatar handshake completed")
+
+
+async def entrypoint(ctx: JobContext, avatar_dispatcher_url: str | None = None):
+    # each log entry will include these fields
+    ctx.log_context_fields = {
+        "room": ctx.room.name,
+    }
+
+    logger.info("connecting to room")
+    await ctx.connect()
+
+    logger.info("waiting for participant")
+    participant = await ctx.wait_for_participant()
+    logger.info(f"starting agent for participant {participant.identity}")
+
+    initial_voice_id = "Chinese (Mandarin)_Male_Announcer"
+    if participant.attributes.get("voice"):
+        initial_voice_id = participant.attributes["voice"]
+        logger.info(f"User selected voice: {initial_voice_id}")
+
+    session = AgentSession(
+        # Speech-to-text (STT) is your agent's ears, turning the user's speech into text that the LLM can understand
+        # See all available models at https://docs.livekit.io/agents/models/stt/
+        # stt="deepgram/nova-3",
+        # stt=azure.STT(
+        #     speech_key=os.environ["AZURE_SPEECH_KEY"],
+        #     speech_region=os.environ["AZURE_SPEECH_REGION"],
+        #     language="zh-CN",
+        # ),
+        # stt=deepgram.STT(
+        #     # the API key is read from DEEPGRAM_API_KEY in the environment
+        #     language="zh-CN",
+        #     model="nova-2-general",
+        # ),
+        stt=aliyun.STT(model="paraformer-realtime-v2"),
+        # A Large Language Model (LLM) is your agent's brain, processing user input and generating a response
+        # See all available models at https://docs.livekit.io/agents/models/llm/
+        # llm="openai/gpt-4.1-mini",
+        llm=openai.LLM.with_deepseek(model="deepseek-chat"),
+        # Text-to-speech (TTS) is your agent's voice, turning the LLM's text into speech that the user can hear
+        # See all available models as well as voice selections at https://docs.livekit.io/agents/models/tts/
+        # tts="cartesia/sonic-2:9626c31c-bec5-4cca-baa8-f8ba9e84c8bc",
+        # tts=minimax.TTS(
+        #     model="speech-2.6-turbo",
+        #     voice=initial_voice_id,
+        #     # voice="Friendly_Person",
+        #     # voice="Chinese (Mandarin)_Male_Announcer"
+        # ),
+        tts=aliyun.TTS(model="cosyvoice-v2", voice="longcheng_v2"),
+        # tts=azure.TTS(
+        #     speech_key=os.environ["AZURE_SPEECH_KEY"],
+        #     speech_region=os.environ["AZURE_SPEECH_REGION"],
+        #     language="zh-CN",
+        # ),
+        # tts=openai.TTS(
+        #     model="kokoro",
+        #     voice="zf_xiaoyi",
+        #     base_url="http://127.0.0.1:8880/v1",
+        #     api_key="not-needed",
+        # ),
+        # tts=cartesia.TTS(),
+        # VAD and turn detection are used to determine when the user is speaking and when the agent should respond
+        # See more at https://docs.livekit.io/agents/build/turns
+        # turn_detection=MultilingualModel(),
+        vad=ctx.proc.userdata["vad"],
+        # allow the LLM to generate a response while waiting for the end of turn
+        # See more at https://docs.livekit.io/agents/build/audio/#preemptive-generation
+        preemptive_generation=True,
+        # background noise can sometimes interrupt the session; these are considered
+        # false-positive interruptions, and when one is detected the agent's speech resumes
+        resume_false_interruption=True,
+        false_interruption_timeout=1.0,
+    )
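+
+    # the "voice" attribute is read only once, before the session starts. A hedged
+    # sketch of reacting to later changes (assumes the rtc "participant_attributes_changed"
+    # event and a TTS plugin whose update_options() accepts a voice argument):
+    #
+    # @ctx.room.on("participant_attributes_changed")
+    # def _on_attrs_changed(changed: dict[str, str], p: rtc.RemoteParticipant) -> None:
+    #     if "voice" in changed:
+    #         session.tts.update_options(voice=changed["voice"])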
+
+    # log metrics as they are emitted, and total usage after session is over
+    usage_collector = metrics.UsageCollector()
+
+    @session.on("metrics_collected")
+    def _on_metrics_collected(ev: MetricsCollectedEvent):
+        metrics.log_metrics(ev.metrics)
+        usage_collector.collect(ev.metrics)
+
+    async def log_usage():
+        summary = usage_collector.get_summary()
+        logger.info(f"Usage: {summary}")
+
+    # shutdown callbacks are triggered when the session is over
+    ctx.add_shutdown_callback(log_usage)
+
+    # launch the avatar if avatar_dispatcher_url is provided
+    if avatar_dispatcher_url:
+        await launch_avatar(ctx, avatar_dispatcher_url, AVATAR_IDENTITY)
+        session.output.audio = DataStreamAudioOutput(
+            ctx.room,
+            destination_identity=AVATAR_IDENTITY,
+            # (optional) wait for the avatar to publish its video track before generating a reply
+            wait_remote_track=rtc.TrackKind.KIND_VIDEO,
+        )
+
+        @session.output.audio.on("playback_finished")
+        def on_playback_finished(ev: PlaybackFinishedEvent) -> None:
+            # the avatar notifies us when audio playback is finished
+            logger.info(
+                "playback_finished",
+                extra={
+                    "playback_position": ev.playback_position,
+                    "interrupted": ev.interrupted,
+                },
+            )
+
+    await session.start(
+        agent=MyAgent(),
+        room=ctx.room,
+        room_input_options=RoomInputOptions(
+            # uncomment to enable Krisp BVC noise cancellation
+            # noise_cancellation=noise_cancellation.BVC(),
+        ),
+        room_output_options=RoomOutputOptions(transcription_enabled=True),
+    )
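+
+    # room metadata carries the debate/match state published by the frontend.
+    # a sketch of what such a payload might look like (the schema is not defined
+    # anywhere in this patch; treat these fields as illustrative):
+    #
+    #   {"phase": "opening", "round": 1, "current_speaker": "user"}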
+    # --- core: listen for room metadata changes ---
+    @ctx.room.on("room_metadata_changed")
+    def on_metadata_changed(old_metadata: str, new_metadata: str):
+        logger.info(f"received new match state: {new_metadata} (previous: {old_metadata})")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--avatar-url",
+        type=str,
+        default=None,
+        help="Avatar dispatcher URL (e.g., http://localhost:8089/launch)",
+    )
+    args, remaining_args = parser.parse_known_args()
+    # hand the remaining arguments (e.g., the CLI subcommand) back to the agents CLI
+    sys.argv = sys.argv[:1] + remaining_args
+
+    if args.avatar_url:
+        cli.run_app(
+            WorkerOptions(
+                entrypoint_fnc=partial(entrypoint, avatar_dispatcher_url=args.avatar_url),
+                prewarm_fnc=prewarm,
+            )
+        )
+    else:
+        cli.run_app(WorkerOptions(entrypoint_fnc=entrypoint, prewarm_fnc=prewarm))
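+
+# usage (a sketch; assumes the standard LiveKit Agents CLI subcommands):
+#   python my_basic_agent_debate.py dev
+#   python my_basic_agent_debate.py dev --avatar-url http://localhost:8089/launch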