From d175e5e5fc4a94ce910bf3ea96bf1b44dcf9222c Mon Sep 17 00:00:00 2001 From: James Hush Date: Mon, 7 Jul 2025 11:58:03 +0800 Subject: [PATCH] Hidden assistant demo --- examples/simple-chatbot/server/bot-openai.py | 157 ++++--------------- 1 file changed, 31 insertions(+), 126 deletions(-) diff --git a/examples/simple-chatbot/server/bot-openai.py b/examples/simple-chatbot/server/bot-openai.py index 8316d3918..68478fe46 100644 --- a/examples/simple-chatbot/server/bot-openai.py +++ b/examples/simple-chatbot/server/bot-openai.py @@ -4,18 +4,6 @@ # SPDX-License-Identifier: BSD 2-Clause License # -"""OpenAI Bot Implementation. - -This module implements a chatbot using OpenAI's GPT-4 model for natural language -processing. It includes: -- Real-time audio/video interaction through Daily -- Animated robot avatar -- Text-to-speech using ElevenLabs -- Support for both English and Spanish - -The bot runs as part of a pipeline that processes audio/video frames and manages -the conversation flow. -""" import asyncio import os @@ -24,150 +12,72 @@ import sys import aiohttp from dotenv import load_dotenv from loguru import logger -from PIL import Image -from runner import configure from pipecat.audio.vad.silero import SileroVADAnalyzer -from pipecat.frames.frames import ( - BotStartedSpeakingFrame, - BotStoppedSpeakingFrame, - Frame, - OutputImageRawFrame, - SpriteFrame, -) from pipecat.pipeline.pipeline import Pipeline from pipecat.pipeline.runner import PipelineRunner from pipecat.pipeline.task import PipelineParams, PipelineTask from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext -from pipecat.processors.frame_processor import FrameDirection, FrameProcessor from pipecat.processors.frameworks.rtvi import RTVIConfig, RTVIObserver, RTVIProcessor -from pipecat.services.elevenlabs.tts import ElevenLabsTTSService from pipecat.services.openai.llm import OpenAILLMService from pipecat.transports.services.daily import DailyParams, DailyTransport +from 
pipecat.transports.services.helpers.daily_rest import ( + DailyMeetingTokenParams, + DailyMeetingTokenProperties, + DailyRESTHelper, + DailyRoomParams, +) load_dotenv(override=True) logger.remove(0) logger.add(sys.stderr, level="DEBUG") -sprites = [] -script_dir = os.path.dirname(__file__) - -# Load sequential animation frames -for i in range(1, 26): - # Build the full path to the image file - full_path = os.path.join(script_dir, f"assets/robot0{i}.png") - # Get the filename without the extension to use as the dictionary key - # Open the image and convert it to bytes - with Image.open(full_path) as img: - sprites.append(OutputImageRawFrame(image=img.tobytes(), size=img.size, format=img.format)) - -# Create a smooth animation by adding reversed frames -flipped = sprites[::-1] -sprites.extend(flipped) - -# Define static and animated states -quiet_frame = sprites[0] # Static frame for when bot is listening -talking_frame = SpriteFrame(images=sprites) # Animation sequence for when bot is talking - - -class TalkingAnimation(FrameProcessor): - """Manages the bot's visual animation states. - - Switches between static (listening) and animated (talking) states based on - the bot's current speaking status. - """ - - def __init__(self): - super().__init__() - self._is_talking = False - - async def process_frame(self, frame: Frame, direction: FrameDirection): - """Process incoming frames and update animation state. 
- - Args: - frame: The incoming frame to process - direction: The direction of frame flow in the pipeline - """ - await super().process_frame(frame, direction) - - # Switch to talking animation when bot starts speaking - if isinstance(frame, BotStartedSpeakingFrame): - if not self._is_talking: - await self.push_frame(talking_frame) - self._is_talking = True - # Return to static frame when bot stops speaking - elif isinstance(frame, BotStoppedSpeakingFrame): - await self.push_frame(quiet_frame) - self._is_talking = False - - await self.push_frame(frame, direction) - async def main(): - """Main bot execution function. - - Sets up and runs the bot pipeline including: - - Daily video transport - - Speech-to-text and text-to-speech services - - Language model integration - - Animation processing - - RTVI event handling - """ + """Main bot execution function.""" async with aiohttp.ClientSession() as session: - (room_url, token) = await configure(session) + daily_rest_helper = DailyRESTHelper( + daily_api_key=os.getenv("DAILY_API_KEY"), + daily_api_url=os.getenv("DAILY_API_URL", "https://api.daily.co/v1"), + aiohttp_session=session, + ) + + room = await daily_rest_helper.create_room( + DailyRoomParams(properties={"enable_prejoin_ui": False}) + ) + + token_params = DailyMeetingTokenParams( + properties=DailyMeetingTokenProperties( + is_owner=True, + permissions={ + "hasPresence": False, # Example: join as a hidden participant + }, + start_video_off=True, + start_audio_off=True, + ) + ) + + token = await daily_rest_helper.get_token(room_url=room.url, params=token_params) # Set up Daily transport with video/audio parameters transport = DailyTransport( - room_url, + room.url, token, "Chatbot", DailyParams( audio_in_enabled=True, - audio_out_enabled=True, - video_out_enabled=True, - video_out_width=1024, - video_out_height=576, vad_analyzer=SileroVADAnalyzer(), transcription_enabled=True, - # - # Spanish - # - # transcription_settings=DailyTranscriptionSettings( - # 
language="es", - # tier="nova", - # model="2-general" - # ) ), ) - # Initialize text-to-speech service - tts = ElevenLabsTTSService( - api_key=os.getenv("ELEVENLABS_API_KEY"), - # - # English - # - voice_id="pNInz6obpgDQGcFmaJgB", - # - # Spanish - # - # model="eleven_multilingual_v2", - # voice_id="gD1IexrzCvsXPHUuT0s3", - ) - # Initialize LLM service llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY")) messages = [ { "role": "system", - # - # English - # - "content": "You are Chatbot, a friendly, helpful robot. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way, but keep your responses brief. Start by introducing yourself.", - # - # Spanish - # - # "content": "Eres Chatbot, un amigable y útil robot. Tu objetivo es demostrar tus capacidades de una manera breve. Tus respuestas se convertiran a audio así que nunca no debes incluir caracteres especiales. Contesta a lo que el usuario pregunte de una manera creativa, útil y breve. Empieza por presentarte a ti mismo.", + "content": "Summarize the conversation so far in a single sentence.", }, ] @@ -176,8 +86,6 @@ async def main(): context = OpenAILLMContext(messages) context_aggregator = llm.create_context_aggregator(context) - ta = TalkingAnimation() - # # RTVI events for Pipecat client UI # @@ -189,8 +97,6 @@ async def main(): rtvi, context_aggregator.user(), llm, - tts, - ta, transport.output(), context_aggregator.assistant(), ] @@ -204,7 +110,6 @@ async def main(): ), observers=[RTVIObserver(rtvi)], ) - await task.queue_frame(quiet_frame) @rtvi.event_handler("on_client_ready") async def on_client_ready(rtvi):