Hidden assistant demo

2025-07-07 11:58:03 +08:00
1 changed file with 31 additions and 126 deletions
--- a/examples/simple-chatbot/server/bot-openai.py
+++ b/examples/simple-chatbot/server/bot-openai.py
@@ -4,18 +4,6 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #
 """OpenAI Bot Implementation.
 This module implements a chatbot using OpenAI's GPT-4 model for natural language
 processing. It includes:
 - Real-time audio/video interaction through Daily
 - Animated robot avatar
 - Text-to-speech using ElevenLabs
 - Support for both English and Spanish
 The bot runs as part of a pipeline that processes audio/video frames and manages
 the conversation flow.
 """
 import asyncio
 import os
@@ -24,150 +12,72 @@ import sys
 import aiohttp
 from dotenv import load_dotenv
 from loguru import logger
 from PIL import Image
 from runner import configure
 from pipecat.audio.vad.silero import SileroVADAnalyzer
 from pipecat.frames.frames import (
    BotStartedSpeakingFrame,
    BotStoppedSpeakingFrame,
    Frame,
    OutputImageRawFrame,
    SpriteFrame,
 )
 from pipecat.pipeline.pipeline import Pipeline
 from pipecat.pipeline.runner import PipelineRunner
 from pipecat.pipeline.task import PipelineParams, PipelineTask
 from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
 from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
 from pipecat.processors.frameworks.rtvi import RTVIConfig, RTVIObserver, RTVIProcessor
 from pipecat.services.elevenlabs.tts import ElevenLabsTTSService
 from pipecat.services.openai.llm import OpenAILLMService
 from pipecat.transports.services.daily import DailyParams, DailyTransport
 from pipecat.transports.services.helpers.daily_rest import (
    DailyMeetingTokenParams,
    DailyMeetingTokenProperties,
    DailyRESTHelper,
    DailyRoomParams,
 )
 # Pull settings from a local .env file before any service reads os.getenv().
 load_dotenv(override=True)
 # Swap loguru's pre-installed handler (id 0) for a DEBUG-level stderr sink.
 logger.remove(0)
 logger.add(sys.stderr, level="DEBUG")
 # Animation assets are resolved relative to this script, not the working directory.
 script_dir = os.path.dirname(__file__)
 sprites = []
 # Read the numbered robot images (1 through 25) in order; each becomes one raw frame.
 for i in range(1, 26):
    full_path = os.path.join(script_dir, f"assets/robot0{i}.png")
    # Copy the pixel data out while the file is open; the handle is closed on exit.
    with Image.open(full_path) as img:
        sprite = OutputImageRawFrame(image=img.tobytes(), size=img.size, format=img.format)
    sprites.append(sprite)
 # Append the same frames in reverse so the sequence plays forward and then back.
 flipped = list(reversed(sprites))
 sprites += flipped
 # Static frame shown while the bot is listening.
 quiet_frame = sprites[0]
 # Full forward-and-back sequence shown while the bot is talking.
 talking_frame = SpriteFrame(images=sprites)
 class TalkingAnimation(FrameProcessor):
    """Frame processor that drives the robot avatar.

    Watches for bot speaking start/stop frames and pushes either the
    animated sprite sequence (talking) or the single still frame (quiet).
    Every incoming frame is forwarded unchanged afterwards.
    """
    def __init__(self):
        super().__init__()
        # True between a BotStartedSpeakingFrame and the next BotStoppedSpeakingFrame.
        self._is_talking = False
    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Update the avatar state for one frame, then pass the frame along.

        Args:
            frame: The frame received by this processor.
            direction: The direction the frame is travelling; reused when
                forwarding it.
        """
        await super().process_frame(frame, direction)
        speech_started = isinstance(frame, BotStartedSpeakingFrame)
        # A start frame is never also treated as a stop frame.
        speech_stopped = not speech_started and isinstance(frame, BotStoppedSpeakingFrame)
        # Start the animation only on the transition into speaking, so repeated
        # start frames do not push the sprite sequence again.
        if speech_started and not self._is_talking:
            await self.push_frame(talking_frame)
            self._is_talking = True
        # Any stop frame puts the still image back, whatever the previous state.
        if speech_stopped:
            await self.push_frame(quiet_frame)
            self._is_talking = False
        await self.push_frame(frame, direction)
 async def main():
-    """Main bot execution function.
+    """Main bot execution function."""
    Sets up and runs the bot pipeline including:
    - Daily video transport
    - Speech-to-text and text-to-speech services
    - Language model integration
    - Animation processing
    - RTVI event handling
    """
    async with aiohttp.ClientSession() as session:
-        (room_url, token) = await configure(session)
+        daily_rest_helper = DailyRESTHelper(
            daily_api_key=os.getenv("DAILY_API_KEY"),
            daily_api_url=os.getenv("DAILY_API_URL", "https://api.daily.co/v1"),
            aiohttp_session=session,
        )
        room = await daily_rest_helper.create_room(
            DailyRoomParams(properties={"enable_prejoin_ui": False})
        )
        token_params = DailyMeetingTokenParams(
            properties=DailyMeetingTokenProperties(
                is_owner=True,
                permissions={
                    "hasPresence": False,  # Example: join as a hidden participant
                },
                start_video_off=True,
                start_audio_off=True,
            )
        )
        token = await daily_rest_helper.get_token(room_url=room.url, params=token_params)
        # Set up Daily transport with video/audio parameters
        transport = DailyTransport(
-            room_url,
+            room.url,
            token,
            "Chatbot",
            DailyParams(
                audio_in_enabled=True,
                audio_out_enabled=True,
                video_out_enabled=True,
                video_out_width=1024,
                video_out_height=576,
                vad_analyzer=SileroVADAnalyzer(),
                transcription_enabled=True,
                #
                # Spanish
                #
                # transcription_settings=DailyTranscriptionSettings(
                #     language="es",
                #     tier="nova",
                #     model="2-general"
                # )
            ),
        )
        # Initialize text-to-speech service
        tts = ElevenLabsTTSService(
            api_key=os.getenv("ELEVENLABS_API_KEY"),
            #
            # English
            #
            voice_id="pNInz6obpgDQGcFmaJgB",
            #
            # Spanish
            #
            # model="eleven_multilingual_v2",
            # voice_id="gD1IexrzCvsXPHUuT0s3",
        )
        # Initialize LLM service
        llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
        messages = [
            {
                "role": "system",
-                #
+                "content": "Summerize the conversation so far in a single sentence.",
                # English
                #
                "content": "You are Chatbot, a friendly, helpful robot. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way, but keep your responses brief. Start by introducing yourself.",
                #
                # Spanish
                #
                # "content": "Eres Chatbot, un amigable y útil robot. Tu objetivo es demostrar tus capacidades de una manera breve. Tus respuestas se convertiran a audio así que nunca no debes incluir caracteres especiales. Contesta a lo que el usuario pregunte de una manera creativa, útil y breve. Empieza por presentarte a ti mismo.",
            },
        ]
@@ -176,8 +86,6 @@ async def main():
        context = OpenAILLMContext(messages)
        context_aggregator = llm.create_context_aggregator(context)
        ta = TalkingAnimation()
        #
        # RTVI events for Pipecat client UI
        #
@@ -189,8 +97,6 @@ async def main():
                rtvi,
                context_aggregator.user(),
                llm,
                tts,
                ta,
                transport.output(),
                context_aggregator.assistant(),
            ]
@@ -204,7 +110,6 @@ async def main():
            ),
            observers=[RTVIObserver(rtvi)],
        )
        await task.queue_frame(quiet_frame)
        @rtvi.event_handler("on_client_ready")
        async def on_client_ready(rtvi):