Hidden assistant demo

2025-07-07 11:58:03 +08:00
1 changed files with 31 additions and 126 deletions
--- a/examples/simple-chatbot/server/bot-openai.py
+++ b/examples/simple-chatbot/server/bot-openai.py
@@ -4,18 +4,6 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #

-"""OpenAI Bot Implementation.
-
-This module implements a chatbot using OpenAI's GPT-4 model for natural language
-processing. It includes:
- Real-time audio/video interaction through Daily
- Animated robot avatar
- Text-to-speech using ElevenLabs
- Support for both English and Spanish
-
-The bot runs as part of a pipeline that processes audio/video frames and manages
-the conversation flow.
-"""

 import asyncio
 import os
@@ -24,150 +12,72 @@ import sys
 import aiohttp
 from dotenv import load_dotenv
 from loguru import logger
-from PIL import Image
-from runner import configure

 from pipecat.audio.vad.silero import SileroVADAnalyzer
-from pipecat.frames.frames import (
-    BotStartedSpeakingFrame,
-    BotStoppedSpeakingFrame,
-    Frame,
-    OutputImageRawFrame,
-    SpriteFrame,
-)
 from pipecat.pipeline.pipeline import Pipeline
 from pipecat.pipeline.runner import PipelineRunner
 from pipecat.pipeline.task import PipelineParams, PipelineTask
 from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
-from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
 from pipecat.processors.frameworks.rtvi import RTVIConfig, RTVIObserver, RTVIProcessor
-from pipecat.services.elevenlabs.tts import ElevenLabsTTSService
 from pipecat.services.openai.llm import OpenAILLMService
 from pipecat.transports.services.daily import DailyParams, DailyTransport
+from pipecat.transports.services.helpers.daily_rest import (
+    DailyMeetingTokenParams,
+    DailyMeetingTokenProperties,
+    DailyRESTHelper,
+    DailyRoomParams,
+)

 load_dotenv(override=True)
 logger.remove(0)
 logger.add(sys.stderr, level="DEBUG")

-sprites = []
-script_dir = os.path.dirname(__file__)
-
-# Load sequential animation frames
-for i in range(1, 26):
-    # Build the full path to the image file
-    full_path = os.path.join(script_dir, f"assets/robot0{i}.png")
-    # Get the filename without the extension to use as the dictionary key
-    # Open the image and convert it to bytes
-    with Image.open(full_path) as img:
-        sprites.append(OutputImageRawFrame(image=img.tobytes(), size=img.size, format=img.format))
-
-# Create a smooth animation by adding reversed frames
-flipped = sprites[::-1]
-sprites.extend(flipped)
-
-# Define static and animated states
-quiet_frame = sprites[0]  # Static frame for when bot is listening
-talking_frame = SpriteFrame(images=sprites)  # Animation sequence for when bot is talking
-
-
-class TalkingAnimation(FrameProcessor):
-    """Manages the bot's visual animation states.
-
-    Switches between static (listening) and animated (talking) states based on
-    the bot's current speaking status.
-    """
-
-    def __init__(self):
-        super().__init__()
-        self._is_talking = False
-
-    async def process_frame(self, frame: Frame, direction: FrameDirection):
-        """Process incoming frames and update animation state.
-
-        Args:
-            frame: The incoming frame to process
-            direction: The direction of frame flow in the pipeline
-        """
-        await super().process_frame(frame, direction)
-
-        # Switch to talking animation when bot starts speaking
-        if isinstance(frame, BotStartedSpeakingFrame):
-            if not self._is_talking:
-                await self.push_frame(talking_frame)
-                self._is_talking = True
-        # Return to static frame when bot stops speaking
-        elif isinstance(frame, BotStoppedSpeakingFrame):
-            await self.push_frame(quiet_frame)
-            self._is_talking = False
-
-        await self.push_frame(frame, direction)
-

 async def main():
-    """Main bot execution function.
-
-    Sets up and runs the bot pipeline including:
-    - Daily video transport
-    - Speech-to-text and text-to-speech services
-    - Language model integration
-    - Animation processing
-    - RTVI event handling
-    """
+    """Main bot execution function."""
    async with aiohttp.ClientSession() as session:
-        (room_url, token) = await configure(session)
+        daily_rest_helper = DailyRESTHelper(
+            daily_api_key=os.getenv("DAILY_API_KEY"),
+            daily_api_url=os.getenv("DAILY_API_URL", "https://api.daily.co/v1"),
+            aiohttp_session=session,
+        )
+
+        room = await daily_rest_helper.create_room(
+            DailyRoomParams(properties={"enable_prejoin_ui": False})
+        )
+
+        token_params = DailyMeetingTokenParams(
+            properties=DailyMeetingTokenProperties(
+                is_owner=True,
+                permissions={
+                    "hasPresence": False,  # Example: join as a hidden participant
+                },
+                start_video_off=True,
+                start_audio_off=True,
+            )
+        )
+
+        token = await daily_rest_helper.get_token(room_url=room.url, params=token_params)

        # Set up Daily transport with video/audio parameters
        transport = DailyTransport(
-            room_url,
+            room.url,
            token,
            "Chatbot",
            DailyParams(
                audio_in_enabled=True,
-                audio_out_enabled=True,
-                video_out_enabled=True,
-                video_out_width=1024,
-                video_out_height=576,
                vad_analyzer=SileroVADAnalyzer(),
                transcription_enabled=True,
-                #
-                # Spanish
-                #
-                # transcription_settings=DailyTranscriptionSettings(
-                #     language="es",
-                #     tier="nova",
-                #     model="2-general"
-                # )
            ),
        )

-        # Initialize text-to-speech service
-        tts = ElevenLabsTTSService(
-            api_key=os.getenv("ELEVENLABS_API_KEY"),
-            #
-            # English
-            #
-            voice_id="pNInz6obpgDQGcFmaJgB",
-            #
-            # Spanish
-            #
-            # model="eleven_multilingual_v2",
-            # voice_id="gD1IexrzCvsXPHUuT0s3",
-        )
-
        # Initialize LLM service
        llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))

        messages = [
            {
                "role": "system",
-                #
-                # English
-                #
-                "content": "You are Chatbot, a friendly, helpful robot. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way, but keep your responses brief. Start by introducing yourself.",
-                #
-                # Spanish
-                #
-                # "content": "Eres Chatbot, un amigable y útil robot. Tu objetivo es demostrar tus capacidades de una manera breve. Tus respuestas se convertiran a audio así que nunca no debes incluir caracteres especiales. Contesta a lo que el usuario pregunte de una manera creativa, útil y breve. Empieza por presentarte a ti mismo.",
+                "content": "Summerize the conversation so far in a single sentence.",
            },
        ]

@@ -176,8 +86,6 @@ async def main():
        context = OpenAILLMContext(messages)
        context_aggregator = llm.create_context_aggregator(context)

-        ta = TalkingAnimation()
-
        #
        # RTVI events for Pipecat client UI
        #
@@ -189,8 +97,6 @@ async def main():
                rtvi,
                context_aggregator.user(),
                llm,
-                tts,
-                ta,
                transport.output(),
                context_aggregator.assistant(),
            ]
@@ -204,7 +110,6 @@ async def main():
            ),
            observers=[RTVIObserver(rtvi)],
        )
-        await task.queue_frame(quiet_frame)

        @rtvi.event_handler("on_client_ready")
        async def on_client_ready(rtvi):