pipecat/examples/foundational/08-bots-arguing.py

#
# Copyright (c) 2024–2025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#

# /// script
# dependencies = [
#   "pipecat-ai[daily,azure,elevenlabs,fal]>=0.0.77",
# ]
# ///

import asyncio
import logging
import os
from typing import Tuple

import aiohttp
from dotenv import load_dotenv

from pipecat.frames.frames import AudioFrame, EndFrame, ImageFrame, LLMMessagesFrame, TextFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.processors.aggregators import SentenceAggregator
from pipecat.runner.daily import configure
from pipecat.services.azure import AzureLLMService, AzureTTSService
from pipecat.services.elevenlabs import ElevenLabsTTSService
from pipecat.services.fal import FalImageGenService
from pipecat.transports.services.daily import DailyTransport

load_dotenv(override=True)

logging.basicConfig(format=f"%(levelno)s %(asctime)s %(message)s")
logger = logging.getLogger("pipecat")
logger.setLevel(logging.DEBUG)


async def main():
    async with aiohttp.ClientSession() as session:
        (room_url, _) = await configure(session)

        transport = DailyTransport(
            room_url,
            None,
            "Respond bot",
            duration_minutes=10,
            mic_enabled=True,
            mic_sample_rate=16000,
            camera_enabled=True,
            camera_width=1024,
            camera_height=1024,
        )

        llm = AzureLLMService(
            api_key=os.getenv("AZURE_CHATGPT_API_KEY"),
            endpoint=os.getenv("AZURE_CHATGPT_ENDPOINT"),
            model=os.getenv("AZURE_CHATGPT_MODEL"),
        )
        tts1 = AzureTTSService(
            api_key=os.getenv("AZURE_SPEECH_API_KEY"),
            region=os.getenv("AZURE_SPEECH_REGION"),
        )
        tts2 = ElevenLabsTTSService(
            api_key=os.getenv("ELEVENLABS_API_KEY"),
            voice_id="jBpfuIE2acCO8z3wKNLl",
        )
        dalle = FalImageGenService(
            params=FalImageGenService.InputParams(image_size="1024x1024"),
            aiohttp_session=session,
            key=os.getenv("FAL_KEY"),
        )

        bot1_messages = [
            {
                "role": "system",
                "content": "You are a stern librarian. You strongly believe that a hot dog is a sandwich. Start by stating this fact in a few sentences, then be prepared to debate this with the user. You shouldn't ever compromise on the fundamental truth that a hot dog is a sandwich. Your responses should only be a few sentences long.",
            },
        ]
        bot2_messages = [
            {
                "role": "system",
                "content": "You are a silly cat, and you strongly believe that a hot dog is not a sandwich. Debate this with the user, only responding with a few sentences. Don't ever accept that a hot dog is a sandwich.",
            },
        ]

        async def get_text_and_audio(messages) -> Tuple[str, bytearray]:
            """This function streams text from the LLM and uses the TTS service to convert
            that text to speech as it's received.
            """
            source_queue = asyncio.Queue()
            sink_queue = asyncio.Queue()
            sentence_aggregator = SentenceAggregator()
            pipeline = Pipeline([llm, sentence_aggregator, tts1], source_queue, sink_queue)

            await source_queue.put(LLMMessagesFrame(messages))
            await source_queue.put(EndFrame())
            await pipeline.run_pipeline()

            message = ""
            all_audio = bytearray()
            while sink_queue.qsize():
                frame = sink_queue.get_nowait()
                if isinstance(frame, TextFrame):
                    message += frame.text
                elif isinstance(frame, AudioFrame):
                    all_audio.extend(frame.audio)

            return (message, all_audio)

        async def get_bot1_statement():
            message, audio = await get_text_and_audio(bot1_messages)

            bot1_messages.append({"role": "assistant", "content": message})
            bot2_messages.append({"role": "user", "content": message})

            return audio

        async def get_bot2_statement():
            message, audio = await get_text_and_audio(bot2_messages)

            bot2_messages.append({"role": "assistant", "content": message})
            bot1_messages.append({"role": "user", "content": message})

            return audio

        async def argue():
            for i in range(100):
                print(f"In iteration {i}")

                bot1_description = "A woman conservatively dressed as a librarian in a library surrounded by books, cartoon, serious, highly detailed"

                (audio1, image_data1) = await asyncio.gather(
                    get_bot1_statement(), dalle.run_image_gen(bot1_description)
                )
                await transport.send_queue.put(
                    [
                        ImageFrame(image_data1[1], image_data1[2]),
                        AudioFrame(audio1),
                    ]
                )

                bot2_description = "A cat dressed in a hot dog costume, cartoon, bright colors, funny, highly detailed"

                (audio2, image_data2) = await asyncio.gather(
                    get_bot2_statement(), dalle.run_image_gen(bot2_description)
                )
                await transport.send_queue.put(
                    [
                        ImageFrame(image_data2[1], image_data2[2]),
                        AudioFrame(audio2),
                    ]
                )

        await asyncio.gather(transport.run(), argue())


if __name__ == "__main__":
    asyncio.run(main())