#
# Copyright (c) 2024–2025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#

"""Storytelling bot that switches TTS voices mid-response.

Example ``examples/foundational/35-voice-switching.py``.

The LLM is prompted to prefix each passage with a ``<voice>NAME</voice>`` tag.
A ``PatternPairAggregator`` attached to the TTS service detects those tags and
a handler switches the Cartesia voice to the matching entry in ``VOICE_IDS``.

Reads ``CARTESIA_API_KEY`` and ``OPENAI_API_KEY`` from the environment (a
``.env`` file is loaded on import).
"""

import asyncio
import os
import sys

import aiohttp
from dotenv import load_dotenv
from loguru import logger
from runner import configure

from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.utils.text.pattern_pair_aggregator import PatternMatch, PatternPairAggregator

load_dotenv(override=True)

logger.remove(0)
logger.add(sys.stderr, level="DEBUG")

# Define voice IDs
VOICE_IDS = {
    "narrator": "c45bc5ec-dc68-4feb-8829-6e6b2748095d",  # Narrator voice
    "female": "71a7ad14-091c-4e8e-a314-022ece01c121",  # Female character voice
    "male": "7cf0e2b1-8daf-4fe4-89ad-f6039398f359",  # Male character voice
}

# Delimiters the LLM wraps a voice name in. They must be non-empty and must
# match the tag syntax shown in SYSTEM_PROMPT, otherwise the aggregator never
# sees a voice name and the voice never changes.
VOICE_TAG_START = "<voice>"
VOICE_TAG_END = "</voice>"

# System prompt for storytelling with voice switching. The tag names used here
# must be keys of VOICE_IDS.
SYSTEM_PROMPT = """You are an engaging storyteller that uses different voices to bring stories to life.

You have three voices to use, but each has a specific purpose:

<voice>narrator</voice>
This is the default narrator voice. Use this for all narration, descriptions, and non-dialogue text.

<voice>female</voice>
Use this ONLY for direct speech by female characters (just the quoted text).

<voice>male</voice>
Use this ONLY for direct speech by male characters (just the quoted text).

IMPORTANT: Switch back to narrator voice immediately after character dialogue.

Here's an EXAMPLE of correct voice usage:

<voice>narrator</voice>
Sarah spotted her old friend across the café. She couldn't believe her eyes.

<voice>female</voice>
"Jacob! It's been so long!"

<voice>narrator</voice>
Sarah exclaimed, jumping up from her seat with a radiant smile.

<voice>male</voice>
"Sarah, is it really you? I can't believe it!"

<voice>narrator</voice>
Jacob replied, grinning widely as he walked over to her. The two friends embraced warmly, as if trying to make up for all the years spent apart.

<voice>female</voice>
"What are you doing in town? Last I heard you were in Seattle."

<voice>narrator</voice>
She asked, gesturing for him to join her at the table.

FOLLOW THESE RULES:
1. Always begin with the narrator voice
2. Only use character voices for the EXACT words they speak (in quotes)
3. SWITCH BACK to narrator voice for speech tags and all other text
4. Begin by asking what kind of story the user would like to hear
5. Create engaging dialogue with distinct characters

Remember: Use narrator voice for EVERYTHING except the actual quoted dialogue."""


async def main():
    """Build the storytelling pipeline and run it until the participant leaves.

    Wires Daily transport -> user context -> OpenAI LLM -> Cartesia TTS ->
    Daily output -> assistant context, with a voice-tag handler installed on
    the TTS text aggregator.
    """
    async with aiohttp.ClientSession() as session:
        # Room URL and token come from the example runner helper.
        (room_url, token) = await configure(session)

        transport = DailyTransport(
            room_url,
            token,
            "Storytelling Bot",
            DailyParams(
                audio_out_enabled=True,
                transcription_enabled=True,
                vad_enabled=True,
                vad_analyzer=SileroVADAnalyzer(),
            ),
        )

        # Initialize TTS with narrator voice as default
        tts = CartesiaTTSService(
            api_key=os.getenv("CARTESIA_API_KEY"),
            voice_id=VOICE_IDS["narrator"],
        )

        # Create pattern pair aggregator for voice switching
        pattern_aggregator = PatternPairAggregator()

        # Add pattern for voice switching.
        # remove_match=True presumably strips the tag from the text passed on
        # to TTS so it is not spoken aloud — confirm against the aggregator docs.
        pattern_aggregator.add_pattern_pair(
            pattern_id="voice_tag",
            start_pattern=VOICE_TAG_START,
            end_pattern=VOICE_TAG_END,
            remove_match=True,
        )

        # Register handler for voice switching
        def on_voice_tag(match: PatternMatch):
            """Switch the TTS voice to the one named inside a matched tag.

            The tag content is trimmed and lower-cased before lookup; names
            that are not keys of ``VOICE_IDS`` are logged and ignored, leaving
            the current voice unchanged.
            """
            voice_name = match.content.strip().lower()
            voice_id = VOICE_IDS.get(voice_name)
            if voice_id is None:
                logger.warning(f"Unknown voice: {voice_name}")
                return
            tts.set_voice(voice_id)
            logger.info(f"Switched to {voice_name} voice")

        pattern_aggregator.on_pattern_match("voice_tag", on_voice_tag)

        # Set the pattern aggregator on the TTS service.
        # NOTE(review): this assigns a private attribute; if the installed
        # pipecat version exposes a public way to supply the aggregator,
        # prefer that — TODO confirm.
        tts._text_aggregator = pattern_aggregator

        # Initialize LLM
        llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")

        # Set up LLM context
        messages = [
            {
                "role": "system",
                "content": SYSTEM_PROMPT,
            },
        ]

        context = OpenAILLMContext(messages)
        context_aggregator = llm.create_context_aggregator(context)

        # Create pipeline
        pipeline = Pipeline(
            [
                transport.input(),
                context_aggregator.user(),
                llm,
                tts,  # TTS with pattern aggregator
                transport.output(),
                context_aggregator.assistant(),
            ]
        )

        task = PipelineTask(
            pipeline,
            params=PipelineParams(
                allow_interruptions=True,
                enable_metrics=True,
                enable_usage_metrics=True,
                report_only_initial_ttfb=True,
            ),
        )

        @transport.event_handler("on_first_participant_joined")
        async def on_first_participant_joined(transport, participant):
            """Start transcription for the first participant and kick off the story."""
            logger.info(f"First participant joined: {participant['id']}")
            await transport.capture_participant_transcription(participant["id"])

            # Start conversation - empty prompt to let LLM follow system instructions
            await task.queue_frames([context_aggregator.user().get_context_frame()])

        @transport.event_handler("on_participant_left")
        async def on_participant_left(transport, participant, reason):
            """Cancel the pipeline task when a participant leaves."""
            logger.info(f"Participant left: {participant['id']}")
            await task.cancel()

        logger.info(f"Starting storytelling bot at: {room_url}")
        logger.info("Join the room to interact with the bot!")

        runner = PipelineRunner()
        await runner.run(task)


if __name__ == "__main__":
    asyncio.run(main())