Compare commits

..

4 Commits

Author SHA1 Message Date
James Hush
b5934783a7 Update comment 2025-02-20 15:12:12 +08:00
James Hush
95b28f635a Change prompt to make it about vacuums and tvs 2025-02-20 15:07:09 +08:00
Aleix Conchillo Flaqué
98259af54e update CHANGELOG 2025-02-19 22:05:48 -08:00
Dominic Stewart
039d144c79 examples(phone-bot): updated example to use Gemini (#1233) 2025-02-19 22:03:37 -08:00
7 changed files with 320 additions and 48 deletions

View File

@@ -36,6 +36,10 @@ stt = DeepgramSTTService(..., live_options=LiveOptions(model="nova-2-general"))
- Fixed a `STTMuteFilter` issue that would not mute user audio frames causing
transcriptions to be generated by the STT service.
### Other
- Added Gemini support to `examples/phone-chatbot`.
## [0.0.57] - 2025-02-14
### Added

View File

@@ -21,11 +21,6 @@ from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.transports.services.helpers.daily_rest import (
DailyMeetingTokenParams,
DailyRESTHelper,
DailyRoomParams,
)
load_dotenv(override=True)
@@ -35,31 +30,10 @@ logger.add(sys.stderr, level="DEBUG")
async def main():
async with aiohttp.ClientSession() as session:
daily_rest_helper = DailyRESTHelper(
daily_api_key=os.getenv("DAILY_API_KEY"),
daily_api_url=os.getenv("DAILY_API_URL", "https://api.daily.co/v1"),
aiohttp_session=session,
)
room = await daily_rest_helper.create_room(
params=DailyRoomParams(properties={"enable_recording": "cloud"})
)
params = DailyMeetingTokenParams(
properties={
"enable_recording": "cloud",
"start_cloud_recording": True,
}
)
token = await daily_rest_helper.get_token(
room_url=room.url, expiry_time=60 * 60, params=params
)
logger.debug(f"Room URL: {room.url} Room token: {token}")
(room_url, token) = await configure(session)
transport = DailyTransport(
room.url,
room_url,
token,
"Respond bot",
DailyParams(
@@ -111,7 +85,6 @@ async def main():
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
await transport.capture_participant_transcription(participant["id"])
# await transport.start_recording()
# Kick off the conversation.
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
await task.queue_frames([context_aggregator.user().get_context_frame()])

View File

@@ -29,14 +29,22 @@ logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
async def start_fetch_weather(function_name, llm, context):
async def start_fetch_products(function_name, llm, context):
"""Push a frame to the LLM; this is handy when the LLM response might take a while."""
await llm.push_frame(TTSSpeakFrame("Let me check on that."))
logger.debug(f"Starting fetch_weather_from_api with function_name: {function_name}")
await llm.push_frame(TTSSpeakFrame("I'll take a look!"))
logger.debug(f"Starting fetch_products_from_api with function_name: {function_name}")
async def fetch_weather_from_api(function_name, tool_call_id, args, llm, context, result_callback):
await result_callback({"conditions": "nice", "temperature": "75"})
async def fetch_products_from_api(function_name, tool_call_id, args, llm, context, result_callback):
    """Return the (hardcoded) list of products for the requested category.

    This is the function-call handler registered with the LLM service. The
    result is delivered through ``result_callback`` rather than returned.

    Args:
        function_name: Name of the tool the LLM invoked (unused here).
        tool_call_id: Identifier of the tool call (unused here).
        args: Arguments produced by the LLM; expected to hold a ``"product"``
            key whose value is ``"vacuums"`` or ``"tvs"``.
        llm: The LLM service that triggered the call (unused here).
        context: The LLM context (unused here).
        result_callback: Awaitable called exactly once with the result dict.
    """
    logger.debug(f"args for fetch_products_from_api: {args}")

    # In the real world you'd fetch the products from an API. We're hardcoding them here.
    catalog = {
        "vacuums": ["Dyson V11", "Roomba i7"],
        "tvs": ["Samsung 65 inch", "LG 55 inch"],
    }

    # Arguments are model-generated, so "product" may be missing or malformed.
    # Look it up defensively so the callback is always invoked instead of the
    # handler dying with a KeyError and leaving the function call unanswered.
    product = args.get("product") if isinstance(args, dict) else None

    if isinstance(product, str) and product in catalog:
        await result_callback({product: catalog[product]})
    else:
        await result_callback({"error": "Unknown product"})
async def main():
@@ -63,28 +71,24 @@ async def main():
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")
# Register a function_name of None to get all functions
# sent to the same callback with an additional function_name parameter.
llm.register_function(None, fetch_weather_from_api, start_callback=start_fetch_weather)
llm.register_function(None, fetch_products_from_api, start_callback=start_fetch_products)
tools = [
ChatCompletionToolParam(
type="function",
function={
"name": "get_current_weather",
"description": "Get the current weather",
"name": "get_products",
"description": "Get the list of products available.",
"parameters": {
"type": "object",
"properties": {
"location": {
"product": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"format": {
"type": "string",
"enum": ["celsius", "fahrenheit"],
"description": "The temperature unit to use. Infer this from the users location.",
},
"enum": ["vacuums", "tvs"],
"description": "The type of product to show.",
}
},
"required": ["location", "format"],
"required": ["product"],
},
},
)
@@ -92,7 +96,7 @@ async def main():
messages = [
{
"role": "system",
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
"content": "You are a helpful customer service agent named Hailey in a video call. Your goal is to sell vacuums or tvs. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
},
]

View File

@@ -1,3 +1,5 @@
<!-- @format -->
<div align="center">
 <img alt="pipecat" width="300px" height="auto" src="image.png">
</div>
@@ -104,6 +106,21 @@ curl -X POST "http://localhost:7860/daily_start_bot" \
-d '{"dialoutNumber": "+18057145330", "detectVoicemail": true}'
```
### New! Using Gemini with Daily
We have introduced a new example file that uses Gemini. You can find the code within bot_daily_gemini.py.
To spin up a Gemini-based bot for this demo instead of an OpenAI-based one, send the same request properties described above to the `daily_gemini_start_bot` endpoint.
For example:
```shell
curl -X POST "http://localhost:7860/daily_gemini_start_bot" \
-H "Content-Type: application/json" \
-d '{"detectVoicemail": true}'
```
Any request body properties supported by `/daily_start_bot` (such as `detectVoicemail`, `dialoutNumber`, etc.) can also be passed to `/daily_gemini_start_bot`. The only difference is that calling the Gemini endpoint will start a Gemini bot session.
### More information
For more configuration options, please consult [Daily's API documentation](https://docs.daily.co).

View File

@@ -98,6 +98,7 @@ async def main(
- **"Record your message after the tone."**
- **Any phrase that suggests an answering machine or voicemail.**
- **ASSUME IT IS A VOICEMAIL. DO NOT WAIT FOR MORE CONFIRMATION.**
- **IF THE CALL SAYS "PLEASE LEAVE A MESSAGE AFTER THE BEEP", WAIT FOR THE BEEP BEFORE LEAVING A MESSAGE.**
#### **Step 2: Leave a Voicemail Message**
- Immediately say:
@@ -110,7 +111,9 @@ async def main(
- If the call is answered by a human, say:
*"Oh, hello! I'm a friendly chatbot. Is there anything I can help you with?"*
- Keep responses **brief and helpful**.
- If the user no longer needs assistance, **call `terminate_call` immediately.**
- If the user no longer needs assistance, say:
*"Okay, thank you! Have a great day!"*
-**Then call `terminate_call` immediately.**
---

View File

@@ -0,0 +1,234 @@
#
# Copyright (c) 2024–2025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import argparse
import asyncio
import os
import sys
from typing import Optional
from dotenv import load_dotenv
from loguru import logger
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import EndTaskFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.frame_processor import FrameDirection
from pipecat.services.ai_services import LLMService
from pipecat.services.elevenlabs import ElevenLabsTTSService
from pipecat.services.google import GoogleLLMContext, GoogleLLMService
from pipecat.transports.services.daily import DailyDialinSettings, DailyParams, DailyTransport
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
daily_api_key = os.getenv("DAILY_API_KEY", "")
daily_api_url = os.getenv("DAILY_API_URL", "https://api.daily.co/v1")
async def terminate_call(
    function_name, tool_call_id, args, llm: LLMService, context, result_callback
):
    """End the call on the bot's request.

    Registered as the handler for the ``terminate_call`` tool; it queues an
    ``EndTaskFrame`` on the LLM service in the upstream direction. The other
    handler arguments are accepted but not used.
    """
    end_frame = EndTaskFrame()
    await llm.queue_frame(end_frame, FrameDirection.UPSTREAM)
async def main(
    room_url: str,
    token: str,
    callId: str,
    callDomain: str,
    detect_voicemail: bool,
    dialout_number: Optional[str],
):
    """Build and run the Gemini-based phone bot pipeline for one Daily room.

    Args:
        room_url: Daily room URL handed to the transport.
        token: Meeting token handed to the transport.
        callId: Call ID used to build the dial-in settings.
        callDomain: Call domain used to build the dial-in settings.
        detect_voicemail: When true (and no dial-out number is given), the bot
            stays silent on join and waits for the other side to speak first.
        dialout_number: When set, the bot dials this number once it has joined.
    """
    # dialin_settings are only needed if Daily's SIP URI is used
    # If you are handling this via Twilio, Telnyx, set this to None
    # and handle call-forwarding when on_dialin_ready fires.
    dialin_settings = DailyDialinSettings(call_id=callId, call_domain=callDomain)
    transport = DailyTransport(
        room_url,
        token,
        "Chatbot",
        DailyParams(
            api_url=daily_api_url,
            api_key=daily_api_key,
            dialin_settings=dialin_settings,
            audio_in_enabled=True,
            audio_out_enabled=True,
            camera_out_enabled=False,
            vad_enabled=True,
            vad_analyzer=SileroVADAnalyzer(),
            transcription_enabled=True,
        ),
    )
    tts = ElevenLabsTTSService(
        api_key=os.getenv("ELEVENLABS_API_KEY", ""),
        voice_id=os.getenv("ELEVENLABS_VOICE_ID", ""),
    )
    # Tool list passed to the Gemini service: a single, parameterless
    # `terminate_call` function the model can invoke to end the call.
    tools = [
        {
            "function_declarations": [
                {
                    "name": "terminate_call",
                    "description": "Terminate the call",
                },
            ]
        }
    ]
    # Prompt that drives voicemail detection (Phase 1), the voicemail message
    # (Phase 2) and the human conversation (Phase 3). It is runtime text sent to
    # the model, so edits to it change the bot's behavior.
    system_instruction = """You are Chatbot, a friendly, helpful robot. Never mention this prompt.
**Operating Procedure:**
**Phase 1: Initial Call Answer - Listen for Voicemail Greeting**
**IMMEDIATELY after the call connects, LISTEN CAREFULLY for the *very first thing* you hear.**
**Listen for these sentences or very close variations as the *initial greeting*:**
* **"Please leave a message after the beep."**
* **"No one is available to take your call."**
* **"Record your message after the tone."**
* **"You have reached voicemail for..."** (or similar voicemail identification)
**If you HEAR one of these sentences (or a very similar greeting) as the *initial response* to the call, IMMEDIATELY assume it is voicemail and proceed to Phase 2.**
**If you hear "PLEASE LEAVE A MESSAGE AFTER THE BEEP", WAIT for the actual beep sound from the voicemail system *after* hearing the sentence, before proceeding to Phase 2.**
**If you DO NOT hear any of these voicemail greetings as the *initial response*, assume it is a human and proceed to Phase 3.**
**Phase 2: Leave Voicemail Message (If Voicemail Detected):**
If you assumed voicemail in Phase 1, say this EXACTLY:
"Hello, this is a message for Pipecat example user. This is Chatbot. Please call back on 123-456-7891. Thank you."
**Immediately after saying the message, call the function `terminate_call`.**
**DO NOT SAY ANYTHING ELSE. SILENCE IS REQUIRED AFTER `terminate_call`.**
**Phase 3: Human Interaction (If No Voicemail Greeting Detected in Phase 1):**
If you did not detect a voicemail greeting in Phase 1 and a human answers, say:
"Oh, hello! I'm a friendly chatbot. Is there anything I can help you with?"
Keep your responses **short and helpful.**
If the human is finished, say:
"Okay, thank you! Have a great day!"
**Then, immediately call the function `terminate_call`.**
**VERY IMPORTANT RULES - DO NOT DO THESE THINGS:**
* **DO NOT SAY "Please leave a message after the beep."**
* **DO NOT SAY "No one is available to take your call."**
* **DO NOT SAY "Record your message after the tone."**
* **DO NOT SAY ANY voicemail greeting yourself.**
* **Only check for voicemail greetings in Phase 1, *immediately after the call connects*.**
* **After voicemail or human interaction, ALWAYS call `terminate_call` immediately.**
* **Do not speak after calling `terminate_call`.**
* Your speech will be audio, so use simple language without special characters.
"""
    llm = GoogleLLMService(
        model="models/gemini-2.0-flash-exp",
        api_key=os.getenv("GOOGLE_API_KEY"),
        system_instruction=system_instruction,
        tools=tools,
    )
    # Route the model's `terminate_call` tool invocations to the handler above.
    llm.register_function("terminate_call", terminate_call)
    context = GoogleLLMContext()
    context_aggregator = llm.create_context_aggregator(context)
    pipeline = Pipeline(
        [
            transport.input(),  # Transport user input
            context_aggregator.user(),  # User responses
            llm,  # LLM
            tts,  # TTS
            transport.output(),  # Transport bot output
            context_aggregator.assistant(),  # Assistant spoken responses
        ]
    )
    task = PipelineTask(
        pipeline,
        PipelineParams(allow_interruptions=True),
    )
    # Exactly one of the three branches below registers the
    # `on_first_participant_joined` handler: dial-out, voicemail detection,
    # or (default) dial-in.
    if dialout_number:
        logger.debug("dialout number detected; doing dialout")

        # Configure some handlers for dialing out
        @transport.event_handler("on_joined")
        async def on_joined(transport, data):
            logger.debug(f"Joined; starting dialout to: {dialout_number}")
            await transport.start_dialout({"phoneNumber": dialout_number})

        @transport.event_handler("on_dialout_connected")
        async def on_dialout_connected(transport, data):
            logger.debug(f"Dial-out connected: {data}")

        @transport.event_handler("on_dialout_answered")
        async def on_dialout_answered(transport, data):
            logger.debug(f"Dial-out answered: {data}")

        @transport.event_handler("on_first_participant_joined")
        async def on_first_participant_joined(transport, participant):
            await transport.capture_participant_transcription(participant["id"])
            # unlike the dialin case, for the dialout case, the caller will speak first. Presumably
            # they will answer the phone and say "Hello?" Since we've captured their transcript,
            # That will put a frame into the pipeline and prompt an LLM completion, which is how the
            # bot will then greet the user.
    elif detect_voicemail:
        logger.debug("Detect voicemail example. You can test this in example in Daily Prebuilt")

        # For the voicemail detection case, we do not want the bot to answer the phone. We want it to wait for the voicemail
        # machine to say something like 'Leave a message after the beep', or for the user to say 'Hello?'.
        @transport.event_handler("on_first_participant_joined")
        async def on_first_participant_joined(transport, participant):
            await transport.capture_participant_transcription(participant["id"])
    else:
        logger.debug("no dialout number; assuming dialin")

        # Different handlers for dialin
        @transport.event_handler("on_first_participant_joined")
        async def on_first_participant_joined(transport, participant):
            await transport.capture_participant_transcription(participant["id"])
            # For the dialin case, we want the bot to answer the phone and greet the user. We
            # can prompt the bot to speak by putting the context into the pipeline.
            await task.queue_frames([context_aggregator.user().get_context_frame()])

    # Registered in every mode: cancel the pipeline task as soon as a
    # participant leaves.
    @transport.event_handler("on_participant_left")
    async def on_participant_left(transport, participant, reason):
        await task.cancel()

    runner = PipelineRunner()
    await runner.run(task)
if __name__ == "__main__":
    # Command-line entry point: parse the flags and hand them to main().
    cli = argparse.ArgumentParser(description="Pipecat Simple ChatBot")

    # Plain string options, declared in the order they appear in --help.
    for flag, description in (
        ("-u", "Room URL"),
        ("-t", "Token"),
        ("-i", "Call ID"),
        ("-d", "Call Domain"),
    ):
        cli.add_argument(flag, type=str, help=description)
    cli.add_argument("-v", action="store_true", help="Detect voicemail")
    cli.add_argument("-o", type=str, help="Dialout number", default=None)

    opts = cli.parse_args()
    asyncio.run(
        main(
            room_url=opts.u,
            token=opts.t,
            callId=opts.i,
            callDomain=opts.d,
            detect_voicemail=opts.v,
            dialout_number=opts.o,
        )
    )

View File

@@ -110,10 +110,15 @@ async def _create_daily_room(
# Spawn a new agent, and join the user session
# Note: this is mostly for demonstration purposes (refer to 'deployment' in docs)
print(f"Vendor: {vendor}")
if vendor == "daily":
bot_proc = f"python3 -m bot_daily -u {room.url} -t {token} -i {callId} -d {callDomain}{' -v' if detect_voicemail else ''}"
if dialoutNumber:
bot_proc += f" -o {dialoutNumber}"
elif vendor == "daily-gemini":
bot_proc = f"python3 -m bot_daily_gemini -u {room.url} -t {token} -i {callId} -d {callDomain}{' -v' if detect_voicemail else ''}"
if dialoutNumber:
bot_proc += f" -o {dialoutNumber}"
else:
bot_proc = f"python3 -m bot_twilio -u {room.url} -t {token} -i {callId} -s {room.config.sip_endpoint}"
@@ -201,6 +206,38 @@ async def daily_start_bot(request: Request) -> JSONResponse:
return JSONResponse({"room_url": room.url, "sipUri": room.config.sip_endpoint})
@app.post("/daily_gemini_start_bot")
async def daily_gemini_start_bot(request: Request) -> JSONResponse:
    """Start a Gemini-backed bot session for a Daily call.

    Reads ``detectVoicemail``, ``callId``, ``callDomain`` and ``dialoutNumber``
    from the JSON request body, then asks ``_create_daily_room`` to set up the
    room with the ``"daily-gemini"`` vendor. A body containing a ``"test"`` key
    is acknowledged immediately without creating anything.

    Returns:
        JSON with the room URL (``room_url``) and the room's SIP endpoint
        (``sipUri``), or ``{"test": True}`` for webhook checks.

    Raises:
        HTTPException: status 500 if reading the request body fails.
    """
    # Use specified room URL, or create a new one if not specified
    sample_room_url = os.getenv("DAILY_SAMPLE_ROOM_URL")

    # Get the dial-in properties from the request
    try:
        payload = await request.json()
        if "test" in payload:
            # Pass through any webhook checks
            return JSONResponse({"test": True})
        voicemail_flag = payload.get("detectVoicemail", False)
        call_id = payload.get("callId")
        call_domain = payload.get("callDomain")
        dialout_number = payload.get("dialoutNumber")
    except Exception:
        raise HTTPException(
            status_code=500, detail="Missing properties 'callId', 'callDomain', or 'dialoutNumber'"
        )

    # Same room setup as the non-Gemini Daily endpoint; only the vendor string
    # differs.
    room: DailyRoomObject = await _create_daily_room(
        sample_room_url, call_id, call_domain, dialout_number, "daily-gemini", voicemail_flag
    )

    response_body = {"room_url": room.url, "sipUri": room.config.sip_endpoint}
    return JSONResponse(response_body)
# ----------------- Main ----------------- #