Compare commits

...

13 Commits

Author SHA1 Message Date
James Hush
d175e5e5fc Hidden assistant demo 2025-07-07 11:58:03 +08:00
Mark Backman
6eed6ff779 Merge pull request #2147 from pipecat-ai/mb/user-idle-long-function-call
UserIdleProcessor: Account for function calls in progress
2025-07-04 14:11:16 -07:00
Mark Backman
1375211610 UserIdleProcessor: Account for function calls in progress 2025-07-04 14:05:05 -07:00
Mark Backman
4e9369a702 Merge pull request #2149 from pipecat-ai/mb/twilio-hang-up-handling 2025-07-04 12:44:17 -07:00
Mark Backman
f9e8748a96 TwilioFrameSerializer: Handle user hanging up before the serializer 2025-07-04 09:42:16 -07:00
Filipi da Silva Fuchter
20d6bf267a Merge pull request #2146 from pipecat-ai/remove_gemini_duplicated_code
Removing duplicated code inside Gemini.
2025-07-04 11:59:10 -03:00
Filipi Fuchter
b573f9dab2 Removing duplicated code inside Gemini. 2025-07-04 10:57:53 -03:00
Mark Backman
dbc76389d8 Merge pull request #2140 from pipecat-ai/mb/fix-26-imports
Fix: missing import in 26f foundational example
2025-07-03 14:12:54 -07:00
Aleix Conchillo Flaqué
c27f838444 Merge pull request #2124 from pipecat-ai/aleix/frame-processor-no-push-queue
FrameProcessor: remove unnecessary push task
2025-07-03 14:03:05 -07:00
Aleix Conchillo Flaqué
ce84485e26 Merge pull request #2142 from pipecat-ai/aleix/publish-workflow-message
github: update publish message to make it clear
2025-07-03 14:02:51 -07:00
Mark Backman
6cf254e2f9 Fix: missing import in 26f foundational example, update twilio transport_params to FastAPIWebsocketParams 2025-07-03 13:58:18 -07:00
Aleix Conchillo Flaqué
02b63c28a5 FrameProcessor: remove unnecessary push task
When we call `FrameProcessor.push_frame()` we end up calling
`FrameProcessor.queue_frame()` on the next or previous processor which already
uses the input queue and guarantees frame ordering. So, there's no need to have
a two queues next to each other.
2025-07-03 13:57:32 -07:00
Aleix Conchillo Flaqué
57c6ce7ffa github: update publish message to make it clear 2025-07-03 13:55:02 -07:00
14 changed files with 89 additions and 176 deletions

View File

@@ -5,7 +5,7 @@ on:
inputs:
gitref:
type: string
description: "what git ref to build"
description: "what git tag to build (e.g. v0.0.74)"
required: true
jobs:

View File

@@ -5,6 +5,25 @@ All notable changes to **Pipecat** will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [Unreleased]
### Added
- Added call hang-up error handling in `TwilioFrameSerializer`, which handles
the case where the user has hung up before the `TwilioFrameSerializer` hangs
up the call.
### Changed
- The `UserIdleProcessor` now handles the scenario where function calls take
longer than the idle timeout duration. This allows you to use the
`UserIdleProcessor` in conjunction with function calls that take a while to
return a result.
### Performance
- Remove unncessary push task in each `FrameProcessor`.
## [0.0.74] - 2025-07-03
### Added

View File

@@ -35,7 +35,7 @@ transport_params = {
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
),
"twilio": lambda: TransportParams(
"twilio": lambda: FastAPIWebsocketParams(
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(),

View File

@@ -42,7 +42,7 @@ transport_params = {
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
),
"twilio": lambda: TransportParams(
"twilio": lambda: FastAPIWebsocketParams(
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(),

View File

@@ -33,7 +33,7 @@ transport_params = {
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
),
"twilio": lambda: TransportParams(
"twilio": lambda: FastAPIWebsocketParams(
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(),

View File

@@ -55,7 +55,7 @@ transport_params = {
# endpointing, for now.
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.5)),
),
"twilio": lambda: TransportParams(
"twilio": lambda: FastAPIWebsocketParams(
audio_in_enabled=True,
audio_out_enabled=True,
# set stop_secs to something roughly similar to the internal setting

View File

@@ -18,10 +18,10 @@ from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.services.gemini_multimodal_live.gemini import (
GeminiMultimodalLiveContext,
GeminiMultimodalLiveLLMService,
)
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams
from pipecat.transports.services.daily import DailyParams
load_dotenv(override=True)

View File

@@ -24,6 +24,7 @@ from pipecat.services.deepgram.stt import DeepgramSTTService
from pipecat.services.llm_service import FunctionCallParams
from pipecat.services.openai.llm import OpenAILLMService
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams
from pipecat.transports.services.daily import DailyParams
from pipecat.utils.tracing.setup import setup_tracing
@@ -61,7 +62,7 @@ transport_params = {
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
),
"twilio": lambda: TransportParams(
"twilio": lambda: FastAPIWebsocketParams(
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(),

View File

@@ -24,6 +24,7 @@ from pipecat.services.deepgram.stt import DeepgramSTTService
from pipecat.services.llm_service import FunctionCallParams
from pipecat.services.openai.llm import OpenAILLMService
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.network.fastapi_websocket import FastAPIWebsocketParams
from pipecat.transports.services.daily import DailyParams
from pipecat.utils.tracing.setup import setup_tracing
@@ -58,7 +59,7 @@ transport_params = {
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
),
"twilio": lambda: TransportParams(
"twilio": lambda: FastAPIWebsocketParams(
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(),

View File

@@ -4,18 +4,6 @@
# SPDX-License-Identifier: BSD 2-Clause License
#
"""OpenAI Bot Implementation.
This module implements a chatbot using OpenAI's GPT-4 model for natural language
processing. It includes:
- Real-time audio/video interaction through Daily
- Animated robot avatar
- Text-to-speech using ElevenLabs
- Support for both English and Spanish
The bot runs as part of a pipeline that processes audio/video frames and manages
the conversation flow.
"""
import asyncio
import os
@@ -24,150 +12,72 @@ import sys
import aiohttp
from dotenv import load_dotenv
from loguru import logger
from PIL import Image
from runner import configure
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import (
BotStartedSpeakingFrame,
BotStoppedSpeakingFrame,
Frame,
OutputImageRawFrame,
SpriteFrame,
)
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.processors.frameworks.rtvi import RTVIConfig, RTVIObserver, RTVIProcessor
from pipecat.services.elevenlabs.tts import ElevenLabsTTSService
from pipecat.services.openai.llm import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from pipecat.transports.services.helpers.daily_rest import (
DailyMeetingTokenParams,
DailyMeetingTokenProperties,
DailyRESTHelper,
DailyRoomParams,
)
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
sprites = []
script_dir = os.path.dirname(__file__)
# Load sequential animation frames
for i in range(1, 26):
# Build the full path to the image file
full_path = os.path.join(script_dir, f"assets/robot0{i}.png")
# Get the filename without the extension to use as the dictionary key
# Open the image and convert it to bytes
with Image.open(full_path) as img:
sprites.append(OutputImageRawFrame(image=img.tobytes(), size=img.size, format=img.format))
# Create a smooth animation by adding reversed frames
flipped = sprites[::-1]
sprites.extend(flipped)
# Define static and animated states
quiet_frame = sprites[0] # Static frame for when bot is listening
talking_frame = SpriteFrame(images=sprites) # Animation sequence for when bot is talking
class TalkingAnimation(FrameProcessor):
"""Manages the bot's visual animation states.
Switches between static (listening) and animated (talking) states based on
the bot's current speaking status.
"""
def __init__(self):
super().__init__()
self._is_talking = False
async def process_frame(self, frame: Frame, direction: FrameDirection):
"""Process incoming frames and update animation state.
Args:
frame: The incoming frame to process
direction: The direction of frame flow in the pipeline
"""
await super().process_frame(frame, direction)
# Switch to talking animation when bot starts speaking
if isinstance(frame, BotStartedSpeakingFrame):
if not self._is_talking:
await self.push_frame(talking_frame)
self._is_talking = True
# Return to static frame when bot stops speaking
elif isinstance(frame, BotStoppedSpeakingFrame):
await self.push_frame(quiet_frame)
self._is_talking = False
await self.push_frame(frame, direction)
async def main():
"""Main bot execution function.
Sets up and runs the bot pipeline including:
- Daily video transport
- Speech-to-text and text-to-speech services
- Language model integration
- Animation processing
- RTVI event handling
"""
"""Main bot execution function."""
async with aiohttp.ClientSession() as session:
(room_url, token) = await configure(session)
daily_rest_helper = DailyRESTHelper(
daily_api_key=os.getenv("DAILY_API_KEY"),
daily_api_url=os.getenv("DAILY_API_URL", "https://api.daily.co/v1"),
aiohttp_session=session,
)
room = await daily_rest_helper.create_room(
DailyRoomParams(properties={"enable_prejoin_ui": False})
)
token_params = DailyMeetingTokenParams(
properties=DailyMeetingTokenProperties(
is_owner=True,
permissions={
"hasPresence": False, # Example: join as a hidden participant
},
start_video_off=True,
start_audio_off=True,
)
)
token = await daily_rest_helper.get_token(room_url=room.url, params=token_params)
# Set up Daily transport with video/audio parameters
transport = DailyTransport(
room_url,
room.url,
token,
"Chatbot",
DailyParams(
audio_in_enabled=True,
audio_out_enabled=True,
video_out_enabled=True,
video_out_width=1024,
video_out_height=576,
vad_analyzer=SileroVADAnalyzer(),
transcription_enabled=True,
#
# Spanish
#
# transcription_settings=DailyTranscriptionSettings(
# language="es",
# tier="nova",
# model="2-general"
# )
),
)
# Initialize text-to-speech service
tts = ElevenLabsTTSService(
api_key=os.getenv("ELEVENLABS_API_KEY"),
#
# English
#
voice_id="pNInz6obpgDQGcFmaJgB",
#
# Spanish
#
# model="eleven_multilingual_v2",
# voice_id="gD1IexrzCvsXPHUuT0s3",
)
# Initialize LLM service
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
messages = [
{
"role": "system",
#
# English
#
"content": "You are Chatbot, a friendly, helpful robot. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way, but keep your responses brief. Start by introducing yourself.",
#
# Spanish
#
# "content": "Eres Chatbot, un amigable y útil robot. Tu objetivo es demostrar tus capacidades de una manera breve. Tus respuestas se convertiran a audio así que nunca no debes incluir caracteres especiales. Contesta a lo que el usuario pregunte de una manera creativa, útil y breve. Empieza por presentarte a ti mismo.",
"content": "Summerize the conversation so far in a single sentence.",
},
]
@@ -176,8 +86,6 @@ async def main():
context = OpenAILLMContext(messages)
context_aggregator = llm.create_context_aggregator(context)
ta = TalkingAnimation()
#
# RTVI events for Pipecat client UI
#
@@ -189,8 +97,6 @@ async def main():
rtvi,
context_aggregator.user(),
llm,
tts,
ta,
transport.output(),
context_aggregator.assistant(),
]
@@ -204,7 +110,6 @@ async def main():
),
observers=[RTVIObserver(rtvi)],
)
await task.queue_frame(quiet_frame)
@rtvi.event_handler("on_client_ready")
async def on_client_ready(rtvi):

View File

@@ -152,11 +152,6 @@ class FrameProcessor(BaseObject):
self.__input_event = None
self.__input_frame_task: Optional[asyncio.Task] = None
# Every processor in Pipecat should only output frames from a single
# task. This avoid problems like audio overlapping. System frames are the
# exception to this rule. This create this task.
self.__push_frame_task: Optional[asyncio.Task] = None
@property
def id(self) -> int:
"""Get the unique identifier for this processor.
@@ -385,7 +380,6 @@ class FrameProcessor(BaseObject):
"""Clean up processor resources."""
await super().cleanup()
await self.__cancel_input_task()
await self.__cancel_push_task()
if self._metrics is not None:
await self._metrics.cleanup()
@@ -512,10 +506,7 @@ class FrameProcessor(BaseObject):
if not self._check_started(frame):
return
if isinstance(frame, SystemFrame):
await self.__internal_push_frame(frame, direction)
else:
await self.__push_queue.put((frame, direction))
await self.__internal_push_frame(frame, direction)
async def __start(self, frame: StartFrame):
"""Handle the start frame to initialize processor state.
@@ -530,7 +521,6 @@ class FrameProcessor(BaseObject):
self._interruption_strategies = frame.interruption_strategies
self._report_only_initial_ttfb = frame.report_only_initial_ttfb
self.__create_input_task()
self.__create_push_task()
async def __cancel(self, frame: CancelFrame):
"""Handle the cancel frame to stop processor operation.
@@ -540,7 +530,6 @@ class FrameProcessor(BaseObject):
"""
self._cancelling = True
await self.__cancel_input_task()
await self.__cancel_push_task()
async def __pause(self, frame: FrameProcessorPauseFrame | FrameProcessorPauseUrgentFrame):
"""Handle pause frame to pause processor operation.
@@ -567,9 +556,6 @@ class FrameProcessor(BaseObject):
async def _start_interruption(self):
"""Start handling an interruption by canceling current tasks."""
try:
# Cancel the push frame task. This will stop pushing frames downstream.
await self.__cancel_push_task()
# Cancel the input task. This will stop processing queued frames.
await self.__cancel_input_task()
except Exception as e:
@@ -579,9 +565,6 @@ class FrameProcessor(BaseObject):
# Create a new input queue and task.
self.__create_input_task()
# Create a new output queue and task.
self.__create_push_task()
async def _stop_interruption(self):
"""Stop handling an interruption."""
# Nothing to do right now.
@@ -677,23 +660,3 @@ class FrameProcessor(BaseObject):
await self.push_error(ErrorFrame(str(e)))
finally:
self.__input_queue.task_done()
def __create_push_task(self):
"""Create the frame pushing task."""
if not self.__push_frame_task:
self.__push_queue = WatchdogQueue(self.task_manager)
self.__push_frame_task = self.create_task(self.__push_frame_task_handler())
async def __cancel_push_task(self):
"""Cancel the frame pushing task."""
if self.__push_frame_task:
self.__push_queue.cancel()
await self.cancel_task(self.__push_frame_task)
self.__push_frame_task = None
async def __push_frame_task_handler(self):
"""Handle frames from the push queue."""
while True:
(frame, direction) = await self.__push_queue.get()
await self.__internal_push_frame(frame, direction)
self.__push_queue.task_done()

View File

@@ -15,6 +15,8 @@ from pipecat.frames.frames import (
CancelFrame,
EndFrame,
Frame,
FunctionCallInProgressFrame,
FunctionCallResultFrame,
StartFrame,
UserStartedSpeakingFrame,
UserStoppedSpeakingFrame,
@@ -168,6 +170,13 @@ class UserIdleProcessor(FrameProcessor):
self._idle_event.set()
elif isinstance(frame, BotSpeakingFrame):
self._idle_event.set()
elif isinstance(frame, FunctionCallInProgressFrame):
# Function calls can take longer than the timeout, so we want to prevent idle callbacks
self._interrupted = True
self._idle_event.set()
elif isinstance(frame, FunctionCallResultFrame):
self._interrupted = False
self._idle_event.set()
async def cleanup(self) -> None:
"""Cleans up resources when processor is shutting down."""

View File

@@ -185,8 +185,26 @@ class TwilioFrameSerializer(FrameSerializer):
async with session.post(endpoint, auth=auth, data=params) as response:
if response.status == 200:
logger.info(f"Successfully terminated Twilio call {call_sid}")
elif response.status == 404:
# Handle the case where the call has already ended
# Error code 20404: "The requested resource was not found"
# Source: https://www.twilio.com/docs/errors/20404
try:
error_data = await response.json()
if error_data.get("code") == 20404:
logger.debug(f"Twilio call {call_sid} was already terminated")
return
except:
pass # Fall through to log the raw error
# Log other 404 errors
error_text = await response.text()
logger.error(
f"Failed to terminate Twilio call {call_sid}: "
f"Status {response.status}, Response: {error_text}"
)
else:
# Get the error details for better debugging
# Log other errors
error_text = await response.text()
logger.error(
f"Failed to terminate Twilio call {call_sid}: "

View File

@@ -572,9 +572,6 @@ class GeminiMultimodalLiveLLMService(LLMService):
# Initialize the File API client
self.file_api = GeminiFileAPI(api_key=api_key, base_url=file_api_base_url)
# Initialize the File API client
self.file_api = GeminiFileAPI(api_key=api_key, base_url=file_api_base_url)
def can_generate_metrics(self) -> bool:
"""Check if the service can generate usage metrics.