# Files
# pipecat/examples/foundational/46-video-processing.py
# 2025-12-30 14:33:58 -08:00
#
# 190 lines
# 6.4 KiB
# Python

#
# Copyright (c) 2025, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import os
import cv2
import numpy as np
from dotenv import load_dotenv
from loguru import logger
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import Frame, InputImageRawFrame, LLMRunFrame, OutputImageRawFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.llm_context import LLMContext
from pipecat.processors.aggregators.llm_response_universal import (
LLMContextAggregatorPair,
LLMUserAggregatorParams,
)
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.processors.frameworks.rtvi import RTVIObserver, RTVIProcessor
from pipecat.runner.types import RunnerArguments
from pipecat.runner.utils import create_transport
from pipecat.services.google.gemini_live.llm import GeminiLiveLLMService
from pipecat.transports.base_transport import TransportParams
from pipecat.transports.daily.transport import DailyParams, DailyTransport
from pipecat.turns.user_stop import TurnAnalyzerUserTurnStopStrategy
from pipecat.turns.user_turn_strategies import UserTurnStrategies
load_dotenv(override=True)
def _shared_media_settings():
    """Return the keyword arguments common to every transport in this example.

    A new dict (and a new ``SileroVADAnalyzer``) is built on each call, so no
    analyzer instance is shared between transports.
    """
    return dict(
        audio_in_enabled=True,
        audio_out_enabled=True,
        audio_out_10ms_chunks=2,
        video_in_enabled=True,
        video_out_enabled=True,
        video_out_is_live=True,
        vad_analyzer=SileroVADAnalyzer(),
    )


# Maps a transport name to a zero-argument factory for its parameters.
# Factories are stored (rather than ready-made objects) so nothing is
# instantiated until the factory for the chosen transport is actually called.
transport_params = {
    "daily": lambda: DailyParams(**_shared_media_settings()),
    "webrtc": lambda: TransportParams(**_shared_media_settings()),
}
class EdgeDetectionProcessor(FrameProcessor):
    """Replace incoming camera frames with a Canny edge-detected image.

    Every ``InputImageRawFrame`` whose ``transport_source`` is ``"camera"`` is
    run through ``cv2.Canny``, resized to the configured output size when it
    differs, and pushed as an ``OutputImageRawFrame``. The original camera
    frame is not forwarded. All other frames pass through untouched.

    Args:
        video_out_width: Width of the frames this processor emits.
        video_out_height: Height of the frames this processor emits.
        canny_low_threshold: First threshold handed to ``cv2.Canny``.
        canny_high_threshold: Second threshold handed to ``cv2.Canny``.
    """

    def __init__(
        self,
        video_out_width: int,
        video_out_height: int,
        *,
        canny_low_threshold: float = 100,
        canny_high_threshold: float = 200,
    ):
        super().__init__()
        self._video_out_width = video_out_width
        self._video_out_height = video_out_height
        self._canny_low_threshold = canny_low_threshold
        self._canny_high_threshold = canny_high_threshold

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Edge-detect camera frames; forward everything else unchanged.

        Args:
            frame: The frame to handle.
            direction: Direction the frame is travelling in the pipeline.
        """
        await super().process_frame(frame, direction)

        is_camera_image = (
            isinstance(frame, InputImageRawFrame) and frame.transport_source == "camera"
        )
        if not is_camera_image:
            # Not ours to touch: keep it moving in the direction it came from.
            await self.push_frame(frame, direction)
            return

        # Interpret the raw bytes as a (height, width, 3) array of 8-bit pixels.
        # frame.size is indexed as (width, height).
        img = np.frombuffer(frame.image, dtype=np.uint8).reshape(
            (frame.size[1], frame.size[0], 3)
        )

        # Canny's result is converted from single-channel gray back to three
        # channels so the output has the same layout as the input.
        edges = cv2.Canny(img, self._canny_low_threshold, self._canny_high_threshold)
        img = cv2.cvtColor(edges, cv2.COLOR_GRAY2BGR)

        # Resize only when the camera resolution differs from the output size.
        desired_size = (self._video_out_width, self._video_out_height)
        out_size = frame.size
        if frame.size != desired_size:
            img = cv2.resize(img, desired_size)
            out_size = desired_size

        out_frame = OutputImageRawFrame(image=img.tobytes(), size=out_size, format=frame.format)
        # Pushed with push_frame's default direction, as the processed image is
        # a new output frame rather than the forwarded input.
        await self.push_frame(out_frame)
# System prompt for the Gemini chatbot persona. This is a plain string (no
# placeholders, so no f-prefix), and the prompt text starts directly with the
# sentence: the stray, unbalanced double quote that used to precede it is gone.
SYSTEM_INSTRUCTION = """
You are Gemini Chatbot, a friendly, helpful robot.
Your goal is to demonstrate your capabilities in a succinct way.
Your output will be spoken aloud, so avoid special characters that can't easily be spoken, such as emojis or bullet points.
Respond to what the user said in a creative and helpful way. Keep your responses brief. One or two sentences at most.
"""
async def run_bot(pipecat_transport):
    """Build the Gemini Live + edge-detection pipeline on a transport and run it.

    The pipeline reads from ``pipecat_transport``, feeds the Gemini Live
    service, sends the user's camera video back through
    ``EdgeDetectionProcessor`` and writes to the transport output. The call
    returns when the pipeline runner finishes.

    Args:
        pipecat_transport: The transport to read from and write to.
    """
    gemini = GeminiLiveLLMService(
        api_key=os.getenv("GOOGLE_API_KEY"),
        voice_id="Puck",  # Aoede, Charon, Fenrir, Kore, Puck
        transcribe_user_audio=True,
        system_instruction=SYSTEM_INSTRUCTION,
    )

    # The context starts with a single user message asking for a greeting.
    opening_request = {
        "role": "user",
        "content": "Start by greeting the user warmly and introducing yourself.",
    }
    context = LLMContext([opening_request])

    # The user's turn is ended by the local smart-turn analyzer.
    smart_turn_stop = TurnAnalyzerUserTurnStopStrategy(turn_analyzer=LocalSmartTurnAnalyzerV3())
    user_aggregator_params = LLMUserAggregatorParams(
        user_turn_strategies=UserTurnStrategies(stop=[smart_turn_stop]),
    )
    context_aggregator = LLMContextAggregatorPair(context, user_params=user_aggregator_params)

    # RTVI events for Pipecat client UI
    rtvi = RTVIProcessor()

    # Stages are created in the same order in which they appear in the pipeline.
    transport_in = pipecat_transport.input()
    user_aggregator = context_aggregator.user()
    # Sends the (edge-detected) video back to the user.
    edge_detector = EdgeDetectionProcessor(
        pipecat_transport._params.video_out_width,
        pipecat_transport._params.video_out_height,
    )
    transport_out = pipecat_transport.output()
    assistant_aggregator = context_aggregator.assistant()

    pipeline = Pipeline(
        [
            transport_in,
            user_aggregator,
            rtvi,
            gemini,
            edge_detector,
            transport_out,
            assistant_aggregator,
        ]
    )

    task_params = PipelineParams(enable_metrics=True, enable_usage_metrics=True)
    task = PipelineTask(pipeline, params=task_params, observers=[RTVIObserver(rtvi)])

    @rtvi.event_handler("on_client_ready")
    async def on_client_ready(rtvi):
        logger.info("Pipecat client ready.")
        await rtvi.set_bot_ready()
        # Kick off the conversation.
        await task.queue_frames([LLMRunFrame()])

    @pipecat_transport.event_handler("on_client_connected")
    async def on_client_connected(transport, participant):
        logger.info("Pipecat Client connected")
        # Daily captures video per participant id; any other transport is
        # asked for its "camera" source instead.
        if isinstance(transport, DailyTransport):
            await pipecat_transport.capture_participant_video(participant["id"], framerate=30)
            return
        await pipecat_transport.capture_participant_video("camera")

    @pipecat_transport.event_handler("on_client_disconnected")
    async def on_client_disconnected(transport, client):
        logger.info("Pipecat Client disconnected")
        # Once the client is gone the pipeline task is cancelled.
        await task.cancel()

    await PipelineRunner(handle_sigint=False, force_gc=True).run(task)
async def bot(runner_args: RunnerArguments):
    """Bot entry point compatible with Pipecat Cloud.

    Creates the transport from ``runner_args`` and the module-level
    ``transport_params`` table, then hands it to ``run_bot``.
    """
    await run_bot(await create_transport(runner_args, transport_params))
# Script entry point: only runs when this file is executed directly.
if __name__ == "__main__":
    # Imported inside the guard, so merely importing this module does not
    # import the runner's command-line entry point.
    from pipecat.runner.run import main

    main()