We now distinguish between input and output audio and image frames. We introduce `InputAudioRawFrame`, `OutputAudioRawFrame`, `InputImageRawFrame` and `OutputImageRawFrame` (and other subclasses of those). The input frames usually come from an input transport and are meant to be processed inside the pipeline to generate new frames. However, the input frames will not be sent through an output transport. The output frames can also be processed by any frame processor in the pipeline and they are allowed to be sent by the output transport.
91 lines
2.8 KiB
Python
91 lines
2.8 KiB
Python
#
|
|
# Copyright (c) 2024, Daily
|
|
#
|
|
# SPDX-License-Identifier: BSD 2-Clause License
|
|
#
|
|
|
|
import asyncio
|
|
import os
|
|
import sys
|
|
|
|
from pipecat.frames.frames import LLMMessagesFrame
|
|
from pipecat.pipeline.pipeline import Pipeline
|
|
from pipecat.pipeline.runner import PipelineRunner
|
|
from pipecat.pipeline.task import PipelineTask
|
|
from pipecat.processors.aggregators.llm_response import (
|
|
LLMAssistantResponseAggregator,
|
|
LLMUserResponseAggregator
|
|
)
|
|
from pipecat.services.cartesia import CartesiaTTSService
|
|
from pipecat.services.deepgram import DeepgramSTTService
|
|
from pipecat.services.openai import OpenAILLMService
|
|
from pipecat.transports.network.websocket_server import WebsocketServerParams, WebsocketServerTransport
|
|
from pipecat.vad.silero import SileroVADAnalyzer
|
|
|
|
from loguru import logger
|
|
|
|
from dotenv import load_dotenv
|
|
# Read settings from a .env file; override=True asks python-dotenv to let
# values from the file replace variables already set in the environment.
load_dotenv(override=True)

# Swap the logging sink: drop handler id 0 (presumably loguru's pre-installed
# default sink) and log to stderr at DEBUG level instead.
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
|
|
|
|
|
|
async def main():
    """Assemble and run the websocket voice-bot pipeline.

    Frames flow through the pipeline in this order: websocket input,
    speech-to-text, user response aggregation, LLM, text-to-speech,
    websocket output, assistant response aggregation. When a client
    connects, an extra system message asking the bot to introduce itself is
    appended to the shared message list and queued on the task. The
    coroutine finishes when the runner's ``run`` call returns.
    """
    # Websocket transport with audio output, WAV headers and Silero VAD.
    server_params = WebsocketServerParams(
        audio_out_enabled=True,
        add_wav_header=True,
        vad_enabled=True,
        vad_analyzer=SileroVADAnalyzer(),
        vad_audio_passthrough=True,
    )
    transport = WebsocketServerTransport(params=server_params)

    # Services; each API key is read from the environment (None if unset).
    openai_key = os.getenv("OPENAI_API_KEY")
    llm = OpenAILLMService(api_key=openai_key, model="gpt-4o")

    deepgram_key = os.getenv("DEEPGRAM_API_KEY")
    stt = DeepgramSTTService(api_key=deepgram_key)

    cartesia_key = os.getenv("CARTESIA_API_KEY")
    tts = CartesiaTTSService(
        api_key=cartesia_key,
        voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22",  # British Lady
    )

    # This single list object is handed to both aggregators and is also
    # mutated by the connection handler below, so it must stay shared.
    system_prompt = {
        "role": "system",
        "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
    }
    messages = [system_prompt]

    user_aggregator = LLMUserResponseAggregator(messages)
    assistant_aggregator = LLMAssistantResponseAggregator(messages)

    stages = [
        transport.input(),     # Websocket input from client
        stt,                   # Speech-To-Text
        user_aggregator,       # User responses
        llm,                   # LLM
        tts,                   # Text-To-Speech
        transport.output(),    # Websocket output to client
        assistant_aggregator,  # LLM responses
    ]
    pipeline = Pipeline(stages)

    task = PipelineTask(pipeline)

    @transport.event_handler("on_client_connected")
    async def on_client_connected(transport, client):
        # Kick off the conversation.
        intro_request = {
            "role": "system",
            "content": "Please introduce yourself to the user.",
        }
        messages.append(intro_request)
        await task.queue_frames([LLMMessagesFrame(messages)])

    runner = PipelineRunner()
    await runner.run(task)
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run the main() coroutine to completion.
    asyncio.run(main())
|