Compare commits
12 Commits
vp-moq-vib
...
aleix/back
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
dd89ac1bb8 | ||
|
|
41b76f2944 | ||
|
|
cca6f1fe45 | ||
|
|
f96cf2292b | ||
|
|
21df297e91 | ||
|
|
03726a4470 | ||
|
|
184f2cdb55 | ||
|
|
f6b2c3800f | ||
|
|
9c2083254b | ||
|
|
47f42ac0cb | ||
|
|
98525a5f27 | ||
|
|
cff62650ee |
@@ -9,6 +9,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
|
||||
### Added
|
||||
|
||||
- Added `BotBackgroundSound` processor. This processors allows you to add
|
||||
background sound to the bots output. The background sound will always be
|
||||
playing even if the bot is not talking. The volume of the background sound and
|
||||
the sample rate can be configure. You can load any file format supported by
|
||||
the `soundfile` library.
|
||||
(see https://github.com/bastibe/python-soundfile)
|
||||
|
||||
- Added `GatedOpenAILLMContextAggregator`. This aggregator keeps the last
|
||||
received OpenAI LLM context frame and it doesn't let it through until the
|
||||
notifier is notified.
|
||||
@@ -57,6 +64,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
|
||||
### Other
|
||||
|
||||
- Add `23-bot-background-sound.py` foundational example.
|
||||
|
||||
- Added a new foundational example 22-natural-conversation.py. This examples
|
||||
shows how to achieve a more natural conversation detecting when the user ends
|
||||
statement.
|
||||
|
||||
@@ -31,11 +31,11 @@ logger.add(sys.stderr, level="DEBUG")
|
||||
|
||||
async def main():
|
||||
async with aiohttp.ClientSession() as session:
|
||||
(room_url, token) = await configure(session)
|
||||
(room_url, _) = await configure(session)
|
||||
|
||||
transport = DailyTransport(
|
||||
room_url,
|
||||
token,
|
||||
None,
|
||||
"Respond bot",
|
||||
DailyParams(
|
||||
audio_out_enabled=True,
|
||||
|
||||
@@ -32,11 +32,11 @@ logger.add(sys.stderr, level="DEBUG")
|
||||
|
||||
async def main():
|
||||
async with aiohttp.ClientSession() as session:
|
||||
(room_url, token) = await configure(session)
|
||||
(room_url, _) = await configure(session)
|
||||
|
||||
transport = DailyTransport(
|
||||
room_url,
|
||||
token,
|
||||
None,
|
||||
"Respond bot",
|
||||
DailyParams(
|
||||
audio_out_enabled=True,
|
||||
|
||||
@@ -32,11 +32,11 @@ logger.add(sys.stderr, level="DEBUG")
|
||||
|
||||
async def main():
|
||||
async with aiohttp.ClientSession() as session:
|
||||
(room_url, token) = await configure(session)
|
||||
(room_url, _) = await configure(session)
|
||||
|
||||
transport = DailyTransport(
|
||||
room_url,
|
||||
token,
|
||||
None,
|
||||
"Respond bot",
|
||||
DailyParams(
|
||||
audio_out_enabled=True,
|
||||
|
||||
111
examples/foundational/23-bot-background-sound.py
Normal file
111
examples/foundational/23-bot-background-sound.py
Normal file
@@ -0,0 +1,111 @@
|
||||
#
|
||||
# Copyright (c) 2024, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import os
|
||||
import sys
|
||||
|
||||
from pipecat.audio.vad.silero import SileroVADAnalyzer
|
||||
from pipecat.frames.frames import LLMMessagesFrame
|
||||
from pipecat.pipeline.pipeline import Pipeline
|
||||
from pipecat.pipeline.runner import PipelineRunner
|
||||
from pipecat.pipeline.task import PipelineParams, PipelineTask
|
||||
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
|
||||
from pipecat.processors.audio.bot_background_sound import BotBackgroundSound
|
||||
from pipecat.services.cartesia import CartesiaTTSService
|
||||
from pipecat.services.openai import OpenAILLMService
|
||||
from pipecat.transports.services.daily import DailyParams, DailyTransport
|
||||
|
||||
from runner import configure_with_args
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv(override=True)
|
||||
|
||||
logger.remove(0)
|
||||
logger.add(sys.stderr, level="DEBUG")
|
||||
|
||||
|
||||
async def main():
|
||||
async with aiohttp.ClientSession() as session:
|
||||
parser = argparse.ArgumentParser(description="Bot Background Sound")
|
||||
parser.add_argument("-i", "--input", type=str, required=True, help="Input audio file")
|
||||
|
||||
(room_url, token, args) = await configure_with_args(session, parser)
|
||||
|
||||
transport = DailyTransport(
|
||||
room_url,
|
||||
token,
|
||||
"Respond bot",
|
||||
DailyParams(
|
||||
audio_out_enabled=True,
|
||||
transcription_enabled=True,
|
||||
vad_enabled=True,
|
||||
vad_analyzer=SileroVADAnalyzer(),
|
||||
),
|
||||
)
|
||||
|
||||
tts = CartesiaTTSService(
|
||||
api_key=os.getenv("CARTESIA_API_KEY"),
|
||||
voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady
|
||||
)
|
||||
|
||||
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
|
||||
},
|
||||
]
|
||||
|
||||
context = OpenAILLMContext(messages)
|
||||
context_aggregator = llm.create_context_aggregator(context)
|
||||
|
||||
background_sound = BotBackgroundSound(
|
||||
sound_files={"office": args.input}, default_sound="office"
|
||||
)
|
||||
|
||||
pipeline = Pipeline(
|
||||
[
|
||||
transport.input(), # Transport user input
|
||||
context_aggregator.user(), # User responses
|
||||
llm, # LLM
|
||||
tts, # TTS
|
||||
background_sound, # Bot background sound
|
||||
transport.output(), # Transport bot output
|
||||
context_aggregator.assistant(), # Assistant spoken responses
|
||||
]
|
||||
)
|
||||
|
||||
task = PipelineTask(
|
||||
pipeline,
|
||||
PipelineParams(
|
||||
allow_interruptions=True,
|
||||
enable_metrics=True,
|
||||
enable_usage_metrics=True,
|
||||
report_only_initial_ttfb=True,
|
||||
),
|
||||
)
|
||||
|
||||
@transport.event_handler("on_first_participant_joined")
|
||||
async def on_first_participant_joined(transport, participant):
|
||||
await transport.capture_participant_transcription(participant["id"])
|
||||
# Kick off the conversation.
|
||||
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
|
||||
await task.queue_frames([LLMMessagesFrame(messages)])
|
||||
|
||||
runner = PipelineRunner()
|
||||
|
||||
await runner.run(task)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
@@ -60,6 +60,7 @@ openai = [ "openai~=1.50.2", "websockets~=13.1", "python-deepcompare~=1.0.1" ]
|
||||
openpipe = [ "openpipe~=4.24.0" ]
|
||||
playht = [ "pyht~=0.1.4", "websockets~=13.1" ]
|
||||
silero = [ "onnxruntime~=1.19.2" ]
|
||||
soundfile = [ "soundfile~=0.12.1" ]
|
||||
together = [ "openai~=1.50.2" ]
|
||||
websocket = [ "websockets~=13.1", "fastapi~=0.115.0" ]
|
||||
whisper = [ "faster-whisper~=1.0.3" ]
|
||||
|
||||
195
src/pipecat/processors/audio/bot_background_sound.py
Normal file
195
src/pipecat/processors/audio/bot_background_sound.py
Normal file
@@ -0,0 +1,195 @@
|
||||
#
|
||||
# Copyright (c) 2024, Daily
|
||||
#
|
||||
# SPDX-License-Identifier: BSD 2-Clause License
|
||||
#
|
||||
|
||||
import asyncio
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, Mapping, Optional
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pipecat.audio.utils import resample_audio
|
||||
from pipecat.processors.frame_processor import FrameProcessor, FrameDirection
|
||||
from pipecat.frames.frames import (
|
||||
CancelFrame,
|
||||
ControlFrame,
|
||||
ErrorFrame,
|
||||
OutputAudioRawFrame,
|
||||
Frame,
|
||||
EndFrame,
|
||||
StartFrame,
|
||||
TTSAudioRawFrame,
|
||||
TTSStartedFrame,
|
||||
TTSStoppedFrame,
|
||||
)
|
||||
|
||||
from loguru import logger
|
||||
|
||||
try:
|
||||
import soundfile as sf
|
||||
except ModuleNotFoundError as e:
|
||||
logger.error(f"Exception: {e}")
|
||||
logger.error(
|
||||
"In order to use background sound, you need to `pip install pipecat-ai[soundfile]`."
|
||||
)
|
||||
raise Exception(f"Missing module: {e}")
|
||||
|
||||
|
||||
@dataclass
|
||||
class ChangeBotBackgroundSoundFrame(ControlFrame):
|
||||
sound_name: str
|
||||
volume: Optional[float] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class UpdateBotBackgroundSoundVolumeFrame(ControlFrame):
|
||||
volume: float
|
||||
|
||||
|
||||
@dataclass
|
||||
class PlayBotBackgroundSoundFrame(ControlFrame):
|
||||
pass
|
||||
|
||||
|
||||
@dataclass
|
||||
class PauseBotBackgroundSoundFrame(ControlFrame):
|
||||
pass
|
||||
|
||||
|
||||
class BotBackgroundSound(FrameProcessor):
|
||||
def __init__(
|
||||
self,
|
||||
sound_files: Mapping[str, str],
|
||||
default_sound: str,
|
||||
volume: float = 0.4,
|
||||
sample_rate: int = 24000,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
self._sound_files = sound_files
|
||||
self._volume = volume
|
||||
self._sample_rate = sample_rate
|
||||
|
||||
self._sound_pos = 0
|
||||
self._sounds: Dict[str, Any] = {}
|
||||
self._current_sound = default_sound
|
||||
self._playing = True
|
||||
|
||||
self._bot_speaking = False
|
||||
self._sleep_time = 0.02
|
||||
|
||||
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
||||
await super().process_frame(frame, direction)
|
||||
|
||||
if isinstance(frame, StartFrame):
|
||||
await self._start()
|
||||
await self.push_frame(frame, direction)
|
||||
elif isinstance(frame, (EndFrame, CancelFrame)):
|
||||
await self._stop()
|
||||
await self.push_frame(frame, direction)
|
||||
elif isinstance(frame, TTSStartedFrame):
|
||||
self._bot_speaking = True
|
||||
elif isinstance(frame, TTSStoppedFrame):
|
||||
self._bot_speaking = False
|
||||
elif isinstance(frame, TTSAudioRawFrame):
|
||||
frame.audio = self._mix_with_sound(frame.audio)
|
||||
await self.push_frame(frame)
|
||||
elif isinstance(frame, ChangeBotBackgroundSoundFrame):
|
||||
await self._change_background_sound(frame)
|
||||
elif isinstance(frame, UpdateBotBackgroundSoundVolumeFrame):
|
||||
await self._update_background_volume(frame.volume)
|
||||
elif isinstance(frame, PlayBotBackgroundSoundFrame):
|
||||
await self._play_background_sound(True)
|
||||
elif isinstance(frame, PauseBotBackgroundSoundFrame):
|
||||
await self._play_background_sound(False)
|
||||
else:
|
||||
await self.push_frame(frame, direction)
|
||||
|
||||
async def _start(self):
|
||||
for sound_name, file_name in self._sound_files.items():
|
||||
await asyncio.to_thread(self._load_sound_file, sound_name, file_name)
|
||||
|
||||
self._audio_queue = asyncio.Queue()
|
||||
self._audio_task = self.get_event_loop().create_task(self._audio_task_handler())
|
||||
|
||||
async def _stop(self):
|
||||
self._audio_task.cancel()
|
||||
await self._audio_task
|
||||
|
||||
async def _play_background_sound(self, play: bool):
|
||||
self._playing = play
|
||||
|
||||
async def _change_background_sound(self, frame: ChangeBotBackgroundSoundFrame):
|
||||
if frame.sound_name in self._sound_files:
|
||||
if frame.volume:
|
||||
await self._update_background_volume(frame.volume)
|
||||
self._current_sound = frame.sound_name
|
||||
self._sound_pos = 0
|
||||
else:
|
||||
error_msg = f"{self} sound {frame.sound_name} is not available"
|
||||
logger.error(error_msg)
|
||||
await self.push_error(ErrorFrame(error_msg))
|
||||
|
||||
async def _update_background_volume(self, volume: float):
|
||||
self._volume = volume
|
||||
|
||||
def _load_sound_file(self, sound_name: str, file_name: str):
|
||||
try:
|
||||
logger.debug(f"{self} loading background sound from {file_name}")
|
||||
sound, sample_rate = sf.read(file_name, dtype="int16")
|
||||
|
||||
audio = sound.tobytes()
|
||||
if sample_rate != self._sample_rate:
|
||||
logger.debug(f"{self} resampling background sound to {self._sample_rate}")
|
||||
audio = resample_audio(audio, sample_rate, self._sample_rate)
|
||||
|
||||
# Convert from np to bytes again.
|
||||
self._sounds[sound_name] = np.frombuffer(audio, dtype=np.int16)
|
||||
except Exception as ex:
|
||||
logger.error(f"{self} unable to open file {file_name}")
|
||||
|
||||
def _mix_with_sound(self, audio: bytes):
|
||||
"""Mixes raw audio frames with chunks of the same length from the sound
|
||||
file.
|
||||
|
||||
"""
|
||||
if audio:
|
||||
audio_np = np.frombuffer(audio, dtype=np.int16)
|
||||
else:
|
||||
num_samples = int(self._sleep_time * self._sample_rate)
|
||||
audio_np = np.zeros(num_samples, dtype=np.int16)
|
||||
|
||||
chunk_size = len(audio_np)
|
||||
|
||||
# Sound currently playing.
|
||||
sound = self._sounds[self._current_sound]
|
||||
|
||||
# Go back to the beginning if we don't have enough data.
|
||||
if self._sound_pos + chunk_size > len(sound):
|
||||
self._sound_pos = 0
|
||||
|
||||
start_pos = self._sound_pos
|
||||
end_pos = self._sound_pos + chunk_size
|
||||
self._sound_pos = end_pos
|
||||
|
||||
sound_np = sound[start_pos:end_pos]
|
||||
|
||||
mixed_audio = np.clip(audio_np + sound_np * self._volume, -32768, 32767).astype(np.int16)
|
||||
|
||||
return mixed_audio.astype(np.int16).tobytes()
|
||||
|
||||
async def _audio_task_handler(self):
|
||||
while True:
|
||||
try:
|
||||
if self._playing and not self._bot_speaking:
|
||||
audio = self._mix_with_sound(b"")
|
||||
frame = OutputAudioRawFrame(
|
||||
audio=audio, sample_rate=self._sample_rate, num_channels=1
|
||||
)
|
||||
await self.push_frame(frame)
|
||||
await asyncio.sleep(self._sleep_time)
|
||||
except asyncio.CancelledError:
|
||||
break
|
||||
@@ -590,12 +590,12 @@ class RTVIProcessor(FrameProcessor):
|
||||
self._registered_services: Dict[str, RTVIService] = {}
|
||||
|
||||
# A task to process incoming action frames.
|
||||
self._action_task = self.get_event_loop().create_task(self._action_task_handler())
|
||||
self._action_queue = asyncio.Queue()
|
||||
self._action_task = self.get_event_loop().create_task(self._action_task_handler())
|
||||
|
||||
# A task to process incoming transport messages.
|
||||
self._message_task = self.get_event_loop().create_task(self._message_task_handler())
|
||||
self._message_queue = asyncio.Queue()
|
||||
self._message_task = self.get_event_loop().create_task(self._message_task_handler())
|
||||
|
||||
self._register_event_handler("on_bot_ready")
|
||||
|
||||
|
||||
@@ -28,6 +28,7 @@ from pipecat.frames.frames import (
|
||||
StartInterruptionFrame,
|
||||
StopInterruptionFrame,
|
||||
SystemFrame,
|
||||
TTSAudioRawFrame,
|
||||
TransportMessageFrame,
|
||||
TransportMessageUrgentFrame,
|
||||
)
|
||||
@@ -174,9 +175,10 @@ class BaseOutputTransport(FrameProcessor):
|
||||
if self._params.audio_out_is_live:
|
||||
await self._audio_out_queue.put(frame)
|
||||
else:
|
||||
cls = type(frame)
|
||||
self._audio_buffer.extend(frame.audio)
|
||||
while len(self._audio_buffer) >= self._audio_chunk_size:
|
||||
chunk = OutputAudioRawFrame(
|
||||
chunk = cls(
|
||||
bytes(self._audio_buffer[: self._audio_chunk_size]),
|
||||
sample_rate=frame.sample_rate,
|
||||
num_channels=frame.num_channels,
|
||||
@@ -397,13 +399,15 @@ class BaseOutputTransport(FrameProcessor):
|
||||
frame = await asyncio.wait_for(self._audio_out_queue.get(), timeout=wait_time)
|
||||
|
||||
# Notify the bot started speaking upstream if necessary.
|
||||
await self._bot_started_speaking()
|
||||
if isinstance(frame, TTSAudioRawFrame):
|
||||
await self._bot_started_speaking()
|
||||
|
||||
# Send audio.
|
||||
await self.write_raw_audio_frames(frame.audio)
|
||||
|
||||
# Notify the bot is speaking upstream.
|
||||
await self.push_frame(BotSpeakingFrame(), FrameDirection.UPSTREAM)
|
||||
if isinstance(frame, TTSAudioRawFrame):
|
||||
await self.push_frame(BotSpeakingFrame(), FrameDirection.UPSTREAM)
|
||||
|
||||
# Push frame downstream in case anyone else needs it.
|
||||
await self.push_frame(frame)
|
||||
|
||||
Reference in New Issue
Block a user