Compare commits

...

12 Commits

Author SHA1 Message Date
Aleix Conchillo Flaqué
dd89ac1bb8 audio: add frames to update bot background sound volume 2024-11-01 14:22:29 -07:00
Aleix Conchillo Flaqué
41b76f2944 audio: add frames to allow playing/pausing bot background sound 2024-11-01 09:05:36 -07:00
Aleix Conchillo Flaqué
cca6f1fe45 audio: add frame to allow changing bot background sound 2024-11-01 09:00:10 -07:00
Aleix Conchillo Flaqué
f96cf2292b audio: rename BotBackgroundSound file and allow loading multiple files 2024-11-01 09:00:10 -07:00
Aleix Conchillo Flaqué
21df297e91 rtvi: create queues before tasks 2024-10-31 15:56:22 -07:00
Aleix Conchillo Flaqué
03726a4470 examples: don't require room token if using an STT 2024-10-31 15:56:22 -07:00
Aleix Conchillo Flaqué
184f2cdb55 examples: add bot background sound example 2024-10-31 15:56:14 -07:00
Aleix Conchillo Flaqué
f6b2c3800f pyproject: add soundfile dependency 2024-10-31 15:55:54 -07:00
Aleix Conchillo Flaqué
9c2083254b base_output: generate smaller audio frames of the same incoming type 2024-10-31 15:55:54 -07:00
Aleix Conchillo Flaqué
47f42ac0cb base_output: only generate bot speaking with TTS audio frames 2024-10-31 15:55:54 -07:00
Aleix Conchillo Flaqué
98525a5f27 audio: add BotBackgroundSound processor 2024-10-31 15:55:52 -07:00
daniil
cff62650ee add background_noise service and example 2024-10-31 15:55:24 -07:00
9 changed files with 331 additions and 11 deletions

View File

@@ -9,6 +9,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added
- Added `BotBackgroundSound` processor. This processors allows you to add
background sound to the bots output. The background sound will always be
playing even if the bot is not talking. The volume of the background sound and
the sample rate can be configure. You can load any file format supported by
the `soundfile` library.
(see https://github.com/bastibe/python-soundfile)
- Added `GatedOpenAILLMContextAggregator`. This aggregator keeps the last
received OpenAI LLM context frame and it doesn't let it through until the
notifier is notified.
@@ -57,6 +64,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Other
- Add `23-bot-background-sound.py` foundational example.
- Added a new foundational example 22-natural-conversation.py. This examples
shows how to achieve a more natural conversation detecting when the user ends
statement.

View File

@@ -31,11 +31,11 @@ logger.add(sys.stderr, level="DEBUG")
async def main():
async with aiohttp.ClientSession() as session:
(room_url, token) = await configure(session)
(room_url, _) = await configure(session)
transport = DailyTransport(
room_url,
token,
None,
"Respond bot",
DailyParams(
audio_out_enabled=True,

View File

@@ -32,11 +32,11 @@ logger.add(sys.stderr, level="DEBUG")
async def main():
async with aiohttp.ClientSession() as session:
(room_url, token) = await configure(session)
(room_url, _) = await configure(session)
transport = DailyTransport(
room_url,
token,
None,
"Respond bot",
DailyParams(
audio_out_enabled=True,

View File

@@ -32,11 +32,11 @@ logger.add(sys.stderr, level="DEBUG")
async def main():
async with aiohttp.ClientSession() as session:
(room_url, token) = await configure(session)
(room_url, _) = await configure(session)
transport = DailyTransport(
room_url,
token,
None,
"Respond bot",
DailyParams(
audio_out_enabled=True,

View File

@@ -0,0 +1,111 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import argparse
import asyncio
import aiohttp
import os
import sys
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.frames.frames import LLMMessagesFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.processors.audio.bot_background_sound import BotBackgroundSound
from pipecat.services.cartesia import CartesiaTTSService
from pipecat.services.openai import OpenAILLMService
from pipecat.transports.services.daily import DailyParams, DailyTransport
from runner import configure_with_args
from loguru import logger
from dotenv import load_dotenv
load_dotenv(override=True)
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")
async def main():
async with aiohttp.ClientSession() as session:
parser = argparse.ArgumentParser(description="Bot Background Sound")
parser.add_argument("-i", "--input", type=str, required=True, help="Input audio file")
(room_url, token, args) = await configure_with_args(session, parser)
transport = DailyTransport(
room_url,
token,
"Respond bot",
DailyParams(
audio_out_enabled=True,
transcription_enabled=True,
vad_enabled=True,
vad_analyzer=SileroVADAnalyzer(),
),
)
tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id="79a125e8-cd45-4c13-8a67-188112f4dd22", # British Lady
)
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4o")
messages = [
{
"role": "system",
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.",
},
]
context = OpenAILLMContext(messages)
context_aggregator = llm.create_context_aggregator(context)
background_sound = BotBackgroundSound(
sound_files={"office": args.input}, default_sound="office"
)
pipeline = Pipeline(
[
transport.input(), # Transport user input
context_aggregator.user(), # User responses
llm, # LLM
tts, # TTS
background_sound, # Bot background sound
transport.output(), # Transport bot output
context_aggregator.assistant(), # Assistant spoken responses
]
)
task = PipelineTask(
pipeline,
PipelineParams(
allow_interruptions=True,
enable_metrics=True,
enable_usage_metrics=True,
report_only_initial_ttfb=True,
),
)
@transport.event_handler("on_first_participant_joined")
async def on_first_participant_joined(transport, participant):
await transport.capture_participant_transcription(participant["id"])
# Kick off the conversation.
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
await task.queue_frames([LLMMessagesFrame(messages)])
runner = PipelineRunner()
await runner.run(task)
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -60,6 +60,7 @@ openai = [ "openai~=1.50.2", "websockets~=13.1", "python-deepcompare~=1.0.1" ]
openpipe = [ "openpipe~=4.24.0" ]
playht = [ "pyht~=0.1.4", "websockets~=13.1" ]
silero = [ "onnxruntime~=1.19.2" ]
soundfile = [ "soundfile~=0.12.1" ]
together = [ "openai~=1.50.2" ]
websocket = [ "websockets~=13.1", "fastapi~=0.115.0" ]
whisper = [ "faster-whisper~=1.0.3" ]

View File

@@ -0,0 +1,195 @@
#
# Copyright (c) 2024, Daily
#
# SPDX-License-Identifier: BSD 2-Clause License
#
import asyncio
from dataclasses import dataclass
from typing import Any, Dict, Mapping, Optional
import numpy as np
from pipecat.audio.utils import resample_audio
from pipecat.processors.frame_processor import FrameProcessor, FrameDirection
from pipecat.frames.frames import (
CancelFrame,
ControlFrame,
ErrorFrame,
OutputAudioRawFrame,
Frame,
EndFrame,
StartFrame,
TTSAudioRawFrame,
TTSStartedFrame,
TTSStoppedFrame,
)
from loguru import logger
try:
import soundfile as sf
except ModuleNotFoundError as e:
logger.error(f"Exception: {e}")
logger.error(
"In order to use background sound, you need to `pip install pipecat-ai[soundfile]`."
)
raise Exception(f"Missing module: {e}")
@dataclass
class ChangeBotBackgroundSoundFrame(ControlFrame):
sound_name: str
volume: Optional[float] = None
@dataclass
class UpdateBotBackgroundSoundVolumeFrame(ControlFrame):
volume: float
@dataclass
class PlayBotBackgroundSoundFrame(ControlFrame):
pass
@dataclass
class PauseBotBackgroundSoundFrame(ControlFrame):
pass
class BotBackgroundSound(FrameProcessor):
def __init__(
self,
sound_files: Mapping[str, str],
default_sound: str,
volume: float = 0.4,
sample_rate: int = 24000,
**kwargs,
):
super().__init__(**kwargs)
self._sound_files = sound_files
self._volume = volume
self._sample_rate = sample_rate
self._sound_pos = 0
self._sounds: Dict[str, Any] = {}
self._current_sound = default_sound
self._playing = True
self._bot_speaking = False
self._sleep_time = 0.02
async def process_frame(self, frame: Frame, direction: FrameDirection):
await super().process_frame(frame, direction)
if isinstance(frame, StartFrame):
await self._start()
await self.push_frame(frame, direction)
elif isinstance(frame, (EndFrame, CancelFrame)):
await self._stop()
await self.push_frame(frame, direction)
elif isinstance(frame, TTSStartedFrame):
self._bot_speaking = True
elif isinstance(frame, TTSStoppedFrame):
self._bot_speaking = False
elif isinstance(frame, TTSAudioRawFrame):
frame.audio = self._mix_with_sound(frame.audio)
await self.push_frame(frame)
elif isinstance(frame, ChangeBotBackgroundSoundFrame):
await self._change_background_sound(frame)
elif isinstance(frame, UpdateBotBackgroundSoundVolumeFrame):
await self._update_background_volume(frame.volume)
elif isinstance(frame, PlayBotBackgroundSoundFrame):
await self._play_background_sound(True)
elif isinstance(frame, PauseBotBackgroundSoundFrame):
await self._play_background_sound(False)
else:
await self.push_frame(frame, direction)
async def _start(self):
for sound_name, file_name in self._sound_files.items():
await asyncio.to_thread(self._load_sound_file, sound_name, file_name)
self._audio_queue = asyncio.Queue()
self._audio_task = self.get_event_loop().create_task(self._audio_task_handler())
async def _stop(self):
self._audio_task.cancel()
await self._audio_task
async def _play_background_sound(self, play: bool):
self._playing = play
async def _change_background_sound(self, frame: ChangeBotBackgroundSoundFrame):
if frame.sound_name in self._sound_files:
if frame.volume:
await self._update_background_volume(frame.volume)
self._current_sound = frame.sound_name
self._sound_pos = 0
else:
error_msg = f"{self} sound {frame.sound_name} is not available"
logger.error(error_msg)
await self.push_error(ErrorFrame(error_msg))
async def _update_background_volume(self, volume: float):
self._volume = volume
def _load_sound_file(self, sound_name: str, file_name: str):
try:
logger.debug(f"{self} loading background sound from {file_name}")
sound, sample_rate = sf.read(file_name, dtype="int16")
audio = sound.tobytes()
if sample_rate != self._sample_rate:
logger.debug(f"{self} resampling background sound to {self._sample_rate}")
audio = resample_audio(audio, sample_rate, self._sample_rate)
# Convert from np to bytes again.
self._sounds[sound_name] = np.frombuffer(audio, dtype=np.int16)
except Exception as ex:
logger.error(f"{self} unable to open file {file_name}")
def _mix_with_sound(self, audio: bytes):
"""Mixes raw audio frames with chunks of the same length from the sound
file.
"""
if audio:
audio_np = np.frombuffer(audio, dtype=np.int16)
else:
num_samples = int(self._sleep_time * self._sample_rate)
audio_np = np.zeros(num_samples, dtype=np.int16)
chunk_size = len(audio_np)
# Sound currently playing.
sound = self._sounds[self._current_sound]
# Go back to the beginning if we don't have enough data.
if self._sound_pos + chunk_size > len(sound):
self._sound_pos = 0
start_pos = self._sound_pos
end_pos = self._sound_pos + chunk_size
self._sound_pos = end_pos
sound_np = sound[start_pos:end_pos]
mixed_audio = np.clip(audio_np + sound_np * self._volume, -32768, 32767).astype(np.int16)
return mixed_audio.astype(np.int16).tobytes()
async def _audio_task_handler(self):
while True:
try:
if self._playing and not self._bot_speaking:
audio = self._mix_with_sound(b"")
frame = OutputAudioRawFrame(
audio=audio, sample_rate=self._sample_rate, num_channels=1
)
await self.push_frame(frame)
await asyncio.sleep(self._sleep_time)
except asyncio.CancelledError:
break

View File

@@ -590,12 +590,12 @@ class RTVIProcessor(FrameProcessor):
self._registered_services: Dict[str, RTVIService] = {}
# A task to process incoming action frames.
self._action_task = self.get_event_loop().create_task(self._action_task_handler())
self._action_queue = asyncio.Queue()
self._action_task = self.get_event_loop().create_task(self._action_task_handler())
# A task to process incoming transport messages.
self._message_task = self.get_event_loop().create_task(self._message_task_handler())
self._message_queue = asyncio.Queue()
self._message_task = self.get_event_loop().create_task(self._message_task_handler())
self._register_event_handler("on_bot_ready")

View File

@@ -28,6 +28,7 @@ from pipecat.frames.frames import (
StartInterruptionFrame,
StopInterruptionFrame,
SystemFrame,
TTSAudioRawFrame,
TransportMessageFrame,
TransportMessageUrgentFrame,
)
@@ -174,9 +175,10 @@ class BaseOutputTransport(FrameProcessor):
if self._params.audio_out_is_live:
await self._audio_out_queue.put(frame)
else:
cls = type(frame)
self._audio_buffer.extend(frame.audio)
while len(self._audio_buffer) >= self._audio_chunk_size:
chunk = OutputAudioRawFrame(
chunk = cls(
bytes(self._audio_buffer[: self._audio_chunk_size]),
sample_rate=frame.sample_rate,
num_channels=frame.num_channels,
@@ -397,13 +399,15 @@ class BaseOutputTransport(FrameProcessor):
frame = await asyncio.wait_for(self._audio_out_queue.get(), timeout=wait_time)
# Notify the bot started speaking upstream if necessary.
await self._bot_started_speaking()
if isinstance(frame, TTSAudioRawFrame):
await self._bot_started_speaking()
# Send audio.
await self.write_raw_audio_frames(frame.audio)
# Notify the bot is speaking upstream.
await self.push_frame(BotSpeakingFrame(), FrameDirection.UPSTREAM)
if isinstance(frame, TTSAudioRawFrame):
await self.push_frame(BotSpeakingFrame(), FrameDirection.UPSTREAM)
# Push frame downstream in case anyone else needs it.
await self.push_frame(frame)