Compare commits

...

1 Commit

Author SHA1 Message Date
Filipi Fuchter
5fd9348311 Recording high quality audio. 2025-11-26 10:02:12 -03:00
2 changed files with 9 additions and 68 deletions

View File

@@ -50,25 +50,14 @@ import aiofiles
from dotenv import load_dotenv
from loguru import logger
from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.audio.vad.vad_analyzer import VADParams
from pipecat.frames.frames import LLMRunFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.llm_context import LLMContext
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
from pipecat.processors.audio.audio_buffer_processor import AudioBufferProcessor
from pipecat.runner.types import RunnerArguments
from pipecat.runner.utils import create_transport
from pipecat.services.cartesia.tts import CartesiaTTSService
from pipecat.services.deepgram.stt import DeepgramSTTService
from pipecat.services.openai.llm import OpenAILLMService
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.daily.transport import DailyParams
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
load_dotenv(override=True)
@@ -94,20 +83,10 @@ transport_params = {
"daily": lambda: DailyParams(
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
),
"twilio": lambda: FastAPIWebsocketParams(
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
),
"webrtc": lambda: TransportParams(
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
),
}
@@ -115,38 +94,13 @@ transport_params = {
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
logger.info(f"Starting bot")
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"), audio_passthrough=True)
tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121",
)
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4")
# Create audio buffer processor
audiobuffer = AudioBufferProcessor()
messages = [
{
"role": "system",
"content": "You are a helpful assistant demonstrating audio recording capabilities. Keep your responses brief and clear.",
},
]
context = LLMContext(messages)
context_aggregator = LLMContextAggregatorPair(context)
audiobuffer = AudioBufferProcessor(sample_rate=48000)
pipeline = Pipeline(
[
transport.input(),
stt,
context_aggregator.user(),
llm,
tts,
transport.output(),
audiobuffer, # Add audio buffer to pipeline
context_aggregator.assistant(),
]
)
@@ -155,6 +109,8 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
params=PipelineParams(
enable_metrics=True,
enable_usage_metrics=True,
audio_in_sample_rate=48000,
audio_out_sample_rate= 48000
),
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
)
@@ -165,7 +121,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
# Start recording audio
await audiobuffer.start_recording()
# Start conversation - empty prompt to let LLM follow system instructions
await task.queue_frames([LLMRunFrame()])
# await task.queue_frames([LLMRunFrame()])
@transport.event_handler("on_client_disconnected")
async def on_client_disconnected(transport, client):

View File

@@ -235,7 +235,7 @@ class SmallWebRTCClient:
# We are always resampling it for 16000 if the sample_rate that we receive is bigger than that.
# otherwise we face issues with Silero VAD
self._pipecat_resampler = AudioResampler("s16", "mono", 16000)
self._pipecat_resampler = AudioResampler("s16", "mono", 48000)
@self._webrtc_connection.event_handler("connected")
async def on_connected(connection: SmallWebRTCConnection):
@@ -366,31 +366,16 @@ class SmallWebRTCClient:
await asyncio.sleep(0.01)
continue
if frame.sample_rate > self._in_sample_rate:
resampled_frames = self._pipecat_resampler.resample(frame)
for resampled_frame in resampled_frames:
# 16-bit PCM bytes
pcm_array = resampled_frame.to_ndarray().astype(np.int16)
pcm_bytes = pcm_array.tobytes()
del pcm_array # free NumPy array immediately
audio_frame = InputAudioRawFrame(
audio=pcm_bytes,
sample_rate=resampled_frame.sample_rate,
num_channels=self._audio_in_channels,
)
del pcm_bytes # reference kept in audio_frame
yield audio_frame
else:
resampled_frames = self._pipecat_resampler.resample(frame)
for resampled_frame in resampled_frames:
# 16-bit PCM bytes
pcm_array = frame.to_ndarray().astype(np.int16)
pcm_array = resampled_frame.to_ndarray().astype(np.int16)
pcm_bytes = pcm_array.tobytes()
del pcm_array # free NumPy array immediately
audio_frame = InputAudioRawFrame(
audio=pcm_bytes,
sample_rate=frame.sample_rate,
sample_rate=resampled_frame.sample_rate,
num_channels=self._audio_in_channels,
)
del pcm_bytes # reference kept in audio_frame