from loguru import logger import asyncio import math import struct import time from dataclasses import dataclass, field from typing import List from pipecat.processors.frame_processor import FrameDirection, FrameProcessor from pipecat.frames.frames import ( Frame, AudioRawFrame, InterimTranscriptionFrame, TranscriptionFrame, TextFrame, StartInterruptionFrame, LLMFullResponseStartFrame, TTSStoppedFrame, MetricsFrame ) from pipecat.vad.vad_analyzer import VADAnalyzer, VADState from pipecat.services.deepgram import DeepgramTTSService from pipecat.services.openai import OpenAILLMContext, OpenAILLMContextFrame class GreedyLLMAggregator(FrameProcessor): def __init__(self, context: OpenAILLMContext = None, **kwargs): super().__init__(**kwargs) self.context: OpenAILLMContext = context if context else OpenAILLMContext() async def process_frame(self, frame: Frame, direction: FrameDirection): await super().process_frame(frame, direction) logger.debug(f"{frame}") try: if isinstance(frame, InterimTranscriptionFrame): return if isinstance(frame, TranscriptionFrame): # append transcribed text to last "user" frame if self.context.messages and self.context.messages[-1]["role"] == "user": last_frame = self.context.messages.pop() else: last_frame = {"role": "user", "content": ""} last_frame["content"] += " " + frame.text self.context.messages.append(last_frame) oai_context_frame = OpenAILLMContextFrame(context=self.context) logger.debug(f"pushing frame {oai_context_frame}") await self.push_frame(oai_context_frame) return await self.push_frame(frame, direction) except Exception as e: logger.debug(f"error: {e}") class ClearableDeepgramTTSService(DeepgramTTSService): def __init___(self, **kwargs): super().__init(**kwargs) async def process_frame(self, frame: Frame, direction: FrameDirection): await super().process_frame(frame, direction) if isinstance(frame, StartInterruptionFrame): self._current_sentence = "" @dataclass class BufferedSentence: audio_frames: List[AudioRawFrame] = field(default_factory=list) text_frame: TextFrame = None class VADGate(FrameProcessor): def __init__( self, vad_analyzer: VADAnalyzer = None, context: OpenAILLMContext = None, **kwargs): super().__init__(**kwargs) self.vad_analyzer = vad_analyzer self.context = context self._audio_pusher_task = None self._expect_text_frame_next = False self._sentences: List[BufferedSentence] = [] # queue output from tts one sentence at a time. associate a buffer of audio frames with the content of # each text frame. # # start a coroutine to service the queue and send sentences down the pipeline when possible. # 1. do not send anything when we are not in VADState.QUIET # 2. if we are in VADState.QUIET, send a sentence, estimate how long it will take for that sentence # to output, sleep until it's time to send another sentence # 3. each time we send a sentence, append it to the conversation context # 3. when the sentence buffer becomes empty, cancel the coroutine # 4. if we get a new LLMFullResponse, treat that as a cancellation, too async def process_frame(self, frame: Frame, direction: FrameDirection): await super().process_frame(frame, direction) try: # A TTSService will emit a series of AudioRawFrame objects, then a TTSStoppedFrame, # then a TextFrame. if self._expect_text_frame_next: self._expect_text_frame_next = False if isinstance(frame, TextFrame): self._sentences[-1].text_frame = frame else: logger.debug(f"expected a text frame, but received {frame}") await self.push_frame(frame, direction) return else: if isinstance(frame, TextFrame): logger.error(f"XXXXXXXXXXXXXXXXXXX received a text frame, wasn't expecting it.") if isinstance(frame, AudioRawFrame): # if our buffer is empty or has a "finished" sentence at the end, # then we need to start buffering a new sentence if not self._sentences or self._sentences[-1].text_frame: self._sentences.append(BufferedSentence()) self._sentences[-1].audio_frames.append(frame) await self.maybe_start_audio_pusher_task() return if isinstance(frame, TTSStoppedFrame): self._expect_text_frame_next = True await self.push_frame(frame, direction) return # There are two ways we can be interrupted. During greedy inference, a new # LLM response can start. Or, during playout, we can get a traditional # user interruption frame. if (isinstance(frame, LLMFullResponseStartFrame) or isinstance(frame, StartInterruptionFrame)): logger.debug(f"{frame} - Handle interruption in VADGate") self._sentences = [] if self._audio_pusher_task: self._audio_pusher_task.cancel() self._audio_pusher_task = None await self.push_frame(frame, direction) return await self.push_frame(frame, direction) except Exception as e: logger.debug(f"error: {e}") async def maybe_start_audio_pusher_task(self): try: if self._audio_pusher_task: return self._audio_pusher_task = self.get_event_loop().create_task(self.push_audio()) except Exception as e: logger.debug(f"Exception {e}") async def push_audio(self): try: while True: if not self._sentences: await asyncio.sleep(0.01) continue if self.vad_analyzer._vad_state != VADState.QUIET: await asyncio.sleep(0.01) continue # we only want to push completed sentence buffers if not self._sentences[0].text_frame: await asyncio.sleep(0.01) continue s = self._sentences.pop(0) if not s.audio_frames: continue sample_rate = s.audio_frames[0].sample_rate duration = 0 logger.debug(f"Pushing {len(s.audio_frames)} audio frames") for frame in s.audio_frames: await self.push_frame(frame) # assume linear16 encoding (2 bytes per sample). todo: add some more # metadata to AudioRawFrame, maybe duration += (len(frame.audio) / 2 / frame.num_channels) / sample_rate await asyncio.sleep(duration - 20 / 1000) if self.context: logger.debug(f"Appending assistant message to context: [{s.text_frame.text}]") self.context.messages.append( {"role": "assistant", "content": s.text_frame.text} ) await self.push_frame(s.text_frame) except Exception as e: logger.debug(f"Exception {e}") class TranscriptionTimingLogger(FrameProcessor): def __init__(self, avt): super().__init__() self.name = "Transcription" self._avt = avt async def process_frame(self, frame: Frame, direction: FrameDirection): try: await super().process_frame(frame, direction) if isinstance(frame, TranscriptionFrame): elapsed = time.time() - self._avt.last_transition_ts logger.debug(f"Transcription TTF: {elapsed}") await self.push_frame(MetricsFrame(ttfb={self.name: elapsed})) await self.push_frame(frame, direction) except Exception as e: logger.debug(f"Exception {e}") class AudioVolumeTimer(FrameProcessor): def __init__(self): super().__init__() self.last_transition_ts = 0 self._prev_volume = -80 self._speech_volume_threshold = -50 async def process_frame(self, frame: Frame, direction: FrameDirection): await super().process_frame(frame, direction) if isinstance(frame, AudioRawFrame): volume = self.calculate_volume(frame) # print(f"Audio volume: {volume:.2f} dB") if (volume >= self._speech_volume_threshold and self._prev_volume < self._speech_volume_threshold): # logger.debug("transition above speech volume threshold") self.last_transition_ts = time.time() elif (volume < self._speech_volume_threshold and self._prev_volume >= self._speech_volume_threshold): # logger.debug("transition below non-speech volume threshold") self.last_transition_ts = time.time() self._prev_volume = volume await self.push_frame(frame, direction) def calculate_volume(self, frame: AudioRawFrame) -> float: if frame.num_channels != 1: raise ValueError(f"Expected 1 channel, got {frame.num_channels}") # Unpack audio data into 16-bit integers fmt = f"{len(frame.audio) // 2}h" audio_samples = struct.unpack(fmt, frame.audio) # Calculate RMS sum_squares = sum(sample**2 for sample in audio_samples) rms = math.sqrt(sum_squares / len(audio_samples)) # Convert RMS to decibels (dB) # Reference: maximum value for 16-bit audio is 32767 if rms > 0: db = 20 * math.log10(rms / 32767) else: db = -96 # Minimum value (almost silent) return db