diff --git a/CHANGELOG.md b/CHANGELOG.md index 57104786a..1bee0ca5c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -89,6 +89,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 `UserStoppedSpeakingFrame`. This helps in faster transcriptions and clearing the `Deepgram` audio buffer. +- Changed `DeepgramSTTService` to generate metrics using pipeline VAD. + ### Fixed - Fixed `UserIdleProcessor` not properly propagating `EndFrame`s through the diff --git a/src/pipecat/services/deepgram.py b/src/pipecat/services/deepgram.py index bba2e72ae..a5d36370f 100644 --- a/src/pipecat/services/deepgram.py +++ b/src/pipecat/services/deepgram.py @@ -20,6 +20,7 @@ from pipecat.frames.frames import ( TTSAudioRawFrame, TTSStartedFrame, TTSStoppedFrame, + UserStartedSpeakingFrame, UserStoppedSpeakingFrame, ) from pipecat.processors.frame_processor import FrameDirection @@ -169,7 +170,7 @@ class DeepgramSTTService(STTService): return self._settings["vad_events"] def can_generate_metrics(self) -> bool: - return self.vad_enabled + return True async def set_model(self, model: str): await super().set_model(model) @@ -210,9 +211,12 @@ class DeepgramSTTService(STTService): logger.debug("Disconnecting from Deepgram") await self._connection.finish() - async def _on_speech_started(self, *args, **kwargs): + async def start_metrics(self): await self.start_ttfb_metrics() await self.start_processing_metrics() + + async def _on_speech_started(self, *args, **kwargs): + await self.start_metrics() await self._call_event_handler("on_speech_started", *args, **kwargs) async def _on_utterance_end(self, *args, **kwargs): @@ -243,7 +247,10 @@ class DeepgramSTTService(STTService): async def process_frame(self, frame: Frame, direction: FrameDirection): await super().process_frame(frame, direction) - if isinstance(frame, UserStoppedSpeakingFrame): + if isinstance(frame, UserStartedSpeakingFrame) and not self.vad_enabled: + # Start metrics if Deepgram VAD is disabled & pipeline VAD has detected speech + await self.start_metrics() + elif isinstance(frame, UserStoppedSpeakingFrame): # https://developers.deepgram.com/docs/finalize await self._connection.finalize() - logger.debug(f"Triggering finalize event on: {frame.name=}, {direction=}") + logger.trace(f"Triggered finalize event on: {frame.name=}, {direction=}")