From 23ad1815156614667e23c7bfd37f540e86619cbc Mon Sep 17 00:00:00 2001 From: Mark Backman Date: Tue, 24 Feb 2026 13:09:29 -0500 Subject: [PATCH] Fix Soniox processing metrics to measure token-to-transcript time Move start_processing_metrics from run_stt (called per audio chunk, producing noisy 0ms logs) to _receive_messages when the first final token arrives for a new utterance. The existing stop_processing_metrics in send_endpoint_transcript completes the pair, giving a meaningful measurement of time from first recognition to finalized transcript. --- src/pipecat/services/soniox/stt.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pipecat/services/soniox/stt.py b/src/pipecat/services/soniox/stt.py index 61dbb794f..630e11862 100644 --- a/src/pipecat/services/soniox/stt.py +++ b/src/pipecat/services/soniox/stt.py @@ -301,10 +301,8 @@ class SonioxSTTService(WebsocketSTTService): Yields: Frame: None (transcription results come via WebSocket callbacks). """ - await self.start_processing_metrics() if self._websocket and self._websocket.state is State.OPEN: await self._websocket.send(audio) - await self.stop_processing_metrics() yield None @@ -485,6 +483,8 @@ class SonioxSTTService(WebsocketSTTService): # the rest will be sent as interim tokens (even final tokens). await send_endpoint_transcript() else: + if not self._final_transcription_buffer: + await self.start_processing_metrics() self._final_transcription_buffer.append(token) else: non_final_transcription.append(token)