From 23ad1815156614667e23c7bfd37f540e86619cbc Mon Sep 17 00:00:00 2001
From: Mark Backman <mark@daily.co>
Date: Tue, 24 Feb 2026 13:09:29 -0500
Subject: [PATCH] Fix Soniox processing metrics to measure token-to-transcript
 time

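Move start_processing_metrics out of run_stt, where it was paired with
an immediate stop_processing_metrics on every audio chunk and produced
noisy 0ms logs. It is now called in _receive_messages when the first
final token arrives for a new utterance (i.e. while the final
transcription buffer is still empty). The per-chunk
stop_processing_metrics call in run_stt is removed as well; the
existing stop_processing_metrics in send_endpoint_transcript completes
the pair, giving a meaningful measurement of time from first
recognition to finalized transcript.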
---
 src/pipecat/services/soniox/stt.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/pipecat/services/soniox/stt.py b/src/pipecat/services/soniox/stt.py
index 61dbb794f..630e11862 100644
--- a/src/pipecat/services/soniox/stt.py
+++ b/src/pipecat/services/soniox/stt.py
@@ -301,10 +301,8 @@ class SonioxSTTService(WebsocketSTTService):
         Yields:
             Frame: None (transcription results come via WebSocket callbacks).
         """
-        await self.start_processing_metrics()
         if self._websocket and self._websocket.state is State.OPEN:
             await self._websocket.send(audio)
-        await self.stop_processing_metrics()
 
         yield None
 
@@ -485,6 +483,8 @@ class SonioxSTTService(WebsocketSTTService):
                             # the rest will be sent as interim tokens (even final tokens).
                             await send_endpoint_transcript()
                         else:
+                            if not self._final_transcription_buffer:
+                                await self.start_processing_metrics()
                             self._final_transcription_buffer.append(token)
                     else:
                         non_final_transcription.append(token)